diff --git a/fine_unsloth.py b/fine_unsloth.py
new file mode 100644
index 0000000..3d003ea
--- /dev/null
+++ b/fine_unsloth.py
@@ -0,0 +1,202 @@
+import os
+import logging
+from datetime import datetime
+from datasets import load_dataset, Dataset
+from transformers import DataCollatorForLanguageModeling, LlamaForCausalLM, PreTrainedTokenizerFast, Trainer, TrainingArguments
+import math
+import numpy as np
+import psutil
+import torch
+import concurrent.futures
+from multiprocessing import cpu_count
+
+# Configure logging
+log_filename = 'finetuning.log'
+logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s', filename=log_filename, filemode='w')
+
+# Add console handler to logging
+console_handler = logging.StreamHandler()
+console_handler.setLevel(logging.DEBUG)
+console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+logging.getLogger().addHandler(console_handler)
+
+# Log system resource usage (memory and CPU)
+def log_system_resources():
+    memory_info = psutil.virtual_memory()
+    logging.info(f"Memory usage: {memory_info.percent}%")
+    cpu_info = psutil.cpu_percent(interval=1)
+    logging.info(f"CPU usage: {cpu_info}%")
+
+# Generate a unique, timestamped output directory
+def generate_unique_output_dir(base_dir):
+    timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
+    return os.path.join(base_dir, f"finetuned_model_{timestamp}")
+
+# Load the dataset (first 10% of the JSONL file)
+def load_and_prepare_dataset(dataset_path):
+    try:
+        dataset = load_dataset('json', data_files=dataset_path, split='train[:10%]')
+        logging.info(f"Dataset loaded successfully with {len(dataset)} examples")
+        return dataset
+    except Exception as e:
+        logging.error(f"Error loading dataset: {e}")
+        raise
+
+# Tokenize a single example, padding/truncating to a fixed length
+def tokenize_function(examples, tokenizer):
+    return tokenizer(examples["tokens"], padding="max_length", truncation=True, max_length=512)
+
+# Compute metrics function. The Trainer passes numpy arrays here, so use numpy
+# operations and ignore positions labelled -100 (padding) when measuring accuracy.
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    predictions = np.argmax(predictions, axis=-1)
+    mask = labels != -100
+    accuracy = (predictions == labels)[mask].mean()
+    return {'accuracy': float(accuracy)}
+
+def main():
+    model_path = r"C:\AI Stuff\Joseph\Models\models--cognitivecomputations--dolphin-2.9-llama3-8b\snapshots\5aeb036f9215c558b483a654a8c6e1cc22e841bf"
+    dataset_path = r"C:\AI Stuff\Joseph\Josephing\Official Script\Compiled Text\total_books.jsonl"
+    output_dir = r"C:\AI Stuff\Joseph\Josephing\Official Script\FineTunedModels"
+
+    logging.info(f"Model path: {model_path}")
+    logging.info(f"Dataset path: {dataset_path}")
+    logging.info(f"Output dir: {output_dir}")
+
+    # Verify paths are strings
+    assert isinstance(model_path, str), f"model_path is not a string: {model_path}"
+    assert isinstance(dataset_path, str), f"dataset_path is not a string: {dataset_path}"
+    assert isinstance(output_dir, str), f"output_dir is not a string: {output_dir}"
+
+    # Log system resources
+    log_system_resources()
+
+    # Ensure the model path exists and list its contents
+    if os.path.exists(model_path):
+        logging.info(f"Directory exists: {model_path}")
+        logging.info(f"Contents of the directory: {os.listdir(model_path)}")
+    else:
+        logging.error(f"Model path does not exist: {model_path}")
+        raise FileNotFoundError(f"Model path does not exist: {model_path}")
+
+    try:
+        # Log the type of each path before using it
+        logging.info(f"Type of model_path: {type(model_path)}")
+        logging.info(f"Type of dataset_path: {type(dataset_path)}")
+        logging.info(f"Type of output_dir: {type(output_dir)}")
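+
+        # fp16 training further down assumes a CUDA device; record what torch sees.
+        logging.info(f"CUDA available: {torch.cuda.is_available()}")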
+
+        # Load the tokenizer and model
+        tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path, legacy=False)
+        model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)
+
+        # Llama-family tokenizers often ship without a pad token; fall back to the
+        # EOS token so that padding="max_length" works during tokenization.
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        # Ensure the embedding matrix matches the tokenizer's vocabulary size
+        model.resize_token_embeddings(len(tokenizer))
+
+        # Make sure the special-token ids are reflected in the model config
+        special_tokens = list(tokenizer.special_tokens_map.values())
+        logging.info(f"Special tokens to be fine-tuned: {special_tokens}")
+        model.config.pad_token_id = tokenizer.pad_token_id
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.bos_token_id = tokenizer.bos_token_id
+        model.config.sep_token_id = tokenizer.sep_token_id
+
+    except Exception as e:
+        logging.error(f"Error loading tokenizer or model: {e}")
+        raise
+
+    # Log system resources after loading the model
+    log_system_resources()
+
+    # Load and prepare the dataset
+    dataset = load_and_prepare_dataset(dataset_path)
+
+    # Inspect dataset structure
+    logging.info(f"Dataset features: {dataset.features}")
+
+    # Tokenize the dataset using a thread pool, one example per task
+    logging.info("Using multithreading for tokenization")
+
+    tokenized_data = []
+    with concurrent.futures.ThreadPoolExecutor(max_workers=cpu_count()) as executor:
+        futures = [executor.submit(tokenize_function, example, tokenizer) for example in dataset]
+        for future in concurrent.futures.as_completed(futures):
+            try:
+                tokenized_data.append(future.result())
+            except Exception as e:
+                logging.error(f"Error in tokenization: {e}")
+
+    # Convert the list of encodings back to a Dataset object
+    tokenized_datasets = Dataset.from_dict({'input_ids': [item['input_ids'] for item in tokenized_data],
+                                            'attention_mask': [item['attention_mask'] for item in tokenized_data]})
+
+    # Check the columns in the tokenized dataset
+    logging.info(f"Tokenized dataset columns: {tokenized_datasets.column_names}")
+
+    if 'input_ids' not in tokenized_datasets.column_names or 'attention_mask' not in tokenized_datasets.column_names:
+        logging.error(f"Tokenization failed to produce expected columns. Columns present: {tokenized_datasets.column_names}")
+        raise ValueError("Tokenization did not produce the expected columns.")
+
+    logging.info("Dataset tokenization completed successfully.")
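+
+    # A quick shape sanity check: every example should now be padded or truncated
+    # to the 512-token max_length used in tokenize_function.
+    logging.info(f"Tokenized example length: {len(tokenized_datasets[0]['input_ids'])}")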
+
+    # Set training arguments (effective batch size per device:
+    # per_device_train_batch_size * gradient_accumulation_steps = 2 * 4 = 8)
+    training_args = TrainingArguments(
+        output_dir=generate_unique_output_dir(output_dir),
+        overwrite_output_dir=True,
+        num_train_epochs=3,
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=4,
+        learning_rate=2e-5,
+        lr_scheduler_type="cosine",
+        warmup_steps=500,
+        save_steps=1000,
+        save_total_limit=2,
+        logging_dir='./logs',
+        logging_steps=10,  # More frequent logging
+        eval_strategy="steps",
+        eval_steps=500,
+        report_to="none",
+        dataloader_num_workers=cpu_count(),  # Use all available CPU cores
+        fp16=True,
+        optim="adamw_torch",
+    )
+
+    # The causal-LM collator copies input_ids into labels (with padding masked to
+    # -100) so the Trainer can compute a language-modeling loss.
+    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+    # Create a Trainer instance
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_datasets,
+        eval_dataset=tokenized_datasets,
+        data_collator=data_collator,
+        compute_metrics=compute_metrics
+    )
+
+    # Start training
+    try:
+        logging.info("Starting training process.")
+        trainer.train()
+        logging.info("Training completed successfully.")
+    except Exception as e:
+        logging.error(f"Error during training: {e}")
+        raise
+
+    # Evaluate the model and report perplexity alongside the raw eval loss
+    try:
+        logging.info("Starting evaluation process.")
+        eval_result = trainer.evaluate()
+        perplexity = math.exp(eval_result["eval_loss"])
+        eval_result["perplexity"] = perplexity
+        logging.info(f"Evaluation results: {eval_result}")
+    except Exception as e:
+        logging.error(f"Error during evaluation: {e}")
+
+    # Save the model and tokenizer
+    try:
+        logging.info("Saving the fine-tuned model.")
+        model_path = os.path.join(training_args.output_dir, "final_model")
+        model.save_pretrained(model_path)
+        tokenizer.save_pretrained(model_path)
+        logging.info(f"Model saved to {model_path}")
+    except Exception as e:
+        logging.error(f"Error saving the model: {e}")
+
+if __name__ == "__main__":
+    log_system_resources()
+    main()
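A note on the tokenization step: the thread pool submits one tokenizer call per example and the result is reassembled into a Dataset by hand. If that ever becomes a bottleneck, Dataset.map with batched=True is the more common pattern for the same transformation. A minimal sketch, assuming the same "tokens" text field, the already-loaded tokenizer and dataset, and the 512-token limit used in the script (the helper name is illustrative):

    def tokenize_batch(batch):
        return tokenizer(batch["tokens"], padding="max_length", truncation=True, max_length=512)

    tokenized_datasets = dataset.map(
        tokenize_batch,
        batched=True,
        num_proc=cpu_count(),                 # parallelize across processes instead of threads
        remove_columns=dataset.column_names,  # keep only input_ids / attention_mask
    )

This keeps everything as a Dataset throughout, so the manual Dataset.from_dict reassembly and the column check become unnecessary.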