From 3d355e484f8c0ba87d25d3f0cfc48a94cfcef1b3 Mon Sep 17 00:00:00 2001 From: Boris Feld Date: Thu, 25 Apr 2024 10:50:13 +0200 Subject: [PATCH 1/3] Fix transformers example --- .../Comet_with_Hugging_Face_Trainer.ipynb | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/integrations/model-training/hugging_face/notebooks/Comet_with_Hugging_Face_Trainer.ipynb b/integrations/model-training/hugging_face/notebooks/Comet_with_Hugging_Face_Trainer.ipynb index b84d1126..f4996dfb 100644 --- a/integrations/model-training/hugging_face/notebooks/Comet_with_Hugging_Face_Trainer.ipynb +++ b/integrations/model-training/hugging_face/notebooks/Comet_with_Hugging_Face_Trainer.ipynb @@ -11,11 +11,13 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "5L-2VqFDWgGx" + }, "source": [ - "[Hugging Face](https://huggingface.co/docs) is a community and data science platform that provides tools that enable users to build, train and deploy ML models based on open source (OS) code and technologies. Primarily known for their `transformers` library, Hugging Face has helped democratized access to these models by providing a unified API to train and evaluate a number of popular models for NLP. \n", + "[Hugging Face](https://huggingface.co/docs) is a community and data science platform that provides tools that enable users to build, train and deploy ML models based on open source (OS) code and technologies. Primarily known for their `transformers` library, Hugging Face has helped democratized access to these models by providing a unified API to train and evaluate a number of popular models for NLP.\n", "\n", - "Comet integrates with Hugging Face's `Trainer` object, allowing you to log your model parameters, metrics, and assets such as model checkpoints. Learn more about our integration [here](https://www.comet.com/docs/v2/integrations/ml-frameworks/huggingface/) \n", + "Comet integrates with Hugging Face's `Trainer` object, allowing you to log your model parameters, metrics, and assets such as model checkpoints. Learn more about our integration [here](https://www.comet.com/docs/v2/integrations/ml-frameworks/huggingface/)\n", "\n", "Curious about how Comet can help you build better models, faster? Find out more about [Comet](https://www.comet.com/site/products/ml-experiment-tracking/?utm_campaign=transformers&utm_medium=colab) and our [other integrations](https://www.comet.ml/docs/v2/integrations/overview/)\n", "\n", @@ -83,6 +85,7 @@ "outputs": [], "source": [ "PRE_TRAINED_MODEL_NAME = \"distilbert-base-uncased\"\n", + "MAX_MODEL_LENGTH = 512 # If the model don't define it\n", "SEED = 42" ] }, @@ -138,7 +141,12 @@ "outputs": [], "source": [ "def tokenize_function(examples):\n", - " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", + " return tokenizer(\n", + " examples[\"text\"],\n", + " padding=\"max_length\",\n", + " max_length=MAX_MODEL_LENGTH,\n", + " truncation=True,\n", + " )\n", "\n", "\n", "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)" @@ -314,19 +322,11 @@ ")\n", "trainer.train()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "accelerator": "GPU", "colab": { - "collapsed_sections": [], "name": "Comet with Hugging Face", "provenance": [] }, @@ -345,9 +345,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.9.1" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } From 3e5e1d0840430edcfb845fb5240b4fb836d19c40 Mon Sep 17 00:00:00 2001 From: Boris Feld Date: Thu, 25 Apr 2024 12:06:54 +0200 Subject: [PATCH 2/3] Switch to a maintained HF model --- .../notebooks/Comet_with_Hugging_Face_Trainer.ipynb | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/integrations/model-training/hugging_face/notebooks/Comet_with_Hugging_Face_Trainer.ipynb b/integrations/model-training/hugging_face/notebooks/Comet_with_Hugging_Face_Trainer.ipynb index f4996dfb..d958ee0d 100644 --- a/integrations/model-training/hugging_face/notebooks/Comet_with_Hugging_Face_Trainer.ipynb +++ b/integrations/model-training/hugging_face/notebooks/Comet_with_Hugging_Face_Trainer.ipynb @@ -84,8 +84,7 @@ }, "outputs": [], "source": [ - "PRE_TRAINED_MODEL_NAME = \"distilbert-base-uncased\"\n", - "MAX_MODEL_LENGTH = 512 # If the model don't define it\n", + "PRE_TRAINED_MODEL_NAME = \"distilbert/distilroberta-base\"\n", "SEED = 42" ] }, @@ -144,7 +143,6 @@ " return tokenizer(\n", " examples[\"text\"],\n", " padding=\"max_length\",\n", - " max_length=MAX_MODEL_LENGTH,\n", " truncation=True,\n", " )\n", "\n", @@ -345,7 +343,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.10.12" } }, "nbformat": 4, From 6a82c97573a5716a9001c74211135a29086ae9ff Mon Sep 17 00:00:00 2001 From: Boris Feld Date: Thu, 25 Apr 2024 15:20:12 +0200 Subject: [PATCH 3/3] Pin transformers to a slightly older version for now --- guides/get-started/Comet_Quickstart.ipynb | 2 +- .../Alpaca_Lora_Finetuning_with_Comet.ipynb | 2 +- .../gradio/notebooks/Gradio_and_Comet.ipynb | 36 +- ...del_Inferences_with_Comet_and_Gradio.ipynb | 2 +- .../requirements.txt | 2 +- notebooks/Comet_and_Hugging_Face.ipynb | 373 ------------------ 6 files changed, 22 insertions(+), 395 deletions(-) delete mode 100644 notebooks/Comet_and_Hugging_Face.ipynb diff --git a/guides/get-started/Comet_Quickstart.ipynb b/guides/get-started/Comet_Quickstart.ipynb index ee85912f..3b740a92 100644 --- a/guides/get-started/Comet_Quickstart.ipynb +++ b/guides/get-started/Comet_Quickstart.ipynb @@ -426,7 +426,7 @@ }, "outputs": [], "source": [ - "%pip install -U comet_ml torch datasets transformers scikit-learn accelerate" + "%pip install -U comet_ml torch datasets \"transformers<4.40.0\" scikit-learn accelerate" ] }, { diff --git a/integrations/llm/finetuning/alpaca-lora/notebooks/Alpaca_Lora_Finetuning_with_Comet.ipynb b/integrations/llm/finetuning/alpaca-lora/notebooks/Alpaca_Lora_Finetuning_with_Comet.ipynb index 6f214b5c..f59813c6 100644 --- a/integrations/llm/finetuning/alpaca-lora/notebooks/Alpaca_Lora_Finetuning_with_Comet.ipynb +++ b/integrations/llm/finetuning/alpaca-lora/notebooks/Alpaca_Lora_Finetuning_with_Comet.ipynb @@ -88,7 +88,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install -r alpaca-lora/requirements.txt scipy" + "%pip install -r alpaca-lora/requirements.txt scipy \"transformers<4.40.0\"" ] }, { diff --git a/integrations/model-evaluation/gradio/notebooks/Gradio_and_Comet.ipynb b/integrations/model-evaluation/gradio/notebooks/Gradio_and_Comet.ipynb index c5fff82a..359ed2fa 100644 --- a/integrations/model-evaluation/gradio/notebooks/Gradio_and_Comet.ipynb +++ b/integrations/model-evaluation/gradio/notebooks/Gradio_and_Comet.ipynb @@ -55,7 +55,7 @@ }, "outputs": [], "source": [ - "%pip install -U comet_ml gradio altair torch torchvision transformers requests Pillow" + "%pip install -U comet_ml gradio altair torch torchvision \"transformers<4.40.0\" requests Pillow" ] }, { @@ -118,7 +118,9 @@ "from PIL import Image\n", "from torchvision import transforms\n", "\n", - "torch.hub.download_url_to_file(\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "torch.hub.download_url_to_file(\n", + " \"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\"\n", + ")\n", "\n", "model = torch.hub.load(\"pytorch/vision:v0.6.0\", \"resnet18\", pretrained=True).eval()\n", "\n", @@ -138,9 +140,7 @@ "inputs = gr.Image()\n", "outputs = gr.Label(num_top_classes=3)\n", "\n", - "io = gr.Interface(\n", - " fn=predict, inputs=inputs, outputs=outputs, examples=[\"dog.jpg\"]\n", - ")\n", + "io = gr.Interface(fn=predict, inputs=inputs, outputs=outputs, examples=[\"dog.jpg\"])\n", "io.launch(inline=False, share=True)\n", "\n", "experiment = comet_ml.Experiment()\n", @@ -200,7 +200,7 @@ " \"max_length\": 50,\n", " \"temperature\": 0.7,\n", " \"top_k\": 50,\n", - " \"no_repeat_ngram_size\": 2\n", + " \"no_repeat_ngram_size\": 2,\n", "}\n", "model = model.to(device)\n", "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n", @@ -220,17 +220,14 @@ " )\n", " return \".\".join(output.split(\".\")[:-1]) + \".\"\n", "\n", + "\n", "input_text = gr.Textbox(label=\"Input Prompt\")\n", "output_text = gr.Textbox(label=\"Generated Output\")\n", "io = gr.Interface(\n", " generate_text,\n", " inputs=input_text,\n", " outputs=output_text,\n", - " examples=[\n", - " [\n", - " \"The dectective looked at the room full of suspects and said, \"\n", - " ]\n", - " ],\n", + " examples=[[\"The dectective looked at the room full of suspects and said, \"]],\n", ")\n", "io.launch(inline=False, share=True)\n", "\n", @@ -273,9 +270,7 @@ "do_lower_case = True\n", "model_version = \"distilbert-base-uncased-distilled-squad\"\n", "\n", - "tokenizer = AutoTokenizer.from_pretrained(\n", - " model_version, do_lower_case=do_lower_case\n", - ")\n", + "tokenizer = AutoTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)\n", "tokenizer.pad_token = \"[PAD]\"\n", "model = AutoModelForQuestionAnswering.from_pretrained(\n", " model_version, output_attentions=True, pad_token_id=tokenizer.eos_token_id\n", @@ -286,10 +281,11 @@ "\n", "def qa_func(context, question):\n", " prediction = qa(question=question, context=context)\n", - " answer = prediction['answer']\n", + " answer = prediction[\"answer\"]\n", "\n", " return answer\n", "\n", + "\n", "io = gr.Interface(\n", " qa_func,\n", " inputs=[\n", @@ -297,12 +293,16 @@ " gr.Textbox(label=\"Question\"),\n", " ],\n", " outputs=[gr.Textbox(label=\"Answer\")],\n", - " examples=[[\"\"\"A Moon landing is the arrival of a spacecraft on the surface of the Moon.\n", + " examples=[\n", + " [\n", + " \"\"\"A Moon landing is the arrival of a spacecraft on the surface of the Moon.\n", " This includes both crewed and robotic missions. The first human-made object to touch the Moon was the Soviet Union's Luna 2, on 13 September 1959.\n", " The United States' Apollo 11 was the first crewed mission to land on the Moon, on 20 July 1969. \n", " There were six crewed U.S. landings between 1969 and 1972, and numerous uncrewed landings, with no soft landings happening between 22 August 1976 and 14 December 2013.\n", - " \"\"\", \"What year did the first crewed mission land on the moon?\"]\n", - " ]\n", + " \"\"\",\n", + " \"What year did the first crewed mission land on the moon?\",\n", + " ]\n", + " ],\n", ")\n", "io.launch(inline=False, share=True)\n", "\n", diff --git a/integrations/model-evaluation/gradio/notebooks/Logging_Model_Inferences_with_Comet_and_Gradio.ipynb b/integrations/model-evaluation/gradio/notebooks/Logging_Model_Inferences_with_Comet_and_Gradio.ipynb index 9bb32ffa..3b18abf2 100644 --- a/integrations/model-evaluation/gradio/notebooks/Logging_Model_Inferences_with_Comet_and_Gradio.ipynb +++ b/integrations/model-evaluation/gradio/notebooks/Logging_Model_Inferences_with_Comet_and_Gradio.ipynb @@ -42,7 +42,7 @@ }, "outputs": [], "source": [ - "%pip install comet_ml torch transformers \"gradio>=4.0\" shap" + "%pip install comet_ml torch \"transformers<4.40.0\" \"gradio>=4.0\" shap" ] }, { diff --git a/integrations/model-training/hugging_face/transformers-distilbert-fine-tuning/requirements.txt b/integrations/model-training/hugging_face/transformers-distilbert-fine-tuning/requirements.txt index c2a47abc..ff7b74dc 100644 --- a/integrations/model-training/hugging_face/transformers-distilbert-fine-tuning/requirements.txt +++ b/integrations/model-training/hugging_face/transformers-distilbert-fine-tuning/requirements.txt @@ -3,4 +3,4 @@ comet_ml pandas scikit-learn torch -transformers +transformers<4.40.0 diff --git a/notebooks/Comet_and_Hugging_Face.ipynb b/notebooks/Comet_and_Hugging_Face.ipynb deleted file mode 100644 index 0cf8650f..00000000 --- a/notebooks/Comet_and_Hugging_Face.ipynb +++ /dev/null @@ -1,373 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "Comet and Hugging Face.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" - }, - "cells": [ - { - "cell_type": "code", - "metadata": { - "id": "KU7cVuqkmiOZ" - }, - "source": [ - "%pip install comet_ml\n", - "%pip install transformers==3.3.1" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "To15jUCoAW6T" - }, - "source": [ - "import comet_ml" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "D_MlG7M2x6yb" - }, - "source": [ - "!wget https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/title_conference.csv" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "zFC922P1ebiY" - }, - "source": [ - "import os\n", - "import warnings\n", - "warnings.filterwarnings('ignore')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "mmdj_S_vml1k" - }, - "source": [ - "import transformers\n", - "from transformers import AutoTokenizer\n", - "from transformers import BertForSequenceClassification, Trainer, TrainingArguments" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "mttl_keG2_Gv" - }, - "source": [ - "import torch" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "AP5m24X9jBd1" - }, - "source": [ - "PRE_TRAINED_MODEL_NAME = \"distilbert-base-uncased\"" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "ZodQLf99i5Tt" - }, - "source": [ - "tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "XxrIr46eaKLU" - }, - "source": [ - "import pandas as pd\n", - "\n", - "df = pd.read_csv(\"./title_conference.csv\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "X4z7AENkaQzN" - }, - "source": [ - "df.head()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "2X6J27qhabL4" - }, - "source": [ - "df['Conference'] = pd.Categorical(df['Conference'])\n", - "df['Target'] = df['Conference'].cat.codes" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "t3v8tCICeJwG" - }, - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "train_data, test_data = train_test_split(df, test_size=0.2, stratify=df[\"Target\"])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "ZlxAi8U9ezps" - }, - "source": [ - "train_texts, train_labels = train_data['Title'].values.tolist(), train_data['Target'].values.tolist()\n", - "test_texts, test_labels = test_data['Title'].values.tolist(), test_data['Target'].values.tolist()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "3Ofm3Nf7SiSX" - }, - "source": [ - "def preprocess(texts, labels):\n", - " encoded = tokenizer(\n", - " texts, \n", - " add_special_tokens=True,\n", - " truncation=True, \n", - " max_length=64, \n", - " pad_to_max_length=True,\n", - " return_attention_mask=True, \n", - " return_tensors='pt',\n", - " )\n", - " \n", - " return encoded, torch.tensor(labels) " - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "SHYyvoZAc6zm" - }, - "source": [ - "train_encoded, train_labels = preprocess(train_texts, train_labels)\n", - "test_encoded, test_labels = preprocess(test_texts, test_labels)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "LgbEnilJ4Zif" - }, - "source": [ - "class Dataset(torch.utils.data.Dataset):\n", - " def __init__(self, encodings, labels):\n", - " self.encodings = encodings\n", - " self.labels = labels\n", - "\n", - " def __getitem__(self, idx):\n", - " item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n", - " item['labels'] = torch.tensor(self.labels[idx])\n", - " return item\n", - "\n", - " def __len__(self):\n", - " return len(self.labels)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "oTrcPNOau4us" - }, - "source": [ - "train_dataset = Dataset(train_encoded, train_labels)\n", - "test_dataset = Dataset(test_encoded, test_labels)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "ccxiZLo-zbmZ" - }, - "source": [ - "model = BertForSequenceClassification.from_pretrained(\n", - " PRE_TRAINED_MODEL_NAME, \n", - " num_labels=len(df[\"Target\"].unique()), \n", - " output_attentions=False,\n", - " output_hidden_states=False,\n", - ")\n", - "\n", - "# Tell pytorch to run this model on the GPU.\n", - "model.cuda()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "7w1oU0nlzNti" - }, - "source": [ - "from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n", - "\n", - "def compute_metrics(pred): \n", - " experiment = comet_ml.get_global_experiment()\n", - " \n", - " labels = pred.label_ids\n", - " preds = pred.predictions.argmax(-1)\n", - " precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')\n", - " acc = accuracy_score(labels, preds)\n", - "\n", - " if experiment:\n", - " experiment.log_confusion_matrix(preds, labels)\n", - "\n", - " return {\n", - " 'accuracy': acc,\n", - " 'f1': f1,\n", - " 'precision': precision,\n", - " 'recall': recall\n", - " }" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "V6eIRRh0RWO8" - }, - "source": [ - "# Training Parameters\n", - "EPOCHS = 5" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "GOPNH3KXtdEj" - }, - "source": [ - "import itertools\n", - "\n", - "decays = [0.0, 0.5, 0.99]\n", - "learning_rates = [5.0e-5, 3.0e-5, 2.0e-5, 1.0e-5]\n", - "batch_sizes = [32, 64, 128]\n", - "\n", - "parameters = [\n", - " {\"weight_decay\": x[0], \"learning_rate\": x[1], \"batch_size\": x[2]} for x in list(itertools.product(*[decays, learning_rates, batch_sizes]))\n", - "]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "11_6DsP3u0TA" - }, - "source": [ - "from tqdm import tqdm\n", - "\n", - "%env COMET_MODE=ONLINE\n", - "%env COMET_API_KEY=\n", - "%env COMET_PROJECT_NAME=transformers\n", - "\n", - "for idx, p in tqdm(enumerate(parameters)):\n", - " weight_decay = p[\"weight_decay\"]\n", - " learning_rate = p[\"learning_rate\"]\n", - " batch_size = p[\"batch_size\"]\n", - "\n", - " training_args = TrainingArguments(\n", - " seed=42,\n", - " output_dir='./results', \n", - " overwrite_output_dir=True, \n", - " num_train_epochs=EPOCHS, \n", - " per_device_train_batch_size=batch_size, \n", - " per_device_eval_batch_size=batch_size,\n", - " warmup_steps=500, \n", - " weight_decay=weight_decay, \n", - " learning_rate=learning_rate, \n", - " evaluation_strategy=\"epoch\",\n", - " do_train=True,\n", - " do_eval=True \n", - " )\n", - " trainer = Trainer(\n", - " model=model, \n", - " args=training_args, \n", - " train_dataset=train_dataset, \n", - " eval_dataset=test_dataset,\n", - " compute_metrics=compute_metrics, \n", - " )\n", - " trainer.train()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "AD6IKE1DysN4" - }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file