diff --git a/fast-DreamBooth.ipynb b/Copy_of_fast_DreamBooth.ipynb similarity index 87% rename from fast-DreamBooth.ipynb rename to Copy_of_fast_DreamBooth.ipynb index 4f471b5f..3aee1fa6 100644 --- a/fast-DreamBooth.ipynb +++ b/Copy_of_fast_DreamBooth.ipynb @@ -1,5 +1,15 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, { "cell_type": "markdown", "metadata": { @@ -14,10 +24,13 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "A4Bae3VP6UsE" + "id": "A4Bae3VP6UsE", + "cellView": "form" }, "outputs": [], "source": [ + "#@title Mount Gdrive\n", + "\n", "from google.colab import drive\n", "drive.mount('/content/gdrive')" ] @@ -26,33 +39,35 @@ "cell_type": "code", "execution_count": null, "metadata": { - "cellView": "form", - "id": "QyvcqeiL65Tj" + "id": "QyvcqeiL65Tj", + "cellView": "form" }, "outputs": [], "source": [ "#@markdown # Dependencies\n", "\n", "from IPython.utils import capture\n", + "from subprocess import getoutput\n", "import time\n", "\n", - "print('\u001b[1;32mInstalling dependencies...')\n", + "print('[1;32mInstalling dependencies...')\n", "with capture.capture_output() as cap:\n", " %cd /content/\n", - " !pip install -q accelerate==0.12.0\n", - " for i in range(1,6):\n", - " !wget -q \"https://github.com/TheLastBen/fast-stable-diffusion/raw/main/Dependencies/Dependencies.{i}\"\n", - " !mv \"Dependencies.{i}\" \"Dependencies.7z.00{i}\"\n", - " !7z x -y Dependencies.7z.001\n", - " time.sleep(2)\n", - " !cp -r /content/usr/local/lib/python3.8/dist-packages /usr/local/lib/python3.8/\n", - " !rm -r /content/usr\n", - " for i in range(1,6):\n", - " !rm \"Dependencies.7z.00{i}\"\n", - " !pip uninstall -y diffusers\n", - " !git clone --branch updt https://github.com/TheLastBen/diffusers\n", - " !pip install -q /content/diffusers\n", - "print('\u001b[1;32mDone, proceed') " + " !pip install -q --no-deps accelerate==0.12.0\n", + " !wget -q -i \"https://github.com/TheLastBen/fast-stable-diffusion/raw/main/Dependencies/dbdeps.txt\"\n", + " for i in range(1,8):\n", + " !mv \"deps.{i}\" \"deps.7z.00{i}\"\n", + " !7z x -y -o/ deps.7z.001\n", + " !rm *.00* *.txt\n", + " !git clone --depth 1 --branch updt https://github.com/TheLastBen/diffusers\n", + " s = getoutput('nvidia-smi')\n", + " if \"A100\" in s:\n", + " !wget -q https://github.com/TheLastBen/fast-stable-diffusion/raw/main/precompiled/A100/A100\n", + " !rm -r /usr/local/lib/python3.8/dist-packages/xformers\n", + " !7z x -y -o/usr/local/lib/python3.8/dist-packages/ /content/A100\n", + " !rm /content/A100\n", + "!git clone https://github.com/nawnie/dreamboothtrainers.git\n", + "print('\u001b[1;32mDone, proceed')" ] }, { @@ -68,8 +83,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "cellView": "form", - "id": "O3KHGKqyeJp9" + "id": "O3KHGKqyeJp9", + "cellView": "form" }, "outputs": [], "source": [ @@ -78,7 +93,7 @@ "from IPython.display import clear_output\n", "import wget\n", "\n", - "#@markdown - Skip this cell if you are loading a previous session\n", + "#@markdown - Skip this cell if you are loading a previous session that contains a trained model.\n", "\n", "#@markdown ---\n", "\n", @@ -86,8 +101,6 @@ "\n", "#@markdown - Choose which version to finetune.\n", "\n", - "#@markdown ---\n", - "\n", "with capture.capture_output() as cap: \n", " %cd /content/\n", "\n", @@ -108,14 +121,14 @@ "\n", "#@markdown Or\n", "\n", - "CKPT_Path = \"\" #@param {type:\"string\"}\n", + "CKPT_Path = 
\"/content/gdrive/MyDrive/A_Training_folder/models/Realistic_proto.ckpt\" #@param {type:\"string\"}\n", "\n", "#@markdown Or\n", "\n", "CKPT_Link = \"\" #@param {type:\"string\"}\n", "\n", "#@markdown - A CKPT direct link, huggingface CKPT link or a shared CKPT from gdrive.\n", - "#@markdown ---\n", + "\n", "\n", "def downloadmodel():\n", " token=Huggingface_Token\n", @@ -166,6 +179,7 @@ " !git config core.sparsecheckout true\n", " !echo -e \"scheduler\\ntext_encoder\\ntokenizer\\nunet\\nvae\\nfeature_extractor\\nmodel_index.json\" > .git/info/sparse-checkout\n", " !git pull origin main\n", + " !rm -r /content/stable-diffusion-v2-768/.git\n", " clear_output()\n", " print('\u001b[1;32mDONE !')\n", "\n", @@ -182,6 +196,7 @@ " !git config core.sparsecheckout true\n", " !echo -e \"scheduler\\ntext_encoder\\ntokenizer\\nunet\\nvae\\nfeature_extractor\\nmodel_index.json\" > .git/info/sparse-checkout\n", " !git pull origin main\n", + " !rm -r /content/stable-diffusion-v2-512/.git\n", " clear_output()\n", " print('\u001b[1;32mDONE !')\n", " \n", @@ -233,7 +248,7 @@ " !rm model_index.json\n", " time.sleep(1)\n", " wget.download('https://raw.githubusercontent.com/TheLastBen/fast-stable-diffusion/main/Dreambooth/model_index.json')\n", - " !sed -i 's@\"clip_sample\": false@@g' /content/stable-diffusion-custom/scheduler/scheduler_config.json\n", + " !sed -i 's@\"clip_sample\": false,@@g' /content/stable-diffusion-custom/scheduler/scheduler_config.json\n", " !sed -i 's@\"trained_betas\": null,@\"trained_betas\": null@g' /content/stable-diffusion-custom/scheduler/scheduler_config.json\n", " !sed -i 's@\"sample_size\": 256,@\"sample_size\": 512,@g' /content/stable-diffusion-custom/vae/config.json \n", " %cd /content/ \n", @@ -356,8 +371,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "cellView": "form", - "id": "A1B299g-_VJo" + "id": "A1B299g-_VJo", + "cellView": "form" }, "outputs": [], "source": [ @@ -507,8 +522,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "cellView": "form", - "id": "LC4ukG60fgMy" + "id": "LC4ukG60fgMy", + "cellView": "form" }, "outputs": [], "source": [ @@ -534,12 +549,17 @@ "if not os.path.exists(str(INSTANCE_DIR)):\n", " %mkdir -p \"$INSTANCE_DIR\"\n", "\n", - "IMAGES_FOLDER_OPTIONAL=\"\" #@param{type: 'string'}\n", + "IMAGES_FOLDER_OPTIONAL=\"/content/gdrive/MyDrive/A_Training_folder/Cali\" #@param{type: 'string'}\n", + "\n", + "if os.path.exists(IMAGES_FOLDER_OPTIONAL+\"/.ipynb_checkpoints\"):\n", + " %rm -r $IMAGES_FOLDER_OPTIONAL\"/.ipynb_checkpoints\"\n", + "if os.path.exists(IMAGES_FOLDER_OPTIONAL+\"/Desktop.ini\"):\n", + " %rm -r $IMAGES_FOLDER_OPTIONAL\"/Desktop.ini\"\n", "\n", "#@markdown - If you prefer to specify directly the folder of the pictures instead of uploading, this will add the pictures to the existing (if any) instance images. Leave EMPTY to upload.\n", "\n", - "Crop_images= True #@param{type: 'boolean'}\n", - "Crop_size = \"512\" #@param [\"512\", \"576\", \"640\", \"704\", \"768\", \"832\", \"896\", \"960\", \"1024\"]\n", + "Crop_images= False #@param{type: 'boolean'}\n", + "Crop_size = \"576\" #@param [\"512\", \"576\", \"640\", \"704\", \"768\", \"832\", \"896\", \"960\", \"1024\"]\n", "Crop_size=int(Crop_size)\n", "\n", "#@markdown - Unless you want to crop them manually in a precise way, you don't need to crop your instance images externally.\n", @@ -610,7 +630,7 @@ "\n", "with capture.capture_output() as cap:\n", " %cd \"$INSTANCE_DIR\"\n", - " !find . -name \"* *\" -type f | rename 's/ /-/g' \n", + " !find . 
-name \"* *\" -type f | rename 's/ /_/g' \n", "\n", " %cd $SESSION_DIR\n", " !rm instance_images.zip\n", @@ -622,8 +642,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "cellView": "form", - "id": "LxEv3u8mQos3" + "id": "LxEv3u8mQos3", + "cellView": "form" }, "outputs": [], "source": [ @@ -632,14 +652,14 @@ "from PIL import Image\n", "from tqdm import tqdm\n", "\n", - "#@markdown #Concept Images\n", + "#@markdown #Concept Images (Regularization)\n", "#@markdown ----\n", "\n", "#@markdown\n", "#@markdown - Run this `optional` cell to upload concept pictures. If you're traning on a specific face, skip this cell.\n", - "#@markdown - Training a model on a restricted number of instance images tends to indoctrinate it and limit its imagination, so concept images help re-opening its \"mind\" to diversity and greatly widen the range of possibilities of the output, concept images should contain anything related to the instance pictures, including objects, ideas, scenes, phenomenons, concepts (obviously), don't be afraid to slightly diverge from the trained style. The resolution of the pictures doesn't matter.\n", + "#@markdown - Training a model on a restricted number of instance images tends to indoctrinate it and limit its imagination, so concept images help re-opening its \"mind\" to diversity and greatly widen the range of possibilities of the output, concept images should contain anything related to the instance pictures, including objects, ideas, scenes, phenomenons, concepts (obviously), don't be afraid to slightly diverge from the trained style.\n", "\n", - "Remove_existing_concept_images= True #@param{type: 'boolean'}\n", + "Remove_existing_concept_images= False #@param{type: 'boolean'}\n", "#@markdown - Uncheck the box to keep the existing concept images.\n", "\n", "\n", @@ -647,11 +667,15 @@ " if os.path.exists(str(CONCEPT_DIR)):\n", " !rm -r \"$CONCEPT_DIR\"\n", "\n", + "\n", "if not os.path.exists(str(CONCEPT_DIR)):\n", " %mkdir -p \"$CONCEPT_DIR\"\n", "\n", - "IMAGES_FOLDER_OPTIONAL=\"\" #@param{type: 'string'}\n", - "\n", + "IMAGES_FOLDER_OPTIONAL=\"/content/gdrive/MyDrive/A_Training_folder/woman2\" #@param{type: 'string'}\n", + "if os.path.exists(IMAGES_FOLDER_OPTIONAL+\"/.ipynb_checkpoints\"):\n", + " %rm -r $IMAGES_FOLDER_OPTIONAL\"/.ipynb_checkpoints\"\n", + "if os.path.exists(IMAGES_FOLDER_OPTIONAL+\"/Desktop.ini\"):\n", + " %rm -r $IMAGES_FOLDER_OPTIONAL\"/Desktop.ini\"\n", "#@markdown - If you prefer to specify directly the folder of the pictures instead of uploading, this will add the pictures to the existing (if any) concept images. 
Leave EMPTY to upload.\n", "\n", "Crop_images= True \n", @@ -750,7 +774,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "cellView": "form", "id": "1-9QbkfAVYYU" }, "outputs": [], @@ -765,6 +788,9 @@ "import time\n", "import random\n", "\n", + "# Determine number of images in the Instance folder\n", + "Img_Count = (len([entry for entry in os.listdir(INSTANCE_DIR) if os.path.isfile(os.path.join(INSTANCE_DIR, entry))]))\n", + "\n", "if os.path.exists(INSTANCE_DIR+\"/.ipynb_checkpoints\"):\n", " %rm -r $INSTANCE_DIR\"/.ipynb_checkpoints\"\n", "\n", @@ -772,7 +798,8 @@ " %rm -r $CONCEPT_DIR\"/.ipynb_checkpoints\" \n", "\n", "Resume_Training = False #@param {type:\"boolean\"}\n", - "\n", + "# user input request if a prior training has been started\n", + "# but resume is not selected\n", "try:\n", " resume\n", " if resume and not Resume_Training:\n", @@ -796,40 +823,82 @@ "\n", "#@markdown - If you're not satisfied with the result, check this box, run again the cell and it will continue training the current model.\n", "\n", - "MODELT_NAME=MODEL_NAME\n", "\n", - "UNet_Training_Steps=3000 #@param{type: 'number'}\n", - "UNet_Learning_Rate = 2e-6 #@param [\"1e-6\",\"2e-6\",\"3e-6\",\"4e-6\",\"5e-6\"] {type:\"raw\"}\n", - "untlr=UNet_Learning_Rate\n", "\n", - "#@markdown - Start with 3000 or lower, test the model, higher leaning rate = faster learning but higher risk of overfitting, if not enough, resume training for 1000 steps, keep testing until you get the desired output, `set it to 0 to train only the text_encoder`. \n", + "# declare Unet training Vaiables\n", "\n", - "Text_Encoder_Training_Steps=350 #@param{type: 'number'}\n", + "MODELT_NAME=MODEL_NAME\n", + "Repeats=50 #@param{type:\"number\"}\n", + "warmup_steps=0 #@param{type:\"number\"}\n", + "wu=warmup_steps\n", + "batch_size=4 #@param{type:\"number\"}\n", + "bs=batch_size\n", + "gradient_steps=2 #@param{type:\"number\"}\n", + "gs=gradient_steps\n", + "UNet_Training_Steps=((Repeats*Img_Count)/(gs*bs))\n", + "UNet_Learning_Rate = 2e-6 #@param [\"2e-6\", \"1e-6\", \"1e-5\", \"1e-4\", \"5e-7\"] {type:\"raw\", allow-input: true}\n", + "\n", + "#@markdown * 1e-7 is lowest, 1e-4 is the highest, 2e-7 is twice as fast as 1e-7 experiment and adjust the repeats to accomidate diffrent learning rates \n", + "\n", + "lr_schedule = \"polynomial\" #@param [\"polynomial\", \"constant\"] {allow-input: true}\n", + "untlr=UNet_Learning_Rate\n", + "UNet_Training_Steps=int(UNet_Training_Steps+wu)\n", "\n", - "#@markdown - 350-600 steps is enough for a small dataset, keep this number small to avoid overfitting, set to 0 to disable, `set it to 0 before resuming training if it is already trained`.\n", + "#@markdown - These default settings are for a dataset of 10 pictures which is enough for training a face, start with 650 or lower, test the model, if not enough, resume training for 150 steps, keep testing until you get the desired output, `set it to 0 to train only the text_encoder`. 
\n", "\n", - "Text_Encoder_Concept_Training_Steps=0 #@param{type: 'number'}\n", + "Text_Encoder_Training_steps=0 #@param{type: 'number'}\n", + "#@markdown - 200-450 steps is enough for a small dataset, keep this number small to avoid overfitting, set to 0 to disable, `set it to 0 before resuming training if it is already trained`.\n", "\n", + "# declare text batch size\n", + "Text_Batch_Size = 7 #@param {type:\"integer\"}\n", + "tbs=Text_Batch_Size\n", + "\n", + "Text_Encoder_Concept_Training_steps=0 #@param{type: 'number'}\n", + "# adjust text steps for batch size\n", + "Text_Encoder_Concept_Training_Steps=(Text_Encoder_Concept_Training_steps/tbs)\n", + "Text_Encoder_Training_Steps=(Text_Encoder_Training_steps/tbs)\n", + "Text_Encoder_Concept_Training_Steps=int(Text_Encoder_Concept_Training_Steps)\n", + "Text_Encoder_Training_Steps=int(Text_Encoder_Training_Steps)\n", "#@markdown - Suitable for training a style/concept as it acts as heavy regularization, set it to 1500 steps for 200 concept images (you can go higher), set to 0 to disable, set both the settings above to 0 to fintune only the text_encoder on the concept, `set it to 0 before resuming training if it is already trained`.\n", "\n", - "Text_Encoder_Learning_Rate = 1e-6 #@param [\"1e-6\",\"8e-7\",\"6e-7\",\"5e-7\",\"4e-7\"] {type:\"raw\"}\n", + "Text_Encoder_Learning_Rate = 2e-6 #@param [\"2e-6\", \"8e-7\", \"6e-7\", \"5e-7\", \"4e-7\"] {type:\"raw\", allow-input: true}\n", "txlr=Text_Encoder_Learning_Rate\n", "\n", - "#@markdown - Learning rate for both text_encoder and concept_text_encoder, keep it low to avoid overfitting (1e-6 is higher than 4e-7)\n", + "#@markdown - Learning rate for both text_encoder and concept_text_encoder, keep it low to avoid overfitting (1e-7 is lowest, 1e-4 is the highest, 2e-7 is twice as fast as 1e-7 experiment and adjust the repeats to accomidate diffrent learning rates )\n", "\n", "trnonltxt=\"\"\n", "if UNet_Training_Steps==0:\n", " trnonltxt=\"--train_only_text_encoder\"\n", "\n", - "Seed='' \n", + "Seed = 42825032 #@param {type:\"integer\"}\n", "\n", "Style_Training = False #@param {type:\"boolean\"}\n", "\n", - "#@markdown - Further reduce overfitting, suitable when training a style or a general theme, don't check the box at the beginning, check it after training for at least 2000 steps.\n", + "#@markdown -Forced Drop out, Drops caption from images, helps fine tuning a style without over-fitting simpsons model could of benefitted from this\n", "\n", "Style=\"\"\n", "if Style_Training:\n", - " Style=\"--Style\"\n", + " Style = \"--Style\"\n", + "\n", + "Flip_Images = True #@param {type:\"boolean\"}\n", + "Percent_to_flip = 10 #@param{type:\"raw\"}\n", + "flip_rate = (Percent_to_flip/100)\n", + "\n", + "#@markdown Flip a random 10% of images, helps add veriety to smaller data-sets\n", + "\n", + "flip=\"\"\n", + "if Flip_Images:\n", + " flip=\"--hflip\"\n", + "\n", + "Conditional_dropout = 10 #@param {type:\"raw\"}\n", + "\n", + "#@markdown drop a random X% of images, helps avoid over fitting, very similar to style training\n", + "\n", + "drop='0'\n", + "drop= (Conditional_dropout/100)\n", + "\n", + "\n", + "\n", "\n", "Resolution = \"512\" #@param [\"512\", \"576\", \"640\", \"704\", \"768\", \"832\", \"896\", \"960\", \"1024\"]\n", "Res=int(Resolution)\n", @@ -892,22 +961,18 @@ " Textenc=\"\"\n", "\n", "#@markdown ---------------------------\n", - "Save_Checkpoint_Every_n_Steps = False #@param {type:\"boolean\"}\n", - "Save_Checkpoint_Every=500 #@param{type: 'number'}\n", - "if 
Save_Checkpoint_Every==None:\n", - " Save_Checkpoint_Every=1\n", - "#@markdown - Minimum 200 steps between each save.\n", + "Save_Checkpoint_Every_n_Steps = True #@param {type:\"boolean\"}\n", + "#@markdown How many repats/epochs between saves\n", + "Save_Checkpoint_Every=25 #@param{type: 'number'}\n", "stp=0\n", - "Start_saving_from_the_step=500 #@param{type: 'number'}\n", - "if Start_saving_from_the_step==None:\n", - " Start_saving_from_the_step=0\n", - "if (Start_saving_from_the_step < 200):\n", - " Start_saving_from_the_step=Save_Checkpoint_Every\n", - "stpsv=Start_saving_from_the_step\n", + "stpsv=10\n", "if Save_Checkpoint_Every_n_Steps:\n", - " stp=Save_Checkpoint_Every\n", - "#@markdown - Start saving intermediary checkpoints from this step.\n", + " stp=((Save_Checkpoint_Every*Img_Count)/(gs*bs))\n", + "stp=int(stp)\n", + "Number_Of_Samples = 4 #@param {type:\"integer\"}\n", + "NoS=Number_Of_Samples\n", "\n", + "prompt= \"\" #@param{type:\"string\"}\n", "Disconnect_after_training=False #@param {type:\"boolean\"}\n", "\n", "#@markdown - Auto-disconnect from google colab after the training to avoid wasting compute units.\n", @@ -926,21 +991,22 @@ " --seed=$Seed \\\n", " --resolution=512 \\\n", " --mixed_precision=$precision \\\n", - " --train_batch_size=1 \\\n", + " --train_batch_size=$tbs \\\n", " --gradient_accumulation_steps=1 $GC \\\n", " --use_8bit_adam \\\n", " --learning_rate=$txlr \\\n", " --lr_scheduler=\"polynomial\" \\\n", - " --lr_warmup_steps=0 \\\n", + " --lr_warmup_steps=10 \\\n", " --max_train_steps=$Training_Steps\n", "\n", "def train_only_unet(stpsv, stp, SESSION_DIR, MODELT_NAME, INSTANCE_DIR, OUTPUT_DIR, PT, Seed, Res, precision, Training_Steps):\n", " clear_output()\n", " if resuming==\"Yes\":\n", " print('\u001b[1;32mResuming Training...\u001b[0m') \n", - " print('\u001b[1;33mTraining the UNet...\u001b[0m')\n", - " !accelerate launch /content/diffusers/examples/dreambooth/train_dreambooth.py \\\n", + " print('\u001b[1;33mTraining the UNet...\u001b[0m Saving every:'+str(stp)+' Steps')\n", + " !accelerate launch /content/dreamboothtrainers/Trainer.py \\\n", " $Style \\\n", + " $flip \\\n", " --image_captions_filename \\\n", " --train_only_unet \\\n", " --save_starting_step=$stpsv \\\n", @@ -950,15 +1016,19 @@ " --instance_data_dir=\"$INSTANCE_DIR\" \\\n", " --output_dir=\"$OUTPUT_DIR\" \\\n", " --instance_prompt=\"$PT\" \\\n", + " --n_save_sample=$NoS \\\n", + " --save_sample_prompt=\"$prompt\" \\\n", " --seed=$Seed \\\n", " --resolution=$Res \\\n", " --mixed_precision=$precision \\\n", - " --train_batch_size=1 \\\n", - " --gradient_accumulation_steps=1 $GC \\\n", + " --train_batch_size=$bs \\\n", + " --gradient_accumulation_steps=$gs $GC \\\n", " --use_8bit_adam \\\n", " --learning_rate=$untlr \\\n", - " --lr_scheduler=\"polynomial\" \\\n", - " --lr_warmup_steps=0 \\\n", + " --lr_scheduler=\"$lr_schedule\" \\\n", + " --Drop_out=$drop \\\n", + " --flip_rate=$flip_rate \\\n", + " --lr_warmup_steps=$wu \\\n", " --max_train_steps=$Training_Steps\n", "\n", "\n", @@ -967,15 +1037,28 @@ " if os.path.exists(OUTPUT_DIR+'/'+'text_encoder_trained'):\n", " %rm -r $OUTPUT_DIR\"/text_encoder_trained\"\n", " dump_only_textenc(trnonltxt, MODELT_NAME, INSTANCE_DIR, OUTPUT_DIR, PT, Seed, precision, Training_Steps=stptxt)\n", - "if Enable_Text_Encoder_Concept_Training and os.listdir(CONCEPT_DIR)!=[]:\n", - " clear_output()\n", - " if resuming==\"Yes\":\n", - " print('\u001b[1;32mResuming Training...\u001b[0m') \n", - " print('\u001b[1;33mTraining the text encoder on the 
concept...\u001b[0m')\n", - " dump_only_textenc(trnonltxt, MODELT_NAME, CONCEPT_DIR, OUTPUT_DIR, PT, Seed, precision, Training_Steps=stptxtc)\n", - "elif Enable_Text_Encoder_Concept_Training and os.listdir(CONCEPT_DIR)==[]:\n", - " print('\u001b[1;31mNo concept images found, skipping concept training...')\n", - " time.sleep(8)\n", + "\n", + "if Enable_Text_Encoder_Concept_Training:\n", + " if os.path.exists(CONCEPT_DIR):\n", + " if os.listdir(CONCEPT_DIR)!=[]:\n", + " # clear_output()\n", + " if resuming==\"Yes\":\n", + " print('\u001b[1;32mResuming Training...\u001b[0m') \n", + " print('\u001b[1;33mTraining the text encoder on the concept...\u001b[0m')\n", + " dump_only_textenc(trnonltxt, MODELT_NAME, CONCEPT_DIR, OUTPUT_DIR, PT, Seed, precision, Training_Steps=stptxtc)\n", + " else:\n", + " # clear_output()\n", + " if resuming==\"Yes\":\n", + " print('\u001b[1;32mResuming Training...\u001b[0m') \n", + " print('\u001b[1;31mNo concept images found, skipping concept training...')\n", + " time.sleep(8)\n", + " else:\n", + " #clear_output()\n", + " if resuming==\"Yes\":\n", + " print('\u001b[1;32mResuming Training...\u001b[0m')\n", + " print('\u001b[1;31mNo concept images found, skipping concept training...')\n", + " time.sleep(8)\n", + " \n", "if UNet_Training_Steps!=0:\n", " train_only_unet(stpsv, stp, SESSION_DIR, MODELT_NAME, INSTANCE_DIR, OUTPUT_DIR, PT, Seed, Res, precision, Training_Steps=UNet_Training_Steps)\n", " \n", @@ -984,9 +1067,9 @@ " prc=\"--fp16\" if precision==\"fp16\" else \"\"\n", " if V2:\n", " !python /content/diffusers/scripts/convertosdv2.py $prc $OUTPUT_DIR $SESSION_DIR/$Session_Name\".ckpt\"\n", - " clear_output()\n", + " #clear_output()\n", " if os.path.exists(SESSION_DIR+\"/\"+INSTANCE_NAME+'.ckpt'):\n", - " clear_output()\n", + " #clear_output()\n", " print(\"\u001b[1;32mDONE, the CKPT model is in your Gdrive in the sessions folder\")\n", " if Disconnect_after_training :\n", " time.sleep(20) \n", @@ -995,14 +1078,14 @@ " print(\"\u001b[1;31mSomething went wrong\") \n", " else: \n", " !wget -O /content/convertosd.py https://github.com/TheLastBen/fast-stable-diffusion/raw/main/Dreambooth/convertosd.py\n", - " clear_output()\n", + " #clear_output()\n", " if precision==\"no\":\n", " !sed -i '226s@.*@@' /content/convertosd.py\n", " !sed -i '201s@.*@ model_path = \"{OUTPUT_DIR}\"@' /content/convertosd.py\n", " !sed -i '202s@.*@ checkpoint_path= \"{SESSION_DIR}/{Session_Name}.ckpt\"@' /content/convertosd.py\n", " !python /content/convertosd.py\n", - " !rm /content/convertosd.py\n", - " clear_output()\n", + "\n", + " #clear_output()\n", " if os.path.exists(SESSION_DIR+\"/\"+INSTANCE_NAME+'.ckpt'): \n", " print(\"\u001b[1;32mDONE, the CKPT model is in your Gdrive in the sessions folder\")\n", " if Disconnect_after_training :\n", @@ -1051,7 +1134,7 @@ "\n", "#@markdown - Leave empty if you want to use the current trained model.\n", "\n", - "Use_Custom_Path = False #@param {type:\"boolean\"}\n", + "Use_Custom_Path = True #@param {type:\"boolean\"}\n", "\n", "try:\n", " INSTANCE_NAME\n", @@ -1097,15 +1180,15 @@ " %cd /content/\n", " !ln -s /content/gdrive/MyDrive/sd/stable-diffusion-webui/cache/huggingface ../root/.cache/\n", " !ln -s /content/gdrive/MyDrive/sd/stable-diffusion-webui/cache/torch ../root/.cache/\n", - " !wget -O /content/gdrive/MyDrive/sd/stable-diffusion-webui/modules/shared.py https://raw.githubusercontent.com/AUTOMATIC1111/stable-diffusion-webui/master/modules/shared.py\n", "\n", "if Update_repo:\n", - " with capture.capture_output() as cap: \n", - " !rm 
/content/gdrive/MyDrive/sd/stable-diffusion-webui/webui.sh \n", + " with capture.capture_output() as cap:\n", + " !rm /content/gdrive/MyDrive/sd/stable-diffusion-webui/webui.sh\n", " !rm /content/gdrive/MyDrive/sd/stable-diffusion-webui/modules/paths.py\n", - " !rm /content/gdrive/MyDrive/sd/stable-diffusion-webui/webui.py \n", + " !rm /content/gdrive/MyDrive/sd/stable-diffusion-webui/webui.py\n", " !rm /content/gdrive/MyDrive/sd/stable-diffusion-webui/modules/ui.py\n", " !rm /content/gdrive/MyDrive/sd/stable-diffusion-webui/style.css\n", + " !rm /content/gdrive/MyDrive/sd/stable-diffusion-webui/modules/shared.py\n", " %cd /content/gdrive/MyDrive/sd/stable-diffusion-webui/\n", " print('\u001b[1;32m')\n", " !git pull\n", @@ -1122,9 +1205,9 @@ " !git clone https://github.com/sczhou/CodeFormer\n", " !git clone https://github.com/crowsonkb/k-diffusion\n", " !mv /content/gdrive/MyDrive/sd/stablediffusion/src/CLIP /content/gdrive/MyDrive/sd/stablediffusion/src/clip\n", - " !mv /content/gdrive/MyDrive/sd/stablediffusion/src/BLIP /content/gdrive/MyDrive/sd/stablediffusion/src/blip \n", - " !mv /content/gdrive/MyDrive/sd/stablediffusion/src/CodeFormer /content/gdrive/MyDrive/sd/stablediffusion/src/codeformer \n", - " !cp -r /content/gdrive/MyDrive/sd/stablediffusion/src/k-diffusion/k_diffusion /content/gdrive/MyDrive/sd/stable-diffusion-webui/ \n", + " !mv /content/gdrive/MyDrive/sd/stablediffusion/src/BLIP /content/gdrive/MyDrive/sd/stablediffusion/src/blip\n", + " !mv /content/gdrive/MyDrive/sd/stablediffusion/src/CodeFormer /content/gdrive/MyDrive/sd/stablediffusion/src/codeformer\n", + " !cp -r /content/gdrive/MyDrive/sd/stablediffusion/src/k-diffusion/k_diffusion /content/gdrive/MyDrive/sd/stable-diffusion-webui/\n", "\n", "\n", "with capture.capture_output() as cap: \n", @@ -1135,14 +1218,14 @@ " if not os.path.exists('/tools/node/bin/lt'):\n", " !npm install -g localtunnel\n", "\n", - "with capture.capture_output() as cap: \n", + "with capture.capture_output() as cap:\n", " %cd /content/gdrive/MyDrive/sd/stable-diffusion-webui/\n", - " time.sleep(1)\n", " !wget -O webui.py https://raw.githubusercontent.com/AUTOMATIC1111/stable-diffusion-webui/master/webui.py\n", " !sed -i 's@ui.create_ui().*@ui.create_ui();shared.demo.queue(concurrency_count=999999,status_update_rate=0.1)@' /content/gdrive/MyDrive/sd/stable-diffusion-webui/webui.py\n", " %cd /content/gdrive/MyDrive/sd/stable-diffusion-webui/modules/\n", + " !wget -O shared.py https://raw.githubusercontent.com/AUTOMATIC1111/stable-diffusion-webui/master/modules/shared.py\n", " !wget -O ui.py https://raw.githubusercontent.com/AUTOMATIC1111/stable-diffusion-webui/master/modules/ui.py\n", - " !sed -i 's@css = \"\".*@with open(os.path.join(script_path, \"style.css\"), \"r\", encoding=\"utf8\") as file:\\n css = file.read()@' /content/gdrive/MyDrive/sd/stable-diffusion-webui/modules/ui.py \n", + " !sed -i 's@css = \"\".*@with open(os.path.join(script_path, \"style.css\"), \"r\", encoding=\"utf8\") as file:\\n css = file.read()@' /content/gdrive/MyDrive/sd/stable-diffusion-webui/modules/ui.py\n", " %cd /content/gdrive/MyDrive/sd/stable-diffusion-webui\n", " !wget -O style.css https://raw.githubusercontent.com/AUTOMATIC1111/stable-diffusion-webui/master/style.css\n", " !sed -i 's@min-height: 4.*@min-height: 5.5em;@g' /content/gdrive/MyDrive/sd/stable-diffusion-webui/style.css\n", @@ -1151,7 +1234,7 @@ " %cd /content\n", "\n", "\n", - "Use_Gradio_Server = True #@param {type:\"boolean\"}\n", + "Use_Gradio_Server = False #@param {type:\"boolean\"}\n", 
"#@markdown - Only if you have trouble connecting to the local server.\n", "\n", "Large_Model= False #@param {type:\"boolean\"}\n", @@ -1533,7 +1616,8 @@ "bbKbx185zqlz", "AaLtXBbPleBr" ], - "provenance": [] + "provenance": [], + "include_colab_link": true }, "kernelspec": { "display_name": "Python 3", @@ -1541,7 +1625,8 @@ }, "language_info": { "name": "python" - } + }, + "gpuClass": "standard" }, "nbformat": 4, "nbformat_minor": 0 diff --git a/README.md b/README.md index 2b96b07a..64dd2321 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,14 @@ -# fast-stable-diffusion Colab Notebooks, AUTOMATIC1111 + DreamBooth -Colab adaptations AUTOMATIC1111 Webui and Dreambooth, train your model using this easy simple and fast colab, all you have to do is enter you huggingface token once, and it will cache all the files in GDrive, including the trained model and you will be able to use it directly from the colab, make sure you use high quality reference pictures for the training, enjoy !! - - -
                                                   AUTOMATIC1111                                      DreamBooth - -
                                          - -              - +Code taken from:Huggingface, ShivamShrirao, XavieroXiao, and of course TheLastBen, whom this is forked and based off of https://github.com/ShivamShrirao/diffusers https://github.com/XavierXiao/Dreambooth-Stable-Diffusion https://github.com/TheLastBen/fast-stable-diffusion -[Step by Step guide](https://github.com/Excalibro1/fast-stable-diffusionwik/wiki/fast-stable-diffusion-wiki) by Excalibro1 +ImageGeneration during Training qul Ui update -Dreambooth paper : https://dreambooth.github.io/ +Planned Emphasis on 1.5 training i do not plan to remove 2.x but i may end up having ddefault settings that do not play well with 2.x, options will always be made available to change these, the Goal of this colab is to be simple but give the User a wide Variety of options that are not currently easily accessible. -SD implementation by @XavierXiao : https://github.com/XavierXiao/Dreambooth-Stable-Diffusion +save logs to gdrive combine unet and text encoder training if wanted + +[gold animation](https://user-images.githubusercontent.com/106923464/210042127-f07fb7da-5632-4b53-9932-e27cda5f6f6e.png) + +![prompt](https://user-images.githubusercontent.com/106923464/210042395-dcdae63a-eabf-420a-978d-d6d62a1e2f50.png) + + +Last bit of credit im going to give is to victorchall and his every EveryDream-Trainer Project he has been extremly helpful in learining about how these programs work EveryDream is not dreambooth, but i do recomend giving it a try for HUGE data sets or professional projects! diff --git a/train_dreambooth.py b/train_dreambooth.py new file mode 100644 index 00000000..65878b6a --- /dev/null +++ b/train_dreambooth.py @@ -0,0 +1,852 @@ +import argparse +import itertools +import math +import os +from pathlib import Path +from typing import Optional +import subprocess +import sys + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch.utils.data import Dataset + +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import set_seed +from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel +from diffusers.optimization import get_scheduler +from huggingface_hub import HfFolder, Repository, whoami +from PIL import Image +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import CLIPTextModel, CLIPTokenizer + + +logger = get_logger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Simple example of a training script.") + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--instance_data_dir", + type=str, + default=None, + required=True, + help="A folder containing the training data of instance images.", + ) + parser.add_argument( + "--class_data_dir", + type=str, + default=None, + required=False, + help="A folder containing the training data of class images.", + ) + parser.add_argument( + "--instance_prompt", + type=str, + default=None, + help="The prompt with identifier specifying the instance", + ) + parser.add_argument( + "--class_prompt", + type=str, + default="", + help="The prompt to specify images in the same class as provided instance 
images.", + ) + parser.add_argument( + "--with_prior_preservation", + default=False, + action="store_true", + help="Flag to add prior preservation loss.", + ) + parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") + parser.add_argument( + "--num_class_images", + type=int, + default=100, + help=( + "Minimal class images for prior preservation loss. If not have enough images, additional images will be" + " sampled with class_prompt." + ), + ) + parser.add_argument( + "--output_dir", + type=str, + default="", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution" + ) + parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder") + parser.add_argument( + "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument( + "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." + ) + parser.add_argument("--num_train_epochs", type=int, default=1) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-6, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." 
+ ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default="no", + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose" + "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10." + "and an Nvidia Ampere GPU." + ), + ) + + parser.add_argument( + "--save_n_steps", + type=int, + default=1, + help=("Save the model every n global_steps"), + ) + + + parser.add_argument( + "--save_starting_step", + type=int, + default=1, + help=("The step from which it starts saving intermediary checkpoints"), + ) + + parser.add_argument( + "--stop_text_encoder_training", + type=int, + default=1000000, + help=("The step at which the text_encoder is no longer trained"), + ) + + + parser.add_argument( + "--image_captions_filename", + action="store_true", + help="Get captions from filename", + ) + + + parser.add_argument( + "--dump_only_text_encoder", + action="store_true", + default=False, + help="Dump only text-encoder", + ) + + parser.add_argument( + "--train_only_unet", + action="store_true", + default=False, + help="Train only the unet", + ) + + parser.add_argument( + "--train_only_text_encoder", + action="store_true", + default=False, + help="Train only the text-encoder", + ) + + parser.add_argument( + "--Style", + action="store_true", + default=False, + help="Further reduce overfitting", + ) + + parser.add_argument( + "--Session_dir", + type=str, + default="", + help="Current session directory", + ) + + parser.add_argument( + "--external_captions", + action="store_true", + default=False, + help="Use captions stored in a txt file", + ) + + parser.add_argument( + "--captions_dir", + type=str, + default="", + help="The folder where captions files are stored", + ) + + + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + + args = parser.parse_args() + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + if args.instance_data_dir is None: + raise ValueError("You must specify a train data directory.") + + if args.with_prior_preservation: + if args.class_data_dir is None: + raise ValueError("You must specify a data directory for class images.") + if args.class_prompt is None: + raise ValueError("You must specify prompt for class images.") + + return args + + +class 
DreamBoothDataset(Dataset): + """ + A dataset to prepare the instance and class images with the prompts for fine-tuning the model. + It pre-processes the images and the tokenizes prompts. + """ + + def __init__( + self, + instance_data_root, + instance_prompt, + tokenizer, + args, + class_data_root=None, + class_prompt=None, + size=512, + center_crop=False, + ): + self.size = size + self.center_crop = center_crop + self.tokenizer = tokenizer + self.image_captions_filename = None + + self.instance_data_root = Path(instance_data_root) + if not self.instance_data_root.exists(): + raise ValueError("Instance images root doesn't exists.") + + self.instance_images_path = list(Path(instance_data_root).iterdir()) + self.num_instance_images = len(self.instance_images_path) + self.instance_prompt = instance_prompt + self._length = self.num_instance_images + + if args.image_captions_filename: + self.image_captions_filename = True + + if class_data_root is not None: + self.class_data_root = Path(class_data_root) + self.class_data_root.mkdir(parents=True, exist_ok=True) + self.class_images_path = list(self.class_data_root.iterdir()) + self.num_class_images = len(self.class_images_path) + self._length = max(self.num_class_images, self.num_instance_images) + self.class_prompt = class_prompt + else: + self.class_data_root = None + + self.image_transforms = transforms.Compose( + [ + transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def __len__(self): + return self._length + + def __getitem__(self, index, args=parse_args()): + example = {} + path = self.instance_images_path[index % self.num_instance_images] + instance_image = Image.open(path) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + + instance_prompt = self.instance_prompt + + if self.image_captions_filename: + filename = Path(path).stem + + pt=''.join([i for i in filename if not i.isdigit()]) + pt=pt.replace("_"," ") + pt=pt.replace("(","") + pt=pt.replace(")","") + pt=pt.replace("-","") + pt=pt.replace("conceptimagedb","") + + if args.external_captions: + cptpth=os.path.join(args.captions_dir, filename+'.txt') + if os.path.exists(cptpth): + with open(cptpth, "r") as f: + instance_prompt=pt+' '+f.read() + else: + instance_prompt=pt + else: + if args.Style: + instance_prompt = "" + else: + instance_prompt = pt + sys.stdout.write(" " +instance_prompt[:45]+" ") + sys.stdout.flush() + + + example["instance_images"] = self.image_transforms(instance_image) + example["instance_prompt_ids"] = self.tokenizer( + instance_prompt, + padding="do_not_pad", + truncation=True, + max_length=self.tokenizer.model_max_length, + ).input_ids + + if self.class_data_root: + class_image = Image.open(self.class_images_path[index % self.num_class_images]) + if not class_image.mode == "RGB": + class_image = class_image.convert("RGB") + example["class_images"] = self.image_transforms(class_image) + example["class_prompt_ids"] = self.tokenizer( + self.class_prompt, + padding="do_not_pad", + truncation=True, + max_length=self.tokenizer.model_max_length, + ).input_ids + + return example + + + +class PromptDataset(Dataset): + "A simple dataset to prepare the prompts to generate class images on multiple GPUs." 
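+    # (each item returns the class prompt plus an index; the index is later used to name the generated class images)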
+ + def __init__(self, prompt, num_samples): + self.prompt = prompt + self.num_samples = num_samples + + def __len__(self): + return self.num_samples + + def __getitem__(self, index): + example = {} + example["prompt"] = self.prompt + example["index"] = index + return example + + +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): + if token is None: + token = HfFolder.get_token() + if organization is None: + username = whoami(token)["name"] + return f"{username}/{model_id}" + else: + return f"{organization}/{model_id}" + + +def main(): + args = parse_args() + logging_dir = Path(args.output_dir, args.logging_dir) + i=args.save_starting_step + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with="tensorboard", + logging_dir=logging_dir, + ) + + # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate + # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models. + # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate. + if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1: + raise ValueError( + "Gradient accumulation is not supported when training the text encoder in distributed training. " + "Please set gradient_accumulation_steps to 1. This feature will be supported in the future." + ) + + if args.seed is not None: + set_seed(args.seed) + + if args.with_prior_preservation: + class_images_dir = Path(args.class_data_dir) + if not class_images_dir.exists(): + class_images_dir.mkdir(parents=True) + cur_class_images = len(list(class_images_dir.iterdir())) + + if cur_class_images < args.num_class_images: + torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32 + pipeline = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, torch_dtype=torch_dtype + ) + pipeline.set_progress_bar_config(disable=True) + + num_new_images = args.num_class_images - cur_class_images + logger.info(f"Number of class images to sample: {num_new_images}.") + + sample_dataset = PromptDataset(args.class_prompt, num_new_images) + sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) + + sample_dataloader = accelerator.prepare(sample_dataloader) + pipeline.to(accelerator.device) + + for example in tqdm( + sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process + ): + with torch.autocast("cuda"): + images = pipeline(example["prompt"]).images + + for i, image in enumerate(images): + image.save(class_images_dir / f"{example['index'][i] + cur_class_images}.jpg") + + del pipeline + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + if args.hub_model_id is None: + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) + else: + repo_name = args.hub_model_id + repo = Repository(args.output_dir, clone_from=repo_name) + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + # Load the tokenizer 
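+    # (use --tokenizer_name if explicitly provided, otherwise the tokenizer bundled with the pretrained model)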
+ if args.tokenizer_name: + tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name) + elif args.pretrained_model_name_or_path: + tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer") + + # Load models and create wrapper for stable diffusion + if args.train_only_unet or args.dump_only_text_encoder: + if os.path.exists(str(args.output_dir+"/text_encoder_trained")): + text_encoder = CLIPTextModel.from_pretrained(args.output_dir, subfolder="text_encoder_trained") + else: + text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder") + else: + text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder") + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") + unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet") + + vae.requires_grad_(False) + if not args.train_text_encoder: + text_encoder.requires_grad_(False) + + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + if args.train_text_encoder: + text_encoder.gradient_checkpointing_enable() + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." + ) + + optimizer_class = bnb.optim.AdamW8bit + else: + optimizer_class = torch.optim.AdamW + + params_to_optimize = ( + itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters() + ) + optimizer = optimizer_class( + params_to_optimize, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + noise_scheduler = DDPMScheduler.from_config(args.pretrained_model_name_or_path, subfolder="scheduler") + + train_dataset = DreamBoothDataset( + instance_data_root=args.instance_data_dir, + instance_prompt=args.instance_prompt, + class_data_root=args.class_data_dir if args.with_prior_preservation else None, + class_prompt=args.class_prompt, + tokenizer=tokenizer, + size=args.resolution, + center_crop=args.center_crop, + args=args, + ) + + def collate_fn(examples): + input_ids = [example["instance_prompt_ids"] for example in examples] + pixel_values = [example["instance_images"] for example in examples] + + # Concat class and instance examples for prior preservation. + # We do this to avoid doing two forward passes. + if args.with_prior_preservation: + input_ids += [example["class_prompt_ids"] for example in examples] + pixel_values += [example["class_images"] for example in examples] + + pixel_values = torch.stack(pixel_values) + pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() + + input_ids = tokenizer.pad({"input_ids": input_ids}, padding=True, return_tensors="pt").input_ids + + batch = { + "input_ids": input_ids, + "pixel_values": pixel_values, + } + return batch + + train_dataloader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.train_batch_size, shuffle=True, collate_fn=collate_fn + ) + + # Scheduler and math around the number of training steps. 
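+    # If --max_train_steps is not given, it is derived from --num_train_epochs and the dataloader length,
+    # then recomputed after accelerator.prepare() because the dataloader size may change.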
+ overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + ) + + if args.train_text_encoder: + unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, text_encoder, optimizer, train_dataloader, lr_scheduler + ) + else: + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, optimizer, train_dataloader, lr_scheduler + ) + + weight_dtype = torch.float32 + if args.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif args.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Move text_encode and vae to gpu. + # For mixed precision training we cast the text_encoder and vae weights to half-precision + # as these models are only used for inference, keeping weights in full precision is not required. + vae.to(accelerator.device, dtype=weight_dtype) + if not args.train_text_encoder: + text_encoder.to(accelerator.device, dtype=weight_dtype) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + accelerator.init_trackers("dreambooth", config=vars(args)) + + def bar(prg): + br='|'+'█' * prg + ' ' * (25-prg)+'|' + return br + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num batches each epoch = {len(train_dataloader)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + # Only show the progress bar once on each machine. 
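+    # The loop below encodes images to latents with the VAE, adds noise at a random timestep, has the UNet
+    # predict that noise (or its velocity for v-prediction models), and minimizes the MSE against the target.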
+ progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) + global_step = 0 + + for epoch in range(args.num_train_epochs): + unet.train() + if args.train_text_encoder: + text_encoder.train() + for step, batch in enumerate(train_dataloader): + with accelerator.accumulate(unet): + # Convert images to latent space + latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample() + latents = latents * 0.18215 + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) + timesteps = timesteps.long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Get the text embedding for conditioning + encoder_hidden_states = text_encoder(batch["input_ids"])[0] + + # Predict the noise residual + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + # Get the target for loss depending on the prediction type + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + + if args.with_prior_preservation: + # Chunk the noise and model_pred into two parts and compute the loss on each part separately. + model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) + target, target_prior = torch.chunk(target, 2, dim=0) + + # Compute instance loss + loss = F.mse_loss(model_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean() + + # Compute prior loss + prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") + + # Add the prior loss to the instance loss. 
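+                    # i.e. total loss = instance MSE + prior_loss_weight * class-image (prior preservation) MSE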
+ loss = loss + args.prior_loss_weight * prior_loss + else: + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + + accelerator.backward(loss) + if accelerator.sync_gradients: + params_to_clip = ( + itertools.chain(unet.parameters(), text_encoder.parameters()) + if args.train_text_encoder + else unet.parameters() + ) + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + + fll=round((global_step*100)/args.max_train_steps) + fll=round(fll/4) + pr=bar(fll) + + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + progress_bar.set_description_str("Progress:"+pr) + accelerator.log(logs, step=global_step) + + if global_step >= args.max_train_steps: + break + + if args.train_text_encoder and global_step == args.stop_text_encoder_training and global_step >= 5: + if accelerator.is_main_process: + print(" " +" Freezing the text_encoder ..."+" ") + frz_dir=args.output_dir + "/text_encoder_frozen" + if os.path.exists(frz_dir): + subprocess.call('rm -r '+ frz_dir, shell=True) + os.mkdir(frz_dir) + pipeline = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + unet=accelerator.unwrap_model(unet), + text_encoder=accelerator.unwrap_model(text_encoder), + ) + pipeline.text_encoder.save_pretrained(frz_dir) + + if args.save_n_steps >= 1: + if global_step < args.max_train_steps and global_step+1==i: + ckpt_name = "_step_" + str(global_step+1) + save_dir = Path(args.output_dir+ckpt_name) + save_dir=str(save_dir) + save_dir=save_dir.replace(" ", "_") + if not os.path.exists(save_dir): + os.mkdir(save_dir) + inst=save_dir[16:] + inst=inst.replace(" ", "_") + print(" SAVING CHECKPOINT...") + # Create the pipeline using the trained modules and save it. + if accelerator.is_main_process: + pipeline = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + unet=accelerator.unwrap_model(unet), + text_encoder=accelerator.unwrap_model(text_encoder), + ) + pipeline.save_pretrained(save_dir) + frz_dir=args.output_dir + "/text_encoder_frozen" + if args.train_text_encoder and os.path.exists(frz_dir): + subprocess.call('rm -r '+save_dir+'/text_encoder/*.*', shell=True) + subprocess.call('cp -f '+frz_dir +'/*.* '+ save_dir+'/text_encoder', shell=True) + chkpth=args.Session_dir+"/"+inst+".ckpt" + if args.mixed_precision=="fp16": + subprocess.call('python /content/diffusers/scripts/convertosdv2.py ' + save_dir + ' ' + chkpth + ' --fp16', shell=True) + else: + subprocess.call('python /content/diffusers/scripts/convertosdv2.py ' + save_dir + ' ' + chkpth, shell=True) + print("Done, resuming training ...") + subprocess.call('rm -r '+ save_dir, shell=True) + i=i+args.save_n_steps + + if args.external_captions and global_step == args.stop_text_encoder_training and global_step >= 5: + subprocess.call('mv '+args.captions_dir+' '+args.captions_dir+'off', shell=True) + + accelerator.wait_for_everyone() + + # Create the pipeline using using the trained modules and save it. 
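+    # Depending on --dump_only_text_encoder / --train_only_unet, this saves either just the trained text encoder,
+    # the pipeline with the freshly trained UNet, or the fully trained pipeline to --output_dir.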
+ if accelerator.is_main_process: + if args.dump_only_text_encoder: + txt_dir=args.output_dir + "/text_encoder_trained" + if args.train_only_text_encoder: + pipeline = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + text_encoder=accelerator.unwrap_model(text_encoder), + ) + pipeline.save_pretrained(args.output_dir) + else: + if not os.path.exists(txt_dir): + os.mkdir(txt_dir) + pipeline = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + unet=accelerator.unwrap_model(unet), + text_encoder=accelerator.unwrap_model(text_encoder), + ) + pipeline.text_encoder.save_pretrained(txt_dir) + + elif args.train_only_unet: + pipeline = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + unet=accelerator.unwrap_model(unet), + text_encoder=accelerator.unwrap_model(text_encoder), + ) + pipeline.save_pretrained(args.output_dir) + txt_dir=args.output_dir + "/text_encoder_trained" + if os.path.exists(txt_dir): + subprocess.call('rm -r '+txt_dir, shell=True) + + else: + pipeline = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + unet=accelerator.unwrap_model(unet), + text_encoder=accelerator.unwrap_model(text_encoder), + ) + frz_dir=args.output_dir + "/text_encoder_frozen" + pipeline.save_pretrained(args.output_dir) + if args.train_text_encoder and os.path.exists(frz_dir): + subprocess.call('mv -f '+frz_dir +'/*.* '+ args.output_dir+'/text_encoder', shell=True) + subprocess.call('rm -r '+ frz_dir, shell=True) + + if args.push_to_hub: + repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) + + if os.path.exists(args.captions_dir+'off'): + subprocess.call('mv '+args.captions_dir+'off '+args.captions_dir, shell=True) + + + accelerator.end_training() + +if __name__ == "__main__": + main()
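+
+# Example invocation (a sketch only; the paths and step counts below are placeholders, not values from this
+# repo, and intermediate-checkpoint conversion assumes the Colab layout with /content/diffusers present):
+#
+#   accelerate launch train_dreambooth.py \
+#     --pretrained_model_name_or_path="/content/stable-diffusion-custom" \
+#     --instance_data_dir="/content/instance_images" \
+#     --output_dir="/content/output" \
+#     --instance_prompt="" \
+#     --image_captions_filename \
+#     --train_only_unet \
+#     --use_8bit_adam \
+#     --mixed_precision="fp16" \
+#     --learning_rate=2e-6 \
+#     --lr_scheduler="polynomial" \
+#     --train_batch_size=1 \
+#     --gradient_accumulation_steps=1 \
+#     --save_starting_step=500 \
+#     --save_n_steps=500 \
+#     --Session_dir="/content/session" \
+#     --max_train_steps=1500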