[Feature] Support AnimateDiff, a popular text2animation method (#1980)
* first commit for animatediff

* fix lint errors

* modify readme file and add readme_zh-CN.md

* fix some typos in readme

* delete test_animatediff.py

* add some docstring

* fix cross attention for 512*512 animation quality

* fix some initial setting for cpu load

* add unittest samples

* modify unittest codes

* remove duplicated unittest files

* modify unittest codes for minimum memory

* modify test_unet3d resolution for minimum memory unittest

* modify test_unet_blocks3d input resolution for minimum memory unittest

* modify animatediff.py for gradio

* add gradio app for animatediff

* skip test with large memory

* fix environment building

* fix merging conflict

* Add different style ckpt

* fix environment building

* add new motion module

* add prompts for all config files in README

* add image in README

* fix sd ckpt auto downloading

* remove unused import in test code

* align README_zh and README

* fix building error

* delete unused comments

* fix test memory

* fix text_model error for later transformer version

* fix comment copyright

* add animatediff gradio README

* modify some copyright in motion_module.py

* modify README for better test guidance

* fix inference without xformers and mimsave for higher versions of imageio

* fix errors in different versions of imageio

* add train tutorial and pretrained models

* fix some comments in README

* delete personal information

* fix gradio sd selection

* add some tips for running gradio

* add pretrained links

---------

Co-authored-by: rangoliu <[email protected]>
ElliotQi and liuwenran authored Sep 20, 2023
1 parent 9e55603 commit bd7c295
Showing 27 changed files with 6,518 additions and 1 deletion.
221 changes: 221 additions & 0 deletions configs/animatediff/README.md

Large diffs are not rendered by default.

220 changes: 220 additions & 0 deletions configs/animatediff/README_zh-CN.md

Large diffs are not rendered by default.

61 changes: 61 additions & 0 deletions configs/animatediff/animatediff_Lyriel.py
@@ -0,0 +1,61 @@
# config for model
stable_diffusion_v15_url = 'runwayml/stable-diffusion-v1-5'
models_path = '/home/AnimateDiff/models/'
randomness = dict(
seed=[
10917152860782582783, 6399018107401806238, 15875751942533906793,
6653196880059936551
],
diff_rank_seed=True)

diffusion_scheduler = dict(
type='DDIMScheduler',
beta_end=0.012,
beta_schedule='linear',
beta_start=0.00085,
num_train_timesteps=1000,
prediction_type='epsilon',
set_alpha_to_one=True,
clip_sample=False,
thresholding=False,
steps_offset=1)

model = dict(
type='AnimateDiff',
vae=dict(
type='AutoencoderKL',
from_pretrained=stable_diffusion_v15_url,
subfolder='vae'),
unet=dict(
type='UNet3DConditionMotionModel',
unet_use_cross_frame_attention=False,
unet_use_temporal_attention=False,
use_motion_module=True,
motion_module_resolutions=[1, 2, 4, 8],
motion_module_mid_block=False,
motion_module_decoder_only=False,
motion_module_type='Vanilla',
motion_module_kwargs=dict(
num_attention_heads=8,
num_transformer_block=1,
attention_block_types=['Temporal_Self', 'Temporal_Self'],
temporal_position_encoding=True,
temporal_position_encoding_max_len=24,
temporal_attention_dim_div=1),
subfolder='unet',
from_pretrained=stable_diffusion_v15_url),
text_encoder=dict(
type='ClipWrapper',
clip_type='huggingface',
pretrained_model_name_or_path=stable_diffusion_v15_url,
subfolder='text_encoder'),
tokenizer=stable_diffusion_v15_url,
scheduler=diffusion_scheduler,
test_scheduler=diffusion_scheduler,
data_preprocessor=dict(type='DataPreprocessor'),
motion_module_cfg=dict(path=models_path + 'Motion_Module/mm_sd_v14.ckpt'),
dream_booth_lora_cfg=dict(
type='ToonYou',
path=models_path + 'DreamBooth_LoRA/lyriel_v16.safetensors',
steps=25,
guidance_scale=7.5))
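
A minimal usage sketch (not part of this diff), assuming MMagic's `register_all_modules` helper and `MODELS` registry, and that the checkpoints under `models_path` have been downloaded as described in configs/animatediff/README.md:

# Build the AnimateDiff model from one of the configs above.
from mmengine import Config
from mmagic.registry import MODELS
from mmagic.utils import register_all_modules

register_all_modules()  # register MMagic components with the MMEngine registry

cfg = Config.fromfile('configs/animatediff/animatediff_Lyriel.py')
# Building the model pulls the SD v1.5 weights from
# 'runwayml/stable-diffusion-v1-5' and loads the motion-module and
# DreamBooth LoRA checkpoints from `models_path`.
animatediff = MODELS.build(cfg.model).cuda().eval()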
62 changes: 62 additions & 0 deletions configs/animatediff/animatediff_MajicMix.py
@@ -0,0 +1,62 @@
# config for model
stable_diffusion_v15_url = 'runwayml/stable-diffusion-v1-5'
models_path = '/home/AnimateDiff/models/'
randomness = dict(
seed=[
1572448948722921032, 1099474677988590681, 6488833139725635347,
18339859844376517918
],
diff_rank_seed=True)

diffusion_scheduler = dict(
type='DDIMScheduler',
beta_end=0.012,
beta_schedule='linear',
beta_start=0.00085,
num_train_timesteps=1000,
prediction_type='epsilon',
set_alpha_to_one=True,
clip_sample=False,
thresholding=False,
steps_offset=1)

model = dict(
type='AnimateDiff',
vae=dict(
type='AutoencoderKL',
from_pretrained=stable_diffusion_v15_url,
subfolder='vae'),
unet=dict(
type='UNet3DConditionMotionModel',
unet_use_cross_frame_attention=False,
unet_use_temporal_attention=False,
use_motion_module=True,
motion_module_resolutions=[1, 2, 4, 8],
motion_module_mid_block=False,
motion_module_decoder_only=False,
motion_module_type='Vanilla',
motion_module_kwargs=dict(
num_attention_heads=8,
num_transformer_block=1,
attention_block_types=['Temporal_Self', 'Temporal_Self'],
temporal_position_encoding=True,
temporal_position_encoding_max_len=24,
temporal_attention_dim_div=1),
subfolder='unet',
from_pretrained=stable_diffusion_v15_url),
text_encoder=dict(
type='ClipWrapper',
clip_type='huggingface',
pretrained_model_name_or_path=stable_diffusion_v15_url,
subfolder='text_encoder'),
tokenizer=stable_diffusion_v15_url,
scheduler=diffusion_scheduler,
test_scheduler=diffusion_scheduler,
data_preprocessor=dict(type='DataPreprocessor'),
motion_module_cfg=dict(path=models_path + 'Motion_Module/mm_sd_v14.ckpt'),
dream_booth_lora_cfg=dict(
type='ToonYou',
path=models_path +
'DreamBooth_LoRA/majicmixRealistic_v5Preview.safetensors',
steps=25,
guidance_scale=7.5))
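
The v1 motion-module configs in this commit differ only in `randomness.seed` and the DreamBooth LoRA checkpoint path, so a new style can be tried by overriding those fields instead of copying a file. A sketch, assuming mmengine's attribute-style config access (the checkpoint filename below is hypothetical):

from mmengine import Config

cfg = Config.fromfile('configs/animatediff/animatediff_MajicMix.py')
# Swap in a different personalized checkpoint (hypothetical filename).
cfg.model.dream_booth_lora_cfg.path = (
    cfg.models_path + 'DreamBooth_LoRA/my_style.safetensors')
cfg.randomness.seed = [42, 43, 44, 45]  # one seed per generated clip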
61 changes: 61 additions & 0 deletions configs/animatediff/animatediff_RcnzCartoon.py
@@ -0,0 +1,61 @@
# config for model
stable_diffusion_v15_url = 'runwayml/stable-diffusion-v1-5'
models_path = '/home/AnimateDiff/models/'
randomness = dict(
seed=[
16931037867122267877, 2094308009433392066, 4292543217695451092,
15572665120852309890
],
diff_rank_seed=True)

diffusion_scheduler = dict(
type='DDIMScheduler',
beta_end=0.012,
beta_schedule='linear',
beta_start=0.00085,
num_train_timesteps=1000,
prediction_type='epsilon',
set_alpha_to_one=True,
clip_sample=False,
thresholding=False,
steps_offset=1)

model = dict(
type='AnimateDiff',
vae=dict(
type='AutoencoderKL',
from_pretrained=stable_diffusion_v15_url,
subfolder='vae'),
unet=dict(
type='UNet3DConditionMotionModel',
unet_use_cross_frame_attention=False,
unet_use_temporal_attention=False,
use_motion_module=True,
motion_module_resolutions=[1, 2, 4, 8],
motion_module_mid_block=False,
motion_module_decoder_only=False,
motion_module_type='Vanilla',
motion_module_kwargs=dict(
num_attention_heads=8,
num_transformer_block=1,
attention_block_types=['Temporal_Self', 'Temporal_Self'],
temporal_position_encoding=True,
temporal_position_encoding_max_len=24,
temporal_attention_dim_div=1),
subfolder='unet',
from_pretrained=stable_diffusion_v15_url),
text_encoder=dict(
type='ClipWrapper',
clip_type='huggingface',
pretrained_model_name_or_path=stable_diffusion_v15_url,
subfolder='text_encoder'),
tokenizer=stable_diffusion_v15_url,
scheduler=diffusion_scheduler,
test_scheduler=diffusion_scheduler,
data_preprocessor=dict(type='DataPreprocessor'),
motion_module_cfg=dict(path=models_path + 'Motion_Module/mm_sd_v14.ckpt'),
dream_booth_lora_cfg=dict(
type='ToonYou',
path=models_path + 'DreamBooth_LoRA/rcnzCartoon3d_v10.safetensors',
steps=25,
guidance_scale=7.5))
62 changes: 62 additions & 0 deletions configs/animatediff/animatediff_RealisticVision.py
@@ -0,0 +1,62 @@
# config for model
stable_diffusion_v15_url = 'runwayml/stable-diffusion-v1-5'
models_path = '/home/AnimateDiff/models/'
randomness = dict(
seed=[
5658137986800322009, 12099779162349365895, 10499524853910852697,
16768009035333711932
],
diff_rank_seed=True)

diffusion_scheduler = dict(
type='DDIMScheduler',
beta_end=0.012,
beta_schedule='linear',
beta_start=0.00085,
num_train_timesteps=1000,
prediction_type='epsilon',
set_alpha_to_one=True,
clip_sample=False,
thresholding=False,
steps_offset=1)

model = dict(
type='AnimateDiff',
vae=dict(
type='AutoencoderKL',
from_pretrained=stable_diffusion_v15_url,
subfolder='vae'),
unet=dict(
type='UNet3DConditionMotionModel',
unet_use_cross_frame_attention=False,
unet_use_temporal_attention=False,
use_motion_module=True,
motion_module_resolutions=[1, 2, 4, 8],
motion_module_mid_block=False,
motion_module_decoder_only=False,
motion_module_type='Vanilla',
motion_module_kwargs=dict(
num_attention_heads=8,
num_transformer_block=1,
attention_block_types=['Temporal_Self', 'Temporal_Self'],
temporal_position_encoding=True,
temporal_position_encoding_max_len=24,
temporal_attention_dim_div=1),
subfolder='unet',
from_pretrained=stable_diffusion_v15_url),
text_encoder=dict(
type='ClipWrapper',
clip_type='huggingface',
pretrained_model_name_or_path=stable_diffusion_v15_url,
subfolder='text_encoder'),
tokenizer=stable_diffusion_v15_url,
scheduler=diffusion_scheduler,
test_scheduler=diffusion_scheduler,
data_preprocessor=dict(type='DataPreprocessor'),
motion_module_cfg=dict(path=models_path + 'Motion_Module/mm_sd_v14.ckpt'),
dream_booth_lora_cfg=dict(
type='ToonYou',
path=models_path +
'DreamBooth_LoRA/realisticVisionV20_v20.safetensors',
steps=25,
guidance_scale=7.5))
64 changes: 64 additions & 0 deletions configs/animatediff/animatediff_RealisticVision_v2.py
@@ -0,0 +1,64 @@
# config for model
stable_diffusion_v15_url = 'runwayml/stable-diffusion-v1-5'
models_path = '/home/AnimateDiff/models/'
randomness = dict(
seed=[
13100322578370451493, 14752961627088720670, 9329399085567825781,
16987697414827649302
],
diff_rank_seed=True)

diffusion_scheduler = dict(
type='DDIMScheduler',
beta_end=0.012,
beta_schedule='linear',
beta_start=0.00085,
num_train_timesteps=1000,
prediction_type='epsilon',
set_alpha_to_one=True,
clip_sample=False,
thresholding=False,
steps_offset=1)

model = dict(
type='AnimateDiff',
vae=dict(
type='AutoencoderKL',
from_pretrained=stable_diffusion_v15_url,
subfolder='vae'),
unet=dict(
type='UNet3DConditionMotionModel',
use_inflated_groupnorm=True,
unet_use_cross_frame_attention=False,
unet_use_temporal_attention=False,
use_motion_module=True,
motion_module_resolutions=[1, 2, 4, 8],
motion_module_mid_block=True,
motion_module_decoder_only=False,
motion_module_type='Vanilla',
motion_module_kwargs=dict(
num_attention_heads=8,
num_transformer_block=1,
attention_block_types=['Temporal_Self', 'Temporal_Self'],
temporal_position_encoding=True,
temporal_position_encoding_max_len=32,
temporal_attention_dim_div=1),
subfolder='unet',
from_pretrained=stable_diffusion_v15_url),
text_encoder=dict(
type='ClipWrapper',
clip_type='huggingface',
pretrained_model_name_or_path=stable_diffusion_v15_url,
subfolder='text_encoder'),
tokenizer=stable_diffusion_v15_url,
scheduler=diffusion_scheduler,
test_scheduler=diffusion_scheduler,
data_preprocessor=dict(type='DataPreprocessor'),
motion_module_cfg=dict(path=models_path +
'Motion_Module/mm_sd_v15_v2.ckpt'),
dream_booth_lora_cfg=dict(
type='ToonYou',
path=models_path +
'DreamBooth_LoRA/realisticVisionV20_v20.safetensors',
steps=25,
guidance_scale=7.5))
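
Relative to the v1 configs, this variant changes only the motion-module checkpoint (mm_sd_v15_v2.ckpt) and three UNet settings. A sketch of expressing those deltas as an override of the v1 config, assuming mmengine's `Config.merge_from_dict`:

from mmengine import Config

cfg = Config.fromfile('configs/animatediff/animatediff_RealisticVision.py')
cfg.merge_from_dict(
    dict(
        model=dict(
            unet=dict(
                use_inflated_groupnorm=True,   # v2 UNet: inflated GroupNorm
                motion_module_mid_block=True,  # v2 adds motion layers to the mid block
                motion_module_kwargs=dict(
                    # longer temporal positional encoding (24 -> 32)
                    temporal_position_encoding_max_len=32)),
            motion_module_cfg=dict(
                path=cfg.models_path + 'Motion_Module/mm_sd_v15_v2.ckpt'))))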
80 changes: 80 additions & 0 deletions configs/animatediff/animatediff_ToonYou.py
@@ -0,0 +1,80 @@
# config for model
stable_diffusion_v15_url = 'runwayml/stable-diffusion-v1-5'
models_path = '/home/AnimateDiff/models/'
randomness = dict(
seed=[
10788741199826055526, 6520604954829636163, 6519455744612555650,
16372571278361863751
],
diff_rank_seed=True)

val_prompts = [
'best quality, masterpiece, 1girl, looking at viewer,\
blurry background, upper body, contemporary, dress',
'masterpiece, best quality, 1girl, solo, cherry blossoms,\
hanami, pink flower, white flower, spring season, wisteria,\
petals, flower, plum blossoms, outdoors, falling petals,\
white hair, black eyes,',
'best quality, masterpiece, 1boy, formal, abstract,\
looking at viewer, masculine, marble pattern',
'best quality, masterpiece, 1girl, cloudy sky,\
dandelion, contrapposto, alternate hairstyle,'
]
val_neg_prompts = [
'',
'badhandv4,easynegative,ng_deepnegative_v1_75t,verybadimagenegative_v1.3,\
bad-artist, bad_prompt_version2-neg, teeth',
'',
'',
]
diffusion_scheduler = dict(
type='DDIMScheduler',
beta_end=0.012,
beta_schedule='linear',
beta_start=0.00085,
num_train_timesteps=1000,
prediction_type='epsilon',
set_alpha_to_one=True,
clip_sample=False,
thresholding=False,
steps_offset=1)

model = dict(
type='AnimateDiff',
vae=dict(
type='AutoencoderKL',
from_pretrained=stable_diffusion_v15_url,
subfolder='vae'),
unet=dict(
type='UNet3DConditionMotionModel',
unet_use_cross_frame_attention=False,
unet_use_temporal_attention=False,
use_motion_module=True,
motion_module_resolutions=[1, 2, 4, 8],
motion_module_mid_block=False,
motion_module_decoder_only=False,
motion_module_type='Vanilla',
motion_module_kwargs=dict(
num_attention_heads=8,
num_transformer_block=1,
attention_block_types=['Temporal_Self', 'Temporal_Self'],
temporal_position_encoding=True,
temporal_position_encoding_max_len=24,
temporal_attention_dim_div=1),
subfolder='unet',
from_pretrained=stable_diffusion_v15_url),
text_encoder=dict(
type='ClipWrapper',
clip_type='huggingface',
pretrained_model_name_or_path=stable_diffusion_v15_url,
subfolder='text_encoder'),
tokenizer=stable_diffusion_v15_url,
scheduler=diffusion_scheduler,
test_scheduler=diffusion_scheduler,
data_preprocessor=dict(type='DataPreprocessor'),
motion_module_cfg=dict(path=models_path + 'Motion_Module/mm_sd_v14.ckpt'),
dream_booth_lora_cfg=dict(
type='ToonYou',
path=models_path + 'DreamBooth_LoRA/toonyou_beta3.safetensors',
steps=25,
guidance_scale=7.5))
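
This config also carries validation prompts; below is a sketch of a prompt loop over them. The `infer` call and its output key are assumptions, not an API confirmed by this diff — see configs/animatediff/README.md for the supported entry point:

import imageio
from mmengine import Config
from mmagic.registry import MODELS
from mmagic.utils import register_all_modules

register_all_modules()
cfg = Config.fromfile('configs/animatediff/animatediff_ToonYou.py')
animatediff = MODELS.build(cfg.model).cuda().eval()

for i, (prompt, neg) in enumerate(zip(cfg.val_prompts, cfg.val_neg_prompts)):
    out = animatediff.infer(     # assumed entry point and signature
        prompt,
        negative_prompt=neg,
        video_length=16,         # frames per clip
        height=512,
        width=512,
        seed=cfg.randomness.seed[i])
    # Newer imageio versions take `duration` instead of `fps`,
    # as noted in the commit history above.
    imageio.mimsave(f'sample_{i}.gif', out['samples'], fps=8)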