Merge branch 'main' into bnb-follow-up
sayakpaul authored Oct 22, 2024
2 parents 3dbe41f + b0ffe92 commit 8a99701
Showing 75 changed files with 706 additions and 198 deletions.
10 changes: 9 additions & 1 deletion docs/source/en/api/pipelines/controlnet_flux.md
@@ -1,4 +1,4 @@
<!--Copyright 2024 The HuggingFace Team and The InstantX Team. All rights reserved.
<!--Copyright 2024 The HuggingFace Team, The InstantX Team, and the XLabs Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
@@ -31,6 +31,14 @@ This controlnet code is implemented by [The InstantX Team](https://huggingface.co/InstantX).
| Depth | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/Shakker-Labs/FLUX.1-dev-ControlNet-Depth) |
| Union | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Union) |

XLabs ControlNets are also supported; these were contributed by the [XLabs Team](https://huggingface.co/XLabs-AI). A usage sketch follows the table below.

| ControlNet type | Developer | Link |
| -------- | ---------- | ---- |
| Canny | [The XLabs Team](https://huggingface.co/XLabs-AI) | [Link](https://huggingface.co/XLabs-AI/flux-controlnet-canny-diffusers) |
| Depth | [The XLabs Team](https://huggingface.co/XLabs-AI) | [Link](https://huggingface.co/XLabs-AI/flux-controlnet-depth-diffusers) |
| HED | [The XLabs Team](https://huggingface.co/XLabs-AI) | [Link](https://huggingface.co/XLabs-AI/flux-controlnet-hed-diffusers) |
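
For reference, here is a minimal sketch of running one of these checkpoints with `FluxControlNetPipeline`. The prompt, control-image path, and sampler settings are illustrative placeholders, not tuned recommendations:

```py
import torch
from diffusers import FluxControlNetModel, FluxControlNetPipeline
from diffusers.utils import load_image

controlnet = FluxControlNetModel.from_pretrained(
    "XLabs-AI/flux-controlnet-canny-diffusers", torch_dtype=torch.bfloat16
)
pipe = FluxControlNetPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", controlnet=controlnet, torch_dtype=torch.bfloat16
).to("cuda")

# Any Canny edge map can serve as conditioning; this path is a placeholder.
control_image = load_image("path/to/canny_edge_map.png")
image = pipe(
    "a futuristic city at dusk",
    control_image=control_image,
    controlnet_conditioning_scale=0.6,
    num_inference_steps=28,
    guidance_scale=3.5,
).images[0]
image.save("flux_controlnet_canny.png")
```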


<Tip>

@@ -54,6 +54,11 @@ image = pipe(
image.save("sd3_hello_world.png")
```

**Note:** Stable Diffusion 3.5 can also be run with the SD3 pipeline, and all of the optimizations and techniques mentioned here apply to it as well (a sketch follows the list below). In total, there are three official models in the SD3 family:
- [`stabilityai/stable-diffusion-3-medium-diffusers`](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers)
- [`stabilityai/stable-diffusion-3.5-large`](https://huggingface.co/stabilityai/stable-diffusion-3.5-large)
- [`stabilityai/stable-diffusion-3.5-large-turbo`](https://huggingface.co/stabilityai/stable-diffusion-3.5-large-turbo)
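
As a minimal sketch of this (the SD3.5 checkpoints are gated, and the sampler settings below are illustrative rather than tuned recommendations):

```py
import torch
from diffusers import StableDiffusion3Pipeline

# SD3.5 loads through the same pipeline class as SD3.
pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large", torch_dtype=torch.bfloat16
).to("cuda")
image = pipe(
    "a photo of a cat holding a sign that says hello world",
    num_inference_steps=28,
    guidance_scale=4.5,
).images[0]
image.save("sd3_5_hello_world.png")
```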

## Memory Optimizations for SD3

SD3 uses three text encoders, one of which is the very large T5-XXL model. This makes it challenging to run the model on GPUs with less than 24GB of VRAM, even when using `fp16` precision. The following section outlines a few memory optimizations in Diffusers that make it easier to run SD3 on low-resource hardware.
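
One such optimization is model CPU offloading, sketched below (this assumes `accelerate` is installed; exact savings depend on hardware):

```py
import torch
from diffusers import StableDiffusion3Pipeline

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
)
# Each component is moved to the GPU only for its forward pass and offloaded
# back to CPU afterwards, trading speed for a lower peak VRAM footprint.
pipe.enable_model_cpu_offload()
image = pipe("a photo of a cat", num_inference_steps=28).images[0]
```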
11 changes: 5 additions & 6 deletions examples/community/README.md
@@ -4336,19 +4336,19 @@ The Abstract of the paper:

**64x64**
:-------------------------:
| <img src="https://github.com/user-attachments/assets/9e7bb2cd-45a0-4bd1-adb8-23e283baed39" width="222" height="222" alt="bird_64"> |
| <img src="https://github.com/user-attachments/assets/032738eb-c6cd-4fd9-b4d7-a7317b4b6528" width="222" height="222" alt="bird_64_64"> |

- `256×256, nesting_level=1`: 1.776 GiB. With `150` DDIM inference steps:

**64x64** | **256x256**
:-------------------------:|:-------------------------:
| <img src="https://github.com/user-attachments/assets/6b724c2e-5e6a-4b63-9b65-c1182cbb67e0" width="222" height="222" alt="64x64"> | <img src="https://github.com/user-attachments/assets/7dbab2ad-bf40-4a73-ab04-f178347cb7d5" width="222" height="222" alt="256x256"> |
| <img src="https://github.com/user-attachments/assets/21b9ad8b-eea6-4603-80a2-31180f391589" width="222" height="222" alt="bird_256_64"> | <img src="https://github.com/user-attachments/assets/fc411682-8a36-422c-9488-395b77d4406e" width="222" height="222" alt="bird_256_256"> |

- `1024×1024, nesting_level=2`: 1.792 GiB. As one can realize the cost of adding another layer is really negligible. With `250` DDIM inference steps:
- `1024×1024, nesting_level=2`: 1.792 GiB. As one can see, the cost of adding another layer is negligible in this context! With `250` DDIM inference steps:

**64x64** | **256x256** | **1024x1024**
:-------------------------:|:-------------------------:|:-------------------------:
| <img src="https://github.com/user-attachments/assets/4a9454e4-e20a-4736-a196-270e2ae796c0" width="222" height="222" alt="64x64"> | <img src="https://github.com/user-attachments/assets/4a96555d-0fda-4303-82b1-a4d886f770b9" width="222" height="222" alt="256x256"> | <img src="https://github.com/user-attachments/assets/e0239b7a-ab73-4d45-8f3e-b4e6b4b50abe" width="222" height="222" alt="1024x1024"> |
| <img src="https://github.com/user-attachments/assets/febf4b98-3dee-4a8e-9946-fd42e1f232e6" width="222" height="222" alt="bird_1024_64"> | <img src="https://github.com/user-attachments/assets/c5f85b40-5d6d-4267-a92a-c89dff015b9b" width="222" height="222" alt="bird_1024_256"> | <img src="https://github.com/user-attachments/assets/ad66b913-4367-4cb9-889e-bc06f4d96148" width="222" height="222" alt="bird_1024_1024"> |

```py
from diffusers import DiffusionPipeline
@@ -4362,8 +4362,7 @@ pipe = DiffusionPipeline.from_pretrained("tolgacangoz/matryoshka-diffusion-models

prompt0 = "a blue jay stops on the top of a helmet of Japanese samurai, background with sakura tree"
prompt = f"breathtaking {prompt0}. award-winning, professional, highly detailed"
negative_prompt = "deformed, mutated, ugly, disfigured, blur, blurry, noise, noisy"
image = pipe(prompt=prompt, negative_prompt=negative_prompt, num_inference_steps=50).images
image = pipe(prompt, num_inference_steps=50).images
make_image_grid(image, rows=1, cols=len(image))

# pipe.change_nesting_level(<int>) # 0, 1, or 2
28 changes: 16 additions & 12 deletions examples/community/matryoshka.py
@@ -107,15 +107,16 @@
>>> # nesting_level=0 -> 64x64; nesting_level=1 -> 256x256 - 64x64; nesting_level=2 -> 1024x1024 - 256x256 - 64x64
>>> pipe = DiffusionPipeline.from_pretrained("tolgacangoz/matryoshka-diffusion-models",
>>> custom_pipeline="matryoshka").to("cuda")
... nesting_level=0,
... trust_remote_code=False, # One needs to give permission for this code to run
... ).to("cuda")
>>> prompt0 = "a blue jay stops on the top of a helmet of Japanese samurai, background with sakura tree"
>>> prompt = f"breathtaking {prompt0}. award-winning, professional, highly detailed"
>>> negative_prompt = "deformed, mutated, ugly, disfigured, blur, blurry, noise, noisy"
>>> image = pipe(prompt=prompt, negative_prompt=negative_prompt, num_inference_steps=50).images
>>> image = pipe(prompt, num_inference_steps=50).images
>>> make_image_grid(image, rows=1, cols=len(image))
>>> pipe.change_nesting_level(<int>) # 0, 1, or 2
>>> # pipe.change_nesting_level(<int>) # 0, 1, or 2
>>> # 50+, 100+, and 250+ num_inference_steps are recommended for nesting levels 0, 1, and 2 respectively.
```
"""
@@ -420,6 +421,7 @@ def __init__(
self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))

self.scales = None
self.schedule_shifted_power = 1.0

def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
"""
@@ -532,6 +534,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):

def get_schedule_shifted(self, alpha_prod, scale_factor=None):
if (scale_factor is not None) and (scale_factor > 1): # rescale noise schedule
scale_factor = scale_factor**self.schedule_shifted_power
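# In SNR space, snr = alpha_prod / (1 - alpha_prod). Dividing the SNR by the
# power-adjusted scale factor shifts the schedule toward higher noise for the
# larger nested resolutions; inverting the ratio recovers the shifted alpha_prod.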
snr = alpha_prod / (1 - alpha_prod)
scaled_snr = snr / scale_factor
alpha_prod = 1 / (1 + 1 / scaled_snr)
@@ -639,17 +642,14 @@ def step(
# 4. Clip or threshold "predicted x_0"
if self.config.thresholding:
if len(model_output) > 1:
pred_original_sample = [
self._threshold_sample(p_o_s * scale) / scale
for p_o_s, scale in zip(pred_original_sample, self.scales)
]
pred_original_sample = [self._threshold_sample(p_o_s) for p_o_s in pred_original_sample]
else:
pred_original_sample = self._threshold_sample(pred_original_sample)
elif self.config.clip_sample:
if len(model_output) > 1:
pred_original_sample = [
(p_o_s * scale).clamp(-self.config.clip_sample_range, self.config.clip_sample_range) / scale
for p_o_s, scale in zip(pred_original_sample, self.scales)
p_o_s.clamp(-self.config.clip_sample_range, self.config.clip_sample_range)
for p_o_s in pred_original_sample
]
else:
pred_original_sample = pred_original_sample.clamp(
@@ -3816,6 +3816,8 @@ def __init__(

if hasattr(unet, "nest_ratio"):
scheduler.scales = unet.nest_ratio + [1]
if nesting_level == 2:
scheduler.schedule_shifted_power = 2.0
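# A power of 2.0 squares the rescale factor in get_schedule_shifted,
# strengthening the schedule shift for the deepest (1024x1024) level.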

self.register_modules(
text_encoder=text_encoder,
@@ -3842,12 +3844,14 @@ def change_nesting_level(self, nesting_level: int):
).to(self.device)
self.config.nesting_level = 1
self.scheduler.scales = self.unet.nest_ratio + [1]
self.scheduler.schedule_shifted_power = 1.0
elif nesting_level == 2:
self.unet = NestedUNet2DConditionModel.from_pretrained(
"tolgacangoz/matryoshka-diffusion-models", subfolder="unet/nesting_level_2"
).to(self.device)
self.config.nesting_level = 2
self.scheduler.scales = self.unet.nest_ratio + [1]
self.scheduler.schedule_shifted_power = 2.0
else:
raise ValueError("Currently, nesting levels 0, 1, and 2 are supported.")

@@ -4627,8 +4631,8 @@ def __call__(
image = latents

if self.scheduler.scales is not None:
for i, (img, scale) in enumerate(zip(image, self.scheduler.scales)):
image[i] = self.image_processor.postprocess(img * scale, output_type=output_type)[0]
for i, img in enumerate(image):
image[i] = self.image_processor.postprocess(img, output_type=output_type)[0]
else:
image = self.image_processor.postprocess(image, output_type=output_type)

2 changes: 1 addition & 1 deletion examples/controlnet/README_sd3.md
@@ -104,7 +104,7 @@ from diffusers.utils import load_image
import torch

base_model_path = "stabilityai/stable-diffusion-3-medium-diffusers"
controlnet_path = "sd3-controlnet-out/checkpoint-6500/controlnet"
controlnet_path = "DavyMorgan/sd3-controlnet-out"

controlnet = SD3ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)
pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
4 changes: 3 additions & 1 deletion examples/controlnet/train_controlnet.py
@@ -1048,7 +1048,9 @@ def load_model_hook(models, input_dir):

# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
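# add_noise is computed in float32 and the result cast back to weight_dtype,
# avoiding precision loss when training in fp16/bf16.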
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
noisy_latents = noise_scheduler.add_noise(latents.float(), noise.float(), timesteps).to(
dtype=weight_dtype
)

# Get the text embedding for conditioning
encoder_hidden_states = text_encoder(batch["input_ids"], return_dict=False)[0]
15 changes: 2 additions & 13 deletions examples/controlnet/train_controlnet_sd3.py
@@ -50,7 +50,7 @@
)
from diffusers.optimization import get_scheduler
from diffusers.training_utils import compute_density_for_timestep_sampling, compute_loss_weighting_for_sd3, free_memory
from diffusers.utils import check_min_version, is_wandb_available
from diffusers.utils import check_min_version, is_wandb_available, make_image_grid
from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
from diffusers.utils.torch_utils import is_compiled_module

@@ -64,17 +64,6 @@
logger = get_logger(__name__)


def image_grid(imgs, rows, cols):
assert len(imgs) == rows * cols

w, h = imgs[0].size
grid = Image.new("RGB", size=(cols * w, rows * h))

for i, img in enumerate(imgs):
grid.paste(img, box=(i % cols * w, i // cols * h))
return grid


def log_validation(controlnet, args, accelerator, weight_dtype, step, is_final_validation=False):
logger.info("Running validation... ")

@@ -224,7 +213,7 @@ def save_model_card(repo_id: str, image_logs=None, base_model=str, repo_folder=None):
validation_image.save(os.path.join(repo_folder, "image_control.png"))
img_str += f"prompt: {validation_prompt}\n"
images = [validation_image] + images
image_grid(images, 1, len(images)).save(os.path.join(repo_folder, f"images_{i}.png"))
make_image_grid(images, 1, len(images)).save(os.path.join(repo_folder, f"images_{i}.png"))
img_str += f"![images_{i})](./images_{i}.png)\n"

model_description = f"""
4 changes: 3 additions & 1 deletion examples/controlnet/train_controlnet_sdxl.py
@@ -1210,7 +1210,9 @@ def compute_embeddings(batch, proportion_empty_prompts, text_encoders, tokenizer

# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
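# Computing add_noise in float32 before casting back preserves precision
# under fp16/bf16 mixed-precision training.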
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
noisy_latents = noise_scheduler.add_noise(latents.float(), noise.float(), timesteps).to(
dtype=weight_dtype
)

# ControlNet conditioning.
controlnet_image = batch["conditioning_pixel_values"].to(dtype=weight_dtype)