diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 00000000..1c954cfe
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,19 @@
+# The .dockerignore file excludes files from the container build process.
+#
+# https://docs.docker.com/engine/reference/builder/#dockerignore-file
+
+# Exclude Git files
+.git
+.github
+.gitignore
+
+# Exclude Python cache files
+__pycache__
+.mypy_cache
+.pytest_cache
+.ruff_cache
+
+# Exclude Python virtual environment
+/venv
+pretrained_models/
+output*.mp4
diff --git a/.gitignore b/.gitignore
index c8a1724a..fc7400c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -176,3 +176,7 @@ pretrained_models/
 # Secret files
 hostfile
 gradio_cached_examples/
+
+# cog demo files
+output*.mp4
+.cog
diff --git a/README.md b/README.md
index 88bf425b..983a8dc2 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,8 @@
+
+
 ## Open-Sora: Democratizing Efficient Video Production for All
diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 00000000..77fe4add
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,51 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:
+  # set to true if your model requires a GPU
+  gpu: true
+  cuda: "12.1"
+
+  # a list of ubuntu apt packages to install
+  system_packages:
+    - "libgl1-mesa-glx"
+    # - "libglib2.0-0"
+
+  # python version in the form '3.11' or '3.11.4'
+  python_version: "3.10"
+
+  # a list of packages in the format <package-name>==<version>
+  python_packages:
+    - "torch==2.1.0"
+    - "torchvision"
+    - "packaging"
+    - "ninja"
+    - "xformers"
+    - "colossalai"
+    - "accelerate"
+    - "diffusers"
+    - "ftfy"
+    - "gdown"
+    - "mmengine"
+    - "pre-commit"
+    - "pyav"
+    - "tensorboard"
+    - "timm"
+    - "tqdm"
+    - "transformers"
+    - "wandb"
+    # - "numpy==1.19.4"
+    # - "torch==1.8.0"
+    # - "torchvision==0.9.0"
+
+  # commands run after the environment is setup
+  run:
+    - 'pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git'
+    - 'pip install --no-build-isolation flash-attn'
+    - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.6.2/pget_Linux_x86_64" && chmod +x /usr/local/bin/pget
+    # - "echo env is ready!"
+ # - "echo another command if needed" + +# predict.py defines how predictions are run on your model +predict: "predict.py:Predictor" diff --git a/predict.py b/predict.py new file mode 100644 index 00000000..5d167a80 --- /dev/null +++ b/predict.py @@ -0,0 +1,168 @@ +# Prediction interface for Cog ⚙️ +# https://github.com/replicate/cog/blob/main/docs/python.md + +import os +import random +import subprocess +import shutil +import time +from typing import List + +import numpy as np +import torch +from cog import BasePredictor, Input, Path +from mmengine.config import Config +from mmengine.runner import set_random_seed + +from opensora.datasets import save_sample +from opensora.registry import MODELS, SCHEDULERS, build_module +from opensora.utils.config_utils import merge_args +from opensora.utils.misc import to_torch_dtype + +MAX_SEED = np.iinfo(np.int32).max + +MODEL_URL = "https://weights.replicate.delivery/default/open-sora/opensora.tar" +WEIGHTS_FOLDER = "pretrained_models" + +def download_weights(url, dest, extract=True): + start = time.time() + print("downloading url: ", url) + print("downloading to: ", dest) + args = ["pget"] + if extract: + args.append("-x") + subprocess.check_call(args + [url, dest], close_fds=False) + print("downloading took: ", time.time() - start) + +def cog_config(): + # taken from 16x512x512.py + cfg = Config(dict( + num_frames = 16, + fps = 24 // 3, + image_size = (512, 512), + dtype = "fp16", + batch_size = 2, + seed = 42, + prompt_path = "./assets/texts/t2v_samples.txt", + save_dir = "./outputs/samples/", + )) + + cfg.model = dict( + type="STDiT-XL/2", + space_scale=1.0, + time_scale=1.0, + enable_flashattn=True, + enable_layernorm_kernel=True, + from_pretrained="PRETRAINED_MODEL" + ) + cfg.vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", + micro_batch_size=128, + ) + cfg.text_encoder = dict( + type="t5", + from_pretrained="./pretrained_models/t5_ckpts", + model_max_length=120, + ) + cfg.scheduler = dict( + type="iddpm", + num_sampling_steps=100, + cfg_scale=7.0, + ) + return cfg + + +class Predictor(BasePredictor): + def setup(self) -> None: + """Load the model into memory to make running multiple predictions efficient""" + # install open sora from github repo + subprocess.check_call("pip install -q .".split()) + + # download model + if not os.path.exists(WEIGHTS_FOLDER): + download_weights(MODEL_URL, WEIGHTS_FOLDER, extract=True) + + # command line arguments from opensora.utils.config_utils + extra_args = Config({ + 'seed': 42, + 'ckpt_path': "pretrained_models/Open-Sora/OpenSora-v1-HQ-16x512x512.pth", + 'batch-size': None, + 'prompt-path': None, + 'save-dir': None, + 'num-sampling-steps': None, + 'cfg_scale': None, + }) + + self.cfg = cog_config() + self.cfg = merge_args(self.cfg, args=extra_args, training=False) + + torch.set_grad_enabled(False) + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + self.device = "cuda" if torch.cuda.is_available() else "cpu" + + self.dtype = to_torch_dtype(self.cfg.dtype) + + input_size = (self.cfg.num_frames, *self.cfg.image_size) + self.vae = build_module(self.cfg.vae, MODELS) + self.latent_size = self.vae.get_latent_size(input_size) + self.text_encoder = build_module(self.cfg.text_encoder, MODELS, device=self.device) # T5 must be fp32 + self.model = build_module( + self.cfg.model, + MODELS, + input_size=self.latent_size, + in_channels=self.vae.out_channels, + caption_channels=self.text_encoder.output_dim, + 
+            model_max_length=self.text_encoder.model_max_length,
+            dtype=self.dtype,
+            enable_sequence_parallelism=False,
+        )
+        self.text_encoder.y_embedder = self.model.y_embedder  # hack for classifier-free guidance
+
+        self.vae = self.vae.to(self.device, self.dtype).eval()
+        self.model = self.model.to(self.device, self.dtype).eval()
+        self.scheduler = build_module(self.cfg.scheduler, SCHEDULERS)
+
+        self.model_args = dict()
+        if self.cfg.multi_resolution:
+            image_size = self.cfg.image_size
+            hw = torch.tensor([image_size], device=self.device, dtype=self.dtype).repeat(self.cfg.batch_size, 1)
+            ar = torch.tensor(
+                [[image_size[0] / image_size[1]]], device=self.device, dtype=self.dtype
+            ).repeat(self.cfg.batch_size, 1)
+            self.model_args["data_info"] = dict(ar=ar, hw=hw)
+
+    def predict(
+        self,
+        prompt: str = Input(description="Prompt for the video"),
+        seed: int = Input(description="Seed. Leave blank to randomize", default=None),
+    ) -> List[Path]:
+        """Run a single prediction on the model"""
+        # remove the old output directory
+        save_dir = self.cfg.save_dir
+        if os.path.exists(save_dir):
+            shutil.rmtree(save_dir)
+        os.makedirs(save_dir, exist_ok=True)
+
+        # randomize the seed if none was given
+        if seed is None:
+            seed = random.randint(0, MAX_SEED)
+        print(f"Using seed {seed}...")
+        set_random_seed(seed=seed)
+
+        samples = self.scheduler.sample(
+            self.model,
+            self.text_encoder,
+            z_size=(self.vae.out_channels, *self.latent_size),
+            prompts=[prompt],
+            device=self.device,
+            additional_args=self.model_args,
+        )
+        samples = self.vae.decode(samples.to(self.dtype))
+
+        save_path = os.path.join(save_dir, "output")
+        save_sample(samples[0], fps=self.cfg.fps, save_path=save_path)  # writes {save_path}.mp4
+
+        return [Path(f"{save_path}.mp4")]
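
Usage sketch (assumes the Cog CLI is installed and a CUDA GPU is available; the image tag and prompt value are illustrative):

    cog build -t open-sora    # build the image described by cog.yaml
    cog predict -i prompt="a waterfall in a lush forest" -i seed=42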