diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 00000000..1c954cfe
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,19 @@
+# The .dockerignore file excludes files from the container build process.
+# https://docs.docker.com/engine/reference/builder/#dockerignore-file
+# Exclude Git files
+# Exclude Python cache files
+# Exclude Python virtual environment
diff --git a/.gitignore b/.gitignore
index c8a1724a..fc7400c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -176,3 +176,7 @@ pretrained_models/
# Secret files
+# cog demo files
diff --git a/README.md b/README.md
index 88bf425b..983a8dc2 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,8 @@
## Open-Sora: Democratizing Efficient Video Production for All
diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 00000000..77fe4add
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,51 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+ # set to true if your model requires a GPU
+ gpu: true
+ cuda: "12.1"
+ # a list of ubuntu apt packages to install
+ system_packages:
+ - "libgl1-mesa-glx"
+ # - "libglib2.0-0"
+ # python version in the form '3.11' or '3.11.4'
+ python_version: "3.10"
+ # a list of packages in the format <package-name>==<version>
+ python_packages:
+ - "torch==2.1.0"
+ - "torchvision"
+ - "packaging"
+ - "ninja"
+ - "xformers"
+ - "colossalai"
+ - "accelerate"
+ - "diffusers"
+ - "ftfy"
+ - "gdown"
+ - "mmengine"
+ - "pre-commit"
+ - "pyav"
+ - "tensorboard"
+ - "timm"
+ - "tqdm"
+ - "transformers"
+ - "wandb"
+ # - "numpy==1.19.4"
+ # - "torch==1.8.0"
+ # - "torchvision==0.9.0"
+ # commands run after the environment is set up
+ run:
+ - 'pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git'
+ - 'pip install --no-build-isolation flash-attn'
+ - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.6.2/pget_Linux_x86_64" && chmod +x /usr/local/bin/pget
+ # - "echo env is ready!"
+ # - "echo another command if needed"
+# predict.py defines how predictions are run on your model
+predict: "predict.py:Predictor"
diff --git a/predict.py b/predict.py
new file mode 100644
index 00000000..5d167a80
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,168 @@
+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/blob/main/docs/python.md
+import os
+import random
+import subprocess
+import shutil
+import time
+from typing import List
+import numpy as np
+import torch
+from cog import BasePredictor, Input, Path
+from mmengine.config import Config
+from mmengine.runner import set_random_seed
+from opensora.datasets import save_sample
+from opensora.registry import MODELS, SCHEDULERS, build_module
+from opensora.utils.config_utils import merge_args
+from opensora.utils.misc import to_torch_dtype
+# Remote tarball holding the pretrained weights; handed to download_weights().
+MODEL_URL = "https://weights.replicate.delivery/default/open-sora/opensora.tar"
+# Local directory checked for existence in setup() and used as the download target.
+WEIGHTS_FOLDER = "pretrained_models"
+# Largest signed 32-bit integer; upper bound when a random seed is drawn.
+MAX_SEED = np.iinfo(np.int32).max
+def download_weights(url, dest, extract=True):
+    """Fetch ``url`` into ``dest`` by shelling out to the ``pget`` binary.
+
+    Progress and the elapsed wall-clock time are printed to stdout.
+
+    Args:
+        url: Remote location, passed to ``pget`` as its first positional argument.
+        dest: Local path, passed to ``pget`` as its second positional argument.
+        extract: When truthy, ``-x`` is placed before the positional arguments
+            (presumably pget's "unpack the archive" switch -- confirm against
+            the pget documentation).
+
+    Raises:
+        subprocess.CalledProcessError: If ``pget`` exits with a non-zero status.
+    """
+    began = time.time()
+    print("downloading url: ", url)
+    print("downloading to: ", dest)
+    flags = ["-x"] if extract else []
+    command = ["pget", *flags, url, dest]
+    subprocess.check_call(command, close_fds=False)
+    print("downloading took: ", time.time() - began)
+def cog_config():
+    """Assemble the inference configuration used by the Cog predictor.
+
+    The top-level values are copied from the ``16x512x512.py`` sample config;
+    the ``model``, ``vae``, ``text_encoder`` and ``scheduler`` sections are
+    attached afterwards as attributes.
+
+    Returns:
+        Config: An mmengine ``Config`` with the sampling settings and the four
+        component sections.
+    """
+    # taken from 16x512x512.py
+    sampling_settings = {
+        "num_frames": 16,
+        "fps": 24 // 3,
+        "image_size": (512, 512),
+        "dtype": "fp16",
+        "batch_size": 2,
+        "seed": 42,
+        "prompt_path": "./assets/texts/t2v_samples.txt",
+        "save_dir": "./outputs/samples/",
+    }
+    cfg = Config(sampling_settings)
+    cfg.model = {
+        "type": "STDiT-XL/2",
+        "space_scale": 1.0,
+        "time_scale": 1.0,
+        "enable_flashattn": True,
+        "enable_layernorm_kernel": True,
+        # Placeholder value; presumably replaced by the ``ckpt_path`` that
+        # Predictor.setup() hands to merge_args -- confirm in config_utils.
+        "from_pretrained": "PRETRAINED_MODEL",
+    }
+    cfg.vae = {
+        "type": "VideoAutoencoderKL",
+        "from_pretrained": "stabilityai/sd-vae-ft-ema",
+        "micro_batch_size": 128,
+    }
+    cfg.text_encoder = {
+        "type": "t5",
+        "from_pretrained": "./pretrained_models/t5_ckpts",
+        "model_max_length": 120,
+    }
+    cfg.scheduler = {
+        "type": "iddpm",
+        "num_sampling_steps": 100,
+        "cfg_scale": 7.0,
+    }
+    return cfg
+class Predictor(BasePredictor):
+    """Cog predictor that renders one video from a text prompt with Open-Sora."""
+
+    def setup(self) -> None:
+        """Prepare weights, config and models once so predictions can reuse them.
+
+        Installs the package found in the current directory, downloads the
+        weights if ``WEIGHTS_FOLDER`` does not exist, builds the merged config
+        and instantiates the VAE, text encoder, diffusion model and scheduler.
+        """
+        # Install the package in the working directory (presumably the
+        # Open-Sora checkout this file lives in).
+        subprocess.check_call(["pip", "install", "-q", "."])
+
+        # Download the weights only when the target folder is absent.
+        weights_present = os.path.exists(WEIGHTS_FOLDER)
+        if not weights_present:
+            download_weights(MODEL_URL, WEIGHTS_FOLDER, extract=True)
+
+        # Stand-in for the command line arguments from opensora.utils.config_utils.
+        cli_overrides = Config({
+            'seed': 42,
+            'ckpt_path': "pretrained_models/Open-Sora/OpenSora-v1-HQ-16x512x512.pth",
+            'batch-size': None,
+            'prompt-path': None,
+            'save-dir': None,
+            'num-sampling-steps': None,
+            'cfg_scale': None,
+        })
+        self.cfg = merge_args(cog_config(), args=cli_overrides, training=False)
+
+        # Inference only: no autograd, TF32 allowed for matmul and cuDNN.
+        torch.set_grad_enabled(False)
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32 = True
+
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.dtype = to_torch_dtype(self.cfg.dtype)
+
+        # (num_frames, *image_size) is what the VAE maps to a latent size.
+        video_shape = (self.cfg.num_frames, *self.cfg.image_size)
+        self.vae = build_module(self.cfg.vae, MODELS)
+        self.latent_size = self.vae.get_latent_size(video_shape)
+
+        # T5 must be fp32
+        self.text_encoder = build_module(self.cfg.text_encoder, MODELS, device=self.device)
+        self.model = build_module(
+            self.cfg.model,
+            input_size=self.latent_size,
+            in_channels=self.vae.out_channels,
+            caption_channels=self.text_encoder.output_dim,
+            model_max_length=self.text_encoder.model_max_length,
+            dtype=self.dtype,
+            enable_sequence_parallelism=False,
+        )
+        # hack for classifier-free guidance
+        self.text_encoder.y_embedder = self.model.y_embedder
+
+        self.vae = self.vae.to(self.device, self.dtype).eval()
+        self.model = self.model.to(self.device, self.dtype).eval()
+        self.scheduler = build_module(self.cfg.scheduler, SCHEDULERS)
+
+        # Extra keyword data for the model; only filled for multi-resolution configs.
+        self.model_args = {}
+        if self.cfg.multi_resolution:
+            image_size = self.cfg.image_size
+            batch = self.cfg.batch_size
+            placement = {"device": self.device, "dtype": self.dtype}
+            hw = torch.tensor([image_size], **placement).repeat(batch, 1)
+            aspect = image_size[0] / image_size[1]
+            ar = torch.tensor([[aspect]], **placement).repeat(batch, 1)
+            self.model_args["data_info"] = {"ar": ar, "hw": hw}
+
+    def predict(
+        self,
+        prompt: str = Input(description="Prompt for the video"),
+        seed: int = Input(description="Seed. Leave blank to randomise", default=None),
+    ) -> List[Path]:
+        """Generate one video for ``prompt`` and return the path to the file.
+
+        Args:
+            prompt: Text description of the video to sample.
+            seed: Random seed; when ``None`` a value in ``[0, MAX_SEED]`` is drawn.
+
+        Returns:
+            A one-element list holding the path ``<save_dir>/output.mp4``.
+        """
+        # Start every prediction from an empty output directory.
+        out_dir = self.cfg.save_dir
+        if os.path.exists(out_dir):
+            shutil.rmtree(out_dir)
+        os.makedirs(out_dir, exist_ok=True)
+
+        # randomize seed
+        if seed is None:
+            seed = random.randint(0, MAX_SEED)
+        print(f"Using seed {seed}...")
+        set_random_seed(seed=seed)
+
+        latents = self.scheduler.sample(
+            self.model,
+            self.text_encoder,
+            z_size=(self.vae.out_channels, *self.latent_size),
+            prompts=[prompt],
+            device=self.device,
+            additional_args=self.model_args,
+        )
+        frames = self.vae.decode(latents.to(self.dtype))
+
+        save_path = os.path.join(out_dir, "output")
+        # write file to {save_path}.mp4
+        save_sample(frames[0], fps=self.cfg.fps, save_path=save_path)
+        return [Path(f"{save_path}.mp4")]