diff --git a/README.md b/README.md
index 6d63714..42bc602 100644
--- a/README.md
+++ b/README.md
@@ -295,6 +295,7 @@ VideoTuna/
 
 |T2V-Models|HxWxL|Checkpoints|
 |:---------|:---------|:--------|
+|Mochi|848x480, 3s|[Hugging Face](https://huggingface.co/genmo/mochi-1-preview)
 |CogVideoX-2B|720x480, 6s|[Hugging Face](https://huggingface.co/THUDM/CogVideoX-2b)
 |CogVideoX-5B|720x480, 6s|[Hugging Face](https://huggingface.co/THUDM/CogVideoX-5b)
 |Open-Sora 1.0|512×512x16|[Hugging Face](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x512x512.pth)
@@ -361,6 +362,7 @@ After downloading, the model checkpoints should be placed as [Checkpoint Structu
 
 Task|Model|Command|Length (#frames)|Resolution|Inference Time (s)|GPU Memory (GiB)|
 |:---------|:---------|:---------|:---------|:---------|:---------|:---------|
+|T2V|Mochi|`bash shscripts/inference_mochi.sh`|84|480x848|109.0|26|
 |I2V|CogVideoX-5b-I2V|`bash shscripts/inference_cogVideo_i2v_diffusers.sh`|49|576x1024|310.4|4.78|
 |T2V|CogVideoX-2b|`bash shscripts/inference_cogVideo_t2v_diffusers.sh`|49|576x1024|107.6|2.32|
 |T2V|Open Sora V1.0|`bash shscripts/inference_opensora_v10_16x256x256.sh`|16|256x256|11.2|23.99|
@@ -442,6 +444,7 @@ We support video alignment post-training to align human preference for video dif
 
 ## Acknowledgement
 We thank the following repos for sharing their awesome models and codes!
+* [Mochi](https://www.genmo.ai/blog): A new SOTA in open-source video generation models
 * [VideoCrafter2](https://github.com/AILab-CVC/VideoCrafter): Overcoming Data Limitations for High-Quality Video Diffusion Models
 * [VideoCrafter1](https://github.com/AILab-CVC/VideoCrafter): Open Diffusion Models for High-Quality Video Generation
 * [DynamiCrafter](https://github.com/Doubiiu/DynamiCrafter): Animating Open-domain Images with Video Diffusion Priors
diff --git a/requirements.txt b/requirements.txt
index 3d61286..e8c7560 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -38,4 +38,4 @@ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn
 git+https://github.com/huggingface/diffusers
 open_clip_torch==2.12.0
 lmdeploy
-moviepy
+moviepy==1.0.2
diff --git a/scripts/inference_mochi.py b/scripts/inference_mochi.py
new file mode 100644
index 0000000..5a59833
--- /dev/null
+++ b/scripts/inference_mochi.py
@@ -0,0 +1,39 @@
+import argparse
+import os
+
+import torch
+from diffusers import MochiPipeline
+from diffusers.utils import export_to_video
+
+# parse command-line arguments
+parser = argparse.ArgumentParser()
+parser.add_argument("--ckpt_path", type=str, default="genmo/mochi-1-preview")
+parser.add_argument("--prompt_file", type=str, default="inputs/t2v/prompts.txt")
+parser.add_argument("--savedir", type=str, default="results/t2v/")
+parser.add_argument("--height", type=int, default=480)
+parser.add_argument("--width", type=int, default=848)
+parser.add_argument("--bs", type=int, default=1)
+parser.add_argument("--fps", type=int, default=28)
+parser.add_argument("--seed", type=int, default=123)
+args = parser.parse_args()
+
+os.makedirs(args.savedir, exist_ok=True)
+
+pipe = MochiPipeline.from_pretrained(args.ckpt_path, variant="bf16", torch_dtype=torch.bfloat16)
+# Enable memory savings
+pipe.enable_model_cpu_offload()
+pipe.enable_vae_tiling()
+
+# read all prompts from the prompt file (one per line), skipping empty lines
+with open(args.prompt_file, "r") as file:
+    prompts = [line.strip() for line in file if line.strip()]
+
+# set seed
+torch.manual_seed(args.seed)
+
+for index, prompt in enumerate(prompts):
+    with torch.autocast("cuda", torch.bfloat16, cache_enabled=False):
+        # 84 frames at 28 fps gives a roughly 3-second clip
+        frames = pipe(prompt, height=args.height, width=args.width, num_frames=84).frames[0]
+
+    export_to_video(frames, f"{args.savedir}/mochi_{index}.mp4", fps=args.fps)
diff --git a/shscripts/inference_mochi.sh b/shscripts/inference_mochi.sh
new file mode 100644
index 0000000..9bb29f5
--- /dev/null
+++ b/shscripts/inference_mochi.sh
@@ -0,0 +1,14 @@
+ckpt='checkpoints/mochi-1-preview'
+prompt_file="inputs/t2v/prompts.txt"
+savedir="results/t2v/mochi2"
+height=480
+width=848
+
+python3 scripts/inference_mochi.py \
+    --ckpt_path $ckpt \
+    --prompt_file $prompt_file \
+    --savedir $savedir \
+    --bs 1 --height $height --width $width \
+    --fps 28 \
+    --seed 124
+
diff --git a/tools/video_comparison/check_input.py b/tools/video_comparison/check_input.py
index acf4f55..3c7d522 100644
--- a/tools/video_comparison/check_input.py
+++ b/tools/video_comparison/check_input.py
@@ -3,6 +3,7 @@
 
 parser = argparse.ArgumentParser(description='Check the input directory')
 parser.add_argument('--input_dir', type=str, help='The input should be a directory', required=True)
+parser.add_argument('--seed', type=int, help='The seed for the random number generator', default=42)
 args = parser.parse_args()
 
 # check if there are images in the input directory, jpg/png...
@@ -26,6 +27,7 @@ for index, line in enumerate(lines):
 
     prompt = line.strip()
     print(f'creating image {index} using prompt: {prompt}')
+
     out = pipe(
         prompt=prompt,
         guidance_scale=0.,
@@ -33,6 +35,7 @@
         width=1024,
         num_inference_steps=4,
         max_sequence_length=256,
+        generator=torch.Generator("cuda").manual_seed(args.seed)
     ).images[0]
     index_str = str(index).zfill(5)
     out.save(f"{args.input_dir}/prompt_{index_str}.png")
diff --git a/tools/video_comparison/combine.py b/tools/video_comparison/combine.py
index e47e960..8ab2adc 100644
--- a/tools/video_comparison/combine.py
+++ b/tools/video_comparison/combine.py
@@ -1,13 +1,14 @@
 import os
 import argparse
 import glob
-from moviepy.editor import VideoFileClip, clips_array, vfx, TextClip
+from moviepy.editor import VideoFileClip, clips_array
 from PIL import Image, ImageDraw, ImageFont
 import numpy as np
 
 parser = argparse.ArgumentParser(description='Check the input directory')
 parser.add_argument('--input_dir', type=str, help='The input should be a directory', required=True)
 parser.add_argument('--save_dir', type=str, help='The directory of saving results', required=True)
+parser.add_argument('--unified_height', type=int, help='The height of the unified video', default=320)
 args = parser.parse_args()
 
 methods = glob.glob(f'{args.save_dir}/*/*')
@@ -40,12 +41,8 @@ def add_text_to_frame(frame, text='hi', position=(0,0)):
 max_duration = max([clip.duration for clip in clips])
 clips = [clip.set_end(max_duration).set_fps(max_fps) for clip in clips]
 
-# txt_clip = TextClip('hello world', color='orange', size=(100, 100))
-# txt_clip = txt_clip.set_position('center').set_duration(max_duration)
-# clips = [clip.resize(height=1080) for clip in clips]
-# video_heights = [clip.size[1] for clip in clips]
-# print(methods)
-# print(len(clips))
+clips = [clip.resize(height=args.unified_height) for clip in clips]
+
 clips_with_name = []
 for index, clip in enumerate(clips):
     method = methods[index].split('/')[-1]
diff --git a/tools/video_comparison/compare.sh b/tools/video_comparison/compare.sh
old mode 100644
new mode 100755
index 2873c5a..fd766a2
--- a/tools/video_comparison/compare.sh
+++ b/tools/video_comparison/compare.sh
@@ -4,7 +4,7 @@ input_dir='inputs/t2v'
 save_dir='results/compare1/'
 seed=42
 unified_visualization_height=320
-inference_methods="videocrafter2;dynamicrafter;cogvideo—t2v;cogvideo—i2v;opensora"
+inference_methods="videocrafter2;dynamicrafter;cogvideo—t2v;cogvideo—i2v;opensora;mochi"
 
 #### check input ####
 # Check if the directory exists
@@ -26,7 +26,7 @@ python tools/video_comparison/check_input.py --input_dir=$input_dir --seed=$seed
 
 ################################ videocrafter2 ################################
 ckpt='checkpoints/videocrafter/t2v_v2_512/model.ckpt'
-config='configs/train/000_videocrafter2ft/config.yaml'
+config='configs/001_videocrafter2/vc2_t2v_320x512.yaml'
 prompt_file="${input_dir}/prompts.txt"
 height=320
 width=512
@@ -46,7 +46,7 @@ fi
 
 ################################ dynamicrafter ################################
 ckpt=checkpoints/dynamicrafter/i2v_576x1024/model.ckpt
-config=configs/train/002_dynamicrafterft_1024/config.yaml
+config=configs/002_dynamicrafter/dc_i2v_1024.yaml
 prompt_dir="${input_dir}"
 height=576
 width=1024
@@ -93,7 +93,7 @@ fi
 
 ################################ opensora ################################
 ckpt="checkpoints/open-sora/t2v_v10/OpenSora-v1-HQ-16x256x256.pth"
-config='configs/train/001_opensorav10/config_opensorav10.yaml'
+config='configs/003_opensora/opensorav10_256x256.yaml'
 height=256
 width=256
 fps=8
@@ -116,5 +116,22 @@ if [[ $inference_methods == *"opensora"* ]]; then
         --frames 16
 fi
 
+################################ mochi ################################
+if [[ $inference_methods == *"mochi"* ]]; then
+    ckpt='genmo/mochi-1-preview'
+    prompt_file="${input_dir}/prompts.txt"
+    height=480
+    width=848
+    savedir="${save_dir}/t2v/mochi-${width}x${height}-28fps"
+
+    python3 scripts/inference_mochi.py \
+        --ckpt_path $ckpt \
+        --prompt_file $prompt_file \
+        --savedir $savedir \
+        --bs 1 --height $height --width $width \
+        --fps 28 \
+        --seed ${seed}
+fi
+
 #### combine video
 python3 tools/video_comparison/combine.py --save_dir=$save_dir --input_dir=$input_dir --unified_height=$unified_visualization_height
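
For context, this patch leaves two different seeding styles in place: `scripts/inference_mochi.py` seeds the global RNG once via `torch.manual_seed`, while `tools/video_comparison/check_input.py` now threads the seed through a per-call `torch.Generator`. Below is a minimal sketch of applying the per-call pattern to the Mochi pipeline; the prompt, seed value, and output path are illustrative, and it assumes `MochiPipeline` accepts the standard diffusers `generator` argument as other diffusers pipelines do.

```python
# Sketch only: per-call seeding for Mochi, mirroring check_input.py's pattern.
import torch
from diffusers import MochiPipeline
from diffusers.utils import export_to_video

pipe = MochiPipeline.from_pretrained(
    "genmo/mochi-1-preview", variant="bf16", torch_dtype=torch.bfloat16
)
pipe.enable_model_cpu_offload()  # same memory savings as scripts/inference_mochi.py
pipe.enable_vae_tiling()

# A dedicated generator makes each call reproducible regardless of any
# earlier RNG consumption in the process, unlike one global manual_seed.
gen = torch.Generator("cuda").manual_seed(42)  # illustrative seed
with torch.autocast("cuda", torch.bfloat16, cache_enabled=False):
    frames = pipe("A cat walks on the grass, realistic style.",
                  num_frames=84, generator=gen).frames[0]
export_to_video(frames, "mochi_seeded.mp4", fps=28)  # illustrative output path
```

If that pattern is preferred, the existing `--seed` flag in `scripts/inference_mochi.py` could feed such a generator for each prompt instead of seeding the global RNG once.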