From e9b3dc8d36c3602097ecfffb0fabd81b9d11924b Mon Sep 17 00:00:00 2001 From: changwangss Date: Tue, 26 May 2026 15:01:06 +0000 Subject: [PATCH 01/14] add wan example i2v/t2v task Signed-off-by: changwangss --- .../diffusion_model/diffusers/wan/README.md | 126 +++++++++ .../diffusion_model/diffusers/wan/main.py | 253 ++++++++++++++++++ .../diffusers/wan/run_benchmark.sh | 168 ++++++++++++ .../diffusers/wan/run_quant.sh | 54 ++++ 4 files changed, 601 insertions(+) create mode 100644 examples/pytorch/diffusion_model/diffusers/wan/README.md create mode 100755 examples/pytorch/diffusion_model/diffusers/wan/main.py create mode 100755 examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh create mode 100755 examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md new file mode 100644 index 00000000000..150920cac3c --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md @@ -0,0 +1,126 @@ +# Step-by-Step + +This example provides a unified Wan entry for quantization and evaluation, with both t2v and i2v support. + +# Prerequisite + +## 1. Environment + +```shell +pip install -r requirements.txt +# Use latest dev branch if needed before release +# INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@main +# pip install git+https://github.com/intel/auto-round.git@main +pip install neural-compressor-pt==3.7 +pip install auto-round +``` + +## 2. 
Prepare Model + +Use a local Wan diffusers model path, for example: + +- Wan2.2-T2V-A14B-Diffusers +- Wan2.2-I2V-A14B-Diffusers + +Download example (from Hugging Face): + +```bash +pip install -U "huggingface_hub[cli]" + +# t2v model +huggingface-cli download Wan-AI/Wan2.2-T2V-A14B-Diffusers \ + --local-dir /path/to/Wan2.2-T2V-A14B-Diffusers + +# i2v model +huggingface-cli download Wan-AI/Wan2.2-I2V-A14B-Diffusers \ + --local-dir /path/to/Wan2.2-I2V-A14B-Diffusers +``` + +## 3. Prepare Dataset + +Clone VBench and prepare the required data: + +```bash +git clone https://github.com/Vchitect/VBench.git +``` + +- t2v: pass txt with --prompt_file +- i2v: pass image folder with --image_folder and corresponding --info_json + +# Run + +## Quantization + +```bash +# topology supports wan_mxfp8 or wan_fp8 +bash run_quant.sh \ + --topology=wan_mxfp8 \ + --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \ + --task=t2v \ + --output_model=wan_mxfp8_model +``` + +## Inference + Evaluation + +### t2v BF16 + +```bash +bash run_benchmark.sh \ + --topology=wan_bf16 \ + --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \ + --task=t2v \ + --prompt_file=/path/to/VBench/prompts/prompts_per_dimension/subject_consistency.txt \ + --output_video_path=wan_bf16_video \ + --accuracy +``` + +### t2v FP8 / MXFP8 + +```bash +# topology supports wan_mxfp8 or wan_fp8 +bash run_benchmark.sh \ + --topology=wan_mxfp8 \ + --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \ + --quantized_model=wan_mxfp8_model \ + --task=t2v \ + --prompt_file=/path/to/VBench/prompts/prompts_per_dimension/subject_consistency.txt \ + --output_video_path=wan_mxfp8_video \ + --accuracy +``` + +### i2v BF16 + +For i2v BF16, provide --image_folder and --info_json explicitly: + +```bash +bash run_benchmark.sh \ + --topology=wan_bf16 \ + --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \ + --task=i2v \ + --image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9 \ + 
--info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \ + --output_video_path=wan_i2v_bf16_video \ + --accuracy +``` + +### i2v FP8 / MXFP8 + +```bash +# topology supports wan_mxfp8 or wan_fp8 +bash run_benchmark.sh \ + --topology=wan_mxfp8 \ + --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \ + --quantized_model=wan_mxfp8_model \ + --task=i2v \ + --image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9 \ + --info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \ + --output_video_path=wan_i2v_mxfp8_video \ + --accuracy +``` + +# Notes + +- Quantized weights are saved under: + - /transformer + - /transformer_2 + diff --git a/examples/pytorch/diffusion_model/diffusers/wan/main.py b/examples/pytorch/diffusion_model/diffusers/wan/main.py new file mode 100755 index 00000000000..256aa05dd5c --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/wan/main.py @@ -0,0 +1,253 @@ +import argparse +import json +import os +import random + +import numpy as np +import torch +from diffusers import AutoencoderKLWan, WanImageToVideoPipeline, WanPipeline, WanTransformer3DModel +from diffusers.utils import export_to_video, load_image +from functools import partial +from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare + +from auto_round.data_type.fp8 import quant_fp8_sym +from auto_round.data_type.mxfp import quant_mx_rceil + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Wan quantization and evaluation example.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("--model", "--model_name", "--model_name_or_path", required=True, type=str, help="Wan model path") + parser.add_argument("--task", default="t2v", choices=["t2v", "i2v"], help="Wan task type") + parser.add_argument("--scheme", default="BF16", choices=["BF16", "FP8", "MXFP8"], type=str, help="Quantization scheme") + parser.add_argument("--quantize", action="store_true") + 
parser.add_argument("--inference", action="store_true") + parser.add_argument("--output_dir", "--quantized_model_path", default="./tmp_autoround", type=str, help="Directory to save quantized transformer weights") + parser.add_argument("--prompt_file", type=str, default=None, help="T2V prompt txt file path") + parser.add_argument("--image_folder", type=str, default=None, help="I2V image folder path") + parser.add_argument("--info_json", type=str, default=None, help="I2V info json file path") + parser.add_argument("--output_video_path", default="./tmp_video", type=str, help="Directory to save generated videos") + parser.add_argument("--limit", default=-1, type=int, help="Limit the number of prompts for evaluation") + parser.add_argument("--seed", default=42, type=int, help="Random seed") + parser.add_argument("--height", default=720, type=int) + parser.add_argument("--width", default=1280, type=int) + parser.add_argument("--num_frames", default=81, type=int) + parser.add_argument("--num_inference_steps", default=40, type=int) + parser.add_argument("--guidance_scale", default=4.0, type=float, help="Guidance scale for t2v/i2v") + parser.add_argument("--guidance_scale_2", default=3.0, type=float, help="Second guidance scale for t2v only") + parser.add_argument("--fps", default=16, type=int) + parser.add_argument("--ratio", default="16-9", type=str, help="Aspect ratio used by i2v VBench dataset") + parser.add_argument("--image_max_area", default=480 * 832, type=int, help="Maximum i2v image area") + return parser.parse_args() + + +def setup_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + + +def get_scheme_config(scheme): + if scheme == "FP8": + return {"bits": 8, "data_type": "fp", "group_size": 0, "sym": True} + if scheme == "MXFP8": + return {"bits": 8, "data_type": "mx_fp", "group_size": 32} + return None + + +def build_pipeline(args): + if args.task == "t2v": + vae = 
AutoencoderKLWan.from_pretrained(args.model, subfolder="vae", torch_dtype=torch.float32) + pipe = WanPipeline.from_pretrained(args.model, vae=vae, torch_dtype=torch.bfloat16) + pipe.enable_model_cpu_offload() + return pipe + + if args.task == "i2v": + pipe = WanImageToVideoPipeline.from_pretrained(args.model, torch_dtype=torch.bfloat16) + pipe.enable_model_cpu_offload() + return pipe + + raise ValueError(f"Unsupported task: {args.task}. Supported tasks are: i2v, t2v") + + +def quantize_pipleine(pipe, args): + scheme_cfg = get_scheme_config(args.scheme) + if scheme_cfg is None: + raise ValueError("BF16 does not need quantization. Use --scheme FP8 or --scheme MXFP8 with --quantize.") + + + qconfig = AutoRoundConfig( + iters=0, + export_format="fake", + output_dir=args.output_dir, + disable_opt_rtn=True, + scheme=scheme_cfg, + ) + pipe = prepare(pipe, qconfig) + convert(pipe, qconfig) + + +def apply_activation_qdq(pipe, scheme): + if scheme == "BF16": + return + + if scheme == "FP8": + def act_qdq_forward(module, x, *f_args, **f_kwargs): + qdq_x, _, _ = quant_fp8_sym(x, group_size=0) + return module.orig_forward(qdq_x, *f_args, **f_kwargs) + else: + def act_qdq_forward(module, x, *f_args, **f_kwargs): + qdq_x, _, _ = quant_mx_rceil(x, bits=8, group_size=32, data_type="mx_fp_rceil") + return module.orig_forward(qdq_x, *f_args, **f_kwargs) + + for module_name in ["transformer", "transformer_2"]: + module = getattr(pipe, module_name) + for n, m in module.named_modules(): + if m.__class__.__name__ == "Linear" and "blocks" in n: + m.orig_forward = m.forward + m.forward = partial(act_qdq_forward, m) + + +def load_quantized_transformers(pipe, output_dir): + for module_name in ["transformer", "transformer_2"]: + q_path = os.path.join(output_dir, module_name) + if not os.path.isdir(q_path): + raise ValueError(f"Quantized path does not exist: {q_path}") + print(f"Loading quantized {module_name} from {q_path}") + setattr(pipe, module_name, 
WanTransformer3DModel.from_pretrained(q_path, torch_dtype=torch.bfloat16)) + + +def build_t2v_inputs(args): + prompt_file = args.prompt_file + + if not prompt_file: + raise ValueError("--prompt_file is required for t2v inference/eval") + if not os.path.exists(prompt_file): + raise FileNotFoundError(f"Prompt file not found: {prompt_file}") + + with open(prompt_file, "r", encoding="utf-8") as f: + prompt_list = [line.strip() for line in f if line.strip()] + + if args.limit >= 0: + prompt_list = prompt_list[: args.limit] + + return [{"prompt": prompt} for prompt in prompt_list] + + +def build_i2v_inputs(args): + image_folder = args.image_folder + info_json = args.info_json + + if not image_folder: + raise ValueError("--image_folder is required for i2v inference/eval") + if not info_json: + raise ValueError("--info_json is required for i2v inference/eval") + if not os.path.isdir(image_folder): + raise FileNotFoundError(f"Image folder not found: {image_folder}") + if not os.path.exists(info_json): + raise FileNotFoundError(f"Info json not found: {info_json}") + + with open(info_json, "r", encoding="utf-8") as f: + info_list = json.load(f) + + results = [] + for info in info_list: + image_path = os.path.join(image_folder, info["image_name"]) + if not os.path.exists(image_path): + continue + results.append( + { + "prompt": info["prompt_en"], + "image_path": image_path, + } + ) + + if args.limit >= 0: + results = results[: args.limit] + return results + + +def safe_output_path(base_dir, prompt): + return os.path.join(base_dir, f"{prompt}-0.mp4") + + +@torch.no_grad() +def run_inference(args, pipe): + setup_seed(args.seed) + os.makedirs(args.output_video_path, exist_ok=True) + gen = torch.Generator(device="cuda" if torch.cuda.is_available() else "cpu").manual_seed(args.seed) + + if args.task == "t2v": + inputs = build_t2v_inputs(args) + else: + inputs = build_i2v_inputs(args) + + for item in inputs: + prompt = item["prompt"] + save_path = 
safe_output_path(args.output_video_path, prompt) + if os.path.exists(save_path): + continue + + if args.task == "t2v": + frames = pipe( + prompt=prompt, + height=args.height, + width=args.width, + num_frames=args.num_frames, + guidance_scale=args.guidance_scale, + guidance_scale_2=args.guidance_scale_2, + num_inference_steps=args.num_inference_steps, + generator=gen, + ).frames[0] + else: + image = load_image(item["image_path"]) + aspect_ratio = image.height / image.width + mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1] + height = round(np.sqrt(args.image_max_area * aspect_ratio)) // mod_value * mod_value + width = round(np.sqrt(args.image_max_area / aspect_ratio)) // mod_value * mod_value + image = image.resize((width, height)) + + frames = pipe( + image=image, + prompt=prompt, + height=height, + width=width, + num_frames=args.num_frames, + guidance_scale=args.guidance_scale, + num_inference_steps=args.num_inference_steps, + generator=gen, + ).frames[0] + + export_to_video(frames, save_path, fps=args.fps) + print(f"Saved: {save_path}") + + +def main(): + args = parse_args() + + if not (args.quantize or args.inference): + raise ValueError("Please enable at least one stage: --quantize or --inference") + + if args.quantize or args.inference: + pipe = build_pipeline(args) + else: + pipe = None + + if args.quantize: + quantize_pipleine(pipe, args) + + if args.inference: + if args.scheme in ["FP8", "MXFP8"]: + load_quantized_transformers(pipe, args.output_dir) + apply_activation_qdq(pipe, args.scheme) + run_inference(args, pipe) + + +if __name__ == "__main__": + main() + + diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh new file mode 100755 index 00000000000..4532f902af5 --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh @@ -0,0 +1,168 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + 
run_benchmark +} + +function ensure_vbench_repo { + if [ ! -d "VBench" ]; then + echo "VBench directory not found. Start cloning https://github.com/Vchitect/VBench.git ..." + git clone https://github.com/Vchitect/VBench.git + if [ $? -ne 0 ]; then + echo "Error: failed to clone VBench." + exit 1 + fi + fi +} + +function prepare_vbench_inputs { + if [ "${task}" = "t2v" ] && [ -z "${prompt_file}" ]; then + echo "Error: --prompt_file is required for task=t2v" + exit 1 + fi + + if [ "${task}" = "i2v" ]; then + if [ -z "${image_folder}" ]; then + echo "Error: --image_folder is required for task=i2v" + exit 1 + fi + if [ -z "${info_json}" ]; then + echo "Error: --info_json is required for task=i2v" + exit 1 + fi + fi + + if [ -n "${prompt_file}" ] && [ ! -f "${prompt_file}" ]; then + echo "Error: prompt_file not found: ${prompt_file}" + exit 1 + fi + if [ -n "${image_folder}" ] && [ ! -d "${image_folder}" ]; then + echo "Error: image_folder not found: ${image_folder}" + exit 1 + fi + if [ -n "${info_json}" ] && [ ! -f "${info_json}" ]; then + echo "Error: info_json not found: ${info_json}" + exit 1 + fi +} + +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology="${var#*=}" + ;; + --input_model=*) + input_model="${var#*=}" + ;; + --task=*) + task="${var#*=}" + ;; + --quantized_model=*) + tuned_checkpoint="${var#*=}" + ;; + --output_video_path=*) + output_video_path="${var#*=}" + ;; + --prompt_file=*) + prompt_file="${var#*=}" + ;; + --image_folder=*) + image_folder="${var#*=}" + ;; + --info_json=*) + info_json="${var#*=}" + ;; + --limit=*) + limit="${var#*=}" + ;; + --accuracy) + accuracy=true + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done +} + +function run_benchmark { + task=${task:="t2v"} + limit=${limit:=-1} + tuned_checkpoint=${tuned_checkpoint:="./tmp_autoround"} + output_video_path=${output_video_path:="./tmp_video"} + accuracy=${accuracy:=false} + + if [[ ! 
"${output_video_path}" = /* ]]; then + output_video_path=$(realpath -s "$(pwd)/${output_video_path}") + fi + + if [ "${topology}" = "wan_bf16" ]; then + scheme="BF16" + elif [ "${topology}" = "wan_fp8" ]; then + scheme="FP8" + elif [ "${topology}" = "wan_mxfp8" ]; then + scheme="MXFP8" + else + echo "Error: unsupported topology ${topology}, use wan_bf16/wan_fp8/wan_mxfp8" + exit 1 + fi + + ensure_vbench_repo + + prepare_vbench_inputs + + benchmark_cmd=( + python3 main.py + --model "${input_model}" + --task "${task}" + --scheme "${scheme}" + --output_dir "${tuned_checkpoint}" + --output_video_path "${output_video_path}" + --limit "${limit}" + --inference + ) + + if [ -n "${prompt_file}" ]; then + benchmark_cmd+=(--prompt_file "${prompt_file}") + fi + if [ -n "${image_folder}" ]; then + benchmark_cmd+=(--image_folder "${image_folder}") + fi + if [ -n "${info_json}" ]; then + benchmark_cmd+=(--info_json "${info_json}") + fi + + "${benchmark_cmd[@]}" + + if [ "${accuracy}" = "true" ]; then + if [ "${task}" = "t2v" ]; then + echo "Start VBench evaluation for t2v..." + pushd VBench + python evaluate.py \ + --dimension subject_consistency motion_smoothness aesthetic_quality imaging_quality overall_consistency \ + --videos_path "${output_video_path}" \ + --mode=vbench_standard + popd + elif [ "${task}" = "i2v" ]; then + echo "Start VBench evaluation for i2v..." + pushd VBench + python evaluate_i2v.py \ + --dimension i2v_background i2v_subject subject_consistency background_consistency motion_smoothness \ + --videos_path "${output_video_path}" \ + --mode=vbench_standard + popd + else + echo "--accuracy does not support task=${task}. Supported tasks: t2v, i2v." + exit 1 + fi + else + echo "Video generation finished. Use --accuracy to run VBench evaluation for t2v/i2v." 
+ fi +} + +main "$@" diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh new file mode 100755 index 00000000000..ae1ff41e1bb --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh @@ -0,0 +1,54 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_tuning +} + +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var | cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var | cut -f2 -d=) + ;; + --task=*) + task=$(echo $var | cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var | cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done +} + +function run_tuning { + tuned_checkpoint=${tuned_checkpoint:="./tmp_autoround"} + task=${task:="t2v"} + + if [ "${topology}" = "wan_fp8" ]; then + extra_cmd="--scheme FP8" + elif [ "${topology}" = "wan_mxfp8" ]; then + extra_cmd="--scheme MXFP8" + else + echo "Error: unsupported topology ${topology}, use wan_fp8 or wan_mxfp8" + exit 1 + fi + + python3 main.py \ + --model ${input_model} \ + --task ${task} \ + --output_dir ${tuned_checkpoint} \ + --quantize \ + ${extra_cmd} +} + +main "$@" From 9c2ea404f999198abf4f8f52f50f68a4b14def0d Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 27 May 2026 14:19:57 +0000 Subject: [PATCH 02/14] improve benchmark Signed-off-by: changwangss --- .../diffusion_model/diffusers/wan/README.md | 49 +++++--- .../diffusion_model/diffusers/wan/main.py | 40 +++++- .../diffusers/wan/run_benchmark.sh | 116 ++++++++++++++---- 3 files changed, 164 insertions(+), 41 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md index 150920cac3c..02d6c4e64e9 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/README.md +++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md @@ -9,7 +9,7 @@ 
This example provides a unified Wan entry for quantization and evaluation, with ```shell pip install -r requirements.txt # Use latest dev branch if needed before release -# INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@main +# INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master # pip install git+https://github.com/intel/auto-round.git@main pip install neural-compressor-pt==3.7 pip install auto-round @@ -44,8 +44,9 @@ Clone VBench and prepare the required data: git clone https://github.com/Vchitect/VBench.git ``` -- t2v: pass txt with --prompt_file -- i2v: pass image folder with --image_folder and corresponding --info_json +- t2v: pass prompt folder with --prompt_folder, and set --dimension to select `${prompt_folder}/${dimension}.txt` +- t2v: can pass --dimension for evaluation filtering (validated dimensions include `subject_consistency` and `overall_consistency`) +- i2v: pass --image_folder, --info_json, and --dimension (validated dimensions include `i2v_subject`, `i2v_background`, `subject_consistency`, `background_consistency`, and `motion_smoothness`) # Run @@ -69,9 +70,9 @@ bash run_benchmark.sh \ --topology=wan_bf16 \ --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \ --task=t2v \ - --prompt_file=/path/to/VBench/prompts/prompts_per_dimension/subject_consistency.txt \ - --output_video_path=wan_bf16_video \ - --accuracy + --dimension=subject_consistency \ + --prompt_folder=/path/to/VBench/prompts/prompts_per_dimension/ \ + --output_video_path=wan_bf16_video ``` ### t2v FP8 / MXFP8 @@ -83,24 +84,24 @@ bash run_benchmark.sh \ --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \ --quantized_model=wan_mxfp8_model \ --task=t2v \ - --prompt_file=/path/to/VBench/prompts/prompts_per_dimension/subject_consistency.txt \ - --output_video_path=wan_mxfp8_video \ - --accuracy + --dimension=overall_consistency \ + --prompt_folder=/path/to/VBench/prompts/prompts_per_dimension/ \ + --output_video_path=wan_mxfp8_video 
``` ### i2v BF16 -For i2v BF16, provide --image_folder and --info_json explicitly: +For i2v BF16, provide --image_folder, --info_json, and --dimension explicitly: ```bash bash run_benchmark.sh \ --topology=wan_bf16 \ --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \ --task=i2v \ + --dimension=i2v_subject \ --image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9 \ --info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \ - --output_video_path=wan_i2v_bf16_video \ - --accuracy + --output_video_path=wan_i2v_bf16_video ``` ### i2v FP8 / MXFP8 @@ -112,10 +113,30 @@ bash run_benchmark.sh \ --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \ --quantized_model=wan_mxfp8_model \ --task=i2v \ + --dimension=i2v_background \ --image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9 \ --info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \ - --output_video_path=wan_i2v_mxfp8_video \ - --accuracy + --output_video_path=wan_i2v_mxfp8_video +``` + +## Accuracy Evaluation + +For evaluating existing videos, call VBench directly. 
+ +```bash +# t2v accuracy on generated videos +cd /path/to/VBench +python evaluate.py \ + --dimension subject_consistency motion_smoothness aesthetic_quality overall_consistency imaging_quality \ + --videos_path /path/to/wan_t2v_bf16_video \ + --mode vbench_standard + +# i2v accuracy on generated videos +cd /path/to/VBench +python evaluate_i2v.py \ + --dimension i2v_background i2v_subject subject_consistency background_consistency motion_smoothness \ + --videos_path /path/to/wan_i2v_bf16_video \ + --mode vbench_standard ``` # Notes diff --git a/examples/pytorch/diffusion_model/diffusers/wan/main.py b/examples/pytorch/diffusion_model/diffusers/wan/main.py index 256aa05dd5c..a3e2d6b249c 100755 --- a/examples/pytorch/diffusion_model/diffusers/wan/main.py +++ b/examples/pytorch/diffusion_model/diffusers/wan/main.py @@ -25,9 +25,19 @@ def parse_args(): parser.add_argument("--quantize", action="store_true") parser.add_argument("--inference", action="store_true") parser.add_argument("--output_dir", "--quantized_model_path", default="./tmp_autoround", type=str, help="Directory to save quantized transformer weights") - parser.add_argument("--prompt_file", type=str, default=None, help="T2V prompt txt file path") + parser.add_argument("--prompt_folder", type=str, default=None, help="T2V prompt folder path") parser.add_argument("--image_folder", type=str, default=None, help="I2V image folder path") parser.add_argument("--info_json", type=str, default=None, help="I2V info json file path") + parser.add_argument( + "--dimension", + type=str, + default=None, + help=( + "VBench dimension used by t2v/i2v evaluation or input filtering " + "(validated examples: t2v=subject_consistency,overall_consistency; " + "i2v=i2v_subject,i2v_background)" + ), + ) parser.add_argument("--output_video_path", default="./tmp_video", type=str, help="Directory to save generated videos") parser.add_argument("--limit", default=-1, type=int, help="Limit the number of prompts for evaluation") 
parser.add_argument("--seed", default=42, type=int, help="Random seed") @@ -122,16 +132,28 @@ def load_quantized_transformers(pipe, output_dir): def build_t2v_inputs(args): - prompt_file = args.prompt_file + prompt_folder = args.prompt_folder + + if not prompt_folder: + raise ValueError("--prompt_folder is required for t2v inference/eval") + if not args.dimension: + raise ValueError("--dimension is required for t2v inference/eval") + if not os.path.isdir(prompt_folder): + raise FileNotFoundError(f"Prompt folder not found: {prompt_folder}") - if not prompt_file: - raise ValueError("--prompt_file is required for t2v inference/eval") + prompt_file = os.path.join(prompt_folder, f"{args.dimension}.txt") if not os.path.exists(prompt_file): - raise FileNotFoundError(f"Prompt file not found: {prompt_file}") + raise FileNotFoundError(f"Prompt file not found for dimension '{args.dimension}': {prompt_file}") with open(prompt_file, "r", encoding="utf-8") as f: prompt_list = [line.strip() for line in f if line.strip()] + if args.dimension not in {"subject_consistency", "overall_consistency"}: + print( + "[WARN] t2v --dimension is not in validated examples " + "(subject_consistency, overall_consistency). Continue anyway." 
+ ) + if args.limit >= 0: prompt_list = prompt_list[: args.limit] @@ -146,6 +168,11 @@ def build_i2v_inputs(args): raise ValueError("--image_folder is required for i2v inference/eval") if not info_json: raise ValueError("--info_json is required for i2v inference/eval") + if not args.dimension: + raise ValueError( + "--dimension is required for i2v inference/eval " + "(validated examples: i2v_subject, i2v_background)" + ) if not os.path.isdir(image_folder): raise FileNotFoundError(f"Image folder not found: {image_folder}") if not os.path.exists(info_json): @@ -156,6 +183,9 @@ def build_i2v_inputs(args): results = [] for info in info_list: + if args.dimension not in info["dimension"]: + continue + image_path = os.path.join(image_folder, info["image_name"]) if not os.path.exists(image_path): continue diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh index 4532f902af5..61955560fd7 100755 --- a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh @@ -18,9 +18,15 @@ function ensure_vbench_repo { } function prepare_vbench_inputs { - if [ "${task}" = "t2v" ] && [ -z "${prompt_file}" ]; then - echo "Error: --prompt_file is required for task=t2v" - exit 1 + if [ "${task}" = "t2v" ]; then + if [ -z "${prompt_folder}" ]; then + echo "Error: --prompt_folder is required for task=t2v" + exit 1 + fi + if [ -z "${dimension}" ]; then + echo "Error: --dimension is required for task=t2v" + exit 1 + fi fi if [ "${task}" = "i2v" ]; then @@ -32,10 +38,14 @@ function prepare_vbench_inputs { echo "Error: --info_json is required for task=i2v" exit 1 fi + if [ -z "${dimension}" ]; then + echo "Error: --dimension is required for task=i2v" + exit 1 + fi fi - if [ -n "${prompt_file}" ] && [ ! -f "${prompt_file}" ]; then - echo "Error: prompt_file not found: ${prompt_file}" + if [ -n "${prompt_folder}" ] && [ ! 
-d "${prompt_folder}" ]; then + echo "Error: prompt_folder not found: ${prompt_folder}" exit 1 fi if [ -n "${image_folder}" ] && [ ! -d "${image_folder}" ]; then @@ -49,41 +59,94 @@ function prepare_vbench_inputs { } function init_params { - for var in "$@" - do - case $var in + while [[ $# -gt 0 ]]; do + case "$1" in --topology=*) - topology="${var#*=}" + topology="${1#*=}" + shift + ;; + --topology) + topology="$2" + shift 2 ;; --input_model=*) - input_model="${var#*=}" + input_model="${1#*=}" + shift + ;; + --input_model) + input_model="$2" + shift 2 ;; --task=*) - task="${var#*=}" + task="${1#*=}" + shift + ;; + --task) + task="$2" + shift 2 ;; --quantized_model=*) - tuned_checkpoint="${var#*=}" + tuned_checkpoint="${1#*=}" + shift + ;; + --quantized_model) + tuned_checkpoint="$2" + shift 2 ;; --output_video_path=*) - output_video_path="${var#*=}" + output_video_path="${1#*=}" + shift + ;; + --output_video_path) + output_video_path="$2" + shift 2 + ;; + --prompt_folder=*) + prompt_folder="${1#*=}" + shift ;; - --prompt_file=*) - prompt_file="${var#*=}" + --prompt_folder) + prompt_folder="$2" + shift 2 ;; --image_folder=*) - image_folder="${var#*=}" + image_folder="${1#*=}" + shift + ;; + --image_folder) + image_folder="$2" + shift 2 ;; --info_json=*) - info_json="${var#*=}" + info_json="${1#*=}" + shift + ;; + --info_json) + info_json="$2" + shift 2 + ;; + --dimension=*) + dimension="${1#*=}" + shift + ;; + --dimension) + dimension="$2" + shift 2 ;; --limit=*) - limit="${var#*=}" + limit="${1#*=}" + shift + ;; + --limit) + limit="$2" + shift 2 ;; --accuracy) accuracy=true + shift ;; *) - echo "Error: No such parameter: ${var}" + echo "Error: No such parameter: $1" exit 1 ;; esac @@ -127,8 +190,8 @@ function run_benchmark { --inference ) - if [ -n "${prompt_file}" ]; then - benchmark_cmd+=(--prompt_file "${prompt_file}") + if [ -n "${prompt_folder}" ]; then + benchmark_cmd+=(--prompt_folder "${prompt_folder}") fi if [ -n "${image_folder}" ]; then 
benchmark_cmd+=(--image_folder "${image_folder}") @@ -136,15 +199,24 @@ function run_benchmark { if [ -n "${info_json}" ]; then benchmark_cmd+=(--info_json "${info_json}") fi + if [ -n "${dimension}" ]; then + benchmark_cmd+=(--dimension "${dimension}") + fi "${benchmark_cmd[@]}" if [ "${accuracy}" = "true" ]; then if [ "${task}" = "t2v" ]; then echo "Start VBench evaluation for t2v..." + local t2v_dims + if [ -n "${dimension}" ]; then + t2v_dims="${dimension}" + else + t2v_dims="subject_consistency motion_smoothness aesthetic_quality imaging_quality overall_consistency" + fi pushd VBench python evaluate.py \ - --dimension subject_consistency motion_smoothness aesthetic_quality imaging_quality overall_consistency \ + --dimension ${t2v_dims} \ --videos_path "${output_video_path}" \ --mode=vbench_standard popd From a875b79c33f03b27fe9dcba895260c501f947f13 Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 28 May 2026 05:43:27 +0000 Subject: [PATCH 03/14] update readme and add multi-gpus support and shard dataset Signed-off-by: changwangss --- .../diffusion_model/diffusers/wan/README.md | 49 +++--- .../diffusion_model/diffusers/wan/main.py | 43 ++++- .../diffusers/wan/run_benchmark.sh | 154 ++++++++++++++---- .../diffusers/wan/split_i2v_info.py | 55 +++++++ .../diffusers/wan/split_t2v_prompts.py | 47 ++++++ 5 files changed, 295 insertions(+), 53 deletions(-) mode change 100644 => 100755 examples/pytorch/diffusion_model/diffusers/wan/README.md create mode 100755 examples/pytorch/diffusion_model/diffusers/wan/split_i2v_info.py create mode 100755 examples/pytorch/diffusion_model/diffusers/wan/split_t2v_prompts.py diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md old mode 100644 new mode 100755 index 02d6c4e64e9..e9c8440b658 --- a/examples/pytorch/diffusion_model/diffusers/wan/README.md +++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md @@ -11,7 +11,7 @@ pip install -r 
requirements.txt # Use latest dev branch if needed before release # INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master # pip install git+https://github.com/intel/auto-round.git@main -pip install neural-compressor-pt==3.7 +pip install neural-compressor-pt pip install auto-round ``` @@ -37,14 +37,20 @@ huggingface-cli download Wan-AI/Wan2.2-I2V-A14B-Diffusers \ ``` ## 3. Prepare Dataset - -Clone VBench and prepare the required data: +Clone VBench to prepare the required dataset, then download i2v data: ```bash +# recommended: install VBench from pip +python3 -m pip install VBench + +# required for dataset preparation git clone https://github.com/Vchitect/VBench.git +cd VBench +bash vbench2_beta_i2v/download_data.sh ``` - t2v: pass prompt folder with --prompt_folder, and set --dimension to select `${prompt_folder}/${dimension}.txt` +- t2v/i2v: pass comma-separated values in `--dimension` to run multiple dimensions in one command (e.g., `subject_consistency,overall_consistency`) - t2v: can pass --dimension for evaluation filtering (validated dimensions include `subject_consistency` and `overall_consistency`) - i2v: pass --image_folder, --info_json, and --dimension (validated dimensions include `i2v_subject`, `i2v_background`, `subject_consistency`, `background_consistency`, and `motion_smoothness`) @@ -63,19 +69,20 @@ bash run_quant.sh \ ## Inference + Evaluation -### t2v BF16 +### t2v bf16 ```bash bash run_benchmark.sh \ --topology=wan_bf16 \ --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \ --task=t2v \ - --dimension=subject_consistency \ + --dimension=subject_consistency,overall_consistency \ --prompt_folder=/path/to/VBench/prompts/prompts_per_dimension/ \ - --output_video_path=wan_bf16_video + --output_video_path=wan_t2v_bf16_video \ + --accuracy ``` -### t2v FP8 / MXFP8 +### t2v mxfp8/fp8 ```bash # topology supports wan_mxfp8 or wan_fp8 @@ -84,27 +91,27 @@ bash run_benchmark.sh \ --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \ 
--quantized_model=wan_mxfp8_model \ --task=t2v \ - --dimension=overall_consistency \ + --dimension=subject_consistency,overall_consistency \ --prompt_folder=/path/to/VBench/prompts/prompts_per_dimension/ \ - --output_video_path=wan_mxfp8_video + --output_video_path=wan_t2v_mxfp8_video \ + --accuracy ``` -### i2v BF16 - -For i2v BF16, provide --image_folder, --info_json, and --dimension explicitly: +### i2v bf16 ```bash bash run_benchmark.sh \ --topology=wan_bf16 \ --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \ --task=i2v \ - --dimension=i2v_subject \ + --dimension=i2v_background,i2v_subject \ --image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9 \ --info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \ - --output_video_path=wan_i2v_bf16_video + --output_video_path=wan_i2v_bf16_video \ + --accuracy ``` -### i2v FP8 / MXFP8 +### i2v mxfp8/fp8 ```bash # topology supports wan_mxfp8 or wan_fp8 @@ -113,15 +120,19 @@ bash run_benchmark.sh \ --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \ --quantized_model=wan_mxfp8_model \ --task=i2v \ - --dimension=i2v_background \ + --dimension=i2v_background,i2v_subject \ --image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9 \ --info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \ - --output_video_path=wan_i2v_mxfp8_video + --output_video_path=wan_i2v_mxfp8_video \ + --accuracy ``` -## Accuracy Evaluation +Note: For sharding and multi-GPU execution, set `--gpu_ids` (for example `--gpu_ids=0,1,2,3`) or set `CUDA_VISIBLE_DEVICES` before running `run_benchmark.sh`. + +### Standalone Accuracy Evaluation (Optional) -For evaluating existing videos, call VBench directly. +If you already use `--accuracy` in `run_benchmark.sh`, you can skip this section. +Use this section when you want to evaluate existing videos without re-running generation. 
```bash # t2v accuracy on generated videos diff --git a/examples/pytorch/diffusion_model/diffusers/wan/main.py b/examples/pytorch/diffusion_model/diffusers/wan/main.py index a3e2d6b249c..85194cd9efe 100755 --- a/examples/pytorch/diffusion_model/diffusers/wan/main.py +++ b/examples/pytorch/diffusion_model/diffusers/wan/main.py @@ -50,6 +50,17 @@ def parse_args(): parser.add_argument("--fps", default=16, type=int) parser.add_argument("--ratio", default="16-9", type=str, help="Aspect ratio used by i2v VBench dataset") parser.add_argument("--image_max_area", default=480 * 832, type=int, help="Maximum i2v image area") + parser.add_argument( + "--mxfp8_chunk_rows", + default=2048, + type=int, + help="Row chunk size for MXFP8 activation QDQ", + ) + parser.add_argument( + "--disable_mxfp8_inplace_qdq", + action="store_true", + help="Disable in-place MXFP8 activation QDQ", + ) return parser.parse_args() @@ -101,7 +112,7 @@ def quantize_pipleine(pipe, args): convert(pipe, qconfig) -def apply_activation_qdq(pipe, scheme): +def apply_activation_qdq(pipe, scheme, runtime_args): if scheme == "BF16": return @@ -111,7 +122,32 @@ def act_qdq_forward(module, x, *f_args, **f_kwargs): return module.orig_forward(qdq_x, *f_args, **f_kwargs) else: def act_qdq_forward(module, x, *f_args, **f_kwargs): - qdq_x, _, _ = quant_mx_rceil(x, bits=8, group_size=32, data_type="mx_fp_rceil") + chunk_rows = max(1, int(getattr(runtime_args, "mxfp8_chunk_rows", 2048))) + use_inplace = not getattr(runtime_args, "disable_mxfp8_inplace_qdq", False) + + if use_inplace and x.is_cuda: + # Chunked in-place QDQ reduces peak activation memory on large tensors. 
+ x_2d = x.reshape(-1, x.shape[-1]) + total_rows = x_2d.shape[0] + for start in range(0, total_rows, chunk_rows): + end = min(start + chunk_rows, total_rows) + qdq_chunk = quant_mx_rceil( + x_2d[start:end], + bits=8, + group_size=32, + data_type="mx_fp_rceil", + )[0] + x_2d[start:end].copy_(qdq_chunk) + del qdq_chunk + qdq_x = x + else: + qdq_x = quant_mx_rceil( + x, + bits=8, + group_size=32, + data_type="mx_fp_rceil", + )[0] + return module.orig_forward(qdq_x, *f_args, **f_kwargs) for module_name in ["transformer", "transformer_2"]: @@ -198,6 +234,7 @@ def build_i2v_inputs(args): if args.limit >= 0: results = results[: args.limit] + return results @@ -273,7 +310,7 @@ def main(): if args.inference: if args.scheme in ["FP8", "MXFP8"]: load_quantized_transformers(pipe, args.output_dir) - apply_activation_qdq(pipe, args.scheme) + apply_activation_qdq(pipe, args.scheme, args) run_inference(args, pipe) diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh index 61955560fd7..e65b01b1bf2 100755 --- a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh @@ -133,6 +133,14 @@ function init_params { dimension="$2" shift 2 ;; + --gpu_ids=*) + gpu_ids="${1#*=}" + shift + ;; + --gpu_ids) + gpu_ids="$2" + shift 2 + ;; --limit=*) limit="${1#*=}" shift @@ -179,52 +187,136 @@ function run_benchmark { prepare_vbench_inputs - benchmark_cmd=( - python3 main.py - --model "${input_model}" - --task "${task}" - --scheme "${scheme}" - --output_dir "${tuned_checkpoint}" - --output_video_path "${output_video_path}" - --limit "${limit}" - --inference - ) - - if [ -n "${prompt_folder}" ]; then - benchmark_cmd+=(--prompt_folder "${prompt_folder}") - fi - if [ -n "${image_folder}" ]; then - benchmark_cmd+=(--image_folder "${image_folder}") - fi - if [ -n "${info_json}" ]; then - benchmark_cmd+=(--info_json "${info_json}") + 
normalized_dimensions="${dimension//,/ }" + read -r -a dimension_list <<< "${normalized_dimensions}" + + if [ -n "${gpu_ids}" ]; then + gpu_list="${gpu_ids}" + else + gpu_list="${CUDA_VISIBLE_DEVICES:-}" fi - if [ -n "${dimension}" ]; then - benchmark_cmd+=(--dimension "${dimension}") + + if [ -n "${gpu_list}" ]; then + normalized_gpu_ids="${gpu_list//,/ }" + read -r -a gpu_array <<< "${normalized_gpu_ids}" + visible_gpus=${#gpu_array[@]} + echo "visible_gpus: ${visible_gpus}" + else + gpu_array=() fi - "${benchmark_cmd[@]}" + mkdir -p "${output_video_path}" + shard_tmp_root="${output_video_path}/.prompt_shards" + + function build_benchmark_cmd { + local cur_prompt_folder="$2" + local cur_info_json="$3" + local cmd=( + python3 main.py + --model "${input_model}" + --task "${task}" + --scheme "${scheme}" + --output_dir "${tuned_checkpoint}" + --output_video_path "${output_video_path}" + --limit "${limit}" + --inference + ) + + if [ -n "${cur_prompt_folder}" ]; then + cmd+=(--prompt_folder "${cur_prompt_folder}") + elif [ -n "${prompt_folder}" ]; then + cmd+=(--prompt_folder "${prompt_folder}") + fi + if [ -n "${image_folder}" ]; then + cmd+=(--image_folder "${image_folder}") + fi + if [ -n "${cur_info_json}" ]; then + cmd+=(--info_json "${cur_info_json}") + elif [ -n "${info_json}" ]; then + cmd+=(--info_json "${info_json}") + fi + if [ -n "$1" ]; then + cmd+=(--dimension "$1") + fi + + printf '%q ' "${cmd[@]}" + } + + if [ ${#gpu_array[@]} -eq 0 ]; then + if [ ${#dimension_list[@]} -eq 0 ]; then + eval "$(build_benchmark_cmd "" "" "")" + else + for cur_dimension in "${dimension_list[@]}"; do + eval "$(build_benchmark_cmd "${cur_dimension}" "" "")" + done + fi + else + if [ ${#dimension_list[@]} -eq 0 ]; then + echo "Error: multi-GPU sharding requires --dimension" + exit 1 + fi + + num_shards=${#gpu_array[@]} + for cur_dimension in "${dimension_list[@]}"; do + dim_shard_root="${shard_tmp_root}/${cur_dimension}" + rm -rf "${dim_shard_root}" + if [ "${task}" = "t2v" ]; 
then + prompt_file="${prompt_folder}/${cur_dimension}.txt" + python3 split_t2v_prompts.py \ + --prompt_file "${prompt_file}" \ + --num_shards "${num_shards}" \ + --output_root "${dim_shard_root}" + else + python3 split_i2v_info.py \ + --info_json "${info_json}" \ + --dimension "${cur_dimension}" \ + --num_shards "${num_shards}" \ + --output_root "${dim_shard_root}" + fi + + program_pid=() + for shard_id in "${!gpu_array[@]}"; do + gpu_id="${gpu_array[$shard_id]}" + log_suffix="${cur_dimension}" + if [ -z "${log_suffix}" ]; then + log_suffix="all" + fi + log_file="${output_video_path}/${log_suffix}.gpu${gpu_id}.log" + shard_prompt_folder="" + shard_info_json="" + + if [ "${task}" = "t2v" ]; then + shard_prompt_folder="${dim_shard_root}/shard_${shard_id}" + else + shard_info_json="${dim_shard_root}/shard_${shard_id}/info.json" + fi + + cmd="$(build_benchmark_cmd "${cur_dimension}" "${shard_prompt_folder}" "${shard_info_json}")" + CUDA_VISIBLE_DEVICES="${gpu_id}" bash -lc "${cmd}" > "${log_file}" 2>&1 & + program_pid+=("$!") + echo "Start (PID: ${program_pid[-1]}, GPU: ${gpu_id}, dimension: ${cur_dimension})" + done + + for pid in "${program_pid[@]}"; do + wait "${pid}" || exit 1 + done + done + fi if [ "${accuracy}" = "true" ]; then if [ "${task}" = "t2v" ]; then echo "Start VBench evaluation for t2v..." - local t2v_dims - if [ -n "${dimension}" ]; then - t2v_dims="${dimension}" - else - t2v_dims="subject_consistency motion_smoothness aesthetic_quality imaging_quality overall_consistency" - fi pushd VBench python evaluate.py \ - --dimension ${t2v_dims} \ + --dimension "subject_consistency motion_smoothness aesthetic_quality imaging_quality overall_consistency" \ --videos_path "${output_video_path}" \ - --mode=vbench_standard + --mode=vbench_standard popd elif [ "${task}" = "i2v" ]; then echo "Start VBench evaluation for i2v..." 
pushd VBench python evaluate_i2v.py \ - --dimension i2v_background i2v_subject subject_consistency background_consistency motion_smoothness \ + --dimension "i2v_background i2v_subject subject_consistency background_consistency motion_smoothness" \ --videos_path "${output_video_path}" \ --mode=vbench_standard popd diff --git a/examples/pytorch/diffusion_model/diffusers/wan/split_i2v_info.py b/examples/pytorch/diffusion_model/diffusers/wan/split_i2v_info.py new file mode 100755 index 00000000000..326666467ea --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/wan/split_i2v_info.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +import argparse +import json +import os + + +def parse_args(): + parser = argparse.ArgumentParser(description="Split i2v info_json into per-shard files for one dimension.") + parser.add_argument("--info_json", required=True, type=str, help="Path to full i2v info json") + parser.add_argument("--dimension", required=True, type=str, help="Target dimension") + parser.add_argument("--num_shards", required=True, type=int, help="Total shard count") + parser.add_argument("--output_root", required=True, type=str, help="Root dir to write shard json files") + return parser.parse_args() + + +def has_dimension(info, target_dimension): + dims = info.get("dimension", []) + if isinstance(dims, str): + dims = [dims] + return target_dimension in dims + + +def main(): + args = parse_args() + + if args.num_shards < 1: + raise ValueError("--num_shards must be >= 1") + if not os.path.isfile(args.info_json): + raise FileNotFoundError(f"Info json not found: {args.info_json}") + + with open(args.info_json, "r", encoding="utf-8") as f: + info_list = json.load(f) + + filtered = [item for item in info_list if has_dimension(item, args.dimension)] + + shard_buckets = [[] for _ in range(args.num_shards)] + for idx, item in enumerate(filtered): + shard_buckets[idx % args.num_shards].append(item) + + os.makedirs(args.output_root, exist_ok=True) + for shard_id, 
shard_items in enumerate(shard_buckets): + shard_dir = os.path.join(args.output_root, f"shard_{shard_id}") + os.makedirs(shard_dir, exist_ok=True) + shard_info_json = os.path.join(shard_dir, "info.json") + with open(shard_info_json, "w", encoding="utf-8") as f: + json.dump(shard_items, f, ensure_ascii=False, indent=2) + + print( + f"Split {len(filtered)} i2v entries for dimension '{args.dimension}' " + f"into {args.num_shards} shards under {args.output_root}" + ) + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/diffusion_model/diffusers/wan/split_t2v_prompts.py b/examples/pytorch/diffusion_model/diffusers/wan/split_t2v_prompts.py new file mode 100755 index 00000000000..dffe19eb9e0 --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/wan/split_t2v_prompts.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +import argparse +import os + + +def parse_args(): + parser = argparse.ArgumentParser(description="Split t2v prompt file into per-shard prompt folders.") + parser.add_argument("--prompt_file", required=True, type=str, help="Path to .txt") + parser.add_argument("--num_shards", required=True, type=int, help="Total shard count") + parser.add_argument("--output_root", required=True, type=str, help="Root directory to write shard folders") + return parser.parse_args() + + +def main(): + args = parse_args() + + if args.num_shards < 1: + raise ValueError("--num_shards must be >= 1") + if not os.path.isfile(args.prompt_file): + raise FileNotFoundError(f"Prompt file not found: {args.prompt_file}") + + dimension = os.path.splitext(os.path.basename(args.prompt_file))[0] + + with open(args.prompt_file, "r", encoding="utf-8") as f: + prompts = [line.strip() for line in f if line.strip()] + + os.makedirs(args.output_root, exist_ok=True) + + shard_buckets = [[] for _ in range(args.num_shards)] + for idx, prompt in enumerate(prompts): + shard_buckets[idx % args.num_shards].append(prompt) + + for shard_id, shard_prompts in enumerate(shard_buckets): + 
shard_dir = os.path.join(args.output_root, f"shard_{shard_id}") + os.makedirs(shard_dir, exist_ok=True) + shard_prompt_file = os.path.join(shard_dir, f"{dimension}.txt") + with open(shard_prompt_file, "w", encoding="utf-8") as f: + for prompt in shard_prompts: + f.write(prompt + "\n") + + print( + f"Split {len(prompts)} prompts from {args.prompt_file} into {args.num_shards} shards under {args.output_root}" + ) + + +if __name__ == "__main__": + main() From 3503937a9c85a915ceb2329acc496718f7e3cee7 Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 28 May 2026 08:22:44 +0000 Subject: [PATCH 04/14] add ratio for evaluate_i2v.py Signed-off-by: changwangss --- examples/pytorch/diffusion_model/diffusers/wan/README.md | 1 + examples/pytorch/diffusion_model/diffusers/wan/main.py | 0 examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh | 1 + examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh | 0 4 files changed, 2 insertions(+) mode change 100755 => 100644 examples/pytorch/diffusion_model/diffusers/wan/README.md mode change 100755 => 100644 examples/pytorch/diffusion_model/diffusers/wan/main.py mode change 100755 => 100644 examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh mode change 100755 => 100644 examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md old mode 100755 new mode 100644 index e9c8440b658..33cf1f63ce5 --- a/examples/pytorch/diffusion_model/diffusers/wan/README.md +++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md @@ -147,6 +147,7 @@ cd /path/to/VBench python evaluate_i2v.py \ --dimension i2v_background i2v_subject subject_consistency background_consistency motion_smoothness \ --videos_path /path/to/wan_i2v_bf16_video \ + --ratio 16-9 \ --mode vbench_standard ``` diff --git a/examples/pytorch/diffusion_model/diffusers/wan/main.py 
b/examples/pytorch/diffusion_model/diffusers/wan/main.py old mode 100755 new mode 100644 diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh old mode 100755 new mode 100644 index e65b01b1bf2..02ccd86bba5 --- a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh @@ -318,6 +318,7 @@ function run_benchmark { python evaluate_i2v.py \ --dimension "i2v_background i2v_subject subject_consistency background_consistency motion_smoothness" \ --videos_path "${output_video_path}" \ + --ratio "16-9" \ --mode=vbench_standard popd else diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh old mode 100755 new mode 100644 From 1469b24e87b9e064997626779bb321deeb823278 Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 28 May 2026 09:45:33 +0000 Subject: [PATCH 05/14] improve mxfp8 oom by row chunk Signed-off-by: changwangss --- .../diffusion_model/diffusers/wan/main.py | 5 +++++ .../diffusers/wan/run_benchmark.sh | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/examples/pytorch/diffusion_model/diffusers/wan/main.py b/examples/pytorch/diffusion_model/diffusers/wan/main.py index 85194cd9efe..c4c9aeab9f1 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/main.py +++ b/examples/pytorch/diffusion_model/diffusers/wan/main.py @@ -166,6 +166,11 @@ def load_quantized_transformers(pipe, output_dir): print(f"Loading quantized {module_name} from {q_path}") setattr(pipe, module_name, WanTransformer3DModel.from_pretrained(q_path, torch_dtype=torch.bfloat16)) + # Quantized modules are replaced after pipeline construction; refresh offload hooks + # so newly attached modules follow the same device movement policy. 
+ if hasattr(pipe, "enable_model_cpu_offload"): + pipe.enable_model_cpu_offload() + def build_t2v_inputs(args): prompt_folder = args.prompt_folder diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh index 02ccd86bba5..420e8ba010f 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh @@ -149,6 +149,18 @@ function init_params { limit="$2" shift 2 ;; + --mxfp8_chunk_rows=*) + mxfp8_chunk_rows="${1#*=}" + shift + ;; + --mxfp8_chunk_rows) + mxfp8_chunk_rows="$2" + shift 2 + ;; + --disable_mxfp8_inplace_qdq) + disable_mxfp8_inplace_qdq=true + shift + ;; --accuracy) accuracy=true shift @@ -167,6 +179,7 @@ function run_benchmark { tuned_checkpoint=${tuned_checkpoint:="./tmp_autoround"} output_video_path=${output_video_path:="./tmp_video"} accuracy=${accuracy:=false} + disable_mxfp8_inplace_qdq=${disable_mxfp8_inplace_qdq:=false} if [[ ! 
"${output_video_path}" = /* ]]; then output_video_path=$(realpath -s "$(pwd)/${output_video_path}") @@ -238,6 +251,12 @@ function run_benchmark { if [ -n "$1" ]; then cmd+=(--dimension "$1") fi + if [ -n "${mxfp8_chunk_rows}" ]; then + cmd+=(--mxfp8_chunk_rows "${mxfp8_chunk_rows}") + fi + if [ "${disable_mxfp8_inplace_qdq}" = "true" ]; then + cmd+=(--disable_mxfp8_inplace_qdq) + fi printf '%q ' "${cmd[@]}" } From 630863c6b03d0cd59977ec9795519baabbdbfe05 Mon Sep 17 00:00:00 2001 From: changwangss Date: Mon, 1 Jun 2026 07:25:24 +0000 Subject: [PATCH 06/14] remove requirement Signed-off-by: changwangss --- examples/pytorch/diffusion_model/diffusers/wan/README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md index 33cf1f63ce5..c89b6e86800 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/README.md +++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md @@ -7,12 +7,13 @@ This example provides a unified Wan entry for quantization and evaluation, with ## 1. Environment ```shell -pip install -r requirements.txt # Use latest dev branch if needed before release # INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master # pip install git+https://github.com/intel/auto-round.git@main pip install neural-compressor-pt pip install auto-round +# evaluation +pip install VBench ``` ## 2. 
Prepare Model @@ -40,9 +41,6 @@ huggingface-cli download Wan-AI/Wan2.2-I2V-A14B-Diffusers \ Clone VBench to prepare the required dataset, then download i2v data: ```bash -# recommended: install VBench from pip -python3 -m pip install VBench - # required for dataset preparation git clone https://github.com/Vchitect/VBench.git cd VBench From d020210b94fb85fd2ae3a2a4ff9f8b8e5e3cfcf3 Mon Sep 17 00:00:00 2001 From: changwangss Date: Tue, 2 Jun 2026 07:51:12 +0000 Subject: [PATCH 07/14] add requirements Signed-off-by: changwangss --- .../diffusion_model/diffusers/wan/README.md | 12 +++++------ .../diffusers/wan/requirements.txt | 21 +++++++++++++++++++ 2 files changed, 27 insertions(+), 6 deletions(-) create mode 100644 examples/pytorch/diffusion_model/diffusers/wan/requirements.txt diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md index c89b6e86800..426ccb54540 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/README.md +++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md @@ -10,10 +10,9 @@ This example provides a unified Wan entry for quantization and evaluation, with # Use latest dev branch if needed before release # INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master # pip install git+https://github.com/intel/auto-round.git@main -pip install neural-compressor-pt -pip install auto-round -# evaluation -pip install VBench + +# install all runtime dependencies (including evaluation package VBench) +pip install -r requirements.txt ``` ## 2. 
Prepare Model @@ -26,14 +25,15 @@ Use a local Wan diffusers model path, for example: Download example (from Hugging Face): ```bash +# optional: update CLI to latest version pip install -U "huggingface_hub[cli]" # t2v model -huggingface-cli download Wan-AI/Wan2.2-T2V-A14B-Diffusers \ +hf download Wan-AI/Wan2.2-T2V-A14B-Diffusers \ --local-dir /path/to/Wan2.2-T2V-A14B-Diffusers # i2v model -huggingface-cli download Wan-AI/Wan2.2-I2V-A14B-Diffusers \ +hf download Wan-AI/Wan2.2-I2V-A14B-Diffusers \ --local-dir /path/to/Wan2.2-I2V-A14B-Diffusers ``` diff --git a/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt b/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt new file mode 100644 index 00000000000..ad846ada0ff --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt @@ -0,0 +1,21 @@ +# Core runtime +numpy +torch +torchvision +diffusers +transformers +accelerate +huggingface_hub +safetensors + +# Quantization stack +neural-compressor-pt +auto-round + +# Utilities used by example scripts +pillow +einops +requests + +# Evaluation package (dataset/eval helpers) +VBench From bfd62b514d74d056cc1d6e657a7a9235ba17411d Mon Sep 17 00:00:00 2001 From: changwangss Date: Fri, 12 Jun 2026 03:09:39 +0000 Subject: [PATCH 08/14] add example readme Signed-off-by: changwangss --- examples/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/examples/README.md b/examples/README.md index 26e4a5792d7..2b567ef4ef6 100644 --- a/examples/README.md +++ b/examples/README.md @@ -39,6 +39,19 @@ Intel® Neural Compressor validated examples with multiple compression technique Quantization (MXFP8/FP8) link + + Wan2.2-I2V-A14B-Diffusers + Image to Video + Quantization (MXFP8/FP8) + link + + + Wan2.2-T2V-A14B-Diffusers + Text to Video + Quantization (MXFP8/FP8) + link + + Llama-4-Scout-17B-16E-Instruct Multimodal Modeling From 2433e9dbbe347fbbf51342a33a5806be6afb1892 Mon Sep 17 00:00:00 2001 From: changwangss Date: Fri, 12 
Jun 2026 08:33:51 +0000 Subject: [PATCH 09/14] update VBench install method Signed-off-by: changwangss --- .../diffusion_model/diffusers/wan/README.md | 1 + .../diffusers/wan/requirements.txt | 28 ++++++++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md index 426ccb54540..81f91be3622 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/README.md +++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md @@ -13,6 +13,7 @@ This example provides a unified Wan entry for quantization and evaluation, with # install all runtime dependencies (including evaluation package VBench) pip install -r requirements.txt +pip install VBench --no-deps ``` ## 2. Prepare Model diff --git a/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt b/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt index ad846ada0ff..b855a26543b 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt +++ b/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt @@ -1,5 +1,4 @@ # Core runtime -numpy torch torchvision diffusers @@ -13,9 +12,30 @@ neural-compressor-pt auto-round # Utilities used by example scripts -pillow einops -requests # Evaluation package (dataset/eval helpers) -VBench +Pillow +numpy<2.0.0 +matplotlib +timm>=0.9,<=1.0.12 +wheel +cython +tensorboard +scipy +opencv-python +scikit-learn +scikit-image +openai-clip +decord +requests +pyyaml +pyiqa +lvis +fairscale>=0.4.4 +fvcore +easydict +urllib3 +boto3 +omegaconf +pycocoevalcap From d8d5a2e16376bf797f7475304434f0cb09df3304 Mon Sep 17 00:00:00 2001 From: changwangss Date: Mon, 15 Jun 2026 07:37:27 +0000 Subject: [PATCH 10/14] add vbench_dir and gdown==4.7.3, imageio-ffmpeg Signed-off-by: changwangss --- .../diffusion_model/diffusers/wan/README.md | 5 +++++ .../diffusers/wan/requirements.txt | 2 ++ .../diffusers/wan/run_benchmark.sh | 19
+++++++++++++++---- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md index 81f91be3622..681aadfbd10 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/README.md +++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md @@ -68,6 +68,11 @@ bash run_quant.sh \ ## Inference + Evaluation +When `--accuracy` is enabled, `run_benchmark.sh` runs VBench evaluation scripts from a local VBench repo. + +- Default VBench path is `$(dirname run_benchmark.sh)/VBench`. +- If your VBench repo is elsewhere, pass `--vbench_dir=/path/to/VBench`. + ### t2v bf16 ```bash diff --git a/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt b/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt index b855a26543b..dd0a3842c04 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt +++ b/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt @@ -39,3 +39,5 @@ urllib3 boto3 omegaconf pycocoevalcap +imageio-ffmpeg +gdown==4.7.3 diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh index 420e8ba010f..aac55b02af3 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh @@ -1,15 +1,17 @@ #!/bin/bash set -x +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) + function main { init_params "$@" run_benchmark } function ensure_vbench_repo { - if [ ! -d "VBench" ]; then + if [ ! -d "${vbench_dir}" ]; then echo "VBench directory not found. Start cloning https://github.com/Vchitect/VBench.git ..." - git clone https://github.com/Vchitect/VBench.git + git clone https://github.com/Vchitect/VBench.git "${vbench_dir}" if [ $? -ne 0 ]; then echo "Error: failed to clone VBench." 
exit 1 @@ -165,6 +167,14 @@ function init_params { accuracy=true shift ;; + --vbench_dir=*) + vbench_dir="${1#*=}" + shift + ;; + --vbench_dir) + vbench_dir="$2" + shift 2 + ;; *) echo "Error: No such parameter: $1" exit 1 @@ -180,6 +190,7 @@ function run_benchmark { output_video_path=${output_video_path:="./tmp_video"} accuracy=${accuracy:=false} disable_mxfp8_inplace_qdq=${disable_mxfp8_inplace_qdq:=false} + vbench_dir=${vbench_dir:="${SCRIPT_DIR}/VBench"} if [[ ! "${output_video_path}" = /* ]]; then output_video_path=$(realpath -s "$(pwd)/${output_video_path}") @@ -325,7 +336,7 @@ function run_benchmark { if [ "${accuracy}" = "true" ]; then if [ "${task}" = "t2v" ]; then echo "Start VBench evaluation for t2v..." - pushd VBench + pushd "${vbench_dir}" python evaluate.py \ --dimension "subject_consistency motion_smoothness aesthetic_quality imaging_quality overall_consistency" \ --videos_path "${output_video_path}" \ @@ -333,7 +344,7 @@ function run_benchmark { popd elif [ "${task}" = "i2v" ]; then echo "Start VBench evaluation for i2v..." 
- pushd VBench + pushd "${vbench_dir}" python evaluate_i2v.py \ --dimension "i2v_background i2v_subject subject_consistency background_consistency motion_smoothness" \ --videos_path "${output_video_path}" \ From c7fa3ec645fe7d51e93828c519f9698680d1fab3 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Tue, 16 Jun 2026 12:46:33 +0800 Subject: [PATCH 11/14] add setup.sh Signed-off-by: chensuyue --- examples/pytorch/diffusion_model/diffusers/wan/setup.sh | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 examples/pytorch/diffusion_model/diffusers/wan/setup.sh diff --git a/examples/pytorch/diffusion_model/diffusers/wan/setup.sh b/examples/pytorch/diffusion_model/diffusers/wan/setup.sh new file mode 100644 index 00000000000..a741abf3cff --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/wan/setup.sh @@ -0,0 +1,2 @@ +pip install -r requirements.txt +pip install VBench --no-deps \ No newline at end of file From 5adf041e2dbf8d0579216fb47bbe35a7188f36c0 Mon Sep 17 00:00:00 2001 From: "Wang, Chang" Date: Tue, 16 Jun 2026 14:07:19 +0800 Subject: [PATCH 12/14] Update setup.sh --- examples/pytorch/diffusion_model/diffusers/wan/setup.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/wan/setup.sh b/examples/pytorch/diffusion_model/diffusers/wan/setup.sh index a741abf3cff..c9f9700dbbc 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/setup.sh +++ b/examples/pytorch/diffusion_model/diffusers/wan/setup.sh @@ -1,2 +1,2 @@ -pip install -r requirements.txt -pip install VBench --no-deps \ No newline at end of file +pip install --no-cache-dir -r requirements.txt +pip install VBench --no-deps From b2968eb2dabf3e4fb26da44055957174fb5ff6a3 Mon Sep 17 00:00:00 2001 From: changwangss Date: Tue, 16 Jun 2026 13:20:33 +0000 Subject: [PATCH 13/14] i2v/t2v use quantized model loading directly Signed-off-by: changwangss --- .../diffusion_model/diffusers/wan/README.md | 27 +++++++++++++------ 
.../diffusion_model/diffusers/wan/main.py | 15 ----------- .../diffusers/wan/run_benchmark.sh | 10 ------- .../diffusers/wan/split_i2v_info.py | 0 .../diffusers/wan/split_t2v_prompts.py | 0 5 files changed, 19 insertions(+), 33 deletions(-) mode change 100755 => 100644 examples/pytorch/diffusion_model/diffusers/wan/split_i2v_info.py mode change 100755 => 100644 examples/pytorch/diffusion_model/diffusers/wan/split_t2v_prompts.py diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md index 681aadfbd10..85883127570 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/README.md +++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md @@ -57,13 +57,26 @@ bash vbench2_beta_i2v/download_data.sh ## Quantization +### t2v + ```bash # topology supports wan_mxfp8 or wan_fp8 bash run_quant.sh \ --topology=wan_mxfp8 \ --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \ --task=t2v \ - --output_model=wan_mxfp8_model + --output_model=wan_mxfp8_model_t2v +``` + +### i2v + +```bash +# topology supports wan_mxfp8 or wan_fp8 +bash run_quant.sh \ + --topology=wan_mxfp8 \ + --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \ + --task=i2v \ + --output_model=wan_mxfp8_model_i2v ``` ## Inference + Evaluation @@ -92,11 +105,10 @@ bash run_benchmark.sh \ # topology supports wan_mxfp8 or wan_fp8 bash run_benchmark.sh \ --topology=wan_mxfp8 \ - --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \ - --quantized_model=wan_mxfp8_model \ + --input_model=wan_mxfp8_model_t2v \ --task=t2v \ --dimension=subject_consistency,overall_consistency \ - --prompt_folder=/path/to/VBench/prompts/prompts_per_dimension/ \ + --prompt_folder=./VBench/prompts/prompts_per_dimension/ \ --output_video_path=wan_t2v_mxfp8_video \ --accuracy ``` @@ -121,12 +133,11 @@ bash run_benchmark.sh \ # topology supports wan_mxfp8 or wan_fp8 bash run_benchmark.sh \ --topology=wan_mxfp8 \ - --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \ - 
--quantized_model=wan_mxfp8_model \ + --input_model=wan_mxfp8_model_i2v \ --task=i2v \ --dimension=i2v_background,i2v_subject \ - --image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9 \ - --info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \ + --image_folder=./VBench/vbench2_beta_i2v/data/crop/16-9 \ + --info_json=./VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \ --output_video_path=wan_i2v_mxfp8_video \ --accuracy ``` diff --git a/examples/pytorch/diffusion_model/diffusers/wan/main.py b/examples/pytorch/diffusion_model/diffusers/wan/main.py index c4c9aeab9f1..c36ebef7177 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/main.py +++ b/examples/pytorch/diffusion_model/diffusers/wan/main.py @@ -158,20 +158,6 @@ def act_qdq_forward(module, x, *f_args, **f_kwargs): m.forward = partial(act_qdq_forward, m) -def load_quantized_transformers(pipe, output_dir): - for module_name in ["transformer", "transformer_2"]: - q_path = os.path.join(output_dir, module_name) - if not os.path.isdir(q_path): - raise ValueError(f"Quantized path does not exist: {q_path}") - print(f"Loading quantized {module_name} from {q_path}") - setattr(pipe, module_name, WanTransformer3DModel.from_pretrained(q_path, torch_dtype=torch.bfloat16)) - - # Quantized modules are replaced after pipeline construction; refresh offload hooks - # so newly attached modules follow the same device movement policy. 
- if hasattr(pipe, "enable_model_cpu_offload"): - pipe.enable_model_cpu_offload() - - def build_t2v_inputs(args): prompt_folder = args.prompt_folder @@ -314,7 +300,6 @@ def main(): if args.inference: if args.scheme in ["FP8", "MXFP8"]: - load_quantized_transformers(pipe, args.output_dir) apply_activation_qdq(pipe, args.scheme, args) run_inference(args, pipe) diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh index aac55b02af3..1ea381f547a 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh @@ -87,14 +87,6 @@ function init_params { task="$2" shift 2 ;; - --quantized_model=*) - tuned_checkpoint="${1#*=}" - shift - ;; - --quantized_model) - tuned_checkpoint="$2" - shift 2 - ;; --output_video_path=*) output_video_path="${1#*=}" shift @@ -186,7 +178,6 @@ function init_params { function run_benchmark { task=${task:="t2v"} limit=${limit:=-1} - tuned_checkpoint=${tuned_checkpoint:="./tmp_autoround"} output_video_path=${output_video_path:="./tmp_video"} accuracy=${accuracy:=false} disable_mxfp8_inplace_qdq=${disable_mxfp8_inplace_qdq:=false} @@ -240,7 +231,6 @@ function run_benchmark { --model "${input_model}" --task "${task}" --scheme "${scheme}" - --output_dir "${tuned_checkpoint}" --output_video_path "${output_video_path}" --limit "${limit}" --inference diff --git a/examples/pytorch/diffusion_model/diffusers/wan/split_i2v_info.py b/examples/pytorch/diffusion_model/diffusers/wan/split_i2v_info.py old mode 100755 new mode 100644 diff --git a/examples/pytorch/diffusion_model/diffusers/wan/split_t2v_prompts.py b/examples/pytorch/diffusion_model/diffusers/wan/split_t2v_prompts.py old mode 100755 new mode 100644 From 90a572d71c656965fc1b5c1c77b32907b75e9181 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 17 Jun 2026 02:32:56 +0000 Subject: [PATCH 14/14] add s2v Signed-off-by: changwangss 
--- .../diffusion_model/diffusers/wan/README.md | 177 +++++-- .../diffusers/wan/evaluate_manifest_no_gt.py | 489 ++++++++++++++++++ .../diffusion_model/diffusers/wan/main.py | 6 +- .../diffusers/wan/prepare_s2v_dataset.py | 137 +++++ ...uirements.txt => requirements_i2v_t2v.txt} | 1 + .../diffusers/wan/requirements_s2v.txt | 41 ++ .../diffusers/wan/run_benchmark.sh | 417 ++++++++++----- .../diffusers/wan/run_quant.sh | 41 +- .../diffusion_model/diffusers/wan/setup.sh | 55 +- .../diffusers/wan/split_s2v_manifest.py | 68 +++ .../diffusion_model/diffusers/wan/wan_s2v.py | 310 +++++++++++ 11 files changed, 1545 insertions(+), 197 deletions(-) create mode 100644 examples/pytorch/diffusion_model/diffusers/wan/evaluate_manifest_no_gt.py create mode 100644 examples/pytorch/diffusion_model/diffusers/wan/prepare_s2v_dataset.py rename examples/pytorch/diffusion_model/diffusers/wan/{requirements.txt => requirements_i2v_t2v.txt} (99%) create mode 100644 examples/pytorch/diffusion_model/diffusers/wan/requirements_s2v.txt create mode 100644 examples/pytorch/diffusion_model/diffusers/wan/split_s2v_manifest.py create mode 100644 examples/pytorch/diffusion_model/diffusers/wan/wan_s2v.py diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md index 85883127570..b4f48ddb23e 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/README.md +++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md @@ -1,61 +1,88 @@ # Step-by-Step -This example provides a unified Wan entry for quantization and evaluation, with both t2v and i2v support. +This example offers a unified Wan workflow for quantization and evaluation, covering `t2v` and `i2v` via `main.py`, and `s2v` via `wan_s2v.py`. -# Prerequisite +# Prerequisites -## 1.
Environment +## 1 Environment -```shell -# Use latest dev branch if needed before release -# INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master -# pip install git+https://github.com/intel/auto-round.git@main +Use the helper script to install task-specific dependencies: -# install all runtime dependencies (including evaluation package VBench) -pip install -r requirements.txt -pip install VBench --no-deps -``` - -## 2. Prepare Model - -Use a local Wan diffusers model path, for example: +```bash +# t2v / i2v setup (installs requirements_i2v_t2v.txt and VBench by default) +bash setup.sh --task t2v +bash setup.sh --task i2v -- Wan2.2-T2V-A14B-Diffusers -- Wan2.2-I2V-A14B-Diffusers +# s2v setup (installs requirements_s2v.txt, skips VBench by default) +bash setup.sh --task s2v +``` -Download example (from Hugging Face): +## 2 Model Preparation ```bash # optional: update CLI to latest version pip install -U "huggingface_hub[cli]" -# t2v model + hf download Wan-AI/Wan2.2-T2V-A14B-Diffusers \ --local-dir /path/to/Wan2.2-T2V-A14B-Diffusers -# i2v model + hf download Wan-AI/Wan2.2-I2V-A14B-Diffusers \ --local-dir /path/to/Wan2.2-I2V-A14B-Diffusers + + +hf download Wan-AI/Wan2.2-S2V-14B \ + --local-dir /path/to/Wan2.2-S2V-14B ``` -## 3. Prepare Dataset -Clone VBench to prepare the required dataset, then download i2v data: + +## 3 Dataset Preparation + +### t2v / i2v + +Both `t2v` and `i2v` use VBench data. +The recommended default is manual preparation, for better reproducibility and control. +Use `--vbench_dir=/path/to/VBench` in `run_benchmark.sh` when your VBench repo is not under the default path.
+ +If you prepare VBench manually: ```bash -# required for dataset preparation git clone https://github.com/Vchitect/VBench.git cd VBench bash vbench2_beta_i2v/download_data.sh ``` -- t2v: pass prompt folder with --prompt_folder, and set --dimension to select `${prompt_folder}/${dimension}.txt` -- t2v/i2v: pass comma-separated values in `--dimension` to run multiple dimensions in one command (e.g., `subject_consistency,overall_consistency`) -- t2v: can pass --dimension for evaluation filtering (validated dimensions include `subject_consistency` and `overall_consistency`) -- i2v: pass --image_folder, --info_json, and --dimension (validated dimensions include `i2v_subject`, `i2v_background`, `subject_consistency`, `background_consistency`, and `motion_smoothness`) +Then use: +- t2v: +- `--prompt_folder=/path/to/VBench/prompts/prompts_per_dimension/` +- `--dimension=subject_consistency,overall_consistency` + +- i2v: +- `--image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9` +- `--info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json` +- `--dimension=i2v_background,i2v_subject` + +### s2v + +Recommended default is manual preparation. + +```bash +# Step 1: clone EchoMimicV3 repo +git clone https://github.com/antgroup/echomimic_v3.git /path/to/echomimic_v3 + +# Step 2: build s2v manifest json +python3 prepare_s2v_dataset.py \ + --repo-dir /path/to/echomimic_v3 \ + --manifest-out /path/to/s2v_input_manifest.json + +``` + +The generated /path/to/s2v_input_manifest.json is the s2v input manifest passed via --manifest_path, with image and audio stored as absolute paths. # Run -## Quantization +## Quantization ### t2v @@ -79,12 +106,33 @@ bash run_quant.sh \ --output_model=wan_mxfp8_model_i2v ``` -## Inference + Evaluation +### s2v -When `--accuracy` is enabled, `run_benchmark.sh` runs VBench evaluation scripts from a local VBench repo. 
+```bash +# Prepare Wan2.2 for s2v +git clone https://github.com/Wan-Video/Wan2.2.git /path/to/Wan2.2 +``` + +```bash +# topology supports wan_mxfp8 or wan_fp8 +bash run_quant.sh \ + --topology=wan_mxfp8 \ + --input_model=/path/to/Wan2.2-S2V-14B \ + --task=s2v \ + --wan_dir=/path/to/Wan2.2 \ + --output_model=wan_mxfp8_model_s2v +``` -- Default VBench path is `$(dirname run_benchmark.sh)/VBench`. -- If your VBench repo is elsewhere, pass `--vbench_dir=/path/to/VBench`. +Note: +- For `task=s2v`, prepare Wan2.2 manually and pass `--wan_dir=/path/to/Wan2.2` when needed. +- `run_quant.sh` sets `PYTHONPATH` internally for s2v, so you do not need to export it manually. +- For `task=s2v`, `run_quant.sh` dispatches to `wan_s2v.py --quantize` in this example. + + + +## Inference + Evaluation + +Note: For `task=t2v/i2v`, prepare VBench manually first, and pass `--vbench_dir=/path/to/VBench` when needed. ### t2v bf16 @@ -94,7 +142,7 @@ bash run_benchmark.sh \ --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \ --task=t2v \ --dimension=subject_consistency,overall_consistency \ - --prompt_folder=/path/to/VBench/prompts/prompts_per_dimension/ \ + --vbench_dir=/path/to/VBench \ --output_video_path=wan_t2v_bf16_video \ --accuracy ``` @@ -102,13 +150,12 @@ bash run_benchmark.sh \ ### t2v mxfp8/fp8 ```bash -# topology supports wan_mxfp8 or wan_fp8 bash run_benchmark.sh \ --topology=wan_mxfp8 \ --input_model=wan_mxfp8_model_t2v \ --task=t2v \ --dimension=subject_consistency,overall_consistency \ - --prompt_folder=./VBench/prompts/prompts_per_dimension/ \ + --vbench_dir=/path/to/VBench \ --output_video_path=wan_t2v_mxfp8_video \ --accuracy ``` @@ -121,8 +168,7 @@ bash run_benchmark.sh \ --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \ --task=i2v \ --dimension=i2v_background,i2v_subject \ - --image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9 \ - --info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \ + --vbench_dir=/path/to/VBench \ 
--output_video_path=wan_i2v_bf16_video \ --accuracy ``` @@ -130,19 +176,55 @@ bash run_benchmark.sh \ ### i2v mxfp8/fp8 ```bash -# topology supports wan_mxfp8 or wan_fp8 bash run_benchmark.sh \ --topology=wan_mxfp8 \ --input_model=wan_mxfp8_model_i2v \ --task=i2v \ --dimension=i2v_background,i2v_subject \ - --image_folder=./VBench/vbench2_beta_i2v/data/crop/16-9 \ - --info_json=./VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \ + --vbench_dir=/path/to/VBench \ --output_video_path=wan_i2v_mxfp8_video \ --accuracy ``` -Note: For sharding and multi-GPU execution, set `--gpu_ids` (for example `--gpu_ids=0,1,2,3`) or set `CUDA_VISIBLE_DEVICES` before running `run_benchmark.sh`. +### s2v bf16 + +Note: For `task=s2v`, prepare Wan2.2 manually and pass `--wan_dir=/path/to/Wan2.2` when needed. + +```bash +bash run_benchmark.sh \ + --topology=wan_bf16 \ + --task=s2v \ + --input_model=/path/to/Wan2.2-S2V-14B \ + --wan_dir=/path/to/Wan2.2 \ + --manifest_path=/path/to/s2v_input_manifest.json \ + --output_video_path=wan_s2v_bf16_video \ + --accuracy +``` + +### s2v mxfp8/fp8 + +```bash +bash run_benchmark.sh \ + --topology=wan_mxfp8 \ + --task=s2v \ + --input_model=/path/to/Wan2.2-S2V-14B \ + --quantized_model=wan_mxfp8_model_s2v \ + --wan_dir=/path/to/Wan2.2 \ + --manifest_path=/path/to/s2v_input_manifest.json \ + --output_video_path=wan_s2v_mxfp8_video \ + --accuracy +``` + +When `task=s2v` and `--accuracy` is set, `run_benchmark.sh` will run generation via `wan_s2v.py`, then run `evaluate_manifest_no_gt.py`. + +- Optional eval arg: `--s2v_eval_output` (default: `${output_video_path}/evaluation_no_gt_metrics_s2v.json`) +- Internal defaults: matched manifest `${output_video_path}/s2v_manifest_with_generate_video.json`, `max_frames=32`, `metric_size=192` + + +Metric note: the current s2v benchmark manifest does not provide ground-truth videos, so `FID` and `FVD` are not computed.
The script reports proxy metrics from available image/audio/generated-video signals (for example SSIM, PSNR, Sync-C, HKC, HKV, CSIM, EFID). + +For sharding and multi-GPU execution, set `--gpu_ids` (for example `--gpu_ids=0,1,2,3`) or set `CUDA_VISIBLE_DEVICES` before running `run_benchmark.sh`. + ### Standalone Accuracy Evaluation (Optional) @@ -164,11 +246,12 @@ python evaluate_i2v.py \ --videos_path /path/to/wan_i2v_bf16_video \ --ratio 16-9 \ --mode vbench_standard -``` - -# Notes -- Quantized weights are saved under: - - /transformer - - /transformer_2 +# s2v standalone evaluation from generated manifest. +python evaluate_manifest_no_gt.py \ + --manifest /path/to/wan_s2v_output/s2v_manifest_with_generate_video.json \ + --output /path/to/wan_s2v_output/evaluation_no_gt_metrics_s2v.json \ + --max_frames 32 \ + --metric_size 192 +``` diff --git a/examples/pytorch/diffusion_model/diffusers/wan/evaluate_manifest_no_gt.py b/examples/pytorch/diffusion_model/diffusers/wan/evaluate_manifest_no_gt.py new file mode 100644 index 00000000000..26dfccf8fc6 --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/wan/evaluate_manifest_no_gt.py @@ -0,0 +1,489 @@ +import argparse +import glob +import json +import math +from pathlib import Path + +import cv2 +import numpy as np +from scipy.io import wavfile + + +def _resolve_manifest_path(path_value: str, root: Path) -> Path: + """Use absolute paths from manifest directly; fallback to root for relative paths.""" + p = Path(path_value) + if p.is_absolute(): + return p + return root / p + + +def _build_sample_dict(manifest): + if isinstance(manifest, dict): + return {str(k): v for k, v in manifest.items() if isinstance(v, dict)} + if isinstance(manifest, list): + result = {} + for idx, item in enumerate(manifest): + if not isinstance(item, dict): + continue + sample_id = str(item.get("id", idx)) + result[sample_id] = item + return result + raise ValueError("Manifest must be a JSON object or list") + + +def 
build_matched_manifest(source_manifest_path: Path, generated_video_dir: Path): + with source_manifest_path.open("r", encoding="utf-8") as f: + source_manifest = json.load(f) + + source_samples = _build_sample_dict(source_manifest) + manifest_root = source_manifest_path.parent + video_files = sorted(glob.glob(str(generated_video_dir / "*.mp4"))) + + matched = {} + for sample_id, sample in source_samples.items(): + prompt = sample.get("prompt") + image = sample.get("image") + audio = sample.get("audio") + if not prompt or not image or not audio: + continue + + if not Path(image).is_absolute(): + image = str((manifest_root / image).resolve()) + if not Path(audio).is_absolute(): + audio = str((manifest_root / audio).resolve()) + + prefix = f"{sample_id}_" + candidates = [vp for vp in video_files if Path(vp).name.startswith(prefix)] + if not candidates: + continue + + matched[sample_id] = { + "prompt": prompt, + "image": image, + "audio": audio, + "generate_video": str(Path(candidates[-1]).resolve()), + } + + return matched + + +def psnr(img1: np.ndarray, img2: np.ndarray) -> float: + mse = np.mean((img1.astype(np.float64) - img2.astype(np.float64)) ** 2) + if mse <= 1e-12: + return 100.0 + return 20.0 * math.log10(255.0 / math.sqrt(mse)) + + +def ssim(img1: np.ndarray, img2: np.ndarray) -> float: + # Standard single-image SSIM implementation on grayscale images. 
+ if img1.ndim == 3: + img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY) + if img2.ndim == 3: + img2 = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY) + + img1 = img1.astype(np.float64) + img2 = img2.astype(np.float64) + + c1 = (0.01 * 255) ** 2 + c2 = (0.03 * 255) ** 2 + + kernel = cv2.getGaussianKernel(11, 1.5) + window = kernel @ kernel.T + + mu1 = cv2.filter2D(img1, -1, window)[5:-5, 5:-5] + mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5] + mu1_sq = mu1 * mu1 + mu2_sq = mu2 * mu2 + mu1_mu2 = mu1 * mu2 + + sigma1_sq = cv2.filter2D(img1 * img1, -1, window)[5:-5, 5:-5] - mu1_sq + sigma2_sq = cv2.filter2D(img2 * img2, -1, window)[5:-5, 5:-5] - mu2_sq + sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2 + + ssim_map = ((2 * mu1_mu2 + c1) * (2 * sigma12 + c2)) / ( + (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2) + ) + return float(ssim_map.mean()) + + +def read_video_frames(video_path: Path, max_frames: int = 120): + cap = cv2.VideoCapture(str(video_path)) + if not cap.isOpened(): + raise RuntimeError(f"Failed to open video: {video_path}") + + frames = [] + while cap.isOpened() and len(frames) < max_frames: + ok, frame = cap.read() + if not ok: + break + frames.append(frame) + + cap.release() + if not frames: + raise RuntimeError(f"No frame read from video: {video_path}") + return frames + + +def _read_audio_mono(audio_path: Path): + sr, wav = wavfile.read(str(audio_path)) + if wav.ndim == 2: + wav = wav.mean(axis=1) + wav = wav.astype(np.float32) + if wav.dtype != np.float32: + max_abs = np.max(np.abs(wav)) if np.max(np.abs(wav)) > 0 else 1.0 + wav = wav / max_abs + return wav, int(sr) + + +def _resample_1d(x: np.ndarray, target_len: int): + if len(x) == target_len: + return x + if len(x) == 0: + return np.zeros(target_len, dtype=np.float32) + xp = np.linspace(0.0, 1.0, num=len(x), endpoint=True) + xnew = np.linspace(0.0, 1.0, num=target_len, endpoint=True) + return np.interp(xnew, xp, x).astype(np.float32) + + +def _face_mouth_roi(gray: np.ndarray, 
face_cascade): + faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(40, 40)) + if len(faces) > 0: + x, y, w, h = max(faces, key=lambda b: b[2] * b[3]) + x1 = int(x + 0.2 * w) + x2 = int(x + 0.8 * w) + y1 = int(y + 0.55 * h) + y2 = int(y + 0.95 * h) + else: + h, w = gray.shape[:2] + x1, x2 = int(0.3 * w), int(0.7 * w) + y1, y2 = int(0.58 * h), int(0.9 * h) + x1 = max(0, min(x1, gray.shape[1] - 1)) + x2 = max(x1 + 1, min(x2, gray.shape[1])) + y1 = max(0, min(y1, gray.shape[0] - 1)) + y2 = max(y1 + 1, min(y2, gray.shape[0])) + return x1, y1, x2, y2 + + +def compute_sync_c(video_frames, audio_path: Path, fps: float = 25.0): + # Proxy Sync-C: correlation between mouth-region motion and audio energy envelope. + try: + audio, sr = _read_audio_mono(audio_path) + if len(video_frames) < 2 or len(audio) < 2: + return 0.0 + + samples_per_frame = max(1, int(sr / fps)) + # Use per-frame audio energy then align with frame-difference count. + n_audio_frames = max(1, len(audio) // samples_per_frame) + audio = audio[: n_audio_frames * samples_per_frame] + audio_frame = audio.reshape(n_audio_frames, samples_per_frame) + audio_energy = np.mean(np.abs(audio_frame), axis=1) + + face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml") + motions = [] + prev = cv2.cvtColor(video_frames[0], cv2.COLOR_BGR2GRAY) + for fr in video_frames[1:]: + cur = cv2.cvtColor(fr, cv2.COLOR_BGR2GRAY) + x1, y1, x2, y2 = _face_mouth_roi(cur, face_cascade) + d = np.mean(np.abs(cur[y1:y2, x1:x2].astype(np.float32) - prev[y1:y2, x1:x2].astype(np.float32))) + motions.append(d) + prev = cur + + motions = np.asarray(motions, dtype=np.float32) + if len(motions) < 2: + return 0.0 + audio_aligned = _resample_1d(audio_energy.astype(np.float32), len(motions)) + if np.std(audio_aligned) < 1e-8 or np.std(motions) < 1e-8: + return 0.0 + corr = np.corrcoef(audio_aligned, motions)[0, 1] + if np.isnan(corr): + return 0.0 + return float(np.clip(corr, 
-1.0, 1.0)) + except Exception: + return 0.0 + + +def compute_hkc_hkv(video_frames): + # Proxy HKC/HKV using side-region hand-motion energy statistics. + if len(video_frames) < 2: + return 0.0, 0.0 + + prev = cv2.cvtColor(video_frames[0], cv2.COLOR_BGR2GRAY) + hand_motion = [] + for fr in video_frames[1:]: + cur = cv2.cvtColor(fr, cv2.COLOR_BGR2GRAY) + h, w = cur.shape + # two side regions where hands often appear in portrait talking videos + left = (slice(int(0.35 * h), int(0.85 * h)), slice(0, int(0.32 * w))) + right = (slice(int(0.35 * h), int(0.85 * h)), slice(int(0.68 * w), w)) + dleft = np.mean(np.abs(cur[left].astype(np.float32) - prev[left].astype(np.float32))) + dright = np.mean(np.abs(cur[right].astype(np.float32) - prev[right].astype(np.float32))) + hand_motion.append((dleft + dright) * 0.5) + prev = cur + + hm = np.asarray(hand_motion, dtype=np.float32) + if hm.size == 0: + return 0.0, 0.0 + # HKC proxy in [0,1]: normalized average hand activity. + hkc = float(np.clip(hm.mean() / 25.0, 0.0, 1.0)) + hkv = float(np.var(hm)) + return hkc, hkv + + +def compute_csim(reference_bgr: np.ndarray, target_bgr: np.ndarray): + # Proxy CSIM: cosine similarity of color+texture descriptor. 
+ def feat(img): + img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_AREA) + hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + hist = cv2.calcHist([hsv], [0, 1], None, [24, 24], [0, 180, 0, 256]).flatten().astype(np.float32) + hist = hist / (np.linalg.norm(hist) + 1e-8) + g = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + edges = cv2.Canny(g, 60, 160).astype(np.float32) + edges = cv2.resize(edges, (56, 56), interpolation=cv2.INTER_AREA).flatten() + edges = edges / (np.linalg.norm(edges) + 1e-8) + return np.concatenate([hist, edges], axis=0) + + f1 = feat(reference_bgr) + f2 = feat(target_bgr) + sim = float(np.dot(f1, f2) / ((np.linalg.norm(f1) * np.linalg.norm(f2)) + 1e-8)) + return float(np.clip(sim, -1.0, 1.0)) + + +def _feature_for_efid(img_bgr: np.ndarray): + img = cv2.resize(img_bgr, (224, 224), interpolation=cv2.INTER_AREA) + hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + hist_hs = cv2.calcHist([hsv], [0, 1], None, [32, 32], [0, 180, 0, 256]).flatten().astype(np.float64) + hist_hs = hist_hs / (np.sum(hist_hs) + 1e-12) + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(np.float64) / 255.0 + stat = np.array([gray.mean(), gray.std()], dtype=np.float64) + return np.concatenate([hist_hs, stat], axis=0) + + +def _sqrtm_psd(mat: np.ndarray): + # Symmetric PSD matrix square root via eigen decomposition. 
+ vals, vecs = np.linalg.eigh(mat) + vals = np.clip(vals, 0.0, None) + return (vecs * np.sqrt(vals)) @ vecs.T + + +def frechet_distance(feats1: np.ndarray, feats2: np.ndarray): + mu1 = np.mean(feats1, axis=0) + mu2 = np.mean(feats2, axis=0) + s1 = np.cov(feats1, rowvar=False) + s2 = np.cov(feats2, rowvar=False) + if s1.ndim == 0: + s1 = np.array([[float(s1)]]) + if s2.ndim == 0: + s2 = np.array([[float(s2)]]) + + covmean = _sqrtm_psd(s1 @ s2) + diff = mu1 - mu2 + fid = diff @ diff + np.trace(s1 + s2 - 2.0 * covmean) + return float(max(fid, 0.0)) + + +def evaluate_manifest( + manifest_path: Path, + output_path: Path, + max_frames: int, + metric_size: int, +): + with manifest_path.open("r", encoding="utf-8") as f: + manifest = json.load(f) + manifest_dir = manifest_path.parent + + if not isinstance(manifest, dict) or not manifest: + raise ValueError("Manifest must be a non-empty object.") + + per_sample = {} + ssim_first_list = [] + psnr_first_list = [] + ssim_avg_frames_list = [] + psnr_avg_frames_list = [] + sync_c_list = [] + hkc_list = [] + hkv_list = [] + csim_list = [] + efid_ref_feats = [] + efid_gen_feats = [] + failed = [] + + for sample_id, sample in manifest.items(): + try: + image_rel = sample.get("image") + video_rel = sample.get("generate_video") + audio_rel = sample.get("audio") + if not image_rel or not video_rel: + raise ValueError("Missing image or generate_video field") + + image_path = _resolve_manifest_path(image_rel, manifest_dir) + video_path = _resolve_manifest_path(video_rel, manifest_dir) + audio_path = _resolve_manifest_path(audio_rel, manifest_dir) if audio_rel else None + + if not image_path.exists(): + raise FileNotFoundError(f"Image not found: {image_path}") + if not video_path.exists(): + raise FileNotFoundError(f"Video not found: {video_path}") + + ref = cv2.imread(str(image_path)) + if ref is None: + raise RuntimeError(f"Cannot read image: {image_path}") + + frames = read_video_frames(video_path, max_frames=max_frames) + + # Resize 
for faster and more stable metric computation. + ref_m = cv2.resize(ref, (metric_size, metric_size), interpolation=cv2.INTER_AREA) + frames_m = [cv2.resize(fr, (metric_size, metric_size), interpolation=cv2.INTER_AREA) for fr in frames] + + first = frames_m[0] + ssim_first = ssim(ref_m, first) + psnr_first = psnr(ref_m, first) + sync_c = compute_sync_c(frames_m, audio_path, fps=25.0) if audio_path else 0.0 + hkc, hkv = compute_hkc_hkv(frames_m) + csim = compute_csim(ref_m, first) + + ssim_frames = [] + psnr_frames = [] + for fr in frames_m: + ssim_frames.append(ssim(ref_m, fr)) + psnr_frames.append(psnr(ref_m, fr)) + + ssim_avg = float(np.mean(ssim_frames)) + psnr_avg = float(np.mean(psnr_frames)) + + per_sample[sample_id] = { + "image": image_rel, + "generate_video": video_rel, + "num_frames_used": len(frames), + "ssim_image_vs_first_frame": ssim_first, + "psnr_image_vs_first_frame": psnr_first, + "ssim_image_vs_all_frames_avg": ssim_avg, + "psnr_image_vs_all_frames_avg": psnr_avg, + "Sync-C": sync_c, + "HKC": hkc, + "HKV": hkv, + "CSIM": csim, + } + + ssim_first_list.append(ssim_first) + psnr_first_list.append(psnr_first) + ssim_avg_frames_list.append(ssim_avg) + psnr_avg_frames_list.append(psnr_avg) + sync_c_list.append(sync_c) + hkc_list.append(hkc) + hkv_list.append(hkv) + csim_list.append(csim) + + efid_ref_feats.append(_feature_for_efid(ref_m)) + efid_gen_feats.append(_feature_for_efid(first)) + except Exception as e: + failed.append({"sample_id": sample_id, "error": str(e)}) + + efid = None + if len(efid_ref_feats) >= 2 and len(efid_gen_feats) >= 2: + efid = frechet_distance(np.stack(efid_ref_feats, axis=0), np.stack(efid_gen_feats, axis=0)) + + summary = { + "num_samples_total": len(manifest), + "num_samples_success": len(per_sample), + "num_samples_failed": len(failed), + "metrics": { + "ssim_image_vs_first_frame_mean": float(np.mean(ssim_first_list)) if ssim_first_list else None, + "psnr_image_vs_first_frame_mean": float(np.mean(psnr_first_list)) if 
psnr_first_list else None, + "ssim_image_vs_all_frames_avg_mean": float(np.mean(ssim_avg_frames_list)) if ssim_avg_frames_list else None, + "psnr_image_vs_all_frames_avg_mean": float(np.mean(psnr_avg_frames_list)) if psnr_avg_frames_list else None, + "Sync-C_mean": float(np.mean(sync_c_list)) if sync_c_list else None, + "HKC_mean": float(np.mean(hkc_list)) if hkc_list else None, + "HKV_mean": float(np.mean(hkv_list)) if hkv_list else None, + "CSIM_mean": float(np.mean(csim_list)) if csim_list else None, + "EFID_reference_vs_firstframe": efid, + }, + "unavailable_metrics": { + "FID": "Unavailable without real image/video distribution (ground-truth set).", + "FVD": "Unavailable without real video set and feature extractor pipeline for real vs generated distributions.", + "reason": "Current manifest contains prompt/image/audio/generate_video but no real video references.", + }, + "metric_notes": { + "Sync-C": "No-GT proxy via audio-energy and mouth-motion correlation.", + "HKC": "No-GT proxy via side-region hand-motion confidence.", + "HKV": "No-GT proxy via variance of hand-motion energy.", + "CSIM": "No-GT proxy identity similarity using color+texture cosine similarity.", + "EFID": "No-GT proxy Fréchet distance between reference-image and generated-first-frame handcrafted features.", + }, + } + + output = { + "config": { + "manifest_path": str(manifest_path), + "path_resolution": "Use absolute paths in manifest directly; relative paths are resolved by manifest dir.", + "max_frames": max_frames, + "metric_size": metric_size, + "note": "This is no-ground-truth evaluation. 
SSIM/PSNR are computed as image-to-video fidelity proxies.", + }, + "summary": summary, + "per_sample": per_sample, + "failed_samples": failed, + } + + with output_path.open("w", encoding="utf-8") as f: + json.dump(output, f, ensure_ascii=False, indent=2) + + return output + + +def main(): + parser = argparse.ArgumentParser(description="No-GT evaluation for S2V manifest") + parser.add_argument( + "--manifest", + type=str, + default="./s2v_manifest_with_generate_video.json", + help="Input manifest JSON path. Default: ./s2v_manifest_with_generate_video.json", + ) + parser.add_argument( + "--output", + type=str, + default="./evaluation_no_gt_metrics_s2v.json", + help="Output metrics JSON path. Default: ./evaluation_no_gt_metrics_s2v.json", + ) + parser.add_argument( + "--generated_video_dir", + type=str, + default=None, + help="Directory containing generated mp4 files. If set, evaluator will build matched manifest from --manifest.", + ) + parser.add_argument( + "--matched_manifest_output", + type=str, + default=None, + help="Output path for matched manifest with generate_video field.", + ) + parser.add_argument("--max_frames", type=int, default=120) + parser.add_argument("--metric_size", type=int, default=256) + args = parser.parse_args() + + manifest_path = Path(args.manifest) + if args.generated_video_dir: + matched = build_matched_manifest( + source_manifest_path=manifest_path, + generated_video_dir=Path(args.generated_video_dir), + ) + matched_manifest_output = Path(args.matched_manifest_output) if args.matched_manifest_output else Path( + "./s2v_manifest_with_generate_video.json" + ) + matched_manifest_output.parent.mkdir(parents=True, exist_ok=True) + matched_manifest_output.write_text(json.dumps(matched, ensure_ascii=False, indent=2), encoding="utf-8") + manifest_path = matched_manifest_output + + out = evaluate_manifest( + manifest_path=manifest_path, + output_path=Path(args.output), + max_frames=args.max_frames, + metric_size=args.metric_size, + ) + + 
print("Evaluation done.") + print(json.dumps(out["summary"], ensure_ascii=False, indent=2)) + print(f"Saved: {args.output}") + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/diffusion_model/diffusers/wan/main.py b/examples/pytorch/diffusion_model/diffusers/wan/main.py index c36ebef7177..bd22612a21a 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/main.py +++ b/examples/pytorch/diffusion_model/diffusers/wan/main.py @@ -84,12 +84,14 @@ def build_pipeline(args): if args.task == "t2v": vae = AutoencoderKLWan.from_pretrained(args.model, subfolder="vae", torch_dtype=torch.float32) pipe = WanPipeline.from_pretrained(args.model, vae=vae, torch_dtype=torch.bfloat16) - pipe.enable_model_cpu_offload() + if not args.quantize: + pipe.enable_model_cpu_offload() return pipe if args.task == "i2v": pipe = WanImageToVideoPipeline.from_pretrained(args.model, torch_dtype=torch.bfloat16) - pipe.enable_model_cpu_offload() + if not args.quantize: + pipe.enable_model_cpu_offload() return pipe raise ValueError(f"Unsupported task: {args.task}. 
#!/usr/bin/env python3
"""Build an s2v manifest from the demo dataset of a local EchoMimicV3 repo.

The dataset directory is expected to contain ``imgs/``, ``audios/`` and
``prompts/`` sub-directories. Files are matched by stem: ``imgs/a.png``,
``audios/a.wav`` and ``prompts/a.txt`` together form sample ``a``.
"""
import argparse
import json
from pathlib import Path

IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".bmp"}
# Earlier entries win when several audio files share the same stem.
AUDIO_EXT_PRIORITY = [".wav", ".mp3", ".m4a", ".flac", ".ogg"]


def load_prompts(prompt_dir: Path):
    """Return ``{stem: text}`` for every non-empty ``*.txt`` file in *prompt_dir*.

    A missing directory yields an empty mapping. Undecodable bytes are
    dropped (``errors="ignore"``) and surrounding whitespace is stripped.
    """
    prompts = {}
    if not prompt_dir.is_dir():
        return prompts
    for path in sorted(prompt_dir.glob("*.txt")):
        text = path.read_text(encoding="utf-8", errors="ignore").strip()
        if text:
            prompts[path.stem] = text
    return prompts


def build_audio_index(audio_dir: Path):
    """Return ``{stem: [paths]}`` for supported audio files in *audio_dir*.

    A missing directory (or a path that is not a directory) yields an empty
    index instead of raising from ``iterdir()``.
    """
    index = {}
    if not audio_dir.is_dir():
        return index
    for path in sorted(audio_dir.iterdir()):
        if path.is_file() and path.suffix.lower() in AUDIO_EXT_PRIORITY:
            index.setdefault(path.stem, []).append(path)
    return index


def pick_audio(paths):
    """Return the path with the most preferred extension, or ``None`` if empty.

    Preference follows ``AUDIO_EXT_PRIORITY``; the first path wins on ties.
    Paths with an unlisted extension rank last instead of raising.
    """

    def rank(path):
        suffix = path.suffix.lower()
        if suffix in AUDIO_EXT_PRIORITY:
            return AUDIO_EXT_PRIORITY.index(suffix)
        return len(AUDIO_EXT_PRIORITY)

    return min(paths, key=rank, default=None)


def build_manifest(dataset_dir: Path):
    """Pair every image with its prompt and audio file.

    Args:
        dataset_dir: Directory containing ``imgs/``, ``audios/``, ``prompts/``.

    Returns:
        ``(manifest, skipped)`` where ``manifest`` maps the image stem to
        ``{"prompt", "image", "audio"}`` (absolute paths as strings) and
        ``skipped`` lists ``{"id", "reasons"}`` for images lacking a prompt
        and/or an audio file.

    Raises:
        FileNotFoundError: if ``imgs/`` is missing or is not a directory.
    """
    dataset_dir = Path(dataset_dir).resolve()
    img_dir = dataset_dir / "imgs"
    if not img_dir.is_dir():
        raise FileNotFoundError(f"Missing image directory: {img_dir}")

    prompts = load_prompts(dataset_dir / "prompts")
    audio_index = build_audio_index(dataset_dir / "audios")

    manifest = {}
    skipped = []
    # Sorted iteration keeps the output deterministic. If two images share a
    # stem (a.jpg / a.png) the later one replaces the earlier entry.
    for image_path in sorted(img_dir.iterdir()):
        if not image_path.is_file() or image_path.suffix.lower() not in IMAGE_EXTS:
            continue

        sample_id = image_path.stem
        prompt = prompts.get(sample_id)
        audio_path = pick_audio(audio_index.get(sample_id, []))

        reasons = []
        if not prompt:
            reasons.append("missing_prompt")
        if audio_path is None:
            reasons.append("missing_audio")
        if reasons:
            skipped.append({"id": sample_id, "reasons": reasons})
            continue

        manifest[sample_id] = {
            "prompt": prompt,
            "image": str(image_path.resolve()),
            "audio": str(audio_path.resolve()),
        }

    return manifest, skipped


def parse_args(argv=None):
    """Parse command-line options (``argv=None`` reads ``sys.argv``)."""
    parser = argparse.ArgumentParser(
        description="Build s2v manifest from a local EchoMimicV3 repo"
    )
    parser.add_argument(
        "--repo-dir",
        required=True,
        help="Local path for EchoMimicV3 repo (must already exist).",
    )
    parser.add_argument(
        "--manifest-out",
        required=True,
        help="Output manifest path.",
    )
    return parser.parse_args(argv)


def main(argv=None):
    """Write the manifest JSON and print a JSON summary of the run."""
    args = parse_args(argv)

    repo_dir = Path(args.repo_dir).resolve()
    if not repo_dir.is_dir():
        raise FileNotFoundError(f"Repo directory not found: {repo_dir}")

    dataset_dir = (repo_dir / "datasets" / "echomimicv3_demos").resolve()
    if not dataset_dir.is_dir():
        raise FileNotFoundError(f"Dataset directory not found: {dataset_dir}")

    manifest, skipped = build_manifest(dataset_dir=dataset_dir)

    out_path = Path(args.manifest_out).resolve()
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=4), encoding="utf-8")

    summary = {
        "repo_dir": str(repo_dir),
        "dataset_dir": str(dataset_dir),
        "manifest_out": str(out_path),
        "total_samples": len(manifest),
        "skipped_samples": len(skipped),
        "first_skipped": skipped[:10],
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
similarity index 99% rename from examples/pytorch/diffusion_model/diffusers/wan/requirements.txt rename to examples/pytorch/diffusion_model/diffusers/wan/requirements_i2v_t2v.txt index dd0a3842c04..8fbd0a14215 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt +++ b/examples/pytorch/diffusion_model/diffusers/wan/requirements_i2v_t2v.txt @@ -41,3 +41,4 @@ omegaconf pycocoevalcap imageio-ffmpeg gdown==4.7.3 + diff --git a/examples/pytorch/diffusion_model/diffusers/wan/requirements_s2v.txt b/examples/pytorch/diffusion_model/diffusers/wan/requirements_s2v.txt new file mode 100644 index 00000000000..1f4ee5df9e7 --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/wan/requirements_s2v.txt @@ -0,0 +1,41 @@ +# Core runtime for wan_s2v.py +numpy +torch +torchvision +transformers +accelerate +huggingface_hub +safetensors + +# Quantization stack used by s2v activation QDQ +neural-compressor-pt +auto-round + +# Utilities +pillow +einops +requests + +# S2V evaluation metrics (evaluate_manifest_no_gt.py) +opencv-python-headless +scipy + +# Wan2.2 s2v runtime dependencies +openai-whisper +HyperPyYAML +onnxruntime +inflect +wetext +omegaconf +conformer +hydra-core +lightning +rich +gdown +matplotlib +wget +pyarrow +pyworld +librosa +decord +modelscope diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh index 1ea381f547a..df53e856138 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh @@ -1,61 +1,53 @@ #!/bin/bash set -x -SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) - function main { init_params "$@" run_benchmark } function ensure_vbench_repo { - if [ ! -d "${vbench_dir}" ]; then - echo "VBench directory not found. Start cloning https://github.com/Vchitect/VBench.git ..." - git clone https://github.com/Vchitect/VBench.git "${vbench_dir}" - if [ $? 
-ne 0 ]; then - echo "Error: failed to clone VBench." - exit 1 - fi + if [ -d "${vbench_dir}" ]; then + return fi + + echo "Error: VBench directory not found: ${vbench_dir}" + echo "Please prepare VBench manually and pass --vbench_dir=/path/to/VBench if needed." + exit 1 } -function prepare_vbench_inputs { - if [ "${task}" = "t2v" ]; then - if [ -z "${prompt_folder}" ]; then - echo "Error: --prompt_folder is required for task=t2v" - exit 1 - fi - if [ -z "${dimension}" ]; then - echo "Error: --dimension is required for task=t2v" - exit 1 - fi - fi +function ensure_vbench_data { + local prompt_root="${vbench_dir}/prompts/prompts_per_dimension" + local i2v_image_root="${vbench_dir}/vbench2_beta_i2v/data/crop/16-9" + local i2v_info_file="${vbench_dir}/vbench2_beta_i2v/vbench2_i2v_full_info.json" - if [ "${task}" = "i2v" ]; then - if [ -z "${image_folder}" ]; then - echo "Error: --image_folder is required for task=i2v" - exit 1 - fi - if [ -z "${info_json}" ]; then - echo "Error: --info_json is required for task=i2v" - exit 1 - fi - if [ -z "${dimension}" ]; then - echo "Error: --dimension is required for task=i2v" - exit 1 - fi + if [ -d "${prompt_root}" ] && [ -d "${i2v_image_root}" ] && [ -f "${i2v_info_file}" ]; then + return fi - if [ -n "${prompt_folder}" ] && [ ! -d "${prompt_folder}" ]; then - echo "Error: prompt_folder not found: ${prompt_folder}" - exit 1 + echo "Error: VBench data is incomplete under ${vbench_dir}." + echo "Please prepare VBench data manually (for example run vbench2_beta_i2v/download_data.sh in your VBench repo)." + exit 1 +} + +function ensure_wan_repo { + if [ -d "${wan_dir}" ]; then + return fi - if [ -n "${image_folder}" ] && [ ! -d "${image_folder}" ]; then - echo "Error: image_folder not found: ${image_folder}" + + echo "Error: Wan2.2 directory not found: ${wan_dir}" + echo "Please prepare Wan2.2 manually and pass --wan_dir=/path/to/Wan2.2 if needed." 
+ exit 1 +} + +function ensure_s2v_manifest { + if [ -z "${manifest_path}" ]; then + echo "Error: --manifest_path is required for task=s2v" exit 1 fi - if [ -n "${info_json}" ] && [ ! -f "${info_json}" ]; then - echo "Error: info_json not found: ${info_json}" + + if [ ! -f "${manifest_path}" ]; then + echo "Error: manifest_path not found: ${manifest_path}" exit 1 fi } @@ -87,6 +79,14 @@ function init_params { task="$2" shift 2 ;; + --quantized_model=*) + tuned_checkpoint="${1#*=}" + shift + ;; + --quantized_model) + tuned_checkpoint="$2" + shift 2 + ;; --output_video_path=*) output_video_path="${1#*=}" shift @@ -127,6 +127,30 @@ function init_params { dimension="$2" shift 2 ;; + --manifest_path=*) + manifest_path="${1#*=}" + shift + ;; + --manifest_path) + manifest_path="$2" + shift 2 + ;; + --wan_dir=*) + wan_dir="${1#*=}" + shift + ;; + --wan_dir) + wan_dir="$2" + shift 2 + ;; + --vbench_dir=*) + vbench_dir="${1#*=}" + shift + ;; + --vbench_dir) + vbench_dir="$2" + shift 2 + ;; --gpu_ids=*) gpu_ids="${1#*=}" shift @@ -159,12 +183,12 @@ function init_params { accuracy=true shift ;; - --vbench_dir=*) - vbench_dir="${1#*=}" + --s2v_eval_output=*) + s2v_eval_output="${1#*=}" shift ;; - --vbench_dir) - vbench_dir="$2" + --s2v_eval_output) + s2v_eval_output="$2" shift 2 ;; *) @@ -178,10 +202,14 @@ function init_params { function run_benchmark { task=${task:="t2v"} limit=${limit:=-1} + tuned_checkpoint=${tuned_checkpoint:="./tmp_autoround"} output_video_path=${output_video_path:="./tmp_video"} accuracy=${accuracy:=false} disable_mxfp8_inplace_qdq=${disable_mxfp8_inplace_qdq:=false} - vbench_dir=${vbench_dir:="${SCRIPT_DIR}/VBench"} + s2v_eval_output=${s2v_eval_output:=""} + script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) + wan_dir=${wan_dir:="${script_dir}/Wan2.2"} + vbench_dir=${vbench_dir:="${script_dir}/VBench"} if [[ ! 
"${output_video_path}" = /* ]]; then output_video_path=$(realpath -s "$(pwd)/${output_video_path}") @@ -198,12 +226,27 @@ function run_benchmark { exit 1 fi - ensure_vbench_repo + if [ "${task}" != "s2v" ]; then + ensure_vbench_repo + ensure_vbench_data + if [ "${task}" = "t2v" ]; then + prompt_folder=${prompt_folder:="${vbench_dir}/prompts/prompts_per_dimension"} + fi + if [ "${task}" = "i2v" ]; then + image_folder=${image_folder:="${vbench_dir}/vbench2_beta_i2v/data/crop/16-9"} + info_json=${info_json:="${vbench_dir}/vbench2_beta_i2v/vbench2_i2v_full_info.json"} + fi + else + ensure_wan_repo + ensure_s2v_manifest + fi - prepare_vbench_inputs + if [ "${task}" != "s2v" ] && [ -z "${dimension}" ]; then + echo "Error: --dimension is required for task=${task}" + exit 1 + fi - normalized_dimensions="${dimension//,/ }" - read -r -a dimension_list <<< "${normalized_dimensions}" + mkdir -p "${output_video_path}" if [ -n "${gpu_ids}" ]; then gpu_list="${gpu_ids}" @@ -220,107 +263,175 @@ function run_benchmark { gpu_array=() fi - mkdir -p "${output_video_path}" - shard_tmp_root="${output_video_path}/.prompt_shards" - - function build_benchmark_cmd { - local cur_prompt_folder="$2" - local cur_info_json="$3" - local cmd=( - python3 main.py - --model "${input_model}" - --task "${task}" - --scheme "${scheme}" - --output_video_path "${output_video_path}" - --limit "${limit}" - --inference - ) - - if [ -n "${cur_prompt_folder}" ]; then - cmd+=(--prompt_folder "${cur_prompt_folder}") - elif [ -n "${prompt_folder}" ]; then - cmd+=(--prompt_folder "${prompt_folder}") - fi - if [ -n "${image_folder}" ]; then - cmd+=(--image_folder "${image_folder}") - fi - if [ -n "${cur_info_json}" ]; then - cmd+=(--info_json "${cur_info_json}") - elif [ -n "${info_json}" ]; then - cmd+=(--info_json "${info_json}") - fi - if [ -n "$1" ]; then - cmd+=(--dimension "$1") - fi - if [ -n "${mxfp8_chunk_rows}" ]; then - cmd+=(--mxfp8_chunk_rows "${mxfp8_chunk_rows}") - fi - if [ 
"${disable_mxfp8_inplace_qdq}" = "true" ]; then - cmd+=(--disable_mxfp8_inplace_qdq) - fi + if [ "${task}" = "s2v" ]; then + function build_s2v_cmd { + local cur_manifest_path="$1" + local cmd=( + env "PYTHONPATH=${wan_dir}:${PYTHONPATH}" python3 wan_s2v.py + --model "${input_model}" + --task "s2v-14B" + --scheme "${scheme}" + --output_video_path "${output_video_path}" + --manifest_path "${cur_manifest_path}" + --inference + ) - printf '%q ' "${cmd[@]}" - } + if [ "${scheme}" != "BF16" ]; then + cmd+=(--quantized_model "${tuned_checkpoint}") + fi + + if [ -n "${mxfp8_chunk_rows}" ]; then + cmd+=(--mxfp8_chunk_rows "${mxfp8_chunk_rows}") + fi + if [ "${disable_mxfp8_inplace_qdq}" = "true" ]; then + cmd+=(--disable_mxfp8_inplace_qdq) + fi - if [ ${#gpu_array[@]} -eq 0 ]; then - if [ ${#dimension_list[@]} -eq 0 ]; then - eval "$(build_benchmark_cmd "" "" "")" + printf "%q " "${cmd[@]}" + } + + if [ ${#gpu_array[@]} -eq 0 ]; then + run_cmd="$(build_s2v_cmd "${manifest_path}")" + eval "${run_cmd}" else - for cur_dimension in "${dimension_list[@]}"; do - eval "$(build_benchmark_cmd "${cur_dimension}" "" "")" - done - fi - else - if [ ${#dimension_list[@]} -eq 0 ]; then - echo "Error: multi-GPU sharding requires --dimension" - exit 1 - fi + num_shards=${#gpu_array[@]} + s2v_shard_root="${output_video_path}/.manifest_shards" + rm -rf "${s2v_shard_root}" - num_shards=${#gpu_array[@]} - for cur_dimension in "${dimension_list[@]}"; do - dim_shard_root="${shard_tmp_root}/${cur_dimension}" - rm -rf "${dim_shard_root}" - if [ "${task}" = "t2v" ]; then - prompt_file="${prompt_folder}/${cur_dimension}.txt" - python3 split_t2v_prompts.py \ - --prompt_file "${prompt_file}" \ - --num_shards "${num_shards}" \ - --output_root "${dim_shard_root}" - else - python3 split_i2v_info.py \ - --info_json "${info_json}" \ - --dimension "${cur_dimension}" \ - --num_shards "${num_shards}" \ - --output_root "${dim_shard_root}" - fi + python3 split_s2v_manifest.py \ + --manifest_path 
"${manifest_path}" \ + --num_shards "${num_shards}" \ + --output_root "${s2v_shard_root}" program_pid=() for shard_id in "${!gpu_array[@]}"; do gpu_id="${gpu_array[$shard_id]}" - log_suffix="${cur_dimension}" - if [ -z "${log_suffix}" ]; then - log_suffix="all" + shard_manifest_path="${s2v_shard_root}/shard_${shard_id}/manifest.json" + if [ ! -f "${shard_manifest_path}" ]; then + echo "Skip empty shard_${shard_id} on GPU ${gpu_id}" + continue fi - log_file="${output_video_path}/${log_suffix}.gpu${gpu_id}.log" - shard_prompt_folder="" - shard_info_json="" + log_file="${output_video_path}/s2v.gpu${gpu_id}.log" + run_cmd="$(build_s2v_cmd "${shard_manifest_path}")" + CUDA_VISIBLE_DEVICES="${gpu_id}" bash -lc "${run_cmd}" > "${log_file}" 2>&1 & + program_pid+=("$!") + echo "Start (PID: ${program_pid[-1]}, GPU: ${gpu_id}, shard: ${shard_id})" + done + if [ ${#program_pid[@]} -eq 0 ]; then + echo "Error: no non-empty s2v shards to run. Check --manifest_path content." + exit 1 + fi + + for pid in "${program_pid[@]}"; do + wait "${pid}" || exit 1 + done + fi + else + normalized_dimensions="${dimension//,/ }" + read -r -a dimension_list <<< "${normalized_dimensions}" + + shard_tmp_root="${output_video_path}/.prompt_shards" + + function build_benchmark_cmd { + local cur_prompt_folder="$2" + local cur_info_json="$3" + local cmd=( + python3 main.py + --model "${input_model}" + --task "${task}" + --scheme "${scheme}" + --output_video_path "${output_video_path}" + --limit "${limit}" + --inference + ) + + if [ -n "${cur_prompt_folder}" ]; then + cmd+=(--prompt_folder "${cur_prompt_folder}") + elif [ -n "${prompt_folder}" ]; then + cmd+=(--prompt_folder "${prompt_folder}") + fi + if [ -n "${image_folder}" ]; then + cmd+=(--image_folder "${image_folder}") + fi + if [ -n "${cur_info_json}" ]; then + cmd+=(--info_json "${cur_info_json}") + elif [ -n "${info_json}" ]; then + cmd+=(--info_json "${info_json}") + fi + if [ -n "$1" ]; then + cmd+=(--dimension "$1") + fi + if [ -n 
"${mxfp8_chunk_rows}" ]; then + cmd+=(--mxfp8_chunk_rows "${mxfp8_chunk_rows}") + fi + if [ "${disable_mxfp8_inplace_qdq}" = "true" ]; then + cmd+=(--disable_mxfp8_inplace_qdq) + fi + + printf "%q " "${cmd[@]}" + } + + if [ ${#gpu_array[@]} -eq 0 ]; then + if [ ${#dimension_list[@]} -eq 0 ]; then + eval "$(build_benchmark_cmd "" "" "")" + else + for cur_dimension in "${dimension_list[@]}"; do + eval "$(build_benchmark_cmd "${cur_dimension}" "" "")" + done + fi + else + if [ ${#dimension_list[@]} -eq 0 ]; then + echo "Error: multi-GPU sharding requires --dimension" + exit 1 + fi + + num_shards=${#gpu_array[@]} + for cur_dimension in "${dimension_list[@]}"; do + dim_shard_root="${shard_tmp_root}/${cur_dimension}" + rm -rf "${dim_shard_root}" if [ "${task}" = "t2v" ]; then - shard_prompt_folder="${dim_shard_root}/shard_${shard_id}" + prompt_file="${prompt_folder}/${cur_dimension}.txt" + python3 split_t2v_prompts.py \ + --prompt_file "${prompt_file}" \ + --num_shards "${num_shards}" \ + --output_root "${dim_shard_root}" else - shard_info_json="${dim_shard_root}/shard_${shard_id}/info.json" + python3 split_i2v_info.py \ + --info_json "${info_json}" \ + --dimension "${cur_dimension}" \ + --num_shards "${num_shards}" \ + --output_root "${dim_shard_root}" fi - cmd="$(build_benchmark_cmd "${cur_dimension}" "${shard_prompt_folder}" "${shard_info_json}")" - CUDA_VISIBLE_DEVICES="${gpu_id}" bash -lc "${cmd}" > "${log_file}" 2>&1 & - program_pid+=("$!") - echo "Start (PID: ${program_pid[-1]}, GPU: ${gpu_id}, dimension: ${cur_dimension})" - done + program_pid=() + for shard_id in "${!gpu_array[@]}"; do + gpu_id="${gpu_array[$shard_id]}" + log_suffix="${cur_dimension}" + if [ -z "${log_suffix}" ]; then + log_suffix="all" + fi + log_file="${output_video_path}/${log_suffix}.gpu${gpu_id}.log" + shard_prompt_folder="" + shard_info_json="" - for pid in "${program_pid[@]}"; do - wait "${pid}" || exit 1 + if [ "${task}" = "t2v" ]; then + 
shard_prompt_folder="${dim_shard_root}/shard_${shard_id}" + else + shard_info_json="${dim_shard_root}/shard_${shard_id}/info.json" + fi + + run_cmd="$(build_benchmark_cmd "${cur_dimension}" "${shard_prompt_folder}" "${shard_info_json}")" + CUDA_VISIBLE_DEVICES="${gpu_id}" bash -lc "${run_cmd}" > "${log_file}" 2>&1 & + program_pid+=("$!") + echo "Start (PID: ${program_pid[-1]}, GPU: ${gpu_id}, dimension: ${cur_dimension})" + done + + for pid in "${program_pid[@]}"; do + wait "${pid}" || exit 1 + done done - done + fi fi if [ "${accuracy}" = "true" ]; then @@ -330,7 +441,7 @@ function run_benchmark { python evaluate.py \ --dimension "subject_consistency motion_smoothness aesthetic_quality imaging_quality overall_consistency" \ --videos_path "${output_video_path}" \ - --mode=vbench_standard + --mode=vbench_standard popd elif [ "${task}" = "i2v" ]; then echo "Start VBench evaluation for i2v..." @@ -341,12 +452,42 @@ function run_benchmark { --ratio "16-9" \ --mode=vbench_standard popd + elif [ "${task}" = "s2v" ]; then + echo "Start s2v evaluation..." + s2v_eval_script="${script_dir}/evaluate_manifest_no_gt.py" + s2v_eval_manifest="${output_video_path}/s2v_manifest_with_generate_video.json" + if [ ! -f "${s2v_eval_script}" ]; then + echo "Error: s2v evaluation script not found: ${s2v_eval_script}" + exit 1 + fi + if [ -z "${s2v_eval_output}" ]; then + s2v_eval_output="${output_video_path}/evaluation_no_gt_metrics_s2v.json" + fi + + eval_cmd=( + python3 "${s2v_eval_script}" + --manifest "${manifest_path}" + --generated_video_dir "${output_video_path}" + --matched_manifest_output "${s2v_eval_manifest}" + --output "${s2v_eval_output}" + --max_frames "32" + --metric_size "192" + ) + printf "%q " "${eval_cmd[@]}" && echo + "${eval_cmd[@]}" + + echo "S2V evaluation finished." + echo "- matched manifest: ${s2v_eval_manifest}" + echo "- metrics output: ${s2v_eval_output}" else - echo "--accuracy does not support task=${task}. Supported tasks: t2v, i2v." 
- exit 1 + echo "--accuracy currently does not support task=${task}. Generated videos are saved at ${output_video_path}." fi else - echo "Video generation finished. Use --accuracy to run VBench evaluation for t2v/i2v." + if [ "${task}" = "s2v" ]; then + echo "S2V generation finished. Videos are in ${output_video_path}." + else + echo "Video generation finished. Use --accuracy to run VBench evaluation for t2v/i2v." + fi fi } diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh index ae1ff41e1bb..01420acebf9 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh +++ b/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh @@ -6,6 +6,16 @@ function main { run_tuning } +function ensure_wan_repo { + if [ -d "${wan_dir}" ]; then + return + fi + + echo "Error: Wan2.2 directory not found: ${wan_dir}" + echo "Please prepare Wan2.2 manually and pass --wan_dir=/path/to/Wan2.2 if needed." + exit 1 +} + function init_params { for var in "$@" do @@ -22,6 +32,9 @@ function init_params { --output_model=*) tuned_checkpoint=$(echo $var | cut -f2 -d=) ;; + --wan_dir=*) + wan_dir=$(echo $var | cut -f2 -d=) + ;; *) echo "Error: No such parameter: ${var}" exit 1 @@ -33,22 +46,34 @@ function init_params { function run_tuning { tuned_checkpoint=${tuned_checkpoint:="./tmp_autoround"} task=${task:="t2v"} + script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) + wan_dir=${wan_dir:="${script_dir}/Wan2.2"} if [ "${topology}" = "wan_fp8" ]; then - extra_cmd="--scheme FP8" + scheme="FP8" elif [ "${topology}" = "wan_mxfp8" ]; then - extra_cmd="--scheme MXFP8" + scheme="MXFP8" else echo "Error: unsupported topology ${topology}, use wan_fp8 or wan_mxfp8" exit 1 fi - python3 main.py \ - --model ${input_model} \ - --task ${task} \ - --output_dir ${tuned_checkpoint} \ - --quantize \ - ${extra_cmd} + if [ "${task}" = "s2v" ]; then + ensure_wan_repo + env "PYTHONPATH=${wan_dir}:${PYTHONPATH}" 
python3 wan_s2v.py \ + --model ${input_model} \ + --task s2v-14B \ + --scheme ${scheme} \ + --quantize \ + --output_dir ${tuned_checkpoint} + else + python3 main.py \ + --model ${input_model} \ + --task ${task} \ + --scheme ${scheme} \ + --quantize \ + --output_dir ${tuned_checkpoint} + fi } main "$@" diff --git a/examples/pytorch/diffusion_model/diffusers/wan/setup.sh b/examples/pytorch/diffusion_model/diffusers/wan/setup.sh index c9f9700dbbc..da60a112471 100644 --- a/examples/pytorch/diffusion_model/diffusers/wan/setup.sh +++ b/examples/pytorch/diffusion_model/diffusers/wan/setup.sh @@ -1,2 +1,53 @@ -pip install --no-cache-dir -r requirements.txt -pip install VBench --no-deps +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +task="" + +usage() { + cat <<'USAGE' +Usage: bash setup.sh --task t2v|i2v|s2v +USAGE +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --task) + task="${2:-}" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Error: unknown argument: $1" + usage + exit 1 + ;; + esac +done + +if [[ -z "$task" ]]; then + echo "Error: --task is required" + usage + exit 1 +fi + +if [[ "$task" == "s2v" ]]; then + req_file="${SCRIPT_DIR}/requirements_s2v.txt" +elif [[ "$task" == "t2v" || "$task" == "i2v" ]]; then + req_file="${SCRIPT_DIR}/requirements_i2v_t2v.txt" +else + echo "Error: unsupported task: $task" + usage + exit 1 +fi + +pip install --no-cache-dir -r "$req_file" + +if [[ "$task" == "t2v" || "$task" == "i2v" ]]; then + pip install --no-cache-dir VBench --no-deps +fi + +echo "Setup completed for task: $task" diff --git a/examples/pytorch/diffusion_model/diffusers/wan/split_s2v_manifest.py b/examples/pytorch/diffusion_model/diffusers/wan/split_s2v_manifest.py new file mode 100644 index 00000000000..b0223f95dee --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/wan/split_s2v_manifest.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +import argparse +import json +import os + + +def 
def parse_args(argv=None):
    """Parse command-line options (``argv=None`` reads ``sys.argv``)."""
    parser = argparse.ArgumentParser(description="Split s2v manifest into per-shard JSON files.")
    parser.add_argument("--manifest_path", required=True, type=str, help="Path to full s2v manifest JSON")
    parser.add_argument("--num_shards", required=True, type=int, help="Total shard count")
    parser.add_argument("--output_root", required=True, type=str, help="Root dir to write shard JSON files")
    return parser.parse_args(argv)


def _check_num_shards(num_shards):
    """Reject shard counts that would make round-robin assignment fail."""
    if num_shards < 1:
        raise ValueError("--num_shards must be >= 1")


def split_list_items(items, num_shards):
    """Distribute *items* round-robin into ``num_shards`` lists.

    Item ``i`` goes to bucket ``i % num_shards``; buckets may be empty.

    Raises:
        ValueError: if ``num_shards`` is smaller than 1.
    """
    _check_num_shards(num_shards)
    sequence = list(items)
    return [sequence[offset::num_shards] for offset in range(num_shards)]


def split_dict_items(items, num_shards):
    """Distribute the entries of *items* round-robin into ``num_shards`` dicts.

    Keys keep their original order inside each bucket; buckets may be empty.

    Raises:
        ValueError: if ``num_shards`` is smaller than 1.
    """
    _check_num_shards(num_shards)
    keys = list(items)
    return [{key: items[key] for key in keys[offset::num_shards]} for offset in range(num_shards)]


def main(argv=None):
    """Split the manifest and write ``shard_<i>/manifest.json`` per non-empty shard.

    Shard files are always JSON objects. A list manifest is keyed by the
    original list position (as a string) before splitting, so a sample keeps
    the same id whether it is processed from the full manifest or from a
    shard; positional ids restarting at 0 in every shard would otherwise
    collide across shards.
    """
    args = parse_args(argv)

    _check_num_shards(args.num_shards)
    if not os.path.isfile(args.manifest_path):
        raise FileNotFoundError(f"Manifest file not found: {args.manifest_path}")

    with open(args.manifest_path, "r", encoding="utf-8") as f:
        manifest = json.load(f)

    if isinstance(manifest, list):
        indexed = {str(position): item for position, item in enumerate(manifest)}
        shard_buckets = split_dict_items(indexed, args.num_shards)
    elif isinstance(manifest, dict):
        shard_buckets = split_dict_items(manifest, args.num_shards)
    else:
        raise ValueError("Manifest must be a JSON object or list")

    os.makedirs(args.output_root, exist_ok=True)
    written_shards = 0
    for shard_id, shard_manifest in enumerate(shard_buckets):
        # Empty shards get no directory; callers detect them by the missing file.
        if not shard_manifest:
            continue
        shard_dir = os.path.join(args.output_root, f"shard_{shard_id}")
        os.makedirs(shard_dir, exist_ok=True)
        shard_manifest_path = os.path.join(shard_dir, "manifest.json")
        with open(shard_manifest_path, "w", encoding="utf-8") as f:
            json.dump(shard_manifest, f, ensure_ascii=False, indent=2)
        written_shards += 1

    total_count = len(manifest)
    print(
        f"Split {total_count} s2v samples into {written_shards} non-empty shards "
        f"(requested {args.num_shards}) under {args.output_root}"
    )
into {written_shards} non-empty shards " + f"(requested {args.num_shards}) under {args.output_root}" + ) + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/diffusion_model/diffusers/wan/wan_s2v.py b/examples/pytorch/diffusion_model/diffusers/wan/wan_s2v.py new file mode 100644 index 00000000000..75ee40fbf32 --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/wan/wan_s2v.py @@ -0,0 +1,310 @@ +import argparse +import json +import logging +import os +import re +from functools import partial +from pathlib import Path + +import torch +from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare + +import wan +from auto_round.data_type.fp8 import quant_fp8_sym +from auto_round.data_type.mxfp import quant_mx_rceil +from auto_round.utils import get_block_names, get_module +from wan.configs import MAX_AREA_CONFIGS, WAN_CONFIGS +from wan.utils.utils import merge_video_audio, save_video + + +def parse_args(): + parser = argparse.ArgumentParser(description="Wan s2v quantization and inference example") + parser.add_argument("--model", required=True, type=str, help="Wan S2V checkpoint directory") + parser.add_argument("--task", default="s2v-14B", choices=["s2v-14B"], type=str) + parser.add_argument("--scheme", default="BF16", choices=["BF16", "FP8", "MXFP8"], type=str) + parser.add_argument("--quantize", action="store_true", help="Quantize Wan S2V noise model with AutoRound") + parser.add_argument("--inference", action="store_true", help="Run S2V inference") + parser.add_argument("--output_dir", "--quantized_model", default="./tmp_autoround_s2v", type=str, help="Output dir for quantized model") + + parser.add_argument("--output_video_path", default="./wan_s2v_video", type=str) + parser.add_argument("--manifest_path", default=None, type=str, help="Path to JSON with prompt/image/audio samples") + + parser.add_argument("--prompt", default=None, type=str) + parser.add_argument("--image", default=None, type=str) + 
def parse_args():
    """Parse the command-line options of the Wan s2v example."""
    parser = argparse.ArgumentParser(description="Wan s2v quantization and inference example")
    parser.add_argument("--model", required=True, type=str, help="Wan S2V checkpoint directory")
    parser.add_argument("--task", default="s2v-14B", choices=["s2v-14B"], type=str)
    parser.add_argument("--scheme", default="BF16", choices=["BF16", "FP8", "MXFP8"], type=str)
    parser.add_argument("--quantize", action="store_true", help="Quantize Wan S2V noise model with AutoRound")
    parser.add_argument("--inference", action="store_true", help="Run S2V inference")
    parser.add_argument("--output_dir", "--quantized_model", default="./tmp_autoround_s2v", type=str, help="Output dir for quantized model")

    parser.add_argument("--output_video_path", default="./wan_s2v_video", type=str)
    parser.add_argument("--manifest_path", default=None, type=str, help="Path to JSON with prompt/image/audio samples")

    # Fallbacks used for manifest entries that omit the corresponding key.
    parser.add_argument("--prompt", default=None, type=str)
    parser.add_argument("--image", default=None, type=str)
    parser.add_argument("--audio", default=None, type=str)

    parser.add_argument("--size", default="1280*720", type=str)
    parser.add_argument("--infer_frames", default=80, type=int)
    parser.add_argument("--num_clip", default=None, type=int)

    # Sampling options left at None are filled from the task config in main().
    parser.add_argument("--sample_solver", default="unipc", choices=["unipc", "dpm++"], type=str)
    parser.add_argument("--sample_steps", default=None, type=int)
    parser.add_argument("--sample_shift", default=None, type=float)
    parser.add_argument("--sample_guide_scale", default=None, type=float)
    parser.add_argument("--seed", default=42, type=int)

    parser.add_argument("--enable_tts", action="store_true")
    parser.add_argument("--tts_prompt_audio", default=None, type=str)
    parser.add_argument("--tts_prompt_text", default=None, type=str)
    parser.add_argument("--tts_text", default=None, type=str)
    parser.add_argument("--pose_video", default=None, type=str)
    parser.add_argument("--start_from_ref", action="store_true")
    parser.add_argument("--offload_model", action="store_true")

    parser.add_argument("--mxfp8_chunk_rows", default=2048, type=int)
    parser.add_argument("--disable_mxfp8_inplace_qdq", action="store_true")
    return parser.parse_args()


def setup_logging():
    """Configure root logging at INFO level with a timestamped format."""
    logging.basicConfig(level=logging.INFO, format="[%(asctime)s] %(levelname)s: %(message)s")


def sanitize_filename(text):
    """Turn *text* into a file-name-safe token of at most 80 characters.

    Runs of characters outside ``[0-9a-zA-Z._-]`` collapse to one underscore
    and leading/trailing underscores are removed. Empty input, or input that
    sanitizes to nothing, yields ``"sample"``.
    """
    if not text:
        return "sample"
    clean = re.sub(r"[^0-9a-zA-Z._-]+", "_", text).strip("_")
    return clean[:80] if clean else "sample"


def build_samples(args):
    """Load ``args.manifest_path`` and return the usable samples.

    The manifest may be a JSON object (``{id: sample}``) or a list, in which
    case the list position is used as id. Keys missing from a sample fall
    back to ``args.prompt`` / ``args.image`` / ``args.audio``. Non-dict
    entries are ignored; entries without prompt, image or audio are skipped
    with a warning.

    Returns:
        A list of ``{"id", "prompt", "image", "audio"}`` dicts.

    Raises:
        ValueError: no manifest path given, unsupported JSON root type, or no
            valid sample left.
        FileNotFoundError: the manifest file does not exist.
    """
    if not args.manifest_path:
        raise ValueError("S2V requires --manifest_path")

    manifest_path = Path(args.manifest_path)
    if not manifest_path.exists():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    with manifest_path.open("r", encoding="utf-8") as f:
        manifest = json.load(f)

    if isinstance(manifest, dict):
        iterator = manifest.items()
    elif isinstance(manifest, list):
        iterator = ((str(i), item) for i, item in enumerate(manifest))
    else:
        raise ValueError("Manifest must be a JSON object or list")

    entries = []
    for sample_id, sample in iterator:
        if not isinstance(sample, dict):
            continue
        prompt = sample.get("prompt", args.prompt)
        image = sample.get("image", args.image)
        audio = sample.get("audio", args.audio)

        if not prompt or not image or not audio:
            logging.warning("Skip sample %s: missing prompt/image/audio", sample_id)
            continue
        entries.append({"id": str(sample_id), "prompt": prompt, "image": image, "audio": audio})

    if not entries:
        raise ValueError("No valid samples found in manifest")
    return entries


def _patch_forward(module, act_qdq_forward):
    """Route ``module.forward`` through *act_qdq_forward*, at most once."""
    # Patching twice would make orig_forward point at the wrapper itself and
    # recurse forever on the first call.
    if getattr(module, "_act_qdq_patched", False):
        return
    module.orig_forward = module.forward
    module.forward = partial(act_qdq_forward, module)
    module._act_qdq_patched = True


def apply_activation_qdq(model, args):
    """Insert activation quantize-dequantize in front of ``Linear`` layers.

    * ``BF16``: nothing is changed.
    * ``FP8``: every module whose class is named ``Linear`` is patched and
      its input goes through ``quant_fp8_sym``.
    * ``MXFP8``: only ``Linear`` modules inside the blocks reported by
      ``get_block_names`` are patched and their input goes through
      ``quant_mx_rceil``. On CUDA tensors, unless
      ``--disable_mxfp8_inplace_qdq`` is set, the input is overwritten in
      chunks of ``--mxfp8_chunk_rows`` rows.

    The original forward is kept on the module as ``orig_forward``. Calling
    this function again on an already patched model is a no-op.
    """
    if args.scheme == "BF16":
        return

    if args.scheme == "FP8":
        logging.info("Enable FP8 activation QDQ for S2V linear layers")

        def fp8_forward(module, x, *fwd_args, **fwd_kwargs):
            qdq_x, _, _ = quant_fp8_sym(x, group_size=0)
            return module.orig_forward(qdq_x, *fwd_args, **fwd_kwargs)

        for _, module in model.named_modules():
            if module.__class__.__name__ == "Linear":
                _patch_forward(module, fp8_forward)
        return

    use_inplace = not args.disable_mxfp8_inplace_qdq
    chunk_rows = max(1, int(args.mxfp8_chunk_rows))
    logging.info(
        "Enable MXFP8 activation QDQ (inplace=%s, chunk_rows=%s)",
        use_inplace,
        args.mxfp8_chunk_rows,
    )

    def mxfp8_forward(module, x, *fwd_args, **fwd_kwargs):
        if use_inplace and x.is_cuda:
            x_2d = x.reshape(-1, x.shape[-1])
            for start in range(0, x_2d.shape[0], chunk_rows):
                rows = x_2d[start:start + chunk_rows]
                rows.copy_(quant_mx_rceil(rows, bits=8, group_size=32, data_type="mx_fp_rceil")[0])
            # reshape() may have copied a non-contiguous x; rebuild the result
            # from x_2d so the quantized values are what gets forwarded.
            qdq_x = x_2d.reshape(x.shape)
        else:
            qdq_x = quant_mx_rceil(x, bits=8, group_size=32, data_type="mx_fp_rceil")[0]

        return module.orig_forward(qdq_x, *fwd_args, **fwd_kwargs)

    for block_names in get_block_names(model):
        for block_name in block_names:
            block = get_module(model, block_name)
            for _, module in block.named_modules():
                if module.__class__.__name__ == "Linear":
                    _patch_forward(module, mxfp8_forward)


def quantize_noise_model(model, args):
    """Quantize *model* with AutoRound (``iters=0``) and return the result.

    ``FP8`` passes a per-layer config for every module whose class is named
    ``Linear``; ``MXFP8`` passes a model-wide ``scheme`` instead.

    Returns:
        The model returned by ``convert``.

    Raises:
        ValueError: if ``args.scheme`` is ``BF16``.
    """
    if args.scheme == "BF16":
        raise ValueError("BF16 does not need quantization. Use --scheme FP8 or --scheme MXFP8.")

    layer_config = {}
    extra_config = {}
    if args.scheme == "FP8":
        layer_config = {
            name: {"bits": 8, "data_type": "fp", "group_size": 0, "sym": True}
            for name, module in model.named_modules()
            if module.__class__.__name__ == "Linear"
        }
    else:
        extra_config["scheme"] = {
            "bits": 8,
            "group_size": 32,
            "data_type": "mx_fp",
        }

    os.makedirs(args.output_dir, exist_ok=True)
    # extra_config must reach the config object, otherwise the MXFP8 scheme
    # built above is silently dropped.
    qconfig = AutoRoundConfig(
        iters=0,
        disable_opt_rtn=True,
        layer_config=layer_config,
        export_format="fake",
        output_dir=args.output_dir,
        **extra_config,
    )

    logging.info("Prepare + convert S2V noise model (%s)", args.scheme)
    model = prepare(model, qconfig)
    model = convert(model)
    logging.info("S2V quantization done. Output saved to %s", args.output_dir)
    return model


def load_quantized_noise_model(wan_s2v, output_dir):
    """Replace ``wan_s2v.noise_model`` with the checkpoint stored in *output_dir*."""
    from wan.modules.s2v.model_s2v import WanModel_S2V

    logging.info("Loading quantized noise_model from %s", output_dir)
    noise_model = WanModel_S2V.from_pretrained(
        output_dir,
        torch_dtype=torch.bfloat16,
    )
    noise_model.eval()
    wan_s2v.noise_model = noise_model


def run_inference(wan_s2v, args, cfg):
    """Generate one video per manifest sample into ``args.output_video_path``.

    Output files are named ``<id>_<sanitized prompt>.mp4``. A sample is
    skipped when its video already exists, when its image is missing, or
    when its audio is missing and TTS is disabled.
    """
    os.makedirs(args.output_video_path, exist_ok=True)

    samples = build_samples(args)
    logging.info("Start S2V generation, total samples: %s", len(samples))
    max_area = MAX_AREA_CONFIGS[args.size]

    for sample in samples:
        prompt = sample["prompt"]
        image_path = sample["image"]
        audio_path = sample["audio"]
        base = f"{sample['id']}_{sanitize_filename(prompt)}.mp4"
        save_file_abs = os.path.abspath(os.path.join(args.output_video_path, base))

        if os.path.exists(save_file_abs):
            logging.info("Skip %s: video already exists: %s", sample["id"], save_file_abs)
            continue

        if not os.path.exists(image_path):
            logging.warning("Skip %s: image not found: %s", sample["id"], image_path)
            continue
        if not os.path.exists(audio_path) and not args.enable_tts:
            logging.warning("Skip %s: audio not found: %s", sample["id"], audio_path)
            continue

        video = wan_s2v.generate(
            input_prompt=prompt,
            ref_image_path=image_path,
            audio_path=audio_path,
            enable_tts=args.enable_tts,
            tts_prompt_audio=args.tts_prompt_audio,
            tts_prompt_text=args.tts_prompt_text,
            tts_text=args.tts_text,
            num_repeat=args.num_clip,
            pose_video=args.pose_video,
            max_area=max_area,
            infer_frames=args.infer_frames,
            shift=args.sample_shift,
            sample_solver=args.sample_solver,
            sampling_steps=args.sample_steps,
            guide_scale=args.sample_guide_scale,
            seed=args.seed,
            offload_model=args.offload_model,
            init_first_frame=args.start_from_ref,
        )

        save_video(
            tensor=video[None],
            save_file=save_file_abs,
            fps=cfg.sample_fps,
            nrow=1,
            normalize=True,
            value_range=(-1, 1),
        )

        # NOTE(review): with TTS the audio is read from "tts.wav" relative to
        # the working directory; presumably generate() writes it there. Confirm.
        merged_audio = "tts.wav" if args.enable_tts else audio_path
        merge_video_audio(video_path=save_file_abs, audio_path=merged_audio)

        logging.info("Saved: %s", save_file_abs)

    if torch.cuda.is_available():
        torch.cuda.synchronize()


def main():
    """Entry point: optionally quantize the noise model, then optionally run inference."""
    args = parse_args()
    setup_logging()
    if args.task not in WAN_CONFIGS:
        raise ValueError(f"Unsupported task: {args.task}")
    if args.size not in MAX_AREA_CONFIGS:
        raise ValueError(f"Unsupported --size {args.size}; valid keys: {list(MAX_AREA_CONFIGS.keys())}")
    # Fail before the pipeline is built rather than after.
    if args.quantize and args.scheme == "BF16":
        raise ValueError("BF16 does not need quantization. Use --scheme FP8 or --scheme MXFP8.")
    if not args.quantize and not args.inference:
        logging.warning("Nothing to do: pass --quantize and/or --inference.")
        return

    cfg = WAN_CONFIGS[args.task]
    if args.sample_steps is None:
        args.sample_steps = cfg.sample_steps
    if args.sample_shift is None:
        args.sample_shift = cfg.sample_shift
    if args.sample_guide_scale is None:
        args.sample_guide_scale = cfg.sample_guide_scale

    logging.info("Create WanS2V pipeline from %s", args.model)
    wan_s2v = wan.WanS2V(
        config=cfg,
        checkpoint_dir=args.model,
        device_id=0,
        rank=0,
        t5_fsdp=False,
        dit_fsdp=False,
        use_sp=False,
        t5_cpu=False,
        convert_model_dtype=True,
    )

    if args.quantize:
        quantize_noise_model(wan_s2v.noise_model, args)

    if args.inference:
        if args.scheme in ("FP8", "MXFP8"):
            load_quantized_noise_model(wan_s2v, args.output_dir)
            apply_activation_qdq(wan_s2v.noise_model, args)

        run_inference(wan_s2v, args, cfg)


if __name__ == "__main__":
    main()