From e9b3dc8d36c3602097ecfffb0fabd81b9d11924b Mon Sep 17 00:00:00 2001
From: changwangss <chang1.wang@intel.com>
Date: Tue, 26 May 2026 15:01:06 +0000
Subject: [PATCH 01/14] add wan example i2v/t2v task

Signed-off-by: changwangss <chang1.wang@intel.com>
---
 .../diffusion_model/diffusers/wan/README.md   | 126 +++++++++
 .../diffusion_model/diffusers/wan/main.py     | 253 ++++++++++++++++++
 .../diffusers/wan/run_benchmark.sh            | 168 ++++++++++++
 .../diffusers/wan/run_quant.sh                |  54 ++++
 4 files changed, 601 insertions(+)
 create mode 100644 examples/pytorch/diffusion_model/diffusers/wan/README.md
 create mode 100755 examples/pytorch/diffusion_model/diffusers/wan/main.py
 create mode 100755 examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
 create mode 100755 examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh

diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md
new file mode 100644
index 00000000000..150920cac3c
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md
@@ -0,0 +1,126 @@
+# Step-by-Step
+
+This example provides a single entry point (`main.py`) for quantizing and evaluating Wan models, with both text-to-video (t2v) and image-to-video (i2v) support.
+
+# Prerequisite
+
+## 1. Environment
+
+```shell
+pip install -r requirements.txt
+# Use latest dev branch if needed before release
+# INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@main
+# pip install git+https://github.com/intel/auto-round.git@main
+pip install neural-compressor-pt==3.7
+pip install auto-round
+```
+
+## 2. Prepare Model
+
+Use a local Wan diffusers model path, for example:
+
+- Wan2.2-T2V-A14B-Diffusers
+- Wan2.2-I2V-A14B-Diffusers
+
+Download example (from Hugging Face):
+
+```bash
+pip install -U "huggingface_hub[cli]"
+
+# t2v model
+huggingface-cli download Wan-AI/Wan2.2-T2V-A14B-Diffusers \
+  --local-dir /path/to/Wan2.2-T2V-A14B-Diffusers
+
+# i2v model
+huggingface-cli download Wan-AI/Wan2.2-I2V-A14B-Diffusers \
+  --local-dir /path/to/Wan2.2-I2V-A14B-Diffusers
+```
+
+## 3. Prepare Dataset
+
+Clone VBench and prepare the required data:
+
+```bash
+git clone https://github.com/Vchitect/VBench.git
+```
+
+- t2v: pass txt with --prompt_file
+- i2v: pass image folder with --image_folder and corresponding --info_json
+
+# Run
+
+## Quantization
+
+```bash
+# topology supports wan_mxfp8 or wan_fp8
+bash run_quant.sh \
+  --topology=wan_mxfp8 \
+  --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \
+  --task=t2v \
+  --output_model=wan_mxfp8_model
+```
+
+## Inference + Evaluation
+
+### t2v BF16
+
+```bash
+bash run_benchmark.sh \
+  --topology=wan_bf16 \
+  --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \
+  --task=t2v \
+  --prompt_file=/path/to/VBench/prompts/prompts_per_dimension/subject_consistency.txt \
+  --output_video_path=wan_bf16_video \
+  --accuracy
+```
+
+### t2v FP8 / MXFP8
+
+```bash
+# topology supports wan_mxfp8 or wan_fp8
+bash run_benchmark.sh \
+  --topology=wan_mxfp8 \
+  --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \
+  --quantized_model=wan_mxfp8_model \
+  --task=t2v \
+  --prompt_file=/path/to/VBench/prompts/prompts_per_dimension/subject_consistency.txt \
+  --output_video_path=wan_mxfp8_video \
+  --accuracy
+```
+
+### i2v BF16
+
+For i2v BF16, provide --image_folder and --info_json explicitly:
+
+```bash
+bash run_benchmark.sh \
+  --topology=wan_bf16 \
+  --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \
+  --task=i2v \
+  --image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9 \
+  --info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \
+  --output_video_path=wan_i2v_bf16_video \
+  --accuracy
+```
+
+### i2v FP8 / MXFP8
+
+```bash
+# topology supports wan_mxfp8 or wan_fp8
+bash run_benchmark.sh \
+  --topology=wan_mxfp8 \
+  --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \
+  --quantized_model=wan_mxfp8_model \
+  --task=i2v \
+  --image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9 \
+  --info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \
+  --output_video_path=wan_i2v_mxfp8_video \
+  --accuracy
+```
+
+# Notes
+
+- Quantized weights are saved under:
+  - <output_model>/transformer
+  - <output_model>/transformer_2
+
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/main.py b/examples/pytorch/diffusion_model/diffusers/wan/main.py
new file mode 100755
index 00000000000..256aa05dd5c
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/wan/main.py
@@ -0,0 +1,253 @@
+import argparse
+import json
+import os
+import random
+
+import numpy as np
+import torch
+from diffusers import AutoencoderKLWan, WanImageToVideoPipeline, WanPipeline, WanTransformer3DModel
+from diffusers.utils import export_to_video, load_image
+from functools import partial
+from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare
+
+from auto_round.data_type.fp8 import quant_fp8_sym
+from auto_round.data_type.mxfp import quant_mx_rceil
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Wan quantization and evaluation example.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument("--model", "--model_name", "--model_name_or_path", required=True, type=str, help="Wan model path")
+    parser.add_argument("--task", default="t2v", choices=["t2v", "i2v"], help="Wan task type")
+    parser.add_argument("--scheme", default="BF16", choices=["BF16", "FP8", "MXFP8"], type=str, help="Quantization scheme")
+    parser.add_argument("--quantize", action="store_true")
+    parser.add_argument("--inference", action="store_true")
+    parser.add_argument("--output_dir", "--quantized_model_path", default="./tmp_autoround", type=str, help="Directory to save quantized transformer weights")
+    parser.add_argument("--prompt_file", type=str, default=None, help="T2V prompt txt file path")
+    parser.add_argument("--image_folder", type=str, default=None, help="I2V image folder path")
+    parser.add_argument("--info_json", type=str, default=None, help="I2V info json file path")
+    parser.add_argument("--output_video_path", default="./tmp_video", type=str, help="Directory to save generated videos")
+    parser.add_argument("--limit", default=-1, type=int, help="Limit the number of prompts for evaluation")
+    parser.add_argument("--seed", default=42, type=int, help="Random seed")
+    parser.add_argument("--height", default=720, type=int)
+    parser.add_argument("--width", default=1280, type=int)
+    parser.add_argument("--num_frames", default=81, type=int)
+    parser.add_argument("--num_inference_steps", default=40, type=int)
+    parser.add_argument("--guidance_scale", default=4.0, type=float, help="Guidance scale for t2v/i2v")
+    parser.add_argument("--guidance_scale_2", default=3.0, type=float, help="Second guidance scale for t2v only")
+    parser.add_argument("--fps", default=16, type=int)
+    parser.add_argument("--ratio", default="16-9", type=str, help="Aspect ratio used by i2v VBench dataset")
+    parser.add_argument("--image_max_area", default=480 * 832, type=int, help="Maximum i2v image area")
+    return parser.parse_args()
+
+
+def setup_seed(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_scheme_config(scheme):
+    if scheme == "FP8":
+        return {"bits": 8, "data_type": "fp", "group_size": 0, "sym": True}
+    if scheme == "MXFP8":
+        return {"bits": 8, "data_type": "mx_fp", "group_size": 32}
+    return None
+
+
+def build_pipeline(args):
+    if args.task == "t2v":
+        vae = AutoencoderKLWan.from_pretrained(args.model, subfolder="vae", torch_dtype=torch.float32)
+        pipe = WanPipeline.from_pretrained(args.model, vae=vae, torch_dtype=torch.bfloat16)
+        pipe.enable_model_cpu_offload()
+        return pipe
+
+    if args.task == "i2v":
+        pipe = WanImageToVideoPipeline.from_pretrained(args.model, torch_dtype=torch.bfloat16)
+        pipe.enable_model_cpu_offload()
+        return pipe
+
+    raise ValueError(f"Unsupported task: {args.task}. Supported tasks are: i2v, t2v")
+
+
+def quantize_pipleine(pipe, args):
+    scheme_cfg = get_scheme_config(args.scheme)
+    if scheme_cfg is None:
+        raise ValueError("BF16 does not need quantization. Use --scheme FP8 or --scheme MXFP8 with --quantize.")
+
+
+    qconfig = AutoRoundConfig(
+        iters=0,
+        export_format="fake",
+        output_dir=args.output_dir,
+        disable_opt_rtn=True,
+        scheme=scheme_cfg,
+    )
+    pipe = prepare(pipe, qconfig)
+    convert(pipe, qconfig)
+
+
+def apply_activation_qdq(pipe, scheme):
+    if scheme == "BF16":
+        return
+
+    if scheme == "FP8":
+        def act_qdq_forward(module, x, *f_args, **f_kwargs):
+            qdq_x, _, _ = quant_fp8_sym(x, group_size=0)
+            return module.orig_forward(qdq_x, *f_args, **f_kwargs)
+    else:
+        def act_qdq_forward(module, x, *f_args, **f_kwargs):
+            qdq_x, _, _ = quant_mx_rceil(x, bits=8, group_size=32, data_type="mx_fp_rceil")
+            return module.orig_forward(qdq_x, *f_args, **f_kwargs)
+
+    for module_name in ["transformer", "transformer_2"]:
+        module = getattr(pipe, module_name)
+        for n, m in module.named_modules():
+            if m.__class__.__name__ == "Linear" and "blocks" in n:
+                m.orig_forward = m.forward
+                m.forward = partial(act_qdq_forward, m)
+
+
+def load_quantized_transformers(pipe, output_dir):
+    for module_name in ["transformer", "transformer_2"]:
+        q_path = os.path.join(output_dir, module_name)
+        if not os.path.isdir(q_path):
+            raise ValueError(f"Quantized path does not exist: {q_path}")
+        print(f"Loading quantized {module_name} from {q_path}")
+        setattr(pipe, module_name, WanTransformer3DModel.from_pretrained(q_path, torch_dtype=torch.bfloat16))
+
+
+def build_t2v_inputs(args):
+    prompt_file = args.prompt_file
+
+    if not prompt_file:
+        raise ValueError("--prompt_file is required for t2v inference/eval")
+    if not os.path.exists(prompt_file):
+        raise FileNotFoundError(f"Prompt file not found: {prompt_file}")
+
+    with open(prompt_file, "r", encoding="utf-8") as f:
+        prompt_list = [line.strip() for line in f if line.strip()]
+
+    if args.limit >= 0:
+        prompt_list = prompt_list[: args.limit]
+
+    return [{"prompt": prompt} for prompt in prompt_list]
+
+
+def build_i2v_inputs(args):
+    image_folder = args.image_folder
+    info_json = args.info_json
+
+    if not image_folder:
+        raise ValueError("--image_folder is required for i2v inference/eval")
+    if not info_json:
+        raise ValueError("--info_json is required for i2v inference/eval")
+    if not os.path.isdir(image_folder):
+        raise FileNotFoundError(f"Image folder not found: {image_folder}")
+    if not os.path.exists(info_json):
+        raise FileNotFoundError(f"Info json not found: {info_json}")
+
+    with open(info_json, "r", encoding="utf-8") as f:
+        info_list = json.load(f)
+
+    results = []
+    for info in info_list:
+        image_path = os.path.join(image_folder, info["image_name"])
+        if not os.path.exists(image_path):
+            continue
+        results.append(
+            {
+                "prompt": info["prompt_en"],
+                "image_path": image_path,
+            }
+        )
+
+    if args.limit >= 0:
+        results = results[: args.limit]
+    return results
+
+
+def safe_output_path(base_dir, prompt):
+    return os.path.join(base_dir, f"{prompt}-0.mp4")
+
+
+@torch.no_grad()
+def run_inference(args, pipe):
+    setup_seed(args.seed)
+    os.makedirs(args.output_video_path, exist_ok=True)
+    gen = torch.Generator(device="cuda" if torch.cuda.is_available() else "cpu").manual_seed(args.seed)
+
+    if args.task == "t2v":
+        inputs = build_t2v_inputs(args)
+    else:
+        inputs = build_i2v_inputs(args)
+
+    for item in inputs:
+        prompt = item["prompt"]
+        save_path = safe_output_path(args.output_video_path, prompt)
+        if os.path.exists(save_path):
+            continue
+
+        if args.task == "t2v":
+            frames = pipe(
+                prompt=prompt,
+                height=args.height,
+                width=args.width,
+                num_frames=args.num_frames,
+                guidance_scale=args.guidance_scale,
+                guidance_scale_2=args.guidance_scale_2,
+                num_inference_steps=args.num_inference_steps,
+                generator=gen,
+            ).frames[0]
+        else:
+            image = load_image(item["image_path"])
+            aspect_ratio = image.height / image.width
+            mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
+            height = round(np.sqrt(args.image_max_area * aspect_ratio)) // mod_value * mod_value
+            width = round(np.sqrt(args.image_max_area / aspect_ratio)) // mod_value * mod_value
+            image = image.resize((width, height))
+
+            frames = pipe(
+                image=image,
+                prompt=prompt,
+                height=height,
+                width=width,
+                num_frames=args.num_frames,
+                guidance_scale=args.guidance_scale,
+                num_inference_steps=args.num_inference_steps,
+                generator=gen,
+            ).frames[0]
+
+        export_to_video(frames, save_path, fps=args.fps)
+        print(f"Saved: {save_path}")
+
+
+def main():
+    """Run the requested stages: optional quantization, then optional inference."""
+    args = parse_args()
+
+    if not (args.quantize or args.inference):
+        raise ValueError("Please enable at least one stage: --quantize or --inference")
+    # Reject BF16 quantization before the pipeline is loaded instead of after.
+    if args.quantize and get_scheme_config(args.scheme) is None:
+        raise ValueError("BF16 does not need quantization. Use --scheme FP8 or --scheme MXFP8 with --quantize.")
+
+    pipe = build_pipeline(args)
+    if args.quantize:
+        quantize_pipleine(pipe, args)
+
+    if args.inference:
+        if args.scheme in ["FP8", "MXFP8"]:
+            load_quantized_transformers(pipe, args.output_dir)
+            apply_activation_qdq(pipe, args.scheme)
+        run_inference(args, pipe)
+
+
+if __name__ == "__main__":
+    main()
+
+
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
new file mode 100755
index 00000000000..4532f902af5
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+set -x
+
+function main {
+  init_params "$@"
+  run_benchmark
+}
+
+function ensure_vbench_repo {
+  if [ ! -d "VBench" ]; then
+    echo "VBench directory not found. Start cloning https://github.com/Vchitect/VBench.git ..."
+    git clone https://github.com/Vchitect/VBench.git
+    if [ $? -ne 0 ]; then
+      echo "Error: failed to clone VBench."
+      exit 1
+    fi
+  fi
+}
+
+function prepare_vbench_inputs {
+  if [ "${task}" = "t2v" ] && [ -z "${prompt_file}" ]; then
+    echo "Error: --prompt_file is required for task=t2v"
+    exit 1
+  fi
+
+  if [ "${task}" = "i2v" ]; then
+    if [ -z "${image_folder}" ]; then
+      echo "Error: --image_folder is required for task=i2v"
+      exit 1
+    fi
+    if [ -z "${info_json}" ]; then
+      echo "Error: --info_json is required for task=i2v"
+      exit 1
+    fi
+  fi
+
+  if [ -n "${prompt_file}" ] && [ ! -f "${prompt_file}" ]; then
+    echo "Error: prompt_file not found: ${prompt_file}"
+    exit 1
+  fi
+  if [ -n "${image_folder}" ] && [ ! -d "${image_folder}" ]; then
+    echo "Error: image_folder not found: ${image_folder}"
+    exit 1
+  fi
+  if [ -n "${info_json}" ] && [ ! -f "${info_json}" ]; then
+    echo "Error: info_json not found: ${info_json}"
+    exit 1
+  fi
+}
+
+function init_params {
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+        topology="${var#*=}"
+      ;;
+      --input_model=*)
+        input_model="${var#*=}"
+      ;;
+      --task=*)
+        task="${var#*=}"
+      ;;
+      --quantized_model=*)
+        tuned_checkpoint="${var#*=}"
+      ;;
+      --output_video_path=*)
+        output_video_path="${var#*=}"
+      ;;
+      --prompt_file=*)
+        prompt_file="${var#*=}"
+      ;;
+      --image_folder=*)
+        image_folder="${var#*=}"
+      ;;
+      --info_json=*)
+        info_json="${var#*=}"
+      ;;
+      --limit=*)
+        limit="${var#*=}"
+      ;;
+      --accuracy)
+        accuracy=true
+      ;;
+      *)
+        echo "Error: No such parameter: ${var}"
+        exit 1
+      ;;
+    esac
+  done
+}
+
+function run_benchmark {
+  task=${task:="t2v"}
+  limit=${limit:=-1}
+  tuned_checkpoint=${tuned_checkpoint:="./tmp_autoround"}
+  output_video_path=${output_video_path:="./tmp_video"}
+  accuracy=${accuracy:=false}
+
+  if [[ ! "${output_video_path}" = /* ]]; then
+    output_video_path=$(realpath -s "$(pwd)/${output_video_path}")
+  fi
+
+  if [ "${topology}" = "wan_bf16" ]; then
+    scheme="BF16"
+  elif [ "${topology}" = "wan_fp8" ]; then
+    scheme="FP8"
+  elif [ "${topology}" = "wan_mxfp8" ]; then
+    scheme="MXFP8"
+  else
+    echo "Error: unsupported topology ${topology}, use wan_bf16/wan_fp8/wan_mxfp8"
+    exit 1
+  fi
+
+  ensure_vbench_repo
+
+  prepare_vbench_inputs
+
+  benchmark_cmd=(
+    python3 main.py
+    --model "${input_model}"
+    --task "${task}"
+    --scheme "${scheme}"
+    --output_dir "${tuned_checkpoint}"
+    --output_video_path "${output_video_path}"
+    --limit "${limit}"
+    --inference
+  )
+
+  if [ -n "${prompt_file}" ]; then
+    benchmark_cmd+=(--prompt_file "${prompt_file}")
+  fi
+  if [ -n "${image_folder}" ]; then
+    benchmark_cmd+=(--image_folder "${image_folder}")
+  fi
+  if [ -n "${info_json}" ]; then
+    benchmark_cmd+=(--info_json "${info_json}")
+  fi
+
+  "${benchmark_cmd[@]}"
+
+  if [ "${accuracy}" = "true" ]; then
+    if [ "${task}" = "t2v" ]; then
+      echo "Start VBench evaluation for t2v..."
+      pushd VBench
+      python evaluate.py \
+        --dimension subject_consistency motion_smoothness aesthetic_quality imaging_quality overall_consistency \
+        --videos_path "${output_video_path}" \
+        --mode=vbench_standard
+      popd
+    elif [ "${task}" = "i2v" ]; then
+      echo "Start VBench evaluation for i2v..."
+      pushd VBench
+      python evaluate_i2v.py \
+        --dimension i2v_background i2v_subject subject_consistency background_consistency motion_smoothness \
+        --videos_path "${output_video_path}" \
+        --mode=vbench_standard
+      popd
+    else
+      echo "--accuracy does not support task=${task}. Supported tasks: t2v, i2v."
+      exit 1
+    fi
+  else
+    echo "Video generation finished. Use --accuracy to run VBench evaluation for t2v/i2v."
+  fi
+}
+
+main "$@"
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh
new file mode 100755
index 00000000000..ae1ff41e1bb
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+set -x
+
+function main {
+  init_params "$@"
+  run_tuning
+}
+
+function init_params {
+  # Parse --key=value options; "${var#*=}" keeps any '=' inside the value.
+  for var in "$@"; do
+    case "$var" in
+      --topology=*)
+        topology="${var#*=}"
+      ;;
+      --input_model=*)
+        input_model="${var#*=}"
+      ;;
+      --task=*)
+        task="${var#*=}"
+      ;;
+      --output_model=*)
+        tuned_checkpoint="${var#*=}"
+      ;;
+      *)
+        echo "Error: No such parameter: ${var}"
+        exit 1
+      ;;
+    esac
+  done
+}
+
+function run_tuning {
+  tuned_checkpoint=${tuned_checkpoint:="./tmp_autoround"}
+  task=${task:="t2v"}
+  # Map the topology to main.py's --scheme; arguments are quoted so paths with spaces survive.
+  if [ "${topology}" = "wan_fp8" ]; then
+    scheme="FP8"
+  elif [ "${topology}" = "wan_mxfp8" ]; then
+    scheme="MXFP8"
+  else
+    echo "Error: unsupported topology ${topology}, use wan_fp8 or wan_mxfp8"
+    exit 1
+  fi
+
+  python3 main.py \
+    --model "${input_model}" \
+    --task "${task}" \
+    --output_dir "${tuned_checkpoint}" \
+    --quantize \
+    --scheme "${scheme}"
+}
+
+main "$@"

From 9c2ea404f999198abf4f8f52f50f68a4b14def0d Mon Sep 17 00:00:00 2001
From: changwangss <chang1.wang@intel.com>
Date: Wed, 27 May 2026 14:19:57 +0000
Subject: [PATCH 02/14] improve benchmark

Signed-off-by: changwangss <chang1.wang@intel.com>
---
 .../diffusion_model/diffusers/wan/README.md   |  49 +++++---
 .../diffusion_model/diffusers/wan/main.py     |  40 +++++-
 .../diffusers/wan/run_benchmark.sh            | 116 ++++++++++++++----
 3 files changed, 164 insertions(+), 41 deletions(-)

diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md
index 150920cac3c..02d6c4e64e9 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/README.md
+++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md
@@ -9,7 +9,7 @@ This example provides a unified Wan entry for quantization and evaluation, with
 ```shell
 pip install -r requirements.txt
 # Use latest dev branch if needed before release
-# INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@main
+# INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master
 # pip install git+https://github.com/intel/auto-round.git@main
 pip install neural-compressor-pt==3.7
 pip install auto-round
@@ -44,8 +44,9 @@ Clone VBench and prepare the required data:
 git clone https://github.com/Vchitect/VBench.git
 ```
 
-- t2v: pass txt with --prompt_file
-- i2v: pass image folder with --image_folder and corresponding --info_json
+- t2v: pass prompt folder with --prompt_folder, and set --dimension to select `${prompt_folder}/${dimension}.txt`
+- t2v: --dimension is required; with --accuracy it is also the VBench dimension that gets evaluated (validated dimensions include `subject_consistency` and `overall_consistency`)
+- i2v: pass --image_folder, --info_json, and --dimension (validated dimensions include `i2v_subject`, `i2v_background`, `subject_consistency`, `background_consistency`, and `motion_smoothness`)
 
 # Run
 
@@ -69,9 +70,9 @@ bash run_benchmark.sh \
   --topology=wan_bf16 \
   --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \
   --task=t2v \
-  --prompt_file=/path/to/VBench/prompts/prompts_per_dimension/subject_consistency.txt \
-  --output_video_path=wan_bf16_video \
-  --accuracy
+  --dimension=subject_consistency \
+  --prompt_folder=/path/to/VBench/prompts/prompts_per_dimension/ \
+  --output_video_path=wan_bf16_video
 ```
 
 ### t2v FP8 / MXFP8
@@ -83,24 +84,24 @@ bash run_benchmark.sh \
   --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \
   --quantized_model=wan_mxfp8_model \
   --task=t2v \
-  --prompt_file=/path/to/VBench/prompts/prompts_per_dimension/subject_consistency.txt \
-  --output_video_path=wan_mxfp8_video \
-  --accuracy
+  --dimension=overall_consistency \
+  --prompt_folder=/path/to/VBench/prompts/prompts_per_dimension/ \
+  --output_video_path=wan_mxfp8_video
 ```
 
 ### i2v BF16
 
-For i2v BF16, provide --image_folder and --info_json explicitly:
+For i2v BF16, provide --image_folder, --info_json, and --dimension explicitly:
 
 ```bash
 bash run_benchmark.sh \
   --topology=wan_bf16 \
   --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \
   --task=i2v \
+  --dimension=i2v_subject \
   --image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9 \
   --info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \
-  --output_video_path=wan_i2v_bf16_video \
-  --accuracy
+  --output_video_path=wan_i2v_bf16_video 
 ```
 
 ### i2v FP8 / MXFP8
@@ -112,10 +113,30 @@ bash run_benchmark.sh \
   --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \
   --quantized_model=wan_mxfp8_model \
   --task=i2v \
+  --dimension=i2v_background \
   --image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9 \
   --info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \
-  --output_video_path=wan_i2v_mxfp8_video \
-  --accuracy
+  --output_video_path=wan_i2v_mxfp8_video
+```
+
+## Accuracy Evaluation
+
+For evaluating existing videos, call VBench directly.
+
+```bash
+# t2v accuracy on generated videos
+cd /path/to/VBench
+python evaluate.py \
+  --dimension subject_consistency motion_smoothness aesthetic_quality overall_consistency imaging_quality \
+  --videos_path /path/to/wan_t2v_bf16_video \
+  --mode vbench_standard
+
+# i2v accuracy on generated videos
+cd /path/to/VBench
+python evaluate_i2v.py \
+  --dimension i2v_background i2v_subject subject_consistency background_consistency motion_smoothness \
+  --videos_path /path/to/wan_i2v_bf16_video \
+  --mode vbench_standard
 ```
 
 # Notes
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/main.py b/examples/pytorch/diffusion_model/diffusers/wan/main.py
index 256aa05dd5c..a3e2d6b249c 100755
--- a/examples/pytorch/diffusion_model/diffusers/wan/main.py
+++ b/examples/pytorch/diffusion_model/diffusers/wan/main.py
@@ -25,9 +25,19 @@ def parse_args():
     parser.add_argument("--quantize", action="store_true")
     parser.add_argument("--inference", action="store_true")
     parser.add_argument("--output_dir", "--quantized_model_path", default="./tmp_autoround", type=str, help="Directory to save quantized transformer weights")
-    parser.add_argument("--prompt_file", type=str, default=None, help="T2V prompt txt file path")
+    parser.add_argument("--prompt_folder", type=str, default=None, help="T2V prompt folder path")
     parser.add_argument("--image_folder", type=str, default=None, help="I2V image folder path")
     parser.add_argument("--info_json", type=str, default=None, help="I2V info json file path")
+    parser.add_argument(
+        "--dimension",
+        type=str,
+        default=None,
+        help=(
+            "VBench dimension used by t2v/i2v evaluation or input filtering "
+            "(validated examples: t2v=subject_consistency,overall_consistency; "
+            "i2v=i2v_subject,i2v_background)"
+        ),
+    )
     parser.add_argument("--output_video_path", default="./tmp_video", type=str, help="Directory to save generated videos")
     parser.add_argument("--limit", default=-1, type=int, help="Limit the number of prompts for evaluation")
     parser.add_argument("--seed", default=42, type=int, help="Random seed")
@@ -122,16 +132,28 @@ def load_quantized_transformers(pipe, output_dir):
 
 
 def build_t2v_inputs(args):
-    prompt_file = args.prompt_file
+    prompt_folder = args.prompt_folder
+
+    if not prompt_folder:
+        raise ValueError("--prompt_folder is required for t2v inference/eval")
+    if not args.dimension:
+        raise ValueError("--dimension is required for t2v inference/eval")
+    if not os.path.isdir(prompt_folder):
+        raise FileNotFoundError(f"Prompt folder not found: {prompt_folder}")
 
-    if not prompt_file:
-        raise ValueError("--prompt_file is required for t2v inference/eval")
+    prompt_file = os.path.join(prompt_folder, f"{args.dimension}.txt")
     if not os.path.exists(prompt_file):
-        raise FileNotFoundError(f"Prompt file not found: {prompt_file}")
+        raise FileNotFoundError(f"Prompt file not found for dimension '{args.dimension}': {prompt_file}")
 
     with open(prompt_file, "r", encoding="utf-8") as f:
         prompt_list = [line.strip() for line in f if line.strip()]
 
+    if args.dimension not in {"subject_consistency", "overall_consistency"}:
+        print(
+            "[WARN] t2v --dimension is not in validated examples "
+            "(subject_consistency, overall_consistency). Continue anyway."
+        )
+
     if args.limit >= 0:
         prompt_list = prompt_list[: args.limit]
 
@@ -146,6 +168,11 @@ def build_i2v_inputs(args):
         raise ValueError("--image_folder is required for i2v inference/eval")
     if not info_json:
         raise ValueError("--info_json is required for i2v inference/eval")
+    if not args.dimension:
+        raise ValueError(
+            "--dimension is required for i2v inference/eval "
+            "(validated examples: i2v_subject, i2v_background)"
+        )
     if not os.path.isdir(image_folder):
         raise FileNotFoundError(f"Image folder not found: {image_folder}")
     if not os.path.exists(info_json):
@@ -156,6 +183,9 @@ def build_i2v_inputs(args):
 
     results = []
     for info in info_list:
+        if args.dimension not in info["dimension"]:
+            continue
+
         image_path = os.path.join(image_folder, info["image_name"])
         if not os.path.exists(image_path):
             continue
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
index 4532f902af5..61955560fd7 100755
--- a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
+++ b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
@@ -18,9 +18,15 @@ function ensure_vbench_repo {
 }
 
 function prepare_vbench_inputs {
-  if [ "${task}" = "t2v" ] && [ -z "${prompt_file}" ]; then
-    echo "Error: --prompt_file is required for task=t2v"
-    exit 1
+  if [ "${task}" = "t2v" ]; then
+    if [ -z "${prompt_folder}" ]; then
+      echo "Error: --prompt_folder is required for task=t2v"
+      exit 1
+    fi
+    if [ -z "${dimension}" ]; then
+      echo "Error: --dimension is required for task=t2v"
+      exit 1
+    fi
   fi
 
   if [ "${task}" = "i2v" ]; then
@@ -32,10 +38,14 @@ function prepare_vbench_inputs {
       echo "Error: --info_json is required for task=i2v"
       exit 1
     fi
+    if [ -z "${dimension}" ]; then
+      echo "Error: --dimension is required for task=i2v"
+      exit 1
+    fi
   fi
 
-  if [ -n "${prompt_file}" ] && [ ! -f "${prompt_file}" ]; then
-    echo "Error: prompt_file not found: ${prompt_file}"
+  if [ -n "${prompt_folder}" ] && [ ! -d "${prompt_folder}" ]; then
+    echo "Error: prompt_folder not found: ${prompt_folder}"
     exit 1
   fi
   if [ -n "${image_folder}" ] && [ ! -d "${image_folder}" ]; then
@@ -49,41 +59,94 @@ function prepare_vbench_inputs {
 }
 
 function init_params {
-  for var in "$@"
-  do
-    case $var in
+  while [[ $# -gt 0 ]]; do  # accepts both --key=value and --key value
+    case "$1" in
       --topology=*)
-        topology="${var#*=}"
+        topology="${1#*=}"
+        shift
+      ;;
+      --topology)
+        topology="$2"
+        shift 2 || { echo "Error: $1 requires a value"; exit 1; }
       ;;
       --input_model=*)
-        input_model="${var#*=}"
+        input_model="${1#*=}"
+        shift
+      ;;
+      --input_model)
+        input_model="$2"
+        shift 2 || { echo "Error: $1 requires a value"; exit 1; }
       ;;
       --task=*)
-        task="${var#*=}"
+        task="${1#*=}"
+        shift
+      ;;
+      --task)
+        task="$2"
+        shift 2 || { echo "Error: $1 requires a value"; exit 1; }
       ;;
       --quantized_model=*)
-        tuned_checkpoint="${var#*=}"
+        tuned_checkpoint="${1#*=}"
+        shift
+      ;;
+      --quantized_model)
+        tuned_checkpoint="$2"
+        shift 2 || { echo "Error: $1 requires a value"; exit 1; }
       ;;
       --output_video_path=*)
-        output_video_path="${var#*=}"
+        output_video_path="${1#*=}"
+        shift
+      ;;
+      --output_video_path)
+        output_video_path="$2"
+        shift 2 || { echo "Error: $1 requires a value"; exit 1; }
+      ;;
+      --prompt_folder=*)
+        prompt_folder="${1#*=}"
+        shift
       ;;
-      --prompt_file=*)
-        prompt_file="${var#*=}"
+      --prompt_folder)
+        prompt_folder="$2"
+        shift 2 || { echo "Error: $1 requires a value"; exit 1; }
       ;;
       --image_folder=*)
-        image_folder="${var#*=}"
+        image_folder="${1#*=}"
+        shift
+      ;;
+      --image_folder)
+        image_folder="$2"
+        shift 2 || { echo "Error: $1 requires a value"; exit 1; }
       ;;
       --info_json=*)
-        info_json="${var#*=}"
+        info_json="${1#*=}"
+        shift
+      ;;
+      --info_json)
+        info_json="$2"
+        shift 2 || { echo "Error: $1 requires a value"; exit 1; }
+      ;;
+      --dimension=*)
+        dimension="${1#*=}"
+        shift
+      ;;
+      --dimension)
+        dimension="$2"
+        shift 2 || { echo "Error: $1 requires a value"; exit 1; }
       ;;
       --limit=*)
-        limit="${var#*=}"
+        limit="${1#*=}"
+        shift
+      ;;
+      --limit)
+        limit="$2"
+        shift 2 || { echo "Error: $1 requires a value"; exit 1; }
       ;;
       --accuracy)
         accuracy=true
+        shift
       ;;
       *)
-        echo "Error: No such parameter: ${var}"
+        echo "Error: No such parameter: $1"
         exit 1
       ;;
     esac
@@ -127,8 +190,8 @@ function run_benchmark {
     --inference
   )
 
-  if [ -n "${prompt_file}" ]; then
-    benchmark_cmd+=(--prompt_file "${prompt_file}")
+  if [ -n "${prompt_folder}" ]; then
+    benchmark_cmd+=(--prompt_folder "${prompt_folder}")
   fi
   if [ -n "${image_folder}" ]; then
     benchmark_cmd+=(--image_folder "${image_folder}")
@@ -136,15 +199,24 @@ function run_benchmark {
   if [ -n "${info_json}" ]; then
     benchmark_cmd+=(--info_json "${info_json}")
   fi
+  if [ -n "${dimension}" ]; then
+    benchmark_cmd+=(--dimension "${dimension}")
+  fi
 
   "${benchmark_cmd[@]}"
 
   if [ "${accuracy}" = "true" ]; then
     if [ "${task}" = "t2v" ]; then
       echo "Start VBench evaluation for t2v..."
+      local t2v_dims
+      if [ -n "${dimension}" ]; then
+        t2v_dims="${dimension}"
+      else
+        t2v_dims="subject_consistency motion_smoothness aesthetic_quality imaging_quality overall_consistency"
+      fi
       pushd VBench
       python evaluate.py \
-        --dimension subject_consistency motion_smoothness aesthetic_quality imaging_quality overall_consistency \
+        --dimension ${t2v_dims} \
         --videos_path "${output_video_path}" \
         --mode=vbench_standard
       popd

From a875b79c33f03b27fe9dcba895260c501f947f13 Mon Sep 17 00:00:00 2001
From: changwangss <chang1.wang@intel.com>
Date: Thu, 28 May 2026 05:43:27 +0000
Subject: [PATCH 03/14] update readme and add multi-gpus support and shard
 dataset

Signed-off-by: changwangss <chang1.wang@intel.com>
---
 .../diffusion_model/diffusers/wan/README.md   |  49 +++---
 .../diffusion_model/diffusers/wan/main.py     |  43 ++++-
 .../diffusers/wan/run_benchmark.sh            | 154 ++++++++++++++----
 .../diffusers/wan/split_i2v_info.py           |  55 +++++++
 .../diffusers/wan/split_t2v_prompts.py        |  47 ++++++
 5 files changed, 295 insertions(+), 53 deletions(-)
 mode change 100644 => 100755 examples/pytorch/diffusion_model/diffusers/wan/README.md
 create mode 100755 examples/pytorch/diffusion_model/diffusers/wan/split_i2v_info.py
 create mode 100755 examples/pytorch/diffusion_model/diffusers/wan/split_t2v_prompts.py

diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md
old mode 100644
new mode 100755
index 02d6c4e64e9..e9c8440b658
--- a/examples/pytorch/diffusion_model/diffusers/wan/README.md
+++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md
@@ -11,7 +11,7 @@ pip install -r requirements.txt
 # Use latest dev branch if needed before release
 # INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master
 # pip install git+https://github.com/intel/auto-round.git@main
-pip install neural-compressor-pt==3.7
+pip install neural-compressor-pt
 pip install auto-round
 ```
 
@@ -37,14 +37,20 @@ huggingface-cli download Wan-AI/Wan2.2-I2V-A14B-Diffusers \
 ```
 
 ## 3. Prepare Dataset
-
-Clone VBench and prepare the required data:
+Clone VBench to prepare the required dataset, then download i2v data:
 
 ```bash
+# recommended: install VBench from pip
+python3 -m pip install VBench
+
+# required for dataset preparation
 git clone https://github.com/Vchitect/VBench.git
+cd VBench
+bash vbench2_beta_i2v/download_data.sh
 ```
 
 - t2v: pass prompt folder with --prompt_folder, and set --dimension to select `${prompt_folder}/${dimension}.txt`
+- t2v/i2v: pass comma-separated values in `--dimension` to run multiple dimensions in one command (e.g., `subject_consistency,overall_consistency`)
 - t2v: can pass --dimension for evaluation filtering (validated dimensions include `subject_consistency` and `overall_consistency`)
 - i2v: pass --image_folder, --info_json, and --dimension (validated dimensions include `i2v_subject`, `i2v_background`, `subject_consistency`, `background_consistency`, and `motion_smoothness`)
 
@@ -63,19 +69,20 @@ bash run_quant.sh \
 
 ## Inference + Evaluation
 
-### t2v BF16
+### t2v bf16
 
 ```bash
 bash run_benchmark.sh \
   --topology=wan_bf16 \
   --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \
   --task=t2v \
-  --dimension=subject_consistency \
+  --dimension=subject_consistency,overall_consistency \
   --prompt_folder=/path/to/VBench/prompts/prompts_per_dimension/ \
-  --output_video_path=wan_bf16_video
+  --output_video_path=wan_t2v_bf16_video \
+  --accuracy
 ```
 
-### t2v FP8 / MXFP8
+### t2v mxfp8/fp8
 
 ```bash
 # topology supports wan_mxfp8 or wan_fp8
@@ -84,27 +91,27 @@ bash run_benchmark.sh \
   --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \
   --quantized_model=wan_mxfp8_model \
   --task=t2v \
-  --dimension=overall_consistency \
+  --dimension=subject_consistency,overall_consistency \
   --prompt_folder=/path/to/VBench/prompts/prompts_per_dimension/ \
-  --output_video_path=wan_mxfp8_video
+  --output_video_path=wan_t2v_mxfp8_video \
+  --accuracy
 ```
 
-### i2v BF16
-
-For i2v BF16, provide --image_folder, --info_json, and --dimension explicitly:
+### i2v bf16
 
 ```bash
 bash run_benchmark.sh \
   --topology=wan_bf16 \
   --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \
   --task=i2v \
-  --dimension=i2v_subject \
+  --dimension=i2v_background,i2v_subject \
   --image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9 \
   --info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \
-  --output_video_path=wan_i2v_bf16_video 
+  --output_video_path=wan_i2v_bf16_video \
+  --accuracy
 ```
 
-### i2v FP8 / MXFP8
+### i2v mxfp8/fp8
 
 ```bash
 # topology supports wan_mxfp8 or wan_fp8
@@ -113,15 +120,19 @@ bash run_benchmark.sh \
   --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \
   --quantized_model=wan_mxfp8_model \
   --task=i2v \
-  --dimension=i2v_background \
+  --dimension=i2v_background,i2v_subject \
   --image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9 \
   --info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \
-  --output_video_path=wan_i2v_mxfp8_video
+  --output_video_path=wan_i2v_mxfp8_video \
+  --accuracy
 ```
 
-## Accuracy Evaluation
+Note: For sharding and multi-GPU execution, set `--gpu_ids` (for example `--gpu_ids=0,1,2,3`) or set `CUDA_VISIBLE_DEVICES` before running `run_benchmark.sh`.
+
+### Standalone Accuracy Evaluation (Optional)
 
-For evaluating existing videos, call VBench directly.
+If you already use `--accuracy` in `run_benchmark.sh`, you can skip this section.
+Use this section when you want to evaluate existing videos without re-running generation.
 
 ```bash
 # t2v accuracy on generated videos
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/main.py b/examples/pytorch/diffusion_model/diffusers/wan/main.py
index a3e2d6b249c..85194cd9efe 100755
--- a/examples/pytorch/diffusion_model/diffusers/wan/main.py
+++ b/examples/pytorch/diffusion_model/diffusers/wan/main.py
@@ -50,6 +50,17 @@ def parse_args():
     parser.add_argument("--fps", default=16, type=int)
     parser.add_argument("--ratio", default="16-9", type=str, help="Aspect ratio used by i2v VBench dataset")
     parser.add_argument("--image_max_area", default=480 * 832, type=int, help="Maximum i2v image area")
+    parser.add_argument(
+        "--mxfp8_chunk_rows",
+        default=2048,
+        type=int,
+        help="Row chunk size for MXFP8 activation QDQ",
+    )
+    parser.add_argument(
+        "--disable_mxfp8_inplace_qdq",
+        action="store_true",
+        help="Disable in-place MXFP8 activation QDQ",
+    )
     return parser.parse_args()
 
 
@@ -101,7 +112,7 @@ def quantize_pipleine(pipe, args):
     convert(pipe, qconfig)
 
 
-def apply_activation_qdq(pipe, scheme):
+def apply_activation_qdq(pipe, scheme, runtime_args):
     if scheme == "BF16":
         return
 
@@ -111,7 +122,32 @@ def act_qdq_forward(module, x, *f_args, **f_kwargs):
             return module.orig_forward(qdq_x, *f_args, **f_kwargs)
     else:
         def act_qdq_forward(module, x, *f_args, **f_kwargs):
-            qdq_x, _, _ = quant_mx_rceil(x, bits=8, group_size=32, data_type="mx_fp_rceil")
+            chunk_rows = max(1, int(getattr(runtime_args, "mxfp8_chunk_rows", 2048)))
+            use_inplace = not getattr(runtime_args, "disable_mxfp8_inplace_qdq", False)
+
+            if use_inplace and x.is_cuda:
+                # Chunked in-place QDQ reduces peak activation memory on large tensors.
+                x_2d = x.reshape(-1, x.shape[-1])
+                total_rows = x_2d.shape[0]
+                for start in range(0, total_rows, chunk_rows):
+                    end = min(start + chunk_rows, total_rows)
+                    qdq_chunk = quant_mx_rceil(
+                        x_2d[start:end],
+                        bits=8,
+                        group_size=32,
+                        data_type="mx_fp_rceil",
+                    )[0]
+                    x_2d[start:end].copy_(qdq_chunk)
+                    del qdq_chunk
+                qdq_x = x_2d.reshape(x.shape)  # x_2d is a copy (not a view) when x is non-contiguous
+            else:
+                qdq_x = quant_mx_rceil(
+                    x,
+                    bits=8,
+                    group_size=32,
+                    data_type="mx_fp_rceil",
+                )[0]
+
             return module.orig_forward(qdq_x, *f_args, **f_kwargs)
 
     for module_name in ["transformer", "transformer_2"]:
@@ -198,6 +234,7 @@ def build_i2v_inputs(args):
 
     if args.limit >= 0:
         results = results[: args.limit]
+
     return results
 
 
@@ -273,7 +310,7 @@ def main():
     if args.inference:
         if args.scheme in ["FP8", "MXFP8"]:
             load_quantized_transformers(pipe, args.output_dir)
-            apply_activation_qdq(pipe, args.scheme)
+            apply_activation_qdq(pipe, args.scheme, args)
         run_inference(args, pipe)
 
 
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
index 61955560fd7..e65b01b1bf2 100755
--- a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
+++ b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
@@ -133,6 +133,14 @@ function init_params {
         dimension="$2"
         shift 2
       ;;
+      --gpu_ids=*)
+        gpu_ids="${1#*=}"
+        shift
+      ;;
+      --gpu_ids)
+        gpu_ids="$2"
+        shift 2
+      ;;
       --limit=*)
         limit="${1#*=}"
         shift
@@ -179,52 +187,136 @@ function run_benchmark {
 
   prepare_vbench_inputs
 
-  benchmark_cmd=(
-    python3 main.py
-    --model "${input_model}"
-    --task "${task}"
-    --scheme "${scheme}"
-    --output_dir "${tuned_checkpoint}"
-    --output_video_path "${output_video_path}"
-    --limit "${limit}"
-    --inference
-  )
-
-  if [ -n "${prompt_folder}" ]; then
-    benchmark_cmd+=(--prompt_folder "${prompt_folder}")
-  fi
-  if [ -n "${image_folder}" ]; then
-    benchmark_cmd+=(--image_folder "${image_folder}")
-  fi
-  if [ -n "${info_json}" ]; then
-    benchmark_cmd+=(--info_json "${info_json}")
+  normalized_dimensions="${dimension//,/ }"
+  read -r -a dimension_list <<< "${normalized_dimensions}"
+
+  if [ -n "${gpu_ids}" ]; then
+    gpu_list="${gpu_ids}"
+  else
+    gpu_list="${CUDA_VISIBLE_DEVICES:-}"
   fi
-  if [ -n "${dimension}" ]; then
-    benchmark_cmd+=(--dimension "${dimension}")
+
+  if [ -n "${gpu_list}" ]; then
+    normalized_gpu_ids="${gpu_list//,/ }"
+    read -r -a gpu_array <<< "${normalized_gpu_ids}"
+    visible_gpus=${#gpu_array[@]}
+    echo "visible_gpus: ${visible_gpus}"
+  else
+    gpu_array=()
   fi
 
-  "${benchmark_cmd[@]}"
+  mkdir -p "${output_video_path}"
+  shard_tmp_root="${output_video_path}/.prompt_shards"
+
+  function build_benchmark_cmd {
+    local cur_prompt_folder="$2"
+    local cur_info_json="$3"
+    local cmd=(
+      python3 main.py
+      --model "${input_model}"
+      --task "${task}"
+      --scheme "${scheme}"
+      --output_dir "${tuned_checkpoint}"
+      --output_video_path "${output_video_path}"
+      --limit "${limit}"
+      --inference
+    )
+
+    if [ -n "${cur_prompt_folder}" ]; then
+      cmd+=(--prompt_folder "${cur_prompt_folder}")
+    elif [ -n "${prompt_folder}" ]; then
+      cmd+=(--prompt_folder "${prompt_folder}")
+    fi
+    if [ -n "${image_folder}" ]; then
+      cmd+=(--image_folder "${image_folder}")
+    fi
+    if [ -n "${cur_info_json}" ]; then
+      cmd+=(--info_json "${cur_info_json}")
+    elif [ -n "${info_json}" ]; then
+      cmd+=(--info_json "${info_json}")
+    fi
+    if [ -n "$1" ]; then
+      cmd+=(--dimension "$1")
+    fi
+
+    printf '%q ' "${cmd[@]}"
+  }
+
+  if [ ${#gpu_array[@]} -eq 0 ]; then
+    if [ ${#dimension_list[@]} -eq 0 ]; then
+      eval "$(build_benchmark_cmd "" "" "")"
+    else
+      for cur_dimension in "${dimension_list[@]}"; do
+        eval "$(build_benchmark_cmd "${cur_dimension}" "" "")"
+      done
+    fi
+  else
+    if [ ${#dimension_list[@]} -eq 0 ]; then
+      echo "Error: multi-GPU sharding requires --dimension"
+      exit 1
+    fi
+
+    num_shards=${#gpu_array[@]}
+    for cur_dimension in "${dimension_list[@]}"; do
+      dim_shard_root="${shard_tmp_root}/${cur_dimension}"
+      rm -rf "${dim_shard_root}"
+      if [ "${task}" = "t2v" ]; then
+        prompt_file="${prompt_folder}/${cur_dimension}.txt"
+        python3 split_t2v_prompts.py \
+          --prompt_file "${prompt_file}" \
+          --num_shards "${num_shards}" \
+          --output_root "${dim_shard_root}"
+      else
+        python3 split_i2v_info.py \
+          --info_json "${info_json}" \
+          --dimension "${cur_dimension}" \
+          --num_shards "${num_shards}" \
+          --output_root "${dim_shard_root}"
+      fi
+
+      program_pid=()
+      for shard_id in "${!gpu_array[@]}"; do
+        gpu_id="${gpu_array[$shard_id]}"
+        log_suffix="${cur_dimension}"
+        if [ -z "${log_suffix}" ]; then
+          log_suffix="all"
+        fi
+        log_file="${output_video_path}/${log_suffix}.gpu${gpu_id}.log"
+        shard_prompt_folder=""
+        shard_info_json=""
+
+        if [ "${task}" = "t2v" ]; then
+          shard_prompt_folder="${dim_shard_root}/shard_${shard_id}"
+        else
+          shard_info_json="${dim_shard_root}/shard_${shard_id}/info.json"
+        fi
+
+        cmd="$(build_benchmark_cmd "${cur_dimension}" "${shard_prompt_folder}" "${shard_info_json}")"
+        CUDA_VISIBLE_DEVICES="${gpu_id}" bash -lc "${cmd}" > "${log_file}" 2>&1 &
+        program_pid+=("$!")
+        echo "Start (PID: ${program_pid[-1]}, GPU: ${gpu_id}, dimension: ${cur_dimension})"
+      done
+
+      for pid in "${program_pid[@]}"; do
+        wait "${pid}" || exit 1
+      done
+    done
+  fi
 
   if [ "${accuracy}" = "true" ]; then
     if [ "${task}" = "t2v" ]; then
       echo "Start VBench evaluation for t2v..."
-      local t2v_dims
-      if [ -n "${dimension}" ]; then
-        t2v_dims="${dimension}"
-      else
-        t2v_dims="subject_consistency motion_smoothness aesthetic_quality imaging_quality overall_consistency"
-      fi
       pushd VBench
       python evaluate.py \
-        --dimension ${t2v_dims} \
+        --dimension "subject_consistency motion_smoothness aesthetic_quality imaging_quality overall_consistency" \
         --videos_path "${output_video_path}" \
-        --mode=vbench_standard
+        --mode=vbench_standard 
       popd
     elif [ "${task}" = "i2v" ]; then
       echo "Start VBench evaluation for i2v..."
       pushd VBench
       python evaluate_i2v.py \
-        --dimension i2v_background i2v_subject subject_consistency background_consistency motion_smoothness \
+        --dimension "i2v_background i2v_subject subject_consistency background_consistency motion_smoothness" \
         --videos_path "${output_video_path}" \
         --mode=vbench_standard
       popd
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/split_i2v_info.py b/examples/pytorch/diffusion_model/diffusers/wan/split_i2v_info.py
new file mode 100755
index 00000000000..326666467ea
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/wan/split_i2v_info.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import os
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Split i2v info_json into per-shard files for one dimension.")
+    parser.add_argument("--info_json", required=True, type=str, help="Path to full i2v info json")
+    parser.add_argument("--dimension", required=True, type=str, help="Target dimension")
+    parser.add_argument("--num_shards", required=True, type=int, help="Total shard count")
+    parser.add_argument("--output_root", required=True, type=str, help="Root dir to write shard json files")
+    return parser.parse_args()
+
+
+def has_dimension(info, target_dimension):
+    dims = info.get("dimension", [])
+    if isinstance(dims, str):
+        dims = [dims]
+    return target_dimension in dims
+
+
+def main():
+    args = parse_args()
+
+    if args.num_shards < 1:
+        raise ValueError("--num_shards must be >= 1")
+    if not os.path.isfile(args.info_json):
+        raise FileNotFoundError(f"Info json not found: {args.info_json}")
+
+    with open(args.info_json, "r", encoding="utf-8") as f:
+        info_list = json.load(f)
+
+    filtered = [item for item in info_list if has_dimension(item, args.dimension)]
+
+    shard_buckets = [[] for _ in range(args.num_shards)]
+    for idx, item in enumerate(filtered):
+        shard_buckets[idx % args.num_shards].append(item)
+
+    os.makedirs(args.output_root, exist_ok=True)
+    for shard_id, shard_items in enumerate(shard_buckets):
+        shard_dir = os.path.join(args.output_root, f"shard_{shard_id}")
+        os.makedirs(shard_dir, exist_ok=True)
+        shard_info_json = os.path.join(shard_dir, "info.json")
+        with open(shard_info_json, "w", encoding="utf-8") as f:
+            json.dump(shard_items, f, ensure_ascii=False, indent=2)
+
+    print(
+        f"Split {len(filtered)} i2v entries for dimension '{args.dimension}' "
+        f"into {args.num_shards} shards under {args.output_root}"
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/split_t2v_prompts.py b/examples/pytorch/diffusion_model/diffusers/wan/split_t2v_prompts.py
new file mode 100755
index 00000000000..dffe19eb9e0
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/wan/split_t2v_prompts.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+import argparse
+import os
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Split t2v prompt file into per-shard prompt folders.")
+    parser.add_argument("--prompt_file", required=True, type=str, help="Path to <dimension>.txt")
+    parser.add_argument("--num_shards", required=True, type=int, help="Total shard count")
+    parser.add_argument("--output_root", required=True, type=str, help="Root directory to write shard folders")
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    if args.num_shards < 1:
+        raise ValueError("--num_shards must be >= 1")
+    if not os.path.isfile(args.prompt_file):
+        raise FileNotFoundError(f"Prompt file not found: {args.prompt_file}")
+
+    dimension = os.path.splitext(os.path.basename(args.prompt_file))[0]
+
+    with open(args.prompt_file, "r", encoding="utf-8") as f:
+        prompts = [line.strip() for line in f if line.strip()]
+
+    os.makedirs(args.output_root, exist_ok=True)
+
+    shard_buckets = [[] for _ in range(args.num_shards)]
+    for idx, prompt in enumerate(prompts):
+        shard_buckets[idx % args.num_shards].append(prompt)
+
+    for shard_id, shard_prompts in enumerate(shard_buckets):
+        shard_dir = os.path.join(args.output_root, f"shard_{shard_id}")
+        os.makedirs(shard_dir, exist_ok=True)
+        shard_prompt_file = os.path.join(shard_dir, f"{dimension}.txt")
+        with open(shard_prompt_file, "w", encoding="utf-8") as f:
+            for prompt in shard_prompts:
+                f.write(prompt + "\n")
+
+    print(
+        f"Split {len(prompts)} prompts from {args.prompt_file} into {args.num_shards} shards under {args.output_root}"
+    )
+
+
+if __name__ == "__main__":
+    main()

From 3503937a9c85a915ceb2329acc496718f7e3cee7 Mon Sep 17 00:00:00 2001
From: changwangss <chang1.wang@intel.com>
Date: Thu, 28 May 2026 08:22:44 +0000
Subject: [PATCH 04/14] add ratio for evaluate_i2v.py

Signed-off-by: changwangss <chang1.wang@intel.com>
---
 examples/pytorch/diffusion_model/diffusers/wan/README.md        | 1 +
 examples/pytorch/diffusion_model/diffusers/wan/main.py          | 0
 examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh | 1 +
 examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh     | 0
 4 files changed, 2 insertions(+)
 mode change 100755 => 100644 examples/pytorch/diffusion_model/diffusers/wan/README.md
 mode change 100755 => 100644 examples/pytorch/diffusion_model/diffusers/wan/main.py
 mode change 100755 => 100644 examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
 mode change 100755 => 100644 examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh

diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md
old mode 100755
new mode 100644
index e9c8440b658..33cf1f63ce5
--- a/examples/pytorch/diffusion_model/diffusers/wan/README.md
+++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md
@@ -147,6 +147,7 @@ cd /path/to/VBench
 python evaluate_i2v.py \
   --dimension i2v_background i2v_subject subject_consistency background_consistency motion_smoothness \
   --videos_path /path/to/wan_i2v_bf16_video \
+  --ratio 16-9 \
   --mode vbench_standard
 ```
 
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/main.py b/examples/pytorch/diffusion_model/diffusers/wan/main.py
old mode 100755
new mode 100644
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
old mode 100755
new mode 100644
index e65b01b1bf2..02ccd86bba5
--- a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
+++ b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
@@ -318,6 +318,7 @@ function run_benchmark {
       python evaluate_i2v.py \
         --dimension "i2v_background i2v_subject subject_consistency background_consistency motion_smoothness" \
         --videos_path "${output_video_path}" \
+        --ratio "16-9" \
         --mode=vbench_standard
       popd
     else
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh
old mode 100755
new mode 100644

From 1469b24e87b9e064997626779bb321deeb823278 Mon Sep 17 00:00:00 2001
From: changwangss <chang1.wang@intel.com>
Date: Thu, 28 May 2026 09:45:33 +0000
Subject: [PATCH 05/14] improve mxfp8 oom by row chunk

Signed-off-by: changwangss <chang1.wang@intel.com>
---
 .../diffusion_model/diffusers/wan/main.py     |  5 +++++
 .../diffusers/wan/run_benchmark.sh            | 19 +++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/examples/pytorch/diffusion_model/diffusers/wan/main.py b/examples/pytorch/diffusion_model/diffusers/wan/main.py
index 85194cd9efe..c4c9aeab9f1 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/main.py
+++ b/examples/pytorch/diffusion_model/diffusers/wan/main.py
@@ -166,6 +166,11 @@ def load_quantized_transformers(pipe, output_dir):
         print(f"Loading quantized {module_name} from {q_path}")
         setattr(pipe, module_name, WanTransformer3DModel.from_pretrained(q_path, torch_dtype=torch.bfloat16))
 
+    # Quantized modules are replaced after pipeline construction; refresh offload hooks
+    # so newly attached modules follow the same device movement policy.
+    if hasattr(pipe, "enable_model_cpu_offload"):
+        pipe.enable_model_cpu_offload()
+
 
 def build_t2v_inputs(args):
     prompt_folder = args.prompt_folder
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
index 02ccd86bba5..420e8ba010f 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
+++ b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
@@ -149,6 +149,18 @@ function init_params {
         limit="$2"
         shift 2
       ;;
+      --mxfp8_chunk_rows=*)
+        mxfp8_chunk_rows="${1#*=}"
+        shift
+      ;;
+      --mxfp8_chunk_rows)
+        mxfp8_chunk_rows="$2"
+        shift 2
+      ;;
+      --disable_mxfp8_inplace_qdq)
+        disable_mxfp8_inplace_qdq=true
+        shift
+      ;;
       --accuracy)
         accuracy=true
         shift
@@ -167,6 +179,7 @@ function run_benchmark {
   tuned_checkpoint=${tuned_checkpoint:="./tmp_autoround"}
   output_video_path=${output_video_path:="./tmp_video"}
   accuracy=${accuracy:=false}
+  disable_mxfp8_inplace_qdq=${disable_mxfp8_inplace_qdq:=false}
 
   if [[ ! "${output_video_path}" = /* ]]; then
     output_video_path=$(realpath -s "$(pwd)/${output_video_path}")
@@ -238,6 +251,12 @@ function run_benchmark {
     if [ -n "$1" ]; then
       cmd+=(--dimension "$1")
     fi
+    if [ -n "${mxfp8_chunk_rows}" ]; then
+      cmd+=(--mxfp8_chunk_rows "${mxfp8_chunk_rows}")
+    fi
+    if [ "${disable_mxfp8_inplace_qdq}" = "true" ]; then
+      cmd+=(--disable_mxfp8_inplace_qdq)
+    fi
 
     printf '%q ' "${cmd[@]}"
   }

From 630863c6b03d0cd59977ec9795519baabbdbfe05 Mon Sep 17 00:00:00 2001
From: changwangss <chang1.wang@intel.com>
Date: Mon, 1 Jun 2026 07:25:24 +0000
Subject: [PATCH 06/14] remove requirement

Signed-off-by: changwangss <chang1.wang@intel.com>
---
 examples/pytorch/diffusion_model/diffusers/wan/README.md | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md
index 33cf1f63ce5..c89b6e86800 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/README.md
+++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md
@@ -7,12 +7,13 @@ This example provides a unified Wan entry for quantization and evaluation, with
 ## 1. Environment
 
 ```shell
-pip install -r requirements.txt
 # Use latest dev branch if needed before release
 # INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master
 # pip install git+https://github.com/intel/auto-round.git@main
 pip install neural-compressor-pt
 pip install auto-round
+# evaluation
+pip install VBench
 ```
 
 ## 2. Prepare Model
@@ -40,9 +41,6 @@ huggingface-cli download Wan-AI/Wan2.2-I2V-A14B-Diffusers \
 Clone VBench to prepare the required dataset, then download i2v data:
 
 ```bash
-# recommended: install VBench from pip
-python3 -m pip install VBench
-
 # required for dataset preparation
 git clone https://github.com/Vchitect/VBench.git
 cd VBench

From d020210b94fb85fd2ae3a2a4ff9f8b8e5e3cfcf3 Mon Sep 17 00:00:00 2001
From: changwangss <chang1.wang@intel.com>
Date: Tue, 2 Jun 2026 07:51:12 +0000
Subject: [PATCH 07/14] add requirements

Signed-off-by: changwangss <chang1.wang@intel.com>
---
 .../diffusion_model/diffusers/wan/README.md   | 12 +++++------
 .../diffusers/wan/requirements.txt            | 21 +++++++++++++++++++
 2 files changed, 27 insertions(+), 6 deletions(-)
 create mode 100644 examples/pytorch/diffusion_model/diffusers/wan/requirements.txt

diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md
index c89b6e86800..426ccb54540 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/README.md
+++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md
@@ -10,10 +10,9 @@ This example provides a unified Wan entry for quantization and evaluation, with
 # Use latest dev branch if needed before release
 # INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master
 # pip install git+https://github.com/intel/auto-round.git@main
-pip install neural-compressor-pt
-pip install auto-round
-# evaluation
-pip install VBench
+
+# install all runtime dependencies (including evaluation package VBench)
+pip install -r requirements.txt
 ```
 
 ## 2. Prepare Model
@@ -26,14 +25,15 @@ Use a local Wan diffusers model path, for example:
 Download example (from Hugging Face):
 
 ```bash
+# optional: update CLI to latest version
 pip install -U "huggingface_hub[cli]"
 
 # t2v model
-huggingface-cli download Wan-AI/Wan2.2-T2V-A14B-Diffusers \
+hf download Wan-AI/Wan2.2-T2V-A14B-Diffusers \
   --local-dir /path/to/Wan2.2-T2V-A14B-Diffusers
 
 # i2v model
-huggingface-cli download Wan-AI/Wan2.2-I2V-A14B-Diffusers \
+hf download Wan-AI/Wan2.2-I2V-A14B-Diffusers \
   --local-dir /path/to/Wan2.2-I2V-A14B-Diffusers
 ```
 
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt b/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt
new file mode 100644
index 00000000000..ad846ada0ff
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt
@@ -0,0 +1,21 @@
+# Core runtime
+numpy
+torch
+torchvision
+diffusers
+transformers
+accelerate
+huggingface_hub
+safetensors
+
+# Quantization stack
+neural-compressor-pt
+auto-round
+
+# Utilities used by example scripts
+pillow
+einops
+requests
+
+# Evaluation package (dataset/eval helpers)
+VBench

From bfd62b514d74d056cc1d6e657a7a9235ba17411d Mon Sep 17 00:00:00 2001
From: changwangss <chang1.wang@intel.com>
Date: Fri, 12 Jun 2026 03:09:39 +0000
Subject: [PATCH 08/14] add example readme

Signed-off-by: changwangss <chang1.wang@intel.com>
---
 examples/README.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/examples/README.md b/examples/README.md
index 26e4a5792d7..2b567ef4ef6 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -39,6 +39,19 @@ Intel® Neural Compressor validated examples with multiple compression technique
     <td>Quantization (MXFP8/FP8)</td>
     <td><a href="./pytorch/diffusion_model/diffusers/flux">link</a></td>
 </tr>
+<tr>
+    <td>Wan2.2-I2V-A14B-Diffusers</td>
+    <td>Image to Video</td>
+    <td>Quantization (MXFP8/FP8)</td>
+    <td><a href="./pytorch/diffusion_model/diffusers/wan">link</a></td>
+</tr>
+<tr>
+    <td>Wan2.2-T2V-A14B-Diffusers</td>
+    <td>Text to Video</td>
+    <td>Quantization (MXFP8/FP8)</td>
+    <td><a href="./pytorch/diffusion_model/diffusers/wan">link</a></td>
+</tr>
+
 <tr>
     <td>Llama-4-Scout-17B-16E-Instruct</td>
     <td>Multimodal Modeling</td>

From 2433e9dbbe347fbbf51342a33a5806be6afb1892 Mon Sep 17 00:00:00 2001
From: changwangss <chang1.wang@intel.com>
Date: Fri, 12 Jun 2026 08:33:51 +0000
Subject: [PATCH 09/14] update VBench install method

Signed-off-by: changwangss <chang1.wang@intel.com>
---
 .../diffusion_model/diffusers/wan/README.md   |  1 +
 .../diffusers/wan/requirements.txt            | 28 ++++++++++++++++---
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md
index 426ccb54540..81f91be3622 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/README.md
+++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md
@@ -13,6 +13,7 @@ This example provides a unified Wan entry for quantization and evaluation, with
 
 # install all runtime dependencies (including evaluation package VBench)
 pip install -r requirements.txt
+pip install VBench --no-deps
 ```
 
 ## 2. Prepare Model
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt b/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt
index ad846ada0ff..b855a26543b 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt
+++ b/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt
@@ -1,5 +1,4 @@
 # Core runtime
-numpy
 torch
 torchvision
 diffusers
@@ -13,9 +12,30 @@ neural-compressor-pt
 auto-round
 
 # Utilities used by example scripts
-pillow
 einops
-requests
 
 # Evaluation package (dataset/eval helpers)
-VBench
+Pillow
+numpy<2.0.0
+matplotlib
+timm>=0.9,<=1.0.12
+wheel
+cython
+tensorboard
+scipy
+opencv-python
+scikit-learn
+scikit-image
+openai-clip
+decord
+requests
+pyyaml
+pyiqa
+lvis
+fairscale>=0.4.4
+fvcore
+easydict
+urllib3
+boto3
+omegaconf
+pycocoevalcap

From d8d5a2e16376bf797f7475304434f0cb09df3304 Mon Sep 17 00:00:00 2001
From: changwangss <chang1.wang@intel.com>
Date: Mon, 15 Jun 2026 07:37:27 +0000
Subject: [PATCH 10/14] add vbench_dir and gdown==4.7.3, imageio-ffmpeg

Signed-off-by: changwangss <chang1.wang@intel.com>
---
 .../diffusion_model/diffusers/wan/README.md   |  5 +++++
 .../diffusers/wan/requirements.txt            |  2 ++
 .../diffusers/wan/run_benchmark.sh            | 19 +++++++++++++++----
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md
index 81f91be3622..681aadfbd10 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/README.md
+++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md
@@ -68,6 +68,11 @@ bash run_quant.sh \
 
 ## Inference + Evaluation
 
+When `--accuracy` is enabled, `run_benchmark.sh` runs VBench evaluation scripts from a local VBench repo.
+
+- Default VBench path is `$(dirname run_benchmark.sh)/VBench`.
+- If your VBench repo is elsewhere, pass `--vbench_dir=/path/to/VBench`.
+
 ### t2v bf16
 
 ```bash
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt b/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt
index b855a26543b..dd0a3842c04 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt
+++ b/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt
@@ -39,3 +39,5 @@ urllib3
 boto3
 omegaconf
 pycocoevalcap
+imageio-ffmpeg
+gdown==4.7.3
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
index 420e8ba010f..aac55b02af3 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
+++ b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
@@ -1,15 +1,17 @@
 #!/bin/bash
 set -x
 
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
 function main {
   init_params "$@"
   run_benchmark
 }
 
 function ensure_vbench_repo {
-  if [ ! -d "VBench" ]; then
+  if [ ! -d "${vbench_dir}" ]; then
     echo "VBench directory not found. Start cloning https://github.com/Vchitect/VBench.git ..."
-    git clone https://github.com/Vchitect/VBench.git
+    git clone https://github.com/Vchitect/VBench.git "${vbench_dir}"
     if [ $? -ne 0 ]; then
       echo "Error: failed to clone VBench."
       exit 1
@@ -165,6 +167,14 @@ function init_params {
         accuracy=true
         shift
       ;;
+      --vbench_dir=*)
+        vbench_dir="${1#*=}"
+        shift
+      ;;
+      --vbench_dir)
+        vbench_dir="$2"
+        shift 2
+      ;;
       *)
         echo "Error: No such parameter: $1"
         exit 1
@@ -180,6 +190,7 @@ function run_benchmark {
   output_video_path=${output_video_path:="./tmp_video"}
   accuracy=${accuracy:=false}
   disable_mxfp8_inplace_qdq=${disable_mxfp8_inplace_qdq:=false}
+  vbench_dir=${vbench_dir:="${SCRIPT_DIR}/VBench"}
 
   if [[ ! "${output_video_path}" = /* ]]; then
     output_video_path=$(realpath -s "$(pwd)/${output_video_path}")
@@ -325,7 +336,7 @@ function run_benchmark {
   if [ "${accuracy}" = "true" ]; then
     if [ "${task}" = "t2v" ]; then
       echo "Start VBench evaluation for t2v..."
-      pushd VBench
+      pushd "${vbench_dir}"
       python evaluate.py \
         --dimension "subject_consistency motion_smoothness aesthetic_quality imaging_quality overall_consistency" \
         --videos_path "${output_video_path}" \
@@ -333,7 +344,7 @@ function run_benchmark {
       popd
     elif [ "${task}" = "i2v" ]; then
       echo "Start VBench evaluation for i2v..."
-      pushd VBench
+      pushd "${vbench_dir}"
       python evaluate_i2v.py \
         --dimension "i2v_background i2v_subject subject_consistency background_consistency motion_smoothness" \
         --videos_path "${output_video_path}" \

From c7fa3ec645fe7d51e93828c519f9698680d1fab3 Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Tue, 16 Jun 2026 12:46:33 +0800
Subject: [PATCH 11/14] add setup.sh

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 examples/pytorch/diffusion_model/diffusers/wan/setup.sh | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 examples/pytorch/diffusion_model/diffusers/wan/setup.sh

diff --git a/examples/pytorch/diffusion_model/diffusers/wan/setup.sh b/examples/pytorch/diffusion_model/diffusers/wan/setup.sh
new file mode 100644
index 00000000000..a741abf3cff
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/wan/setup.sh
@@ -0,0 +1,2 @@
+pip install -r requirements.txt
+pip install VBench --no-deps
\ No newline at end of file

From 5adf041e2dbf8d0579216fb47bbe35a7188f36c0 Mon Sep 17 00:00:00 2001
From: "Wang, Chang" <chang1.wang@intel.com>
Date: Tue, 16 Jun 2026 14:07:19 +0800
Subject: [PATCH 12/14] Update setup.sh

---
 examples/pytorch/diffusion_model/diffusers/wan/setup.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/diffusion_model/diffusers/wan/setup.sh b/examples/pytorch/diffusion_model/diffusers/wan/setup.sh
index a741abf3cff..c9f9700dbbc 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/setup.sh
+++ b/examples/pytorch/diffusion_model/diffusers/wan/setup.sh
@@ -1,2 +1,2 @@
-pip install -r requirements.txt
-pip install VBench --no-deps
\ No newline at end of file
+pip install --no-cache-dir -r requirements.txt
+pip install VBench --no-deps

From b2968eb2dabf3e4fb26da44055957174fb5ff6a3 Mon Sep 17 00:00:00 2001
From: changwangss <chang1.wang@intel.com>
Date: Tue, 16 Jun 2026 13:20:33 +0000
Subject: [PATCH 13/14] i2v/t2v use quantized model loading directly

Signed-off-by: changwangss <chang1.wang@intel.com>
---
 .../diffusion_model/diffusers/wan/README.md   | 27 +++++++++++++------
 .../diffusion_model/diffusers/wan/main.py     | 15 -----------
 .../diffusers/wan/run_benchmark.sh            | 10 -------
 .../diffusers/wan/split_i2v_info.py           |  0
 .../diffusers/wan/split_t2v_prompts.py        |  0
 5 files changed, 19 insertions(+), 33 deletions(-)
 mode change 100755 => 100644 examples/pytorch/diffusion_model/diffusers/wan/split_i2v_info.py
 mode change 100755 => 100644 examples/pytorch/diffusion_model/diffusers/wan/split_t2v_prompts.py

diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md
index 681aadfbd10..85883127570 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/README.md
+++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md
@@ -57,13 +57,26 @@ bash vbench2_beta_i2v/download_data.sh
 
 ## Quantization
 
+### t2v
+
 ```bash
 # topology supports wan_mxfp8 or wan_fp8
 bash run_quant.sh \
   --topology=wan_mxfp8 \
   --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \
   --task=t2v \
-  --output_model=wan_mxfp8_model
+  --output_model=wan_mxfp8_model_t2v
+```
+
+### i2v
+
+```bash
+# topology supports wan_mxfp8 or wan_fp8
+bash run_quant.sh \
+  --topology=wan_mxfp8 \
+  --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \
+  --task=i2v \
+  --output_model=wan_mxfp8_model_i2v
 ```
 
 ## Inference + Evaluation
@@ -92,11 +105,10 @@ bash run_benchmark.sh \
 # topology supports wan_mxfp8 or wan_fp8
 bash run_benchmark.sh \
   --topology=wan_mxfp8 \
-  --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \
-  --quantized_model=wan_mxfp8_model \
+  --input_model=wan_mxfp8_model_t2v \
   --task=t2v \
   --dimension=subject_consistency,overall_consistency \
-  --prompt_folder=/path/to/VBench/prompts/prompts_per_dimension/ \
+  --prompt_folder=./VBench/prompts/prompts_per_dimension/ \
   --output_video_path=wan_t2v_mxfp8_video \
   --accuracy
 ```
@@ -121,12 +133,11 @@ bash run_benchmark.sh \
 # topology supports wan_mxfp8 or wan_fp8
 bash run_benchmark.sh \
   --topology=wan_mxfp8 \
-  --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \
-  --quantized_model=wan_mxfp8_model \
+  --input_model=wan_mxfp8_model_i2v \
   --task=i2v \
   --dimension=i2v_background,i2v_subject \
-  --image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9 \
-  --info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \
+  --image_folder=./VBench/vbench2_beta_i2v/data/crop/16-9 \
+  --info_json=./VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \
   --output_video_path=wan_i2v_mxfp8_video \
   --accuracy
 ```
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/main.py b/examples/pytorch/diffusion_model/diffusers/wan/main.py
index c4c9aeab9f1..c36ebef7177 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/main.py
+++ b/examples/pytorch/diffusion_model/diffusers/wan/main.py
@@ -158,20 +158,6 @@ def act_qdq_forward(module, x, *f_args, **f_kwargs):
                 m.forward = partial(act_qdq_forward, m)
 
 
-def load_quantized_transformers(pipe, output_dir):
-    for module_name in ["transformer", "transformer_2"]:
-        q_path = os.path.join(output_dir, module_name)
-        if not os.path.isdir(q_path):
-            raise ValueError(f"Quantized path does not exist: {q_path}")
-        print(f"Loading quantized {module_name} from {q_path}")
-        setattr(pipe, module_name, WanTransformer3DModel.from_pretrained(q_path, torch_dtype=torch.bfloat16))
-
-    # Quantized modules are replaced after pipeline construction; refresh offload hooks
-    # so newly attached modules follow the same device movement policy.
-    if hasattr(pipe, "enable_model_cpu_offload"):
-        pipe.enable_model_cpu_offload()
-
-
 def build_t2v_inputs(args):
     prompt_folder = args.prompt_folder
 
@@ -314,7 +300,6 @@ def main():
 
     if args.inference:
         if args.scheme in ["FP8", "MXFP8"]:
-            load_quantized_transformers(pipe, args.output_dir)
             apply_activation_qdq(pipe, args.scheme, args)
         run_inference(args, pipe)
 
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
index aac55b02af3..1ea381f547a 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
+++ b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
@@ -87,14 +87,6 @@ function init_params {
         task="$2"
         shift 2
       ;;
-      --quantized_model=*)
-        tuned_checkpoint="${1#*=}"
-        shift
-      ;;
-      --quantized_model)
-        tuned_checkpoint="$2"
-        shift 2
-      ;;
       --output_video_path=*)
         output_video_path="${1#*=}"
         shift
@@ -186,7 +178,6 @@ function init_params {
 function run_benchmark {
   task=${task:="t2v"}
   limit=${limit:=-1}
-  tuned_checkpoint=${tuned_checkpoint:="./tmp_autoround"}
   output_video_path=${output_video_path:="./tmp_video"}
   accuracy=${accuracy:=false}
   disable_mxfp8_inplace_qdq=${disable_mxfp8_inplace_qdq:=false}
@@ -240,7 +231,6 @@ function run_benchmark {
       --model "${input_model}"
       --task "${task}"
       --scheme "${scheme}"
-      --output_dir "${tuned_checkpoint}"
       --output_video_path "${output_video_path}"
       --limit "${limit}"
       --inference
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/split_i2v_info.py b/examples/pytorch/diffusion_model/diffusers/wan/split_i2v_info.py
old mode 100755
new mode 100644
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/split_t2v_prompts.py b/examples/pytorch/diffusion_model/diffusers/wan/split_t2v_prompts.py
old mode 100755
new mode 100644

From 90a572d71c656965fc1b5c1c77b32907b75e9181 Mon Sep 17 00:00:00 2001
From: changwangss <chang1.wang@intel.com>
Date: Wed, 17 Jun 2026 02:32:56 +0000
Subject: [PATCH 14/14] add s2v

Signed-off-by: changwangss <chang1.wang@intel.com>
---
 .../diffusion_model/diffusers/wan/README.md   | 177 +++++--
 .../diffusers/wan/evaluate_manifest_no_gt.py  | 489 ++++++++++++++++++
 .../diffusion_model/diffusers/wan/main.py     |   6 +-
 .../diffusers/wan/prepare_s2v_dataset.py      | 137 +++++
 ...uirements.txt => requirements_i2v_t2v.txt} |   1 +
 .../diffusers/wan/requirements_s2v.txt        |  41 ++
 .../diffusers/wan/run_benchmark.sh            | 417 ++++++++++-----
 .../diffusers/wan/run_quant.sh                |  41 +-
 .../diffusion_model/diffusers/wan/setup.sh    |  55 +-
 .../diffusers/wan/split_s2v_manifest.py       |  68 +++
 .../diffusion_model/diffusers/wan/wan_s2v.py  | 310 +++++++++++
 11 files changed, 1545 insertions(+), 197 deletions(-)
 create mode 100644 examples/pytorch/diffusion_model/diffusers/wan/evaluate_manifest_no_gt.py
 create mode 100644 examples/pytorch/diffusion_model/diffusers/wan/prepare_s2v_dataset.py
 rename examples/pytorch/diffusion_model/diffusers/wan/{requirements.txt => requirements_i2v_t2v.txt} (99%)
 create mode 100644 examples/pytorch/diffusion_model/diffusers/wan/requirements_s2v.txt
 create mode 100644 examples/pytorch/diffusion_model/diffusers/wan/split_s2v_manifest.py
 create mode 100644 examples/pytorch/diffusion_model/diffusers/wan/wan_s2v.py

diff --git a/examples/pytorch/diffusion_model/diffusers/wan/README.md b/examples/pytorch/diffusion_model/diffusers/wan/README.md
index 85883127570..b4f48ddb23e 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/README.md
+++ b/examples/pytorch/diffusion_model/diffusers/wan/README.md
@@ -1,61 +1,88 @@
 # Step-by-Step
 
-This example provides a unified Wan entry for quantization and evaluation, with both t2v and i2v support.
+This example offers a unified Wan workflow for quantization and evaluation, covering `t2v` and `i2v` via `main.py`, and `s2v` via `wan_s2v.py`.
 
-# Prerequisite
+# Prerequisites
 
-## 1. Environment
+## 1 Environment
 
-```shell
-# Use latest dev branch if needed before release
-# INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master
-# pip install git+https://github.com/intel/auto-round.git@main
+Use the helper script to install task-specific dependencies:
 
-# install all runtime dependencies (including evaluation package VBench)
-pip install -r requirements.txt
-pip install VBench --no-deps
-```
-
-## 2. Prepare Model
-
-Use a local Wan diffusers model path, for example:
+```bash
+# t2v / i2v setup (installs requirements_i2v_t2v.txt and VBench by default)
+bash setup.sh --task t2v
+bash setup.sh --task i2v
 
-- Wan2.2-T2V-A14B-Diffusers
-- Wan2.2-I2V-A14B-Diffusers
+# s2v setup (installs requirements_s2v.txt, skips VBench by default)
+bash setup.sh --task s2v
+```
 
-Download example (from Hugging Face):
 
+## 2 Model Preparation
 ```bash
 # optional: update CLI to latest version
 pip install -U "huggingface_hub[cli]"
 
-# t2v model
+
 hf download Wan-AI/Wan2.2-T2V-A14B-Diffusers \
   --local-dir /path/to/Wan2.2-T2V-A14B-Diffusers
 
-# i2v model
+
 hf download Wan-AI/Wan2.2-I2V-A14B-Diffusers \
   --local-dir /path/to/Wan2.2-I2V-A14B-Diffusers
+
+
+hf download Wan-AI/Wan2.2-S2V-14B \
+  --local-dir /path/to/Wan2.2-S2V-14B
 ```
 
-## 3. Prepare Dataset
-Clone VBench to prepare the required dataset, then download i2v data:
+
+## 3 Dataset Preparation
+
+### t2v / i2v
+
+Both `t2v` and `i2v` use VBench data.
+Manual preparation is recommended for better reproducibility and control.
+Use `--vbench_dir=/path/to/VBench` in `run_benchmark.sh` when your VBench repo is not under the default path.
+
+If you prepare VBench manually:
 
 ```bash
-# required for dataset preparation
 git clone https://github.com/Vchitect/VBench.git
 cd VBench
 bash vbench2_beta_i2v/download_data.sh
 ```
 
-- t2v: pass prompt folder with --prompt_folder, and set --dimension to select `${prompt_folder}/${dimension}.txt`
-- t2v/i2v: pass comma-separated values in `--dimension` to run multiple dimensions in one command (e.g., `subject_consistency,overall_consistency`)
-- t2v: can pass --dimension for evaluation filtering (validated dimensions include `subject_consistency` and `overall_consistency`)
-- i2v: pass --image_folder, --info_json, and --dimension (validated dimensions include `i2v_subject`, `i2v_background`, `subject_consistency`, `background_consistency`, and `motion_smoothness`)
+Then use:
+- t2v:
+  - `--prompt_folder=/path/to/VBench/prompts/prompts_per_dimension/`
+  - `--dimension=subject_consistency,overall_consistency`
+
+- i2v:
+  - `--image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9`
+  - `--info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json`
+  - `--dimension=i2v_background,i2v_subject`
+
+### s2v
+
+Manual preparation is recommended.
+
+```bash
+# Step 1: clone EchoMimicV3 repo
+git clone https://github.com/antgroup/echomimic_v3.git /path/to/echomimic_v3
+
+# Step 2: build s2v manifest json
+python3 prepare_s2v_dataset.py \
+  --repo-dir /path/to/echomimic_v3 \
+  --manifest-out /path/to/s2v_input_manifest.json
+
+```
+
+The generated `/path/to/s2v_input_manifest.json` is the s2v input manifest passed via `--manifest_path`, with image and audio stored as absolute paths.
 
 # Run
 
-## Quantization
+## Quantization
 
 ### t2v
 
@@ -79,12 +106,33 @@ bash run_quant.sh \
   --output_model=wan_mxfp8_model_i2v
 ```
 
-## Inference + Evaluation
+### s2v
 
-When `--accuracy` is enabled, `run_benchmark.sh` runs VBench evaluation scripts from a local VBench repo.
+```bash
+# Prepare Wan2.2 for s2v
+git clone https://github.com/Wan-Video/Wan2.2.git /path/to/Wan2.2
+```
+
+```bash
+# topology supports wan_mxfp8 or wan_fp8
+bash run_quant.sh \
+  --topology=wan_mxfp8 \
+  --input_model=/path/to/Wan2.2-S2V-14B \
+  --task=s2v \
+  --wan_dir=/path/to/Wan2.2 \
+  --output_model=wan_mxfp8_model_s2v
+```
 
-- Default VBench path is `$(dirname run_benchmark.sh)/VBench`.
-- If your VBench repo is elsewhere, pass `--vbench_dir=/path/to/VBench`.
+Note:
+- For `task=s2v`, prepare Wan2.2 manually and pass `--wan_dir=/path/to/Wan2.2` when needed.
+- `run_quant.sh` sets `PYTHONPATH` internally for s2v, so you do not need to export it manually.
+- For `task=s2v`, `run_quant.sh` dispatches to `wan_s2v.py --quantize` in this example.
+
+
+
+## Inference + Evaluation
+
+Note: For `task=t2v/i2v`, prepare VBench manually first, and pass `--vbench_dir=/path/to/VBench` when needed.
 
 ### t2v bf16
 
@@ -94,7 +142,7 @@ bash run_benchmark.sh \
   --input_model=/path/to/Wan2.2-T2V-A14B-Diffusers \
   --task=t2v \
   --dimension=subject_consistency,overall_consistency \
-  --prompt_folder=/path/to/VBench/prompts/prompts_per_dimension/ \
+  --vbench_dir=/path/to/VBench \
   --output_video_path=wan_t2v_bf16_video \
   --accuracy
 ```
@@ -102,13 +150,12 @@ bash run_benchmark.sh \
 ### t2v mxfp8/fp8
 
 ```bash
-# topology supports wan_mxfp8 or wan_fp8
 bash run_benchmark.sh \
   --topology=wan_mxfp8 \
   --input_model=wan_mxfp8_model_t2v \
   --task=t2v \
   --dimension=subject_consistency,overall_consistency \
-  --prompt_folder=./VBench/prompts/prompts_per_dimension/ \
+  --vbench_dir=/path/to/VBench \
   --output_video_path=wan_t2v_mxfp8_video \
   --accuracy
 ```
@@ -121,8 +168,7 @@ bash run_benchmark.sh \
   --input_model=/path/to/Wan2.2-I2V-A14B-Diffusers \
   --task=i2v \
   --dimension=i2v_background,i2v_subject \
-  --image_folder=/path/to/VBench/vbench2_beta_i2v/data/crop/16-9 \
-  --info_json=/path/to/VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \
+  --vbench_dir=/path/to/VBench \
   --output_video_path=wan_i2v_bf16_video \
   --accuracy
 ```
@@ -130,19 +176,55 @@ bash run_benchmark.sh \
 ### i2v mxfp8/fp8
 
 ```bash
-# topology supports wan_mxfp8 or wan_fp8
 bash run_benchmark.sh \
   --topology=wan_mxfp8 \
   --input_model=wan_mxfp8_model_i2v \
   --task=i2v \
   --dimension=i2v_background,i2v_subject \
-  --image_folder=./VBench/vbench2_beta_i2v/data/crop/16-9 \
-  --info_json=./VBench/vbench2_beta_i2v/vbench2_i2v_full_info.json \
+  --vbench_dir=/path/to/VBench \
   --output_video_path=wan_i2v_mxfp8_video \
   --accuracy
 ```
 
-Note: For sharding and multi-GPU execution, set `--gpu_ids` (for example `--gpu_ids=0,1,2,3`) or set `CUDA_VISIBLE_DEVICES` before running `run_benchmark.sh`.
+### s2v bf16
+
+Note: For `task=s2v`, prepare Wan2.2 manually and pass `--wan_dir=/path/to/Wan2.2` when needed.
+
+```bash
+bash run_benchmark.sh \
+  --topology=wan_bf16 \
+  --task=s2v \
+  --input_model=/path/to/Wan2.2-S2V-14B \
+  --wan_dir=/path/to/Wan2.2 \
+  --manifest_path=/path/to/s2v_input_manifest.json \
+  --output_video_path=wan_s2v_bf16_video \
+  --accuracy
+```
+
+### s2v mxfp8/fp8
+
+```bash
+bash run_benchmark.sh \
+  --topology=wan_mxfp8 \
+  --task=s2v \
+  --input_model=/path/to/Wan2.2-S2V-14B \
+  --quantized_model=wan_mxfp8_model_s2v \
+  --wan_dir=/path/to/Wan2.2 \
+  --manifest_path=/path/to/s2v_input_manifest.json \
+  --output_video_path=wan_s2v_mxfp8_video \
+  --accuracy
+```
+
+When `task=s2v` and `--accuracy` is set, `run_benchmark.sh` will run generation via `wan_s2v.py`, then run `evaluate_manifest_no_gt.py`.
+
+- Optional eval arg: `--s2v_eval_output` (default: `${output_video_path}/evaluation_no_gt_metrics_s2v.json`)
+- Internal defaults: matched manifest `${output_video_path}/s2v_manifest_with_generate_video.json`, `max_frames=32`, `metric_size=192`
+
+
+Metric note: the current s2v benchmark manifest does not provide ground-truth videos, so `FID` and `FVD` are not computed. The script reports proxy metrics from available image/audio/generated-video signals (for example SSIM, PSNR, Sync-C, HKC, HKV, CSIM, EFID).
+
+For sharding and multi-GPU execution, set `--gpu_ids` (for example `--gpu_ids=0,1,2,3`) or set `CUDA_VISIBLE_DEVICES` before running `run_benchmark.sh`.
+
 
 ### Standalone Accuracy Evaluation (Optional)
 
@@ -164,11 +246,12 @@ python evaluate_i2v.py \
   --videos_path /path/to/wan_i2v_bf16_video \
   --ratio 16-9 \
   --mode vbench_standard
-```
-
-# Notes
 
-- Quantized weights are saved under:
-  - <output_model>/transformer
-  - <output_model>/transformer_2
+# s2v standalone evaluation from generated manifest.
+python evaluate_manifest_no_gt.py \
+  --manifest /path/to/wan_s2v_output/s2v_manifest_with_generate_video.json \
+  --output /path/to/wan_s2v_output/evaluation_no_gt_metrics_s2v.json \
+  --max_frames 32 \
+  --metric_size 192
+```
 
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/evaluate_manifest_no_gt.py b/examples/pytorch/diffusion_model/diffusers/wan/evaluate_manifest_no_gt.py
new file mode 100644
index 00000000000..26dfccf8fc6
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/wan/evaluate_manifest_no_gt.py
@@ -0,0 +1,489 @@
+import argparse
+import glob
+import json
+import math
+from pathlib import Path
+
+import cv2
+import numpy as np
+from scipy.io import wavfile
+
+
+def _resolve_manifest_path(path_value: str, root: Path) -> Path:
+	"""Return path_value as a Path if it is absolute; otherwise join it onto root (no normalization or existence check)."""
+	p = Path(path_value)
+	if p.is_absolute():
+		return p
+	return root / p
+
+
+def _build_sample_dict(manifest):  # normalize a parsed manifest (dict or list) to {sample_id as str: sample dict}
+	if isinstance(manifest, dict):
+		return {str(k): v for k, v in manifest.items() if isinstance(v, dict)}  # non-dict values are dropped
+	if isinstance(manifest, list):
+		result = {}
+		for idx, item in enumerate(manifest):
+			if not isinstance(item, dict):
+				continue
+			sample_id = str(item.get("id", idx))  # fall back to the list position when "id" is absent
+			result[sample_id] = item  # a later entry with the same id overwrites an earlier one
+		return result
+	raise ValueError("Manifest must be a JSON object or list")
+
+
+def build_matched_manifest(source_manifest_path: Path, generated_video_dir: Path):  # -> {sample_id: {prompt, image, audio, generate_video}}
+	with source_manifest_path.open("r", encoding="utf-8") as f:
+		source_manifest = json.load(f)
+
+	source_samples = _build_sample_dict(source_manifest)
+	manifest_root = source_manifest_path.parent  # base directory for relative image/audio paths
+	video_files = sorted(glob.glob(str(generated_video_dir / "*.mp4")))  # sorted so candidates[-1] below is deterministic
+
+	matched = {}
+	for sample_id, sample in source_samples.items():
+		prompt = sample.get("prompt")
+		image = sample.get("image")
+		audio = sample.get("audio")
+		if not prompt or not image or not audio:  # skip samples with a missing or empty prompt/image/audio
+			continue
+
+		if not Path(image).is_absolute():
+			image = str((manifest_root / image).resolve())
+		if not Path(audio).is_absolute():
+			audio = str((manifest_root / audio).resolve())
+
+		prefix = f"{sample_id}_"  # generated files are matched by the "<sample_id>_" filename prefix
+		candidates = [vp for vp in video_files if Path(vp).name.startswith(prefix)]
+		if not candidates:  # no generated video for this sample: it is left out of the result
+			continue
+
+		matched[sample_id] = {
+			"prompt": prompt,
+			"image": image,
+			"audio": audio,
+			"generate_video": str(Path(candidates[-1]).resolve()),  # lexicographically last match wins
+		}
+
+	return matched
+
+
+def psnr(img1: np.ndarray, img2: np.ndarray) -> float:  # PSNR in dB with the peak value fixed at 255
+	mse = np.mean((img1.astype(np.float64) - img2.astype(np.float64)) ** 2)
+	if mse <= 1e-12:  # (near-)identical inputs: return a 100 dB cap instead of dividing by zero
+		return 100.0
+	return 20.0 * math.log10(255.0 / math.sqrt(mse))
+
+
+def ssim(img1: np.ndarray, img2: np.ndarray) -> float:
+	# Mean SSIM over an 11x11 Gaussian window (sigma 1.5); 3-channel inputs are first converted BGR -> grayscale.
+	if img1.ndim == 3:
+		img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
+	if img2.ndim == 3:
+		img2 = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)
+
+	img1 = img1.astype(np.float64)
+	img2 = img2.astype(np.float64)
+
+	c1 = (0.01 * 255) ** 2  # stabilizer for the luminance term (dynamic range taken as 255)
+	c2 = (0.03 * 255) ** 2  # stabilizer for the contrast/structure term
+
+	kernel = cv2.getGaussianKernel(11, 1.5)
+	window = kernel @ kernel.T  # outer product of the 1-D kernel gives the 2-D 11x11 window
+
+	mu1 = cv2.filter2D(img1, -1, window)[5:-5, 5:-5]  # [5:-5] drops the 5-pixel border where the window overlaps the edge
+	mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5]
+	mu1_sq = mu1 * mu1
+	mu2_sq = mu2 * mu2
+	mu1_mu2 = mu1 * mu2
+
+	sigma1_sq = cv2.filter2D(img1 * img1, -1, window)[5:-5, 5:-5] - mu1_sq  # local variance: E[x^2] - E[x]^2
+	sigma2_sq = cv2.filter2D(img2 * img2, -1, window)[5:-5, 5:-5] - mu2_sq
+	sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2  # local covariance: E[xy] - E[x]E[y]
+
+	ssim_map = ((2 * mu1_mu2 + c1) * (2 * sigma12 + c2)) / (
+		(mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2)
+	)
+	return float(ssim_map.mean())  # NOTE(review): inputs must share one shape and exceed 10 px per side, else the crop is empty
+
+
+def read_video_frames(video_path: Path, max_frames: int = 120):  # -> list of at most max_frames frames, in file order
+	cap = cv2.VideoCapture(str(video_path))
+	if not cap.isOpened():
+		raise RuntimeError(f"Failed to open video: {video_path}")
+
+	frames = []
+	while cap.isOpened() and len(frames) < max_frames:
+		ok, frame = cap.read()
+		if not ok:  # end of stream or a decode failure: stop and keep what was read so far
+			break
+		frames.append(frame)
+
+	cap.release()  # NOTE(review): not in a finally block, so an exception raised by read() would skip the release
+	if not frames:
+		raise RuntimeError(f"No frame read from video: {video_path}")
+	return frames
+
+
+def _read_audio_mono(audio_path: Path):  # -> (mono float32 samples, sample rate as int)
+	sr, wav = wavfile.read(str(audio_path))
+	needs_peak_norm = wav.dtype != np.float32  # must be decided BEFORE the float32 cast; testing after it is always False
+	mono = wav.mean(axis=1) if wav.ndim == 2 else wav  # average the channels of multi-channel audio
+	wav = mono.astype(np.float32)
+	if needs_peak_norm:  # e.g. integer PCM: scale by the peak so samples lie in [-1, 1]
+		peak = float(np.max(np.abs(wav)))
+		wav = wav / peak if peak > 0 else wav  # all-zero (silent) audio is returned unchanged
+	return wav, int(sr)
+
+
+def _resample_1d(x: np.ndarray, target_len: int):  # linearly interpolate x onto target_len evenly spaced points
+	if len(x) == target_len:
+		return x  # same length: returned as-is (no copy, no dtype change)
+	if len(x) == 0:
+		return np.zeros(target_len, dtype=np.float32)  # nothing to interpolate from
+	xp = np.linspace(0.0, 1.0, num=len(x), endpoint=True)  # both sequences are mapped onto [0, 1]
+	xnew = np.linspace(0.0, 1.0, num=target_len, endpoint=True)
+	return np.interp(xnew, xp, x).astype(np.float32)
+
+
+def _face_mouth_roi(gray: np.ndarray, face_cascade):  # -> (x1, y1, x2, y2) box used as the mouth region
+	faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(40, 40))
+	if len(faces) > 0:
+		x, y, w, h = max(faces, key=lambda b: b[2] * b[3])  # keep the largest detection by area
+		x1 = int(x + 0.2 * w)  # central 60% of the face width ...
+		x2 = int(x + 0.8 * w)
+		y1 = int(y + 0.55 * h)  # ... and the band from 55% to 95% of the face height
+		y2 = int(y + 0.95 * h)
+	else:
+		h, w = gray.shape[:2]  # no face found: fall back to a fixed lower-central region of the whole frame
+		x1, x2 = int(0.3 * w), int(0.7 * w)
+		y1, y2 = int(0.58 * h), int(0.9 * h)
+	x1 = max(0, min(x1, gray.shape[1] - 1))  # clamp into the image and force a box of at least 1x1 pixels
+	x2 = max(x1 + 1, min(x2, gray.shape[1]))
+	y1 = max(0, min(y1, gray.shape[0] - 1))
+	y2 = max(y1 + 1, min(y2, gray.shape[0]))
+	return x1, y1, x2, y2
+
+
+def compute_sync_c(video_frames, audio_path: Path, fps: float = 25.0):
+	# Proxy Sync-C: Pearson correlation between mouth-region motion and the audio energy envelope, in [-1, 1].
+	try:
+		audio, sr = _read_audio_mono(audio_path)
+		if len(video_frames) < 2 or len(audio) < 2:
+			return 0.0
+
+		samples_per_frame = max(1, int(sr / fps))  # fps is the caller-supplied value, not read from the video
+		# Use per-frame audio energy then align with frame-difference count.
+		n_audio_frames = max(1, len(audio) // samples_per_frame)
+		audio = audio[: n_audio_frames * samples_per_frame]  # drop the trailing partial frame so the reshape is exact
+		audio_frame = audio.reshape(n_audio_frames, samples_per_frame)
+		audio_energy = np.mean(np.abs(audio_frame), axis=1)  # mean absolute amplitude per video-frame-sized chunk
+
+		face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
+		motions = []
+		prev = cv2.cvtColor(video_frames[0], cv2.COLOR_BGR2GRAY)
+		for fr in video_frames[1:]:
+			cur = cv2.cvtColor(fr, cv2.COLOR_BGR2GRAY)
+			x1, y1, x2, y2 = _face_mouth_roi(cur, face_cascade)  # ROI comes from the current frame and is applied to both frames
+			d = np.mean(np.abs(cur[y1:y2, x1:x2].astype(np.float32) - prev[y1:y2, x1:x2].astype(np.float32)))
+			motions.append(d)
+			prev = cur
+
+		motions = np.asarray(motions, dtype=np.float32)
+		if len(motions) < 2:
+			return 0.0
+		audio_aligned = _resample_1d(audio_energy.astype(np.float32), len(motions))  # stretch/shrink the envelope to one value per motion sample
+		if np.std(audio_aligned) < 1e-8 or np.std(motions) < 1e-8:  # a constant signal has no defined correlation
+			return 0.0
+		corr = np.corrcoef(audio_aligned, motions)[0, 1]
+		if np.isnan(corr):
+			return 0.0
+		return float(np.clip(corr, -1.0, 1.0))
+	except Exception:  # NOTE(review): every failure (unreadable audio, cv2 error, ...) is silently scored 0.0, same as "no correlation"
+		return 0.0
+
+
+def compute_hkc_hkv(video_frames):
+	# Proxy HKC/HKV from frame-to-frame motion in two side regions; returns (hkc, hkv) as floats.
+	if len(video_frames) < 2:
+		return 0.0, 0.0
+
+	prev = cv2.cvtColor(video_frames[0], cv2.COLOR_BGR2GRAY)
+	hand_motion = []
+	for fr in video_frames[1:]:
+		cur = cv2.cvtColor(fr, cv2.COLOR_BGR2GRAY)
+		h, w = cur.shape
+		# two side regions where hands often appear in portrait talking videos
+		left = (slice(int(0.35 * h), int(0.85 * h)), slice(0, int(0.32 * w)))  # rows 35%-85%, leftmost 32% of columns
+		right = (slice(int(0.35 * h), int(0.85 * h)), slice(int(0.68 * w), w))  # same rows, rightmost 32% of columns
+		dleft = np.mean(np.abs(cur[left].astype(np.float32) - prev[left].astype(np.float32)))
+		dright = np.mean(np.abs(cur[right].astype(np.float32) - prev[right].astype(np.float32)))
+		hand_motion.append((dleft + dright) * 0.5)  # average of the two sides' mean absolute pixel difference
+		prev = cur
+
+	hm = np.asarray(hand_motion, dtype=np.float32)
+	if hm.size == 0:
+		return 0.0, 0.0
+	# HKC proxy in [0,1]: normalized average hand activity.
+	hkc = float(np.clip(hm.mean() / 25.0, 0.0, 1.0))  # 25.0 is a fixed scale; its origin is not documented in this file
+	hkv = float(np.var(hm))  # HKV proxy: variance of the per-frame motion values (not normalized)
+	return hkc, hkv
+
+
+def compute_csim(reference_bgr: np.ndarray, target_bgr: np.ndarray):
+	# Proxy CSIM: cosine similarity of a hand-crafted color+edge descriptor (no face-embedding network is used here).
+	def feat(img):
+		img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_AREA)
+		hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
+		hist = cv2.calcHist([hsv], [0, 1], None, [24, 24], [0, 180, 0, 256]).flatten().astype(np.float32)  # 24x24 hue/saturation histogram
+		hist = hist / (np.linalg.norm(hist) + 1e-8)  # L2-normalize; the epsilon guards an all-zero vector
+		g = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+		edges = cv2.Canny(g, 60, 160).astype(np.float32)
+		edges = cv2.resize(edges, (56, 56), interpolation=cv2.INTER_AREA).flatten()  # edge map downsampled to 56x56
+		edges = edges / (np.linalg.norm(edges) + 1e-8)
+		return np.concatenate([hist, edges], axis=0)  # color part and edge part are each unit-norm before concatenation
+
+	f1 = feat(reference_bgr)
+	f2 = feat(target_bgr)
+	sim = float(np.dot(f1, f2) / ((np.linalg.norm(f1) * np.linalg.norm(f2)) + 1e-8))
+	return float(np.clip(sim, -1.0, 1.0))
+
+
+def _feature_for_efid(img_bgr: np.ndarray):  # -> 1026-dim float64 vector: 32x32 H/S histogram + gray mean and std
+	img = cv2.resize(img_bgr, (224, 224), interpolation=cv2.INTER_AREA)
+	hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
+	hist_hs = cv2.calcHist([hsv], [0, 1], None, [32, 32], [0, 180, 0, 256]).flatten().astype(np.float64)
+	hist_hs = hist_hs / (np.sum(hist_hs) + 1e-12)  # normalize to sum 1 (L1), unlike the L2 norm used in compute_csim
+	gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(np.float64) / 255.0
+	stat = np.array([gray.mean(), gray.std()], dtype=np.float64)  # global brightness statistics on a 0..1 scale
+	return np.concatenate([hist_hs, stat], axis=0)
+
+
+def _sqrtm_psd(mat: np.ndarray):
+	# Square root of a symmetric PSD matrix via eigendecomposition; eigh assumes mat is symmetric, so results for other inputs are not meaningful.
+	vals, vecs = np.linalg.eigh(mat)
+	vals = np.clip(vals, 0.0, None)  # clamp small negative eigenvalues (round-off) so the sqrt stays real
+	return (vecs * np.sqrt(vals)) @ vecs.T  # V @ diag(sqrt(vals)) @ V.T, with the diag applied by column scaling
+
+
+def frechet_distance(feats1: np.ndarray, feats2: np.ndarray):
+	# Frechet distance between Gaussians fitted to the rows (samples) of feats1 and feats2; result is clamped to >= 0.
+	mu1, mu2 = np.mean(feats1, axis=0), np.mean(feats2, axis=0)
+	s1 = np.atleast_2d(np.cov(feats1, rowvar=False))  # np.cov returns a 0-d array for scalar features
+	s2 = np.atleast_2d(np.cov(feats2, rowvar=False))
+	# s1 @ s2 is generally NOT symmetric, so the eigh-based _sqrtm_psd must not be applied to it directly.
+	# Use the identity tr(sqrt(s1 s2)) == tr(sqrt(s1^(1/2) s2 s1^(1/2))): both products share their eigenvalues,
+	# and the right-hand inner matrix is symmetric PSD, which is what _sqrtm_psd requires.
+	s1_half = _sqrtm_psd(s1)
+	inner = s1_half @ s2 @ s1_half
+	tr_covmean = np.trace(_sqrtm_psd((inner + inner.T) * 0.5))  # averaging with the transpose removes round-off asymmetry
+	diff = mu1 - mu2
+	fid = diff @ diff + np.trace(s1) + np.trace(s2) - 2.0 * tr_covmean
+	return float(max(fid, 0.0))
+
+
+def evaluate_manifest(
+	manifest_path: Path,
+	output_path: Path,
+	max_frames: int,
+	metric_size: int,
+):
+	"""Compute no-ground-truth proxy metrics for every sample of an S2V manifest.
+
+	Each manifest entry needs ``image`` and ``generate_video``; ``audio`` is optional and
+	only feeds Sync-C (reported as 0.0 without it). A sample that raises is recorded
+	under ``failed_samples`` and does not abort the run.
+
+	Args:
+		manifest_path: JSON file holding a non-empty ``{sample_id: sample}`` object.
+		output_path: Where the JSON report is written; missing parent dirs are created.
+		max_frames: Forwarded to ``read_video_frames`` as its ``max_frames`` argument.
+		metric_size: Image and frames are resized to ``metric_size x metric_size`` first.
+
+	Returns:
+		The report dict (``config``, ``summary``, ``per_sample``, ``failed_samples``).
+
+	Raises:
+		ValueError: If the manifest is not a non-empty JSON object.
+	"""
+	with manifest_path.open("r", encoding="utf-8") as f:
+		manifest = json.load(f)
+	if not isinstance(manifest, dict) or not manifest:
+		raise ValueError("Manifest must be a non-empty object.")
+	manifest_dir = manifest_path.parent
+
+	per_sample = {}
+	efid_ref_feats = []
+	efid_gen_feats = []
+	failed = []
+
+	for sample_id, sample in manifest.items():
+		try:
+			image_rel = sample.get("image")
+			video_rel = sample.get("generate_video")
+			audio_rel = sample.get("audio")
+			if not image_rel or not video_rel:
+				raise ValueError("Missing image or generate_video field")
+
+			image_path = _resolve_manifest_path(image_rel, manifest_dir)
+			video_path = _resolve_manifest_path(video_rel, manifest_dir)
+			audio_path = _resolve_manifest_path(audio_rel, manifest_dir) if audio_rel else None
+
+			if not image_path.exists():
+				raise FileNotFoundError(f"Image not found: {image_path}")
+			if not video_path.exists():
+				raise FileNotFoundError(f"Video not found: {video_path}")
+
+			ref = cv2.imread(str(image_path))
+			if ref is None:
+				raise RuntimeError(f"Cannot read image: {image_path}")
+
+			frames = read_video_frames(video_path, max_frames=max_frames)
+			if len(frames) == 0:
+				# Fail with a clear message instead of an IndexError on frames_m[0] below.
+				raise RuntimeError(f"No frames decoded from video: {video_path}")
+
+			# Resize for faster and more stable metric computation.
+			target_size = (metric_size, metric_size)
+			ref_m = cv2.resize(ref, target_size, interpolation=cv2.INTER_AREA)
+			frames_m = [cv2.resize(fr, target_size, interpolation=cv2.INTER_AREA) for fr in frames]
+			first = frames_m[0]
+			ssim_frames = [ssim(ref_m, fr) for fr in frames_m]
+			psnr_frames = [psnr(ref_m, fr) for fr in frames_m]
+			hkc, hkv = compute_hkc_hkv(frames_m)
+			sample_metrics = {
+				"image": image_rel,
+				"generate_video": video_rel,
+				"num_frames_used": len(frames),
+				"ssim_image_vs_first_frame": ssim(ref_m, first),
+				"psnr_image_vs_first_frame": psnr(ref_m, first),
+				"ssim_image_vs_all_frames_avg": float(np.mean(ssim_frames)),
+				"psnr_image_vs_all_frames_avg": float(np.mean(psnr_frames)),
+				"Sync-C": compute_sync_c(frames_m, audio_path, fps=25.0) if audio_path else 0.0,
+				"HKC": hkc,
+				"HKV": hkv,
+				"CSIM": compute_csim(ref_m, first),
+			}
+			# Extract EFID features before recording, so a sample is never both successful and failed.
+			ref_feat = _feature_for_efid(ref_m)
+			gen_feat = _feature_for_efid(first)
+			per_sample[sample_id] = sample_metrics
+			efid_ref_feats.append(ref_feat)
+			efid_gen_feats.append(gen_feat)
+		except Exception as e:
+			failed.append({"sample_id": sample_id, "error": str(e)})
+
+	def _mean_metric(key):
+		# Mean of one per-sample metric over the successful samples; None when there are none.
+		return float(np.mean([m[key] for m in per_sample.values()])) if per_sample else None
+
+	efid = None
+	if len(efid_ref_feats) >= 2 and len(efid_gen_feats) >= 2:
+		efid = frechet_distance(np.stack(efid_ref_feats, axis=0), np.stack(efid_gen_feats, axis=0))
+
+	summary = {
+		"num_samples_total": len(manifest),
+		"num_samples_success": len(per_sample),
+		"num_samples_failed": len(failed),
+		"metrics": {
+			"ssim_image_vs_first_frame_mean": _mean_metric("ssim_image_vs_first_frame"),
+			"psnr_image_vs_first_frame_mean": _mean_metric("psnr_image_vs_first_frame"),
+			"ssim_image_vs_all_frames_avg_mean": _mean_metric("ssim_image_vs_all_frames_avg"),
+			"psnr_image_vs_all_frames_avg_mean": _mean_metric("psnr_image_vs_all_frames_avg"),
+			"Sync-C_mean": _mean_metric("Sync-C"),
+			"HKC_mean": _mean_metric("HKC"),
+			"HKV_mean": _mean_metric("HKV"),
+			"CSIM_mean": _mean_metric("CSIM"),
+			"EFID_reference_vs_firstframe": efid,
+		},
+		"unavailable_metrics": {
+			"FID": "Unavailable without real image/video distribution (ground-truth set).",
+			"FVD": "Unavailable without real video set and feature extractor pipeline for real vs generated distributions.",
+			"reason": "Current manifest contains prompt/image/audio/generate_video but no real video references.",
+		},
+		"metric_notes": {
+			"Sync-C": "No-GT proxy via audio-energy and mouth-motion correlation.",
+			"HKC": "No-GT proxy via side-region hand-motion confidence.",
+			"HKV": "No-GT proxy via variance of hand-motion energy.",
+			"CSIM": "No-GT proxy identity similarity using color+texture cosine similarity.",
+			"EFID": "No-GT proxy Fréchet distance between reference-image and generated-first-frame handcrafted features.",
+		},
+	}
+
+	output = {
+		"config": {
+			"manifest_path": str(manifest_path),
+			"path_resolution": "Use absolute paths in manifest directly; relative paths are resolved by manifest dir.",
+			"max_frames": max_frames,
+			"metric_size": metric_size,
+			"note": "This is no-ground-truth evaluation. SSIM/PSNR are computed as image-to-video fidelity proxies.",
+		},
+		"summary": summary,
+		"per_sample": per_sample,
+		"failed_samples": failed,
+	}
+
+	# Create the destination directory so a long evaluation is not lost on the final write.
+	output_path.parent.mkdir(parents=True, exist_ok=True)
+	with output_path.open("w", encoding="utf-8") as f:
+		json.dump(output, f, ensure_ascii=False, indent=2)
+
+	return output
+
+
+def main():
+	"""CLI entry point: optionally build a matched manifest, evaluate it, and print the summary."""
+	cli = argparse.ArgumentParser(description="No-GT evaluation for S2V manifest")
+	string_options = (
+		(
+			"--manifest",
+			"./s2v_manifest_with_generate_video.json",
+			"Input manifest JSON path. Default: ./s2v_manifest_with_generate_video.json",
+		),
+		(
+			"--output",
+			"./evaluation_no_gt_metrics_s2v.json",
+			"Output metrics JSON path. Default: ./evaluation_no_gt_metrics_s2v.json",
+		),
+		(
+			"--generated_video_dir",
+			None,
+			"Directory containing generated mp4 files. If set, evaluator will build matched manifest from --manifest.",
+		),
+		(
+			"--matched_manifest_output",
+			None,
+			"Output path for matched manifest with generate_video field.",
+		),
+	)
+	for flag, default, help_text in string_options:
+		cli.add_argument(flag, type=str, default=default, help=help_text)
+	# The integer options are registered without help text.
+	for flag, default in (("--max_frames", 120), ("--metric_size", 256)):
+		cli.add_argument(flag, type=int, default=default)
+	args = cli.parse_args()
+
+	manifest_path = Path(args.manifest)
+	if args.generated_video_dir:
+		# Build the matched manifest, save it, and evaluate the saved copy.
+		matched = build_matched_manifest(
+			source_manifest_path=manifest_path,
+			generated_video_dir=Path(args.generated_video_dir),
+		)
+		manifest_path = Path(args.matched_manifest_output or "./s2v_manifest_with_generate_video.json")
+		manifest_path.parent.mkdir(parents=True, exist_ok=True)
+		manifest_path.write_text(json.dumps(matched, ensure_ascii=False, indent=2), encoding="utf-8")
+
+	result = evaluate_manifest(
+		manifest_path=manifest_path,
+		output_path=Path(args.output),
+		max_frames=args.max_frames,
+		metric_size=args.metric_size,
+	)
+
+	print("Evaluation done.")
+	print(json.dumps(result["summary"], ensure_ascii=False, indent=2))
+	print(f"Saved: {args.output}")
+
+
+if __name__ == "__main__":
+	main()
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/main.py b/examples/pytorch/diffusion_model/diffusers/wan/main.py
index c36ebef7177..bd22612a21a 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/main.py
+++ b/examples/pytorch/diffusion_model/diffusers/wan/main.py
@@ -84,12 +84,14 @@ def build_pipeline(args):
     if args.task == "t2v":
         vae = AutoencoderKLWan.from_pretrained(args.model, subfolder="vae", torch_dtype=torch.float32)
         pipe = WanPipeline.from_pretrained(args.model, vae=vae, torch_dtype=torch.bfloat16)
-        pipe.enable_model_cpu_offload()
+        if not args.quantize:
+            pipe.enable_model_cpu_offload()
         return pipe
 
     if args.task == "i2v":
         pipe = WanImageToVideoPipeline.from_pretrained(args.model, torch_dtype=torch.bfloat16)
-        pipe.enable_model_cpu_offload()
+        if not args.quantize:
+            pipe.enable_model_cpu_offload()
         return pipe
 
     raise ValueError(f"Unsupported task: {args.task}. Supported tasks are: i2v, t2v")
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/prepare_s2v_dataset.py b/examples/pytorch/diffusion_model/diffusers/wan/prepare_s2v_dataset.py
new file mode 100644
index 00000000000..241b08bca25
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/wan/prepare_s2v_dataset.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+import argparse
+import json
+from pathlib import Path
+
+IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".bmp"}
+AUDIO_EXT_PRIORITY = [".wav", ".mp3", ".m4a", ".flac", ".ogg"]
+
+def load_prompts(prompt_dir: Path):
+    """Map each ``*.txt`` file stem in ``prompt_dir`` to its stripped text; empty files are skipped."""
+    if not prompt_dir.exists():
+        # A missing prompt directory simply yields no prompts.
+        return {}
+    stripped = (
+        (txt_file.stem, txt_file.read_text(encoding="utf-8", errors="ignore").strip())
+        for txt_file in sorted(prompt_dir.glob("*.txt"))
+    )
+    return {stem: text for stem, text in stripped if text}
+
+
+def build_audio_index(audio_dir: Path):
+    """Group the supported audio files directly inside ``audio_dir`` by file stem.
+
+    Returns ``{stem: [Path, ...]}``; the dict is empty when ``audio_dir`` does not exist.
+    """
+    by_stem = {}
+    if audio_dir.exists():
+        for entry in sorted(audio_dir.iterdir()):
+            # Only regular files whose lower-cased suffix is in AUDIO_EXT_PRIORITY count.
+            if entry.is_file() and entry.suffix.lower() in AUDIO_EXT_PRIORITY:
+                by_stem.setdefault(entry.stem, []).append(entry)
+    return by_stem
+
+
+def pick_audio(paths):
+    """Return the path whose suffix ranks earliest in AUDIO_EXT_PRIORITY, or None if ``paths`` is empty.
+
+    Ties keep the first such path; a suffix missing from AUDIO_EXT_PRIORITY raises ValueError.
+    """
+    def _rank(candidate):
+        return AUDIO_EXT_PRIORITY.index(candidate.suffix.lower())
+
+    return min(paths, key=_rank, default=None)
+
+
+def build_manifest(dataset_dir: Path):
+    """Pair every image under ``<dataset_dir>/imgs`` with its same-stem prompt and audio.
+
+    Prompts are read from ``<dataset_dir>/prompts`` and audio from ``<dataset_dir>/audios``.
+    Images lacking either one are left out of the manifest and reported instead.
+
+    Returns:
+        ``(manifest, skipped)``: ``manifest`` maps sample id (image stem) to a dict with
+        ``prompt``, ``image`` and ``audio`` (absolute path strings); ``skipped`` is a list
+        of ``{"id", "reasons"}`` dicts.
+
+    Raises:
+        FileNotFoundError: if the ``imgs`` directory does not exist.
+    """
+    root = dataset_dir.resolve()
+    img_dir = root / "imgs"
+    if not img_dir.exists():
+        raise FileNotFoundError(f"Missing image directory: {img_dir}")
+    prompts = load_prompts(root / "prompts")
+    audio_index = build_audio_index(root / "audios")
+
+    manifest, skipped = {}, []
+    for image_path in sorted(img_dir.iterdir()):
+        if not (image_path.is_file() and image_path.suffix.lower() in IMAGE_EXTS):
+            continue
+        sample_id = image_path.stem
+        prompt = prompts.get(sample_id)
+        candidates = audio_index.get(sample_id)
+        audio_path = pick_audio(candidates) if candidates else None
+        missing = [
+            reason
+            for reason, present in (("missing_prompt", prompt), ("missing_audio", audio_path))
+            if not present
+        ]
+        if missing:
+            skipped.append({"id": sample_id, "reasons": missing})
+            continue
+        manifest[sample_id] = {
+            "prompt": prompt,
+            "image": str(image_path.resolve()),
+            "audio": str(audio_path.resolve()),
+        }
+    return manifest, skipped
+
+
+def parse_args():
+    """Parse command-line options for the manifest builder.
+
+    Both ``--repo-dir`` and ``--manifest-out`` are required; they are exposed on the
+    returned namespace as ``repo_dir`` and ``manifest_out``.
+    """
+    cli = argparse.ArgumentParser(description="Build s2v manifest from a local EchoMimicV3 repo")
+    option_help = {
+        "--repo-dir": "Local path for EchoMimicV3 repo (must already exist).",
+        "--manifest-out": "Output manifest path.",
+    }
+    # Every option is a required string, so register them uniformly.
+    for flag, text in option_help.items():
+        cli.add_argument(flag, required=True, help=text)
+    return cli.parse_args()
+
+
+def main():
+    """Build the manifest for ``<repo-dir>/datasets/echomimicv3_demos`` and print a JSON summary."""
+    args = parse_args()
+    repo_dir = Path(args.repo_dir).resolve()
+    if not repo_dir.exists():
+        raise FileNotFoundError(f"Repo directory not found: {repo_dir}")
+    dataset_dir = (repo_dir / "datasets" / "echomimicv3_demos").resolve()
+    if not dataset_dir.exists():
+        raise FileNotFoundError(f"Dataset directory not found: {dataset_dir}")
+
+    manifest, skipped = build_manifest(dataset_dir=dataset_dir)
+
+    out_path = Path(args.manifest_out).resolve()
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    manifest_json = json.dumps(manifest, ensure_ascii=False, indent=4)
+    out_path.write_text(manifest_json, encoding="utf-8")
+
+    report = {
+        "repo_dir": str(repo_dir),
+        "dataset_dir": str(dataset_dir),
+        "manifest_out": str(out_path),
+        "total_samples": len(manifest),
+        "skipped_samples": len(skipped),
+        "first_skipped": skipped[:10],
+    }
+    print(json.dumps(report, ensure_ascii=False, indent=2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt b/examples/pytorch/diffusion_model/diffusers/wan/requirements_i2v_t2v.txt
similarity index 99%
rename from examples/pytorch/diffusion_model/diffusers/wan/requirements.txt
rename to examples/pytorch/diffusion_model/diffusers/wan/requirements_i2v_t2v.txt
index dd0a3842c04..8fbd0a14215 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/requirements.txt
+++ b/examples/pytorch/diffusion_model/diffusers/wan/requirements_i2v_t2v.txt
@@ -41,3 +41,4 @@ omegaconf
 pycocoevalcap
 imageio-ffmpeg
 gdown==4.7.3
+
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/requirements_s2v.txt b/examples/pytorch/diffusion_model/diffusers/wan/requirements_s2v.txt
new file mode 100644
index 00000000000..1f4ee5df9e7
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/wan/requirements_s2v.txt
@@ -0,0 +1,41 @@
+# Core runtime for wan_s2v.py
+numpy
+torch
+torchvision
+transformers
+accelerate
+huggingface_hub
+safetensors
+
+# Quantization stack used by s2v activation QDQ
+neural-compressor-pt
+auto-round
+
+# Utilities
+pillow
+einops
+requests
+
+# S2V evaluation metrics (evaluate_manifest_no_gt.py)
+opencv-python-headless
+scipy
+
+# Wan2.2 s2v runtime dependencies
+openai-whisper
+HyperPyYAML
+onnxruntime
+inflect
+wetext
+omegaconf
+conformer
+hydra-core
+lightning
+rich
+gdown
+matplotlib
+wget
+pyarrow
+pyworld
+librosa
+decord
+modelscope
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
index 1ea381f547a..df53e856138 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
+++ b/examples/pytorch/diffusion_model/diffusers/wan/run_benchmark.sh
@@ -1,61 +1,53 @@
 #!/bin/bash
 set -x
 
-SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
-
 function main {
   init_params "$@"
   run_benchmark
 }
 
 function ensure_vbench_repo {
-  if [ ! -d "${vbench_dir}" ]; then
-    echo "VBench directory not found. Start cloning https://github.com/Vchitect/VBench.git ..."
-    git clone https://github.com/Vchitect/VBench.git "${vbench_dir}"
-    if [ $? -ne 0 ]; then
-      echo "Error: failed to clone VBench."
-      exit 1
-    fi
+  if [ -d "${vbench_dir}" ]; then
+    return
   fi
+
+  echo "Error: VBench directory not found: ${vbench_dir}"
+  echo "Please prepare VBench manually and pass --vbench_dir=/path/to/VBench if needed."
+  exit 1
 }
 
-function prepare_vbench_inputs {
-  if [ "${task}" = "t2v" ]; then
-    if [ -z "${prompt_folder}" ]; then
-      echo "Error: --prompt_folder is required for task=t2v"
-      exit 1
-    fi
-    if [ -z "${dimension}" ]; then
-      echo "Error: --dimension is required for task=t2v"
-      exit 1
-    fi
-  fi
+function ensure_vbench_data {
+  local prompt_root="${vbench_dir}/prompts/prompts_per_dimension"
+  local i2v_image_root="${vbench_dir}/vbench2_beta_i2v/data/crop/16-9"
+  local i2v_info_file="${vbench_dir}/vbench2_beta_i2v/vbench2_i2v_full_info.json"
 
-  if [ "${task}" = "i2v" ]; then
-    if [ -z "${image_folder}" ]; then
-      echo "Error: --image_folder is required for task=i2v"
-      exit 1
-    fi
-    if [ -z "${info_json}" ]; then
-      echo "Error: --info_json is required for task=i2v"
-      exit 1
-    fi
-    if [ -z "${dimension}" ]; then
-      echo "Error: --dimension is required for task=i2v"
-      exit 1
-    fi
+  if [ -d "${prompt_root}" ] && [ -d "${i2v_image_root}" ] && [ -f "${i2v_info_file}" ]; then
+    return
   fi
 
-  if [ -n "${prompt_folder}" ] && [ ! -d "${prompt_folder}" ]; then
-    echo "Error: prompt_folder not found: ${prompt_folder}"
-    exit 1
+  echo "Error: VBench data is incomplete under ${vbench_dir}."
+  echo "Please prepare VBench data manually (for example run vbench2_beta_i2v/download_data.sh in your VBench repo)."
+  exit 1
+}
+
+function ensure_wan_repo {
+  if [ -d "${wan_dir}" ]; then
+    return
   fi
-  if [ -n "${image_folder}" ] && [ ! -d "${image_folder}" ]; then
-    echo "Error: image_folder not found: ${image_folder}"
+
+  echo "Error: Wan2.2 directory not found: ${wan_dir}"
+  echo "Please prepare Wan2.2 manually and pass --wan_dir=/path/to/Wan2.2 if needed."
+  exit 1
+}
+
+function ensure_s2v_manifest {
+  if [ -z "${manifest_path}" ]; then
+    echo "Error: --manifest_path is required for task=s2v"
     exit 1
   fi
-  if [ -n "${info_json}" ] && [ ! -f "${info_json}" ]; then
-    echo "Error: info_json not found: ${info_json}"
+
+  if [ ! -f "${manifest_path}" ]; then
+    echo "Error: manifest_path not found: ${manifest_path}"
     exit 1
   fi
 }
@@ -87,6 +79,14 @@ function init_params {
         task="$2"
         shift 2
       ;;
+      --quantized_model=*)
+        tuned_checkpoint="${1#*=}"
+        shift
+      ;;
+      --quantized_model)
+        tuned_checkpoint="$2"
+        shift 2
+      ;;
       --output_video_path=*)
         output_video_path="${1#*=}"
         shift
@@ -127,6 +127,30 @@ function init_params {
         dimension="$2"
         shift 2
       ;;
+      --manifest_path=*)
+        manifest_path="${1#*=}"
+        shift
+      ;;
+      --manifest_path)
+        manifest_path="$2"
+        shift 2
+      ;;
+      --wan_dir=*)
+        wan_dir="${1#*=}"
+        shift
+      ;;
+      --wan_dir)
+        wan_dir="$2"
+        shift 2
+      ;;
+      --vbench_dir=*)
+        vbench_dir="${1#*=}"
+        shift
+      ;;
+      --vbench_dir)
+        vbench_dir="$2"
+        shift 2
+      ;;
       --gpu_ids=*)
         gpu_ids="${1#*=}"
         shift
@@ -159,12 +183,12 @@ function init_params {
         accuracy=true
         shift
       ;;
-      --vbench_dir=*)
-        vbench_dir="${1#*=}"
+      --s2v_eval_output=*)
+        s2v_eval_output="${1#*=}"
         shift
       ;;
-      --vbench_dir)
-        vbench_dir="$2"
+      --s2v_eval_output)
+        s2v_eval_output="$2"
         shift 2
       ;;
       *)
@@ -178,10 +202,14 @@ function init_params {
 function run_benchmark {
   task=${task:="t2v"}
   limit=${limit:=-1}
+  tuned_checkpoint=${tuned_checkpoint:="./tmp_autoround"}
   output_video_path=${output_video_path:="./tmp_video"}
   accuracy=${accuracy:=false}
   disable_mxfp8_inplace_qdq=${disable_mxfp8_inplace_qdq:=false}
-  vbench_dir=${vbench_dir:="${SCRIPT_DIR}/VBench"}
+  s2v_eval_output=${s2v_eval_output:=""}
+  script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+  wan_dir=${wan_dir:="${script_dir}/Wan2.2"}
+  vbench_dir=${vbench_dir:="${script_dir}/VBench"}
 
   if [[ ! "${output_video_path}" = /* ]]; then
     output_video_path=$(realpath -s "$(pwd)/${output_video_path}")
@@ -198,12 +226,27 @@ function run_benchmark {
     exit 1
   fi
 
-  ensure_vbench_repo
+  if [ "${task}" != "s2v" ]; then
+    ensure_vbench_repo
+    ensure_vbench_data
+    if [ "${task}" = "t2v" ]; then
+      prompt_folder=${prompt_folder:="${vbench_dir}/prompts/prompts_per_dimension"}
+    fi
+    if [ "${task}" = "i2v" ]; then
+      image_folder=${image_folder:="${vbench_dir}/vbench2_beta_i2v/data/crop/16-9"}
+      info_json=${info_json:="${vbench_dir}/vbench2_beta_i2v/vbench2_i2v_full_info.json"}
+    fi
+  else
+    ensure_wan_repo
+    ensure_s2v_manifest
+  fi
 
-  prepare_vbench_inputs
+  if [ "${task}" != "s2v" ] && [ -z "${dimension}" ]; then
+    echo "Error: --dimension is required for task=${task}"
+    exit 1
+  fi
 
-  normalized_dimensions="${dimension//,/ }"
-  read -r -a dimension_list <<< "${normalized_dimensions}"
+  mkdir -p "${output_video_path}"
 
   if [ -n "${gpu_ids}" ]; then
     gpu_list="${gpu_ids}"
@@ -220,107 +263,175 @@ function run_benchmark {
     gpu_array=()
   fi
 
-  mkdir -p "${output_video_path}"
-  shard_tmp_root="${output_video_path}/.prompt_shards"
-
-  function build_benchmark_cmd {
-    local cur_prompt_folder="$2"
-    local cur_info_json="$3"
-    local cmd=(
-      python3 main.py
-      --model "${input_model}"
-      --task "${task}"
-      --scheme "${scheme}"
-      --output_video_path "${output_video_path}"
-      --limit "${limit}"
-      --inference
-    )
-
-    if [ -n "${cur_prompt_folder}" ]; then
-      cmd+=(--prompt_folder "${cur_prompt_folder}")
-    elif [ -n "${prompt_folder}" ]; then
-      cmd+=(--prompt_folder "${prompt_folder}")
-    fi
-    if [ -n "${image_folder}" ]; then
-      cmd+=(--image_folder "${image_folder}")
-    fi
-    if [ -n "${cur_info_json}" ]; then
-      cmd+=(--info_json "${cur_info_json}")
-    elif [ -n "${info_json}" ]; then
-      cmd+=(--info_json "${info_json}")
-    fi
-    if [ -n "$1" ]; then
-      cmd+=(--dimension "$1")
-    fi
-    if [ -n "${mxfp8_chunk_rows}" ]; then
-      cmd+=(--mxfp8_chunk_rows "${mxfp8_chunk_rows}")
-    fi
-    if [ "${disable_mxfp8_inplace_qdq}" = "true" ]; then
-      cmd+=(--disable_mxfp8_inplace_qdq)
-    fi
+  if [ "${task}" = "s2v" ]; then
+    function build_s2v_cmd {
+      local cur_manifest_path="$1"
+      local cmd=(
+        env "PYTHONPATH=${wan_dir}:${PYTHONPATH}" python3 wan_s2v.py
+        --model "${input_model}"
+        --task "s2v-14B"
+        --scheme "${scheme}"
+        --output_video_path "${output_video_path}"
+        --manifest_path "${cur_manifest_path}"
+        --inference
+      )
 
-    printf '%q ' "${cmd[@]}"
-  }
+      if [ "${scheme}" != "BF16" ]; then
+        cmd+=(--quantized_model "${tuned_checkpoint}")
+      fi
+
+      if [ -n "${mxfp8_chunk_rows}" ]; then
+        cmd+=(--mxfp8_chunk_rows "${mxfp8_chunk_rows}")
+      fi
+      if [ "${disable_mxfp8_inplace_qdq}" = "true" ]; then
+        cmd+=(--disable_mxfp8_inplace_qdq)
+      fi
 
-  if [ ${#gpu_array[@]} -eq 0 ]; then
-    if [ ${#dimension_list[@]} -eq 0 ]; then
-      eval "$(build_benchmark_cmd "" "" "")"
+      printf "%q " "${cmd[@]}"
+    }
+
+    if [ ${#gpu_array[@]} -eq 0 ]; then
+      run_cmd="$(build_s2v_cmd "${manifest_path}")"
+      eval "${run_cmd}"
     else
-      for cur_dimension in "${dimension_list[@]}"; do
-        eval "$(build_benchmark_cmd "${cur_dimension}" "" "")"
-      done
-    fi
-  else
-    if [ ${#dimension_list[@]} -eq 0 ]; then
-      echo "Error: multi-GPU sharding requires --dimension"
-      exit 1
-    fi
+      num_shards=${#gpu_array[@]}
+      s2v_shard_root="${output_video_path}/.manifest_shards"
+      rm -rf "${s2v_shard_root}"
 
-    num_shards=${#gpu_array[@]}
-    for cur_dimension in "${dimension_list[@]}"; do
-      dim_shard_root="${shard_tmp_root}/${cur_dimension}"
-      rm -rf "${dim_shard_root}"
-      if [ "${task}" = "t2v" ]; then
-        prompt_file="${prompt_folder}/${cur_dimension}.txt"
-        python3 split_t2v_prompts.py \
-          --prompt_file "${prompt_file}" \
-          --num_shards "${num_shards}" \
-          --output_root "${dim_shard_root}"
-      else
-        python3 split_i2v_info.py \
-          --info_json "${info_json}" \
-          --dimension "${cur_dimension}" \
-          --num_shards "${num_shards}" \
-          --output_root "${dim_shard_root}"
-      fi
+      python3 split_s2v_manifest.py \
+        --manifest_path "${manifest_path}" \
+        --num_shards "${num_shards}" \
+        --output_root "${s2v_shard_root}"
 
       program_pid=()
       for shard_id in "${!gpu_array[@]}"; do
         gpu_id="${gpu_array[$shard_id]}"
-        log_suffix="${cur_dimension}"
-        if [ -z "${log_suffix}" ]; then
-          log_suffix="all"
+        shard_manifest_path="${s2v_shard_root}/shard_${shard_id}/manifest.json"
+        if [ ! -f "${shard_manifest_path}" ]; then
+          echo "Skip empty shard_${shard_id} on GPU ${gpu_id}"
+          continue
         fi
-        log_file="${output_video_path}/${log_suffix}.gpu${gpu_id}.log"
-        shard_prompt_folder=""
-        shard_info_json=""
+        log_file="${output_video_path}/s2v.gpu${gpu_id}.log"
+        run_cmd="$(build_s2v_cmd "${shard_manifest_path}")"
+        CUDA_VISIBLE_DEVICES="${gpu_id}" bash -lc "${run_cmd}" > "${log_file}" 2>&1 &
+        program_pid+=("$!")
+        echo "Start (PID: ${program_pid[-1]}, GPU: ${gpu_id}, shard: ${shard_id})"
+      done
 
+      if [ ${#program_pid[@]} -eq 0 ]; then
+        echo "Error: no non-empty s2v shards to run. Check --manifest_path content."
+        exit 1
+      fi
+
+      for pid in "${program_pid[@]}"; do
+        wait "${pid}" || exit 1
+      done
+    fi
+  else
+    normalized_dimensions="${dimension//,/ }"
+    read -r -a dimension_list <<< "${normalized_dimensions}"
+
+    shard_tmp_root="${output_video_path}/.prompt_shards"
+
+    function build_benchmark_cmd {
+      local cur_prompt_folder="$2"
+      local cur_info_json="$3"
+      local cmd=(
+        python3 main.py
+        --model "${input_model}"
+        --task "${task}"
+        --scheme "${scheme}"
+        --output_video_path "${output_video_path}"
+        --limit "${limit}"
+        --inference
+      )
+
+      if [ -n "${cur_prompt_folder}" ]; then
+        cmd+=(--prompt_folder "${cur_prompt_folder}")
+      elif [ -n "${prompt_folder}" ]; then
+        cmd+=(--prompt_folder "${prompt_folder}")
+      fi
+      if [ -n "${image_folder}" ]; then
+        cmd+=(--image_folder "${image_folder}")
+      fi
+      if [ -n "${cur_info_json}" ]; then
+        cmd+=(--info_json "${cur_info_json}")
+      elif [ -n "${info_json}" ]; then
+        cmd+=(--info_json "${info_json}")
+      fi
+      if [ -n "$1" ]; then
+        cmd+=(--dimension "$1")
+      fi
+      if [ -n "${mxfp8_chunk_rows}" ]; then
+        cmd+=(--mxfp8_chunk_rows "${mxfp8_chunk_rows}")
+      fi
+      if [ "${disable_mxfp8_inplace_qdq}" = "true" ]; then
+        cmd+=(--disable_mxfp8_inplace_qdq)
+      fi
+
+      printf "%q " "${cmd[@]}"
+    }
+
+    if [ ${#gpu_array[@]} -eq 0 ]; then
+      if [ ${#dimension_list[@]} -eq 0 ]; then
+        eval "$(build_benchmark_cmd "" "" "")"
+      else
+        for cur_dimension in "${dimension_list[@]}"; do
+          eval "$(build_benchmark_cmd "${cur_dimension}" "" "")"
+        done
+      fi
+    else
+      if [ ${#dimension_list[@]} -eq 0 ]; then
+        echo "Error: multi-GPU sharding requires --dimension"
+        exit 1
+      fi
+
+      num_shards=${#gpu_array[@]}
+      for cur_dimension in "${dimension_list[@]}"; do
+        dim_shard_root="${shard_tmp_root}/${cur_dimension}"
+        rm -rf "${dim_shard_root}"
         if [ "${task}" = "t2v" ]; then
-          shard_prompt_folder="${dim_shard_root}/shard_${shard_id}"
+          prompt_file="${prompt_folder}/${cur_dimension}.txt"
+          python3 split_t2v_prompts.py \
+            --prompt_file "${prompt_file}" \
+            --num_shards "${num_shards}" \
+            --output_root "${dim_shard_root}"
         else
-          shard_info_json="${dim_shard_root}/shard_${shard_id}/info.json"
+          python3 split_i2v_info.py \
+            --info_json "${info_json}" \
+            --dimension "${cur_dimension}" \
+            --num_shards "${num_shards}" \
+            --output_root "${dim_shard_root}"
         fi
 
-        cmd="$(build_benchmark_cmd "${cur_dimension}" "${shard_prompt_folder}" "${shard_info_json}")"
-        CUDA_VISIBLE_DEVICES="${gpu_id}" bash -lc "${cmd}" > "${log_file}" 2>&1 &
-        program_pid+=("$!")
-        echo "Start (PID: ${program_pid[-1]}, GPU: ${gpu_id}, dimension: ${cur_dimension})"
-      done
+        program_pid=()
+        for shard_id in "${!gpu_array[@]}"; do
+          gpu_id="${gpu_array[$shard_id]}"
+          log_suffix="${cur_dimension}"
+          if [ -z "${log_suffix}" ]; then
+            log_suffix="all"
+          fi
+          log_file="${output_video_path}/${log_suffix}.gpu${gpu_id}.log"
+          shard_prompt_folder=""
+          shard_info_json=""
 
-      for pid in "${program_pid[@]}"; do
-        wait "${pid}" || exit 1
+          if [ "${task}" = "t2v" ]; then
+            shard_prompt_folder="${dim_shard_root}/shard_${shard_id}"
+          else
+            shard_info_json="${dim_shard_root}/shard_${shard_id}/info.json"
+          fi
+
+          run_cmd="$(build_benchmark_cmd "${cur_dimension}" "${shard_prompt_folder}" "${shard_info_json}")"
+          CUDA_VISIBLE_DEVICES="${gpu_id}" bash -lc "${run_cmd}" > "${log_file}" 2>&1 &
+          program_pid+=("$!")
+          echo "Start (PID: ${program_pid[-1]}, GPU: ${gpu_id}, dimension: ${cur_dimension})"
+        done
+
+        for pid in "${program_pid[@]}"; do
+          wait "${pid}" || exit 1
+        done
       done
-    done
+    fi
   fi
 
   if [ "${accuracy}" = "true" ]; then
@@ -330,7 +441,7 @@ function run_benchmark {
       python evaluate.py \
         --dimension "subject_consistency motion_smoothness aesthetic_quality imaging_quality overall_consistency" \
         --videos_path "${output_video_path}" \
-        --mode=vbench_standard 
+        --mode=vbench_standard
       popd
     elif [ "${task}" = "i2v" ]; then
       echo "Start VBench evaluation for i2v..."
@@ -341,12 +452,42 @@ function run_benchmark {
         --ratio "16-9" \
         --mode=vbench_standard
       popd
+    elif [ "${task}" = "s2v" ]; then
+      echo "Start s2v evaluation..."
+      s2v_eval_script="${script_dir}/evaluate_manifest_no_gt.py"
+      s2v_eval_manifest="${output_video_path}/s2v_manifest_with_generate_video.json"
+      if [ ! -f "${s2v_eval_script}" ]; then
+        echo "Error: s2v evaluation script not found: ${s2v_eval_script}"
+        exit 1
+      fi
+      if [ -z "${s2v_eval_output}" ]; then
+        s2v_eval_output="${output_video_path}/evaluation_no_gt_metrics_s2v.json"
+      fi
+
+      eval_cmd=(
+        python3 "${s2v_eval_script}"
+        --manifest "${manifest_path}"
+        --generated_video_dir "${output_video_path}"
+        --matched_manifest_output "${s2v_eval_manifest}"
+        --output "${s2v_eval_output}"
+        --max_frames "32"
+        --metric_size "192"
+      )
+      printf "%q " "${eval_cmd[@]}" && echo
+      "${eval_cmd[@]}"
+
+      echo "S2V evaluation finished."
+      echo "- matched manifest: ${s2v_eval_manifest}"
+      echo "- metrics output: ${s2v_eval_output}"
     else
-      echo "--accuracy does not support task=${task}. Supported tasks: t2v, i2v."
-      exit 1
+      echo "--accuracy currently does not support task=${task}. Generated videos are saved at ${output_video_path}."
     fi
   else
-    echo "Video generation finished. Use --accuracy to run VBench evaluation for t2v/i2v."
+    if [ "${task}" = "s2v" ]; then
+      echo "S2V generation finished. Videos are in ${output_video_path}."
+    else
+      echo "Video generation finished. Use --accuracy to run VBench evaluation for t2v/i2v."
+    fi
   fi
 }
 
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh b/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh
index ae1ff41e1bb..01420acebf9 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh
+++ b/examples/pytorch/diffusion_model/diffusers/wan/run_quant.sh
@@ -6,6 +6,16 @@ function main {
   run_tuning
 }
 
+function ensure_wan_repo {
+  # Guard for the s2v path: the Wan2.2 checkout at ${wan_dir} must already exist;
+  # this script only reports the problem, it does not fetch the repo.
+  if [ ! -d "${wan_dir}" ]; then
+    echo "Error: Wan2.2 directory not found: ${wan_dir}"
+    echo "Please prepare Wan2.2 manually and pass --wan_dir=/path/to/Wan2.2 if needed."
+    exit 1
+  fi
+}
+
 function init_params {
   for var in "$@"
   do
@@ -22,6 +32,9 @@ function init_params {
       --output_model=*)
         tuned_checkpoint=$(echo $var | cut -f2 -d=)
       ;;
+      --wan_dir=*)
+        wan_dir=$(echo $var | cut -f2 -d=)
+      ;;
       *)
         echo "Error: No such parameter: ${var}"
         exit 1
@@ -33,22 +46,34 @@ function init_params {
 function run_tuning {
   tuned_checkpoint=${tuned_checkpoint:="./tmp_autoround"}
   task=${task:="t2v"}
+  script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+  wan_dir=${wan_dir:="${script_dir}/Wan2.2"}  # default: a Wan2.2 checkout next to this script
 
   if [ "${topology}" = "wan_fp8" ]; then
-    extra_cmd="--scheme FP8"
+    scheme="FP8"
   elif [ "${topology}" = "wan_mxfp8" ]; then
-    extra_cmd="--scheme MXFP8"
+    scheme="MXFP8"
   else
     echo "Error: unsupported topology ${topology}, use wan_fp8 or wan_mxfp8"
     exit 1
   fi
 
-  python3 main.py \
-    --model ${input_model} \
-    --task ${task} \
-    --output_dir ${tuned_checkpoint} \
-    --quantize \
-    ${extra_cmd}
+  if [ "${task}" = "s2v" ]; then  # s2v runs wan_s2v.py, which imports `wan` from ${wan_dir}
+    ensure_wan_repo
+    env "PYTHONPATH=${wan_dir}${PYTHONPATH:+:${PYTHONPATH}}" python3 wan_s2v.py \
+      --model "${input_model}" \
+      --task s2v-14B \
+      --scheme "${scheme}" \
+      --quantize \
+      --output_dir "${tuned_checkpoint}"
+  else  # every other task keeps using main.py; paths are quoted so spaces survive
+    python3 main.py \
+      --model "${input_model}" \
+      --task "${task}" \
+      --scheme "${scheme}" \
+      --quantize \
+      --output_dir "${tuned_checkpoint}"
+  fi
 }
 
 main "$@"
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/setup.sh b/examples/pytorch/diffusion_model/diffusers/wan/setup.sh
index c9f9700dbbc..da60a112471 100644
--- a/examples/pytorch/diffusion_model/diffusers/wan/setup.sh
+++ b/examples/pytorch/diffusion_model/diffusers/wan/setup.sh
@@ -1,2 +1,53 @@
-pip install --no-cache-dir -r requirements.txt
-pip install VBench --no-deps
+#!/usr/bin/env bash
+# Install the Python requirements for one Wan task (t2v, i2v or s2v).
+set -euo pipefail
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+task=""
+
+usage() {
+  cat <<'USAGE'
+Usage: bash setup.sh --task t2v|i2v|s2v
+USAGE
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --task)
+      task="${2:-}"
+      shift $(( $# > 1 ? 2 : 1 ))  # a bare --task must reach the "required" check, not die in `shift 2` under set -e
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Error: unknown argument: $1"
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+if [[ -z "$task" ]]; then
+  echo "Error: --task is required"
+  usage
+  exit 1
+fi
+
+if [[ "$task" == "s2v" ]]; then  # each task family has its own requirements file
+  req_file="${SCRIPT_DIR}/requirements_s2v.txt"
+elif [[ "$task" == "t2v" || "$task" == "i2v" ]]; then
+  req_file="${SCRIPT_DIR}/requirements_i2v_t2v.txt"
+else
+  echo "Error: unsupported task: $task"
+  usage
+  exit 1
+fi
+
+pip install --no-cache-dir -r "$req_file"
+
+if [[ "$task" == "t2v" || "$task" == "i2v" ]]; then  # VBench is installed for t2v/i2v only
+  pip install --no-cache-dir VBench --no-deps
+fi
+
+echo "Setup completed for task: $task"
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/split_s2v_manifest.py b/examples/pytorch/diffusion_model/diffusers/wan/split_s2v_manifest.py
new file mode 100644
index 00000000000..b0223f95dee
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/wan/split_s2v_manifest.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import os
+
+
+def parse_args():  # CLI for the splitter; all three options are required
+    parser = argparse.ArgumentParser(description="Split s2v manifest into per-shard JSON files.")
+    parser.add_argument("--manifest_path", required=True, type=str, help="Path to full s2v manifest JSON")
+    parser.add_argument("--num_shards", required=True, type=int, help="Total shard count")
+    parser.add_argument("--output_root", required=True, type=str, help="Root dir to write shard JSON files")
+    return parser.parse_args()  # argparse.Namespace with manifest_path, num_shards, output_root
+
+
+def split_list_items(items, num_shards):
+    """Deal the list ``items`` round-robin: item ``i`` goes to bucket ``i % num_shards``."""
+    # One strided slice per shard yields the same buckets as appending in an index loop;
+    # shards with no items come back as empty lists.
+    return [items[shard::num_shards] for shard in range(num_shards)]
+
+
+def split_dict_items(items, num_shards):
+    """Deal dict entries round-robin, in iteration order, into ``num_shards`` dicts."""
+    shards = [{} for _ in range(num_shards)]
+    for position, (key, value) in enumerate(items.items()):
+        shards[position % num_shards][key] = value
+    return shards
+
+
+def main():
+    """Split the manifest round-robin and write shard_<id>/manifest.json per non-empty shard."""
+    args = parse_args()
+
+    if args.num_shards < 1:
+        raise ValueError("--num_shards must be >= 1")
+    if not os.path.isfile(args.manifest_path):
+        raise FileNotFoundError(f"Manifest file not found: {args.manifest_path}")
+
+    with open(args.manifest_path, "r", encoding="utf-8") as src:
+        manifest = json.load(src)
+
+    # Lists and dicts are both dealt out by position; any other JSON root is rejected.
+    if isinstance(manifest, dict):
+        shard_buckets = split_dict_items(manifest, args.num_shards)
+    elif isinstance(manifest, list):
+        shard_buckets = split_list_items(manifest, args.num_shards)
+    else:
+        raise ValueError("Manifest must be a JSON object or list")
+
+    os.makedirs(args.output_root, exist_ok=True)
+    written_shards = 0
+    for shard_id, bucket in enumerate(shard_buckets):
+        if not bucket:
+            continue  # empty shards get no directory; later shards keep their index
+        target_dir = os.path.join(args.output_root, f"shard_{shard_id}")
+        os.makedirs(target_dir, exist_ok=True)
+        with open(os.path.join(target_dir, "manifest.json"), "w", encoding="utf-8") as dst:
+            json.dump(bucket, dst, ensure_ascii=False, indent=2)
+        written_shards += 1
+
+    print(
+        f"Split {len(manifest)} s2v samples into {written_shards} non-empty shards "
+        f"(requested {args.num_shards}) under {args.output_root}"
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/pytorch/diffusion_model/diffusers/wan/wan_s2v.py b/examples/pytorch/diffusion_model/diffusers/wan/wan_s2v.py
new file mode 100644
index 00000000000..75ee40fbf32
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/wan/wan_s2v.py
@@ -0,0 +1,310 @@
+import argparse
+import json
+import logging
+import os
+import re
+from functools import partial
+from pathlib import Path
+
+import torch
+from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare
+
+import wan
+from auto_round.data_type.fp8 import quant_fp8_sym
+from auto_round.data_type.mxfp import quant_mx_rceil
+from auto_round.utils import get_block_names, get_module
+from wan.configs import MAX_AREA_CONFIGS, WAN_CONFIGS
+from wan.utils.utils import merge_video_audio, save_video
+
+
+def parse_args():  # CLI for Wan S2V quantization (--quantize) and generation (--inference)
+    parser = argparse.ArgumentParser(description="Wan s2v quantization and inference example")
+    parser.add_argument("--model", required=True, type=str, help="Wan S2V checkpoint directory")
+    parser.add_argument("--task", default="s2v-14B", choices=["s2v-14B"], type=str)
+    parser.add_argument("--scheme", default="BF16", choices=["BF16", "FP8", "MXFP8"], type=str)
+    parser.add_argument("--quantize", action="store_true", help="Quantize Wan S2V noise model with AutoRound")
+    parser.add_argument("--inference", action="store_true", help="Run S2V inference")
+    parser.add_argument("--output_dir", "--quantized_model", default="./tmp_autoround_s2v", type=str, help="Output dir for quantized model")
+    # Inference inputs and outputs.
+    parser.add_argument("--output_video_path", default="./wan_s2v_video", type=str)
+    parser.add_argument("--manifest_path", default=None, type=str, help="Path to JSON with prompt/image/audio samples")
+    # Fallback values for manifest entries that omit the corresponding key.
+    parser.add_argument("--prompt", default=None, type=str)
+    parser.add_argument("--image", default=None, type=str)
+    parser.add_argument("--audio", default=None, type=str)
+    # Output geometry; --size must be a key of MAX_AREA_CONFIGS (checked in main).
+    parser.add_argument("--size", default="1280*720", type=str)
+    parser.add_argument("--infer_frames", default=80, type=int)
+    parser.add_argument("--num_clip", default=None, type=int)
+    # Sampler settings; steps/shift/guide scale left as None are filled from the task config in main.
+    parser.add_argument("--sample_solver", default="unipc", choices=["unipc", "dpm++"], type=str)
+    parser.add_argument("--sample_steps", default=None, type=int)
+    parser.add_argument("--sample_shift", default=None, type=float)
+    parser.add_argument("--sample_guide_scale", default=None, type=float)
+    parser.add_argument("--seed", default=42, type=int)
+    # Options forwarded unchanged to the pipeline's generate() call.
+    parser.add_argument("--enable_tts", action="store_true")
+    parser.add_argument("--tts_prompt_audio", default=None, type=str)
+    parser.add_argument("--tts_prompt_text", default=None, type=str)
+    parser.add_argument("--tts_text", default=None, type=str)
+    parser.add_argument("--pose_video", default=None, type=str)
+    parser.add_argument("--start_from_ref", action="store_true")
+    parser.add_argument("--offload_model", action="store_true")
+    # Knobs for the MXFP8 activation QDQ wrapper.
+    parser.add_argument("--mxfp8_chunk_rows", default=2048, type=int)
+    parser.add_argument("--disable_mxfp8_inplace_qdq", action="store_true")
+    return parser.parse_args()
+
+
+def setup_logging():  # configure root logging: INFO level, "[time] LEVEL: message" records
+    logging.basicConfig(level=logging.INFO, format="[%(asctime)s] %(levelname)s: %(message)s")
+
+
+def sanitize_filename(text):
+    """Map ``text`` to at most 80 chars drawn from ``[0-9a-zA-Z._-]``; fall back to "sample"."""
+    # Each run of disallowed characters becomes one "_"; leading/trailing "_" are trimmed.
+    slug = re.sub(r"[^0-9a-zA-Z._-]+", "_", text or "").strip("_")
+    return slug[:80] or "sample"
+
+
+def build_samples(args):
+    """Return usable manifest samples as dicts with ``id``, ``prompt``, ``image``, ``audio``.
+
+    Missing fields fall back to the matching ``args`` value; invalid entries are skipped.
+    """
+    if not args.manifest_path:
+        raise ValueError("S2V requires --manifest_path")
+
+    manifest_file = Path(args.manifest_path)
+    if not manifest_file.exists():
+        raise FileNotFoundError(f"Manifest file not found: {manifest_file}")
+    with manifest_file.open("r", encoding="utf-8") as handle:
+        manifest = json.load(handle)
+
+    if isinstance(manifest, list):
+        pairs = [(str(index), item) for index, item in enumerate(manifest)]
+    elif isinstance(manifest, dict):
+        pairs = manifest.items()
+    else:
+        raise ValueError("Manifest must be a JSON object or list")
+
+    samples = []
+    for sample_id, sample in pairs:
+        if not isinstance(sample, dict):
+            continue
+        fields = {key: sample.get(key, getattr(args, key)) for key in ("prompt", "image", "audio")}
+        if not all(fields.values()):
+            logging.warning("Skip sample %s: missing prompt/image/audio", sample_id)
+            continue
+        samples.append({"id": str(sample_id), **fields})
+
+    if not samples:
+        raise ValueError("No valid samples found in manifest")
+    return samples
+
+
+def apply_activation_qdq(model, args):
+    """Patch Linear ``forward`` so inputs are quantize-dequantized (QDQ) per ``args.scheme``.
+
+    BF16 is a no-op. FP8 patches every module whose class is named ``Linear``; MXFP8
+    patches only such modules inside the blocks returned by ``get_block_names``.
+    The original bound method is kept on each module as ``orig_forward``.
+    """
+    if args.scheme == "BF16":
+        return
+
+    if args.scheme == "FP8":
+        logging.info("Enable FP8 activation QDQ for S2V linear layers")
+
+        def act_qdq_forward(module, x, *fwd_args, **fwd_kwargs):
+            qdq_x, _, _ = quant_fp8_sym(x, group_size=0)
+            return module.orig_forward(qdq_x, *fwd_args, **fwd_kwargs)
+
+        for _, module in model.named_modules():
+            if module.__class__.__name__ == "Linear":
+                module.orig_forward = module.forward
+                module.forward = partial(act_qdq_forward, module)
+        return
+
+    use_inplace = not args.disable_mxfp8_inplace_qdq
+    chunk_rows = max(1, int(args.mxfp8_chunk_rows))
+    logging.info("Enable MXFP8 activation QDQ (inplace=%s, chunk_rows=%s)", use_inplace, args.mxfp8_chunk_rows)
+
+    def act_qdq_forward(module, x, *fwd_args, **fwd_kwargs):
+        if use_inplace and x.is_cuda:
+            # NOTE: when ``x`` is contiguous this overwrites the caller's tensor in place.
+            # Working chunk by chunk bounds the temporary QDQ result to ``chunk_rows`` rows.
+            x_2d = x.reshape(-1, x.shape[-1])
+            total_rows = x_2d.shape[0]
+            for start in range(0, total_rows, chunk_rows):
+                end = min(start + chunk_rows, total_rows)
+                qdq_chunk = quant_mx_rceil(x_2d[start:end], bits=8, group_size=32, data_type="mx_fp_rceil")[0]
+                x_2d[start:end].copy_(qdq_chunk)
+                del qdq_chunk
+            # reshape() returns a copy for a non-contiguous ``x``; forwarding ``x`` itself
+            # would then skip QDQ entirely, so always hand back the buffer that was written.
+            qdq_x = x_2d.reshape(x.shape)
+        else:
+            qdq_x = quant_mx_rceil(x, bits=8, group_size=32, data_type="mx_fp_rceil")[0]
+
+        return module.orig_forward(qdq_x, *fwd_args, **fwd_kwargs)
+
+    for block_names in get_block_names(model):
+        for block_name in block_names:
+            block = get_module(model, block_name)
+            for _, module in block.named_modules():
+                if module.__class__.__name__ == "Linear":
+                    module.orig_forward = module.forward
+                    module.forward = partial(act_qdq_forward, module)
+
+
+def quantize_noise_model(model, args):
+    """Quantize the S2V noise model via AutoRound with ``iters=0`` (no tuning steps).
+
+    FP8 builds a per-Linear ``layer_config``; MXFP8 passes a model-wide ``scheme``
+    dict. Raises ValueError for BF16, which has nothing to quantize.
+    """
+    if args.scheme == "BF16":
+        raise ValueError("BF16 does not need quantization. Use --scheme FP8 or --scheme MXFP8.")
+
+    layer_config, kwargs = {}, {}
+    if args.scheme == "FP8":
+        for name, module in model.named_modules():
+            if module.__class__.__name__ == "Linear":
+                layer_config[name] = {"bits": 8, "data_type": "fp", "group_size": 0, "sym": True}
+    else:
+        kwargs["scheme"] = {"bits": 8, "group_size": 32, "data_type": "mx_fp"}
+
+    os.makedirs(args.output_dir, exist_ok=True)
+    qconfig = AutoRoundConfig(
+        iters=0,
+        disable_opt_rtn=True,
+        layer_config=layer_config,
+        export_format="fake",
+        output_dir=args.output_dir,
+        **kwargs,  # previously omitted, so the MXFP8 scheme above never reached AutoRound
+    )
+
+    logging.info("Prepare + convert S2V noise model (%s)", args.scheme)
+    convert(prepare(model, qconfig))
+    logging.info("S2V quantization done. Output saved to %s", args.output_dir)
+
+def load_quantized_noise_model(wan_s2v, output_dir):
+    """Replace ``wan_s2v.noise_model`` with the model saved under ``output_dir`` (bf16, eval mode)."""
+    from wan.modules.s2v.model_s2v import WanModel_S2V
+
+    # Log before loading so a slow or failing load is attributable in the output.
+    logging.info("Loading quantized noise_model from %s", output_dir)
+    noise_model = WanModel_S2V.from_pretrained(output_dir, torch_dtype=torch.bfloat16)
+    noise_model.eval()
+    # NOTE(review): no explicit device move here - confirm the pipeline places the model before use.
+    wan_s2v.noise_model = noise_model
+
+
+def run_inference(wan_s2v, args, cfg):  # generate, save and audio-mux one video per manifest sample
+    os.makedirs(args.output_video_path, exist_ok=True)
+
+    samples = build_samples(args)
+    logging.info("Start S2V generation, total samples: %s", len(samples))
+
+    for sample in samples:
+        prompt = sample["prompt"]
+        image_path = sample["image"]
+        audio_path = sample["audio"]
+        base = f"{sample['id']}_{sanitize_filename(prompt)}.mp4"  # "<id>_<sanitized prompt>.mp4"
+        save_file = os.path.join(args.output_video_path, base)
+        save_file_abs = os.path.abspath(save_file)
+
+        if os.path.exists(save_file_abs):  # an existing output is never regenerated, so reruns resume
+            logging.info("Skip %s: video already exists: %s", sample["id"], save_file_abs)
+            continue
+
+        if not os.path.exists(image_path):
+            logging.warning("Skip %s: image not found: %s", sample["id"], image_path)
+            continue
+        if not os.path.exists(audio_path) and not args.enable_tts:  # a missing audio file is tolerated with TTS on
+            logging.warning("Skip %s: audio not found: %s", sample["id"], audio_path)
+            continue
+
+        video = wan_s2v.generate(
+            input_prompt=prompt,
+            ref_image_path=image_path,
+            audio_path=audio_path,
+            enable_tts=args.enable_tts,
+            tts_prompt_audio=args.tts_prompt_audio,
+            tts_prompt_text=args.tts_prompt_text,
+            tts_text=args.tts_text,
+            num_repeat=args.num_clip,
+            pose_video=args.pose_video,
+            max_area=MAX_AREA_CONFIGS[args.size],
+            infer_frames=args.infer_frames,
+            shift=args.sample_shift,
+            sample_solver=args.sample_solver,
+            sampling_steps=args.sample_steps,
+            guide_scale=args.sample_guide_scale,
+            seed=args.seed,  # the same seed is reused for every sample
+            offload_model=args.offload_model,
+            init_first_frame=args.start_from_ref,
+        )
+
+        save_video(
+            tensor=video[None],  # adds a leading axis; presumably the batch dim save_video expects - TODO confirm
+            save_file=save_file_abs,
+            fps=cfg.sample_fps,
+            nrow=1,
+            normalize=True,
+            value_range=(-1, 1),
+        )
+
+        if args.enable_tts:
+            # NOTE(review): hard-coded relative path; presumably written by generate() when TTS is on - confirm.
+            merge_video_audio(video_path=save_file_abs, audio_path="tts.wav")
+        else:
+            merge_video_audio(video_path=save_file_abs, audio_path=audio_path)
+
+        logging.info("Saved: %s", save_file_abs)
+
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+
+def main():
+    """Parse CLI args, build the WanS2V pipeline, then quantize and/or run inference."""
+    args = parse_args()
+    setup_logging()
+    if args.task not in WAN_CONFIGS:
+        raise ValueError(f"Unsupported task: {args.task}")
+    if args.size not in MAX_AREA_CONFIGS:
+        raise ValueError(f"Unsupported --size {args.size}; valid keys: {list(MAX_AREA_CONFIGS.keys())}")
+
+    # Fail fast: settle requests that cannot do useful work before loading the pipeline.
+    if args.quantize and args.scheme == "BF16":
+        raise ValueError("BF16 does not need quantization. Use --scheme FP8 or --scheme MXFP8.")
+    if not (args.quantize or args.inference):
+        logging.warning("Nothing to do: pass --quantize and/or --inference")
+        return
+
+    cfg = WAN_CONFIGS[args.task]
+    # Sampling options left unset on the CLI inherit the task config values.
+    for name in ("sample_steps", "sample_shift", "sample_guide_scale"):
+        if getattr(args, name) is None:
+            setattr(args, name, getattr(cfg, name))
+
+    logging.info("Create WanS2V pipeline from %s", args.model)
+    wan_s2v = wan.WanS2V(
+        config=cfg, checkpoint_dir=args.model, device_id=0, rank=0,
+        t5_fsdp=False, dit_fsdp=False, use_sp=False, t5_cpu=False, convert_model_dtype=True,
+    )
+
+    if args.quantize:
+        quantize_noise_model(wan_s2v.noise_model, args)
+
+    if args.inference:
+        if args.scheme in ("FP8", "MXFP8"):
+            # Quantized runs load the noise model saved in output_dir, then patch activation QDQ.
+            load_quantized_noise_model(wan_s2v, args.output_dir)
+            apply_activation_qdq(wan_s2v.noise_model, args)
+
+        run_inference(wan_s2v, args, cfg)
+
+
+if __name__ == "__main__":
+    main()