feat(wan): Add chunked VAE encoding and TI2V-5B support

2026-03-09 20:47:37 +01:00
parent 967218b7c1
commit 061ae4407c
3 changed files with 223 additions and 64 deletions
--- a/mlx_video/generate_wan.py
+++ b/mlx_video/generate_wan.py
@@ -29,13 +29,37 @@ from mlx_video.utils import Colors
 _build_i2v_mask = build_i2v_mask


+def _best_output_size(w, h, dw, dh, max_area):
+    """Return the aligned (width, height) closest to the w/h aspect ratio.
+
+    The result fits in max_area with width % dw == 0 and height % dh == 0.
+    Follows the reference best_output_size(), except that a candidate whose
+    dimension would round down to zero is clamped or skipped instead of
+    dividing by zero. Raises ValueError when no aligned size fits.
+    """
+    ratio = w / h
+    ow = (max_area * ratio) ** 0.5
+    oh = max_area / ow
+    # Width-first candidate: snap the width (at least dw), then fit the height.
+    ow1 = max(dw, int(ow // dw * dw))
+    oh1 = int(max_area / ow1 // dh * dh)
+    # Height-first candidate: snap the height (at least dh), then fit the width.
+    oh2 = max(dh, int(oh // dh * dh))
+    ow2 = int(max_area / oh2 // dw * dw)
+    # Height-first is listed first so min() still returns it on ties.
+    sizes = [(cw, ch) for cw, ch in ((ow2, oh2), (ow1, oh1)) if cw and ch]
+    if not sizes:
+        raise ValueError(f"no {dw}x{dh}-aligned size fits in max_area={max_area}")
+    return min(sizes, key=lambda s: max(ratio / (s[0] / s[1]), s[0] / s[1] / ratio))
+
+
 def generate_video(
    model_dir: str,
    prompt: str,
    negative_prompt: str | None = None,
    image: str | None = None,
    width: int = 1280,
-    height: int = 720,
+    height: int = 704,
    num_frames: int = 81,
    steps: int = None,
    guide_scale: str | float | tuple = None,
@@ -232,6 +256,15 @@ def generate_video(
            width = align_w
        print(f"{Colors.DIM}  Aligned {old_w}x{old_h} → {width}x{height} (must be divisible by {align_w}x{align_h}){Colors.RESET}")

+    # Enforce max_area constraint (model-specific resolution limit)
+    if config.max_area > 0 and height * width > config.max_area:
+        old_h, old_w = height, width
+        width, height = _best_output_size(width, height, align_w, align_h, config.max_area)
+        print(
+            f"{Colors.YELLOW}  ⚠ Resolution {old_w}x{old_h} exceeds model's max area "
+            f"({config.max_area:,}px). Adjusted → {width}x{height}{Colors.RESET}"
+        )
+
    # Compute target latent shape
    z_dim = config.vae_z_dim
    t_latent = (gen_frames - 1) // vae_stride[0] + 1
@@ -334,7 +367,7 @@ def generate_video(
            mx.eval(img_tensor)

            vae_enc = load_vae_encoder(vae_path, config)
-            z_img = vae_enc(img_tensor)  # [1, 1, H_lat, W_lat, z_dim]
+            z_img = vae_enc.encode(img_tensor)  # [1, 1, H_lat, W_lat, z_dim]
            mx.eval(z_img)
            z_img = z_img[0].transpose(3, 0, 1, 2)  # [z_dim, 1, H_lat, W_lat]
            i2v_mask, i2v_mask_tokens = build_i2v_mask(target_shape, config.patch_size)
@@ -658,8 +691,8 @@ def main():
                        help="Negative prompt for CFG (default: official Chinese prompt from config)")
    parser.add_argument("--no-negative-prompt", action="store_true",
                        help="Disable negative prompt (use empty string instead of config default)")
-    parser.add_argument("--width", type=int, default=1280, help="Video width")
-    parser.add_argument("--height", type=int, default=720, help="Video height")
+    parser.add_argument("--width", type=int, default=1280, help="Video width (default: 1280)")
+    parser.add_argument("--height", type=int, default=704, help="Video height (default: 704; 720p models use 704)")
    parser.add_argument("--num-frames", type=int, default=81, help="Number of frames (must be 4n+1)")
    parser.add_argument("--steps", type=int, default=None, help="Number of diffusion steps (default: from config)")
    parser.add_argument("--guide-scale", type=str, default=None, help="Guidance scale: single float or low,high pair")