feat(wan): Add Wan2.2 I2V support

2026-02-27 13:46:23 +01:00
parent 93da550f65
commit 2bb95c61ed
26 changed files with 4401 additions and 2968 deletions
--- a/mlx_video/models/wan/i2v_utils.py
+++ b/mlx_video/models/wan/i2v_utils.py
@@ -0,0 +1,58 @@
+"""Image-to-Video utility functions for Wan2.2."""
+
+import mlx.core as mx
+import numpy as np
+
+
+def preprocess_image(image_path: str, width: int, height: int) -> mx.array:
+    """Load, resize, center-crop, and normalize an image for I2V.
+
+    Args:
+        image_path: Path to input image
+        width: Target width
+        height: Target height
+
+    Returns:
+        Image tensor [1, 1, H, W, 3] in [-1, 1] (channels-last, batch + temporal dims)
+    """
+    from PIL import Image
+
+    img = Image.open(image_path).convert("RGB")
+
+    # Resize so that the image covers the target size (LANCZOS)
+    scale = max(width / img.width, height / img.height)
+    img = img.resize((round(img.width * scale), round(img.height * scale)), Image.LANCZOS)
+
+    # Center crop
+    x1 = (img.width - width) // 2
+    y1 = (img.height - height) // 2
+    img = img.crop((x1, y1, x1 + width, y1 + height))
+
+    # To tensor: [H, W, 3] float32 in [-1, 1]
+    arr = np.array(img, dtype=np.float32) / 255.0
+    arr = arr * 2.0 - 1.0  # [0,1] → [-1,1]
+    return mx.array(arr[None, None])  # [1, 1, H, W, 3]
+
+
+def build_i2v_mask(z_shape, patch_size):
+    """Build temporal mask for I2V: first frame = 0, rest = 1.
+
+    Args:
+        z_shape: Latent shape (C, T, H, W) in channels-first
+        patch_size: (pt, ph, pw) patch size
+
+    Returns:
+        mask: (C, T, H, W) float32 — 0 for first frame, 1 for rest
+        mask_tokens: (1, L) float32 — 0 for first-frame tokens, 1 for rest
+    """
+    C, T, H, W = z_shape
+    mask = mx.ones(z_shape)
+    # Zero out the first temporal position
+    mask = mx.concatenate([mx.zeros((C, 1, H, W)), mask[:, 1:]], axis=1)
+
+    # Token-level mask for per-token timesteps: subsample to patch grid
+    # mask shape [C, T, H, W] → take first channel, subsample by patch_size
+    pt, ph, pw = patch_size
+    mask_tokens = mask[0, ::pt, ::ph, ::pw]  # [T', H', W']
+    mask_tokens = mask_tokens.reshape(1, -1)  # [1, L]
+    return mask, mask_tokens