fix(wan): Fix scheduler sigma schedule and add debug flags

2026-03-11 07:52:07 +01:00
parent afd15018b7
commit d207275fea
4 changed files with 121 additions and 34 deletions
--- a/mlx_video/generate_wan.py
+++ b/mlx_video/generate_wan.py
@@ -72,6 +72,8 @@ def generate_video(
    loras_low: list | None = None,
    tiling: str = "auto",
    no_compile: bool = False,
+    trim_first_frames: int = 0,
+    debug_latents: bool = False,
 ):

    """Generate video using Wan pipeline (supports T2V and I2V).
@@ -100,6 +102,12 @@ def generate_video(
            - "spatial": Spatial tiling only
            - "temporal": Temporal tiling only
        no_compile: If True, skip mx.compile on models (useful for debugging)
+        trim_first_frames: Number of temporal latent positions to generate extra
+            and discard from the start. Each position = 4 pixel frames. Use 1
+            to fix first-frame artifacts on 14B models (generates 4 extra frames,
+            discards first 4). Use 2 for more aggressive trimming. Default: 0.
+        debug_latents: If True, print per-temporal-position latent statistics
+            after denoising for diagnosing first-frame artifacts.

    """
    import json
@@ -207,6 +215,9 @@ def generate_video(
    assert (num_frames - 1) % 4 == 0, f"num_frames must be 4n+1, got {num_frames}"

    gen_frames = num_frames
+    if trim_first_frames > 0:
+        gen_frames = num_frames + trim_first_frames * 4
+        print(f"{Colors.DIM}  Trim: generating {gen_frames} frames, will discard first {trim_first_frames * 4}{Colors.RESET}")

    version_str = f"Wan{config.model_version}"
    mode_str = "dual-model" if is_dual else "single-model"
@@ -595,6 +606,22 @@ def generate_video(

    print(f"{Colors.DIM}  Denoising: {time.time() - t3:.1f}s{Colors.RESET}")

+    # Diagnostic: per-temporal-position latent statistics
+    if debug_latents:
+        lat_np = np.array(latents)  # [C, T, H, W]
+        n_t = lat_np.shape[1]
+        print(f"\n{Colors.CYAN}  Latent diagnostics (shape {lat_np.shape}):{Colors.RESET}")
+        print(f"  {'Pos':>4s}  {'Mean':>8s}  {'Std':>8s}  {'Min':>8s}  {'Max':>8s}  {'AbsMean':>8s}")
+        for t_pos in range(min(n_t, 8)):
+            frame = lat_np[:, t_pos, :, :]
+            print(f"  {t_pos:4d}  {frame.mean():8.4f}  {frame.std():8.4f}  "
+                  f"{frame.min():8.4f}  {frame.max():8.4f}  {np.abs(frame).mean():8.4f}")
+        if n_t > 8:
+            interior = lat_np[:, 4:, :, :]
+            print(f"  {'4+':>4s}  {interior.mean():8.4f}  {interior.std():8.4f}  "
+                  f"{interior.min():8.4f}  {interior.max():8.4f}  {np.abs(interior).mean():8.4f}")
+        print()
+
    # Free transformer models and text embeddings
    if is_dual:
        del low_noise_model, high_noise_model, cross_kv_low, cross_kv_high
@@ -621,6 +648,9 @@ def generate_video(

    is_wan22_vae = config.vae_z_dim == 48

+    # Temporal extend: prepend reflected latent frames to the VAE input so that
+    # the CausalConv3d zero-padding artifacts fall on the prefix (which we crop).
+    # This gives the first real frame a full temporal receptive field of real data.
    # Select tiling configuration
    from mlx_video.models.ltx.video_vae.tiling import TilingConfig

@@ -676,6 +706,12 @@ def generate_video(
        video = np.clip(video * 255.0, 0, 255).astype(np.uint8)
        video = video.transpose(1, 2, 3, 0)  # [T, H, W, 3]

+    # Trim first N temporal chunks if requested (avoids first-frame artifacts)
+    if trim_first_frames > 0:
+        trim_pixels = trim_first_frames * 4
+        video = video[trim_pixels:]
+        print(f"{Colors.DIM}  Trimmed first {trim_pixels} frames ({video.shape[0]} remaining){Colors.RESET}")
+
    save_video(video, output_path, fps=config.sample_fps)
    print(f"\n{Colors.GREEN}✓ Video saved to {output_path}{Colors.RESET}")
    print(f"{Colors.DIM}  Total time: {time.time() - t1:.1f}s{Colors.RESET}")
@@ -727,6 +763,16 @@ def main():
        "--no-compile", action="store_true",
        help="Disable mx.compile on models (for debugging)",
    )
+    parser.add_argument(
+        "--trim-first-frames", type=int, default=0, metavar="N",
+        help="Generate N extra temporal chunks (N×4 frames) and discard them from the start. "
+             "Fixes first-frame color/lighting artifacts on 14B models. Try 1 first (4 frames). "
+             "Default: 0 (disabled)",
+    )
+    parser.add_argument(
+        "--debug-latents", action="store_true",
+        help="Print per-temporal-position latent statistics after denoising (diagnostic)",
+    )

    args = parser.parse_args()

@@ -766,6 +812,8 @@ def main():
        loras_low=_parse_lora_args(args.lora_low),
        tiling=args.tiling,
        no_compile=args.no_compile,
+        trim_first_frames=args.trim_first_frames,
+        debug_latents=args.debug_latents,

    )

--- a/mlx_video/models/wan/model.py
+++ b/mlx_video/models/wan/model.py
@@ -118,14 +118,12 @@ class WanModel(nn.Module):
            rope_params(1024, 2 * (d // 6)),
        ], axis=1)

-        # Precompute sinusoidal inv_freq for time embedding
-        # Use numpy float64 for precision (matches reference torch.float64),
-        # then store as float32 since MLX GPU doesn't support float64.
+        # Precompute sinusoidal inv_freq for time embedding.
        half = config.freq_dim // 2
-        inv_freq_np = np.power(
-            10000.0, -np.arange(half, dtype=np.float64) / half
+        self._inv_freq = mx.array(
+            np.power(10000.0, -np.arange(half, dtype=np.float64) / half
+            ).astype(np.float32)
        )
-        self._inv_freq = mx.array(inv_freq_np.astype(np.float32))


    def _patchify(self, x: mx.array) -> tuple:
@@ -311,13 +309,16 @@ class WanModel(nn.Module):
                axis=0,
            )  # [B, seq_len, dim]

-        # Time embedding (use cached inv_freq to avoid recomputing each step)
+        # Time embedding: sinusoidal from precomputed inv_freq.
+        # inv_freq was computed in float64 for precision, stored as float32.
+        # With integer timesteps (matching reference), float32 sin/cos is fine.
        if t.ndim == 0:
            t = t[None]

-        pos = t.astype(mx.float32)
-        sinusoid = pos[..., None] * self._inv_freq
-        sin_emb = mx.concatenate([mx.cos(sinusoid), mx.sin(sinusoid)], axis=-1)
+        sinusoid = t[..., None].astype(mx.float32) * self._inv_freq
+        sin_emb = mx.concatenate(
+            [mx.cos(sinusoid), mx.sin(sinusoid)], axis=-1
+        )

        if t.ndim == 1:
            # Standard T2V: scalar timestep per batch element [B]
--- a/mlx_video/models/wan/scheduler.py
+++ b/mlx_video/models/wan/scheduler.py
@@ -12,13 +12,30 @@ import numpy as np
 import mlx.core as mx


-def _compute_sigmas(num_steps: int, shift: float = 1.0) -> np.ndarray:
-    """Compute shifted sigma schedule matching official Wan2.2 code.
+def _compute_sigmas(
+    num_steps: int, shift: float = 1.0, num_train_timesteps: int = 1000
+) -> np.ndarray:
+    """Compute shifted sigma schedule matching official Wan2.2 scheduler.
+
+    The reference creates FlowUniPCMultistepScheduler with shift=1 (identity)
+    in the constructor, deriving sigma_max/sigma_min from the unshifted
+    training schedule.  Then set_timesteps() builds a linspace between those
+    unshifted bounds and applies the actual shift once.

    Returns num_steps+1 values (the last being 0.0 for the terminal state).
    """
-    sigmas = np.linspace(1.0, 0.0, num_steps + 1)[:num_steps]
+    # sigma bounds from unshifted training schedule (constructor uses shift=1)
+    alphas = np.linspace(1.0, 1.0 / num_train_timesteps, num_train_timesteps)[
+        ::-1
+    ]
+    sigmas_unshifted = 1.0 - alphas
+    sigma_max = float(sigmas_unshifted[0])  # (N-1)/N
+    sigma_min = float(sigmas_unshifted[-1])  # 0.0
+
+    # Interpolate, then apply shift once (matching set_timesteps)
+    sigmas = np.linspace(sigma_max, sigma_min, num_steps + 1)[:-1]
    sigmas = shift * sigmas / (1.0 + (shift - 1.0) * sigmas)
+
    return np.append(sigmas, 0.0).astype(np.float32)


@@ -31,9 +48,12 @@ class FlowMatchEulerScheduler:
        self.sigmas = None

    def set_timesteps(self, num_steps: int, shift: float = 1.0):
-        sigmas = _compute_sigmas(num_steps, shift)
+        sigmas = _compute_sigmas(num_steps, shift, self.num_train_timesteps)
        self.sigmas = mx.array(sigmas)
-        self.timesteps = mx.array(sigmas[:-1] * self.num_train_timesteps)
+        # Integer timesteps to match reference (model trained with int timesteps)
+        self.timesteps = mx.array(
+            (sigmas[:-1] * self.num_train_timesteps).astype(np.int64).astype(np.float32)
+        )
        # Store as Python floats to avoid .item() sync in step()
        self._sigmas_float = sigmas.tolist()
        self._step_index = 0
@@ -73,9 +93,11 @@ class FlowDPMPP2MScheduler:
        self.sigmas = None

    def set_timesteps(self, num_steps: int, shift: float = 1.0):
-        sigmas = _compute_sigmas(num_steps, shift)
+        sigmas = _compute_sigmas(num_steps, shift, self.num_train_timesteps)
        self.sigmas = mx.array(sigmas)
-        self.timesteps = mx.array(sigmas[:-1] * self.num_train_timesteps)
+        self.timesteps = mx.array(
+            (sigmas[:-1] * self.num_train_timesteps).astype(np.int64).astype(np.float32)
+        )
        # Store sigmas as Python floats for scalar math
        self._sigmas_float = sigmas.tolist()
        self._step_index = 0
@@ -198,9 +220,11 @@ class FlowUniPCScheduler:
        self.sigmas = None

    def set_timesteps(self, num_steps: int, shift: float = 1.0):
-        sigmas = _compute_sigmas(num_steps, shift)
+        sigmas = _compute_sigmas(num_steps, shift, self.num_train_timesteps)
        self.sigmas = mx.array(sigmas)
-        self.timesteps = mx.array(sigmas[:-1] * self.num_train_timesteps)
+        self.timesteps = mx.array(
+            (sigmas[:-1] * self.num_train_timesteps).astype(np.int64).astype(np.float32)
+        )
        self._sigmas_float = sigmas.tolist()
        self._step_index = 0
        self._num_steps = num_steps