More poodles

2026-03-11 08:14:12 +01:00
parent d207275fea
commit 1cf878f5e0
9 changed files with 347 additions and 23 deletions
--- a/mlx_video/generate_wan.py
+++ b/mlx_video/generate_wan.py
@@ -75,7 +75,6 @@ def generate_video(
    trim_first_frames: int = 0,
    debug_latents: bool = False,
 ):
-
    """Generate video using Wan pipeline (supports T2V and I2V).

    Args:
@@ -108,7 +107,6 @@ def generate_video(
            discards first 4). Use 2 for more aggressive trimming. Default: 0.
        debug_latents: If True, print per-temporal-position latent statistics
            after denoising for diagnosing first-frame artifacts.
-
    """
    import json

@@ -494,6 +492,7 @@ def generate_video(
    print(f"\n{Colors.GREEN}Denoising ({steps} steps)...{Colors.RESET}")
    t3 = time.time()

+    # Compile model forward for faster denoising
    if not no_compile:
        models_to_compile = (
            [high_noise_model, low_noise_model] if is_dual else [single_model]
@@ -501,9 +500,6 @@ def generate_video(
        for m in models_to_compile:
            m._compiled = mx.compile(m)

-
-
-
    # Pre-convert timesteps to Python list to avoid .item() sync each step
    timestep_list = sched.timesteps.tolist()

@@ -773,7 +769,6 @@ def main():
        "--debug-latents", action="store_true",
        help="Print per-temporal-position latent statistics after denoising (diagnostic)",
    )
-
    args = parser.parse_args()

    # Parse guide scale
@@ -814,7 +809,6 @@ def main():
        no_compile=args.no_compile,
        trim_first_frames=args.trim_first_frames,
        debug_latents=args.debug_latents,
-
    )


--- a/mlx_video/models/wan/README.md
+++ b/mlx_video/models/wan/README.md
@@ -146,12 +146,16 @@ For example, for using the distilled [Wan2.2-Lightning](https://huggingface.
 python -m mlx_video.generate_wan \
    --model-dir /Volumes/SSD/Wan-AI/Wan2.2-T2V-A14B-MLX \
    --width 480 \
-    --height 480 \
-    --num-frames 121 \
-    --prompt "Two dogs of the poodle breed sitting on a beach wearing sunglasses, close up, cinematic, sunset" \
+    --height 704 \
+    --num-frames 41 \
+    --prompt "Two dogs of the poodle breed sitting on a beach wearing sunglasses, nodding with their heads, close up, cinematic, sunset" \
    --steps 4 \
    --guide-scale 1 \
    --trim-first-frames 1 \
+    --seed 2391784614 \
    --lora-high /Volumes/SSD/Wan-AI/lightx2v/Wan2.2-Lightning/Wan2.2-T2V-A14B-4steps-lora-rank64-Seko-V2.0/high_noise_model.safetensors 1 \
    --lora-low /Volumes/SSD/Wan-AI/lightx2v/Wan2.2-Lightning/Wan2.2-T2V-A14B-4steps-lora-rank64-Seko-V2.0/low_noise_model.safetensors 1
 ```
+
+The command above produces the following video:
+![Poodles](../../../examples/poodles-wan.gif)
--- a/mlx_video/models/wan/config.py
+++ b/mlx_video/models/wan/config.py
@@ -1,5 +1,5 @@
-from dataclasses import dataclass, field
-from typing import List, Optional, Tuple, Union
+from dataclasses import dataclass
+from typing import Tuple, Union

 from mlx_video.models.ltx.config import BaseModelConfig

@@ -104,7 +104,7 @@ class WanModelConfig(BaseModelConfig):
            sample_shift=5.0,
            sample_guide_scale=(3.5, 3.5),
            max_area=704 * 1280,
-
+        )

    @classmethod
    def wan22_ti2v_5b(cls) -> "WanModelConfig":
@@ -126,4 +126,4 @@ class WanModelConfig(BaseModelConfig):
            sample_guide_scale=5.0,
            sample_fps=24,
            max_area=704 * 1280,
-
+        )
--- a/mlx_video/models/wan/docs/DIAGNOSTICS.md
+++ b/mlx_video/models/wan/docs/DIAGNOSTICS.md
@@ -315,11 +315,6 @@ Applied alongside bug fixes to improve inference speed:
 - **Redundant type cast removal**: MLX type promotion handles `bfloat16 * float32 → float32` automatically — removed 240 unnecessary graph nodes per step (6 casts × 40 blocks)
 - **Euler scheduler sync fix**: Pre-store sigmas as Python floats to avoid `.item()` evaluation sync

-### TeaCache Integration
- Polynomial rescaling stays in MLX lazy graph (Horner's method)
- Single `.item()` call on the accumulated distance for the skip/compute decision
- Configurable threshold, retention steps, and cutoff steps
-
 ---

 ## Resolved: CFG Effectiveness (was Open Investigation)
--- a/mlx_video/models/wan/model.py
+++ b/mlx_video/models/wan/model.py
@@ -1,5 +1,4 @@
 import math
-
 import mlx.core as mx
 import mlx.nn as nn
 import numpy as np
@@ -354,7 +353,6 @@ class WanModel(nn.Module):
            for i, sl in enumerate(seq_lens_list):
                attn_mask[i, :, :, sl:] = -1e9

-
        kwargs = dict(
            e=e0,
            seq_lens=seq_lens_list,