From 3e33172c122c3e124bf3a363f0555a0ae0dae8fa Mon Sep 17 00:00:00 2001 From: Prince Canuma Date: Wed, 18 Mar 2026 17:34:57 +0100 Subject: [PATCH] Refactor and remove Wan2.1/2.2 model files; update README.md to include new model features and usage instructions for LTX-2 and Wan2 models. --- README.md | 173 ++++++++++++------ mlx_video/models/{wan => wan2}/README.md | 36 ++-- mlx_video/models/{wan => wan2}/__init__.py | 0 mlx_video/models/{wan => wan2}/attention.py | 0 mlx_video/models/{wan => wan2}/config.py | 0 mlx_video/models/{wan => wan2}/convert.py | 0 .../models/{wan => wan2}/docs/DIAGNOSTICS.md | 0 .../docs/IMPLEMENTATION_NOTES.md | 0 mlx_video/models/{wan => wan2}/generate.py | 0 mlx_video/models/{wan => wan2}/i2v_utils.py | 0 mlx_video/models/{wan => wan2}/loading.py | 0 mlx_video/models/{wan => wan2}/model.py | 0 mlx_video/models/{wan => wan2}/postprocess.py | 0 mlx_video/models/{wan => wan2}/rope.py | 0 mlx_video/models/{wan => wan2}/scheduler.py | 0 .../models/{wan => wan2}/text_encoder.py | 0 mlx_video/models/{wan => wan2}/tiling.py | 0 mlx_video/models/{wan => wan2}/transformer.py | 0 mlx_video/models/{wan => wan2}/vae.py | 0 mlx_video/models/{wan => wan2}/vae22.py | 0 20 files changed, 137 insertions(+), 72 deletions(-) rename mlx_video/models/{wan => wan2}/README.md (94%) rename mlx_video/models/{wan => wan2}/__init__.py (100%) rename mlx_video/models/{wan => wan2}/attention.py (100%) rename mlx_video/models/{wan => wan2}/config.py (100%) rename mlx_video/models/{wan => wan2}/convert.py (100%) rename mlx_video/models/{wan => wan2}/docs/DIAGNOSTICS.md (100%) rename mlx_video/models/{wan => wan2}/docs/IMPLEMENTATION_NOTES.md (100%) rename mlx_video/models/{wan => wan2}/generate.py (100%) rename mlx_video/models/{wan => wan2}/i2v_utils.py (100%) rename mlx_video/models/{wan => wan2}/loading.py (100%) rename mlx_video/models/{wan => wan2}/model.py (100%) rename mlx_video/models/{wan => wan2}/postprocess.py (100%) rename mlx_video/models/{wan => 
wan2}/rope.py (100%) rename mlx_video/models/{wan => wan2}/scheduler.py (100%) rename mlx_video/models/{wan => wan2}/text_encoder.py (100%) rename mlx_video/models/{wan => wan2}/tiling.py (100%) rename mlx_video/models/{wan => wan2}/transformer.py (100%) rename mlx_video/models/{wan => wan2}/vae.py (100%) rename mlx_video/models/{wan => wan2}/vae22.py (100%) diff --git a/README.md b/README.md index 4aee54c..10d72d2 100644 --- a/README.md +++ b/README.md @@ -16,38 +16,49 @@ uv pip install git+https://github.com/Blaizzy/mlx-video.git ## Supported Models -### LTX-2 -[LTX-2](https://huggingface.co/Lightricks/LTX-Video) is 19B parameter video generation model from Lightricks +- [**LTX-2**](https://huggingface.co/Lightricks/LTX-Video) — 19B parameter video generation model from Lightricks +- [**Wan2.1**](https://github.com/Wan-Video/Wan2.1) — 1.3B / 14B parameter T2V models (single-model pipeline) +- [**Wan2.2**](https://github.com/Wan-Video/Wan2.2) — T2V-14B, TI2V-5B, and I2V-14B models (dual-model pipeline) ## Features -- Text-to-video generation with the LTX-2 19B DiT model -- Two-stage generation pipeline for high-quality output +**LTX-2 / LTX-2.3** +- Text-to-Video (T2V), Image-to-Video (I2V), Audio-to-Video (A2V) +- Audio-Video joint generation +- Multi-pipeline: distilled, dev, dev-two-stage, dev-two-stage-hq - 2x spatial upscaling for images and videos +- Prompt enhancement via Gemma + +**Wan2.1 / Wan2.2** +- Text-to-Video (T2V) — 1.3B and 14B models +- Image-to-Video (I2V) — 14B model +- Flow-matching diffusion with classifier-free guidance +- LoRA support (e.g. Wan2.2-Lightning for 4-step generation) + +**General** - Optimized for Apple Silicon using MLX +--- -## Usage - -> **ℹ️ Info:** Currently, only the distilled variant is supported. Full LTX-2 feature support is coming soon. 
+## LTX-2 ### Text-to-Video Generation ```bash # Text-to-Video (distilled, fastest) -uv run mlx_video.generate --prompt "Two dogs wearing sunglasses, cinematic, sunset" -n 97 --width 768 +uv run mlx_video.ltx_2.generate --prompt "Two dogs wearing sunglasses, cinematic, sunset" -n 97 --width 768 # Image-to-Video -uv run mlx_video.generate --prompt "A person dancing" --image photo.jpg +uv run mlx_video.ltx_2.generate --prompt "A person dancing" --image photo.jpg # Audio-to-Video -uv run mlx_video.generate --audio-file music.wav --prompt "A band playing music" +uv run mlx_video.ltx_2.generate --audio-file music.wav --prompt "A band playing music" # Dev pipeline with CFG (higher quality) -uv run mlx_video.generate --pipeline dev --prompt "A cinematic scene" --cfg-scale 3.0 +uv run mlx_video.ltx_2.generate --pipeline dev --prompt "A cinematic scene" --cfg-scale 3.0 # Dev two-stage HQ (highest quality) -uv run mlx_video.generate --pipeline dev-two-stage-hq \ +uv run mlx_video.ltx_2.generate --pipeline dev-two-stage-hq \ --prompt "A cinematic scene of ocean waves at golden hour" \ --model-repo prince-canuma/LTX-2-dev ``` @@ -58,17 +69,8 @@ uv run mlx_video.generate --pipeline dev-two-stage-hq \ Pre-converted weights are available on HuggingFace ([LTX-2-distilled](https://huggingface.co/prince-canuma/LTX-2-distilled), [LTX-2-dev](https://huggingface.co/prince-canuma/LTX-2-dev), [LTX-2.3-distilled](https://huggingface.co/prince-canuma/LTX-2.3-distilled), [LTX-2.3-dev](https://huggingface.co/prince-canuma/LTX-2.3-dev)), or convert from the original Lightricks checkpoint: -```bash -python -m mlx_video.generate \ - --prompt "Ocean waves crashing on a beach at sunset" \ - --height 768 \ - --width 768 \ - --num-frames 65 \ - --seed 123 \ - --output my_video.mp4 -``` -### CLI Options +### LTX-2 CLI Options | Option | Default | Description | |--------|---------|-------------| @@ -82,46 +84,109 @@ python -m mlx_video.generate \ | `--save-frames` | false | Save individual frames as 
images | | `--model-repo` | Lightricks/LTX-2 | HuggingFace model repository | -## How It Works -The pipeline uses a two-stage generation process: +--- -1. **Stage 1**: Generate at half resolution (e.g., 384x384) with 8 denoising steps -2. **Upsample**: 2x spatial upsampling via LatentUpsampler -3. **Stage 2**: Refine at full resolution (e.g., 768x768) with 3 denoising steps -4. **Decode**: VAE decoder converts latents to RGB video +## Wan2.1 / Wan2.2 + +Both [Wan2.1](https://github.com/Wan-Video/Wan2.1) and [Wan2.2](https://github.com/Wan-Video/Wan2.2) are text-to-video diffusion models built on a DiT (Diffusion Transformer) backbone with a T5 text encoder and 3D VAE. + +### Step 0: Download and Convert Weights + +See the dedicated Wan2.1/Wan2.2 [README.md](mlx_video/models/wan2/README.md) for details. + +### Step 1: Generate Video + +```bash +# Wan2.1 — uses defaults from config (50 steps, shift=5.0, guide=5.0) +python -m mlx_video.wan2.generate \ + --model-dir wan21_mlx \ + --prompt "A cat playing piano in a cozy room" + +# Wan2.2 — uses defaults from config (40 steps, shift=12.0, guide=3.0,4.0) +python -m mlx_video.wan2.generate \ + --model-dir wan22_mlx \ + --prompt "A cat playing piano in a cozy room" +``` + +With custom settings: + +```bash +python -m mlx_video.wan2.generate \ + --model-dir wan21_mlx \ + --prompt "Ocean waves at sunset, cinematic, 4K" \ + --negative-prompt "blurry, low quality" \ + --width 1280 \ + --height 720 \ + --num-frames 81 \ + --steps 50 \ + --guide-scale 5.0 \ + --shift 5.0 \ + --seed 42 \ + --output-path my_video.mp4 +``` + +The pipeline auto-detects the model version from `config.json` and selects the right pipeline mode (single or dual model). 
+ +### Image-to-Video (I2V-14B) + +```bash +python -m mlx_video.wan2.generate \ + --model-dir wan22_i2v_mlx \ + --prompt "The camera slowly zooms in as the subject begins to move" \ + --image start.png \ + --num-frames 81 \ + --output-path my_video.mp4 +``` + +### LoRA Support + +LoRAs can be used with the `--lora-high` and `--lora-low` command line switches. + +For example, using the distilled [Wan2.2-Lightning](https://huggingface.co/lightx2v/Wan2.2-Lightning) LoRA for 4-step generation: + +```bash +python -m mlx_video.wan2.generate \ + --model-dir /Volumes/SSD/Wan-AI/Wan2.2-T2V-A14B-MLX \ + --width 480 \ + --height 704 \ + --num-frames 41 \ + --prompt "Two dogs of the poodle breed sitting on a beach wearing sunglasses, nodding with their heads, close up, cinematic, sunset" \ + --steps 4 \ + --guide-scale 1 \ + --trim-first-frames 1 \ + --seed 2391784614 \ + --lora-high /Volumes/SSD/Wan-AI/lightx2v/Wan2.2-Lightning/Wan2.2-T2V-A14B-4steps-lora-rank64-Seko-V2.0/high_noise_model.safetensors 1 \ + --lora-low /Volumes/SSD/Wan-AI/lightx2v/Wan2.2-Lightning/Wan2.2-T2V-A14B-4steps-lora-rank64-Seko-V2.0/low_noise_model.safetensors 1 +``` + +![Poodles](examples/poodles-wan.gif) + +### Wan CLI Options + +| Option | Default | Description | +|--------|---------|-------------| +| `--model-dir` | (required) | Path to converted MLX model directory | +| `--prompt` | (required) | Text description of the video | +| `--image` | `None` | Input image path (for I2V models) | +| `--negative-prompt` | `""` | Negative prompt for guidance | +| `--width` | 1280 | Video width | +| `--height` | 720 | Video height | +| `--num-frames` | 81 | Number of frames (must be 4n+1) | +| `--steps` | from config | Number of diffusion steps | +| `--guide-scale` | from config | Guidance scale: float or `low,high` pair | +| `--shift` | from config | Noise schedule shift | +| `--seed` | -1 (random) | Random seed for reproducibility | +| `--output-path` | `output.mp4` | Output video path | + +--- ## Requirements 
- macOS with Apple Silicon - Python >= 3.11 - MLX >= 0.22.0 - -## Model Specifications - -- **Transformer**: 48 layers, 32 attention heads, 128 dim per head -- **Latent channels**: 128 -- **Text encoder**: Gemma 3 with 3840-dim output -- **RoPE**: Split mode with double precision - -## Project Structure - -``` -mlx_video/ -├── generate.py # Video generation pipeline -├── convert.py # Weight conversion (PyTorch -> MLX) -├── postprocess.py # Video post-processing utilities -├── utils.py # Helper functions -└── models/ - └── ltx/ - ├── ltx.py # Main LTXModel (DiT transformer) - ├── config.py # Model configuration - ├── transformer.py # Transformer blocks - ├── attention.py # Multi-head attention with RoPE - ├── text_encoder.py # Text encoder - ├── upsampler.py # 2x spatial upsampler - └── video_vae/ # VAE encoder/decoder -``` +- For weight conversion: PyTorch (`pip install torch`) ## License diff --git a/mlx_video/models/wan/README.md b/mlx_video/models/wan2/README.md similarity index 94% rename from mlx_video/models/wan/README.md rename to mlx_video/models/wan2/README.md index 3d45e2c..6fe7d8a 100644 --- a/mlx_video/models/wan/README.md +++ b/mlx_video/models/wan2/README.md @@ -70,7 +70,7 @@ The conversion script auto-detects the model version from the directory structur #### Wan2.1 T2V 1.3B ```bash -python -m mlx_video.convert_wan \ +python -m mlx_video.wan2.convert \ --checkpoint-dir ./Wan2.1-T2V-1.3B \ --output-dir ./Wan2.1-T2V-1.3B-MLX ``` @@ -78,7 +78,7 @@ python -m mlx_video.convert_wan \ #### Wan2.1 T2V 14B ```bash -python -m mlx_video.convert_wan \ +python -m mlx_video.wan2.convert \ --checkpoint-dir ./Wan2.1-T2V-14B \ --output-dir ./Wan2.1-T2V-14B-MLX ``` @@ -86,7 +86,7 @@ python -m mlx_video.convert_wan \ #### Wan2.2 T2V 14B ```bash -python -m mlx_video.convert_wan \ +python -m mlx_video.wan2.convert \ --checkpoint-dir ./Wan2.2-T2V-A14B \ --output-dir ./Wan2.2-T2V-A14B-MLX ``` @@ -94,7 +94,7 @@ python -m mlx_video.convert_wan \ #### Wan2.2 I2V 14B ```bash 
-python -m mlx_video.convert_wan \ +python -m mlx_video.wan2.convert \ --checkpoint-dir ./Wan2.2-I2V-A14B \ --output-dir ./Wan2.2-I2V-A14B-MLX ``` @@ -104,7 +104,7 @@ The I2V model is auto-detected from `config.json`; the output will include a `va #### Wan2.2 TI2V 5B ```bash -python -m mlx_video.convert_wan \ +python -m mlx_video.wan2.convert \ --checkpoint-dir ./Wan2.2-TI2V-5B \ --output-dir ./Wan2.2-TI2V-5B-MLX ``` @@ -144,7 +144,7 @@ wan_mlx/ #### Wan2.1 T2V 1.3B ```bash -python -m mlx_video.generate_wan \ +python -m mlx_video.wan2.generate \ --model-dir ./Wan2.1-T2V-1.3B-MLX \ --prompt "A cat playing piano in a cozy living room, cinematic lighting" \ --width 832 --height 480 --num-frames 81 \ @@ -156,7 +156,7 @@ python -m mlx_video.generate_wan \ #### Wan2.1 T2V 14B ```bash -python -m mlx_video.generate_wan \ +python -m mlx_video.wan2.generate \ --model-dir ./Wan2.1-T2V-14B-MLX \ --prompt "A woman walks through a misty forest at dawn, slow motion, cinematic" \ --width 1280 --height 704 --num-frames 81 \ @@ -172,7 +172,7 @@ python -m mlx_video.generate_wan \ Wan2.2 uses a dual-model pipeline (separate high-noise and low-noise transformers) and takes guidance as a `high,low` pair: ```bash -python -m mlx_video.generate_wan \ +python -m mlx_video.wan2.generate \ --model-dir ./Wan2.2-T2V-A14B-MLX \ --prompt "Two astronauts playing chess on the surface of the moon, dramatic lighting, 8K" \ --negative-prompt "low quality, blurry, distorted" \ @@ -189,7 +189,7 @@ python -m mlx_video.generate_wan \ Image-to-video: animates a starting image guided by a text prompt. Pass the image with `--image`: ```bash -python -m mlx_video.generate_wan \ +python -m mlx_video.wan2.generate \ --model-dir ./Wan2.2-I2V-A14B-MLX \ --image ./my_photo.png \ --prompt "The person slowly turns their head and smiles, cinematic, natural lighting" \ @@ -207,7 +207,7 @@ python -m mlx_video.generate_wan \ Text+image-to-video: a single-model variant with a larger VAE (`z_dim=48`). 
Resolution must be divisible by **32** (not 16 as with other models): ```bash -python -m mlx_video.generate_wan \ +python -m mlx_video.wan2.generate \ --model-dir ./Wan2.2-TI2V-5B-MLX \ --image ./my_photo.png \ --prompt "The subject waves hello, warm sunlight, film grain" \ @@ -251,27 +251,27 @@ Quantize the transformer weights to reduce memory usage by ~3.4×. Quantization ```bash # Convert with 4-bit quantization (works for any variant) -python -m mlx_video.convert_wan \ +python -m mlx_video.wan2.convert \ --checkpoint-dir ./Wan2.1-T2V-1.3B \ --output-dir ./Wan2.1-T2V-1.3B-MLX-Q4 \ --quantize --bits 4 --group-size 64 -python -m mlx_video.convert_wan \ +python -m mlx_video.wan2.convert \ --checkpoint-dir ./Wan2.1-T2V-14B \ --output-dir ./Wan2.1-T2V-14B-MLX-Q4 \ --quantize --bits 4 --group-size 64 -python -m mlx_video.convert_wan \ +python -m mlx_video.wan2.convert \ --checkpoint-dir ./Wan2.2-T2V-A14B \ --output-dir ./Wan2.2-T2V-A14B-MLX-Q4 \ --quantize --bits 4 --group-size 64 -python -m mlx_video.convert_wan \ +python -m mlx_video.wan2.convert \ --checkpoint-dir ./Wan2.2-I2V-A14B \ --output-dir ./Wan2.2-I2V-A14B-MLX-Q4 \ --quantize --bits 4 --group-size 64 -python -m mlx_video.convert_wan \ +python -m mlx_video.wan2.convert \ --checkpoint-dir ./Wan2.2-TI2V-5B \ --output-dir ./Wan2.2-TI2V-5B-MLX-Q4 \ --quantize --bits 4 --group-size 64 @@ -280,7 +280,7 @@ python -m mlx_video.convert_wan \ You can also quantize an already-converted MLX model without re-converting from PyTorch: ```bash -python -m mlx_video.convert_wan \ +python -m mlx_video.wan2.convert \ --checkpoint-dir ./Wan2.2-T2V-A14B-MLX \ --output-dir ./Wan2.2-T2V-A14B-MLX-Q4 \ --quantize-only --bits 4 @@ -289,7 +289,7 @@ python -m mlx_video.convert_wan \ Quantized models are used exactly the same way — the quantization is auto-detected from `config.json`: ```bash -python -m mlx_video.generate_wan \ +python -m mlx_video.wan2.generate \ --model-dir ./Wan2.2-T2V-A14B-MLX-Q4 \ --prompt "A cat playing piano" ``` 
@@ -330,7 +330,7 @@ LoRA's can be used with the `--lora-high` and `--lora-low` command line switches For example, for using the the distilled [Wan2.2-Lightning](https://huggingface.co/lightx2v/Wan2.2-Lightning) LoRA, use the following command. Lightning speeds up generation by using only 4 steps and a CFG scale of 1. ```bash -python -m mlx_video.generate_wan \ +python -m mlx_video.wan2.generate \ --model-dir /Volumes/SSD/Wan-AI/Wan2.2-T2V-A14B-MLX \ --width 480 \ --height 704 \ diff --git a/mlx_video/models/wan/__init__.py b/mlx_video/models/wan2/__init__.py similarity index 100% rename from mlx_video/models/wan/__init__.py rename to mlx_video/models/wan2/__init__.py diff --git a/mlx_video/models/wan/attention.py b/mlx_video/models/wan2/attention.py similarity index 100% rename from mlx_video/models/wan/attention.py rename to mlx_video/models/wan2/attention.py diff --git a/mlx_video/models/wan/config.py b/mlx_video/models/wan2/config.py similarity index 100% rename from mlx_video/models/wan/config.py rename to mlx_video/models/wan2/config.py diff --git a/mlx_video/models/wan/convert.py b/mlx_video/models/wan2/convert.py similarity index 100% rename from mlx_video/models/wan/convert.py rename to mlx_video/models/wan2/convert.py diff --git a/mlx_video/models/wan/docs/DIAGNOSTICS.md b/mlx_video/models/wan2/docs/DIAGNOSTICS.md similarity index 100% rename from mlx_video/models/wan/docs/DIAGNOSTICS.md rename to mlx_video/models/wan2/docs/DIAGNOSTICS.md diff --git a/mlx_video/models/wan/docs/IMPLEMENTATION_NOTES.md b/mlx_video/models/wan2/docs/IMPLEMENTATION_NOTES.md similarity index 100% rename from mlx_video/models/wan/docs/IMPLEMENTATION_NOTES.md rename to mlx_video/models/wan2/docs/IMPLEMENTATION_NOTES.md diff --git a/mlx_video/models/wan/generate.py b/mlx_video/models/wan2/generate.py similarity index 100% rename from mlx_video/models/wan/generate.py rename to mlx_video/models/wan2/generate.py diff --git a/mlx_video/models/wan/i2v_utils.py 
b/mlx_video/models/wan2/i2v_utils.py similarity index 100% rename from mlx_video/models/wan/i2v_utils.py rename to mlx_video/models/wan2/i2v_utils.py diff --git a/mlx_video/models/wan/loading.py b/mlx_video/models/wan2/loading.py similarity index 100% rename from mlx_video/models/wan/loading.py rename to mlx_video/models/wan2/loading.py diff --git a/mlx_video/models/wan/model.py b/mlx_video/models/wan2/model.py similarity index 100% rename from mlx_video/models/wan/model.py rename to mlx_video/models/wan2/model.py diff --git a/mlx_video/models/wan/postprocess.py b/mlx_video/models/wan2/postprocess.py similarity index 100% rename from mlx_video/models/wan/postprocess.py rename to mlx_video/models/wan2/postprocess.py diff --git a/mlx_video/models/wan/rope.py b/mlx_video/models/wan2/rope.py similarity index 100% rename from mlx_video/models/wan/rope.py rename to mlx_video/models/wan2/rope.py diff --git a/mlx_video/models/wan/scheduler.py b/mlx_video/models/wan2/scheduler.py similarity index 100% rename from mlx_video/models/wan/scheduler.py rename to mlx_video/models/wan2/scheduler.py diff --git a/mlx_video/models/wan/text_encoder.py b/mlx_video/models/wan2/text_encoder.py similarity index 100% rename from mlx_video/models/wan/text_encoder.py rename to mlx_video/models/wan2/text_encoder.py diff --git a/mlx_video/models/wan/tiling.py b/mlx_video/models/wan2/tiling.py similarity index 100% rename from mlx_video/models/wan/tiling.py rename to mlx_video/models/wan2/tiling.py diff --git a/mlx_video/models/wan/transformer.py b/mlx_video/models/wan2/transformer.py similarity index 100% rename from mlx_video/models/wan/transformer.py rename to mlx_video/models/wan2/transformer.py diff --git a/mlx_video/models/wan/vae.py b/mlx_video/models/wan2/vae.py similarity index 100% rename from mlx_video/models/wan/vae.py rename to mlx_video/models/wan2/vae.py diff --git a/mlx_video/models/wan/vae22.py b/mlx_video/models/wan2/vae22.py similarity index 100% rename from 
mlx_video/models/wan/vae22.py rename to mlx_video/models/wan2/vae22.py