Add librosa dependency and enhance A2V documentation with additional pipeline options

2026-03-16 02:02:13 +01:00
parent 6f6105b715
commit decb3eb9e5
2 changed files with 14 additions and 3 deletions
--- a/README.md
+++ b/README.md
@@ -88,15 +88,23 @@ uv run mlx_video.generate --pipeline dev --prompt "Waves crashing" --image beach

 ### Audio-to-Video (A2V)

-Generate video conditioned on an input audio file. The audio is encoded to latent space and frozen during denoising — the transformer's cross-attention reads the audio signal to guide video generation.
+Generate video conditioned on an input audio file. A2V works with all four pipelines (distilled, dev, dev-two-stage, and dev-two-stage-hq). The audio is encoded into latent space and frozen during denoising — the transformer's cross-attention reads the audio signal to guide video generation.

 ```bash
-# A2V - generate video from audio
+# A2V - distilled (default, fastest)
 uv run mlx_video.generate --audio-file music.wav --prompt "A band playing music"

-# A2V with dev pipeline
+# A2V - dev (single-stage with CFG)
 uv run mlx_video.generate --pipeline dev --audio-file ocean.wav --prompt "Ocean waves"

+# A2V - dev-two-stage (dev + LoRA refinement)
+uv run mlx_video.generate --pipeline dev-two-stage --audio-file music.wav \
+    --prompt "A band playing music" --model-repo prince-canuma/LTX-2-dev
+
+# A2V - dev-two-stage-hq (highest quality)
+uv run mlx_video.generate --pipeline dev-two-stage-hq --audio-file music.wav \
+    --prompt "A band playing music" --model-repo prince-canuma/LTX-2-dev
+
 # A2V + I2V (audio + image conditioning)
 uv run mlx_video.generate --audio-file rain.wav --image forest.jpg --prompt "Rain in forest"

@@ -104,6 +112,8 @@ uv run mlx_video.generate --audio-file rain.wav --image forest.jpg --prompt "Rai
 uv run mlx_video.generate --audio-file song.mp3 --audio-start-time 30.0 --prompt "Concert"
 ```

+> **Note:** `--audio-file` (A2V) and `--audio` (generate audio) are mutually exclusive. Supported input formats for `--audio-file`: WAV, FLAC, MP3, OGG, and video files with audio tracks.
+
 ### Audio-Video Generation (experimental)

 Generate synchronized audio alongside video from scratch:
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
    "Pillow>=10.3.0",
    "mlx-vlm",
    "rich>=14.2.0",
+    "librosa>=0.10.0",
 ]
 license = {text="MIT"}
 authors = [