From decb3eb9e5779a4f7f9cdcae5338d53e6495ca03 Mon Sep 17 00:00:00 2001 From: Prince Canuma Date: Mon, 16 Mar 2026 02:02:13 +0100 Subject: [PATCH] Add librosa dependency and enhance A2V documentation with additional pipeline options --- README.md | 16 +++++++++++++--- pyproject.toml | 1 + 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8d86c69..80c87ef 100644 --- a/README.md +++ b/README.md @@ -88,15 +88,23 @@ uv run mlx_video.generate --pipeline dev --prompt "Waves crashing" --image beach ### Audio-to-Video (A2V) -Generate video conditioned on an input audio file. The audio is encoded to latent space and frozen during denoising — the transformer's cross-attention reads the audio signal to guide video generation. +Generate video conditioned on an input audio file. Works with all four pipelines. The audio is encoded to latent space and frozen during denoising — the transformer's cross-attention reads the audio signal to guide video generation. ```bash -# A2V - generate video from audio +# A2V - distilled (default, fastest) uv run mlx_video.generate --audio-file music.wav --prompt "A band playing music" -# A2V with dev pipeline +# A2V - dev (single-stage with CFG) uv run mlx_video.generate --pipeline dev --audio-file ocean.wav --prompt "Ocean waves" +# A2V - dev-two-stage (dev + LoRA refinement) +uv run mlx_video.generate --pipeline dev-two-stage --audio-file music.wav \ + --prompt "A band playing music" --model-repo prince-canuma/LTX-2-dev + +# A2V - dev-two-stage-hq (highest quality) +uv run mlx_video.generate --pipeline dev-two-stage-hq --audio-file music.wav \ + --prompt "A band playing music" --model-repo prince-canuma/LTX-2-dev + # A2V + I2V (audio + image conditioning) uv run mlx_video.generate --audio-file rain.wav --image forest.jpg --prompt "Rain in forest" @@ -104,6 +112,8 @@ uv run mlx_video.generate --audio-file rain.wav --image forest.jpg --prompt "Rai uv run mlx_video.generate --audio-file song.mp3 --audio-start-time 30.0 --prompt "Concert" ``` +> **Note:** `--audio-file` (A2V) and `--audio` (generate audio) are mutually exclusive. Supported formats: WAV, FLAC, MP3, OGG, and video files with audio tracks. + ### Audio-Video Generation (experimental) Generate synchronized audio alongside video from scratch: diff --git a/pyproject.toml b/pyproject.toml index 7c10195..b20887a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ "Pillow>=10.3.0", "mlx-vlm", "rich>=14.2.0", + "librosa>=0.10.0", ] license = {text="MIT"} authors = [