Add librosa dependency and enhance A2V documentation with additional pipeline options
This commit is contained in:
16
README.md
16
README.md
@@ -88,15 +88,23 @@ uv run mlx_video.generate --pipeline dev --prompt "Waves crashing" --image beach
|
|||||||
|
|
||||||
### Audio-to-Video (A2V)
|
### Audio-to-Video (A2V)
|
||||||
|
|
||||||
Generate video conditioned on an input audio file. The audio is encoded to latent space and frozen during denoising — the transformer's cross-attention reads the audio signal to guide video generation.
|
Generate video conditioned on an input audio file. Works with all four pipelines (distilled, dev, dev-two-stage, and dev-two-stage-hq). The audio is encoded to latent space and frozen during denoising — the transformer's cross-attention reads the audio signal to guide video generation.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# A2V - generate video from audio
|
# A2V - distilled (default, fastest)
|
||||||
uv run mlx_video.generate --audio-file music.wav --prompt "A band playing music"
|
uv run mlx_video.generate --audio-file music.wav --prompt "A band playing music"
|
||||||
|
|
||||||
# A2V with dev pipeline
|
# A2V - dev (single-stage with CFG)
|
||||||
uv run mlx_video.generate --pipeline dev --audio-file ocean.wav --prompt "Ocean waves"
|
uv run mlx_video.generate --pipeline dev --audio-file ocean.wav --prompt "Ocean waves"
|
||||||
|
|
||||||
|
# A2V - dev-two-stage (dev + LoRA refinement)
|
||||||
|
uv run mlx_video.generate --pipeline dev-two-stage --audio-file music.wav \
|
||||||
|
--prompt "A band playing music" --model-repo prince-canuma/LTX-2-dev
|
||||||
|
|
||||||
|
# A2V - dev-two-stage-hq (highest quality)
|
||||||
|
uv run mlx_video.generate --pipeline dev-two-stage-hq --audio-file music.wav \
|
||||||
|
--prompt "A band playing music" --model-repo prince-canuma/LTX-2-dev
|
||||||
|
|
||||||
# A2V + I2V (audio + image conditioning)
|
# A2V + I2V (audio + image conditioning)
|
||||||
uv run mlx_video.generate --audio-file rain.wav --image forest.jpg --prompt "Rain in forest"
|
uv run mlx_video.generate --audio-file rain.wav --image forest.jpg --prompt "Rain in forest"
|
||||||
|
|
||||||
@@ -104,6 +112,8 @@ uv run mlx_video.generate --audio-file rain.wav --image forest.jpg --prompt "Rai
|
|||||||
uv run mlx_video.generate --audio-file song.mp3 --audio-start-time 30.0 --prompt "Concert"
|
uv run mlx_video.generate --audio-file song.mp3 --audio-start-time 30.0 --prompt "Concert"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
> **Note:** `--audio-file` (A2V) and `--audio` (generate audio) are mutually exclusive. Supported input formats for `--audio-file`: WAV, FLAC, MP3, OGG, and video files with audio tracks.
|
||||||
|
|
||||||
### Audio-Video Generation (experimental)
|
### Audio-Video Generation (experimental)
|
||||||
|
|
||||||
Generate synchronized audio alongside video from scratch:
|
Generate synchronized audio alongside video from scratch:
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ dependencies = [
|
|||||||
"Pillow>=10.3.0",
|
"Pillow>=10.3.0",
|
||||||
"mlx-vlm",
|
"mlx-vlm",
|
||||||
"rich>=14.2.0",
|
"rich>=14.2.0",
|
||||||
|
"librosa>=0.10.0",
|
||||||
]
|
]
|
||||||
license = {text="MIT"}
|
license = {text="MIT"}
|
||||||
authors = [
|
authors = [
|
||||||
|
|||||||
Reference in New Issue
Block a user