From 749762a0b98197351bf2ae3c140ccee5c1e04460 Mon Sep 17 00:00:00 2001 From: Prince Canuma Date: Sun, 18 Jan 2026 21:55:38 +0100 Subject: [PATCH] Update audio decoder configuration to use an empty set for attention resolutions in both generate_av.py and generate_dev.py. Add a print statement for loading audio VAE decoder weights in generate_dev.py. --- mlx_video/generate_av.py | 2 +- mlx_video/generate_dev.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mlx_video/generate_av.py b/mlx_video/generate_av.py index e0fb22b..56d182a 100644 --- a/mlx_video/generate_av.py +++ b/mlx_video/generate_av.py @@ -254,7 +254,7 @@ def load_audio_decoder(model_path: Path): out_ch=2, # stereo ch_mult=(1, 2, 4), num_res_blocks=2, - attn_resolutions={8, 16, 32}, + attn_resolutions=set(), # PyTorch uses empty set (no attention in audio decoder) resolution=256, z_channels=AUDIO_LATENT_CHANNELS, norm_type=NormType.PIXEL, diff --git a/mlx_video/generate_dev.py b/mlx_video/generate_dev.py index 791c9ba..1d2a041 100644 --- a/mlx_video/generate_dev.py +++ b/mlx_video/generate_dev.py @@ -275,7 +275,7 @@ def load_audio_decoder(model_path: Path): out_ch=2, # stereo ch_mult=(1, 2, 4), num_res_blocks=2, - attn_resolutions={8, 16, 32}, + attn_resolutions=set(), # PyTorch uses empty set (no attention in audio decoder) resolution=256, z_channels=AUDIO_LATENT_CHANNELS, norm_type=NormType.PIXEL, @@ -289,6 +289,7 @@ def load_audio_decoder(model_path: Path): weight_file = model_path / "ltx-2-19b-distilled.safetensors" if weight_file.exists(): + print(f"Loading audio VAE decoder from {weight_file}...") raw_weights = mx.load(str(weight_file)) sanitized = sanitize_audio_vae_weights(raw_weights) if sanitized: