Disable mid-block attention in the audio decoder configuration, and cast the vocoder's audio waveform to float32 before converting it to a NumPy array so that downstream audio processing sees a consistent dtype.

2026-01-19 17:05:59 +01:00
parent 8a2ea38c88
commit bbb3de6aa7
1 changed file with 2 additions and 1 deletion
--- a/mlx_video/generate.py
+++ b/mlx_video/generate.py
@@ -648,6 +648,7 @@ def load_audio_decoder(model_path: Path, pipeline: PipelineType):
        norm_type=NormType.PIXEL,
        causality_axis=CausalityAxis.HEIGHT,
        mel_bins=64,
+        mid_block_add_attention=False,  # Config says no attention in mid block
    )

    weight_file = model_path / ("ltx-2-19b-dev.safetensors" if pipeline == PipelineType.DEV else "ltx-2-19b-distilled.safetensors")
@@ -1277,7 +1278,7 @@ def generate_video(
            audio_waveform = vocoder(mel_spectrogram)
            mx.eval(audio_waveform)

-            audio_np = np.array(audio_waveform)
+            audio_np = np.array(audio_waveform.astype(mx.float32))
            if audio_np.ndim == 3:
                audio_np = audio_np[0]