Refactor generate.py so that temporal coordinates and position grids are processed in bfloat16, matching PyTorch's precision behavior. Update the denoise_dev_av function to apply standard ratio rescaling to both audio and video guidance, improving numerical fidelity and model compatibility. Also force double_precision_rope to False in LTXModelConfig (config.py) to match the behavior of the PyTorch configurator.

2026-03-12 21:26:38 +01:00
parent b07b1e3213
commit e0aafd72fc
3 changed files with 36 additions and 12 deletions
--- a/mlx_video/models/ltx/config.py
+++ b/mlx_video/models/ltx/config.py
@@ -147,6 +147,12 @@ class LTXModelConfig(BaseModelConfig):
        if self.audio_positional_embedding_max_pos is None:
            self.audio_positional_embedding_max_pos = [20]

+        # The PyTorch LTX-2 configurator has a bug: it reads "frequencies_precision"
+        # instead of "rope_double_precision" from the config, so double_precision_rope
+        # is always False in PyTorch regardless of what the config file says. The model
+        # was trained with this behavior, so we override any loaded value to match it.
+        self.double_precision_rope = False
+
        # Convert string enum values if loading from dict
        if isinstance(self.model_type, str):
            self.model_type = LTXModelType(self.model_type)