Refactor video generation and model loading to use the `from_pretrained` methods of `VideoEncoder` and `VideoDecoder`. Add a `cfg_rescale` parameter to the denoising functions to reduce artifacts. Handle dtypes consistently across audio and video processing, improving precision and matching PyTorch behavior.

2026-01-23 17:39:02 +01:00
parent 02bfa228d9
commit df753312c7
3 changed files with 119 additions and 151 deletions
--- a/mlx_video/__init__.py
+++ b/mlx_video/__init__.py
@@ -10,24 +10,16 @@ from mlx_video.convert import (

 # Audio VAE components
 from mlx_video.models.ltx.audio_vae import (
-    AudioEncoder,
    AudioDecoder,
    Vocoder,
-    AudioProcessor,
    decode_audio,
-)
-
-# Patchifiers
-from mlx_video.components.patchifiers import (
-    VideoLatentPatchifier,
    AudioPatchifier,
-    VideoLatentShape,
    AudioLatentShape,
+    PerChannelStatistics,
 )

 # Conditioning
 from mlx_video.conditioning import (
-    VideoConditionByKeyframeIndex,
    VideoConditionByLatentIndex,
 )

@@ -43,17 +35,12 @@ __all__ = [
    "sanitize_audio_vae_weights",
    "sanitize_vocoder_weights",
    # Audio VAE
-    "AudioEncoder",
    "AudioDecoder",
    "Vocoder",
-    "AudioProcessor",
    "decode_audio",
-    # Patchifiers
-    "VideoLatentPatchifier",
    "AudioPatchifier",
-    "VideoLatentShape",
    "AudioLatentShape",
+    "PerChannelStatistics",
    # Conditioning
-    "VideoConditionByKeyframeIndex",
    "VideoConditionByLatentIndex",
 ]