Refactor video generation and model loading to use the `from_pretrained` methods of `VideoEncoder` and `VideoDecoder`. Add a `cfg_rescale` parameter to the denoising functions to reduce artifacts. Handle dtypes consistently across audio and video processing to improve precision and match PyTorch behavior.

This commit is contained in:
Prince Canuma
2026-01-23 17:39:02 +01:00
parent 02bfa228d9
commit df753312c7
3 changed files with 119 additions and 151 deletions

View File

@@ -10,24 +10,16 @@ from mlx_video.convert import (
# Audio VAE components
from mlx_video.models.ltx.audio_vae import (
AudioEncoder,
AudioDecoder,
Vocoder,
AudioProcessor,
decode_audio,
)
# Patchifiers
from mlx_video.components.patchifiers import (
VideoLatentPatchifier,
AudioPatchifier,
VideoLatentShape,
AudioLatentShape,
PerChannelStatistics,
)
# Conditioning
from mlx_video.conditioning import (
VideoConditionByKeyframeIndex,
VideoConditionByLatentIndex,
)
@@ -43,17 +35,12 @@ __all__ = [
"sanitize_audio_vae_weights",
"sanitize_vocoder_weights",
# Audio VAE
"AudioEncoder",
"AudioDecoder",
"Vocoder",
"AudioProcessor",
"decode_audio",
# Patchifiers
"VideoLatentPatchifier",
"AudioPatchifier",
"VideoLatentShape",
"AudioLatentShape",
"PerChannelStatistics",
# Conditioning
"VideoConditionByKeyframeIndex",
"VideoConditionByLatentIndex",
]