format

2026-03-18 17:40:05 +01:00
parent 78bcfba31b
commit 17397da70c
77 changed files with 4125 additions and 1655 deletions
--- a/mlx_video/models/ltx_2/__init__.py
+++ b/mlx_video/models/ltx_2/__init__.py
@@ -1,8 +1,7 @@
-
+from mlx_video.models.ltx_2.audio_vae import AudioDecoder, Vocoder, decode_audio
 from mlx_video.models.ltx_2.config import (
    LTXModelConfig,
-    TransformerConfig,
    LTXModelType,
+    TransformerConfig,
 )
 from mlx_video.models.ltx_2.ltx import LTXModel, X0Model
-from mlx_video.models.ltx_2.audio_vae import AudioDecoder, Vocoder, decode_audio
--- a/mlx_video/models/ltx_2/adaln.py
+++ b/mlx_video/models/ltx_2/adaln.py
@@ -8,7 +8,6 @@ from mlx_video.utils import get_timestep_embedding

 class AdaLayerNormSingle(nn.Module):

-
    def __init__(
        self,
        embedding_dim: int,
@@ -24,7 +23,9 @@ class AdaLayerNormSingle(nn.Module):
        )

        self.silu = nn.SiLU()
-        self.linear = nn.Linear(embedding_dim, embedding_coefficient * embedding_dim, bias=True)
+        self.linear = nn.Linear(
+            embedding_dim, embedding_coefficient * embedding_dim, bias=True
+        )

    def __call__(
        self,
@@ -56,15 +57,19 @@ class PixArtAlphaCombinedTimestepSizeEmbeddings(nn.Module):
        use_additional_conditions: bool = False,
        timestep_proj_dim: int = 256,
    ):
-        
+
        super().__init__()

        self.embedding_dim = embedding_dim
        self.size_emb_dim = size_emb_dim
        self.use_additional_conditions = use_additional_conditions

-        self.time_proj = Timesteps(timestep_proj_dim, flip_sin_to_cos=True, downscale_freq_shift=0)
-        self.timestep_embedder = TimestepEmbedding(timestep_proj_dim, embedding_dim, out_dim=embedding_dim)
+        self.time_proj = Timesteps(
+            timestep_proj_dim, flip_sin_to_cos=True, downscale_freq_shift=0
+        )
+        self.timestep_embedder = TimestepEmbedding(
+            timestep_proj_dim, embedding_dim, out_dim=embedding_dim
+        )

        if use_additional_conditions and size_emb_dim > 0:
            self.additional_embedder = ConditionEmbedding(size_emb_dim, embedding_dim)
@@ -87,7 +92,9 @@ class PixArtAlphaCombinedTimestepSizeEmbeddings(nn.Module):
        # Add additional conditions if enabled
        if self.use_additional_conditions and self.size_emb_dim > 0:
            if resolution is not None and aspect_ratio is not None:
-                additional_embeds = self.additional_embedder(resolution, aspect_ratio, hidden_dtype)
+                additional_embeds = self.additional_embedder(
+                    resolution, aspect_ratio, hidden_dtype
+                )
                timesteps_emb = timesteps_emb + additional_embeds

        return timesteps_emb
--- a/mlx_video/models/ltx_2/audio_vae/__init__.py
+++ b/mlx_video/models/ltx_2/audio_vae/__init__.py
@@ -1,10 +1,10 @@
 """Audio VAE module for LTX-2 audio generation."""

-from .attention import AttentionType, AttnBlock, make_attn
-from .audio_vae import AudioDecoder, AudioEncoder, decode_audio
-from .audio_processor import load_audio, ensure_stereo, waveform_to_mel
-from .causal_conv_2d import CausalConv2d, make_conv2d
 from ..config import CausalityAxis
+from .attention import AttentionType, AttnBlock, make_attn
+from .audio_processor import ensure_stereo, load_audio, waveform_to_mel
+from .audio_vae import AudioDecoder, AudioEncoder, decode_audio
+from .causal_conv_2d import CausalConv2d, make_conv2d
 from .downsample import Downsample, build_downsampling_path
 from .normalization import NormType, PixelNorm, build_normalization_layer
 from .ops import AudioLatentShape, AudioPatchifier, PerChannelStatistics
--- a/mlx_video/models/ltx_2/audio_vae/attention.py
+++ b/mlx_video/models/ltx_2/audio_vae/attention.py
@@ -32,7 +32,9 @@ class AttnBlock(nn.Module):
        self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
-        self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.proj_out = nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )

    def __call__(self, x: mx.array) -> mx.array:
        """
@@ -103,6 +105,8 @@ def make_attn(
    elif attn_type == AttentionType.NONE:
        return Identity()
    elif attn_type == AttentionType.LINEAR:
-        raise NotImplementedError(f"Attention type {attn_type.value} is not supported yet.")
+        raise NotImplementedError(
+            f"Attention type {attn_type.value} is not supported yet."
+        )
    else:
        raise ValueError(f"Unknown attention type: {attn_type}")
--- a/mlx_video/models/ltx_2/audio_vae/audio_processor.py
+++ b/mlx_video/models/ltx_2/audio_vae/audio_processor.py
@@ -4,10 +4,9 @@ Matches the PyTorch AudioProcessor from LTX-2 (torchaudio.transforms.MelSpectrog
 using librosa for macOS/MLX compatibility.
 """

-from pathlib import Path

-import numpy as np
 import mlx.core as mx
+import numpy as np


 def load_audio(
@@ -99,14 +98,16 @@ def waveform_to_mel(

    for ch in range(channels):
        # Magnitude spectrogram (power=1.0)
-        S = np.abs(librosa.stft(
-            waveform[ch],
-            n_fft=n_fft,
-            hop_length=hop_length,
-            win_length=win_length,
-            center=True,
-            pad_mode="reflect",
-        ))
+        S = np.abs(
+            librosa.stft(
+                waveform[ch],
+                n_fft=n_fft,
+                hop_length=hop_length,
+                win_length=win_length,
+                center=True,
+                pad_mode="reflect",
+            )
+        )

        # Mel filterbank with slaney normalization
        mel_basis = librosa.filters.mel(
--- a/mlx_video/models/ltx_2/audio_vae/audio_vae.py
+++ b/mlx_video/models/ltx_2/audio_vae/audio_vae.py
@@ -1,15 +1,15 @@
 """Audio VAE encoder and decoder for LTX-2."""

-from typing import Dict
 from pathlib import Path
+from typing import Dict

 import mlx.core as mx
 import mlx.nn as nn
 from mlx_vlm.models.base import check_array_shape
-from ..config import AudioDecoderModelConfig, AudioEncoderModelConfig
+
+from ..config import AudioDecoderModelConfig, AudioEncoderModelConfig, CausalityAxis
 from .attention import AttentionType, make_attn
 from .causal_conv_2d import make_conv2d
-from ..config import CausalityAxis
 from .downsample import build_downsampling_path
 from .normalization import NormType, build_normalization_layer
 from .ops import AudioLatentShape, AudioPatchifier, PerChannelStatistics
@@ -39,7 +39,9 @@ def build_mid_block(
        causality_axis=causality_axis,
    )
    mid["attn_1"] = (
-        make_attn(channels, attn_type=attn_type, norm_type=norm_type) if add_attention else None
+        make_attn(channels, attn_type=attn_type, norm_type=norm_type)
+        if add_attention
+        else None
    )
    mid["block_2"] = ResnetBlock(
        in_channels=channels,
@@ -93,7 +95,10 @@ class AudioEncoder(nn.Module):
        self.attn_type = config.attn_type

        self.conv_in = make_conv2d(
-            config.in_channels, self.ch, kernel_size=3, stride=1,
+            config.in_channels,
+            self.ch,
+            kernel_size=3,
+            stride=1,
            causality_axis=self.causality_axis,
        )

@@ -125,7 +130,10 @@ class AudioEncoder(nn.Module):
        self.norm_out = build_normalization_layer(block_in, normtype=self.norm_type)
        out_channels = 2 * config.z_channels if config.double_z else config.z_channels
        self.conv_out = make_conv2d(
-            block_in, out_channels, kernel_size=3, stride=1,
+            block_in,
+            out_channels,
+            kernel_size=3,
+            stride=1,
            causality_axis=self.causality_axis,
        )

@@ -160,7 +168,11 @@ class AudioEncoder(nn.Module):
                continue

            if "conv" in new_key.lower() and "weight" in new_key and value.ndim == 4:
-                value = value if check_array_shape(value) else mx.transpose(value, (0, 2, 3, 1))
+                value = (
+                    value
+                    if check_array_shape(value)
+                    else mx.transpose(value, (0, 2, 3, 1))
+                )

            sanitized[new_key] = value
        return sanitized
@@ -168,11 +180,14 @@ class AudioEncoder(nn.Module):
    @classmethod
    def from_pretrained(cls, model_path: Path) -> "AudioEncoder":
        """Load audio encoder from pretrained weights."""
-        from mlx_video.models.ltx_2.config import AudioEncoderModelConfig
        import json

+        from mlx_video.models.ltx_2.config import AudioEncoderModelConfig
+
        model_path = Path(model_path)
-        config = AudioEncoderModelConfig.from_dict(json.load(open(model_path / "config.json")))
+        config = AudioEncoderModelConfig.from_dict(
+            json.load(open(model_path / "config.json"))
+        )
        encoder = cls(config)
        weights = mx.load(str(model_path / "model.safetensors"))
        encoder.load_weights(list(weights.items()), strict=True)
@@ -265,7 +280,6 @@ class AudioDecoder(nn.Module):
        """
        super().__init__()

-
        # Per-channel statistics for denormalizing latents
        # Uses ch (base channel count) to match the patchified latent dimension
        # Input latent shape: (B, z_channels, T, latent_mel_bins) = (B, 8, T, 16)
@@ -305,7 +319,11 @@ class AudioDecoder(nn.Module):
        self.z_shape = (1, config.z_channels, base_resolution, base_resolution)

        self.conv_in = make_conv2d(
-            config.z_channels, base_block_channels, kernel_size=3, stride=1, causality_axis=self.causality_axis
+            config.z_channels,
+            base_block_channels,
+            kernel_size=3,
+            stride=1,
+            causality_axis=self.causality_axis,
        )

        self.mid = build_mid_block(
@@ -334,9 +352,15 @@ class AudioDecoder(nn.Module):
            initial_block_channels=base_block_channels,
        )

-        self.norm_out = build_normalization_layer(final_block_channels, normtype=self.norm_type)
+        self.norm_out = build_normalization_layer(
+            final_block_channels, normtype=self.norm_type
+        )
        self.conv_out = make_conv2d(
-            final_block_channels, config.out_ch, kernel_size=3, stride=1, causality_axis=self.causality_axis
+            final_block_channels,
+            config.out_ch,
+            kernel_size=3,
+            stride=1,
+            causality_axis=self.causality_axis,
        )

    def sanitize(self, weights: Dict[str, mx.array]) -> Dict[str, mx.array]:
@@ -371,7 +395,11 @@ class AudioDecoder(nn.Module):
            # PyTorch: (out_channels, in_channels, H, W)
            # MLX: (out_channels, H, W, in_channels)
            if "conv" in new_key.lower() and "weight" in new_key and value.ndim == 4:
-                value = value if check_array_shape(value) else mx.transpose(value, (0, 2, 3, 1))
+                value = (
+                    value
+                    if check_array_shape(value)
+                    else mx.transpose(value, (0, 2, 3, 1))
+                )

            sanitized[new_key] = value

@@ -380,17 +408,19 @@ class AudioDecoder(nn.Module):
    @classmethod
    def from_pretrained(cls, model_path: Path) -> "AudioDecoder":
        """Load audio VAE decoder from pretrained model."""
-        from mlx_video.models.ltx_2.config import AudioDecoderModelConfig
        import json

-        config = AudioDecoderModelConfig.from_dict(json.load(open(model_path / "config.json")))
+        from mlx_video.models.ltx_2.config import AudioDecoderModelConfig
+
+        config = AudioDecoderModelConfig.from_dict(
+            json.load(open(model_path / "config.json"))
+        )
        decoder = cls(config)
        weights = mx.load(str(model_path / "model.safetensors"))
        # weights = decoder.sanitize(weights)
        decoder.load_weights(list(weights.items()), strict=True)
        return decoder

-
    def __call__(self, sample: mx.array) -> mx.array:
        """
        Decode latent features back to audio spectrograms.
@@ -414,7 +444,9 @@ class AudioDecoder(nn.Module):

        return self._adjust_output_shape(h, target_shape)

-    def _denormalize_latents(self, sample: mx.array) -> tuple[mx.array, AudioLatentShape]:
+    def _denormalize_latents(
+        self, sample: mx.array
+    ) -> tuple[mx.array, AudioLatentShape]:
        """Denormalize latents using per-channel statistics."""
        # sample shape: (B, H, W, C) in MLX format
        latent_shape = AudioLatentShape(
@@ -436,7 +468,9 @@ class AudioDecoder(nn.Module):
            batch=latent_shape.batch,
            channels=self.out_ch,
            frames=target_frames,
-            mel_bins=self.mel_bins if self.mel_bins is not None else latent_shape.mel_bins,
+            mel_bins=(
+                self.mel_bins if self.mel_bins is not None else latent_shape.mel_bins
+            ),
        )

        return sample, target_shape
@@ -462,7 +496,10 @@ class AudioDecoder(nn.Module):

        # Step 1: Crop first to avoid exceeding target dimensions
        decoded_output = decoded_output[
-            :, : min(current_time, target_time), : min(current_freq, target_freq), :target_channels
+            :,
+            : min(current_time, target_time),
+            : min(current_freq, target_freq),
+            :target_channels,
        ]

        # Step 2: Calculate padding needed for time and frequency dimensions
@@ -514,7 +551,9 @@ class AudioDecoder(nn.Module):
        return mx.tanh(h) if self.tanh_out else h


-def decode_audio(latent: mx.array, audio_decoder: AudioDecoder, vocoder: "Vocoder") -> mx.array:
+def decode_audio(
+    latent: mx.array, audio_decoder: AudioDecoder, vocoder: "Vocoder"
+) -> mx.array:
    """
    Decode an audio latent representation using the provided audio decoder and vocoder.
    Args:
--- a/mlx_video/models/ltx_2/audio_vae/causal_conv_2d.py
+++ b/mlx_video/models/ltx_2/audio_vae/causal_conv_2d.py
@@ -53,8 +53,16 @@ class CausalConv2d(nn.Module):
        # For (N, H, W, C) format: axis 1 is H (height), axis 2 is W (width)
        if self.causality_axis == CausalityAxis.NONE:
            # Non-causal: symmetric padding
-            self.padding = (pad_h // 2, pad_h - pad_h // 2, pad_w // 2, pad_w - pad_w // 2)
-        elif self.causality_axis in (CausalityAxis.WIDTH, CausalityAxis.WIDTH_COMPATIBILITY):
+            self.padding = (
+                pad_h // 2,
+                pad_h - pad_h // 2,
+                pad_w // 2,
+                pad_w - pad_w // 2,
+            )
+        elif self.causality_axis in (
+            CausalityAxis.WIDTH,
+            CausalityAxis.WIDTH_COMPATIBILITY,
+        ):
            # Causal on width: pad left (before width axis)
            self.padding = (pad_h // 2, pad_h - pad_h // 2, pad_w, 0)
        elif self.causality_axis == CausalityAxis.HEIGHT:
@@ -90,7 +98,10 @@ class CausalConv2d(nn.Module):
        if any(p > 0 for p in self.padding):
            # MLX pad expects: [(before_0, after_0), (before_1, after_1), ...]
            # For (N, H, W, C): axis 0=N, axis 1=H, axis 2=W, axis 3=C
-            x = mx.pad(x, [(0, 0), (pad_h_top, pad_h_bottom), (pad_w_left, pad_w_right), (0, 0)])
+            x = mx.pad(
+                x,
+                [(0, 0), (pad_h_top, pad_h_bottom), (pad_w_left, pad_w_right), (0, 0)],
+            )

        return self.conv(x)

@@ -124,7 +135,14 @@ def make_conv2d(
    if causality_axis is not None:
        # For causal convolution, padding is handled internally by CausalConv2d
        return CausalConv2d(
-            in_channels, out_channels, kernel_size, stride, dilation, groups, bias, causality_axis
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            dilation,
+            groups,
+            bias,
+            causality_axis,
        )
    else:
        # For non-causal convolution, use symmetric padding if not specified
--- a/mlx_video/models/ltx_2/audio_vae/downsample.py
+++ b/mlx_video/models/ltx_2/audio_vae/downsample.py
@@ -5,8 +5,8 @@ from typing import Set, Tuple
 import mlx.core as mx
 import mlx.nn as nn

-from .attention import AttentionType, make_attn
 from ..config import CausalityAxis
+from .attention import AttentionType, make_attn
 from .normalization import NormType
 from .resnet import ResnetBlock

@@ -34,7 +34,9 @@ class Downsample(nn.Module):
        if self.with_conv:
            # Do time downsampling here
            # no asymmetric padding in MLX conv, must do it ourselves
-            self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
+            self.conv = nn.Conv2d(
+                in_channels, in_channels, kernel_size=3, stride=2, padding=0
+            )

    def __call__(self, x: mx.array) -> mx.array:
        """
@@ -116,10 +118,14 @@ def build_downsampling_path(
            )
            block_in = block_out
            if curr_res in attn_resolutions:
-                stage["attn"][i_block] = make_attn(block_in, attn_type=attn_type, norm_type=norm_type)
+                stage["attn"][i_block] = make_attn(
+                    block_in, attn_type=attn_type, norm_type=norm_type
+                )

        if i_level != num_resolutions - 1:
-            stage["downsample"] = Downsample(block_in, resamp_with_conv, causality_axis=causality_axis)
+            stage["downsample"] = Downsample(
+                block_in, resamp_with_conv, causality_axis=causality_axis
+            )
            curr_res = curr_res // 2

        down_modules[i_level] = stage
--- a/mlx_video/models/ltx_2/audio_vae/normalization.py
+++ b/mlx_video/models/ltx_2/audio_vae/normalization.py
@@ -51,7 +51,9 @@ def build_normalization_layer(
        A normalization layer
    """
    if normtype == NormType.GROUP:
-        return nn.GroupNorm(num_groups=num_groups, dims=in_channels, eps=1e-6, affine=True)
+        return nn.GroupNorm(
+            num_groups=num_groups, dims=in_channels, eps=1e-6, affine=True
+        )
    if normtype == NormType.PIXEL:
        # For MLX channels-last format (B, H, W, C), normalize along channels (dim=-1)
        # PyTorch uses dim=1 for channels-first format (B, C, H, W)
--- a/mlx_video/models/ltx_2/audio_vae/resnet.py
+++ b/mlx_video/models/ltx_2/audio_vae/resnet.py
@@ -1,12 +1,12 @@
 """ResNet blocks for audio VAE and vocoder."""

-from typing import List, Tuple
+from typing import Tuple

 import mlx.core as mx
 import mlx.nn as nn

-from .causal_conv_2d import make_conv2d
 from ..config import CausalityAxis
+from .causal_conv_2d import make_conv2d
 from .normalization import NormType, build_normalization_layer

 LRELU_SLOPE = 0.1
@@ -125,7 +125,11 @@ class ResnetBlock(nn.Module):

        self.norm1 = build_normalization_layer(in_channels, normtype=norm_type)
        self.conv1 = make_conv2d(
-            in_channels, out_channels, kernel_size=3, stride=1, causality_axis=causality_axis
+            in_channels,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            causality_axis=causality_axis,
        )

        if temb_channels > 0:
@@ -134,17 +138,29 @@ class ResnetBlock(nn.Module):
        self.norm2 = build_normalization_layer(out_channels, normtype=norm_type)
        self.dropout_rate = dropout
        self.conv2 = make_conv2d(
-            out_channels, out_channels, kernel_size=3, stride=1, causality_axis=causality_axis
+            out_channels,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            causality_axis=causality_axis,
        )

        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                self.conv_shortcut = make_conv2d(
-                    in_channels, out_channels, kernel_size=3, stride=1, causality_axis=causality_axis
+                    in_channels,
+                    out_channels,
+                    kernel_size=3,
+                    stride=1,
+                    causality_axis=causality_axis,
                )
            else:
                self.nin_shortcut = make_conv2d(
-                    in_channels, out_channels, kernel_size=1, stride=1, causality_axis=causality_axis
+                    in_channels,
+                    out_channels,
+                    kernel_size=1,
+                    stride=1,
+                    causality_axis=causality_axis,
                )

    def __call__(
@@ -168,7 +184,9 @@ class ResnetBlock(nn.Module):
        if temb is not None and self.temb_channels > 0:
            # temb: (B, temb_channels) -> (B, out_channels)
            # Need to add spatial dims: (B, 1, 1, out_channels) for broadcasting
-            h = h + mx.expand_dims(mx.expand_dims(nn.silu(self.temb_proj(temb)), axis=1), axis=1)
+            h = h + mx.expand_dims(
+                mx.expand_dims(nn.silu(self.temb_proj(temb)), axis=1), axis=1
+            )

        h = self.norm2(h)
        h = nn.silu(h)
--- a/mlx_video/models/ltx_2/audio_vae/upsample.py
+++ b/mlx_video/models/ltx_2/audio_vae/upsample.py
@@ -5,9 +5,9 @@ from typing import Set, Tuple
 import mlx.core as mx
 import mlx.nn as nn

+from ..config import CausalityAxis
 from .attention import AttentionType, make_attn
 from .causal_conv_2d import make_conv2d
-from ..config import CausalityAxis
 from .normalization import NormType
 from .resnet import ResnetBlock

@@ -42,7 +42,11 @@ class Upsample(nn.Module):
        self.causality_axis = causality_axis
        if self.with_conv:
            self.conv = make_conv2d(
-                in_channels, in_channels, kernel_size=3, stride=1, causality_axis=causality_axis
+                in_channels,
+                in_channels,
+                kernel_size=3,
+                stride=1,
+                causality_axis=causality_axis,
            )

    def __call__(self, x: mx.array) -> mx.array:
@@ -124,10 +128,14 @@ def build_upsampling_path(
            )
            block_in = block_out
            if curr_res in attn_resolutions:
-                stage["attn"][i_block] = make_attn(block_in, attn_type=attn_type, norm_type=norm_type)
+                stage["attn"][i_block] = make_attn(
+                    block_in, attn_type=attn_type, norm_type=norm_type
+                )

        if level != 0:
-            stage["upsample"] = Upsample(block_in, resamp_with_conv, causality_axis=causality_axis)
+            stage["upsample"] = Upsample(
+                block_in, resamp_with_conv, causality_axis=causality_axis
+            )
            curr_res *= 2

        up_modules[level] = stage
--- a/mlx_video/models/ltx_2/audio_vae/vocoder.py
+++ b/mlx_video/models/ltx_2/audio_vae/vocoder.py
@@ -7,8 +7,8 @@ Supports:
 """

 import math
-from typing import List, Tuple
 from pathlib import Path
+from typing import Tuple

 import mlx.core as mx
 import mlx.nn as nn
@@ -32,7 +32,9 @@ class Snake(nn.Module):
    def __init__(self, in_features: int, alpha_logscale: bool = True) -> None:
        super().__init__()
        self.alpha_logscale = alpha_logscale
-        self.alpha = mx.zeros((in_features,)) if alpha_logscale else mx.ones((in_features,))
+        self.alpha = (
+            mx.zeros((in_features,)) if alpha_logscale else mx.ones((in_features,))
+        )

    def __call__(self, x: mx.array) -> mx.array:
        # x: (N, L, C) in MLX format
@@ -48,8 +50,12 @@ class SnakeBeta(nn.Module):
    def __init__(self, in_features: int, alpha_logscale: bool = True) -> None:
        super().__init__()
        self.alpha_logscale = alpha_logscale
-        self.alpha = mx.zeros((in_features,)) if alpha_logscale else mx.ones((in_features,))
-        self.beta = mx.zeros((in_features,)) if alpha_logscale else mx.ones((in_features,))
+        self.alpha = (
+            mx.zeros((in_features,)) if alpha_logscale else mx.ones((in_features,))
+        )
+        self.beta = (
+            mx.zeros((in_features,)) if alpha_logscale else mx.ones((in_features,))
+        )

    def __call__(self, x: mx.array) -> mx.array:
        alpha = self.alpha
@@ -73,7 +79,9 @@ def _sinc(x: mx.array) -> mx.array:
    )


-def kaiser_sinc_filter1d(cutoff: float, half_width: float, kernel_size: int) -> mx.array:
+def kaiser_sinc_filter1d(
+    cutoff: float, half_width: float, kernel_size: int
+) -> mx.array:
    """Compute a Kaiser-windowed sinc filter."""
    even = kernel_size % 2 == 0
    half_size = kernel_size // 2
@@ -88,6 +96,7 @@ def kaiser_sinc_filter1d(cutoff: float, half_width: float, kernel_size: int) ->

    # Kaiser window - compute using scipy-compatible formula
    import numpy as np
+
    window = mx.array(np.kaiser(kernel_size, beta).astype(np.float32))

    if even:
@@ -107,6 +116,7 @@ def kaiser_sinc_filter1d(cutoff: float, half_width: float, kernel_size: int) ->
 def hann_sinc_filter1d(ratio: int) -> Tuple[mx.array, int, int, int]:
    """Compute a Hann-windowed sinc filter for upsampling (used by BWE resampler)."""
    import numpy as np
+
    rolloff = 0.99
    lowpass_filter_width = 6
    width = math.ceil(lowpass_filter_width / rolloff)
@@ -187,10 +197,16 @@ class UpSample1d(nn.Module):
            self.kernel_size = filt.shape[2]
            self.filter = filt
        else:
-            self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+            self.kernel_size = (
+                int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+            )
            self.pad = self.kernel_size // ratio - 1
-            self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
-            self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
+            self.pad_left = (
+                self.pad * self.stride + (self.kernel_size - self.stride) // 2
+            )
+            self.pad_right = (
+                self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
+            )
            self.filter = kaiser_sinc_filter1d(
                cutoff=0.5 / ratio,
                half_width=0.6 / ratio,
@@ -215,10 +231,12 @@ class UpSample1d(nn.Module):
        filt = self.filter.astype(x.dtype)  # (1, 1, K)
        filt = mx.transpose(filt, (0, 2, 1))  # (1, K, 1)

-        x = self.ratio * mx.conv_transpose1d(x, filt, stride=self.stride)  # (N*C, L', 1)
+        x = self.ratio * mx.conv_transpose1d(
+            x, filt, stride=self.stride
+        )  # (N*C, L', 1)

        # Trim padding
-        x = x[:, self.pad_left:-self.pad_right, :]
+        x = x[:, self.pad_left : -self.pad_right, :]

        x = x.reshape(n, c, -1)  # (N, C, L')
        x = mx.transpose(x, (0, 2, 1))  # (N, L', C)
@@ -285,16 +303,24 @@ class AMPBlock1(nn.Module):

        self.convs1 = {
            i: nn.Conv1d(
-                channels, channels, kernel_size, stride=1,
-                dilation=d, padding=get_padding(kernel_size, d),
+                channels,
+                channels,
+                kernel_size,
+                stride=1,
+                dilation=d,
+                padding=get_padding(kernel_size, d),
            )
            for i, d in enumerate(dilation)
        }

        self.convs2 = {
            i: nn.Conv1d(
-                channels, channels, kernel_size, stride=1,
-                dilation=1, padding=get_padding(kernel_size, 1),
+                channels,
+                channels,
+                kernel_size,
+                stride=1,
+                dilation=1,
+                padding=get_padding(kernel_size, 1),
            )
            for i in range(len(dilation))
        }
@@ -348,7 +374,9 @@ class STFTFn(nn.Module):
            y = mx.concatenate([first, y], axis=1)

        # forward_basis: (514, 1, 512) PyTorch format -> (514, 512, 1) MLX
-        basis = mx.transpose(self.forward_basis.astype(y.dtype), (0, 2, 1))  # (514, K, 1)
+        basis = mx.transpose(
+            self.forward_basis.astype(y.dtype), (0, 2, 1)
+        )  # (514, K, 1)

        # Conv1d: (B, T, 1) * (514, K, 1) -> (B, T_frames, 514)
        spec = mx.conv1d(y, basis, stride=self.hop_length)
@@ -358,8 +386,10 @@ class STFTFn(nn.Module):
        real = spec[..., :n_freqs]
        imag = spec[..., n_freqs:]

-        magnitude = mx.sqrt(real ** 2 + imag ** 2)
-        phase = mx.arctan2(imag.astype(mx.float32), real.astype(mx.float32)).astype(real.dtype)
+        magnitude = mx.sqrt(real**2 + imag**2)
+        phase = mx.arctan2(imag.astype(mx.float32), real.astype(mx.float32)).astype(
+            real.dtype
+        )

        # Output: (B, T_frames, n_freqs) in MLX channels-last
        return magnitude, phase
@@ -368,7 +398,9 @@ class STFTFn(nn.Module):
 class MelSTFT(nn.Module):
    """Causal log-mel spectrogram from precomputed STFT bases."""

-    def __init__(self, filter_length: int, hop_length: int, win_length: int, n_mel_channels: int) -> None:
+    def __init__(
+        self, filter_length: int, hop_length: int, win_length: int, n_mel_channels: int
+    ) -> None:
        super().__init__()
        self.stft_fn = STFTFn(filter_length, hop_length, win_length)
        n_freqs = filter_length // 2 + 1
@@ -385,7 +417,9 @@ class MelSTFT(nn.Module):
        """
        magnitude, phase = self.stft_fn(y)
        # magnitude: (B, T_frames, n_freqs)
-        mel = magnitude @ self.mel_basis.astype(magnitude.dtype).T  # (B, T_frames, n_mels)
+        mel = (
+            magnitude @ self.mel_basis.astype(magnitude.dtype).T
+        )  # (B, T_frames, n_mels)
        log_mel = mx.log(mx.clip(mel, 1e-5, None))
        # Transpose to (B, n_mels, T_frames) for compatibility with vocoder input format
        return mx.transpose(log_mel, (0, 2, 1))
@@ -415,8 +449,11 @@ class Vocoder(nn.Module):

        in_channels = 128 if config.stereo else 64
        self.conv_pre = nn.Conv1d(
-            in_channels, config.upsample_initial_channel,
-            kernel_size=7, stride=1, padding=3,
+            in_channels,
+            config.upsample_initial_channel,
+            kernel_size=7,
+            stride=1,
+            padding=3,
        )

        # Upsampling layers
@@ -424,11 +461,13 @@ class Vocoder(nn.Module):
        for i, (stride, kernel_size) in enumerate(
            zip(config.upsample_rates, config.upsample_kernel_sizes)
        ):
-            in_ch = config.upsample_initial_channel // (2 ** i)
+            in_ch = config.upsample_initial_channel // (2**i)
            out_ch = config.upsample_initial_channel // (2 ** (i + 1))
            self.ups[i] = nn.ConvTranspose1d(
-                in_ch, out_ch,
-                kernel_size=kernel_size, stride=stride,
+                in_ch,
+                out_ch,
+                kernel_size=kernel_size,
+                stride=stride,
                padding=(kernel_size - stride) // 2,
            )

@@ -442,7 +481,9 @@ class Vocoder(nn.Module):
                    config.resblock_kernel_sizes, config.resblock_dilation_sizes
                ):
                    self.resblocks[block_idx] = AMPBlock1(
-                        ch, kernel_size, tuple(dilations),
+                        ch,
+                        kernel_size,
+                        tuple(dilations),
                        activation=config.activation,
                    )
                    block_idx += 1
@@ -455,10 +496,14 @@ class Vocoder(nn.Module):
                for kernel_size, dilations in zip(
                    config.resblock_kernel_sizes, config.resblock_dilation_sizes
                ):
-                    self.resblocks[block_idx] = resblock_class(ch, kernel_size, tuple(dilations))
+                    self.resblocks[block_idx] = resblock_class(
+                        ch, kernel_size, tuple(dilations)
+                    )
                    block_idx += 1

-        final_channels = config.upsample_initial_channel // (2 ** len(config.upsample_rates))
+        final_channels = config.upsample_initial_channel // (
+            2 ** len(config.upsample_rates)
+        )

        # Post-activation
        if self.is_amp:
@@ -468,8 +513,11 @@ class Vocoder(nn.Module):
        # Final conv
        out_channels = 2 if config.stereo else 1
        self.conv_post = nn.Conv1d(
-            final_channels, out_channels,
-            kernel_size=7, stride=1, padding=3,
+            final_channels,
+            out_channels,
+            kernel_size=7,
+            stride=1,
+            padding=3,
            bias=config.use_bias_at_final,
        )

@@ -588,7 +636,9 @@ class VocoderWithBWE(nn.Module):
        """
        x = self.vocoder(mel_spec)  # (B, C, T) at input_sampling_rate
        _, _, length_low_rate = x.shape
-        output_length = length_low_rate * self.output_sampling_rate // self.input_sampling_rate
+        output_length = (
+            length_low_rate * self.output_sampling_rate // self.input_sampling_rate
+        )

        # Pad to hop_length multiple
        remainder = length_low_rate % self.hop_length
@@ -685,5 +735,3 @@ def _load_vocoder_with_bwe(config_dict: dict, weights: dict) -> VocoderWithBWE:

    model.load_weights(list(weights.items()), strict=False)
    return model
-
-
--- a/mlx_video/models/ltx_2/conditioning/__init__.py
+++ b/mlx_video/models/ltx_2/conditioning/__init__.py
@@ -1,3 +1,6 @@
 """Conditioning modules for LTX-2 video generation."""

-from mlx_video.models.ltx_2.conditioning.latent import VideoConditionByLatentIndex, apply_conditioning
+from mlx_video.models.ltx_2.conditioning.latent import (
+    VideoConditionByLatentIndex,
+    apply_conditioning,
+)
--- a/mlx_video/models/ltx_2/conditioning/latent.py
+++ b/mlx_video/models/ltx_2/conditioning/latent.py
@@ -5,7 +5,7 @@ the video generation process at specific frame positions.
 """

 from dataclasses import dataclass
-from typing import Optional, List, Tuple
+from typing import List, Optional, Tuple

 import mlx.core as mx

@@ -22,6 +22,7 @@ class VideoConditionByLatentIndex:
        frame_idx: Frame index to condition (0 = first frame)
        strength: Denoising strength (1.0 = full denoise, 0.0 = keep original)
    """
+
    latent: mx.array
    frame_idx: int = 0
    strength: float = 1.0
@@ -41,6 +42,7 @@ class LatentState:
        denoise_mask: Per-frame denoising mask (B, 1, F, 1, 1) where
                      1.0 = full denoise, 0.0 = keep clean
    """
+
    latent: mx.array
    clean_latent: mx.array
    denoise_mask: mx.array
@@ -130,15 +132,15 @@ def apply_conditioning(
            if frame_idx <= i < end_idx:
                # Use conditioning latent
                cond_idx = i - frame_idx
-                latent_list.append(cond_latent[:, :, cond_idx:cond_idx+1])
-                clean_list.append(cond_latent[:, :, cond_idx:cond_idx+1])
+                latent_list.append(cond_latent[:, :, cond_idx : cond_idx + 1])
+                clean_list.append(cond_latent[:, :, cond_idx : cond_idx + 1])
                # Set mask: 1.0 - strength means less denoising for conditioned frames
                mask_list.append(mx.full((b, 1, 1, 1, 1), 1.0 - strength, dtype=dtype))
            else:
                # Keep original
-                latent_list.append(state.latent[:, :, i:i+1])
-                clean_list.append(state.clean_latent[:, :, i:i+1])
-                mask_list.append(state.denoise_mask[:, :, i:i+1])
+                latent_list.append(state.latent[:, :, i : i + 1])
+                clean_list.append(state.clean_latent[:, :, i : i + 1])
+                mask_list.append(state.denoise_mask[:, :, i : i + 1])

        state.latent = mx.concatenate(latent_list, axis=2)
        state.clean_latent = mx.concatenate(clean_list, axis=2)
--- a/mlx_video/models/ltx_2/config.py
+++ b/mlx_video/models/ltx_2/config.py
@@ -1,4 +1,3 @@
-
 import inspect
 from dataclasses import dataclass, field
 from enum import Enum
@@ -22,9 +21,11 @@ class LTXRopeType(Enum):
    SPLIT = "split"
    TWO_D = "2d"

+
 class AttentionType(Enum):
    DEFAULT = "default"

+
 @dataclass
 class BaseModelConfig:

@@ -46,7 +47,7 @@ class BaseModelConfig:
            if v is not None:
                if isinstance(v, Enum):
                    result[k] = v.value
-                elif hasattr(v, 'to_dict'):
+                elif hasattr(v, "to_dict"):
                    result[k] = v.to_dict()
                else:
                    result[k] = v
@@ -68,26 +69,30 @@ class VideoVAEConfig(BaseModelConfig):
    out_channels: int = 128
    latent_channels: int = 128
    patch_size: int = 4
-    encoder_blocks: List[tuple] = field(default_factory=lambda: [
-        ("res_x", {"num_layers": 4}),
-        ("compress_space_res", {"multiplier": 2}),
-        ("res_x", {"num_layers": 6}),
-        ("compress_time_res", {"multiplier": 2}),
-        ("res_x", {"num_layers": 6}),
-        ("compress_all_res", {"multiplier": 2}),
-        ("res_x", {"num_layers": 2}),
-        ("compress_all_res", {"multiplier": 2}),
-        ("res_x", {"num_layers": 2}),
-    ])
-    decoder_blocks: List[tuple] = field(default_factory=lambda: [
-        ("res_x", {"num_layers": 5, "inject_noise": False}),
-        ("compress_all", {"residual": True, "multiplier": 2}),
-        ("res_x", {"num_layers": 5, "inject_noise": False}),
-        ("compress_all", {"residual": True, "multiplier": 2}),
-        ("res_x", {"num_layers": 5, "inject_noise": False}),
-        ("compress_all", {"residual": True, "multiplier": 2}),
-        ("res_x", {"num_layers": 5, "inject_noise": False}),
-    ])
+    encoder_blocks: List[tuple] = field(
+        default_factory=lambda: [
+            ("res_x", {"num_layers": 4}),
+            ("compress_space_res", {"multiplier": 2}),
+            ("res_x", {"num_layers": 6}),
+            ("compress_time_res", {"multiplier": 2}),
+            ("res_x", {"num_layers": 6}),
+            ("compress_all_res", {"multiplier": 2}),
+            ("res_x", {"num_layers": 2}),
+            ("compress_all_res", {"multiplier": 2}),
+            ("res_x", {"num_layers": 2}),
+        ]
+    )
+    decoder_blocks: List[tuple] = field(
+        default_factory=lambda: [
+            ("res_x", {"num_layers": 5, "inject_noise": False}),
+            ("compress_all", {"residual": True, "multiplier": 2}),
+            ("res_x", {"num_layers": 5, "inject_noise": False}),
+            ("compress_all", {"residual": True, "multiplier": 2}),
+            ("res_x", {"num_layers": 5, "inject_noise": False}),
+            ("compress_all", {"residual": True, "multiplier": 2}),
+            ("res_x", {"num_layers": 5, "inject_noise": False}),
+        ]
+    )


 @dataclass
@@ -111,7 +116,9 @@ class LTXModelConfig(BaseModelConfig):
    audio_in_channels: int = 128
    audio_out_channels: int = 128
    audio_cross_attention_dim: int = 2048
-    audio_caption_channels: int = 3840  # Input dim for audio text embeddings (same as video)
+    audio_caption_channels: int = (
+        3840  # Input dim for audio text embeddings (same as video)
+    )

    # Positional embedding config
    positional_embedding_theta: float = 10000.0
@@ -196,7 +203,6 @@ class LTXModelConfig(BaseModelConfig):
        )


-
 class CausalityAxis(Enum):
    """Enum for specifying the causality axis in causal convolutions."""

@@ -237,21 +243,22 @@ class AudioDecoderModelConfig(BaseModelConfig):
    def __post_init__(self):
        """Convert string enum values to proper enum types."""
        # Import here to avoid circular imports
-        from .audio_vae.normalization import NormType
        from .audio_vae.attention import AttentionType
-        
+        from .audio_vae.normalization import NormType
+
        # Convert causality_axis string to enum
        if isinstance(self.causality_axis, str):
            self.causality_axis = CausalityAxis(self.causality_axis)
-        
+
        # Convert norm_type string to enum
        if isinstance(self.norm_type, str):
            self.norm_type = NormType(self.norm_type)
-        
+
        # Convert attn_type string to enum
        if isinstance(self.attn_type, str):
            self.attn_type = AttentionType(self.attn_type)

+
 @dataclass
 class AudioEncoderModelConfig(BaseModelConfig):
    ch: int = 128
@@ -282,8 +289,8 @@ class AudioEncoderModelConfig(BaseModelConfig):

    def __post_init__(self):
        """Convert string enum values to proper enum types."""
-        from .audio_vae.normalization import NormType
        from .audio_vae.attention import AttentionType
+        from .audio_vae.normalization import NormType

        if isinstance(self.causality_axis, str):
            self.causality_axis = CausalityAxis(self.causality_axis)
@@ -334,6 +341,7 @@ class VideoDecoderModelConfig(BaseModelConfig):
    dropout: float = 0.0
    timestep_conditioning: bool = False

+
 @dataclass
 class VideoEncoderModelConfig(BaseModelConfig):
    convolution_dimensions: int = 3
@@ -343,21 +351,24 @@ class VideoEncoderModelConfig(BaseModelConfig):
    norm_layer: Enum = None
    latent_log_var: Enum = None
    encoder_spatial_padding_mode: Enum = None
-    encoder_blocks: List[tuple] = field(default_factory=lambda: [("res_x", {"num_layers": 4}),
-        ("compress_space_res", {"multiplier": 2}),
-        ("res_x", {"num_layers": 6}),
-        ("compress_time_res", {"multiplier": 2}),
-        ("res_x", {"num_layers": 6}),
-        ("compress_all_res", {"multiplier": 2}),
-        ("res_x", {"num_layers": 2}),
-        ("compress_all_res", {"multiplier": 2}),
-        ("res_x", {"num_layers": 2})
-    ])
+    encoder_blocks: List[tuple] = field(
+        default_factory=lambda: [
+            ("res_x", {"num_layers": 4}),
+            ("compress_space_res", {"multiplier": 2}),
+            ("res_x", {"num_layers": 6}),
+            ("compress_time_res", {"multiplier": 2}),
+            ("res_x", {"num_layers": 6}),
+            ("compress_all_res", {"multiplier": 2}),
+            ("res_x", {"num_layers": 2}),
+            ("compress_all_res", {"multiplier": 2}),
+            ("res_x", {"num_layers": 2}),
+        ]
+    )

    def __post_init__(self):
+        from mlx_video.models.ltx_2.video_vae.convolution import PaddingModeType
        from mlx_video.models.ltx_2.video_vae.resnet import NormLayerType
        from mlx_video.models.ltx_2.video_vae.video_vae import LogVarianceType
-        from mlx_video.models.ltx_2.video_vae.convolution import PaddingModeType

        if self.norm_layer is None:
            self.norm_layer = NormLayerType.PIXEL_NORM
@@ -371,10 +382,12 @@ class VideoEncoderModelConfig(BaseModelConfig):
        if isinstance(self.latent_log_var, str):
            self.latent_log_var = LogVarianceType(self.latent_log_var)
        if isinstance(self.encoder_spatial_padding_mode, str):
-            self.encoder_spatial_padding_mode = PaddingModeType(self.encoder_spatial_padding_mode)
+            self.encoder_spatial_padding_mode = PaddingModeType(
+                self.encoder_spatial_padding_mode
+            )

    def to_dict(self) -> dict[str, Any]:
        result = super().to_dict()
        if self.encoder_blocks is not None:
            result["encoder_blocks"] = [list(block) for block in self.encoder_blocks]
-        return result   
+        return result
--- a/mlx_video/models/ltx_2/convert.py
+++ b/mlx_video/models/ltx_2/convert.py
@@ -49,7 +49,6 @@ from typing import Dict

 import mlx.core as mx

-
 # ─── Key prefix routing ──────────────────────────────────────────────────────

 TRANSFORMER_PREFIX = "model.diffusion_model."
@@ -78,7 +77,7 @@ def sanitize_transformer(weights: Dict[str, mx.array]) -> Dict[str, mx.array]:
        if "audio_embeddings_connector" in key or "video_embeddings_connector" in key:
            continue

-        new_key = key[len(TRANSFORMER_PREFIX):]
+        new_key = key[len(TRANSFORMER_PREFIX) :]
        new_key = new_key.replace(".to_out.0.", ".to_out.")
        new_key = new_key.replace(".ff.net.0.proj.", ".ff.proj_in.")
        new_key = new_key.replace(".ff.net.2.", ".ff.proj_out.")
@@ -109,7 +108,7 @@ def sanitize_vae_decoder(weights: Dict[str, mx.array]) -> Dict[str, mx.array]:
            else:
                continue
        elif key.startswith(VAE_DECODER_PREFIX):
-            new_key = key[len(VAE_DECODER_PREFIX):]
+            new_key = key[len(VAE_DECODER_PREFIX) :]
        else:
            continue

@@ -147,7 +146,7 @@ def sanitize_vae_encoder(weights: Dict[str, mx.array]) -> Dict[str, mx.array]:
            if value.dtype != mx.float32:
                value = value.astype(mx.float32)
        elif key.startswith(VAE_ENCODER_PREFIX):
-            new_key = key[len(VAE_ENCODER_PREFIX):]
+            new_key = key[len(VAE_ENCODER_PREFIX) :]
        else:
            continue

@@ -170,7 +169,7 @@ def sanitize_audio_decoder(weights: Dict[str, mx.array]) -> Dict[str, mx.array]:
        new_key = None

        if key.startswith(AUDIO_DECODER_PREFIX):
-            new_key = key[len(AUDIO_DECODER_PREFIX):]
+            new_key = key[len(AUDIO_DECODER_PREFIX) :]
        elif key.startswith(AUDIO_STATS_PREFIX):
            if "mean-of-means" in key:
                new_key = "per_channel_statistics.mean_of_means"
@@ -196,7 +195,7 @@ def sanitize_audio_encoder(weights: Dict[str, mx.array]) -> Dict[str, mx.array]:
        new_key = None

        if key.startswith(AUDIO_ENCODER_PREFIX):
-            new_key = key[len(AUDIO_ENCODER_PREFIX):]
+            new_key = key[len(AUDIO_ENCODER_PREFIX) :]
        elif key.startswith(AUDIO_STATS_PREFIX):
            if "mean-of-means" in key:
                new_key = "per_channel_statistics.mean_of_means"
@@ -226,7 +225,7 @@ def sanitize_vocoder(weights: Dict[str, mx.array]) -> Dict[str, mx.array]:
        if not key.startswith(VOCODER_PREFIX):
            continue

-        new_key = key[len(VOCODER_PREFIX):]
+        new_key = key[len(VOCODER_PREFIX) :]

        # Handle Conv1d/ConvTranspose1d weight shape conversion
        if "weight" in new_key and value.ndim == 3:
@@ -260,20 +259,20 @@ def extract_text_projections(weights: Dict[str, mx.array]) -> Dict[str, mx.array
    # aggregate_embed weights (text_embedding_projection.*)
    for key, value in weights.items():
        if key.startswith(TEXT_PROJ_PREFIX):
-            new_key = key[len(TEXT_PROJ_PREFIX):]
+            new_key = key[len(TEXT_PROJ_PREFIX) :]
            extracted[new_key] = value

    # video_embeddings_connector
    for key, value in weights.items():
        if key.startswith(VIDEO_CONNECTOR_PREFIX):
-            suffix = key[len(VIDEO_CONNECTOR_PREFIX):]
+            suffix = key[len(VIDEO_CONNECTOR_PREFIX) :]
            new_key = "video_embeddings_connector." + sanitize_connector_key(suffix)
            extracted[new_key] = value

    # audio_embeddings_connector
    for key, value in weights.items():
        if key.startswith(AUDIO_CONNECTOR_PREFIX):
-            suffix = key[len(AUDIO_CONNECTOR_PREFIX):]
+            suffix = key[len(AUDIO_CONNECTOR_PREFIX) :]
            new_key = "audio_embeddings_connector." + sanitize_connector_key(suffix)
            extracted[new_key] = value

@@ -369,11 +368,15 @@ def save_config(config: dict, output_dir: Path):
 # ─── Source resolution ─────────────────────────────────────────────────────────

 # Matches monolithic model files: ltx-2-19b-distilled.safetensors, ltx-2.3-22b-dev.safetensors, etc.
-MONOLITHIC_PATTERN = re.compile(r"^ltx-[\d.]+-\d+b-(?P<variant>distilled|dev)\.safetensors$")
+MONOLITHIC_PATTERN = re.compile(
+    r"^ltx-[\d.]+-\d+b-(?P<variant>distilled|dev)\.safetensors$"
+)

 # Matches upscaler files like ltx-2-spatial-upscaler-x2-1.0.safetensors,
 # ltx-2.3-spatial-upscaler-x2-1.0.safetensors, etc.
-UPSCALER_PATTERN = re.compile(r"^ltx-[\d.]+-(?:spatial|temporal)-upscaler-.+\.safetensors$")
+UPSCALER_PATTERN = re.compile(
+    r"^ltx-[\d.]+-(?:spatial|temporal)-upscaler-.+\.safetensors$"
+)


 def resolve_source(source: str, variant: str) -> Path:
@@ -506,7 +509,9 @@ def infer_transformer_config(weights: Dict[str, mx.array]) -> dict:
 def infer_vae_decoder_config(weights: Dict[str, mx.array], variant: str) -> dict:
    """Infer VAE decoder config from weights."""
    # Check for timestep conditioning keys
-    has_timestep = any("last_time_embedder" in k or "last_scale_shift_table" in k for k in weights)
+    has_timestep = any(
+        "last_time_embedder" in k or "last_scale_shift_table" in k for k in weights
+    )

    # Count channel multipliers from up_blocks
    max_block = -1
@@ -658,7 +663,9 @@ def convert(source: str, output_path: Path, variant: str = "distilled"):
    config = infer_transformer_config(transformer_weights)
    save_config(config, output_path / "transformer")
    t_params = sum(v.size for v in transformer_weights.values())
-    print(f"    {len(transformer_weights)} keys, {t_params:,} params, {num_shards} shards")
+    print(
+        f"    {len(transformer_weights)} keys, {t_params:,} params, {num_shards} shards"
+    )

    # 2. VAE Decoder
    print("  [2/7] VAE Decoder...")
@@ -728,7 +735,8 @@ def convert(source: str, output_path: Path, variant: str = "distilled"):
        ]
    else:
        upscaler_files = [
-            f.name for f in source_dir.iterdir()
+            f.name
+            for f in source_dir.iterdir()
            if f.is_file() and UPSCALER_PATTERN.match(f.name)
        ]

@@ -800,12 +808,21 @@ def convert(source: str, output_path: Path, variant: str = "distilled"):
    print(f"\nDone! Converted {all_converted}/{total_keys} keys")
    if all_converted < total_keys:
        known_prefixes = (
-            TRANSFORMER_PREFIX, VAE_DECODER_PREFIX, VAE_ENCODER_PREFIX,
-            VAE_STATS_PREFIX, AUDIO_DECODER_PREFIX, AUDIO_ENCODER_PREFIX,
-            AUDIO_STATS_PREFIX, VOCODER_PREFIX, TEXT_PROJ_PREFIX,
-            VIDEO_CONNECTOR_PREFIX, AUDIO_CONNECTOR_PREFIX,
+            TRANSFORMER_PREFIX,
+            VAE_DECODER_PREFIX,
+            VAE_ENCODER_PREFIX,
+            VAE_STATS_PREFIX,
+            AUDIO_DECODER_PREFIX,
+            AUDIO_ENCODER_PREFIX,
+            AUDIO_STATS_PREFIX,
+            VOCODER_PREFIX,
+            TEXT_PROJ_PREFIX,
+            VIDEO_CONNECTOR_PREFIX,
+            AUDIO_CONNECTOR_PREFIX,
        )
-        skipped = [k for k in all_weights if not any(k.startswith(p) for p in known_prefixes)]
+        skipped = [
+            k for k in all_weights if not any(k.startswith(p) for p in known_prefixes)
+        ]
        if skipped:
            print(f"  Skipped {len(skipped)} keys:")
            for k in sorted(skipped)[:20]:
--- a/mlx_video/models/ltx_2/generate.py
+++ b/mlx_video/models/ltx_2/generate.py
--- a/mlx_video/models/ltx_2/ltx.py
+++ b/mlx_video/models/ltx_2/ltx.py
@@ -1,15 +1,14 @@
+from pathlib import Path
 from typing import List, Optional, Tuple

 import mlx.core as mx
 import mlx.nn as nn
-from pathlib import Path    
+
+from mlx_video.models.ltx_2.adaln import AdaLayerNormSingle
 from mlx_video.models.ltx_2.config import (
    LTXModelConfig,
-    LTXModelType,
    LTXRopeType,
-    TransformerConfig,
 )
-from mlx_video.models.ltx_2.adaln import AdaLayerNormSingle
 from mlx_video.models.ltx_2.rope import precompute_freqs_cis
 from mlx_video.models.ltx_2.text_projection import PixArtAlphaTextProjection
 from mlx_video.models.ltx_2.transformer import (
@@ -58,11 +57,17 @@ class TransformerArgsPreprocessor:
    ) -> Tuple[mx.array, mx.array]:

        timestep = timestep * self.timestep_scale_multiplier
-        timestep_emb, embedded_timestep = self.adaln(timestep.reshape(-1), hidden_dtype=hidden_dtype)
+        timestep_emb, embedded_timestep = self.adaln(
+            timestep.reshape(-1), hidden_dtype=hidden_dtype
+        )

        # Reshape to (batch, tokens, dim)
-        timestep_emb = mx.reshape(timestep_emb, (batch_size, -1, timestep_emb.shape[-1]))
-        embedded_timestep = mx.reshape(embedded_timestep, (batch_size, -1, embedded_timestep.shape[-1]))
+        timestep_emb = mx.reshape(
+            timestep_emb, (batch_size, -1, timestep_emb.shape[-1])
+        )
+        embedded_timestep = mx.reshape(
+            embedded_timestep, (batch_size, -1, embedded_timestep.shape[-1])
+        )

        return timestep_emb, embedded_timestep

@@ -74,9 +79,15 @@ class TransformerArgsPreprocessor:
        hidden_dtype: mx.Dtype = None,
    ) -> Tuple[mx.array, mx.array]:
        timestep = timestep * self.timestep_scale_multiplier
-        timestep_emb, embedded_timestep = adaln(timestep.reshape(-1), hidden_dtype=hidden_dtype)
-        timestep_emb = mx.reshape(timestep_emb, (batch_size, -1, timestep_emb.shape[-1]))
-        embedded_timestep = mx.reshape(embedded_timestep, (batch_size, -1, embedded_timestep.shape[-1]))
+        timestep_emb, embedded_timestep = adaln(
+            timestep.reshape(-1), hidden_dtype=hidden_dtype
+        )
+        timestep_emb = mx.reshape(
+            timestep_emb, (batch_size, -1, timestep_emb.shape[-1])
+        )
+        embedded_timestep = mx.reshape(
+            embedded_timestep, (batch_size, -1, embedded_timestep.shape[-1])
+        )
        return timestep_emb, embedded_timestep

    def _prepare_context(
@@ -107,7 +118,9 @@ class TransformerArgsPreprocessor:
        # Convert boolean/int mask to float mask
        # 0 -> -inf (masked), 1 -> 0 (not masked)
        mask = (attention_mask.astype(x_dtype) - 1) * 1e9
-        mask = mx.reshape(mask, (attention_mask.shape[0], 1, -1, attention_mask.shape[-1]))
+        mask = mx.reshape(
+            mask, (attention_mask.shape[0], 1, -1, attention_mask.shape[-1])
+        )
        return mask

    def _prepare_positional_embeddings(
@@ -132,9 +145,15 @@ class TransformerArgsPreprocessor:

    def prepare(self, modality: Modality) -> TransformerArgs:
        x = self.patchify_proj(modality.latent)
-        timestep, embedded_timestep = self._prepare_timestep(modality.timesteps, x.shape[0], hidden_dtype=x.dtype)
-        context, attention_mask = self._prepare_context(modality.context, x, modality.context_mask)
-        attention_mask = self._prepare_attention_mask(attention_mask, modality.latent.dtype)
+        timestep, embedded_timestep = self._prepare_timestep(
+            modality.timesteps, x.shape[0], hidden_dtype=x.dtype
+        )
+        context, attention_mask = self._prepare_context(
+            modality.context, x, modality.context_mask
+        )
+        attention_mask = self._prepare_attention_mask(
+            attention_mask, modality.latent.dtype
+        )

        # Use precomputed positional embeddings if provided (avoids expensive RoPE recomputation)
        if modality.positional_embeddings is not None:
@@ -152,8 +171,13 @@ class TransformerArgsPreprocessor:
        prompt_timestep = None
        prompt_embedded_timestep = None
        if self.prompt_adaln is not None and modality.sigma is not None:
-            prompt_timestep, prompt_embedded_timestep = self._prepare_timestep_with_adaln(
-                self.prompt_adaln, modality.sigma, x.shape[0], hidden_dtype=x.dtype,
+            prompt_timestep, prompt_embedded_timestep = (
+                self._prepare_timestep_with_adaln(
+                    self.prompt_adaln,
+                    modality.sigma,
+                    x.shape[0],
+                    hidden_dtype=x.dtype,
+                )
            )

        return TransformerArgs(
@@ -229,11 +253,13 @@ class MultiModalTransformerArgsPreprocessor:
        )

        # Prepare cross-attention timestep embeddings
-        cross_scale_shift_timestep, cross_gate_timestep = self._prepare_cross_attention_timestep(
-            timestep=modality.timesteps,
-            timestep_scale_multiplier=self.simple_preprocessor.timestep_scale_multiplier,
-            batch_size=transformer_args.x.shape[0],
-            hidden_dtype=transformer_args.x.dtype,
+        cross_scale_shift_timestep, cross_gate_timestep = (
+            self._prepare_cross_attention_timestep(
+                timestep=modality.timesteps,
+                timestep_scale_multiplier=self.simple_preprocessor.timestep_scale_multiplier,
+                batch_size=transformer_args.x.shape[0],
+                hidden_dtype=transformer_args.x.dtype,
+            )
        )

        return replace(
@@ -254,17 +280,25 @@ class MultiModalTransformerArgsPreprocessor:

        av_ca_factor = self.av_ca_timestep_scale_multiplier / timestep_scale_multiplier

-        scale_shift_timestep, _ = self.cross_scale_shift_adaln(timestep.reshape(-1), hidden_dtype=hidden_dtype)
-        scale_shift_timestep = mx.reshape(scale_shift_timestep, (batch_size, -1, scale_shift_timestep.shape[-1]))
+        scale_shift_timestep, _ = self.cross_scale_shift_adaln(
+            timestep.reshape(-1), hidden_dtype=hidden_dtype
+        )
+        scale_shift_timestep = mx.reshape(
+            scale_shift_timestep, (batch_size, -1, scale_shift_timestep.shape[-1])
+        )

-        gate_timestep, _ = self.cross_gate_adaln(timestep.reshape(-1) * av_ca_factor, hidden_dtype=hidden_dtype)
-        gate_timestep = mx.reshape(gate_timestep, (batch_size, -1, gate_timestep.shape[-1]))
+        gate_timestep, _ = self.cross_gate_adaln(
+            timestep.reshape(-1) * av_ca_factor, hidden_dtype=hidden_dtype
+        )
+        gate_timestep = mx.reshape(
+            gate_timestep, (batch_size, -1, gate_timestep.shape[-1])
+        )

        return scale_shift_timestep, gate_timestep


 class LTXModel(nn.Module):
- 
+
    def __init__(self, config: LTXModelConfig):

        super().__init__()
@@ -285,18 +319,25 @@ class LTXModel(nn.Module):
            self._init_video(config)

        if config.model_type.is_audio_enabled():
-            self.audio_positional_embedding_max_pos = config.audio_positional_embedding_max_pos
+            self.audio_positional_embedding_max_pos = (
+                config.audio_positional_embedding_max_pos
+            )
            self.audio_num_attention_heads = config.audio_num_attention_heads
            self.audio_inner_dim = config.audio_inner_dim
            self._init_audio(config)

        # Initialize cross-modal components
-        if config.model_type.is_video_enabled() and config.model_type.is_audio_enabled():
+        if (
+            config.model_type.is_video_enabled()
+            and config.model_type.is_audio_enabled()
+        ):
            cross_pe_max_pos = max(
                config.positional_embedding_max_pos[0],
                config.audio_positional_embedding_max_pos[0],
            )
-            self.av_ca_timestep_scale_multiplier = config.av_ca_timestep_scale_multiplier
+            self.av_ca_timestep_scale_multiplier = (
+                config.av_ca_timestep_scale_multiplier
+            )
            self.audio_cross_attention_dim = config.audio_cross_attention_dim
            self._init_audio_video(config)

@@ -308,10 +349,14 @@ class LTXModel(nn.Module):
        self.patchify_proj = nn.Linear(config.in_channels, self.inner_dim, bias=True)

        adaln_coefficient = 9 if config.has_prompt_adaln else 6
-        self.adaln_single = AdaLayerNormSingle(self.inner_dim, embedding_coefficient=adaln_coefficient)
+        self.adaln_single = AdaLayerNormSingle(
+            self.inner_dim, embedding_coefficient=adaln_coefficient
+        )

        if config.has_prompt_adaln:
-            self.prompt_adaln_single = AdaLayerNormSingle(self.inner_dim, embedding_coefficient=2)
+            self.prompt_adaln_single = AdaLayerNormSingle(
+                self.inner_dim, embedding_coefficient=2
+            )
        else:
            self.caption_projection = PixArtAlphaTextProjection(
                in_features=config.caption_channels,
@@ -323,13 +368,19 @@ class LTXModel(nn.Module):
        self.proj_out = nn.Linear(self.inner_dim, config.out_channels)

    def _init_audio(self, config: LTXModelConfig) -> None:
-        self.audio_patchify_proj = nn.Linear(config.audio_in_channels, self.audio_inner_dim, bias=True)
+        self.audio_patchify_proj = nn.Linear(
+            config.audio_in_channels, self.audio_inner_dim, bias=True
+        )

        audio_adaln_coefficient = 9 if config.has_prompt_adaln else 6
-        self.audio_adaln_single = AdaLayerNormSingle(self.audio_inner_dim, embedding_coefficient=audio_adaln_coefficient)
+        self.audio_adaln_single = AdaLayerNormSingle(
+            self.audio_inner_dim, embedding_coefficient=audio_adaln_coefficient
+        )

        if config.has_prompt_adaln:
-            self.audio_prompt_adaln_single = AdaLayerNormSingle(self.audio_inner_dim, embedding_coefficient=2)
+            self.audio_prompt_adaln_single = AdaLayerNormSingle(
+                self.audio_inner_dim, embedding_coefficient=2
+            )
        else:
            self.audio_caption_projection = PixArtAlphaTextProjection(
                in_features=config.audio_caption_channels,
@@ -338,7 +389,9 @@ class LTXModel(nn.Module):

        # Output components
        self.audio_scale_shift_table = mx.zeros((2, self.audio_inner_dim))
-        self.audio_norm_out = nn.LayerNorm(self.audio_inner_dim, eps=config.norm_eps, affine=False)
+        self.audio_norm_out = nn.LayerNorm(
+            self.audio_inner_dim, eps=config.norm_eps, affine=False
+        )
        self.audio_proj_out = nn.Linear(self.audio_inner_dim, config.audio_out_channels)

    def _init_audio_video(self, config: LTXModelConfig) -> None:
@@ -361,8 +414,13 @@ class LTXModel(nn.Module):
            embedding_coefficient=1,
        )

-    def _init_preprocessors(self, config: LTXModelConfig, cross_pe_max_pos: Optional[int]) -> None:
-        if config.model_type.is_video_enabled() and config.model_type.is_audio_enabled():
+    def _init_preprocessors(
+        self, config: LTXModelConfig, cross_pe_max_pos: Optional[int]
+    ) -> None:
+        if (
+            config.model_type.is_video_enabled()
+            and config.model_type.is_audio_enabled()
+        ):
            # Multi-modal preprocessors
            self.video_args_preprocessor = MultiModalTransformerArgsPreprocessor(
                patchify_proj=self.patchify_proj,
@@ -468,7 +526,8 @@ class LTXModel(nn.Module):
        stg_a_set = set(stg_audio_blocks) if stg_audio_blocks else set()
        for idx, block in self.transformer_blocks.items():
            video, audio = block(
-                video=video, audio=audio,
+                video=video,
+                audio=audio,
                skip_video_self_attn=(idx in stg_v_set),
                skip_audio_self_attn=(idx in stg_a_set),
                skip_cross_modal=skip_cross_modal,
@@ -483,7 +542,7 @@ class LTXModel(nn.Module):
        x: mx.array,
        embedded_timestep: mx.array,
    ) -> mx.array:
-       
+
        # scale_shift_table: (2, dim) -> expand to (1, 1, 2, dim)
        # embedded_timestep: (B, 1, dim) -> expand to (B, 1, 1, dim)
        table_expanded = scale_shift_table[None, None, :, :]  # (1, 1, 2, dim)
@@ -526,8 +585,12 @@ class LTXModel(nn.Module):
            raise ValueError("Audio is not enabled for this model")

        # Preprocess arguments
-        video_args = self.video_args_preprocessor.prepare(video) if video is not None else None
-        audio_args = self.audio_args_preprocessor.prepare(audio) if audio is not None else None
+        video_args = (
+            self.video_args_preprocessor.prepare(video) if video is not None else None
+        )
+        audio_args = (
+            self.audio_args_preprocessor.prepare(audio) if audio is not None else None
+        )

        # Process transformer blocks
        video_out, audio_out = self._process_transformer_blocks(
@@ -567,7 +630,7 @@ class LTXModel(nn.Module):

    def sanitize(self, weights: dict) -> dict:
        sanitized = {}
-   
+
        has_raw_prefix = any(k.startswith("model.diffusion_model.") for k in weights)
        if not has_raw_prefix:
            return weights
@@ -577,7 +640,10 @@ class LTXModel(nn.Module):

            if not key.startswith("model.diffusion_model."):
                continue
-            if "audio_embeddings_connector" in key or "video_embeddings_connector" in key:
+            if (
+                "audio_embeddings_connector" in key
+                or "video_embeddings_connector" in key
+            ):
                continue

            # Remove 'model.diffusion_model.' prefix
@@ -612,9 +678,11 @@ class LTXModel(nn.Module):
        for weight_file in model_path.glob("*.safetensors"):
            weights.update(mx.load(str(weight_file)))

-
        sanitized = model.sanitize(weights)
-        sanitized = {k: v.astype(mx.bfloat16) if v.dtype == mx.float32 else v for k, v in sanitized.items()}
+        sanitized = {
+            k: v.astype(mx.bfloat16) if v.dtype == mx.float32 else v
+            for k, v in sanitized.items()
+        }

        model.load_weights(list(sanitized.items()), strict=strict)
        mx.eval(model.parameters())
@@ -625,7 +693,7 @@ class LTXModel(nn.Module):
 class X0Model(nn.Module):

    def __init__(self, velocity_model: LTXModel):
-        
+
        super().__init__()
        self.velocity_model = velocity_model

@@ -639,13 +707,18 @@ class X0Model(nn.Module):
    ) -> Tuple[Optional[mx.array], Optional[mx.array]]:

        vx, ax = self.velocity_model(
-            video, audio,
+            video,
+            audio,
            stg_video_blocks=stg_video_blocks,
            stg_audio_blocks=stg_audio_blocks,
            skip_cross_modal=skip_cross_modal,
        )

-        denoised_video = to_denoised(video.latent, vx, video.timesteps) if vx is not None else None
-        denoised_audio = to_denoised(audio.latent, ax, audio.timesteps) if ax is not None else None
+        denoised_video = (
+            to_denoised(video.latent, vx, video.timesteps) if vx is not None else None
+        )
+        denoised_audio = (
+            to_denoised(audio.latent, ax, audio.timesteps) if ax is not None else None
+        )

        return denoised_video, denoised_audio
--- a/mlx_video/models/ltx_2/postprocess.py
+++ b/mlx_video/models/ltx_2/postprocess.py
@@ -1,9 +1,10 @@

 import numpy as np
-from typing import Optional


-def bilateral_filter(image: np.ndarray, d: int = 5, sigma_color: float = 75, sigma_space: float = 75) -> np.ndarray:
+def bilateral_filter(
+    image: np.ndarray, d: int = 5, sigma_color: float = 75, sigma_space: float = 75
+) -> np.ndarray:
    """Apply bilateral filter to reduce grid artifacts while preserving edges.

    Args:
@@ -17,6 +18,7 @@ def bilateral_filter(image: np.ndarray, d: int = 5, sigma_color: float = 75, sig
    """
    try:
        import cv2
+
        return cv2.bilateralFilter(image, d, sigma_color, sigma_space)
    except ImportError:
        # Fallback to simple Gaussian blur if cv2 not available
@@ -35,14 +37,20 @@ def gaussian_blur(image: np.ndarray, kernel_size: int = 3) -> np.ndarray:
    """
    try:
        import cv2
+
        return cv2.GaussianBlur(image, (kernel_size, kernel_size), 0)
    except ImportError:
        # Simple box blur fallback
        from scipy.ndimage import uniform_filter
-        return uniform_filter(image, size=(kernel_size, kernel_size, 1)).astype(np.uint8)
+
+        return uniform_filter(image, size=(kernel_size, kernel_size, 1)).astype(
+            np.uint8
+        )


-def unsharp_mask(image: np.ndarray, kernel_size: int = 5, sigma: float = 1.0, amount: float = 1.0) -> np.ndarray:
+def unsharp_mask(
+    image: np.ndarray, kernel_size: int = 5, sigma: float = 1.0, amount: float = 1.0
+) -> np.ndarray:
    """Apply unsharp masking to enhance edges after blur.

    Args:
@@ -56,6 +64,7 @@ def unsharp_mask(image: np.ndarray, kernel_size: int = 5, sigma: float = 1.0, am
    """
    try:
        import cv2
+
        blurred = cv2.GaussianBlur(image, (kernel_size, kernel_size), sigma)
        sharpened = cv2.addWeighted(image, 1 + amount, blurred, -amount, 0)
        return np.clip(sharpened, 0, 255).astype(np.uint8)
@@ -81,23 +90,23 @@ def reduce_grid_artifacts(
    if method == "bilateral":
        d = max(3, int(5 * strength))
        sigma = 50 + 50 * strength
-        processed = np.stack([
-            bilateral_filter(frame, d=d, sigma_color=sigma, sigma_space=sigma)
-            for frame in video
-        ])
+        processed = np.stack(
+            [
+                bilateral_filter(frame, d=d, sigma_color=sigma, sigma_space=sigma)
+                for frame in video
+            ]
+        )
    elif method == "gaussian":
        kernel_size = max(3, int(3 + 4 * strength))
        if kernel_size % 2 == 0:
            kernel_size += 1
-        processed = np.stack([
-            gaussian_blur(frame, kernel_size=kernel_size)
-            for frame in video
-        ])
+        processed = np.stack(
+            [gaussian_blur(frame, kernel_size=kernel_size) for frame in video]
+        )
    elif method == "frequency":
-        processed = np.stack([
-            remove_grid_frequency(frame, grid_size=8)
-            for frame in video
-        ])
+        processed = np.stack(
+            [remove_grid_frequency(frame, grid_size=8) for frame in video]
+        )
    else:
        raise ValueError(f"Unknown method: {method}")

@@ -160,6 +169,3 @@ def remove_grid_frequency(frame: np.ndarray, grid_size: int = 8) -> np.ndarray:
        result[:, :, c] = np.clip(channel_filtered, 0, 255).astype(np.uint8)

    return result
-
-
-
--- a/mlx_video/models/ltx_2/rope.py
+++ b/mlx_video/models/ltx_2/rope.py
@@ -1,4 +1,3 @@
-
 import math
 from typing import List, Optional, Tuple

@@ -86,11 +85,12 @@ def rotate_half_interleaved(x: mx.array) -> mx.array:
    """
    # x: (..., dim) where dim is even
    x_even = x[..., 0::2]  # [x0, x2, x4, ...]
-    x_odd = x[..., 1::2]   # [x1, x3, x5, ...]
+    x_odd = x[..., 1::2]  # [x1, x3, x5, ...]
    # Stack: [[-x1, x0], [-x3, x2], ...] then flatten to [-x1, x0, -x3, x2, ...]
    rotated = mx.stack([-x_odd, x_even], axis=-1)
    return mx.reshape(rotated, x.shape)

+
 def apply_rotary_emb_1d(
    q: mx.array,
    k: mx.array,
@@ -228,9 +228,9 @@ def get_fractional_positions(
        Fractional positions in range [-1, 1] after scaling
    """
    n_pos_dims = indices_grid.shape[1]
-    assert n_pos_dims == len(max_pos), (
-        f"Number of position dimensions ({n_pos_dims}) must match max_pos length ({len(max_pos)})"
-    )
+    assert n_pos_dims == len(
+        max_pos
+    ), f"Number of position dimensions ({n_pos_dims}) must match max_pos length ({len(max_pos)})"

    # Divide each dimension by its max position
    fractional_positions = []
@@ -392,11 +392,15 @@ def precompute_freqs_cis(
    if max_pos is None:
        max_pos = [20, 2048, 2048]

-
    if double_precision:
        return _precompute_freqs_cis_double_precision(
-            indices_grid, dim, theta, max_pos, use_middle_indices_grid,
-            num_attention_heads, rope_type
+            indices_grid,
+            dim,
+            theta,
+            max_pos,
+            use_middle_indices_grid,
+            num_attention_heads,
+            rope_type,
        )

    # Keep positions in float32 for RoPE computation.
@@ -495,7 +499,9 @@ def _precompute_freqs_cis_double_precision(
    # Compute frequencies: outer product
    # scaled_positions: (B, T, n_dims) -> (B, T, n_dims, 1)
    # freq_indices: (num_indices,) -> (1, 1, 1, num_indices)
-    freqs = mx.expand_dims(scaled_positions, axis=-1) * mx.reshape(freq_indices, (1, 1, 1, -1))
+    freqs = mx.expand_dims(scaled_positions, axis=-1) * mx.reshape(
+        freq_indices, (1, 1, 1, -1)
+    )
    # freqs: (B, T, n_dims, num_indices)

    # Transpose and flatten: (B, T, n_dims, num_indices) -> (B, T, num_indices, n_dims) -> (B, T, num_indices * n_dims)
--- a/mlx_video/models/ltx_2/samplers.py
+++ b/mlx_video/models/ltx_2/samplers.py
@@ -5,15 +5,14 @@ noise injection, ported from the LTX-2 PyTorch implementation.
 """

 import math
-from typing import Optional

 import mlx.core as mx

-
 # ---------------------------------------------------------------------------
 # Phi functions and RK coefficients (pure Python math, no MLX needed)
 # ---------------------------------------------------------------------------

+
 def phi(j: int, neg_h: float) -> float:
    """Compute phi_j(z) where z = -h (negative step size in log-space).

@@ -43,6 +42,7 @@ def get_res2s_coefficients(
    Returns:
        (a21, b1, b2): RK coefficients.
    """
+
    def get_phi(j: int, neg_h: float) -> float:
        cache_key = (j, neg_h)
        if cache_key in phi_cache:
@@ -69,6 +69,7 @@ def get_res2s_coefficients(
 # SDE noise injection
 # ---------------------------------------------------------------------------

+
 def get_sde_coeff(
    sigma_next: float,
 ) -> tuple[float, float, float]:
@@ -139,7 +140,9 @@ def sde_noise_step(
    denoised_next = sample_f32 - sigma * eps_next

    # Mix deterministic and stochastic components
-    x_noised = alpha_ratio * (denoised_next + sigma_down * eps_next) + sigma_up * noise_f32
+    x_noised = (
+        alpha_ratio * (denoised_next + sigma_down * eps_next) + sigma_up * noise_f32
+    )

    return x_noised

@@ -148,6 +151,7 @@ def sde_noise_step(
 # Noise generation
 # ---------------------------------------------------------------------------

+
 def channelwise_normalize(x: mx.array) -> mx.array:
    """Normalize each channel to zero mean and unit variance over spatial dims.

--- a/mlx_video/models/ltx_2/text_encoder.py
+++ b/mlx_video/models/ltx_2/text_encoder.py
@@ -1,25 +1,25 @@
-
-
 import functools
 import logging
 import math
 import re
-from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple

 import mlx.core as mx
 import mlx.nn as nn
-import numpy as np
-from rich.console import Console
-from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn, TimeRemainingColumn
-
-from mlx_video.utils import rms_norm, apply_quantization
-from mlx_video.models.ltx_2.rope import apply_interleaved_rotary_emb
-
-from mlx_vlm.models.gemma3.language import Gemma3Model
 from mlx_vlm.models.gemma3.config import TextConfig
+from mlx_vlm.models.gemma3.language import Gemma3Model
+from rich.console import Console
+from rich.progress import (
+    BarColumn,
+    Progress,
+    SpinnerColumn,
+    TaskProgressColumn,
+    TextColumn,
+    TimeRemainingColumn,
+)

+from mlx_video.utils import apply_quantization, rms_norm

 # Path to system prompts
 PROMPTS_DIR = Path(__file__).parent / "prompts"
@@ -36,11 +36,10 @@ def _load_system_prompt(prompt_name: str) -> str:

 class LanguageModel(nn.Module):

-
    def __init__(self, config: TextConfig):
        super().__init__()
        # Create config matching LTX-2 text encoder requirements
-        self.config = config 
+        self.config = config

        # Create the Gemma3Model from mlx-vlm
        self.model = Gemma3Model(self.config)
@@ -51,7 +50,7 @@ class LanguageModel(nn.Module):
        attention_mask: Optional[mx.array],
        dtype: mx.Dtype,
    ) -> mx.array:
-        
+
        causal_mask = mx.tril(mx.ones((seq_len, seq_len), dtype=mx.bool_))

        if attention_mask is not None:
@@ -59,15 +58,25 @@ class LanguageModel(nn.Module):

            padding_mask = attention_mask.astype(mx.bool_)  # (batch, seq_len)
            combined = causal_mask[None, :, :] & padding_mask[:, None, :]
-            min_val = mx.finfo(dtype).min if dtype in (mx.float16, mx.bfloat16) else -1e9
-            mask = mx.where(combined, mx.zeros(combined.shape, dtype=dtype),
-                           mx.full(combined.shape, min_val, dtype=dtype))
+            min_val = (
+                mx.finfo(dtype).min if dtype in (mx.float16, mx.bfloat16) else -1e9
+            )
+            mask = mx.where(
+                combined,
+                mx.zeros(combined.shape, dtype=dtype),
+                mx.full(combined.shape, min_val, dtype=dtype),
+            )
            return mask[:, None, :, :]
        else:
            # No padding mask, just causal
-            min_val = mx.finfo(dtype).min if dtype in (mx.float16, mx.bfloat16) else -1e9
-            mask = mx.where(causal_mask, mx.zeros((seq_len, seq_len), dtype=dtype),
-                           mx.full((seq_len, seq_len), min_val, dtype=dtype))
+            min_val = (
+                mx.finfo(dtype).min if dtype in (mx.float16, mx.bfloat16) else -1e9
+            )
+            mask = mx.where(
+                causal_mask,
+                mx.zeros((seq_len, seq_len), dtype=dtype),
+                mx.full((seq_len, seq_len), min_val, dtype=dtype),
+            )
            return mask[None, None, :, :]  # (1, 1, seq, seq)

    def __call__(
@@ -91,7 +100,11 @@ class LanguageModel(nn.Module):
        batch_size, seq_len = inputs.shape

        # Get embeddings
-        h = input_embeddings if input_embeddings is not None else self.model.embed_tokens(inputs)
+        h = (
+            input_embeddings
+            if input_embeddings is not None
+            else self.model.embed_tokens(inputs)
+        )

        # Apply Gemma scaling
        h *= mx.array(self.config.hidden_size**0.5, mx.bfloat16).astype(h.dtype)
@@ -103,11 +116,12 @@ class LanguageModel(nn.Module):
        if cache is None:
            cache = [None] * len(self.model.layers)

-        full_causal_mask = self._create_causal_mask_with_padding(seq_len, attention_mask, h.dtype)
+        full_causal_mask = self._create_causal_mask_with_padding(
+            seq_len, attention_mask, h.dtype
+        )

        sliding_mask = full_causal_mask

-
        num_layers = len(self.model.layers)
        for i, layer in enumerate(self.model.layers):
            is_global = (
@@ -147,9 +161,9 @@ class LanguageModel(nn.Module):
        for key, value in weights.items():
            if key.startswith(prefix):
                if hasattr(value, "dtype") and value.dtype == mx.float32:
-                    sanitized[key[len(prefix):]] = value.astype(mx.bfloat16)
+                    sanitized[key[len(prefix) :]] = value.astype(mx.bfloat16)
                else:
-                    sanitized[key[len(prefix):]] = value
+                    sanitized[key[len(prefix) :]] = value
        return sanitized

    @property
@@ -158,6 +172,7 @@ class LanguageModel(nn.Module):

    def make_cache(self):
        from mlx_vlm.models.cache import KVCache, RotatingKVCache
+
        caches = []
        for i in range(len(self.layers)):
            if (
@@ -172,6 +187,7 @@ class LanguageModel(nn.Module):
    @classmethod
    def from_pretrained(cls, model_path: str):
        import json
+
        weight_files = sorted(Path(model_path).glob("*.safetensors"))
        config_file = Path(model_path) / "config.json"
        config_dict = {}
@@ -179,7 +195,9 @@ class LanguageModel(nn.Module):
            with open(config_file, "r") as f:
                config_dict = json.load(f)

-            language_model = cls(config=TextConfig.from_dict(config_dict["text_config"]))
+            language_model = cls(
+                config=TextConfig.from_dict(config_dict["text_config"])
+            )
        else:
            raise ValueError(f"Config file not found at {model_path}")

@@ -188,19 +206,18 @@ class LanguageModel(nn.Module):
        for i, wf in enumerate(weight_files):
            weights.update(mx.load(str(wf)))

-
        if hasattr(language_model, "sanitize"):
            weights = language_model.sanitize(weights=weights)

-
-        apply_quantization(model=language_model, weights=weights, quantization=quantization)
+        apply_quantization(
+            model=language_model, weights=weights, quantization=quantization
+        )

        language_model.load_weights(list(weights.items()), strict=False)

        return language_model


-
 class ConnectorAttention(nn.Module):

    def __init__(
@@ -250,9 +267,15 @@ class ConnectorAttention(nn.Module):
        k = self.k_norm(k)

        # Reshape to (B, H, T, D) for SPLIT RoPE
-        q = mx.reshape(q, (batch_size, seq_len, self.num_heads, self.head_dim)).transpose(0, 2, 1, 3)
-        k = mx.reshape(k, (batch_size, seq_len, self.num_heads, self.head_dim)).transpose(0, 2, 1, 3)
-        v = mx.reshape(v, (batch_size, seq_len, self.num_heads, self.head_dim)).transpose(0, 2, 1, 3)
+        q = mx.reshape(
+            q, (batch_size, seq_len, self.num_heads, self.head_dim)
+        ).transpose(0, 2, 1, 3)
+        k = mx.reshape(
+            k, (batch_size, seq_len, self.num_heads, self.head_dim)
+        ).transpose(0, 2, 1, 3)
+        v = mx.reshape(
+            v, (batch_size, seq_len, self.num_heads, self.head_dim)
+        ).transpose(0, 2, 1, 3)

        if pe is not None:
            q = self._apply_split_rope(q, pe[0], pe[1])
@@ -304,7 +327,7 @@ class ConnectorAttention(nn.Module):
        out2 = x2 * cos_freq + x1 * sin_freq

        return mx.concatenate([out1, out2], axis=-1).astype(input_dtype)
-    
+

 class GEGLU(nn.Module):
    """GELU-gated linear unit."""
@@ -336,9 +359,17 @@ class ConnectorFeedForward(nn.Module):

 class ConnectorTransformerBlock(nn.Module):

-    def __init__(self, dim: int = 3840, num_heads: int = 30, head_dim: int = 128, has_gate_logits: bool = False):
+    def __init__(
+        self,
+        dim: int = 3840,
+        num_heads: int = 30,
+        head_dim: int = 128,
+        has_gate_logits: bool = False,
+    ):
        super().__init__()
-        self.attn1 = ConnectorAttention(dim, num_heads, head_dim, has_gate_logits=has_gate_logits)
+        self.attn1 = ConnectorAttention(
+            dim, num_heads, head_dim, has_gate_logits=has_gate_logits
+        )
        self.ff = ConnectorFeedForward(dim)

    def __call__(
@@ -388,14 +419,18 @@ class Embeddings1DConnector(nn.Module):
        self.positional_embedding_max_pos = positional_embedding_max_pos or [1]

        self.transformer_1d_blocks = {
-            i: ConnectorTransformerBlock(dim, num_heads, head_dim, has_gate_logits=has_gate_logits)
+            i: ConnectorTransformerBlock(
+                dim, num_heads, head_dim, has_gate_logits=has_gate_logits
+            )
            for i in range(num_layers)
        }

        if num_learnable_registers > 0:
            self.learnable_registers = mx.zeros((num_learnable_registers, dim))

-    def _precompute_freqs_cis(self, seq_len: int, dtype: mx.Dtype) -> Tuple[mx.array, mx.array]:
+    def _precompute_freqs_cis(
+        self, seq_len: int, dtype: mx.Dtype
+    ) -> Tuple[mx.array, mx.array]:
        """Compute RoPE frequencies for connector (SPLIT type matching PyTorch).

        Returns tuple of (cos, sin) each with shape (1, num_heads, seq_len, head_dim//2).
@@ -464,11 +499,15 @@ class Embeddings1DConnector(nn.Module):

        # Binary mask: 1 for valid tokens, 0 for padded
        # attention_mask is additive: 0 for valid, large negative for padded
-        mask_binary = (attention_mask.squeeze(1).squeeze(1) >= -9000.0).astype(mx.int32)  # (batch, seq)
+        mask_binary = (attention_mask.squeeze(1).squeeze(1) >= -9000.0).astype(
+            mx.int32
+        )  # (batch, seq)

        # Tile registers to match sequence length, cast to hidden_states dtype
        num_tiles = seq_len // self.num_learnable_registers
-        registers = mx.tile(self.learnable_registers, (num_tiles, 1)).astype(dtype)  # (seq_len, dim)
+        registers = mx.tile(self.learnable_registers, (num_tiles, 1)).astype(
+            dtype
+        )  # (seq_len, dim)

        # Process each batch item (PyTorch uses advanced indexing)
        result_list = []
@@ -481,25 +520,33 @@ class Embeddings1DConnector(nn.Module):

            # Extract valid tokens (where mask is 1)
            # Since we have left-padded input, valid tokens are at the end
-            valid_tokens = hs_b[seq_len - num_valid:]  # (num_valid, dim)
+            valid_tokens = hs_b[seq_len - num_valid :]  # (num_valid, dim)

            # Pad with zeros on the right to get back to seq_len
            pad_length = seq_len - num_valid
            if pad_length > 0:
                padding = mx.zeros((pad_length, dim), dtype=dtype)
-                adjusted = mx.concatenate([valid_tokens, padding], axis=0)  # (seq_len, dim)
+                adjusted = mx.concatenate(
+                    [valid_tokens, padding], axis=0
+                )  # (seq_len, dim)
            else:
                adjusted = valid_tokens

            # Create flipped mask: 1s at front (where valid tokens now are), 0s at back
-            flipped_mask = mx.concatenate([
-                mx.ones((num_valid,), dtype=mx.int32),
-                mx.zeros((pad_length,), dtype=mx.int32)
-            ], axis=0)  # (seq,)
+            flipped_mask = mx.concatenate(
+                [
+                    mx.ones((num_valid,), dtype=mx.int32),
+                    mx.zeros((pad_length,), dtype=mx.int32),
+                ],
+                axis=0,
+            )  # (seq,)

            # Combine: valid tokens at front, registers at back
            flipped_mask_expanded = flipped_mask[:, None].astype(dtype)  # (seq, 1)
-            combined = flipped_mask_expanded * adjusted + (1 - flipped_mask_expanded) * registers
+            combined = (
+                flipped_mask_expanded * adjusted
+                + (1 - flipped_mask_expanded) * registers
+            )
            result_list.append(combined)

        hidden_states = mx.stack(result_list, axis=0)  # (batch, seq, dim)
@@ -526,7 +573,9 @@ class Embeddings1DConnector(nn.Module):

        # Process through transformer blocks
        for i in range(len(self.transformer_1d_blocks)):
-            hidden_states = self.transformer_1d_blocks[i](hidden_states, attention_mask, freqs_cis)
+            hidden_states = self.transformer_1d_blocks[i](
+                hidden_states, attention_mask, freqs_cis
+            )

        # Final RMS norm
        hidden_states = rms_norm(hidden_states)
@@ -534,7 +583,6 @@ class Embeddings1DConnector(nn.Module):
        return hidden_states, attention_mask


-
 def norm_and_concat_hidden_states(
    hidden_states: List[mx.array],
    attention_mask: mx.array,
@@ -567,8 +615,12 @@ def norm_and_concat_hidden_states(
    mean = mx.sum(masked, axis=(1, 2), keepdims=True) / (denom + eps)

    # Compute masked min/max per layer
-    x_for_min = mx.where(mask, stacked, mx.full(stacked.shape, float('inf'), dtype=dtype))
-    x_for_max = mx.where(mask, stacked, mx.full(stacked.shape, float('-inf'), dtype=dtype))
+    x_for_min = mx.where(
+        mask, stacked, mx.full(stacked.shape, float("inf"), dtype=dtype)
+    )
+    x_for_max = mx.where(
+        mask, stacked, mx.full(stacked.shape, float("-inf"), dtype=dtype)
+    )
    x_min = mx.min(x_for_min, axis=(1, 2), keepdims=True)
    x_max = mx.max(x_for_max, axis=(1, 2), keepdims=True)
    range_val = x_max - x_min
@@ -603,7 +655,9 @@ def norm_and_concat_per_token_rms(
    dtype = encoded_text.dtype

    # Per-token RMSNorm across hidden dimension: variance = mean(x^2) over dim D
-    variance = mx.mean(encoded_text.astype(mx.float32) ** 2, axis=2, keepdims=True)  # (B, T, 1, L)
+    variance = mx.mean(
+        encoded_text.astype(mx.float32) ** 2, axis=2, keepdims=True
+    )  # (B, T, 1, L)
    normed = encoded_text.astype(mx.float32) * mx.rsqrt(variance + 1e-6)
    normed = normed.astype(dtype)

@@ -625,7 +679,9 @@ def _rescale_norm(x: mx.array, target_dim: int, source_dim: int) -> mx.array:
 class GemmaFeaturesExtractor(nn.Module):
    """V1 feature extractor (LTX-2): 8 * (x - mean) / range normalization."""

-    def __init__(self, input_dim: int = 188160, output_dim: int = 3840, bias: bool = False):
+    def __init__(
+        self, input_dim: int = 188160, output_dim: int = 3840, bias: bool = False
+    ):
        super().__init__()
        self.aggregate_embed = nn.Linear(input_dim, output_dim, bias=bias)

@@ -674,13 +730,14 @@ class GemmaFeaturesExtractorV2(nn.Module):

        if mode == "video":
            target_dim = self.video_aggregate_embed.weight.shape[0]
-            return self.video_aggregate_embed(_rescale_norm(normed, target_dim, self.embedding_dim))
+            return self.video_aggregate_embed(
+                _rescale_norm(normed, target_dim, self.embedding_dim)
+            )
        else:
            target_dim = self.audio_aggregate_embed.weight.shape[0]
-            return self.audio_aggregate_embed(_rescale_norm(normed, target_dim, self.embedding_dim))
-
-
-
+            return self.audio_aggregate_embed(
+                _rescale_norm(normed, target_dim, self.embedding_dim)
+            )


 class AudioEmbeddingsConnector(nn.Module):
@@ -717,8 +774,8 @@ class LTX2TextEncoder(nn.Module):
            video_output_dim = 4096
            audio_output_dim = 2048
            self.feature_extractor_v2 = GemmaFeaturesExtractorV2(
-                flat_dim=feature_input_dim,       # 3840 * 49 = 188160 (concatenated)
-                embedding_dim=hidden_dim,          # 3840 (Gemma hidden_dim, for rescale)
+                flat_dim=feature_input_dim,  # 3840 * 49 = 188160 (concatenated)
+                embedding_dim=hidden_dim,  # 3840 (Gemma hidden_dim, for rescale)
                video_output_dim=video_output_dim,
                audio_output_dim=audio_output_dim,
                bias=True,
@@ -728,37 +785,57 @@ class LTX2TextEncoder(nn.Module):
            # connector_positional_embedding_max_pos=[4096] from LTX-2.3 safetensors
            # config (nested under config.transformer.connector_positional_embedding_max_pos)
            self.video_embeddings_connector = Embeddings1DConnector(
-                dim=video_output_dim, num_heads=32, head_dim=128,
-                num_layers=8, num_learnable_registers=128,
-                positional_embedding_max_pos=[4096], has_gate_logits=True,
+                dim=video_output_dim,
+                num_heads=32,
+                head_dim=128,
+                num_layers=8,
+                num_learnable_registers=128,
+                positional_embedding_max_pos=[4096],
+                has_gate_logits=True,
            )
            self.audio_embeddings_connector = Embeddings1DConnector(
-                dim=audio_output_dim, num_heads=32, head_dim=64,
-                num_layers=8, num_learnable_registers=128,
-                positional_embedding_max_pos=[4096], has_gate_logits=True,
+                dim=audio_output_dim,
+                num_heads=32,
+                head_dim=64,
+                num_layers=8,
+                num_learnable_registers=128,
+                positional_embedding_max_pos=[4096],
+                has_gate_logits=True,
            )
        else:
            # LTX-2: shared feature extractor, 3840-dim connectors
-            self.feature_extractor = GemmaFeaturesExtractor(feature_input_dim, hidden_dim)
+            self.feature_extractor = GemmaFeaturesExtractor(
+                feature_input_dim, hidden_dim
+            )

            self.video_embeddings_connector = Embeddings1DConnector(
-                dim=hidden_dim, num_heads=30, head_dim=128,
-                num_layers=2, num_learnable_registers=128,
+                dim=hidden_dim,
+                num_heads=30,
+                head_dim=128,
+                num_layers=2,
+                num_learnable_registers=128,
                positional_embedding_max_pos=[1],
            )
            self.audio_embeddings_connector = Embeddings1DConnector(
-                dim=hidden_dim, num_heads=30, head_dim=128,
-                num_layers=2, num_learnable_registers=128,
+                dim=hidden_dim,
+                num_heads=30,
+                head_dim=128,
+                num_layers=2,
+                num_learnable_registers=128,
                positional_embedding_max_pos=[1],
            )

        self.processor = None

-    def load(self, model_path: Optional[str] = None, text_encoder_path: Optional[str] = "google/gemma-3-12b-it"):
+    def load(
+        self,
+        model_path: Optional[str] = None,
+        text_encoder_path: Optional[str] = "google/gemma-3-12b-it",
+    ):

        if Path(str(text_encoder_path)).joinpath("text_encoder").is_dir():
            text_encoder_path = str(Path(text_encoder_path) / "text_encoder")
-        
+
        self.language_model = LanguageModel.from_pretrained(text_encoder_path)

        # Load transformer weights for feature extractor and connector.
@@ -785,22 +862,35 @@ class LTX2TextEncoder(nn.Module):

        if transformer_weights:
            self._load_feature_extractors(transformer_weights, is_reformatted)
-            self._load_connector("video_embeddings_connector", transformer_weights, is_reformatted)
-            self._load_connector("audio_embeddings_connector", transformer_weights, is_reformatted)
+            self._load_connector(
+                "video_embeddings_connector", transformer_weights, is_reformatted
+            )
+            self._load_connector(
+                "audio_embeddings_connector", transformer_weights, is_reformatted
+            )
        else:
-            print("WARNING: No transformer weights found for text projection connectors. "
-                  "Text conditioning will use uninitialized weights!")
+            print(
+                "WARNING: No transformer weights found for text projection connectors. "
+                "Text conditioning will use uninitialized weights!"
+            )

        # Load tokenizer
        from transformers import AutoTokenizer
+
        tokenizer_path = model_path / "tokenizer"
        if tokenizer_path.exists():
-            self.processor = AutoTokenizer.from_pretrained(str(tokenizer_path), trust_remote_code=True)
+            self.processor = AutoTokenizer.from_pretrained(
+                str(tokenizer_path), trust_remote_code=True
+            )
        else:
            try:
-                self.processor = AutoTokenizer.from_pretrained(text_encoder_path, trust_remote_code=True)
+                self.processor = AutoTokenizer.from_pretrained(
+                    text_encoder_path, trust_remote_code=True
+                )
            except Exception:
-                self.processor = AutoTokenizer.from_pretrained("google/gemma-3-12b-it", trust_remote_code=True)
+                self.processor = AutoTokenizer.from_pretrained(
+                    "google/gemma-3-12b-it", trust_remote_code=True
+                )
        # Set left padding to match official LTX-2 text encoder
        self.processor.padding_side = "left"

@@ -823,7 +913,11 @@ class LTX2TextEncoder(nn.Module):
                        submodule.bias = weights[b_key]
        else:
            # LTX-2: single aggregate_embed
-            agg_key = "aggregate_embed.weight" if is_reformatted else "text_embedding_projection.aggregate_embed.weight"
+            agg_key = (
+                "aggregate_embed.weight"
+                if is_reformatted
+                else "text_embedding_projection.aggregate_embed.weight"
+            )
            if agg_key in weights:
                self.feature_extractor.aggregate_embed.weight = weights[agg_key]

@@ -837,12 +931,12 @@ class LTX2TextEncoder(nn.Module):
            prefix = f"{name}."
            for key, value in weights.items():
                if key.startswith(prefix):
-                    connector_weights[key[len(prefix):]] = value
+                    connector_weights[key[len(prefix) :]] = value
        else:
            mono_prefix = f"model.diffusion_model.{name}."
            for key, value in weights.items():
                if key.startswith(mono_prefix):
-                    connector_weights[key[len(mono_prefix):]] = value
+                    connector_weights[key[len(mono_prefix) :]] = value

        if not connector_weights:
            return
@@ -894,21 +988,36 @@ class LTX2TextEncoder(nn.Module):
        input_ids = mx.array(inputs["input_ids"])
        attention_mask = mx.array(inputs["attention_mask"])

-        _, all_hidden_states = self.language_model(inputs=input_ids, input_embeddings=None, attention_mask=attention_mask, output_hidden_states=True)
+        _, all_hidden_states = self.language_model(
+            inputs=input_ids,
+            input_embeddings=None,
+            attention_mask=attention_mask,
+            output_hidden_states=True,
+        )

        if self.has_prompt_adaln:
            # LTX-2.3: V2 feature extraction (per-token RMSNorm + rescale)
-            video_features = self.feature_extractor_v2(all_hidden_states, attention_mask, mode="video")
+            video_features = self.feature_extractor_v2(
+                all_hidden_states, attention_mask, mode="video"
+            )
            additive_mask = (attention_mask - 1).astype(video_features.dtype)
-            additive_mask = additive_mask.reshape(attention_mask.shape[0], 1, 1, -1) * 1e9
+            additive_mask = (
+                additive_mask.reshape(attention_mask.shape[0], 1, 1, -1) * 1e9
+            )

-            video_embeddings, _ = self.video_embeddings_connector(video_features, additive_mask)
+            video_embeddings, _ = self.video_embeddings_connector(
+                video_features, additive_mask
+            )

            if return_audio_embeddings:
-                audio_features = self.feature_extractor_v2(all_hidden_states, attention_mask, mode="audio")
+                audio_features = self.feature_extractor_v2(
+                    all_hidden_states, attention_mask, mode="audio"
+                )
                audio_mask = (attention_mask - 1).astype(audio_features.dtype)
                audio_mask = audio_mask.reshape(attention_mask.shape[0], 1, 1, -1) * 1e9
-                audio_embeddings, _ = self.audio_embeddings_connector(audio_features, audio_mask)
+                audio_embeddings, _ = self.audio_embeddings_connector(
+                    audio_features, audio_mask
+                )
                return video_embeddings, audio_embeddings
            else:
                return video_embeddings, attention_mask
@@ -920,12 +1029,18 @@ class LTX2TextEncoder(nn.Module):

            video_features = self.feature_extractor(concat_hidden)
            additive_mask = (attention_mask - 1).astype(video_features.dtype)
-            additive_mask = additive_mask.reshape(attention_mask.shape[0], 1, 1, -1) * 1e9
+            additive_mask = (
+                additive_mask.reshape(attention_mask.shape[0], 1, 1, -1) * 1e9
+            )

-            video_embeddings, _ = self.video_embeddings_connector(video_features, additive_mask)
+            video_embeddings, _ = self.video_embeddings_connector(
+                video_features, additive_mask
+            )

            if return_audio_embeddings:
-                audio_embeddings, _ = self.audio_embeddings_connector(video_features, additive_mask)
+                audio_embeddings, _ = self.audio_embeddings_connector(
+                    video_features, additive_mask
+                )
                return video_embeddings, audio_embeddings
            else:
                return video_embeddings, attention_mask
@@ -964,7 +1079,7 @@ class LTX2TextEncoder(nn.Module):
        # Remove leading/trailing whitespace
        response = response.strip()
        # Remove any leading punctuation
-        response = re.sub(r'^[^\w\s]+', '', response)
+        response = re.sub(r"^[^\w\s]+", "", response)
        return response

    def _apply_chat_template(
@@ -985,7 +1100,9 @@ class LTX2TextEncoder(nn.Module):
                elif isinstance(content, list):
                    # Handle multimodal content (image + text)
                    text_parts = [c["text"] for c in content if c.get("type") == "text"]
-                    formatted += f"<start_of_turn>user\n{' '.join(text_parts)}<end_of_turn>\n"
+                    formatted += (
+                        f"<start_of_turn>user\n{' '.join(text_parts)}<end_of_turn>\n"
+                    )
            elif role == "assistant":
                formatted += f"<start_of_turn>model\n{content}<end_of_turn>\n"
        # Add generation prompt
@@ -1016,7 +1133,9 @@ class LTX2TextEncoder(nn.Module):
            from mlx_lm import stream_generate
            from mlx_lm.sample_utils import make_logits_processors, make_sampler
        except ImportError:
-            logging.warning("mlx-lm not available for prompt enhancement. Using original prompt.")
+            logging.warning(
+                "mlx-lm not available for prompt enhancement. Using original prompt."
+            )
            return prompt

        if self.processor is None:
@@ -1043,7 +1162,11 @@ class LTX2TextEncoder(nn.Module):
        )
        input_ids = mx.array(inputs["input_ids"])

-        sampler = make_sampler(kwargs.get("temperature", 0.7), kwargs.get("top_p", 1.0), top_k=kwargs.get("top_k", -1))
+        sampler = make_sampler(
+            kwargs.get("temperature", 0.7),
+            kwargs.get("top_p", 1.0),
+            top_k=kwargs.get("top_k", -1),
+        )
        logits_processors = make_logits_processors(
            kwargs.get("logit_bias", None),
            kwargs.get("repetition_penalty", 1.3),
@@ -1094,14 +1217,15 @@ class LTX2TextEncoder(nn.Module):
        mx.clear_cache()

        # Decode only the new tokens
-        enhanced_prompt = self.processor.decode(generated_tokens, skip_special_tokens=True)
+        enhanced_prompt = self.processor.decode(
+            generated_tokens, skip_special_tokens=True
+        )

        enhanced_prompt = self._clean_response(enhanced_prompt)
        logging.info(f"Enhanced prompt: {enhanced_prompt}")

        return enhanced_prompt

-
    def enhance_i2v(
        self,
        prompt: str,
@@ -1135,4 +1259,3 @@ def load_text_encoder(model_path: str = "/tmp/ltx2") -> LTX2TextEncoder:
    encoder = LTX2TextEncoder()
    encoder.load(model_path=model_path)
    return encoder
-
--- a/mlx_video/models/ltx_2/text_projection.py
+++ b/mlx_video/models/ltx_2/text_projection.py
@@ -11,7 +11,7 @@ class PixArtAlphaTextProjection(nn.Module):
        out_features: int | None = None,
        bias: bool = True,
    ):
-        
+
        super().__init__()

        out_features = out_features or hidden_size
--- a/mlx_video/models/ltx_2/transformer.py
+++ b/mlx_video/models/ltx_2/transformer.py
@@ -4,8 +4,8 @@ from typing import Optional, Tuple
 import mlx.core as mx
 import mlx.nn as nn

-from mlx_video.models.ltx_2.config import LTXRopeType, TransformerConfig
 from mlx_video.models.ltx_2.attention import Attention
+from mlx_video.models.ltx_2.config import LTXRopeType, TransformerConfig
 from mlx_video.models.ltx_2.feed_forward import FeedForward
 from mlx_video.utils import rms_norm

@@ -171,8 +171,7 @@ class BasicAVTransformerBlock(nn.Module):

        # timestep: (B, seq, num_params * dim) -> reshape to (B, seq, num_params, dim)
        timestep_reshaped = mx.reshape(
-            timestep,
-            (batch_size, timestep.shape[1], num_ada_params, -1)
+            timestep, (batch_size, timestep.shape[1], num_ada_params, -1)
        )

        # Extract the relevant indices
@@ -225,8 +224,12 @@ class BasicAVTransformerBlock(nn.Module):
        )

        # Squeeze the sequence dimension if it's 1
-        scale_shift_squeezed = tuple(mx.squeeze(t, axis=1) if t.shape[1] == 1 else t for t in scale_shift_ada)
-        gate_squeezed = tuple(mx.squeeze(t, axis=1) if t.shape[1] == 1 else t for t in gate_ada)
+        scale_shift_squeezed = tuple(
+            mx.squeeze(t, axis=1) if t.shape[1] == 1 else t for t in scale_shift_ada
+        )
+        gate_squeezed = tuple(
+            mx.squeeze(t, axis=1) if t.shape[1] == 1 else t for t in gate_ada
+        )

        return (*scale_shift_squeezed, *gate_squeezed)

@@ -258,8 +261,16 @@ class BasicAVTransformerBlock(nn.Module):
        # Check which modalities to run
        run_vx = video is not None and video.enabled and vx.size > 0
        run_ax = audio is not None and audio.enabled and ax.size > 0
-        run_a2v = run_vx and (audio is not None and audio.enabled and ax.size > 0) and not skip_cross_modal
-        run_v2a = run_ax and (video is not None and video.enabled and vx.size > 0) and not skip_cross_modal
+        run_a2v = (
+            run_vx
+            and (audio is not None and audio.enabled and ax.size > 0)
+            and not skip_cross_modal
+        )
+        run_v2a = (
+            run_ax
+            and (video is not None and video.enabled and vx.size > 0)
+            and not skip_cross_modal
+        )

        # Process video self-attention and cross-attention with text
        if run_vx:
@@ -269,7 +280,15 @@ class BasicAVTransformerBlock(nn.Module):

            # Self-attention with RoPE (skip_attention=True for STG perturbation)
            norm_vx = rms_norm(vx, eps=self.norm_eps) * (1 + vscale_msa) + vshift_msa
-            vx = vx + self.attn1(norm_vx, pe=video.positional_embeddings, skip_attention=skip_video_self_attn) * vgate_msa
+            vx = (
+                vx
+                + self.attn1(
+                    norm_vx,
+                    pe=video.positional_embeddings,
+                    skip_attention=skip_video_self_attn,
+                )
+                * vgate_msa
+            )

            # Cross-attention with text context
            if self.has_prompt_adaln:
@@ -278,11 +297,24 @@ class BasicAVTransformerBlock(nn.Module):
                    self.scale_shift_table, vx.shape[0], video.timesteps, slice(6, 9)
                )
                vprompt_shift_kv, vprompt_scale_kv = self.get_ada_values(
-                    self.prompt_scale_shift_table, vx.shape[0], video.prompt_timesteps, slice(0, 2)
+                    self.prompt_scale_shift_table,
+                    vx.shape[0],
+                    video.prompt_timesteps,
+                    slice(0, 2),
                )
                attn_input = rms_norm(vx, eps=self.norm_eps) * (1 + vscale_q) + vshift_q
-                encoder_hidden_states = video.context * (1 + vprompt_scale_kv) + vprompt_shift_kv
-                vx = vx + self.attn2(attn_input, context=encoder_hidden_states, mask=video.context_mask) * vgate_q
+                encoder_hidden_states = (
+                    video.context * (1 + vprompt_scale_kv) + vprompt_shift_kv
+                )
+                vx = (
+                    vx
+                    + self.attn2(
+                        attn_input,
+                        context=encoder_hidden_states,
+                        mask=video.context_mask,
+                    )
+                    * vgate_q
+                )
            else:
                vx = vx + self.attn2(
                    rms_norm(vx, eps=self.norm_eps),
@@ -298,20 +330,46 @@ class BasicAVTransformerBlock(nn.Module):

            # Self-attention with RoPE (skip_attention=True for STG perturbation)
            norm_ax = rms_norm(ax, eps=self.norm_eps) * (1 + ascale_msa) + ashift_msa
-            ax = ax + self.audio_attn1(norm_ax, pe=audio.positional_embeddings, skip_attention=skip_audio_self_attn) * agate_msa
+            ax = (
+                ax
+                + self.audio_attn1(
+                    norm_ax,
+                    pe=audio.positional_embeddings,
+                    skip_attention=skip_audio_self_attn,
+                )
+                * agate_msa
+            )

            # Cross-attention with text context
            if self.has_prompt_adaln:
                # LTX-2.3: Q modulated by timestep (indices 6-8), context modulated by prompt_adaln
                ashift_q, ascale_q, agate_q = self.get_ada_values(
-                    self.audio_scale_shift_table, ax.shape[0], audio.timesteps, slice(6, 9)
+                    self.audio_scale_shift_table,
+                    ax.shape[0],
+                    audio.timesteps,
+                    slice(6, 9),
                )
                aprompt_shift_kv, aprompt_scale_kv = self.get_ada_values(
-                    self.audio_prompt_scale_shift_table, ax.shape[0], audio.prompt_timesteps, slice(0, 2)
+                    self.audio_prompt_scale_shift_table,
+                    ax.shape[0],
+                    audio.prompt_timesteps,
+                    slice(0, 2),
+                )
+                attn_input_a = (
+                    rms_norm(ax, eps=self.norm_eps) * (1 + ascale_q) + ashift_q
+                )
+                encoder_hidden_states_a = (
+                    audio.context * (1 + aprompt_scale_kv) + aprompt_shift_kv
+                )
+                ax = (
+                    ax
+                    + self.audio_attn2(
+                        attn_input_a,
+                        context=encoder_hidden_states_a,
+                        mask=audio.context_mask,
+                    )
+                    * agate_q
                )
-                attn_input_a = rms_norm(ax, eps=self.norm_eps) * (1 + ascale_q) + ashift_q
-                encoder_hidden_states_a = audio.context * (1 + aprompt_scale_kv) + aprompt_shift_kv
-                ax = ax + self.audio_attn2(attn_input_a, context=encoder_hidden_states_a, mask=audio.context_mask) * agate_q
            else:
                ax = ax + self.audio_attn2(
                    rms_norm(ax, eps=self.norm_eps),
--- a/mlx_video/models/ltx_2/upsampler.py
+++ b/mlx_video/models/ltx_2/upsampler.py
@@ -1,4 +1,5 @@
 from typing import Tuple, Union
+
 import mlx.core as mx
 import mlx.nn as nn

@@ -36,11 +37,20 @@ class Conv3d(nn.Module):
        self.groups = groups

        # Weight shape: (C_out, KD, KH, KW, C_in)
-        scale = 1.0 / (in_channels * kernel_size[0] * kernel_size[1] * kernel_size[2]) ** 0.5
+        scale = (
+            1.0
+            / (in_channels * kernel_size[0] * kernel_size[1] * kernel_size[2]) ** 0.5
+        )
        self.weight = mx.random.uniform(
            low=-scale,
            high=scale,
-            shape=(out_channels, kernel_size[0], kernel_size[1], kernel_size[2], in_channels),
+            shape=(
+                out_channels,
+                kernel_size[0],
+                kernel_size[1],
+                kernel_size[2],
+                in_channels,
+            ),
        )

        if bias:
@@ -87,7 +97,6 @@ class GroupNorm3d(nn.Module):
        n, d, h, w, c = x.shape
        input_dtype = x.dtype

-
        x = x.astype(mx.float32)

        # Reshape to (N, D*H*W, num_groups, C//num_groups)
@@ -219,7 +228,9 @@ class SpatialRationalResampler(nn.Module):
        self.den = den

        # Conv2d: mid_channels -> num^2 * mid_channels for PixelShuffle(num)
-        self.conv = nn.Conv2d(mid_channels, num * num * mid_channels, kernel_size=3, padding=1)
+        self.conv = nn.Conv2d(
+            mid_channels, num * num * mid_channels, kernel_size=3, padding=1
+        )
        self.pixel_shuffle = PixelShuffle2D(num, num)
        self.blur_down = BlurDownsample(stride=den)

@@ -230,7 +241,7 @@ class SpatialRationalResampler(nn.Module):

        x = self.conv(x)
        x = self.pixel_shuffle(x)  # H*num, W*num
-        x = self.blur_down(x)      # H*num/den, W*num/den
+        x = self.blur_down(x)  # H*num/den, W*num/den

        _, h_out, w_out, _ = x.shape
        x = mx.reshape(x, (n, d, h_out, w_out, c))
@@ -240,6 +251,7 @@ class SpatialRationalResampler(nn.Module):
 def _rational_for_scale(scale: float) -> Tuple[int, int]:
    """Convert a float scale to a rational fraction (numerator, denominator)."""
    from fractions import Fraction
+
    frac = Fraction(scale).limit_denominator(10)
    return frac.numerator, frac.denominator

@@ -290,16 +302,22 @@ class LatentUpsampler(nn.Module):
        self.initial_norm = GroupNorm3d(32, mid_channels)

        # Pre-upsample ResBlocks - use dict with int keys for MLX parameter tracking
-        self.res_blocks = {i: ResBlock3D(mid_channels) for i in range(num_blocks_per_stage)}
+        self.res_blocks = {
+            i: ResBlock3D(mid_channels) for i in range(num_blocks_per_stage)
+        }

        # Upsampler: 2D spatial upsampling (frame-by-frame)
        if rational_resampler:
-            self.upsampler = SpatialRationalResampler(mid_channels=mid_channels, scale=spatial_scale)
+            self.upsampler = SpatialRationalResampler(
+                mid_channels=mid_channels, scale=spatial_scale
+            )
        else:
            self.upsampler = SpatialUpsampler2x(mid_channels=mid_channels)

        # Post-upsample ResBlocks - use dict with int keys for MLX parameter tracking
-        self.post_upsample_res_blocks = {i: ResBlock3D(mid_channels) for i in range(num_blocks_per_stage)}
+        self.post_upsample_res_blocks = {
+            i: ResBlock3D(mid_channels) for i in range(num_blocks_per_stage)
+        }

        # Final projection
        self.final_conv = Conv3d(mid_channels, in_channels, kernel_size=3, padding=1)
@@ -314,10 +332,13 @@ class LatentUpsampler(nn.Module):
        Returns:
            Upsampled tensor of shape (B, C, F, H*scale, W*scale) - channels first
        """
+
        def debug_stats(name, t):
            if debug:
                mx.eval(t)
-                print(f"    {name}: shape={t.shape}, min={t.min().item():.4f}, max={t.max().item():.4f}, mean={t.mean().item():.4f}")
+                print(
+                    f"    {name}: shape={t.shape}, min={t.min().item():.4f}, max={t.max().item():.4f}, mean={t.mean().item():.4f}"
+                )

        if debug:
            print("  [DEBUG] LatentUpsampler forward pass:")
@@ -404,7 +425,11 @@ def load_upsampler(weights_path: str) -> Tuple[LatentUpsampler, float]:
    # x2: conv out = 4 * mid (2^2 * mid for PixelShuffle(2))
    # x1.5: conv out = 9 * mid (3^2 * mid for PixelShuffle(3)) + blur downsample
    # Both formats may have upsampler.blur_down.kernel, so use channel count
-    conv_key = "upsampler.conv.weight" if "upsampler.conv.weight" in raw_weights else "upsampler.0.weight"
+    conv_key = (
+        "upsampler.conv.weight"
+        if "upsampler.conv.weight" in raw_weights
+        else "upsampler.0.weight"
+    )
    if conv_key in raw_weights:
        out_channels = raw_weights[conv_key].shape[0]
        ratio = out_channels // mid_channels
@@ -414,7 +439,9 @@ def load_upsampler(weights_path: str) -> Tuple[LatentUpsampler, float]:
        rational_resampler = False
        spatial_scale = 2.0

-    print(f"  Detected: mid_channels={mid_channels}, scale={spatial_scale}x, rational={rational_resampler}")
+    print(
+        f"  Detected: mid_channels={mid_channels}, scale={spatial_scale}x, rational={rational_resampler}"
+    )

    # Create model
    upsampler = LatentUpsampler(
--- a/mlx_video/models/ltx_2/utils.py
+++ b/mlx_video/models/ltx_2/utils.py
@@ -109,6 +109,7 @@ def convert_audio_encoder(
        return encoder_dir

    from huggingface_hub import hf_hub_download
+
    vae_path = hf_hub_download(
        source_repo,
        "audio_vae/diffusion_pytorch_model.safetensors",
--- a/mlx_video/models/ltx_2/video_vae/init.py
+++ b/mlx_video/models/ltx_2/video_vae/init.py
@@ -1,8 +1,8 @@
-from mlx_video.models.ltx_2.video_vae.video_vae import VideoEncoder
-from mlx_video.models.ltx_2.video_vae.encoder import encode_image
 from mlx_video.models.ltx_2.video_vae.decoder import LTX2VideoDecoder, VideoDecoder
+from mlx_video.models.ltx_2.video_vae.encoder import encode_image
 from mlx_video.models.ltx_2.video_vae.tiling import (
-    TilingConfig,
    SpatialTilingConfig,
    TemporalTilingConfig,
+    TilingConfig,
 )
+from mlx_video.models.ltx_2.video_vae.video_vae import VideoEncoder
--- a/mlx_video/models/ltx_2/video_vae/convolution.py
+++ b/mlx_video/models/ltx_2/video_vae/convolution.py
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import List, Optional, Tuple, Union
+from typing import Optional, Tuple, Union

 import mlx.core as mx
 import mlx.nn as nn
@@ -27,14 +27,18 @@ def reflect_pad_2d(x: mx.array, pad_h: int, pad_w: int) -> mx.array:
    # Height padding (axis 2)
    if pad_h > 0:
        # Get reflection indices - exclude boundary
-        top_pad = x[:, :, 1:pad_h+1, :, :][:, :, ::-1, :, :]  # Flip top portion
-        bottom_pad = x[:, :, -pad_h-1:-1, :, :][:, :, ::-1, :, :]  # Flip bottom portion
+        top_pad = x[:, :, 1 : pad_h + 1, :, :][:, :, ::-1, :, :]  # Flip top portion
+        bottom_pad = x[:, :, -pad_h - 1 : -1, :, :][
+            :, :, ::-1, :, :
+        ]  # Flip bottom portion
        x = mx.concatenate([top_pad, x, bottom_pad], axis=2)

    # Width padding (axis 3)
    if pad_w > 0:
-        left_pad = x[:, :, :, 1:pad_w+1, :][:, :, :, ::-1, :]  # Flip left portion
-        right_pad = x[:, :, :, -pad_w-1:-1, :][:, :, :, ::-1, :]  # Flip right portion
+        left_pad = x[:, :, :, 1 : pad_w + 1, :][:, :, :, ::-1, :]  # Flip left portion
+        right_pad = x[:, :, :, -pad_w - 1 : -1, :][
+            :, :, :, ::-1, :
+        ]  # Flip right portion
        x = mx.concatenate([left_pad, x, right_pad], axis=3)

    return x
@@ -50,7 +54,7 @@ def make_conv_nd(
    causal: bool = False,
    spatial_padding_mode: PaddingModeType = PaddingModeType.ZEROS,
 ) -> nn.Module:
-    
+
    if dims == 2:
        return CausalConv2d(
            in_channels=in_channels,
@@ -118,15 +122,17 @@ class CausalConv3d(nn.Module):
        )

    def __call__(self, x: mx.array, causal: Optional[bool] = None) -> mx.array:
-        
+
        use_causal = causal if causal is not None else self.causal

-        # Apply temporal padding via frame replication 
+        # Apply temporal padding via frame replication
        # Only apply if kernel_size > 1
        if self.time_kernel_size > 1:
            if use_causal:
                # Causal: replicate first frame kernel_size-1 times at the beginning
-                first_frame_pad = mx.repeat(x[:, :, :1, :, :], self.time_kernel_size - 1, axis=2)
+                first_frame_pad = mx.repeat(
+                    x[:, :, :1, :, :], self.time_kernel_size - 1, axis=2
+                )
                x = mx.concatenate([first_frame_pad, x], axis=2)
            else:
                # Non-causal: replicate first frame at start, last frame at end
@@ -176,7 +182,6 @@ class CausalConv3d(nn.Module):
        """
        b, d, h, w, c = x.shape

-
        total_elements = d * h * w * c
        max_safe_elements = 30 * 192 * 192 * 128  # ~140M elements per chunk

@@ -191,11 +196,10 @@ class CausalConv3d(nn.Module):

        overlap = kernel_t - 1

-      
        expected_output_frames = d - overlap

        outputs = []
-        out_idx = 0 
+        out_idx = 0

        # Process chunks
        in_start = 0
--- a/mlx_video/models/ltx_2/video_vae/decoder.py
+++ b/mlx_video/models/ltx_2/video_vae/decoder.py
@@ -15,14 +15,14 @@ Architecture (from PyTorch weights):
 """

 import math
-from typing import Optional, Dict
 from pathlib import Path
+from typing import Dict, Optional

 import mlx.core as mx
 import mlx.nn as nn

 from mlx_video.models.ltx_2.video_vae.convolution import CausalConv3d, PaddingModeType
-from mlx_video.models.ltx_2.video_vae.ops import unpatchify, PerChannelStatistics
+from mlx_video.models.ltx_2.video_vae.ops import PerChannelStatistics, unpatchify
 from mlx_video.models.ltx_2.video_vae.sampling import DepthToSpaceUpsample
 from mlx_video.models.ltx_2.video_vae.tiling import TilingConfig, decode_with_tiling

@@ -77,16 +77,14 @@ class PixArtAlphaTimestepEmbedder(nn.Module):
    def __init__(self, embedding_dim: int):
        super().__init__()
        self.timestep_embedder = TimestepEmbedding(
-            in_channels=256,
-            time_embed_dim=embedding_dim
+            in_channels=256, time_embed_dim=embedding_dim
        )

-    def __call__(self, timestep: mx.array, hidden_dtype: mx.Dtype = mx.float32) -> mx.array:
+    def __call__(
+        self, timestep: mx.array, hidden_dtype: mx.Dtype = mx.float32
+    ) -> mx.array:
        timesteps_proj = get_timestep_embedding(
-            timestep,
-            embedding_dim=256,
-            flip_sin_to_cos=True,
-            downscale_freq_shift=0
+            timestep, embedding_dim=256, flip_sin_to_cos=True, downscale_freq_shift=0
        )
        timesteps_emb = self.timestep_embedder(timesteps_proj.astype(hidden_dtype))
        return timesteps_emb
@@ -119,6 +117,7 @@ class ResnetBlock3DSimple(nn.Module):

    def _make_conv_wrapper(self, in_ch, out_ch, padding_mode):
        """Create a wrapper object with a 'conv' attribute to match PyTorch naming."""
+
        class ConvWrapper(nn.Module):
            def __init__(self_inner):
                super().__init__()
@@ -130,13 +129,15 @@ class ResnetBlock3DSimple(nn.Module):
                    padding=1,
                    spatial_padding_mode=padding_mode,
                )
+
            def __call__(self_inner, x, causal=False):
                return self_inner.conv(x, causal=causal)
+
        return ConvWrapper()

    def pixel_norm(self, x: mx.array, eps: float = 1e-8) -> mx.array:
        """Apply pixel normalization."""
-        return x / mx.sqrt(mx.mean(x ** 2, axis=1, keepdims=True) + eps)
+        return x / mx.sqrt(mx.mean(x**2, axis=1, keepdims=True) + eps)

    def __call__(
        self,
@@ -153,7 +154,9 @@ class ResnetBlock3DSimple(nn.Module):
        if self.timestep_conditioning and timestep_embed is not None:
            # scale_shift_table: (4, C), timestep_embed: (B, 4*C, 1, 1, 1)
            # Combine table with timestep embedding
-            ada_values = self.scale_shift_table[None, :, :, None, None, None]  # (1, 4, C, 1, 1, 1)
+            ada_values = self.scale_shift_table[
+                None, :, :, None, None, None
+            ]  # (1, 4, C, 1, 1, 1)
            # Reshape timestep_embed from (B, 4*C, 1, 1, 1) to (B, 4, C, 1, 1, 1)
            channels = self.scale_shift_table.shape[1]
            ts_reshaped = timestep_embed.reshape(batch_size, 4, channels, 1, 1, 1)
@@ -199,16 +202,14 @@ class ResBlockGroup(nn.Module):

        # Time embedder for this block group: embed_dim = 4 * channels
        if timestep_conditioning:
-            self.time_embedder = PixArtAlphaTimestepEmbedder(
-                embedding_dim=channels * 4
-            )
+            self.time_embedder = PixArtAlphaTimestepEmbedder(embedding_dim=channels * 4)

        # Use dict with int keys for MLX to track parameters properly
        self.res_blocks = {
            i: ResnetBlock3DSimple(
                channels,
                spatial_padding_mode,
-                timestep_conditioning=timestep_conditioning
+                timestep_conditioning=timestep_conditioning,
            )
            for i in range(num_layers)
        }
@@ -224,8 +225,7 @@ class ResBlockGroup(nn.Module):
        if self.timestep_conditioning and timestep is not None:
            batch_size = x.shape[0]
            timestep_embed = self.time_embedder(
-                timestep.flatten(),
-                hidden_dtype=x.dtype
+                timestep.flatten(), hidden_dtype=x.dtype
            )
            # Reshape to (B, 4*C, 1, 1, 1) for broadcasting
            timestep_embed = timestep_embed.reshape(batch_size, -1, 1, 1, 1)
@@ -301,8 +301,10 @@ class LTX2VideoDecoder(nn.Module):
                    padding=1,
                    spatial_padding_mode=spatial_padding_mode,
                )
+
            def __call__(self_inner, x, causal=False):
                return self_inner.conv(x, causal=causal)
+
        self.conv_in = ConvInWrapper()

        # Build up blocks from config
@@ -311,8 +313,12 @@ class LTX2VideoDecoder(nn.Module):
            block_type = block_def[0]
            ch = block_def[1]
            if block_type == "res":
-                num_layers = block_def[2] if len(block_def) > 2 else num_layers_per_block
-                self.up_blocks[idx] = ResBlockGroup(ch, num_layers, spatial_padding_mode, timestep_conditioning)
+                num_layers = (
+                    block_def[2] if len(block_def) > 2 else num_layers_per_block
+                )
+                self.up_blocks[idx] = ResBlockGroup(
+                    ch, num_layers, spatial_padding_mode, timestep_conditioning
+                )
            elif block_type == "d2s":
                reduction = block_def[2] if len(block_def) > 2 else 2
                stride = block_def[3] if len(block_def) > 3 else (2, 2, 2)
@@ -327,6 +333,7 @@ class LTX2VideoDecoder(nn.Module):
                )

        final_out_channels = out_channels * patch_size * patch_size
+
        class ConvOutWrapper(nn.Module):
            def __init__(self_inner):
                super().__init__()
@@ -338,8 +345,10 @@ class LTX2VideoDecoder(nn.Module):
                    padding=1,
                    spatial_padding_mode=spatial_padding_mode,
                )
+
            def __call__(self_inner, x, causal=False):
                return self_inner.conv(x, causal=causal)
+
        self.conv_out = ConvOutWrapper()

        self.act = nn.SiLU()
@@ -358,7 +367,7 @@ class LTX2VideoDecoder(nn.Module):
            return weights
        for key, value in weights.items():
            new_key = key
-            
+
            if not key.startswith("vae.") or key.startswith("vae.encoder."):
                continue

@@ -374,7 +383,6 @@ class LTX2VideoDecoder(nn.Module):
            if key.startswith("vae.decoder."):
                new_key = key.replace("vae.decoder.", "")

-
            # Handle Conv3d weight transpose: (O, I, D, H, W) -> (O, D, H, W, I)
            if ".conv.weight" in key and value.ndim == 5:
                value = mx.transpose(value, (0, 2, 3, 4, 1))
@@ -384,7 +392,10 @@ class LTX2VideoDecoder(nn.Module):

            if ".conv.weight" in new_key or ".conv.bias" in new_key:

-                if ".conv.conv.weight" not in new_key and ".conv.conv.bias" not in new_key:
+                if (
+                    ".conv.conv.weight" not in new_key
+                    and ".conv.conv.bias" not in new_key
+                ):
                    new_key = new_key.replace(".conv.weight", ".conv.conv.weight")
                    new_key = new_key.replace(".conv.bias", ".conv.conv.bias")

@@ -392,7 +403,9 @@ class LTX2VideoDecoder(nn.Module):
        return sanitized

    @classmethod
-    def from_pretrained(cls, model_path: Path, strict: bool = True) -> "LTX2VideoDecoder":
+    def from_pretrained(
+        cls, model_path: Path, strict: bool = True
+    ) -> "LTX2VideoDecoder":
        """Load a pretrained decoder from a directory with config.json and weights.

        Args:
@@ -422,7 +435,6 @@ class LTX2VideoDecoder(nn.Module):
        for wf in weight_files:
            weights.update(mx.load(str(wf)))

-
        # Infer block structure from weights
        decoder_blocks = cls._infer_blocks(weights)

@@ -537,11 +549,9 @@ class LTX2VideoDecoder(nn.Module):

        return final_blocks

-  
-
    def pixel_norm(self, x: mx.array, eps: float = 1e-8) -> mx.array:
        """Apply pixel normalization."""
-        return x / mx.sqrt(mx.mean(x ** 2, axis=1, keepdims=True) + eps)
+        return x / mx.sqrt(mx.mean(x**2, axis=1, keepdims=True) + eps)

    def __call__(
        self,
@@ -551,20 +561,15 @@ class LTX2VideoDecoder(nn.Module):
        debug: bool = False,
        chunked_conv: bool = False,
    ) -> mx.array:
-       

        batch_size = sample.shape[0]

-   
-
        # Add noise if timestep conditioning is enabled
        if self.timestep_conditioning:
            noise = mx.random.normal(sample.shape) * self.decode_noise_scale
            sample = noise + (1.0 - self.decode_noise_scale) * sample
-            

        sample = self.per_channel_statistics.un_normalize(sample)
- 

        if timestep is None and self.timestep_conditioning:
            timestep = mx.full((batch_size,), self.decode_timestep)
@@ -574,7 +579,6 @@ class LTX2VideoDecoder(nn.Module):
            scaled_timestep = timestep * self.timestep_scale_multiplier

        x = self.conv_in(sample, causal=causal)
-       

        for i, block in self.up_blocks.items():
            if isinstance(block, ResBlockGroup):
@@ -583,19 +587,18 @@ class LTX2VideoDecoder(nn.Module):
                x = block(x, causal=causal, chunked_conv=chunked_conv)
            else:
                x = block(x, causal=causal)
-    

        x = self.pixel_norm(x)
-  

        if self.timestep_conditioning and scaled_timestep is not None:
            embedded_timestep = self.last_time_embedder(
-                scaled_timestep.flatten(),
-                hidden_dtype=x.dtype
+                scaled_timestep.flatten(), hidden_dtype=x.dtype
            )
            embedded_timestep = embedded_timestep.reshape(batch_size, -1, 1, 1, 1)

-            ada_values = self.last_scale_shift_table[None, :, :, None, None, None]  # (1, 2, 128, 1, 1, 1)
+            ada_values = self.last_scale_shift_table[
+                None, :, :, None, None, None
+            ]  # (1, 2, 128, 1, 1, 1)
            ts_reshaped = embedded_timestep.reshape(batch_size, 2, 128, 1, 1, 1)
            ada_values = ada_values + ts_reshaped

@@ -603,16 +606,13 @@ class LTX2VideoDecoder(nn.Module):
            scale = ada_values[:, 1]

            x = x * (1 + scale) + shift
-          

        x = self.act(x)
-     

        x = self.conv_out(x, causal=causal)
-        
+
        # Unpatchify: (B, 48, F', H', W') -> (B, 3, F, H*4, W*4)
        x = unpatchify(x, patch_size_hw=self.patch_size, patch_size_t=1)
-     

        return x

@@ -669,11 +669,23 @@ class LTX2VideoDecoder(nn.Module):

        # Auto-enable chunked conv for modes where it helps (larger tiles)
        # Chunked conv reduces memory by processing conv+depth_to_space in temporal chunks
-        use_chunked_conv = tiling_mode in ("conservative", "none", "auto", "default", "spatial")
+        use_chunked_conv = tiling_mode in (
+            "conservative",
+            "none",
+            "auto",
+            "default",
+            "spatial",
+        )

        if not needs_spatial_tiling and not needs_temporal_tiling:
            # No tiling needed, use regular decode
-            return self(sample, causal=causal, timestep=timestep, debug=debug, chunked_conv=use_chunked_conv)
+            return self(
+                sample,
+                causal=causal,
+                timestep=timestep,
+                debug=debug,
+                chunked_conv=use_chunked_conv,
+            )

        return decode_with_tiling(
            decoder_fn=self,
--- a/mlx_video/models/ltx_2/video_vae/encoder.py
+++ b/mlx_video/models/ltx_2/video_vae/encoder.py
@@ -6,8 +6,8 @@ to latent space, which can then be used to condition video generation.
 """

 import mlx.core as mx
-from mlx_video.models.ltx_2.video_vae.video_vae import VideoEncoder

+from mlx_video.models.ltx_2.video_vae.video_vae import VideoEncoder


 def encode_image(
--- a/mlx_video/models/ltx_2/video_vae/ops.py
+++ b/mlx_video/models/ltx_2/video_vae/ops.py
@@ -1,6 +1,5 @@
 """Operations for Video VAE."""

-from typing import List, Tuple

 import mlx.core as mx
 import mlx.nn as nn
@@ -32,7 +31,9 @@ def patchify(x: mx.array, patch_size_hw: int = 4, patch_size_t: int = 1) -> mx.a
    new_c = c * patch_size_hw * patch_size_hw * patch_size_t

    # Reshape: (B, C, F, H, W) -> (B, C, F/pt, pt, H/ph, ph, W/pw, pw)
-    x = mx.reshape(x, (b, c, new_f, patch_size_t, new_h, patch_size_hw, new_w, patch_size_hw))
+    x = mx.reshape(
+        x, (b, c, new_f, patch_size_t, new_h, patch_size_hw, new_w, patch_size_hw)
+    )

    # Permute: (B, C, F', pt, H', ph, W', pw) -> (B, C, pt, pw, ph, F', H', W')
    # PyTorch einops uses (c, p, r, q) = (c, temporal, width, height), so we need pw before ph
@@ -101,7 +102,7 @@ class PerChannelStatistics(nn.Module):
            Normalized tensor
        """
        # Expand mean and std for broadcasting: (C,) -> (1, C, 1, 1, 1)
-        dtype = x.dtype 
+        dtype = x.dtype
        # Cast to float32 for precision
        mean = self.mean.astype(mx.float32).reshape(1, -1, 1, 1, 1)
        std = self.std.astype(mx.float32).reshape(1, -1, 1, 1, 1)
@@ -117,7 +118,7 @@ class PerChannelStatistics(nn.Module):
        Returns:
            Denormalized tensor
        """
-        dtype = x.dtype 
+        dtype = x.dtype
        # Cast to float32 for precision
        mean = self.mean.astype(mx.float32).reshape(1, -1, 1, 1, 1)
        std = self.std.astype(mx.float32).reshape(1, -1, 1, 1, 1)
--- a/mlx_video/models/ltx_2/video_vae/resnet.py
+++ b/mlx_video/models/ltx_2/video_vae/resnet.py
@@ -44,7 +44,7 @@ class ResnetBlock3D(nn.Module):
        timestep_conditioning: bool = False,
        spatial_padding_mode: PaddingModeType = PaddingModeType.ZEROS,
    ):
-        
+
        super().__init__()

        out_channels = out_channels or in_channels
@@ -96,7 +96,7 @@ class ResnetBlock3D(nn.Module):
        causal: bool = True,
        generator: Optional[int] = None,
    ) -> mx.array:
-       
+
        residual = x

        # First block
@@ -136,7 +136,7 @@ class UNetMidBlock3D(nn.Module):
        attention_head_dim: Optional[int] = None,
        spatial_padding_mode: PaddingModeType = PaddingModeType.ZEROS,
    ):
-        
+
        super().__init__()

        self.num_layers = num_layers
--- a/mlx_video/models/ltx_2/video_vae/sampling.py
+++ b/mlx_video/models/ltx_2/video_vae/sampling.py
@@ -104,7 +104,7 @@ class SpaceToDepthDownsample(nn.Module):


 class DepthToSpaceUpsample(nn.Module):
-    
+
    def __init__(
        self,
        dims: int,
@@ -114,7 +114,7 @@ class DepthToSpaceUpsample(nn.Module):
        out_channels_reduction_factor: int = 1,
        spatial_padding_mode: PaddingModeType = PaddingModeType.ZEROS,
    ):
-       
+
        super().__init__()

        if isinstance(stride, int):
@@ -156,7 +156,9 @@ class DepthToSpaceUpsample(nn.Module):

        return x

-    def __call__(self, x: mx.array, causal: bool = True, chunked_conv: bool = False) -> mx.array:
+    def __call__(
+        self, x: mx.array, causal: bool = True, chunked_conv: bool = False
+    ) -> mx.array:

        b, c, d, h, w = x.shape
        st, sh, sw = self.stride
@@ -196,7 +198,9 @@ class DepthToSpaceUpsample(nn.Module):

        return x

-    def _chunked_conv_depth_to_space(self, x: mx.array, causal: bool = True) -> mx.array:
+    def _chunked_conv_depth_to_space(
+        self, x: mx.array, causal: bool = True
+    ) -> mx.array:
        """Chunked conv + depth_to_space that processes in temporal chunks.

        This reduces peak memory by avoiding the full high-channel intermediate tensor.
--- a/mlx_video/models/ltx_2/video_vae/tiling.py
+++ b/mlx_video/models/ltx_2/video_vae/tiling.py
@@ -55,7 +55,9 @@ def compute_trapezoidal_mask_1d(
    # Apply right ramp (fade out)
    if ramp_right > 0:
        # Create fade_out: linspace(1, 0, ramp_right + 2)[1:-1]
-        fade_out = [(ramp_right + 1 - i) / (ramp_right + 1) for i in range(1, ramp_right + 1)]
+        fade_out = [
+            (ramp_right + 1 - i) / (ramp_right + 1) for i in range(1, ramp_right + 1)
+        ]
        for i in range(ramp_right):
            mask[length - ramp_right + i] *= fade_out[i]

@@ -71,11 +73,17 @@ class SpatialTilingConfig:

    def __post_init__(self) -> None:
        if self.tile_size_in_pixels < 64:
-            raise ValueError(f"tile_size_in_pixels must be at least 64, got {self.tile_size_in_pixels}")
+            raise ValueError(
+                f"tile_size_in_pixels must be at least 64, got {self.tile_size_in_pixels}"
+            )
        if self.tile_size_in_pixels % 32 != 0:
-            raise ValueError(f"tile_size_in_pixels must be divisible by 32, got {self.tile_size_in_pixels}")
+            raise ValueError(
+                f"tile_size_in_pixels must be divisible by 32, got {self.tile_size_in_pixels}"
+            )
        if self.tile_overlap_in_pixels % 32 != 0:
-            raise ValueError(f"tile_overlap_in_pixels must be divisible by 32, got {self.tile_overlap_in_pixels}")
+            raise ValueError(
+                f"tile_overlap_in_pixels must be divisible by 32, got {self.tile_overlap_in_pixels}"
+            )
        if self.tile_overlap_in_pixels >= self.tile_size_in_pixels:
            raise ValueError(
                f"Overlap must be less than tile size, got {self.tile_overlap_in_pixels} and {self.tile_size_in_pixels}"
@@ -91,11 +99,17 @@ class TemporalTilingConfig:

    def __post_init__(self) -> None:
        if self.tile_size_in_frames < 16:
-            raise ValueError(f"tile_size_in_frames must be at least 16, got {self.tile_size_in_frames}")
+            raise ValueError(
+                f"tile_size_in_frames must be at least 16, got {self.tile_size_in_frames}"
+            )
        if self.tile_size_in_frames % 8 != 0:
-            raise ValueError(f"tile_size_in_frames must be divisible by 8, got {self.tile_size_in_frames}")
+            raise ValueError(
+                f"tile_size_in_frames must be divisible by 8, got {self.tile_size_in_frames}"
+            )
        if self.tile_overlap_in_frames % 8 != 0:
-            raise ValueError(f"tile_overlap_in_frames must be divisible by 8, got {self.tile_overlap_in_frames}")
+            raise ValueError(
+                f"tile_overlap_in_frames must be divisible by 8, got {self.tile_overlap_in_frames}"
+            )
        if self.tile_overlap_in_frames >= self.tile_size_in_frames:
            raise ValueError(
                f"Overlap must be less than tile size, got {self.tile_overlap_in_frames} and {self.tile_size_in_frames}"
@@ -113,15 +127,21 @@ class TilingConfig:
    def default(cls) -> "TilingConfig":
        """Default tiling: 512px spatial, 64 frame temporal."""
        return cls(
-            spatial_config=SpatialTilingConfig(tile_size_in_pixels=512, tile_overlap_in_pixels=64),
-            temporal_config=TemporalTilingConfig(tile_size_in_frames=64, tile_overlap_in_frames=24),
+            spatial_config=SpatialTilingConfig(
+                tile_size_in_pixels=512, tile_overlap_in_pixels=64
+            ),
+            temporal_config=TemporalTilingConfig(
+                tile_size_in_frames=64, tile_overlap_in_frames=24
+            ),
        )

    @classmethod
    def spatial_only(cls, tile_size: int = 512, overlap: int = 64) -> "TilingConfig":
        """Spatial tiling only (for short videos with large resolution)."""
        return cls(
-            spatial_config=SpatialTilingConfig(tile_size_in_pixels=tile_size, tile_overlap_in_pixels=overlap),
+            spatial_config=SpatialTilingConfig(
+                tile_size_in_pixels=tile_size, tile_overlap_in_pixels=overlap
+            ),
            temporal_config=None,
        )

@@ -130,23 +150,33 @@ class TilingConfig:
        """Temporal tiling only (for long videos with small resolution)."""
        return cls(
            spatial_config=None,
-            temporal_config=TemporalTilingConfig(tile_size_in_frames=tile_size, tile_overlap_in_frames=overlap),
+            temporal_config=TemporalTilingConfig(
+                tile_size_in_frames=tile_size, tile_overlap_in_frames=overlap
+            ),
        )

    @classmethod
    def aggressive(cls) -> "TilingConfig":
        """Aggressive tiling for very large videos (smaller tiles, much lower memory)."""
        return cls(
-            spatial_config=SpatialTilingConfig(tile_size_in_pixels=256, tile_overlap_in_pixels=64),
-            temporal_config=TemporalTilingConfig(tile_size_in_frames=32, tile_overlap_in_frames=8),
+            spatial_config=SpatialTilingConfig(
+                tile_size_in_pixels=256, tile_overlap_in_pixels=64
+            ),
+            temporal_config=TemporalTilingConfig(
+                tile_size_in_frames=32, tile_overlap_in_frames=8
+            ),
        )

    @classmethod
    def conservative(cls) -> "TilingConfig":
        """Conservative tiling (larger tiles, less memory savings but faster)."""
        return cls(
-            spatial_config=SpatialTilingConfig(tile_size_in_pixels=768, tile_overlap_in_pixels=64),
-            temporal_config=TemporalTilingConfig(tile_size_in_frames=96, tile_overlap_in_frames=24),
+            spatial_config=SpatialTilingConfig(
+                tile_size_in_pixels=768, tile_overlap_in_pixels=64
+            ),
+            temporal_config=TemporalTilingConfig(
+                tile_size_in_frames=96, tile_overlap_in_frames=24
+            ),
        )

    @classmethod
@@ -186,10 +216,14 @@ class TilingConfig:
        temporal_config = None

        if needs_spatial:
-            spatial_config = SpatialTilingConfig(tile_size_in_pixels=512, tile_overlap_in_pixels=64)
+            spatial_config = SpatialTilingConfig(
+                tile_size_in_pixels=512, tile_overlap_in_pixels=64
+            )

        if needs_temporal:
-            temporal_config = TemporalTilingConfig(tile_size_in_frames=64, tile_overlap_in_frames=24)
+            temporal_config = TemporalTilingConfig(
+                tile_size_in_frames=64, tile_overlap_in_frames=24
+            )

        return cls(spatial_config=spatial_config, temporal_config=temporal_config)

@@ -197,16 +231,21 @@ class TilingConfig:
@dataclass
 class DimensionIntervals:
    """Intervals for splitting a single dimension."""
+
    starts: List[int]
    ends: List[int]
    left_ramps: List[int]
    right_ramps: List[int]


-def split_in_spatial(size: int, overlap: int, dimension_size: int) -> DimensionIntervals:
+def split_in_spatial(
+    size: int, overlap: int, dimension_size: int
+) -> DimensionIntervals:
    """Split a spatial dimension into intervals."""
    if dimension_size <= size:
-        return DimensionIntervals(starts=[0], ends=[dimension_size], left_ramps=[0], right_ramps=[0])
+        return DimensionIntervals(
+            starts=[0], ends=[dimension_size], left_ramps=[0], right_ramps=[0]
+        )

    amount = (dimension_size + size - 2 * overlap - 1) // (size - overlap)
    starts = [i * (size - overlap) for i in range(amount)]
@@ -215,13 +254,19 @@ def split_in_spatial(size: int, overlap: int, dimension_size: int) -> DimensionI
    left_ramps = [0] + [overlap] * (amount - 1)
    right_ramps = [overlap] * (amount - 1) + [0]

-    return DimensionIntervals(starts=starts, ends=ends, left_ramps=left_ramps, right_ramps=right_ramps)
+    return DimensionIntervals(
+        starts=starts, ends=ends, left_ramps=left_ramps, right_ramps=right_ramps
+    )


-def split_in_temporal(size: int, overlap: int, dimension_size: int) -> DimensionIntervals:
+def split_in_temporal(
+    size: int, overlap: int, dimension_size: int
+) -> DimensionIntervals:
    """Split a temporal dimension into intervals with causal adjustment."""
    if dimension_size <= size:
-        return DimensionIntervals(starts=[0], ends=[dimension_size], left_ramps=[0], right_ramps=[0])
+        return DimensionIntervals(
+            starts=[0], ends=[dimension_size], left_ramps=[0], right_ramps=[0]
+        )

    # Start with spatial split
    intervals = split_in_spatial(size, overlap, dimension_size)
@@ -234,28 +279,41 @@ def split_in_temporal(size: int, overlap: int, dimension_size: int) -> Dimension
        starts[i] = starts[i] - 1
        left_ramps[i] = left_ramps[i] + 1

-    return DimensionIntervals(starts=starts, ends=intervals.ends, left_ramps=left_ramps, right_ramps=intervals.right_ramps)
+    return DimensionIntervals(
+        starts=starts,
+        ends=intervals.ends,
+        left_ramps=left_ramps,
+        right_ramps=intervals.right_ramps,
+    )


-def map_temporal_slice(begin: int, end: int, left_ramp: int, right_ramp: int, scale: int) -> Tuple[slice, mx.array]:
+def map_temporal_slice(
+    begin: int, end: int, left_ramp: int, right_ramp: int, scale: int
+) -> Tuple[slice, mx.array]:
    """Map temporal latent interval to output coordinates and mask."""
    start = begin * scale
    stop = 1 + (end - 1) * scale
    left_ramp_scaled = 1 + (left_ramp - 1) * scale if left_ramp > 0 else 0
    right_ramp_scaled = right_ramp * scale

-    mask = compute_trapezoidal_mask_1d(stop - start, left_ramp_scaled, right_ramp_scaled, True)
+    mask = compute_trapezoidal_mask_1d(
+        stop - start, left_ramp_scaled, right_ramp_scaled, True
+    )
    return slice(start, stop), mask


-def map_spatial_slice(begin: int, end: int, left_ramp: int, right_ramp: int, scale: int) -> Tuple[slice, mx.array]:
+def map_spatial_slice(
+    begin: int, end: int, left_ramp: int, right_ramp: int, scale: int
+) -> Tuple[slice, mx.array]:
    """Map spatial latent interval to output coordinates and mask."""
    start = begin * scale
    stop = end * scale
    left_ramp_scaled = left_ramp * scale
    right_ramp_scaled = right_ramp * scale

-    mask = compute_trapezoidal_mask_1d(stop - start, left_ramp_scaled, right_ramp_scaled, False)
+    mask = compute_trapezoidal_mask_1d(
+        stop - start, left_ramp_scaled, right_ramp_scaled, False
+    )
    return slice(start, stop), mask


@@ -315,7 +373,9 @@ def decode_with_tiling(
        temporal_overlap = 0

    # Compute intervals for each dimension
-    temporal_intervals = split_in_temporal(temporal_tile_size, temporal_overlap, f_latent)
+    temporal_intervals = split_in_temporal(
+        temporal_tile_size, temporal_overlap, f_latent
+    )
    height_intervals = split_in_spatial(spatial_tile_size, spatial_overlap, h_latent)
    width_intervals = split_in_spatial(spatial_tile_size, spatial_overlap, w_latent)

@@ -338,7 +398,9 @@ def decode_with_tiling(
        t_right = temporal_intervals.right_ramps[t_idx]

        # Map temporal coordinates
-        out_t_slice, t_mask = map_temporal_slice(t_start, t_end, t_left, t_right, temporal_scale)
+        out_t_slice, t_mask = map_temporal_slice(
+            t_start, t_end, t_left, t_right, temporal_scale
+        )

        for h_idx in range(num_h_tiles):
            h_start = height_intervals.starts[h_idx]
@@ -347,7 +409,9 @@ def decode_with_tiling(
            h_right = height_intervals.right_ramps[h_idx]

            # Map height coordinates
-            out_h_slice, h_mask = map_spatial_slice(h_start, h_end, h_left, h_right, spatial_scale)
+            out_h_slice, h_mask = map_spatial_slice(
+                h_start, h_end, h_left, h_right, spatial_scale
+            )

            for w_idx in range(num_w_tiles):
                w_start = width_intervals.starts[w_idx]
@@ -356,13 +420,23 @@ def decode_with_tiling(
                w_right = width_intervals.right_ramps[w_idx]

                # Map width coordinates
-                out_w_slice, w_mask = map_spatial_slice(w_start, w_end, w_left, w_right, spatial_scale)
+                out_w_slice, w_mask = map_spatial_slice(
+                    w_start, w_end, w_left, w_right, spatial_scale
+                )

                # Extract tile latents (small slice)
-                tile_latents = latents[:, :, t_start:t_end, h_start:h_end, w_start:w_end]
+                tile_latents = latents[
+                    :, :, t_start:t_end, h_start:h_end, w_start:w_end
+                ]

                # Decode tile
-                tile_output = decoder_fn(tile_latents, causal=causal, timestep=timestep, debug=False, chunked_conv=chunked_conv)
+                tile_output = decoder_fn(
+                    tile_latents,
+                    causal=causal,
+                    timestep=timestep,
+                    debug=False,
+                    chunked_conv=chunked_conv,
+                )
                mx.eval(tile_output)

                # Clear tile_latents reference
@@ -385,13 +459,15 @@ def decode_with_tiling(
                w_mask_slice = w_mask[:actual_w] if len(w_mask) > actual_w else w_mask

                blend_mask = (
-                    t_mask_slice.reshape(1, 1, -1, 1, 1) *
-                    h_mask_slice.reshape(1, 1, 1, -1, 1) *
-                    w_mask_slice.reshape(1, 1, 1, 1, -1)
+                    t_mask_slice.reshape(1, 1, -1, 1, 1)
+                    * h_mask_slice.reshape(1, 1, 1, -1, 1)
+                    * w_mask_slice.reshape(1, 1, 1, 1, -1)
                )

                # Slice tile output to match
-                tile_output_slice = tile_output[:, :, :actual_t, :actual_h, :actual_w].astype(mx.float32)
+                tile_output_slice = tile_output[
+                    :, :, :actual_t, :actual_h, :actual_w
+                ].astype(mx.float32)

                # Clear full tile_output
                del tile_output
@@ -409,11 +485,37 @@ def decode_with_tiling(
                weighted_tile = tile_output_slice * blend_mask

                # Update output using slice assignment
-                output[:, :, t_out_start:t_out_end, h_out_start:h_out_end, w_out_start:w_out_end] = (
-                    output[:, :, t_out_start:t_out_end, h_out_start:h_out_end, w_out_start:w_out_end] + weighted_tile
+                output[
+                    :,
+                    :,
+                    t_out_start:t_out_end,
+                    h_out_start:h_out_end,
+                    w_out_start:w_out_end,
+                ] = (
+                    output[
+                        :,
+                        :,
+                        t_out_start:t_out_end,
+                        h_out_start:h_out_end,
+                        w_out_start:w_out_end,
+                    ]
+                    + weighted_tile
                )
-                weights[:, :, t_out_start:t_out_end, h_out_start:h_out_end, w_out_start:w_out_end] = (
-                    weights[:, :, t_out_start:t_out_end, h_out_start:h_out_end, w_out_start:w_out_end] + blend_mask
+                weights[
+                    :,
+                    :,
+                    t_out_start:t_out_end,
+                    h_out_start:h_out_end,
+                    w_out_start:w_out_end,
+                ] = (
+                    weights[
+                        :,
+                        :,
+                        t_out_start:t_out_end,
+                        h_out_start:h_out_end,
+                        w_out_start:w_out_end,
+                    ]
+                    + blend_mask
                )

                # Force evaluation to free memory
@@ -445,10 +547,12 @@ def decode_with_tiling(
                if next_tile_start_latent == 0:
                    next_tile_start_out = 0
                else:
-                    next_tile_start_out = 1 + (next_tile_start_latent - 1) * temporal_scale
+                    next_tile_start_out = (
+                        1 + (next_tile_start_latent - 1) * temporal_scale
+                    )

                # We need to track how many frames we've already emitted
-                if not hasattr(decode_with_tiling, '_emitted_frames'):
+                if not hasattr(decode_with_tiling, "_emitted_frames"):
                    decode_with_tiling._emitted_frames = 0
                emitted = decode_with_tiling._emitted_frames

@@ -456,7 +560,10 @@ def decode_with_tiling(
                    # Normalize and emit frames [emitted, next_tile_start_out)
                    finalized_weights = weights[:, :, emitted:next_tile_start_out, :, :]
                    finalized_weights = mx.maximum(finalized_weights, 1e-8)
-                    finalized_output = output[:, :, emitted:next_tile_start_out, :, :] / finalized_weights
+                    finalized_output = (
+                        output[:, :, emitted:next_tile_start_out, :, :]
+                        / finalized_weights
+                    )
                    finalized_output = finalized_output.astype(latents.dtype)
                    mx.eval(finalized_output)

@@ -473,7 +580,7 @@ def decode_with_tiling(

    # Emit remaining frames if callback provided
    if on_frames_ready is not None:
-        emitted = getattr(decode_with_tiling, '_emitted_frames', 0)
+        emitted = getattr(decode_with_tiling, "_emitted_frames", 0)
        if emitted < out_f:
            remaining_output = output[:, :, emitted:, :, :].astype(latents.dtype)
            mx.eval(remaining_output)
@@ -481,7 +588,7 @@ def decode_with_tiling(
            del remaining_output

    # Reset emitted frames counter for next call
-    if hasattr(decode_with_tiling, '_emitted_frames'):
+    if hasattr(decode_with_tiling, "_emitted_frames"):
        del decode_with_tiling._emitted_frames

    # Clean up weights
--- a/mlx_video/models/ltx_2/video_vae/video_vae.py
+++ b/mlx_video/models/ltx_2/video_vae/video_vae.py
@@ -8,12 +8,15 @@ import mlx.core as mx
 import mlx.nn as nn

 from mlx_video.models.ltx_2.video_vae.convolution import CausalConv3d, PaddingModeType
-from mlx_video.models.ltx_2.video_vae.ops import PerChannelStatistics, patchify, unpatchify
+from mlx_video.models.ltx_2.video_vae.ops import (
+    PerChannelStatistics,
+    patchify,
+    unpatchify,
+)
 from mlx_video.models.ltx_2.video_vae.resnet import (
    NormLayerType,
    ResnetBlock3D,
    UNetMidBlock3D,
-    get_norm_layer,
 )
 from mlx_video.models.ltx_2.video_vae.sampling import (
    DepthToSpaceUpsample,
@@ -24,6 +27,7 @@ from mlx_video.utils import PixelNorm

 class LogVarianceType(Enum):
    """Log variance mode for VAE."""
+
    PER_CHANNEL = "per_channel"
    UNIFORM = "uniform"
    CONSTANT = "constant"
@@ -229,7 +233,6 @@ class VideoEncoder(nn.Module):
            config: VideoEncoderModelConfig with encoder parameters
        """
        super().__init__()
-        from mlx_video.models.ltx_2.config import VideoEncoderModelConfig

        self.patch_size = config.patch_size
        self.norm_layer = config.norm_layer
@@ -241,10 +244,12 @@ class VideoEncoder(nn.Module):
        encoder_spatial_padding_mode = config.encoder_spatial_padding_mode

        # Per-channel statistics for normalizing latents
-        self.per_channel_statistics = PerChannelStatistics(latent_channels=config.out_channels)
+        self.per_channel_statistics = PerChannelStatistics(
+            latent_channels=config.out_channels
+        )

        # After patchify, channels increase by patch_size^2
-        in_channels = config.in_channels * config.patch_size ** 2
+        in_channels = config.in_channels * config.patch_size**2
        feature_channels = config.out_channels

        # Initial convolution
@@ -262,7 +267,11 @@ class VideoEncoder(nn.Module):
        # Use dict with int keys for MLX to track parameters (lists are NOT tracked)
        self.down_blocks = {}
        for idx, (block_name, block_params) in enumerate(encoder_blocks):
-            block_config = {"num_layers": block_params} if isinstance(block_params, int) else block_params
+            block_config = (
+                {"num_layers": block_params}
+                if isinstance(block_params, int)
+                else block_params
+            )

            block, feature_channels = _make_encoder_block(
                block_name=block_name,
@@ -291,7 +300,10 @@ class VideoEncoder(nn.Module):
        conv_out_channels = config.out_channels
        if config.latent_log_var == LogVarianceType.PER_CHANNEL:
            conv_out_channels *= 2
-        elif config.latent_log_var in {LogVarianceType.UNIFORM, LogVarianceType.CONSTANT}:
+        elif config.latent_log_var in {
+            LogVarianceType.UNIFORM,
+            LogVarianceType.CONSTANT,
+        }:
            conv_out_channels += 1

        self.conv_out = CausalConv3d(
@@ -349,13 +361,16 @@ class VideoEncoder(nn.Module):
        elif self.latent_log_var == LogVarianceType.CONSTANT:
            sample = sample[:, :-1, ...]
            approx_ln_0 = -30
-            sample = mx.concatenate([
-                sample,
-                mx.full_like(sample, approx_ln_0),
-            ], axis=1)
+            sample = mx.concatenate(
+                [
+                    sample,
+                    mx.full_like(sample, approx_ln_0),
+                ],
+                axis=1,
+            )

        # Split into means and logvar, normalize means
-        means = sample[:, :self.latent_channels, ...]
+        means = sample[:, : self.latent_channels, ...]
        return self.per_channel_statistics.normalize(means)

    def sanitize(self, weights: Dict[str, mx.array]) -> Dict[str, mx.array]:
@@ -409,6 +424,7 @@ class VideoEncoder(nn.Module):
            Loaded VideoEncoder instance
        """
        import json
+
        from mlx_video.models.ltx_2.config import VideoEncoderModelConfig

        # Load config
@@ -474,7 +490,7 @@ class VideoDecoder(nn.Module):
            decoder_blocks = []

        self.patch_size = patch_size
-        out_channels = out_channels * patch_size ** 2
+        out_channels = out_channels * patch_size**2
        self.causal = causal
        self.timestep_conditioning = timestep_conditioning
        self._norm_num_groups = self._DEFAULT_NORM_NUM_GROUPS
@@ -510,7 +526,11 @@ class VideoDecoder(nn.Module):
        # Use dict with int keys for MLX to track parameters (lists are NOT tracked)
        self.up_blocks = {}
        for idx, (block_name, block_params) in enumerate(reversed(decoder_blocks)):
-            block_config = {"num_layers": block_params} if isinstance(block_params, int) else block_params
+            block_config = (
+                {"num_layers": block_params}
+                if isinstance(block_params, int)
+                else block_params
+            )

            block, feature_channels = _make_decoder_block(
                block_name=block_name,