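Add STG (Spatiotemporal Guidance) and cross-modal (modality) guidance to audio-video generation for improved audio-visual integration. Enhance README.md with new usage examples and option descriptions for the STG and modality scale parameters. Update generate.py so the dev denoising loop supports STG and modality guidance alongside CFG/APG, takes a separate audio CFG scale (--audio-cfg-scale), and, in stage 2, re-noises the audio latents and refines them jointly with video instead of restoring the stage 1 audio. Refactor the attention module to include an option for skipping self-attention (value projection only), which is used for STG perturbation, and let the transformer block skip all cross-modal attention for modality isolation. Update LTXModel and transformer block processing to accept and forward the new parameters for more flexible model configurations.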

2026-03-14 10:26:12 +01:00
parent f346e09de4
commit 9cba2ea7cd
5 changed files with 200 additions and 78 deletions
--- a/README.md
+++ b/README.md
@@ -78,6 +78,10 @@ uv run mlx_video.generate --pipeline dev --prompt "Waves crashing" --image beach
 ```bash
 uv run mlx_video.generate --prompt "Ocean waves crashing" --audio
 uv run mlx_video.generate --pipeline dev --prompt "A jazz band playing" --audio --enhance-prompt
+
+# With full guidance (STG + modality_scale, matches PyTorch defaults)
+uv run mlx_video.generate --pipeline dev --prompt "Ocean waves crashing" --audio \
+    --stg-scale 1.0 --stg-blocks 29 --modality-scale 3.0
 ```

 ### LoRA
@@ -146,6 +150,9 @@ uv run mlx_video.upscale --input video.mp4 --output upscaled.mp4 --refine --prom
 | `--cfg-rescale` | 0.7 | CFG rescale factor (reduces over-saturation) |
 | `--negative-prompt` | (default) | Negative prompt for CFG |
 | `--apg` | false | Use Adaptive Projected Guidance (more stable for I2V) |
+| `--stg-scale` | 0.0 | STG (Spatiotemporal Guidance) scale; 0.0 disables it (PyTorch default: 1.0, requires `--audio` and `--stg-blocks`) |
+| `--stg-blocks` | None | Transformer block indices for STG; must be set for STG to take effect (e.g. `29` for LTX-2, `28` for LTX-2.3) |
+| `--modality-scale` | 1.0 | Cross-modal guidance scale (PyTorch default: 3.0, requires `--audio`) |

 **Dev-Two-Stage LoRA options:**

--- a/mlx_video/generate.py
+++ b/mlx_video/generate.py
@@ -715,22 +715,31 @@ def denoise_dev_av(
    transformer: LTXModel,
    sigmas: mx.array,
    cfg_scale: float = 4.0,
+    audio_cfg_scale: float = 7.0,
    cfg_rescale: float = 0.0,
    verbose: bool = True,
    video_state: Optional[LatentState] = None,
    use_apg: bool = False,
    apg_eta: float = 1.0,
    apg_norm_threshold: float = 0.0,
+    stg_scale: float = 0.0,
+    stg_video_blocks: Optional[list] = None,
+    stg_audio_blocks: Optional[list] = None,
+    modality_scale: float = 1.0,
 ) -> tuple[mx.array, mx.array]:
-    """Run denoising loop for dev pipeline with CFG/APG and audio.
+    """Run denoising loop for dev pipeline with CFG/APG, STG, modality guidance, and audio.

    Args:
-        cfg_rescale: Rescale factor for CFG (0.0-1.0). Higher values blend the CFG result
-                     towards the positive-only prediction, helping reduce artifacts.
-                     Default 0.0 means no rescaling (standard CFG).
+        audio_cfg_scale: Separate CFG scale for audio (PyTorch default: 7.0).
+        cfg_rescale: Rescale factor for CFG (0.0-1.0). Normalizes guided prediction
+                     variance to reduce artifacts. Default 0.0 means no rescaling.
        use_apg: Use Adaptive Projected Guidance instead of standard CFG for video.
        apg_eta: APG parallel component weight (1.0 = keep full parallel)
        apg_norm_threshold: APG guidance norm clamp (0 = no clamping)
+        stg_scale: STG (Spatiotemporal Guidance) scale. 0.0 = disabled.
+        stg_video_blocks: Transformer block indices for video STG perturbation.
+        stg_audio_blocks: Transformer block indices for audio STG perturbation.
+        modality_scale: Cross-modal guidance scale. 1.0 = disabled.
    """
    from mlx_video.models.ltx.rope import precompute_freqs_cis

@@ -738,14 +747,14 @@ def denoise_dev_av(
    if video_state is not None:
        video_latents = video_state.latent

-    # Keep latents in float32 throughout the denoising loop to avoid
-    # bfloat16 quantization noise accumulation over many steps.
-    # PyTorch keeps latents in float32; model input is cast to model dtype.
+    # Keep latents in float32 throughout the denoising loop for precision.
    video_latents = video_latents.astype(mx.float32)
    audio_latents = audio_latents.astype(mx.float32)

    sigmas_list = sigmas.tolist()
    use_cfg = cfg_scale != 1.0
+    use_stg = stg_scale != 0.0 and stg_video_blocks is not None
+    use_modality = modality_scale != 1.0
    num_steps = len(sigmas_list) - 1

    # Precompute video RoPE
@@ -782,7 +791,11 @@ def denoise_dev_av(
        console=console,
        disable=not verbose,
    ) as progress:
-        task = progress.add_task("[cyan]Denoising A/V (CFG)[/]", total=num_steps)
+        passes = ["CFG"] if use_cfg else []
+        if use_stg: passes.append("STG")
+        if use_modality: passes.append("Mod")
+        label = "+".join(passes) if passes else "uncond"
+        task = progress.add_task(f"[cyan]Denoising A/V ({label})[/]", total=num_steps)

        for i in range(num_steps):
            sigma = sigmas_list[i]
@@ -827,7 +840,6 @@ def denoise_dev_av(
            # This matches PyTorch's X0ModelWrapper: x0 = latent - timestep * velocity
            # For conditioned tokens (timestep=0): x0 = latent (velocity is irrelevant)
            # For unconditioned tokens (timestep=sigma): x0 = latent - sigma * velocity
-            # Use the float32 latents (not the bfloat16 model input) for precision
            video_flat_f32 = mx.transpose(mx.reshape(video_latents, (b, c, -1)), (0, 2, 1))
            audio_flat_f32 = mx.reshape(mx.transpose(audio_latents, (0, 2, 1, 3)), (ab, at, ac * af))
            video_timesteps_f32 = mx.expand_dims(video_timesteps.astype(mx.float32), axis=-1)
@@ -836,8 +848,12 @@ def denoise_dev_av(
            video_x0_pos_f32 = video_flat_f32 - video_timesteps_f32 * video_vel_pos.astype(mx.float32)
            audio_x0_pos_f32 = audio_flat_f32 - audio_timesteps_f32 * audio_vel_pos.astype(mx.float32)

+            # Start with positive prediction
+            video_x0_guided_f32 = video_x0_pos_f32
+            audio_x0_guided_f32 = audio_x0_pos_f32
+
+            # Pass 2: CFG (negative conditioning)
            if use_cfg:
-                # Negative conditioning pass
                video_modality_neg = Modality(
                    latent=video_flat, timesteps=video_timesteps, positions=video_positions,
                    context=video_embeddings_neg, context_mask=None, enabled=True,
@@ -851,36 +867,54 @@ def denoise_dev_av(
                video_vel_neg, audio_vel_neg = transformer(video=video_modality_neg, audio=audio_modality_neg)
                mx.eval(video_vel_neg, audio_vel_neg)

-                # Convert negative velocity to x0 using per-token timesteps
                video_x0_neg_f32 = video_flat_f32 - video_timesteps_f32 * video_vel_neg.astype(mx.float32)
                audio_x0_neg_f32 = audio_flat_f32 - audio_timesteps_f32 * audio_vel_neg.astype(mx.float32)

-                # Apply guidance to x0 (denoised) predictions
-                # For conditioned tokens: x0_pos = x0_neg = latent, so delta = 0 (no effect)
                if use_apg:
-                    # APG for video (more stable for I2V), standard CFG for audio
                    video_x0_guided_f32 = video_x0_pos_f32 + apg_delta(
                        video_x0_pos_f32, video_x0_neg_f32, cfg_scale,
                        eta=apg_eta, norm_threshold=apg_norm_threshold
                    )
                else:
                    video_x0_guided_f32 = video_x0_pos_f32 + (cfg_scale - 1.0) * (video_x0_pos_f32 - video_x0_neg_f32)
-                # Always use standard CFG for audio
-                audio_x0_guided_f32 = audio_x0_pos_f32 + (cfg_scale - 1.0) * (audio_x0_pos_f32 - audio_x0_neg_f32)
+                audio_x0_guided_f32 = audio_x0_pos_f32 + (audio_cfg_scale - 1.0) * (audio_x0_pos_f32 - audio_x0_neg_f32)

-                # Apply CFG rescale if enabled (std-ratio rescaling to reduce over-saturation)
-                # factor = rescale * (cond_std / pred_std) + (1 - rescale)
-                # pred = pred * factor
-                if cfg_rescale > 0.0:
-                    v_factor = video_x0_pos_f32.std() / (video_x0_guided_f32.std() + 1e-8)
-                    v_factor = cfg_rescale * v_factor + (1.0 - cfg_rescale)
-                    video_x0_guided_f32 = video_x0_guided_f32 * v_factor
-                    a_factor = audio_x0_pos_f32.std() / (audio_x0_guided_f32.std() + 1e-8)
-                    a_factor = cfg_rescale * a_factor + (1.0 - cfg_rescale)
-                    audio_x0_guided_f32 = audio_x0_guided_f32 * a_factor
-            else:
-                video_x0_guided_f32 = video_x0_pos_f32
-                audio_x0_guided_f32 = audio_x0_pos_f32
+            # Pass 3: STG (self-attention perturbation at specified blocks)
+            if use_stg:
+                video_vel_ptb, audio_vel_ptb = transformer(
+                    video=video_modality_pos, audio=audio_modality_pos,
+                    stg_video_blocks=stg_video_blocks, stg_audio_blocks=stg_audio_blocks,
+                )
+                mx.eval(video_vel_ptb, audio_vel_ptb)
+
+                video_x0_ptb_f32 = video_flat_f32 - video_timesteps_f32 * video_vel_ptb.astype(mx.float32)
+                audio_x0_ptb_f32 = audio_flat_f32 - audio_timesteps_f32 * audio_vel_ptb.astype(mx.float32)
+
+                video_x0_guided_f32 = video_x0_guided_f32 + stg_scale * (video_x0_pos_f32 - video_x0_ptb_f32)
+                audio_x0_guided_f32 = audio_x0_guided_f32 + stg_scale * (audio_x0_pos_f32 - audio_x0_ptb_f32)
+
+            # Pass 4: Modality isolation (skip all cross-modal attention)
+            if use_modality:
+                video_vel_iso, audio_vel_iso = transformer(
+                    video=video_modality_pos, audio=audio_modality_pos,
+                    skip_cross_modal=True,
+                )
+                mx.eval(video_vel_iso, audio_vel_iso)
+
+                video_x0_iso_f32 = video_flat_f32 - video_timesteps_f32 * video_vel_iso.astype(mx.float32)
+                audio_x0_iso_f32 = audio_flat_f32 - audio_timesteps_f32 * audio_vel_iso.astype(mx.float32)
+
+                video_x0_guided_f32 = video_x0_guided_f32 + (modality_scale - 1.0) * (video_x0_pos_f32 - video_x0_iso_f32)
+                audio_x0_guided_f32 = audio_x0_guided_f32 + (modality_scale - 1.0) * (audio_x0_pos_f32 - audio_x0_iso_f32)
+
+            # Apply CFG rescale (std-ratio rescaling to reduce over-saturation)
+            if cfg_rescale > 0.0 and (use_cfg or use_stg or use_modality):
+                v_factor = video_x0_pos_f32.std() / (video_x0_guided_f32.std() + 1e-8)
+                v_factor = cfg_rescale * v_factor + (1.0 - cfg_rescale)
+                video_x0_guided_f32 = video_x0_guided_f32 * v_factor
+                a_factor = audio_x0_pos_f32.std() / (audio_x0_guided_f32.std() + 1e-8)
+                a_factor = cfg_rescale * a_factor + (1.0 - cfg_rescale)
+                audio_x0_guided_f32 = audio_x0_guided_f32 * a_factor

            # Reshape x0 from token space (b, tokens, c) to spatial (b, c, f, h, w)
            video_denoised_f32 = mx.reshape(mx.transpose(video_x0_guided_f32, (0, 2, 1)), (b, c, f, h, w))
@@ -898,8 +932,7 @@ def denoise_dev_av(

            mx.eval(video_denoised_f32, audio_denoised_f32)

-            # Euler step matching PyTorch: sample + velocity * dt
-            # Latents stay in float32 throughout (matching PyTorch behavior)
+            # Euler step: sample + velocity * dt (float32)
            if sigma_next > 0:
                sigma_next_f32 = mx.array(sigma_next, dtype=mx.float32)
                dt_f32 = sigma_next_f32 - sigma_f32
@@ -998,6 +1031,7 @@ def generate_video(
    num_frames: int = 33,
    num_inference_steps: int = 40,
    cfg_scale: float = 4.0,
+    audio_cfg_scale: float = 7.0,
    cfg_rescale: float = 0.0,
    seed: int = 42,
    fps: int = 24,
@@ -1017,6 +1051,9 @@ def generate_video(
    use_apg: bool = False,
    apg_eta: float = 1.0,
    apg_norm_threshold: float = 0.0,
+    stg_scale: float = 0.0,
+    stg_blocks: Optional[list] = None,
+    modality_scale: float = 1.0,
    lora_path: Optional[str] = None,
    lora_strength: float = 1.0,
 ):
@@ -1086,7 +1123,10 @@ def generate_video(
    console.print(f"[dim]Prompt: {prompt[:80]}{'...' if len(prompt) > 80 else ''}[/]")

    if pipeline in (PipelineType.DEV, PipelineType.DEV_TWO_STAGE):
-        console.print(f"[dim]Steps: {num_inference_steps}, CFG: {cfg_scale}, Rescale: {cfg_rescale}[/]")
+        audio_cfg_info = f", Audio CFG: {audio_cfg_scale}" if audio else ""
+        stg_info = f", STG: {stg_scale} blocks={stg_blocks}" if stg_scale != 0.0 else ""
+        mod_info = f", Modality: {modality_scale}" if modality_scale != 1.0 else ""
+        console.print(f"[dim]Steps: {num_inference_steps}, CFG: {cfg_scale}{audio_cfg_info}, Rescale: {cfg_rescale}{stg_info}{mod_info}[/]")

    if is_i2v:
        console.print(f"[dim]Image: {image} (strength={image_strength}, frame={image_frame_idx})[/]")
@@ -1268,10 +1308,6 @@ def generate_video(
        positions = create_position_grid(1, latent_frames, stage2_h, stage2_w)
        mx.eval(positions)

-        # Save stage 1 audio latents — stage 2 only refines video (spatial upsampling).
-        # Audio is already fully denoised from stage 1; re-noising would destroy the signal.
-        stage1_audio_latents = audio_latents
-
        state2 = None
        if is_i2v and stage2_image_latent is not None:
            state2 = LatentState(
@@ -1299,13 +1335,20 @@ def generate_video(
            latents = noise * noise_scale + latents * one_minus_scale
            mx.eval(latents)

-        # Stage 2 refines video only (no audio re-denoising)
-        latents, _ = denoise_distilled(
+        # Re-noise audio at sigma=0.909375 for joint refinement (matches PyTorch)
+        if audio and audio_latents is not None:
+            audio_noise = mx.random.normal(audio_latents.shape, dtype=model_dtype)
+            audio_noise_scale = mx.array(STAGE_2_SIGMAS[0], dtype=model_dtype)
+            audio_latents = audio_noise * audio_noise_scale + audio_latents * (mx.array(1.0, dtype=model_dtype) - audio_noise_scale)
+            mx.eval(audio_latents)
+
+        # Joint video + audio refinement (no CFG, positive embeddings only)
+        latents, audio_latents = denoise_distilled(
            latents, positions, text_embeddings, transformer, STAGE_2_SIGMAS,
            verbose=verbose, state=state2,
+            audio_latents=audio_latents, audio_positions=audio_positions,
+            audio_embeddings=audio_embeddings if audio else None,
        )
-        # Restore audio latents from stage 1
-        audio_latents = stage1_audio_latents

    elif pipeline == PipelineType.DEV:
        # ======================================================================
@@ -1371,7 +1414,7 @@ def generate_video(
            latents = mx.random.normal(video_latent_shape, dtype=model_dtype)
            mx.eval(latents)

-        # Denoise with CFG/APG
+        # Denoise with CFG/APG/STG/modality
        if audio:
            latents, audio_latents = denoise_dev_av(
                latents, audio_latents,
@@ -1379,8 +1422,11 @@ def generate_video(
                video_embeddings_pos, video_embeddings_neg,
                audio_embeddings_pos, audio_embeddings_neg,
                transformer, sigmas, cfg_scale=cfg_scale,
+                audio_cfg_scale=audio_cfg_scale,
                cfg_rescale=cfg_rescale, verbose=verbose, video_state=video_state,
-                use_apg=use_apg, apg_eta=apg_eta, apg_norm_threshold=apg_norm_threshold
+                use_apg=use_apg, apg_eta=apg_eta, apg_norm_threshold=apg_norm_threshold,
+                stg_scale=stg_scale, stg_video_blocks=stg_blocks,
+                stg_audio_blocks=stg_blocks, modality_scale=modality_scale,
            )
        else:
            # Use original denoise_dev with computed sigmas
@@ -1469,7 +1515,7 @@ def generate_video(
            latents = mx.random.normal(stage1_shape, dtype=model_dtype)
            mx.eval(latents)

-        # Run stage 1 with dev-style CFG denoising
+        # Stage 1: Joint AV denoising at half resolution (matches PyTorch)
        if audio:
            latents, audio_latents = denoise_dev_av(
                latents, audio_latents,
@@ -1477,8 +1523,11 @@ def generate_video(
                video_embeddings_pos, video_embeddings_neg,
                audio_embeddings_pos, audio_embeddings_neg,
                transformer, sigmas, cfg_scale=cfg_scale,
+                audio_cfg_scale=audio_cfg_scale,
                cfg_rescale=cfg_rescale, verbose=verbose, video_state=state1,
-                use_apg=use_apg, apg_eta=apg_eta, apg_norm_threshold=apg_norm_threshold
+                use_apg=use_apg, apg_eta=apg_eta, apg_norm_threshold=apg_norm_threshold,
+                stg_scale=stg_scale, stg_video_blocks=stg_blocks,
+                stg_audio_blocks=stg_blocks, modality_scale=modality_scale,
            )
        else:
            latents = denoise_dev(
@@ -1490,6 +1539,9 @@ def generate_video(
                use_apg=use_apg, apg_eta=apg_eta, apg_norm_threshold=apg_norm_threshold
            )

+        if audio and audio_latents is not None:
+            mx.eval(audio_latents)
+
        # Upsample latents 2x
        with console.status("[magenta]🔍 Upsampling latents 2x...[/]", spinner="dots"):
            upscaler_files = sorted(model_path.glob("*spatial-upscaler-x2*.safetensors"))
@@ -1522,14 +1574,12 @@ def generate_video(
                load_and_merge_lora(transformer, lora_path, strength=lora_strength)

        # Stage 2: Distilled refinement at full resolution (no CFG)
+        # Matches PyTorch: re-noise audio at sigma=0.909375, then jointly refine
+        # both video and audio through the distilled schedule using the LoRA-merged model.
        console.print(f"\n[bold yellow]⚡ Stage 2:[/] Distilled refining at {width}x{height} (3 steps, no CFG)")
        positions = create_position_grid(1, latent_frames, stage2_h, stage2_w)
        mx.eval(positions)

-        # Save stage 1 audio latents — stage 2 only refines video (spatial upsampling).
-        # Audio is already fully denoised from stage 1; re-noising would destroy the signal.
-        stage1_audio_latents = audio_latents
-
        state2 = None
        if is_i2v and stage2_image_latent is not None:
            state2 = LatentState(
@@ -1557,13 +1607,20 @@ def generate_video(
            latents = noise * noise_scale + latents * one_minus_scale
            mx.eval(latents)

-        # Stage 2 refines video only (no audio re-denoising)
-        latents, _ = denoise_distilled(
+        # Re-noise audio at sigma=0.909375 for joint refinement (matches PyTorch)
+        if audio and audio_latents is not None:
+            audio_noise = mx.random.normal(audio_latents.shape, dtype=model_dtype)
+            audio_noise_scale = mx.array(STAGE_2_SIGMAS[0], dtype=model_dtype)
+            audio_latents = audio_noise * audio_noise_scale + audio_latents * (mx.array(1.0, dtype=model_dtype) - audio_noise_scale)
+            mx.eval(audio_latents)
+
+        # Joint video + audio refinement (no CFG, positive embeddings only)
+        latents, audio_latents = denoise_distilled(
            latents, positions, text_embeddings, transformer, STAGE_2_SIGMAS,
            verbose=verbose, state=state2,
+            audio_latents=audio_latents, audio_positions=audio_positions,
+            audio_embeddings=audio_embeddings_pos if audio else None,
        )
-        # Restore audio latents from stage 1
-        audio_latents = stage1_audio_latents

    del transformer
    mx.clear_cache()
@@ -1685,6 +1742,7 @@ def generate_video(

            mel_spectrogram = audio_decoder(audio_latents)
            mx.eval(mel_spectrogram)
+            console.print(f"[dim]  Mel spectrogram: shape={mel_spectrogram.shape}, std={mel_spectrogram.std().item():.4f}, mean={mel_spectrogram.mean().item():.4f}[/]")

            audio_waveform = vocoder(mel_spectrogram)
            mx.eval(audio_waveform)
@@ -1771,7 +1829,8 @@ Examples:
    parser.add_argument("--width", "-W", type=int, default=512, help="Output video width")
    parser.add_argument("--num-frames", "-n", type=int, default=33, help="Number of frames")
    parser.add_argument("--steps", type=int, default=30, help="Number of inference steps (dev pipeline only, default 30)")
-    parser.add_argument("--cfg-scale", type=float, default=3.0, help="CFG guidance scale (dev pipeline only, default 3.0)")
+    parser.add_argument("--cfg-scale", type=float, default=3.0, help="CFG guidance scale for video (dev pipeline only, default 3.0)")
+    parser.add_argument("--audio-cfg-scale", type=float, default=7.0, help="CFG guidance scale for audio (default 7.0, PyTorch default)")
    parser.add_argument("--cfg-rescale", type=float, default=0.7, help="CFG rescale factor (0.0-1.0). Normalizes guided prediction variance to reduce artifacts (dev pipeline only, default 0.7)")
    parser.add_argument("--seed", "-s", type=int, default=42, help="Random seed")
    parser.add_argument("--fps", type=int, default=24, help="Frames per second")
@@ -1795,6 +1854,9 @@ Examples:
    parser.add_argument("--apg", action="store_true", help="Use Adaptive Projected Guidance instead of CFG (more stable for I2V)")
    parser.add_argument("--apg-eta", type=float, default=1.0, help="APG parallel component weight (1.0 = keep full parallel)")
    parser.add_argument("--apg-norm-threshold", type=float, default=0.0, help="APG guidance norm clamp (0 = no clamping)")
+    parser.add_argument("--stg-scale", type=float, default=0.0, help="STG (Spatiotemporal Guidance) scale (default 0.0 = disabled, PyTorch default: 1.0)")
+    parser.add_argument("--stg-blocks", type=int, nargs="+", default=None, help="Transformer block indices for STG perturbation (default: [29] for LTX-2, [28] for LTX-2.3)")
+    parser.add_argument("--modality-scale", type=float, default=1.0, help="Cross-modal guidance scale (default 1.0 = disabled, PyTorch default: 3.0)")
    parser.add_argument("--lora-path", type=str, default=None, help="Path to LoRA safetensors file (dev-two-stage pipeline)")
    parser.add_argument("--lora-strength", type=float, default=1.0, help="LoRA merge strength (dev-two-stage pipeline, default 1.0)")
    args = parser.parse_args()
@@ -1817,6 +1879,7 @@ Examples:
        num_frames=args.num_frames,
        num_inference_steps=args.steps,
        cfg_scale=args.cfg_scale,
+        audio_cfg_scale=args.audio_cfg_scale,
        cfg_rescale=args.cfg_rescale,
        seed=args.seed,
        fps=args.fps,
@@ -1836,6 +1899,9 @@ Examples:
        use_apg=args.apg,
        apg_eta=args.apg_eta,
        apg_norm_threshold=args.apg_norm_threshold,
+        stg_scale=args.stg_scale,
+        stg_blocks=args.stg_blocks,
+        modality_scale=args.modality_scale,
        lora_path=args.lora_path,
        lora_strength=args.lora_strength,
    )
--- a/mlx_video/models/ltx/attention.py
+++ b/mlx_video/models/ltx/attention.py
@@ -101,6 +101,7 @@ class Attention(nn.Module):
        mask: Optional[mx.array] = None,
        pe: Optional[Tuple[mx.array, mx.array]] = None,
        k_pe: Optional[Tuple[mx.array, mx.array]] = None,
+        skip_attention: bool = False,
    ) -> mx.array:
        """Forward pass.

@@ -110,6 +111,8 @@ class Attention(nn.Module):
            mask: Attention mask
            pe: Position embeddings for query (and key if k_pe is None)
            k_pe: Position embeddings for key (optional, uses pe if None)
+            skip_attention: If True, bypass Q*K*V attention and use value projection
+                only (for STG perturbation). Matches PyTorch all_perturbed=True.

        Returns:
            Attention output of shape (B, seq_len, query_dim)
@@ -119,24 +122,26 @@ class Attention(nn.Module):
        if hasattr(self, "to_gate_logits"):
            gate = 2.0 * mx.sigmoid(self.to_gate_logits(x))  # (B, seq, heads)

-        # Compute Q, K, V
-        q = self.to_q(x)
        context = x if context is None else context
-        k = self.to_k(context)
        v = self.to_v(context)

-        # Apply normalization
-        q = self.q_norm(q)
-        k = self.k_norm(k)
+        if skip_attention:
+            # STG: bypass Q*K*V attention, use value projection only
+            out = v
+        else:
+            # Standard attention
+            q = self.to_q(x)
+            k = self.to_k(context)

-        # Apply rotary position embeddings
-        if pe is not None:
-            q = apply_rotary_emb(q, pe, self.rope_type)
-            k_pe_to_use = pe if k_pe is None else k_pe
-            k = apply_rotary_emb(k, k_pe_to_use, self.rope_type)
+            q = self.q_norm(q)
+            k = self.k_norm(k)

-        # Compute attention
-        out = scaled_dot_product_attention(q, k, v, self.heads, mask)
+            if pe is not None:
+                q = apply_rotary_emb(q, pe, self.rope_type)
+                k_pe_to_use = pe if k_pe is None else k_pe
+                k = apply_rotary_emb(k, k_pe_to_use, self.rope_type)
+
+            out = scaled_dot_product_attention(q, k, v, self.heads, mask)

        # Apply per-head gating
        if gate is not None:
--- a/mlx_video/models/ltx/ltx.py
+++ b/mlx_video/models/ltx/ltx.py
@@ -453,10 +453,26 @@ class LTXModel(nn.Module):
        self,
        video: Optional[TransformerArgs],
        audio: Optional[TransformerArgs],
+        stg_video_blocks: Optional[List[int]] = None,
+        stg_audio_blocks: Optional[List[int]] = None,
+        skip_cross_modal: bool = False,
    ) -> Tuple[Optional[TransformerArgs], Optional[TransformerArgs]]:
-        """Process through all transformer blocks."""
-        for block in self.transformer_blocks.values():
-            video, audio = block(video=video, audio=audio)
+        """Process through all transformer blocks.
+
+        Args:
+            stg_video_blocks: Block indices where video self-attention is skipped (STG).
+            stg_audio_blocks: Block indices where audio self-attention is skipped (STG).
+            skip_cross_modal: Skip all A2V/V2A cross-attention (modality isolation).
+        """
+        stg_v_set = set(stg_video_blocks) if stg_video_blocks else set()
+        stg_a_set = set(stg_audio_blocks) if stg_audio_blocks else set()
+        for idx, block in self.transformer_blocks.items():
+            video, audio = block(
+                video=video, audio=audio,
+                skip_video_self_attn=(idx in stg_v_set),
+                skip_audio_self_attn=(idx in stg_a_set),
+                skip_cross_modal=skip_cross_modal,
+            )
        return video, audio

    def _process_output(
@@ -490,8 +506,19 @@ class LTXModel(nn.Module):
        self,
        video: Optional[Modality] = None,
        audio: Optional[Modality] = None,
+        stg_video_blocks: Optional[List[int]] = None,
+        stg_audio_blocks: Optional[List[int]] = None,
+        skip_cross_modal: bool = False,
    ) -> Tuple[Optional[mx.array], Optional[mx.array]]:
-       
+        """Forward pass.
+
+        Args:
+            video: Video modality input.
+            audio: Audio modality input.
+            stg_video_blocks: Block indices where video self-attention is skipped (STG).
+            stg_audio_blocks: Block indices where audio self-attention is skipped (STG).
+            skip_cross_modal: Skip all A2V/V2A cross-attention (modality isolation).
+        """
        # Validate inputs
        if not self.model_type.is_video_enabled() and video is not None:
            raise ValueError("Video is not enabled for this model")
@@ -506,6 +533,9 @@ class LTXModel(nn.Module):
        video_out, audio_out = self._process_transformer_blocks(
            video=video_args,
            audio=audio_args,
+            stg_video_blocks=stg_video_blocks,
+            stg_audio_blocks=stg_audio_blocks,
+            skip_cross_modal=skip_cross_modal,
        )

        # Process outputs
@@ -603,9 +633,17 @@ class X0Model(nn.Module):
        self,
        video: Optional[Modality] = None,
        audio: Optional[Modality] = None,
+        stg_video_blocks: Optional[List[int]] = None,
+        stg_audio_blocks: Optional[List[int]] = None,
+        skip_cross_modal: bool = False,
    ) -> Tuple[Optional[mx.array], Optional[mx.array]]:
-       
-        vx, ax = self.velocity_model(video, audio)
+
+        vx, ax = self.velocity_model(
+            video, audio,
+            stg_video_blocks=stg_video_blocks,
+            stg_audio_blocks=stg_audio_blocks,
+            skip_cross_modal=skip_cross_modal,
+        )

        denoised_video = to_denoised(video.latent, vx, video.timesteps) if vx is not None else None
        denoised_audio = to_denoised(audio.latent, ax, audio.timesteps) if ax is not None else None
--- a/mlx_video/models/ltx/transformer.py
+++ b/mlx_video/models/ltx/transformer.py
@@ -234,12 +234,18 @@ class BasicAVTransformerBlock(nn.Module):
        self,
        video: Optional[TransformerArgs] = None,
        audio: Optional[TransformerArgs] = None,
+        skip_video_self_attn: bool = False,
+        skip_audio_self_attn: bool = False,
+        skip_cross_modal: bool = False,
    ) -> Tuple[Optional[TransformerArgs], Optional[TransformerArgs]]:
        """Forward pass through transformer block.

        Args:
            video: Video modality arguments
            audio: Audio modality arguments
+            skip_video_self_attn: Skip video self-attention (for STG perturbation)
+            skip_audio_self_attn: Skip audio self-attention (for STG perturbation)
+            skip_cross_modal: Skip all cross-modal attention (for modality isolation)

        Returns:
            Tuple of (updated_video, updated_audio) TransformerArgs
@@ -252,8 +258,8 @@ class BasicAVTransformerBlock(nn.Module):
        # Check which modalities to run
        run_vx = video is not None and video.enabled and vx.size > 0
        run_ax = audio is not None and audio.enabled and ax.size > 0
-        run_a2v = run_vx and (audio is not None and audio.enabled and ax.size > 0)
-        run_v2a = run_ax and (video is not None and video.enabled and vx.size > 0)
+        run_a2v = run_vx and (audio is not None and audio.enabled and ax.size > 0) and not skip_cross_modal
+        run_v2a = run_ax and (video is not None and video.enabled and vx.size > 0) and not skip_cross_modal

        # Process video self-attention and cross-attention with text
        if run_vx:
@@ -261,9 +267,9 @@ class BasicAVTransformerBlock(nn.Module):
                self.scale_shift_table, vx.shape[0], video.timesteps, slice(0, 3)
            )

-            # Self-attention with RoPE
+            # Self-attention with RoPE (skip_attention=True for STG perturbation)
            norm_vx = rms_norm(vx, eps=self.norm_eps) * (1 + vscale_msa) + vshift_msa
-            vx = vx + self.attn1(norm_vx, pe=video.positional_embeddings) * vgate_msa
+            vx = vx + self.attn1(norm_vx, pe=video.positional_embeddings, skip_attention=skip_video_self_attn) * vgate_msa

            # Cross-attention with text context
            if self.has_prompt_adaln:
@@ -290,9 +296,9 @@ class BasicAVTransformerBlock(nn.Module):
                self.audio_scale_shift_table, ax.shape[0], audio.timesteps, slice(0, 3)
            )

-            # Self-attention with RoPE
+            # Self-attention with RoPE (skip_attention=True for STG perturbation)
            norm_ax = rms_norm(ax, eps=self.norm_eps) * (1 + ascale_msa) + ashift_msa
-            ax = ax + self.audio_attn1(norm_ax, pe=audio.positional_embeddings) * agate_msa
+            ax = ax + self.audio_attn1(norm_ax, pe=audio.positional_embeddings, skip_attention=skip_audio_self_attn) * agate_msa

            # Cross-attention with text context
            if self.has_prompt_adaln: