"""Unified video and audio-video generation pipeline for LTX-2. Supports both distilled (two-stage with upsampling) and dev (single-stage with CFG) pipelines. """ import argparse import math import time from enum import Enum from pathlib import Path from typing import Optional import mlx.core as mx import numpy as np from PIL import Image from rich.console import Console from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn, TimeRemainingColumn from rich.panel import Panel # Rich console for styled output console = Console() from mlx_video.models.ltx.ltx import LTXModel from mlx_video.models.ltx.transformer import Modality from mlx_video.utils import to_denoised, load_image, prepare_image_for_encoding, get_model_path from mlx_video.models.ltx.video_vae.decoder import VideoDecoder from mlx_video.models.ltx.video_vae import VideoEncoder from mlx_video.models.ltx.video_vae.tiling import TilingConfig from mlx_video.models.ltx.upsampler import load_upsampler, upsample_latents from mlx_video.conditioning import VideoConditionByLatentIndex, apply_conditioning from mlx_video.conditioning.latent import LatentState, apply_denoise_mask class PipelineType(Enum): """Pipeline type selector.""" DISTILLED = "distilled" # Two-stage with upsampling, fixed sigmas, no CFG DEV = "dev" # Single-stage, dynamic sigmas, CFG DEV_TWO_STAGE = "dev-two-stage" # Two-stage: dev (half res, CFG) + distilled LoRA (full res) # Distilled model sigma schedules STAGE_1_SIGMAS = [1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875, 0.0] STAGE_2_SIGMAS = [0.909375, 0.725, 0.421875, 0.0] # Dev model scheduling constants BASE_SHIFT_ANCHOR = 1024 MAX_SHIFT_ANCHOR = 4096 # Audio constants AUDIO_SAMPLE_RATE = 24000 # Output audio sample rate AUDIO_LATENT_SAMPLE_RATE = 16000 # VAE internal sample rate AUDIO_HOP_LENGTH = 160 AUDIO_LATENT_DOWNSAMPLE_FACTOR = 4 AUDIO_LATENT_CHANNELS = 8 # Latent channels before patchifying AUDIO_MEL_BINS = 16 AUDIO_LATENTS_PER_SECOND = AUDIO_LATENT_SAMPLE_RATE / AUDIO_HOP_LENGTH / AUDIO_LATENT_DOWNSAMPLE_FACTOR # 25 # Default negative prompt for CFG (dev pipeline) # Matches PyTorch LTX-2 reference DEFAULT_NEGATIVE_PROMPT from constants.py DEFAULT_NEGATIVE_PROMPT = ( "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, " "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, " "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, " "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of " "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent " "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny " "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, " "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, " "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward " "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, " "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts." ) def load_and_merge_lora( model: LTXModel, lora_path: str, strength: float = 1.0, ) -> None: """Load LoRA weights and merge them into the transformer model in-place. Supports two formats: - Raw PyTorch: keys like diffusion_model.{module}.lora_A.weight (needs sanitization) - Pre-converted MLX: keys like {module}.lora_A.weight (already sanitized) Merge formula: weight += (lora_B * strength) @ lora_A Args: model: The LTXModel transformer to merge into lora_path: Path to the LoRA safetensors file or directory containing one strength: LoRA strength/coefficient (default 1.0) """ # Resolve path: local file/dir or HuggingFace repo lora_file = Path(lora_path) if lora_file.is_file(): pass # direct file path elif lora_file.is_dir(): # Local directory: find safetensors inside candidates = sorted(lora_file.glob("*.safetensors")) if not candidates: raise FileNotFoundError(f"No .safetensors files found in {lora_path}") # Prefer distilled-lora files over full model weights lora_candidates = [c for c in candidates if "distilled-lora" in c.name] lora_file = lora_candidates[0] if lora_candidates else candidates[0] console.print(f"[dim]Using LoRA file: {lora_file.name}[/]") else: # Treat as HuggingFace repo ID lora_dir = get_model_path(lora_path) candidates = sorted(lora_dir.glob("*.safetensors")) if not candidates: raise FileNotFoundError(f"No .safetensors files found in {lora_dir}") # Prefer distilled-lora files over full model weights lora_candidates = [c for c in candidates if "distilled-lora" in c.name] lora_file = lora_candidates[0] if lora_candidates else candidates[0] console.print(f"[dim]Using LoRA from repo: {lora_path} ({lora_file.name})[/]") # Load LoRA weights lora_weights = mx.load(str(lora_file)) # Detect format: raw PyTorch has 'diffusion_model.' prefix has_prefix = any(k.startswith("diffusion_model.") for k in lora_weights) # Group into A/B pairs by module name lora_pairs = {} for key in lora_weights: module_key = key if has_prefix: if not key.startswith("diffusion_model."): continue module_key = key.replace("diffusion_model.", "") if module_key.endswith(".lora_A.weight"): base_key = module_key.replace(".lora_A.weight", "") lora_pairs.setdefault(base_key, {})["A"] = lora_weights[key] elif module_key.endswith(".lora_B.weight"): base_key = module_key.replace(".lora_B.weight", "") lora_pairs.setdefault(base_key, {})["B"] = lora_weights[key] # Apply key sanitization only for raw PyTorch format # Replacements handle both mid-string and end-of-string positions # since LoRA base keys end at the module name without trailing dot _LORA_KEY_REPLACEMENTS = [ (".to_out.0", ".to_out"), (".ff.net.0.proj", ".ff.proj_in"), (".ff.net.2", ".ff.proj_out"), (".audio_ff.net.0.proj", ".audio_ff.proj_in"), (".audio_ff.net.2", ".audio_ff.proj_out"), (".linear_1", ".linear1"), (".linear_2", ".linear2"), ] if has_prefix: sanitized_pairs = {} for key, pair in lora_pairs.items(): new_key = key for old, new in _LORA_KEY_REPLACEMENTS: if new_key.endswith(old): new_key = new_key[:-len(old)] + new else: new_key = new_key.replace(old + ".", new + ".") sanitized_pairs[new_key] = pair else: sanitized_pairs = lora_pairs # Get current model weights as a flat dict (references, not copies) def flatten_params(params, prefix=""): flat = {} for k, v in params.items(): full_key = f"{prefix}.{k}" if prefix else k if isinstance(v, dict): flat.update(flatten_params(v, full_key)) else: flat[full_key] = v return flat flat_weights = flatten_params(dict(model.parameters())) # Merge LoRA deltas in batches to avoid doubling memory merged_count = 0 batch = [] batch_size = 100 # merge 100 weights at a time, then eval to free intermediates for module_key, pair in sanitized_pairs.items(): if "A" not in pair or "B" not in pair: continue weight_key = f"{module_key}.weight" if weight_key not in flat_weights: continue lora_a = pair["A"].astype(mx.float32) # (rank, in_features) lora_b = pair["B"].astype(mx.float32) # (out_features, rank) # delta = (lora_B * strength) @ lora_A delta = (lora_b * strength) @ lora_a base_weight = flat_weights.pop(weight_key) merged_weight = (base_weight.astype(mx.float32) + delta).astype(base_weight.dtype) batch.append((weight_key, merged_weight)) del base_weight merged_count += 1 if len(batch) >= batch_size: model.load_weights(batch, strict=False) mx.eval(model.parameters()) batch.clear() if batch: model.load_weights(batch, strict=False) mx.eval(model.parameters()) batch.clear() del flat_weights, lora_weights mx.clear_cache() console.print(f"[green]✓[/] Merged {merged_count} LoRA pairs (strength={strength})") def cfg_delta(cond: mx.array, uncond: mx.array, scale: float) -> mx.array: """Compute CFG delta for classifier-free guidance. Args: cond: Conditional prediction uncond: Unconditional prediction scale: CFG guidance scale Returns: Delta to add to unconditional for CFG: (scale - 1) * (cond - uncond) """ return (scale - 1.0) * (cond - uncond) def apg_delta( cond: mx.array, uncond: mx.array, scale: float, eta: float = 1.0, norm_threshold: float = 0.0, ) -> mx.array: """Compute APG (Adaptive Projected Guidance) delta. Decomposes guidance into parallel and orthogonal components relative to the conditional prediction, providing more stable guidance for I2V. Based on: https://arxiv.org/abs/2407.12173 Args: cond: Conditional prediction (x0_pos) uncond: Unconditional prediction (x0_neg) scale: Guidance strength (same as CFG scale) eta: Weight for parallel component (1.0 = keep full parallel) norm_threshold: Clamp guidance norm to this value (0 = no clamping) Returns: Delta to add to unconditional for APG guidance """ guidance = cond - uncond # Optionally clamp guidance norm for stability if norm_threshold > 0: guidance_norm = mx.sqrt(mx.sum(guidance ** 2, axis=(-1, -2, -3), keepdims=True) + 1e-8) scale_factor = mx.minimum(mx.ones_like(guidance_norm), norm_threshold / guidance_norm) guidance = guidance * scale_factor # Project guidance onto cond direction batch_size = cond.shape[0] cond_flat = mx.reshape(cond, (batch_size, -1)) guidance_flat = mx.reshape(guidance, (batch_size, -1)) # Projection coefficient: (guidance · cond) / (cond · cond) dot_product = mx.sum(guidance_flat * cond_flat, axis=1, keepdims=True) squared_norm = mx.sum(cond_flat ** 2, axis=1, keepdims=True) + 1e-8 proj_coeff = dot_product / squared_norm # Reshape back and compute parallel/orthogonal components proj_coeff = mx.reshape(proj_coeff, (batch_size,) + (1,) * (cond.ndim - 1)) g_parallel = proj_coeff * cond g_orth = guidance - g_parallel # Combine with eta weighting parallel component g_apg = g_parallel * eta + g_orth return g_apg * (scale - 1.0) def ltx2_scheduler( steps: int, num_tokens: Optional[int] = None, max_shift: float = 2.05, base_shift: float = 0.95, stretch: bool = True, terminal: float = 0.1, ) -> mx.array: """LTX-2 scheduler for sigma generation (dev model). Generates a sigma schedule with token-count-dependent shifting and optional stretching to a terminal value. Args: steps: Number of inference steps num_tokens: Number of latent tokens (F*H*W). If None, uses MAX_SHIFT_ANCHOR max_shift: Maximum shift factor base_shift: Base shift factor stretch: Whether to stretch sigmas to terminal value terminal: Terminal sigma value for stretching Returns: Array of sigma values of shape (steps + 1,) """ tokens = num_tokens if num_tokens is not None else MAX_SHIFT_ANCHOR sigmas = np.linspace(1.0, 0.0, steps + 1) # Compute shift based on token count x1 = BASE_SHIFT_ANCHOR x2 = MAX_SHIFT_ANCHOR mm = (max_shift - base_shift) / (x2 - x1) b = base_shift - mm * x1 sigma_shift = tokens * mm + b # Apply shift transformation power = 1 with np.errstate(divide='ignore', invalid='ignore'): sigmas = np.where( sigmas != 0, math.exp(sigma_shift) / (math.exp(sigma_shift) + (1 / sigmas - 1) ** power), 0, ) # Stretch sigmas to terminal value if stretch: non_zero_mask = sigmas != 0 non_zero_sigmas = sigmas[non_zero_mask] one_minus_z = 1.0 - non_zero_sigmas scale_factor = one_minus_z[-1] / (1.0 - terminal) stretched = 1.0 - (one_minus_z / scale_factor) sigmas[non_zero_mask] = stretched return mx.array(sigmas, dtype=mx.float32) def create_position_grid( batch_size: int, num_frames: int, height: int, width: int, temporal_scale: int = 8, spatial_scale: int = 32, fps: float = 24.0, causal_fix: bool = True, ) -> mx.array: """Create position grid for RoPE in pixel space. Args: batch_size: Batch size num_frames: Number of frames (latent) height: Height (latent) width: Width (latent) temporal_scale: VAE temporal scale factor (default 8) spatial_scale: VAE spatial scale factor (default 32) fps: Frames per second (default 24.0) causal_fix: Apply causal fix for first frame (default True) Returns: Position grid of shape (B, 3, num_patches, 2) in pixel space where dim 2 is [start, end) bounds for each patch """ patch_size_t, patch_size_h, patch_size_w = 1, 1, 1 t_coords = np.arange(0, num_frames, patch_size_t) h_coords = np.arange(0, height, patch_size_h) w_coords = np.arange(0, width, patch_size_w) t_grid, h_grid, w_grid = np.meshgrid(t_coords, h_coords, w_coords, indexing='ij') patch_starts = np.stack([t_grid, h_grid, w_grid], axis=0) patch_size_delta = np.array([patch_size_t, patch_size_h, patch_size_w]).reshape(3, 1, 1, 1) patch_ends = patch_starts + patch_size_delta latent_coords = np.stack([patch_starts, patch_ends], axis=-1) num_patches = num_frames * height * width latent_coords = latent_coords.reshape(3, num_patches, 2) latent_coords = np.tile(latent_coords[np.newaxis, ...], (batch_size, 1, 1, 1)) scale_factors = np.array([temporal_scale, spatial_scale, spatial_scale]).reshape(1, 3, 1, 1) pixel_coords = (latent_coords * scale_factors).astype(np.float32) if causal_fix: pixel_coords[:, 0, :, :] = np.clip( pixel_coords[:, 0, :, :] + 1 - temporal_scale, a_min=0, a_max=None ) # Divide temporal coords by fps pixel_coords[:, 0, :, :] = pixel_coords[:, 0, :, :] / fps # Cast entire position grid through bfloat16 to match PyTorch's behavior. # PyTorch does: positions = positions.to(bfloat16) on ALL coordinates before # passing to the transformer/RoPE. This quantization is what the model was # trained with, so we must replicate it for numerical fidelity. positions_bf16 = mx.array(pixel_coords, dtype=mx.bfloat16) mx.eval(positions_bf16) return positions_bf16.astype(mx.float32) def create_audio_position_grid( batch_size: int, audio_frames: int, sample_rate: int = AUDIO_LATENT_SAMPLE_RATE, hop_length: int = AUDIO_HOP_LENGTH, downsample_factor: int = AUDIO_LATENT_DOWNSAMPLE_FACTOR, is_causal: bool = True, ) -> mx.array: """Create temporal position grid for audio RoPE.""" def get_audio_latent_time_in_sec(start_idx: int, end_idx: int) -> np.ndarray: latent_frame = np.arange(start_idx, end_idx, dtype=np.float32) mel_frame = latent_frame * downsample_factor if is_causal: mel_frame = np.clip(mel_frame + 1 - downsample_factor, 0, None) return mel_frame * hop_length / sample_rate start_times = get_audio_latent_time_in_sec(0, audio_frames) end_times = get_audio_latent_time_in_sec(1, audio_frames + 1) positions = np.stack([start_times, end_times], axis=-1) positions = positions[np.newaxis, np.newaxis, :, :] positions = np.tile(positions, (batch_size, 1, 1, 1)) # Cast through bfloat16 to match PyTorch's precision behavior positions_bf16 = mx.array(positions, dtype=mx.bfloat16) mx.eval(positions_bf16) return positions_bf16.astype(mx.float32) def compute_audio_frames(num_video_frames: int, fps: float) -> int: """Compute number of audio latent frames given video duration.""" duration = num_video_frames / fps return round(duration * AUDIO_LATENTS_PER_SECOND) # ============================================================================= # Distilled Pipeline Denoising (no CFG, fixed sigmas) # ============================================================================= def denoise_distilled( latents: mx.array, positions: mx.array, text_embeddings: mx.array, transformer: LTXModel, sigmas: list, verbose: bool = True, state: Optional[LatentState] = None, audio_latents: Optional[mx.array] = None, audio_positions: Optional[mx.array] = None, audio_embeddings: Optional[mx.array] = None, ) -> tuple[mx.array, Optional[mx.array]]: """Run denoising loop for distilled pipeline (no CFG).""" dtype = latents.dtype enable_audio = audio_latents is not None if state is not None: latents = state.latent # Keep latents in float32 throughout to avoid quantization noise accumulation. latents = latents.astype(mx.float32) if enable_audio: audio_latents = audio_latents.astype(mx.float32) desc = "[cyan]Denoising A/V[/]" if enable_audio else "[cyan]Denoising[/]" num_steps = len(sigmas) - 1 with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(), TaskProgressColumn(), TimeRemainingColumn(), console=console, disable=not verbose, ) as progress: task = progress.add_task(desc, total=num_steps) for i in range(num_steps): sigma, sigma_next = sigmas[i], sigmas[i + 1] b, c, f, h, w = latents.shape num_tokens = f * h * w # Cast to model dtype for transformer input latents_flat = mx.transpose(mx.reshape(latents, (b, c, -1)), (0, 2, 1)).astype(dtype) if state is not None: denoise_mask_flat = mx.reshape(state.denoise_mask, (b, 1, f, 1, 1)) denoise_mask_flat = mx.broadcast_to(denoise_mask_flat, (b, 1, f, h, w)) denoise_mask_flat = mx.reshape(denoise_mask_flat, (b, num_tokens)) timesteps = mx.array(sigma, dtype=dtype) * denoise_mask_flat else: timesteps = mx.full((b, num_tokens), sigma, dtype=dtype) video_modality = Modality( latent=latents_flat, timesteps=timesteps, positions=positions, context=text_embeddings, context_mask=None, enabled=True, sigma=mx.full((b,), sigma, dtype=dtype), ) audio_modality = None if enable_audio: ab, ac, at, af = audio_latents.shape audio_flat = mx.transpose(audio_latents, (0, 2, 1, 3)) audio_flat = mx.reshape(audio_flat, (ab, at, ac * af)).astype(dtype) audio_modality = Modality( latent=audio_flat, timesteps=mx.full((ab, at), sigma, dtype=dtype), positions=audio_positions, context=audio_embeddings, context_mask=None, enabled=True, sigma=mx.full((ab,), sigma, dtype=dtype), ) velocity, audio_velocity = transformer(video=video_modality, audio=audio_modality) mx.eval(velocity) if audio_velocity is not None: mx.eval(audio_velocity) # Compute denoised (x0) using per-token timesteps in float32 # x0 = latent - timestep * velocity # For conditioned tokens (timestep=0): x0 = latent # For unconditioned tokens (timestep=sigma): x0 = latent - sigma * velocity sigma_f32 = mx.array(sigma, dtype=mx.float32) latents_flat_f32 = mx.transpose(mx.reshape(latents, (b, c, -1)), (0, 2, 1)) timesteps_f32 = mx.expand_dims(timesteps.astype(mx.float32), axis=-1) x0_f32 = latents_flat_f32 - timesteps_f32 * velocity.astype(mx.float32) denoised = mx.reshape(mx.transpose(x0_f32, (0, 2, 1)), (b, c, f, h, w)) audio_denoised = None if enable_audio and audio_velocity is not None: ab, ac, at, af = audio_latents.shape audio_velocity = mx.reshape(audio_velocity, (ab, at, ac, af)) audio_velocity = mx.transpose(audio_velocity, (0, 2, 1, 3)) audio_denoised = audio_latents - sigma_f32 * audio_velocity.astype(mx.float32) if state is not None: denoised = apply_denoise_mask(denoised, state.clean_latent.astype(mx.float32), state.denoise_mask) mx.eval(denoised) if audio_denoised is not None: mx.eval(audio_denoised) # Euler step in float32 (latents stay in float32) if sigma_next > 0: sigma_next_f32 = mx.array(sigma_next, dtype=mx.float32) latents = denoised + sigma_next_f32 * (latents - denoised) / sigma_f32 if enable_audio and audio_denoised is not None: audio_latents = audio_denoised + sigma_next_f32 * (audio_latents - audio_denoised) / sigma_f32 else: latents = denoised if enable_audio and audio_denoised is not None: audio_latents = audio_denoised mx.eval(latents) if enable_audio: mx.eval(audio_latents) progress.advance(task) return latents.astype(dtype), audio_latents.astype(dtype) if enable_audio else None # ============================================================================= # Dev Pipeline Denoising (with CFG, dynamic sigmas) # ============================================================================= def denoise_dev( latents: mx.array, positions: mx.array, text_embeddings_pos: mx.array, text_embeddings_neg: mx.array, transformer: LTXModel, sigmas: mx.array, cfg_scale: float = 4.0, cfg_rescale: float = 0.0, verbose: bool = True, state: Optional[LatentState] = None, use_apg: bool = False, apg_eta: float = 1.0, apg_norm_threshold: float = 0.0, stg_scale: float = 0.0, stg_blocks: Optional[list] = None, ) -> mx.array: """Run denoising loop for dev pipeline with CFG/APG and optional STG guidance. Args: cfg_rescale: Rescale factor for CFG (0.0-1.0). Normalizes guided prediction variance relative to conditional prediction to reduce over-saturation. PyTorch default is 0.7. Set to 0.0 to disable. use_apg: Use Adaptive Projected Guidance instead of standard CFG. APG decomposes guidance into parallel/orthogonal components for more stable I2V generation. apg_eta: APG parallel component weight (1.0 = keep full parallel) apg_norm_threshold: APG guidance norm clamp (0 = no clamping) stg_scale: STG (Spatiotemporal Guidance) scale. 0.0 = disabled. stg_blocks: Transformer block indices for STG perturbation. """ from mlx_video.models.ltx.rope import precompute_freqs_cis dtype = latents.dtype if state is not None: latents = state.latent # Keep latents in float32 throughout the denoising loop to avoid # quantization noise accumulation over many steps. # Model input is cast to model dtype; all denoising math stays in float32. latents = latents.astype(mx.float32) sigmas_list = sigmas.tolist() use_cfg = cfg_scale != 1.0 use_stg = stg_scale != 0.0 and stg_blocks is not None num_steps = len(sigmas_list) - 1 # Precompute RoPE once precomputed_rope = precompute_freqs_cis( positions, dim=transformer.inner_dim, theta=transformer.positional_embedding_theta, max_pos=transformer.positional_embedding_max_pos, use_middle_indices_grid=transformer.use_middle_indices_grid, num_attention_heads=transformer.num_attention_heads, rope_type=transformer.rope_type, double_precision=transformer.config.double_precision_rope, ) mx.eval(precomputed_rope) with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(), TaskProgressColumn(), TimeRemainingColumn(), console=console, disable=not verbose, ) as progress: passes = ["CFG"] if use_cfg else [] if use_stg: passes.append("STG") label = "+".join(passes) if passes else "uncond" task = progress.add_task(f"[cyan]Denoising ({label})[/]", total=num_steps) for i in range(num_steps): sigma = sigmas_list[i] sigma_next = sigmas_list[i + 1] b, c, f, h, w = latents.shape num_tokens = f * h * w # Cast to model dtype for transformer input latents_flat = mx.transpose(mx.reshape(latents, (b, c, -1)), (0, 2, 1)).astype(dtype) if state is not None: denoise_mask_flat = mx.reshape(state.denoise_mask, (b, 1, f, 1, 1)) denoise_mask_flat = mx.broadcast_to(denoise_mask_flat, (b, 1, f, h, w)) denoise_mask_flat = mx.reshape(denoise_mask_flat, (b, num_tokens)) timesteps = mx.array(sigma, dtype=dtype) * denoise_mask_flat else: timesteps = mx.full((b, num_tokens), sigma, dtype=dtype) sigma_array = mx.full((b,), sigma, dtype=dtype) # Positive conditioning pass video_modality_pos = Modality( latent=latents_flat, timesteps=timesteps, positions=positions, context=text_embeddings_pos, context_mask=None, enabled=True, positional_embeddings=precomputed_rope, sigma=sigma_array, ) velocity_pos, _ = transformer(video=video_modality_pos, audio=None) # Convert velocity to x0 (denoised) using per-token timesteps # Matches PyTorch's X0Model: x0 = latent - timestep * velocity # For conditioned tokens (timestep=0): x0 = latent (correct regardless of velocity) # For unconditioned tokens (timestep=sigma): x0 = latent - sigma * velocity latents_flat_f32 = mx.transpose(mx.reshape(latents, (b, c, -1)), (0, 2, 1)) timesteps_f32 = mx.expand_dims(timesteps.astype(mx.float32), axis=-1) x0_pos_f32 = latents_flat_f32 - timesteps_f32 * velocity_pos.astype(mx.float32) # Start with positive prediction x0_guided_f32 = x0_pos_f32 if use_cfg: # Negative conditioning pass video_modality_neg = Modality( latent=latents_flat, timesteps=timesteps, positions=positions, context=text_embeddings_neg, context_mask=None, enabled=True, positional_embeddings=precomputed_rope, sigma=sigma_array, ) velocity_neg, _ = transformer(video=video_modality_neg, audio=None) # Convert negative velocity to x0 using per-token timesteps x0_neg_f32 = latents_flat_f32 - timesteps_f32 * velocity_neg.astype(mx.float32) # Apply guidance to x0 predictions # For conditioned tokens: x0_pos = x0_neg = latent, so delta = 0 if use_apg: # APG: decompose into parallel/orthogonal components for stability x0_guided_f32 = x0_pos_f32 + apg_delta( x0_pos_f32, x0_neg_f32, cfg_scale, eta=apg_eta, norm_threshold=apg_norm_threshold ) else: # Standard CFG x0_guided_f32 = x0_pos_f32 + (cfg_scale - 1.0) * (x0_pos_f32 - x0_neg_f32) # STG pass: skip self-attention at specified blocks if use_stg: velocity_ptb, _ = transformer( video=video_modality_pos, audio=None, stg_video_blocks=stg_blocks, ) mx.eval(velocity_ptb) x0_ptb_f32 = latents_flat_f32 - timesteps_f32 * velocity_ptb.astype(mx.float32) x0_guided_f32 = x0_guided_f32 + stg_scale * (x0_pos_f32 - x0_ptb_f32) # Apply CFG rescale if enabled (std-ratio rescaling to reduce over-saturation) # factor = rescale * (cond_std / pred_std) + (1 - rescale) # pred = pred * factor if cfg_rescale > 0.0 and (use_cfg or use_stg): v_factor = x0_pos_f32.std() / (x0_guided_f32.std() + 1e-8) v_factor = cfg_rescale * v_factor + (1.0 - cfg_rescale) x0_guided_f32 = x0_guided_f32 * v_factor # Reshape x0 from token space (b, tokens, c) to spatial (b, c, f, h, w) denoised = mx.reshape(mx.transpose(x0_guided_f32, (0, 2, 1)), (b, c, f, h, w)) sigma_f32 = mx.array(sigma, dtype=mx.float32) if state is not None: denoised = apply_denoise_mask(denoised, state.clean_latent.astype(mx.float32), state.denoise_mask) # Euler step in float32 (latents stay in float32) if sigma_next > 0: sigma_next_f32 = mx.array(sigma_next, dtype=mx.float32) latents = denoised + sigma_next_f32 * (latents - denoised) / sigma_f32 else: latents = denoised mx.eval(latents) progress.advance(task) return latents.astype(dtype) def denoise_dev_av( video_latents: mx.array, audio_latents: mx.array, video_positions: mx.array, audio_positions: mx.array, video_embeddings_pos: mx.array, video_embeddings_neg: mx.array, audio_embeddings_pos: mx.array, audio_embeddings_neg: mx.array, transformer: LTXModel, sigmas: mx.array, cfg_scale: float = 4.0, audio_cfg_scale: float = 7.0, cfg_rescale: float = 0.0, verbose: bool = True, video_state: Optional[LatentState] = None, use_apg: bool = False, apg_eta: float = 1.0, apg_norm_threshold: float = 0.0, stg_scale: float = 0.0, stg_video_blocks: Optional[list] = None, stg_audio_blocks: Optional[list] = None, modality_scale: float = 1.0, ) -> tuple[mx.array, mx.array]: """Run denoising loop for dev pipeline with CFG/APG, STG, modality guidance, and audio. Args: audio_cfg_scale: Separate CFG scale for audio (PyTorch default: 7.0). cfg_rescale: Rescale factor for CFG (0.0-1.0). Normalizes guided prediction variance to reduce artifacts. Default 0.0 means no rescaling. use_apg: Use Adaptive Projected Guidance instead of standard CFG for video. apg_eta: APG parallel component weight (1.0 = keep full parallel) apg_norm_threshold: APG guidance norm clamp (0 = no clamping) stg_scale: STG (Spatiotemporal Guidance) scale. 0.0 = disabled. stg_video_blocks: Transformer block indices for video STG perturbation. stg_audio_blocks: Transformer block indices for audio STG perturbation. modality_scale: Cross-modal guidance scale. 1.0 = disabled. """ from mlx_video.models.ltx.rope import precompute_freqs_cis dtype = video_latents.dtype if video_state is not None: video_latents = video_state.latent # Keep latents in float32 throughout the denoising loop for precision. video_latents = video_latents.astype(mx.float32) audio_latents = audio_latents.astype(mx.float32) sigmas_list = sigmas.tolist() use_cfg = cfg_scale != 1.0 use_stg = stg_scale != 0.0 and stg_video_blocks is not None use_modality = modality_scale != 1.0 num_steps = len(sigmas_list) - 1 # Precompute video RoPE precomputed_video_rope = precompute_freqs_cis( video_positions, dim=transformer.inner_dim, theta=transformer.positional_embedding_theta, max_pos=transformer.positional_embedding_max_pos, use_middle_indices_grid=transformer.use_middle_indices_grid, num_attention_heads=transformer.num_attention_heads, rope_type=transformer.rope_type, double_precision=transformer.config.double_precision_rope, ) # Precompute audio RoPE precomputed_audio_rope = precompute_freqs_cis( audio_positions, dim=transformer.audio_inner_dim, theta=transformer.positional_embedding_theta, max_pos=transformer.audio_positional_embedding_max_pos, use_middle_indices_grid=transformer.use_middle_indices_grid, num_attention_heads=transformer.audio_num_attention_heads, rope_type=transformer.rope_type, double_precision=transformer.config.double_precision_rope, ) mx.eval(precomputed_video_rope, precomputed_audio_rope) with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(), TaskProgressColumn(), TimeRemainingColumn(), console=console, disable=not verbose, ) as progress: passes = ["CFG"] if use_cfg else [] if use_stg: passes.append("STG") if use_modality: passes.append("Mod") label = "+".join(passes) if passes else "uncond" task = progress.add_task(f"[cyan]Denoising A/V ({label})[/]", total=num_steps) for i in range(num_steps): sigma = sigmas_list[i] sigma_next = sigmas_list[i + 1] # Flatten video latents (cast to model dtype for transformer input) b, c, f, h, w = video_latents.shape num_video_tokens = f * h * w video_flat = mx.transpose(mx.reshape(video_latents, (b, c, -1)), (0, 2, 1)).astype(dtype) # Flatten audio latents (cast to model dtype for transformer input) ab, ac, at, af = audio_latents.shape audio_flat = mx.transpose(audio_latents, (0, 2, 1, 3)) audio_flat = mx.reshape(audio_flat, (ab, at, ac * af)).astype(dtype) # Compute timesteps if video_state is not None: denoise_mask_flat = mx.reshape(video_state.denoise_mask, (b, 1, f, 1, 1)) denoise_mask_flat = mx.broadcast_to(denoise_mask_flat, (b, 1, f, h, w)) denoise_mask_flat = mx.reshape(denoise_mask_flat, (b, num_video_tokens)) video_timesteps = mx.array(sigma, dtype=dtype) * denoise_mask_flat else: video_timesteps = mx.full((b, num_video_tokens), sigma, dtype=dtype) audio_timesteps = mx.full((ab, at), sigma, dtype=dtype) # Positive conditioning pass sigma_array = mx.full((b,), sigma, dtype=dtype) audio_sigma_array = mx.full((ab,), sigma, dtype=dtype) video_modality_pos = Modality( latent=video_flat, timesteps=video_timesteps, positions=video_positions, context=video_embeddings_pos, context_mask=None, enabled=True, positional_embeddings=precomputed_video_rope, sigma=sigma_array, ) audio_modality_pos = Modality( latent=audio_flat, timesteps=audio_timesteps, positions=audio_positions, context=audio_embeddings_pos, context_mask=None, enabled=True, positional_embeddings=precomputed_audio_rope, sigma=audio_sigma_array, ) video_vel_pos, audio_vel_pos = transformer(video=video_modality_pos, audio=audio_modality_pos) mx.eval(video_vel_pos, audio_vel_pos) # Convert velocity to denoised (x0) using per-token timesteps # This matches PyTorch's X0ModelWrapper: x0 = latent - timestep * velocity # For conditioned tokens (timestep=0): x0 = latent (velocity is irrelevant) # For unconditioned tokens (timestep=sigma): x0 = latent - sigma * velocity video_flat_f32 = mx.transpose(mx.reshape(video_latents, (b, c, -1)), (0, 2, 1)) audio_flat_f32 = mx.reshape(mx.transpose(audio_latents, (0, 2, 1, 3)), (ab, at, ac * af)) video_timesteps_f32 = mx.expand_dims(video_timesteps.astype(mx.float32), axis=-1) audio_timesteps_f32 = mx.expand_dims(audio_timesteps.astype(mx.float32), axis=-1) video_x0_pos_f32 = video_flat_f32 - video_timesteps_f32 * video_vel_pos.astype(mx.float32) audio_x0_pos_f32 = audio_flat_f32 - audio_timesteps_f32 * audio_vel_pos.astype(mx.float32) # Start with positive prediction video_x0_guided_f32 = video_x0_pos_f32 audio_x0_guided_f32 = audio_x0_pos_f32 # Pass 2: CFG (negative conditioning) if use_cfg: video_modality_neg = Modality( latent=video_flat, timesteps=video_timesteps, positions=video_positions, context=video_embeddings_neg, context_mask=None, enabled=True, positional_embeddings=precomputed_video_rope, sigma=sigma_array, ) audio_modality_neg = Modality( latent=audio_flat, timesteps=audio_timesteps, positions=audio_positions, context=audio_embeddings_neg, context_mask=None, enabled=True, positional_embeddings=precomputed_audio_rope, sigma=audio_sigma_array, ) video_vel_neg, audio_vel_neg = transformer(video=video_modality_neg, audio=audio_modality_neg) mx.eval(video_vel_neg, audio_vel_neg) video_x0_neg_f32 = video_flat_f32 - video_timesteps_f32 * video_vel_neg.astype(mx.float32) audio_x0_neg_f32 = audio_flat_f32 - audio_timesteps_f32 * audio_vel_neg.astype(mx.float32) if use_apg: video_x0_guided_f32 = video_x0_pos_f32 + apg_delta( video_x0_pos_f32, video_x0_neg_f32, cfg_scale, eta=apg_eta, norm_threshold=apg_norm_threshold ) else: video_x0_guided_f32 = video_x0_pos_f32 + (cfg_scale - 1.0) * (video_x0_pos_f32 - video_x0_neg_f32) audio_x0_guided_f32 = audio_x0_pos_f32 + (audio_cfg_scale - 1.0) * (audio_x0_pos_f32 - audio_x0_neg_f32) # Pass 3: STG (self-attention perturbation at specified blocks) if use_stg: video_vel_ptb, audio_vel_ptb = transformer( video=video_modality_pos, audio=audio_modality_pos, stg_video_blocks=stg_video_blocks, stg_audio_blocks=stg_audio_blocks, ) mx.eval(video_vel_ptb, audio_vel_ptb) video_x0_ptb_f32 = video_flat_f32 - video_timesteps_f32 * video_vel_ptb.astype(mx.float32) audio_x0_ptb_f32 = audio_flat_f32 - audio_timesteps_f32 * audio_vel_ptb.astype(mx.float32) video_x0_guided_f32 = video_x0_guided_f32 + stg_scale * (video_x0_pos_f32 - video_x0_ptb_f32) audio_x0_guided_f32 = audio_x0_guided_f32 + stg_scale * (audio_x0_pos_f32 - audio_x0_ptb_f32) # Pass 4: Modality isolation (skip all cross-modal attention) if use_modality: video_vel_iso, audio_vel_iso = transformer( video=video_modality_pos, audio=audio_modality_pos, skip_cross_modal=True, ) mx.eval(video_vel_iso, audio_vel_iso) video_x0_iso_f32 = video_flat_f32 - video_timesteps_f32 * video_vel_iso.astype(mx.float32) audio_x0_iso_f32 = audio_flat_f32 - audio_timesteps_f32 * audio_vel_iso.astype(mx.float32) video_x0_guided_f32 = video_x0_guided_f32 + (modality_scale - 1.0) * (video_x0_pos_f32 - video_x0_iso_f32) audio_x0_guided_f32 = audio_x0_guided_f32 + (modality_scale - 1.0) * (audio_x0_pos_f32 - audio_x0_iso_f32) # Apply CFG rescale (std-ratio rescaling to reduce over-saturation) if cfg_rescale > 0.0 and (use_cfg or use_stg or use_modality): v_factor = video_x0_pos_f32.std() / (video_x0_guided_f32.std() + 1e-8) v_factor = cfg_rescale * v_factor + (1.0 - cfg_rescale) video_x0_guided_f32 = video_x0_guided_f32 * v_factor a_factor = audio_x0_pos_f32.std() / (audio_x0_guided_f32.std() + 1e-8) a_factor = cfg_rescale * a_factor + (1.0 - cfg_rescale) audio_x0_guided_f32 = audio_x0_guided_f32 * a_factor # Reshape x0 from token space (b, tokens, c) to spatial (b, c, f, h, w) video_denoised_f32 = mx.reshape(mx.transpose(video_x0_guided_f32, (0, 2, 1)), (b, c, f, h, w)) audio_denoised_f32 = mx.reshape(audio_x0_guided_f32, (ab, at, ac, af)) audio_denoised_f32 = mx.transpose(audio_denoised_f32, (0, 2, 1, 3)) # Post-process: blend denoised with clean latent using mask # Matches PyTorch's post_process_latent: denoised * mask + clean * (1 - mask) sigma_f32 = mx.array(sigma, dtype=mx.float32) if video_state is not None: clean_f32 = video_state.clean_latent.astype(mx.float32) mask_f32 = video_state.denoise_mask.astype(mx.float32) video_denoised_f32 = video_denoised_f32 * mask_f32 + clean_f32 * (1.0 - mask_f32) mx.eval(video_denoised_f32, audio_denoised_f32) # Euler step: sample + velocity * dt (float32) if sigma_next > 0: sigma_next_f32 = mx.array(sigma_next, dtype=mx.float32) dt_f32 = sigma_next_f32 - sigma_f32 video_velocity_f32 = (video_latents - video_denoised_f32) / sigma_f32 video_latents = video_latents + video_velocity_f32 * dt_f32 audio_velocity_f32 = (audio_latents - audio_denoised_f32) / sigma_f32 audio_latents = audio_latents + audio_velocity_f32 * dt_f32 else: video_latents = video_denoised_f32 audio_latents = audio_denoised_f32 mx.eval(video_latents, audio_latents) progress.advance(task) return video_latents, audio_latents # ============================================================================= # Audio Loading and Processing # ============================================================================= def load_audio_decoder(model_path: Path, pipeline: PipelineType): """Load audio VAE decoder.""" from mlx_video.models.ltx.audio_vae import AudioDecoder decoder = AudioDecoder.from_pretrained(model_path / "audio_vae") return decoder def load_vocoder_model(model_path: Path, pipeline: PipelineType): """Load vocoder for mel to waveform conversion. Automatically detects HiFi-GAN (LTX-2) or BigVGAN+BWE (LTX-2.3). """ from mlx_video.models.ltx.audio_vae.vocoder import load_vocoder as _load_vocoder return _load_vocoder(model_path / "vocoder") def save_audio(audio: np.ndarray, path: Path, sample_rate: int = AUDIO_SAMPLE_RATE): """Save audio to WAV file.""" import wave if audio.ndim == 2: audio = audio.T audio = np.clip(audio, -1.0, 1.0) audio_int16 = (audio * 32767).astype(np.int16) with wave.open(str(path), 'wb') as wf: wf.setnchannels(2 if audio_int16.ndim == 2 else 1) wf.setsampwidth(2) wf.setframerate(sample_rate) wf.writeframes(audio_int16.tobytes()) def mux_video_audio(video_path: Path, audio_path: Path, output_path: Path): """Combine video and audio into final output using ffmpeg.""" import subprocess cmd = [ "ffmpeg", "-y", "-i", str(video_path), "-i", str(audio_path), "-c:v", "copy", "-c:a", "aac", "-shortest", str(output_path) ] try: subprocess.run(cmd, check=True, capture_output=True) return True except subprocess.CalledProcessError as e: console.print(f"[red]FFmpeg error: {e.stderr.decode()}[/]") return False except FileNotFoundError: console.print("[red]FFmpeg not found. Please install ffmpeg.[/]") return False # ============================================================================= # Unified Generate Function # ============================================================================= def generate_video( model_repo: str, text_encoder_repo: str, prompt: str, pipeline: PipelineType = PipelineType.DISTILLED, negative_prompt: str = DEFAULT_NEGATIVE_PROMPT, height: int = 512, width: int = 512, num_frames: int = 33, num_inference_steps: int = 40, cfg_scale: float = 4.0, audio_cfg_scale: float = 7.0, cfg_rescale: float = 0.0, seed: int = 42, fps: int = 24, output_path: str = "output.mp4", save_frames: bool = False, verbose: bool = True, enhance_prompt: bool = False, max_tokens: int = 512, temperature: float = 0.7, image: Optional[str] = None, image_strength: float = 1.0, image_frame_idx: int = 0, tiling: str = "auto", stream: bool = False, audio: bool = False, output_audio_path: Optional[str] = None, use_apg: bool = False, apg_eta: float = 1.0, apg_norm_threshold: float = 0.0, stg_scale: float = 0.0, stg_blocks: Optional[list] = None, modality_scale: float = 1.0, lora_path: Optional[str] = None, lora_strength: float = 1.0, ): """Generate video using LTX-2 models. Supports three pipelines: - DISTILLED: Two-stage generation with upsampling, fixed sigma schedules, no CFG - DEV: Single-stage generation with dynamic sigmas and CFG - DEV_TWO_STAGE: Stage 1 dev (half res, CFG) + upsample + stage 2 distilled with LoRA (full res, no CFG) Args: model_repo: Model repository ID text_encoder_repo: Text encoder repository ID prompt: Text description of the video to generate pipeline: Pipeline type (DISTILLED or DEV) negative_prompt: Negative prompt for CFG (dev pipeline only) height: Output video height (must be divisible by 32/64) width: Output video width (must be divisible by 32/64) num_frames: Number of frames (must be 1 + 8*k) num_inference_steps: Number of denoising steps (dev pipeline only) cfg_scale: Guidance scale for CFG (dev pipeline only) seed: Random seed for reproducibility fps: Frames per second for output video output_path: Path to save the output video save_frames: Whether to save individual frames as images verbose: Whether to print progress enhance_prompt: Whether to enhance prompt using Gemma max_tokens: Max tokens for prompt enhancement temperature: Temperature for prompt enhancement image: Path to conditioning image for I2V image_strength: Conditioning strength for I2V image_frame_idx: Frame index to condition for I2V tiling: Tiling mode for VAE decoding stream: Stream frames to output as they're decoded audio: Enable synchronized audio generation output_audio_path: Path to save audio file use_apg: Use Adaptive Projected Guidance instead of CFG (more stable for I2V) apg_eta: APG parallel component weight (1.0 = keep full parallel) apg_norm_threshold: APG guidance norm clamp (0 = no clamping) """ start_time = time.time() # Validate dimensions is_two_stage = pipeline in (PipelineType.DISTILLED, PipelineType.DEV_TWO_STAGE) divisor = 64 if is_two_stage else 32 assert height % divisor == 0, f"Height must be divisible by {divisor}, got {height}" assert width % divisor == 0, f"Width must be divisible by {divisor}, got {width}" if num_frames % 8 != 1: adjusted_num_frames = round((num_frames - 1) / 8) * 8 + 1 console.print(f"[yellow]⚠️ Number of frames must be 1 + 8*k. Using: {adjusted_num_frames}[/]") num_frames = adjusted_num_frames is_i2v = image is not None mode_str = "I2V" if is_i2v else "T2V" if audio: mode_str += "+Audio" pipeline_names = { PipelineType.DISTILLED: "DISTILLED", PipelineType.DEV: "DEV", PipelineType.DEV_TWO_STAGE: "DEV-TWO-STAGE", } pipeline_name = pipeline_names[pipeline] header = f"[bold cyan]🎬 [{pipeline_name}] [{mode_str}] {width}x{height} • {num_frames} frames[/]" console.print(Panel(header, expand=False)) console.print(f"[dim]Prompt: {prompt[:80]}{'...' if len(prompt) > 80 else ''}[/]") if pipeline in (PipelineType.DEV, PipelineType.DEV_TWO_STAGE): audio_cfg_info = f", Audio CFG: {audio_cfg_scale}" if audio else "" stg_info = f", STG: {stg_scale} blocks={stg_blocks}" if stg_scale != 0.0 else "" mod_info = f", Modality: {modality_scale}" if modality_scale != 1.0 else "" console.print(f"[dim]Steps: {num_inference_steps}, CFG: {cfg_scale}{audio_cfg_info}, Rescale: {cfg_rescale}{stg_info}{mod_info}[/]") if is_i2v: console.print(f"[dim]Image: {image} (strength={image_strength}, frame={image_frame_idx})[/]") audio_frames = None if audio: audio_frames = compute_audio_frames(num_frames, fps) console.print(f"[dim]Audio: {audio_frames} latent frames @ {AUDIO_SAMPLE_RATE}Hz[/]") # Get model path model_path = get_model_path(model_repo) text_encoder_path = model_path if text_encoder_repo is None else get_model_path(text_encoder_repo) # Calculate latent dimensions if is_two_stage: stage1_h, stage1_w = height // 2 // 32, width // 2 // 32 stage2_h, stage2_w = height // 32, width // 32 else: latent_h, latent_w = height // 32, width // 32 latent_frames = 1 + (num_frames - 1) // 8 mx.random.seed(seed) # Read transformer config to detect model version import json transformer_config_path = model_path / "transformer" / "config.json" has_prompt_adaln = False if transformer_config_path.exists(): with open(transformer_config_path) as f: has_prompt_adaln = json.load(f).get("has_prompt_adaln", False) # Load text encoder with console.status("[blue]📝 Loading text encoder...[/]", spinner="dots"): from mlx_video.models.ltx.text_encoder import LTX2TextEncoder text_encoder = LTX2TextEncoder(has_prompt_adaln=has_prompt_adaln) text_encoder.load(model_path=model_path, text_encoder_path=text_encoder_path) mx.eval(text_encoder.parameters()) console.print("[green]✓[/] Text encoder loaded") # Optionally enhance the prompt if enhance_prompt: console.print("[bold magenta]✨ Enhancing prompt[/]") prompt = text_encoder.enhance_t2v(prompt, max_tokens=max_tokens, temperature=temperature, seed=seed, verbose=verbose) console.print(f"[dim]Enhanced: {prompt[:150]}{'...' if len(prompt) > 150 else ''}[/]") # Encode prompts if pipeline in (PipelineType.DEV, PipelineType.DEV_TWO_STAGE): # Dev/dev-two-stage pipelines need positive and negative embeddings for CFG if audio: video_embeddings_pos, audio_embeddings_pos = text_encoder(prompt, return_audio_embeddings=True) video_embeddings_neg, audio_embeddings_neg = text_encoder(negative_prompt, return_audio_embeddings=True) model_dtype = video_embeddings_pos.dtype mx.eval(video_embeddings_pos, video_embeddings_neg, audio_embeddings_pos, audio_embeddings_neg) else: video_embeddings_pos, _ = text_encoder(prompt, return_audio_embeddings=False) video_embeddings_neg, _ = text_encoder(negative_prompt, return_audio_embeddings=False) audio_embeddings_pos = audio_embeddings_neg = None model_dtype = video_embeddings_pos.dtype mx.eval(video_embeddings_pos, video_embeddings_neg) # For dev-two-stage, stage 2 uses single positive embedding (no CFG) if pipeline == PipelineType.DEV_TWO_STAGE: text_embeddings = video_embeddings_pos else: # Distilled pipeline - single embedding if audio: text_embeddings, audio_embeddings = text_encoder(prompt, return_audio_embeddings=True) mx.eval(text_embeddings, audio_embeddings) else: text_embeddings, _ = text_encoder(prompt, return_audio_embeddings=False) audio_embeddings = None mx.eval(text_embeddings) model_dtype = text_embeddings.dtype del text_encoder mx.clear_cache() # Load transformer transformer_desc = f"🤖 Loading {pipeline_name.lower()} transformer{' (A/V mode)' if audio else ''}..." with console.status(f"[blue]{transformer_desc}[/]", spinner="dots"): transformer = LTXModel.from_pretrained(model_path=model_path / "transformer", strict=True) console.print("[green]✓[/] Transformer loaded") # Auto-detect stg_blocks from transformer config if not explicitly provided. # LTX-2.3 (has_prompt_adaln=True) uses block 28; LTX-2 uses block 29. if stg_blocks is None and stg_scale != 0.0: if transformer.config.has_prompt_adaln: stg_blocks = [28] else: stg_blocks = [29] console.print(f"[dim]Auto-detected STG blocks: {stg_blocks} (model={'2.3' if transformer.config.has_prompt_adaln else '2'})[/]") # ========================================================================== # Pipeline-specific generation logic # ========================================================================== if pipeline == PipelineType.DISTILLED: # ====================================================================== # DISTILLED PIPELINE: Two-stage with upsampling # ====================================================================== # Load VAE encoder for I2V stage1_image_latent = None stage2_image_latent = None if is_i2v: with console.status("[blue]🖼️ Loading VAE encoder and encoding image...[/]", spinner="dots"): vae_encoder = VideoEncoder.from_pretrained(model_path / "vae" / "encoder") input_image = load_image(image, height=height // 2, width=width // 2, dtype=model_dtype) stage1_image_tensor = prepare_image_for_encoding(input_image, height // 2, width // 2, dtype=model_dtype) stage1_image_latent = vae_encoder(stage1_image_tensor) mx.eval(stage1_image_latent) input_image = load_image(image, height=height, width=width, dtype=model_dtype) stage2_image_tensor = prepare_image_for_encoding(input_image, height, width, dtype=model_dtype) stage2_image_latent = vae_encoder(stage2_image_tensor) mx.eval(stage2_image_latent) del vae_encoder mx.clear_cache() console.print("[green]✓[/] VAE encoder loaded and image encoded") # Stage 1 console.print(f"\n[bold yellow]⚡ Stage 1:[/] Generating at {width//2}x{height//2} (8 steps)") mx.random.seed(seed) positions = create_position_grid(1, latent_frames, stage1_h, stage1_w) mx.eval(positions) audio_positions = None audio_latents = None if audio: audio_positions = create_audio_position_grid(1, audio_frames) audio_latents = mx.random.normal((1, AUDIO_LATENT_CHANNELS, audio_frames, AUDIO_MEL_BINS)).astype(model_dtype) mx.eval(audio_positions, audio_latents) # Apply I2V conditioning state1 = None if is_i2v and stage1_image_latent is not None: latent_shape = (1, 128, latent_frames, stage1_h, stage1_w) state1 = LatentState( latent=mx.zeros(latent_shape, dtype=model_dtype), clean_latent=mx.zeros(latent_shape, dtype=model_dtype), denoise_mask=mx.ones((1, 1, latent_frames, 1, 1), dtype=model_dtype), ) conditioning = VideoConditionByLatentIndex(latent=stage1_image_latent, frame_idx=image_frame_idx, strength=image_strength) state1 = apply_conditioning(state1, [conditioning]) noise = mx.random.normal(latent_shape, dtype=model_dtype) noise_scale = mx.array(STAGE_1_SIGMAS[0], dtype=model_dtype) scaled_mask = state1.denoise_mask * noise_scale state1 = LatentState( latent=noise * scaled_mask + state1.latent * (mx.array(1.0, dtype=model_dtype) - scaled_mask), clean_latent=state1.clean_latent, denoise_mask=state1.denoise_mask, ) latents = state1.latent mx.eval(latents) else: latents = mx.random.normal((1, 128, latent_frames, stage1_h, stage1_w), dtype=model_dtype) mx.eval(latents) latents, audio_latents = denoise_distilled( latents, positions, text_embeddings, transformer, STAGE_1_SIGMAS, verbose=verbose, state=state1, audio_latents=audio_latents, audio_positions=audio_positions, audio_embeddings=audio_embeddings, ) # Upsample latents with console.status("[magenta]🔍 Upsampling latents 2x...[/]", spinner="dots"): upscaler_files = sorted(model_path.glob("*spatial-upscaler-x2*.safetensors")) if not upscaler_files: raise FileNotFoundError(f"No spatial upscaler found in {model_path}") upsampler = load_upsampler(str(upscaler_files[0])) mx.eval(upsampler.parameters()) vae_decoder = VideoDecoder.from_pretrained(str(model_path / "vae" / "decoder")) latents = upsample_latents(latents, upsampler, vae_decoder.per_channel_statistics.mean, vae_decoder.per_channel_statistics.std) mx.eval(latents) del upsampler mx.clear_cache() console.print("[green]✓[/] Latents upsampled") # Stage 2 console.print(f"\n[bold yellow]⚡ Stage 2:[/] Refining at {width}x{height} (3 steps)") positions = create_position_grid(1, latent_frames, stage2_h, stage2_w) mx.eval(positions) state2 = None if is_i2v and stage2_image_latent is not None: state2 = LatentState( latent=latents, clean_latent=mx.zeros_like(latents), denoise_mask=mx.ones((1, 1, latent_frames, 1, 1), dtype=model_dtype), ) conditioning = VideoConditionByLatentIndex(latent=stage2_image_latent, frame_idx=image_frame_idx, strength=image_strength) state2 = apply_conditioning(state2, [conditioning]) noise = mx.random.normal(latents.shape).astype(model_dtype) noise_scale = mx.array(STAGE_2_SIGMAS[0], dtype=model_dtype) scaled_mask = state2.denoise_mask * noise_scale state2 = LatentState( latent=noise * scaled_mask + state2.latent * (mx.array(1.0, dtype=model_dtype) - scaled_mask), clean_latent=state2.clean_latent, denoise_mask=state2.denoise_mask, ) latents = state2.latent mx.eval(latents) else: noise_scale = mx.array(STAGE_2_SIGMAS[0], dtype=model_dtype) one_minus_scale = mx.array(1.0 - STAGE_2_SIGMAS[0], dtype=model_dtype) noise = mx.random.normal(latents.shape).astype(model_dtype) latents = noise * noise_scale + latents * one_minus_scale mx.eval(latents) # Re-noise audio at sigma=0.909375 for joint refinement (matches PyTorch) if audio and audio_latents is not None: audio_noise = mx.random.normal(audio_latents.shape, dtype=model_dtype) audio_noise_scale = mx.array(STAGE_2_SIGMAS[0], dtype=model_dtype) audio_latents = audio_noise * audio_noise_scale + audio_latents * (mx.array(1.0, dtype=model_dtype) - audio_noise_scale) mx.eval(audio_latents) # Joint video + audio refinement (no CFG, positive embeddings only) latents, audio_latents = denoise_distilled( latents, positions, text_embeddings, transformer, STAGE_2_SIGMAS, verbose=verbose, state=state2, audio_latents=audio_latents, audio_positions=audio_positions, audio_embeddings=audio_embeddings if audio else None, ) elif pipeline == PipelineType.DEV: # ====================================================================== # DEV PIPELINE: Single-stage with CFG # ====================================================================== # Load VAE encoder for I2V image_latent = None if is_i2v: with console.status("[blue]🖼️ Loading VAE encoder and encoding image...[/]", spinner="dots"): vae_encoder = VideoEncoder.from_pretrained(model_path / "vae" / "encoder") input_image = load_image(image, height=height, width=width, dtype=model_dtype) image_tensor = prepare_image_for_encoding(input_image, height, width, dtype=model_dtype) image_latent = vae_encoder(image_tensor) mx.eval(image_latent) del vae_encoder mx.clear_cache() console.print("[green]✓[/] VAE encoder loaded and image encoded") # Generate sigma schedule with token-count-dependent shifting sigmas = ltx2_scheduler(steps=num_inference_steps) mx.eval(sigmas) console.print(f"[dim]Sigma schedule: {sigmas[0].item():.4f} → {sigmas[-2].item():.4f} → {sigmas[-1].item():.4f}[/]") console.print(f"\n[bold yellow]⚡ Generating:[/] {width}x{height} ({num_inference_steps} steps, CFG={cfg_scale}, rescale={cfg_rescale})") mx.random.seed(seed) video_positions = create_position_grid(1, latent_frames, latent_h, latent_w) mx.eval(video_positions) audio_positions = None audio_latents = None if audio: audio_positions = create_audio_position_grid(1, audio_frames) audio_latents = mx.random.normal((1, AUDIO_LATENT_CHANNELS, audio_frames, AUDIO_MEL_BINS), dtype=model_dtype) mx.eval(audio_positions, audio_latents) # Initialize latents with optional I2V conditioning video_state = None video_latent_shape = (1, 128, latent_frames, latent_h, latent_w) if is_i2v and image_latent is not None: video_state = LatentState( latent=mx.zeros(video_latent_shape, dtype=model_dtype), clean_latent=mx.zeros(video_latent_shape, dtype=model_dtype), denoise_mask=mx.ones((1, 1, latent_frames, 1, 1), dtype=model_dtype), ) conditioning = VideoConditionByLatentIndex(latent=image_latent, frame_idx=image_frame_idx, strength=image_strength) video_state = apply_conditioning(video_state, [conditioning]) noise = mx.random.normal(video_latent_shape, dtype=model_dtype) noise_scale = sigmas[0] scaled_mask = video_state.denoise_mask * noise_scale video_state = LatentState( latent=noise * scaled_mask + video_state.latent * (mx.array(1.0, dtype=model_dtype) - scaled_mask), clean_latent=video_state.clean_latent, denoise_mask=video_state.denoise_mask, ) latents = video_state.latent mx.eval(latents) else: latents = mx.random.normal(video_latent_shape, dtype=model_dtype) mx.eval(latents) # Denoise with CFG/APG/STG/modality if audio: latents, audio_latents = denoise_dev_av( latents, audio_latents, video_positions, audio_positions, video_embeddings_pos, video_embeddings_neg, audio_embeddings_pos, audio_embeddings_neg, transformer, sigmas, cfg_scale=cfg_scale, audio_cfg_scale=audio_cfg_scale, cfg_rescale=cfg_rescale, verbose=verbose, video_state=video_state, use_apg=use_apg, apg_eta=apg_eta, apg_norm_threshold=apg_norm_threshold, stg_scale=stg_scale, stg_video_blocks=stg_blocks, stg_audio_blocks=stg_blocks, modality_scale=modality_scale, ) else: # Use original denoise_dev with computed sigmas latents = denoise_dev( latents, video_positions, video_embeddings_pos, video_embeddings_neg, transformer, sigmas, cfg_scale=cfg_scale, cfg_rescale=cfg_rescale, verbose=verbose, state=video_state, use_apg=use_apg, apg_eta=apg_eta, apg_norm_threshold=apg_norm_threshold, stg_scale=stg_scale, stg_blocks=stg_blocks, ) # Load VAE decoder (for dev pipeline, loaded here instead of during upsampling) vae_decoder = VideoDecoder.from_pretrained(str(model_path / "vae" / "decoder")) elif pipeline == PipelineType.DEV_TWO_STAGE: # ====================================================================== # DEV TWO-STAGE PIPELINE: # Stage 1: Dev denoising at half resolution with CFG # Upsample: 2x spatial via LatentUpsampler # Stage 2: Distilled denoising at full resolution with LoRA, no CFG # ====================================================================== # Load VAE encoder for I2V stage1_image_latent = None stage2_image_latent = None if is_i2v: with console.status("[blue]🖼️ Loading VAE encoder and encoding image...[/]", spinner="dots"): vae_encoder = VideoEncoder.from_pretrained(model_path / "vae" / "encoder") input_image = load_image(image, height=height // 2, width=width // 2, dtype=model_dtype) stage1_image_tensor = prepare_image_for_encoding(input_image, height // 2, width // 2, dtype=model_dtype) stage1_image_latent = vae_encoder(stage1_image_tensor) mx.eval(stage1_image_latent) input_image = load_image(image, height=height, width=width, dtype=model_dtype) stage2_image_tensor = prepare_image_for_encoding(input_image, height, width, dtype=model_dtype) stage2_image_latent = vae_encoder(stage2_image_tensor) mx.eval(stage2_image_latent) del vae_encoder mx.clear_cache() console.print("[green]✓[/] VAE encoder loaded and image encoded") # Stage 1: Dev denoising at half resolution with CFG sigmas = ltx2_scheduler(steps=num_inference_steps) mx.eval(sigmas) console.print(f"[dim]Stage 1 sigma schedule: {sigmas[0].item():.4f} → {sigmas[-2].item():.4f} → {sigmas[-1].item():.4f}[/]") console.print(f"\n[bold yellow]⚡ Stage 1:[/] Dev generating at {width//2}x{height//2} ({num_inference_steps} steps, CFG={cfg_scale}, rescale={cfg_rescale})") mx.random.seed(seed) positions = create_position_grid(1, latent_frames, stage1_h, stage1_w) mx.eval(positions) audio_positions = None audio_latents = None if audio: audio_positions = create_audio_position_grid(1, audio_frames) audio_latents = mx.random.normal((1, AUDIO_LATENT_CHANNELS, audio_frames, AUDIO_MEL_BINS), dtype=model_dtype) mx.eval(audio_positions, audio_latents) # Apply I2V conditioning for stage 1 state1 = None stage1_shape = (1, 128, latent_frames, stage1_h, stage1_w) if is_i2v and stage1_image_latent is not None: state1 = LatentState( latent=mx.zeros(stage1_shape, dtype=model_dtype), clean_latent=mx.zeros(stage1_shape, dtype=model_dtype), denoise_mask=mx.ones((1, 1, latent_frames, 1, 1), dtype=model_dtype), ) conditioning = VideoConditionByLatentIndex(latent=stage1_image_latent, frame_idx=image_frame_idx, strength=image_strength) state1 = apply_conditioning(state1, [conditioning]) noise = mx.random.normal(stage1_shape, dtype=model_dtype) noise_scale = sigmas[0] scaled_mask = state1.denoise_mask * noise_scale state1 = LatentState( latent=noise * scaled_mask + state1.latent * (mx.array(1.0, dtype=model_dtype) - scaled_mask), clean_latent=state1.clean_latent, denoise_mask=state1.denoise_mask, ) latents = state1.latent mx.eval(latents) else: latents = mx.random.normal(stage1_shape, dtype=model_dtype) mx.eval(latents) # Stage 1: Joint AV denoising at half resolution (matches PyTorch) if audio: latents, audio_latents = denoise_dev_av( latents, audio_latents, positions, audio_positions, video_embeddings_pos, video_embeddings_neg, audio_embeddings_pos, audio_embeddings_neg, transformer, sigmas, cfg_scale=cfg_scale, audio_cfg_scale=audio_cfg_scale, cfg_rescale=cfg_rescale, verbose=verbose, video_state=state1, use_apg=use_apg, apg_eta=apg_eta, apg_norm_threshold=apg_norm_threshold, stg_scale=stg_scale, stg_video_blocks=stg_blocks, stg_audio_blocks=stg_blocks, modality_scale=modality_scale, ) else: latents = denoise_dev( latents, positions, video_embeddings_pos, video_embeddings_neg, transformer, sigmas, cfg_scale=cfg_scale, cfg_rescale=cfg_rescale, verbose=verbose, state=state1, use_apg=use_apg, apg_eta=apg_eta, apg_norm_threshold=apg_norm_threshold, stg_scale=stg_scale, stg_blocks=stg_blocks, ) if audio and audio_latents is not None: mx.eval(audio_latents) # Upsample latents 2x with console.status("[magenta]🔍 Upsampling latents 2x...[/]", spinner="dots"): upscaler_files = sorted(model_path.glob("*spatial-upscaler-x2*.safetensors")) if not upscaler_files: raise FileNotFoundError(f"No spatial upscaler found in {model_path}") upsampler = load_upsampler(str(upscaler_files[0])) mx.eval(upsampler.parameters()) vae_decoder = VideoDecoder.from_pretrained(str(model_path / "vae" / "decoder")) latents = upsample_latents(latents, upsampler, vae_decoder.per_channel_statistics.mean, vae_decoder.per_channel_statistics.std) mx.eval(latents) del upsampler mx.clear_cache() console.print("[green]✓[/] Latents upsampled") # Merge LoRA weights for stage 2 (distilled refinement) if lora_path is None: # Auto-detect LoRA file in model directory lora_files = sorted(model_path.glob("*distilled-lora*.safetensors")) if lora_files: lora_path = str(lora_files[0]) console.print(f"[dim]Auto-detected LoRA: {Path(lora_path).name}[/]") else: console.print("[yellow]⚠️ No LoRA file found. Stage 2 will use base weights.[/]") if lora_path is not None: with console.status("[blue]🔧 Merging distilled LoRA weights...[/]", spinner="dots"): load_and_merge_lora(transformer, lora_path, strength=lora_strength) # Stage 2: Distilled refinement at full resolution (no CFG) # Matches PyTorch: re-noise audio at sigma=0.909375, then jointly refine # both video and audio through the distilled schedule using the LoRA-merged model. console.print(f"\n[bold yellow]⚡ Stage 2:[/] Distilled refining at {width}x{height} (3 steps, no CFG)") positions = create_position_grid(1, latent_frames, stage2_h, stage2_w) mx.eval(positions) state2 = None if is_i2v and stage2_image_latent is not None: state2 = LatentState( latent=latents, clean_latent=mx.zeros_like(latents), denoise_mask=mx.ones((1, 1, latent_frames, 1, 1), dtype=model_dtype), ) conditioning = VideoConditionByLatentIndex(latent=stage2_image_latent, frame_idx=image_frame_idx, strength=image_strength) state2 = apply_conditioning(state2, [conditioning]) noise = mx.random.normal(latents.shape).astype(model_dtype) noise_scale = mx.array(STAGE_2_SIGMAS[0], dtype=model_dtype) scaled_mask = state2.denoise_mask * noise_scale state2 = LatentState( latent=noise * scaled_mask + state2.latent * (mx.array(1.0, dtype=model_dtype) - scaled_mask), clean_latent=state2.clean_latent, denoise_mask=state2.denoise_mask, ) latents = state2.latent mx.eval(latents) else: noise_scale = mx.array(STAGE_2_SIGMAS[0], dtype=model_dtype) one_minus_scale = mx.array(1.0 - STAGE_2_SIGMAS[0], dtype=model_dtype) noise = mx.random.normal(latents.shape).astype(model_dtype) latents = noise * noise_scale + latents * one_minus_scale mx.eval(latents) # Re-noise audio at sigma=0.909375 for joint refinement (matches PyTorch) if audio and audio_latents is not None: audio_noise = mx.random.normal(audio_latents.shape, dtype=model_dtype) audio_noise_scale = mx.array(STAGE_2_SIGMAS[0], dtype=model_dtype) audio_latents = audio_noise * audio_noise_scale + audio_latents * (mx.array(1.0, dtype=model_dtype) - audio_noise_scale) mx.eval(audio_latents) # Joint video + audio refinement (no CFG, positive embeddings only) latents, audio_latents = denoise_distilled( latents, positions, text_embeddings, transformer, STAGE_2_SIGMAS, verbose=verbose, state=state2, audio_latents=audio_latents, audio_positions=audio_positions, audio_embeddings=audio_embeddings_pos if audio else None, ) del transformer mx.clear_cache() # ========================================================================== # Decode and save outputs (common to both pipelines) # ========================================================================== console.print("\n[blue]🎞️ Decoding video...[/]") # Select tiling configuration if tiling == "none": tiling_config = None elif tiling == "auto": tiling_config = TilingConfig.auto(height, width, num_frames) elif tiling == "default": tiling_config = TilingConfig.default() elif tiling == "aggressive": tiling_config = TilingConfig.aggressive() elif tiling == "conservative": tiling_config = TilingConfig.conservative() elif tiling == "spatial": tiling_config = TilingConfig.spatial_only() elif tiling == "temporal": tiling_config = TilingConfig.temporal_only() else: console.print(f"[yellow] Unknown tiling mode '{tiling}', using auto[/]") tiling_config = TilingConfig.auto(height, width, num_frames) output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) # Stream mode video_writer = None stream_progress = None if stream and tiling_config is not None: import cv2 fourcc = cv2.VideoWriter_fourcc(*'avc1') video_writer = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height)) stream_progress = Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(), TaskProgressColumn(), console=console, ) stream_progress.start() stream_task = stream_progress.add_task("[cyan]Streaming frames[/]", total=num_frames) def on_frames_ready(frames: mx.array, _start_idx: int): frames = mx.squeeze(frames, axis=0) frames = mx.transpose(frames, (1, 2, 3, 0)) frames = mx.clip((frames + 1.0) / 2.0, 0.0, 1.0) frames = (frames * 255).astype(mx.uint8) frames_np = np.array(frames) for frame in frames_np: video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)) stream_progress.advance(stream_task) else: on_frames_ready = None if tiling_config is not None: spatial_info = f"{tiling_config.spatial_config.tile_size_in_pixels}px" if tiling_config.spatial_config else "none" temporal_info = f"{tiling_config.temporal_config.tile_size_in_frames}f" if tiling_config.temporal_config else "none" console.print(f"[dim] Tiling ({tiling}): spatial={spatial_info}, temporal={temporal_info}[/]") video = vae_decoder.decode_tiled(latents, tiling_config=tiling_config, tiling_mode=tiling, debug=verbose, on_frames_ready=on_frames_ready) else: console.print("[dim] Tiling: disabled[/]") video = vae_decoder(latents) mx.eval(video) mx.clear_cache() # Close stream writer if video_writer is not None: video_writer.release() if stream_progress is not None: stream_progress.stop() console.print(f"[green]✅ Streamed video to[/] {output_path}") video = mx.squeeze(video, axis=0) video = mx.transpose(video, (1, 2, 3, 0)) video = mx.clip((video + 1.0) / 2.0, 0.0, 1.0) video = (video * 255).astype(mx.uint8) video_np = np.array(video) else: video = mx.squeeze(video, axis=0) video = mx.transpose(video, (1, 2, 3, 0)) video = mx.clip((video + 1.0) / 2.0, 0.0, 1.0) video = (video * 255).astype(mx.uint8) video_np = np.array(video) if audio: temp_video_path = output_path.with_suffix('.temp.mp4') save_path = temp_video_path else: save_path = output_path try: import cv2 h, w = video_np.shape[1], video_np.shape[2] fourcc = cv2.VideoWriter_fourcc(*'avc1') out = cv2.VideoWriter(str(save_path), fourcc, fps, (w, h)) for frame in video_np: out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)) out.release() if not audio: console.print(f"[green]✅ Saved video to[/] {output_path}") except Exception as e: console.print(f"[red]❌ Could not save video: {e}[/]") # Decode and save audio if enabled audio_np = None if audio and audio_latents is not None: with console.status("[blue]🔊 Decoding audio...[/]", spinner="dots"): audio_decoder = load_audio_decoder(model_path, pipeline) vocoder = load_vocoder_model(model_path, pipeline) mx.eval(audio_decoder.parameters(), vocoder.parameters()) mel_spectrogram = audio_decoder(audio_latents) mx.eval(mel_spectrogram) console.print(f"[dim] Mel spectrogram: shape={mel_spectrogram.shape}, std={mel_spectrogram.std().item():.4f}, mean={mel_spectrogram.mean().item():.4f}[/]") audio_waveform = vocoder(mel_spectrogram) mx.eval(audio_waveform) audio_np = np.array(audio_waveform.astype(mx.float32)) if audio_np.ndim == 3: audio_np = audio_np[0] # Get sample rate from vocoder (dynamic: 24kHz for LTX-2, 48kHz for LTX-2.3 BWE) vocoder_sample_rate = getattr(vocoder, 'output_sampling_rate', AUDIO_SAMPLE_RATE) del audio_decoder, vocoder mx.clear_cache() console.print("[green]✓[/] Audio decoded") audio_path = Path(output_audio_path) if output_audio_path else output_path.with_suffix('.wav') save_audio(audio_np, audio_path, vocoder_sample_rate) console.print(f"[green]✅ Saved audio to[/] {audio_path}") with console.status("[blue]🎬 Combining video and audio...[/]", spinner="dots"): temp_video_path = output_path.with_suffix('.temp.mp4') success = mux_video_audio(temp_video_path, audio_path, output_path) if success: console.print(f"[green]✅ Saved video with audio to[/] {output_path}") temp_video_path.unlink() else: temp_video_path.rename(output_path) console.print(f"[yellow]⚠️ Saved video without audio to[/] {output_path}") del vae_decoder mx.clear_cache() if save_frames: frames_dir = output_path.parent / f"{output_path.stem}_frames" frames_dir.mkdir(exist_ok=True) for i, frame in enumerate(video_np): Image.fromarray(frame).save(frames_dir / f"frame_{i:04d}.png") console.print(f"[green]✅ Saved {len(video_np)} frames to {frames_dir}[/]") elapsed = time.time() - start_time minutes, seconds = divmod(elapsed, 60) time_str = f"{int(minutes)}m {seconds:.1f}s" if minutes >= 1 else f"{seconds:.1f}s" console.print(Panel( f"[bold green]🎉 Done![/] Generated in {time_str} ({elapsed/num_frames:.2f}s/frame)\n" f"[bold green]✨ Peak memory:[/] {mx.get_peak_memory() / (1024 ** 3):.2f}GB", expand=False )) if audio: return video_np, audio_np return video_np def main(): parser = argparse.ArgumentParser( description="Generate videos with MLX LTX-2 (Distilled or Dev pipeline)", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Distilled pipeline (two-stage, fast, no CFG) python -m mlx_video.generate --prompt "A cat walking on grass" python -m mlx_video.generate --prompt "Ocean waves" --pipeline distilled # Dev pipeline (single-stage, CFG, higher quality) python -m mlx_video.generate --prompt "A cat walking" --pipeline dev --cfg-scale 3.0 python -m mlx_video.generate --prompt "Ocean waves" --pipeline dev --steps 40 # Dev two-stage pipeline (dev + LoRA refinement) python -m mlx_video.generate --prompt "A cat walking" --pipeline dev-two-stage --cfg-scale 3.0 # Image-to-Video (works with both pipelines) python -m mlx_video.generate --prompt "A person dancing" --image photo.jpg python -m mlx_video.generate --prompt "Waves crashing" --image beach.png --pipeline dev # With Audio (works with both pipelines) python -m mlx_video.generate --prompt "Ocean waves crashing" --audio python -m mlx_video.generate --prompt "A jazz band playing" --audio --pipeline dev """ ) parser.add_argument("--prompt", "-p", type=str, required=True, help="Text description of the video to generate") parser.add_argument("--pipeline", type=str, default="distilled", choices=["distilled", "dev", "dev-two-stage"], help="Pipeline type: distilled (two-stage, fast), dev (single-stage, CFG), or dev-two-stage (dev + LoRA refinement)") parser.add_argument("--negative-prompt", type=str, default=DEFAULT_NEGATIVE_PROMPT, help="Negative prompt for CFG (dev pipeline only)") parser.add_argument("--height", "-H", type=int, default=512, help="Output video height") parser.add_argument("--width", "-W", type=int, default=512, help="Output video width") parser.add_argument("--num-frames", "-n", type=int, default=33, help="Number of frames") parser.add_argument("--steps", type=int, default=30, help="Number of inference steps (dev pipeline only, default 30)") parser.add_argument("--cfg-scale", type=float, default=3.0, help="CFG guidance scale for video (dev pipeline only, default 3.0)") parser.add_argument("--audio-cfg-scale", type=float, default=7.0, help="CFG guidance scale for audio (default 7.0, PyTorch default)") parser.add_argument("--cfg-rescale", type=float, default=0.7, help="CFG rescale factor (0.0-1.0). Normalizes guided prediction variance to reduce artifacts (dev pipeline only, default 0.7)") parser.add_argument("--seed", "-s", type=int, default=42, help="Random seed") parser.add_argument("--fps", type=int, default=24, help="Frames per second") parser.add_argument("--output-path", "-o", type=str, default="output.mp4", help="Output video path") parser.add_argument("--save-frames", action="store_true", help="Save individual frames as images") parser.add_argument("--model-repo", type=str, default="Lightricks/LTX-2", help="Model repository") parser.add_argument("--text-encoder-repo", type=str, default=None, help="Text encoder repository") parser.add_argument("--verbose", action="store_true", help="Verbose output") parser.add_argument("--enhance-prompt", action="store_true", help="Enhance the prompt using Gemma") parser.add_argument("--max-tokens", type=int, default=512, help="Max tokens for prompt enhancement") parser.add_argument("--temperature", type=float, default=0.7, help="Temperature for prompt enhancement") parser.add_argument("--image", "-i", type=str, default=None, help="Path to conditioning image for I2V") parser.add_argument("--image-strength", type=float, default=1.0, help="Conditioning strength for I2V") parser.add_argument("--image-frame-idx", type=int, default=0, help="Frame index to condition for I2V") parser.add_argument("--tiling", type=str, default="auto", choices=["auto", "none", "default", "aggressive", "conservative", "spatial", "temporal"], help="Tiling mode for VAE decoding") parser.add_argument("--stream", action="store_true", help="Stream frames to output as they're decoded") parser.add_argument("--audio", "-a", action="store_true", help="Enable synchronized audio generation") parser.add_argument("--output-audio", type=str, default=None, help="Output audio path") parser.add_argument("--apg", action="store_true", help="Use Adaptive Projected Guidance instead of CFG (more stable for I2V)") parser.add_argument("--apg-eta", type=float, default=1.0, help="APG parallel component weight (1.0 = keep full parallel)") parser.add_argument("--apg-norm-threshold", type=float, default=0.0, help="APG guidance norm clamp (0 = no clamping)") parser.add_argument("--stg-scale", type=float, default=0.0, help="STG (Spatiotemporal Guidance) scale (default 0.0 = disabled, PyTorch default: 1.0)") parser.add_argument("--stg-blocks", type=int, nargs="+", default=None, help="Transformer block indices for STG perturbation (default: [29] for LTX-2, [28] for LTX-2.3)") parser.add_argument("--modality-scale", type=float, default=1.0, help="Cross-modal guidance scale (default 1.0 = disabled, PyTorch default: 3.0)") parser.add_argument("--lora-path", type=str, default=None, help="Path to LoRA safetensors file (dev-two-stage pipeline)") parser.add_argument("--lora-strength", type=float, default=1.0, help="LoRA merge strength (dev-two-stage pipeline, default 1.0)") args = parser.parse_args() pipeline_map = { "distilled": PipelineType.DISTILLED, "dev": PipelineType.DEV, "dev-two-stage": PipelineType.DEV_TWO_STAGE, } pipeline = pipeline_map[args.pipeline] generate_video( model_repo=args.model_repo, text_encoder_repo=args.text_encoder_repo, prompt=args.prompt, pipeline=pipeline, negative_prompt=args.negative_prompt, height=args.height, width=args.width, num_frames=args.num_frames, num_inference_steps=args.steps, cfg_scale=args.cfg_scale, audio_cfg_scale=args.audio_cfg_scale, cfg_rescale=args.cfg_rescale, seed=args.seed, fps=args.fps, output_path=args.output_path, save_frames=args.save_frames, verbose=args.verbose, enhance_prompt=args.enhance_prompt, max_tokens=args.max_tokens, temperature=args.temperature, image=args.image, image_strength=args.image_strength, image_frame_idx=args.image_frame_idx, tiling=args.tiling, stream=args.stream, audio=args.audio, output_audio_path=args.output_audio, use_apg=args.apg, apg_eta=args.apg_eta, apg_norm_threshold=args.apg_norm_threshold, stg_scale=args.stg_scale, stg_blocks=args.stg_blocks, modality_scale=args.modality_scale, lora_path=args.lora_path, lora_strength=args.lora_strength, ) if __name__ == "__main__": main()