# ltx2-mps/story_continuous.py

#!/usr/bin/env python3
"""
generate a continuous story video with ltx-2
uses image-to-video to maintain visual continuity between scenes
"""

import os
import subprocess
import numpy as np
import torch
from PIL import Image
from diffusers import LTX2Pipeline, LTX2ImageToVideoPipeline
from diffusers.pipelines.ltx2.export_utils import encode_video

# prompts for the story, in playback order (one generated clip per entry);
# long prompts are wrapped with implicit string concatenation
SCENES = [
    (
        "Sunrise over the Swiss Alps, snow-covered peaks glowing orange and pink, "
        "a white Swiss shepherd dog standing on a ridge, peaceful morning, "
        "cinematic wide shot"
    ),
    (
        "The Swiss shepherd walking through deep powder snow, determined stride, "
        "pine trees, snow particles in air, morning light, "
        "tracking shot following the dog"
    ),
    (
        "The shepherd stops suddenly, ears perked, alert pose, listening intently, "
        "snowy forest, something caught its attention"
    ),
    (
        "A small white lamb alone in the snow, shivering, lost and scared, "
        "the shepherd approaches gently in the background"
    ),
    (
        "The Swiss shepherd nuzzling the scared lamb, comforting gesture, "
        "warm breath visible, tender moment, shallow depth of field"
    ),
    (
        "The shepherd leading the lamb through snowy alpine meadow, protective stance, "
        "walking together, mountains in background, golden hour"
    ),
    (
        "Wide shot of dog and lamb crossing a snowy hill, vast white landscape, "
        "beautiful alpine scenery, afternoon light"
    ),
    (
        "A cozy Swiss mountain village appearing in the distance, warm lights glowing, "
        "smoke from chimneys, dusk, hopeful atmosphere"
    ),
    (
        "The shepherd and lamb arriving at a wooden barn, warm light spilling out, "
        "welcoming atmosphere, journey's end"
    ),
    (
        "Night sky over the Alps with stars, the shepherd dog silhouette on a ridge, "
        "majestic ending, peaceful, cinematic finale"
    ),
]

def _frames_to_tensor(frames):
    """stack a sequence of frames into a single tensor, one entry per frame

    each frame goes through ``np.array`` first, so anything numpy can convert
    (e.g. PIL images) is accepted.
    """
    return torch.stack([torch.from_numpy(np.array(frame)) for frame in frames])


def main():
    """generate one clip per prompt in SCENES and join them into a single video

    the first scene is generated from text only; every later scene is generated
    by the image-to-video pipeline, seeded with the last frame of the previous
    scene. each scene is written to ``~/Desktop/mountain_guardian/scene_NN.mp4``
    and the concatenation of all scenes to ``mountain_guardian_full.mp4`` in the
    same directory.
    """
    output_dir = os.path.expanduser("~/Desktop/mountain_guardian")
    os.makedirs(output_dir, exist_ok=True)

    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"using {device}")

    print("loading text-to-video pipeline...")
    t2v_pipe = LTX2Pipeline.from_pretrained(
        "Lightricks/LTX-2",
        torch_dtype=torch.bfloat16,
    )
    t2v_pipe.to(device)

    # build the image-to-video pipeline from the modules that are already
    # loaded (and already on `device`) instead of calling from_pretrained a
    # second time: both pipelines use the same checkpoint, and a second load
    # would keep a second full copy of the weights in memory.
    print("loading image-to-video pipeline...")
    i2v_pipe = LTX2ImageToVideoPipeline.from_pipe(t2v_pipe)

    width, height = 768, 448
    frames = 97  # ~4 seconds per scene
    steps = 20
    fps = 24  # frame rate used for every encoded file

    # generation settings shared by both pipelines
    shared_kwargs = {
        "negative_prompt": "blurry, low quality, distorted, deformed",
        "width": width,
        "height": height,
        "num_frames": frames,
        "num_inference_steps": steps,
    }

    all_video_frames = []
    all_audio = []
    audio_sr = None  # stays None unless at least one scene returns audio
    scenes_without_audio = 0
    last_frame = None

    for i, prompt in enumerate(SCENES):
        is_first = i == 0

        print(f"\n{'='*60}")
        print(f"scene {i+1}/{len(SCENES)}")
        print(f"prompt: {prompt[:60]}...")
        print(f"{'='*60}\n")

        if is_first:
            # first scene: text-to-video
            result = t2v_pipe(prompt=prompt, **shared_kwargs)
        else:
            # subsequent scenes: image-to-video from the previous last frame
            result = i2v_pipe(image=last_frame, prompt=prompt, **shared_kwargs)

        video_frames = result.frames[0]

        # remember the last frame so the next scene can start from it
        last_frame = video_frames[-1]

        # scenes 2+ start on (a copy of) the previous scene's last frame, so
        # that frame is dropped from the combined video
        all_video_frames.extend(video_frames if is_first else video_frames[1:])

        if result.audio is not None:
            # move to cpu once; the same tensor serves the scene file and the
            # combined track, and nothing is left waiting on the device
            audio = result.audio[0].float().cpu()
            audio_sr = t2v_pipe.vocoder.config.output_sampling_rate
            # the combined video loses one frame for scenes 2+; drop the same
            # duration from the start of the audio so picture and sound stay
            # aligned (time is the last dim, as in the concatenation below)
            skip = 0 if is_first else round(audio_sr / fps)
            all_audio.append(audio[..., skip:])
            scene_sr = audio_sr
        else:
            audio = None
            scene_sr = None
            scenes_without_audio += 1

        # save individual scene (all frames, full audio)
        scene_path = os.path.join(output_dir, f"scene_{i+1:02d}.mp4")
        encode_video(
            _frames_to_tensor(video_frames),
            fps=fps,
            audio=audio,
            audio_sample_rate=scene_sr,
            output_path=scene_path,
        )
        print(f"saved: {scene_path}")

    # save full video
    print("\ncreating full video...")
    full_path = os.path.join(output_dir, "mountain_guardian_full.mp4")

    if all_audio and scenes_without_audio:
        print(
            f"warning: {scenes_without_audio} scene(s) returned no audio; "
            "the combined audio track will be shorter than the video"
        )

    # concatenate audio along the time axis
    full_audio = torch.cat(all_audio, dim=-1) if all_audio else None

    encode_video(
        _frames_to_tensor(all_video_frames),
        fps=fps,
        audio=full_audio,
        audio_sample_rate=audio_sr,
        output_path=full_path,
    )

    print(f"\n{'='*60}")
    print("done!")
    print(f"total frames: {len(all_video_frames)}")
    print(f"duration: ~{len(all_video_frames)/fps:.1f} seconds")
    print(f"saved to: {full_path}")
    print(f"{'='*60}")

# run the generation only when executed as a script, not when imported
if __name__ == "__main__":
    main()