Files
ltx2-mps/story_continuous.py
2026-01-11 20:01:20 +01:00

135 lines
5.2 KiB
Python

#!/usr/bin/env python3
"""
generate a continuous story video with ltx-2
uses image-to-video to maintain visual continuity between scenes
"""
import os
import subprocess
import numpy as np
import torch
from PIL import Image
from diffusers import LTX2Pipeline, LTX2ImageToVideoPipeline
from diffusers.pipelines.ltx2.export_utils import encode_video
# story scenes: one text prompt per generated clip, in playback order.
# main() renders them sequentially, so reordering entries reorders the story.
SCENES = [
    # opening: the dog alone in the mountains
    "Sunrise over the Swiss Alps, snow-covered peaks glowing orange and pink, a white Swiss shepherd dog standing on a ridge, peaceful morning, cinematic wide shot",
    "The Swiss shepherd walking through deep powder snow, determined stride, pine trees, snow particles in air, morning light, tracking shot following the dog",
    "The shepherd stops suddenly, ears perked, alert pose, listening intently, snowy forest, something caught its attention",
    # encounter: finding and comforting the lamb
    "A small white lamb alone in the snow, shivering, lost and scared, the shepherd approaches gently in the background",
    "The Swiss shepherd nuzzling the scared lamb, comforting gesture, warm breath visible, tender moment, shallow depth of field",
    # journey: leading the lamb home
    "The shepherd leading the lamb through snowy alpine meadow, protective stance, walking together, mountains in background, golden hour",
    "Wide shot of dog and lamb crossing a snowy hill, vast white landscape, beautiful alpine scenery, afternoon light",
    "A cozy Swiss mountain village appearing in the distance, warm lights glowing, smoke from chimneys, dusk, hopeful atmosphere",
    # ending: arrival and night-time finale
    "The shepherd and lamb arriving at a wooden barn, warm light spilling out, welcoming atmosphere, journey's end",
    "Night sky over the Alps with stars, the shepherd dog silhouette on a ridge, majestic ending, peaceful, cinematic finale",
]
def _frames_to_tensor(frames):
    """stack a sequence of frames into one tensor along a new first axis.

    each frame goes through ``np.array`` first, so anything numpy can convert
    to an array (such as the frames returned by the pipelines) is accepted.
    """
    return torch.stack([torch.from_numpy(np.array(frame)) for frame in frames])


def main():
    """generate every scene in SCENES and write per-scene and combined videos.

    the first scene is generated from text only; each later scene is generated
    image-to-video from the last frame of the previous scene so the clips flow
    into each other. every scene is saved as ``scene_NN.mp4`` and the joined
    result as ``mountain_guardian_full.mp4`` under ~/Desktop/mountain_guardian.
    """
    if not SCENES:
        # nothing to render; torch.stack below would fail on an empty list
        print("no scenes to render")
        return

    output_dir = os.path.expanduser("~/Desktop/mountain_guardian")
    os.makedirs(output_dir, exist_ok=True)
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"using {device}")

    # load the weights once and share them between both pipelines
    print("loading text-to-video pipeline...")
    t2v_pipe = LTX2Pipeline.from_pretrained(
        "Lightricks/LTX-2",
        torch_dtype=torch.bfloat16,
    )
    t2v_pipe.to(device)
    print("loading image-to-video pipeline...")
    # reuse the already-loaded (and already-moved) components instead of a
    # second from_pretrained, which would hold a second copy of the weights.
    # NOTE(review): assumes both pipeline classes take the same components.
    i2v_pipe = LTX2ImageToVideoPipeline(**t2v_pipe.components)

    width, height = 768, 448
    frames = 97  # ~4 seconds per scene
    steps = 20
    fps = 24  # single source of truth for generation, encoding and duration

    # arguments common to the text-to-video and image-to-video calls
    gen_kwargs = dict(
        negative_prompt="blurry, low quality, distorted, deformed",
        width=width,
        height=height,
        num_frames=frames,
        num_inference_steps=steps,
        frame_rate=fps,
    )

    all_video_frames = []
    all_audio = []
    audio_sr = None  # set the first time a scene actually returns audio
    last_frame = None

    for i, prompt in enumerate(SCENES):
        print(f"\n{'='*60}")
        print(f"scene {i+1}/{len(SCENES)}")
        print(f"prompt: {prompt[:60]}...")
        print(f"{'='*60}\n")

        if last_frame is None:
            # first scene: text-to-video
            result = t2v_pipe(prompt=prompt, **gen_kwargs)
        else:
            # subsequent scenes: image-to-video from last frame
            result = i2v_pipe(image=last_frame, prompt=prompt, **gen_kwargs)

        video_frames = result.frames[0]
        # save last frame as the conditioning image for the next scene
        last_frame = video_frames[-1]

        # scenes 2+ start on a copy of the previous scene's last frame, so
        # that frame is dropped from the combined video
        skip_first = i > 0
        all_video_frames.extend(video_frames[1:] if skip_first else video_frames)

        scene_audio = None
        if result.audio is not None:
            scene_audio = result.audio[0].float().cpu()
            audio_sr = t2v_pipe.vocoder.config.output_sampling_rate
            joined_audio = scene_audio
            if skip_first:
                # drop the audio that belonged to the skipped frame; otherwise
                # the combined audio grows 1/fps seconds longer than the video
                # per scene and drifts out of sync.
                # assumes samples are on the last axis, as the torch.cat below
                # already does.
                joined_audio = scene_audio[..., round(audio_sr / fps):]
            all_audio.append(joined_audio)

        # save individual scene (all frames, full audio)
        scene_path = os.path.join(output_dir, f"scene_{i+1:02d}.mp4")
        encode_video(
            _frames_to_tensor(video_frames),
            fps=fps,
            audio=scene_audio,
            audio_sample_rate=audio_sr if scene_audio is not None else None,
            output_path=scene_path,
        )
        print(f"saved: {scene_path}")

    # save full video
    print("\ncreating full video...")
    full_path = os.path.join(output_dir, "mountain_guardian_full.mp4")
    full_audio = torch.cat(all_audio, dim=-1) if all_audio else None
    encode_video(
        _frames_to_tensor(all_video_frames),
        fps=fps,
        audio=full_audio,
        audio_sample_rate=audio_sr if full_audio is not None else None,
        output_path=full_path,
    )

    print(f"\n{'='*60}")
    print("done!")
    print(f"total frames: {len(all_video_frames)}")
    print(f"duration: ~{len(all_video_frames)/fps:.1f} seconds")
    print(f"saved to: {full_path}")
    print(f"{'='*60}")
# script entry point: run the generation only when this file is executed
# directly, not when it is imported as a module
if __name__ == "__main__":
    main()