135 lines
5.2 KiB
Python
135 lines
5.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
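Generate a continuous story video with LTX-2.

The first scene is rendered with text-to-video; every later scene is rendered
with image-to-video, seeded with the previous scene's last frame, to maintain
visual continuity between scenes.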
|
|
"""
|
|
|
|
import os
|
|
import subprocess
|
|
import numpy as np
|
|
import torch
|
|
from PIL import Image
|
|
from diffusers import LTX2Pipeline, LTX2ImageToVideoPipeline
|
|
from diffusers.pipelines.ltx2.export_utils import encode_video
|
|
|
|
# Story beats, in playback order. Each entry is the full text prompt for one
# generated clip; main() renders them one after another.
SCENES = [
    # 1. opening: sunrise, dog on the ridge
    "Sunrise over the Swiss Alps, snow-covered peaks glowing orange and pink, a white Swiss shepherd dog standing on a ridge, peaceful morning, cinematic wide shot",
    # 2. the dog sets off through the snow
    "The Swiss shepherd walking through deep powder snow, determined stride, pine trees, snow particles in air, morning light, tracking shot following the dog",
    # 3. something catches its attention
    "The shepherd stops suddenly, ears perked, alert pose, listening intently, snowy forest, something caught its attention",
    # 4. the lost lamb
    "A small white lamb alone in the snow, shivering, lost and scared, the shepherd approaches gently in the background",
    # 5. comforting the lamb
    "The Swiss shepherd nuzzling the scared lamb, comforting gesture, warm breath visible, tender moment, shallow depth of field",
    # 6. leading the lamb across the meadow
    "The shepherd leading the lamb through snowy alpine meadow, protective stance, walking together, mountains in background, golden hour",
    # 7. wide shot of the crossing
    "Wide shot of dog and lamb crossing a snowy hill, vast white landscape, beautiful alpine scenery, afternoon light",
    # 8. the village comes into view
    "A cozy Swiss mountain village appearing in the distance, warm lights glowing, smoke from chimneys, dusk, hopeful atmosphere",
    # 9. arrival at the barn
    "The shepherd and lamb arriving at a wooden barn, warm light spilling out, welcoming atmosphere, journey's end",
    # 10. finale under the night sky
    "Night sky over the Alps with stars, the shepherd dog silhouette on a ridge, majestic ending, peaceful, cinematic finale",
]
|
|
|
|
def _frames_to_tensor(frames):
    """Stack a sequence of frames into a single tensor, converting each via NumPy."""
    return torch.stack([torch.from_numpy(np.array(frame)) for frame in frames])


def main():
    """Render every prompt in SCENES and stitch the clips into one video.

    The first scene is generated with the text-to-video pipeline. Every later
    scene is generated with the image-to-video pipeline, conditioned on the
    last frame of the previous scene. Each scene is written to
    ``scene_NN.mp4`` and the stitched result to ``mountain_guardian_full.mp4``,
    both under ``~/Desktop/mountain_guardian`` (created if missing).

    In the stitched video the first frame of scenes 2+ is dropped, because it
    repeats the previous scene's last frame. The matching stretch of audio is
    dropped as well so that sound and picture stay aligned across scenes.
    """
    output_dir = os.path.expanduser("~/Desktop/mountain_guardian")
    os.makedirs(output_dir, exist_ok=True)

    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"using {device}")

    print("loading text-to-video pipeline...")
    t2v_pipe = LTX2Pipeline.from_pretrained(
        "Lightricks/LTX-2",
        torch_dtype=torch.bfloat16,
    )
    t2v_pipe.to(device)

    # Build the image-to-video pipeline from the components that are already
    # loaded instead of calling from_pretrained() a second time, which would
    # keep two full copies of the same checkpoint in memory. torch_dtype is
    # passed explicitly so the shared modules are never re-cast.
    print("loading image-to-video pipeline...")
    i2v_pipe = LTX2ImageToVideoPipeline.from_pipe(
        t2v_pipe,
        torch_dtype=torch.bfloat16,
    )
    i2v_pipe.to(device)

    width, height = 768, 448
    frames = 97  # ~4 seconds per scene
    steps = 20
    fps = 24
    negative_prompt = "blurry, low quality, distorted, deformed"

    # Loop-invariant: sample rate of the generated audio, and how many audio
    # samples correspond to one video frame at the encoding frame rate.
    audio_sr = t2v_pipe.vocoder.config.output_sampling_rate
    samples_per_frame = round(audio_sr / fps)

    all_video_frames = []
    all_audio = []
    last_frame = None

    for i, prompt in enumerate(SCENES):
        print(f"\n{'='*60}")
        print(f"scene {i+1}/{len(SCENES)}")
        print(f"prompt: {prompt[:60]}...")
        print(f"{'='*60}\n")

        generation_args = dict(
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=width,
            height=height,
            num_frames=frames,
            num_inference_steps=steps,
        )
        if i == 0:
            # first scene: text-to-video
            result = t2v_pipe(**generation_args)
        else:
            # subsequent scenes: image-to-video from the previous last frame
            result = i2v_pipe(image=last_frame, **generation_args)

        video_frames = result.frames[0]

        # keep the final frame to seed the next scene
        last_frame = video_frames[-1]

        # Move the audio to CPU float once; it is reused for both the
        # per-scene file and the stitched track.
        audio = result.audio[0].float().cpu() if result.audio is not None else None

        if i == 0:
            all_video_frames.extend(video_frames)
            if audio is not None:
                all_audio.append(audio)
        else:
            # Skip the first frame (duplicate of the previous scene's last
            # frame) and the audio that accompanies it; otherwise the stitched
            # audio grows one frame longer than the video for every scene.
            # Assumes the last axis of the audio tensor is the sample axis,
            # consistent with concatenating along dim=-1 below.
            all_video_frames.extend(video_frames[1:])
            if audio is not None:
                all_audio.append(audio[..., samples_per_frame:])

        # save the individual scene, untrimmed
        scene_path = os.path.join(output_dir, f"scene_{i+1:02d}.mp4")
        encode_video(
            _frames_to_tensor(video_frames),
            fps=fps,
            audio=audio,
            audio_sample_rate=audio_sr if audio is not None else None,
            output_path=scene_path,
        )
        print(f"saved: {scene_path}")

    # save full video
    print("\ncreating full video...")
    full_path = os.path.join(output_dir, "mountain_guardian_full.mp4")

    if all_audio:
        full_audio = torch.cat(all_audio, dim=-1)
        full_audio_sr = audio_sr
    else:
        full_audio = None
        full_audio_sr = None

    encode_video(
        _frames_to_tensor(all_video_frames),
        fps=fps,
        audio=full_audio,
        audio_sample_rate=full_audio_sr,
        output_path=full_path,
    )

    print(f"\n{'='*60}")
    print("done!")
    print(f"total frames: {len(all_video_frames)}")
    print(f"duration: ~{len(all_video_frames)/fps:.1f} seconds")
    print(f"saved to: {full_path}")
    print(f"{'='*60}")
|
|
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|