Added story generator
This commit is contained in:
134
story_continuous.py
Normal file
134
story_continuous.py
Normal file
@@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
generate a continuous story video with ltx-2
|
||||
uses image-to-video to maintain visual continuity between scenes
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from diffusers import LTX2Pipeline, LTX2ImageToVideoPipeline
|
||||
from diffusers.pipelines.ltx2.export_utils import encode_video
|
||||
|
||||
# story scenes
|
||||
SCENES = [
|
||||
"Sunrise over the Swiss Alps, snow-covered peaks glowing orange and pink, a white Swiss shepherd dog standing on a ridge, peaceful morning, cinematic wide shot",
|
||||
"The Swiss shepherd walking through deep powder snow, determined stride, pine trees, snow particles in air, morning light, tracking shot following the dog",
|
||||
"The shepherd stops suddenly, ears perked, alert pose, listening intently, snowy forest, something caught its attention",
|
||||
"A small white lamb alone in the snow, shivering, lost and scared, the shepherd approaches gently in the background",
|
||||
"The Swiss shepherd nuzzling the scared lamb, comforting gesture, warm breath visible, tender moment, shallow depth of field",
|
||||
"The shepherd leading the lamb through snowy alpine meadow, protective stance, walking together, mountains in background, golden hour",
|
||||
"Wide shot of dog and lamb crossing a snowy hill, vast white landscape, beautiful alpine scenery, afternoon light",
|
||||
"A cozy Swiss mountain village appearing in the distance, warm lights glowing, smoke from chimneys, dusk, hopeful atmosphere",
|
||||
"The shepherd and lamb arriving at a wooden barn, warm light spilling out, welcoming atmosphere, journey's end",
|
||||
"Night sky over the Alps with stars, the shepherd dog silhouette on a ridge, majestic ending, peaceful, cinematic finale",
|
||||
]
|
||||
|
||||
def main():
|
||||
output_dir = os.path.expanduser("~/Desktop/mountain_guardian")
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
device = "mps" if torch.backends.mps.is_available() else "cpu"
|
||||
print(f"using {device}")
|
||||
|
||||
# load both pipelines
|
||||
print("loading text-to-video pipeline...")
|
||||
t2v_pipe = LTX2Pipeline.from_pretrained(
|
||||
"Lightricks/LTX-2",
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
t2v_pipe.to(device)
|
||||
|
||||
print("loading image-to-video pipeline...")
|
||||
i2v_pipe = LTX2ImageToVideoPipeline.from_pretrained(
|
||||
"Lightricks/LTX-2",
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
i2v_pipe.to(device)
|
||||
|
||||
width, height = 768, 448
|
||||
frames = 97 # ~4 seconds per scene
|
||||
steps = 20
|
||||
|
||||
all_video_frames = []
|
||||
all_audio = []
|
||||
last_frame = None
|
||||
|
||||
for i, prompt in enumerate(SCENES):
|
||||
print(f"\n{'='*60}")
|
||||
print(f"scene {i+1}/{len(SCENES)}")
|
||||
print(f"prompt: {prompt[:60]}...")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
if i == 0:
|
||||
# first scene: text-to-video
|
||||
result = t2v_pipe(
|
||||
prompt=prompt,
|
||||
negative_prompt="blurry, low quality, distorted, deformed",
|
||||
width=width,
|
||||
height=height,
|
||||
num_frames=frames,
|
||||
num_inference_steps=steps,
|
||||
)
|
||||
else:
|
||||
# subsequent scenes: image-to-video from last frame
|
||||
result = i2v_pipe(
|
||||
image=last_frame,
|
||||
prompt=prompt,
|
||||
negative_prompt="blurry, low quality, distorted, deformed",
|
||||
width=width,
|
||||
height=height,
|
||||
num_frames=frames,
|
||||
num_inference_steps=steps,
|
||||
)
|
||||
|
||||
# get frames
|
||||
video_frames = result.frames[0]
|
||||
|
||||
# save last frame for next scene
|
||||
last_frame = video_frames[-1]
|
||||
|
||||
# collect frames (skip first frame for scenes 2+ to avoid duplicate)
|
||||
if i == 0:
|
||||
all_video_frames.extend(video_frames)
|
||||
else:
|
||||
all_video_frames.extend(video_frames[1:]) # skip first frame (duplicate of last)
|
||||
|
||||
# collect audio
|
||||
if result.audio is not None:
|
||||
all_audio.append(result.audio[0])
|
||||
|
||||
# save individual scene
|
||||
scene_path = os.path.join(output_dir, f"scene_{i+1:02d}.mp4")
|
||||
video_tensor = torch.stack([torch.from_numpy(np.array(f)) for f in video_frames])
|
||||
audio = result.audio[0].float().cpu() if result.audio is not None else None
|
||||
audio_sr = t2v_pipe.vocoder.config.output_sampling_rate if audio is not None else None
|
||||
encode_video(video_tensor, fps=24, audio=audio, audio_sample_rate=audio_sr, output_path=scene_path)
|
||||
print(f"saved: {scene_path}")
|
||||
|
||||
# save full video
|
||||
print("\ncreating full video...")
|
||||
full_path = os.path.join(output_dir, "mountain_guardian_full.mp4")
|
||||
video_tensor = torch.stack([torch.from_numpy(np.array(f)) for f in all_video_frames])
|
||||
|
||||
# concatenate audio
|
||||
if all_audio:
|
||||
full_audio = torch.cat(all_audio, dim=-1).float().cpu()
|
||||
audio_sr = t2v_pipe.vocoder.config.output_sampling_rate
|
||||
else:
|
||||
full_audio = None
|
||||
audio_sr = None
|
||||
|
||||
encode_video(video_tensor, fps=24, audio=full_audio, audio_sample_rate=audio_sr, output_path=full_path)
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"done!")
|
||||
print(f"total frames: {len(all_video_frames)}")
|
||||
print(f"duration: ~{len(all_video_frames)/24:.1f} seconds")
|
||||
print(f"saved to: {full_path}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user