Added story generator

This commit is contained in:
Norbert Schmidt
2026-01-11 20:01:20 +01:00
parent fc95099449
commit 8174da7490
6 changed files with 846 additions and 0 deletions

5
clipmaker Executable file
View File

@@ -0,0 +1,5 @@
#!/bin/bash
# clipmaker - quick wrapper
#
# Activates the ltx virtualenv and runs clipmaker.py, which lives next to this
# script. The caller's working directory is left untouched so that relative
# paths given on the command line (--batch prompts.txt, --output clip.mp4)
# resolve against the directory the user ran the command from.
script_dir="$(cd "$(dirname "$0")" && pwd)" || exit 1

# Stop early if the virtualenv is missing instead of falling through to a
# system python that does not have the required packages installed.
source ~/ltx-venv/bin/activate || {
    echo "clipmaker: could not activate ~/ltx-venv" >&2
    exit 1
}

exec python "$script_dir/clipmaker.py" "$@"

339
clipmaker.py Executable file
View File

@@ -0,0 +1,339 @@
#!/usr/bin/env python3
"""
clipmaker - high quality video clip generator for ltx-2
usage:
clipmaker "your prompt here" # quick preview
clipmaker "your prompt" --preset hq # high quality
clipmaker "your prompt" --preset max # maximum quality
clipmaker --batch prompts.txt # batch from file
clipmaker --interactive # interactive mode
"""
import argparse
import os
import sys
import json
from datetime import datetime
from pathlib import Path
import numpy as np
import torch
from PIL import Image
# quality presets
# Each key is a preset name (also the accepted values of the --preset flag).
# ClipMaker.generate forwards the values to the pipeline call as:
#   width / height -> width / height
#   frames         -> num_frames (reported to the user as frames / 24 seconds)
#   steps          -> num_inference_steps
#   guidance       -> guidance_scale
# "description" is only shown to the user (help text and progress output).
PRESETS = {
    "preview": {
        "width": 512,
        "height": 320,
        "frames": 25,
        "steps": 10,
        "guidance": 4.0,
        "description": "fast preview (~1 min)"
    },
    "standard": {
        "width": 768,
        "height": 448,
        "frames": 49,
        "steps": 20,
        "guidance": 4.0,
        "description": "balanced quality (~5 min)"
    },
    "hq": {
        "width": 1024,
        "height": 576,
        "frames": 97,
        "steps": 25,
        "guidance": 4.0,
        "description": "high quality (~15 min)"
    },
    "max": {
        "width": 1024,
        "height": 576,
        "frames": 161,
        "steps": 30,
        "guidance": 4.0,
        "description": "maximum quality (~30 min)"
    },
    "cinematic": {
        "width": 1280,
        "height": 720,
        "frames": 97,
        "steps": 30,
        "guidance": 4.5,
        "description": "cinematic 720p (~25 min)"
    },
}
# default negative prompt based on ltx-2 guide
# used by ClipMaker.generate whenever the caller passes no negative prompt
DEFAULT_NEGATIVE = "blurry, low quality, distorted, deformed, ugly, bad anatomy, text, watermark, signature, out of frame"
# prompt enhancement tips
# printed by the --tips flag, the /tips command and at the start of interactive mode
PROMPT_TIPS = """
prompt tips (from ltx-2 guide):
- write as flowing paragraph, 4-8 sentences
- include: shot type, lighting, action, camera movement, audio
- use cinematography terms: dolly, pan, track, handheld, close-up
- describe sounds and dialogue in "quotes"
- use present tense for actions
example:
"A cinematic medium shot of a coffee cup on a wooden table, steam rising
gently in soft morning light. The camera slowly pushes in as a hand
reaches into frame to lift the cup. Warm ambient cafe sounds and soft
jazz play in the background. Shallow depth of field, golden hour lighting."
"""
class ClipMaker:
    """Generate ltx-2 video clips and write each one with a JSON metadata file.

    The pipeline is loaded lazily (on the first generate() call), so creating
    a ClipMaker is cheap and only touches the output directory.
    """

    def __init__(self, output_dir="~/Desktop/clips"):
        """Remember the output directory (``~`` expanded) and create it if missing."""
        self.output_dir = Path(output_dir).expanduser()
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.pipe = None
        self.device = None

    def load_model(self):
        """load the ltx-2 pipeline (does nothing when it is already loaded)"""
        if self.pipe is not None:
            return
        from diffusers import LTX2Pipeline
        self.device = "mps" if torch.backends.mps.is_available() else "cpu"
        print(f"device: {self.device}")
        print("loading ltx-2 model...")
        self.pipe = LTX2Pipeline.from_pretrained(
            "Lightricks/LTX-2",
            torch_dtype=torch.bfloat16
        )
        self.pipe.to(self.device)
        print("model ready\n")

    def generate(self, prompt, preset="standard", negative_prompt=None,
                 seed=None, output_path=None, no_audio=False):
        """generate a video clip

        Args:
            prompt: text prompt passed to the pipeline.
            preset: key into PRESETS selecting size, frame count, steps and guidance.
            negative_prompt: negative prompt; DEFAULT_NEGATIVE is used when None.
            seed: integer seed; a random one is drawn (and printed) when None.
            output_path: where to write the video; defaults to a timestamped
                ``clip_<YYYYmmdd_HHMMSS>.mp4`` inside the output directory.
            no_audio: when true, the clip is written without an audio track.

        Returns:
            The Path of the written video, or None when the preset is unknown.
        """
        # validate before any heavy work: loading the model is slow, so a
        # mistyped preset should be rejected without paying for it
        if preset not in PRESETS:
            print(f"unknown preset: {preset}")
            print(f"available: {', '.join(PRESETS.keys())}")
            return None
        settings = PRESETS[preset]

        from diffusers.pipelines.ltx2.export_utils import encode_video
        self.load_model()

        # generate output path if not provided
        if output_path is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_path = self.output_dir / f"clip_{timestamp}.mp4"
        else:
            output_path = Path(output_path).expanduser()
            # a caller-supplied path may point into a directory that does not
            # exist yet; create it now rather than failing after generation
            output_path.parent.mkdir(parents=True, exist_ok=True)

        # set seed
        if seed is None:
            seed = torch.randint(0, 2**31, (1,)).item()
        generator = torch.Generator(device="cpu")
        generator.manual_seed(seed)

        # use default negative if not provided
        if negative_prompt is None:
            negative_prompt = DEFAULT_NEGATIVE

        print(f"{'='*60}")
        print("generating clip")
        print(f"{'='*60}")
        print(f"preset: {preset} ({settings['description']})")
        print(f"size: {settings['width']}x{settings['height']}")
        print(f"frames: {settings['frames']} (~{settings['frames']/24:.1f}s)")
        print(f"steps: {settings['steps']}")
        print(f"seed: {seed}")
        print(f"audio: {'no' if no_audio else 'yes'}")
        print(f"output: {output_path}")
        print(f"\nprompt: {prompt[:100]}{'...' if len(prompt) > 100 else ''}")
        print(f"{'='*60}\n")

        # generate
        result = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=settings["width"],
            height=settings["height"],
            num_frames=settings["frames"],
            num_inference_steps=settings["steps"],
            guidance_scale=settings["guidance"],
            generator=generator,
        )

        # get video frames as tensor
        video_frames = result.frames[0]
        video_tensor = torch.stack([torch.from_numpy(np.array(f)) for f in video_frames])

        # get audio (left as None when disabled or when the pipeline returned none)
        audio = None
        audio_sr = None
        if not no_audio and result.audio is not None:
            audio = result.audio[0].float().cpu()
            audio_sr = self.pipe.vocoder.config.output_sampling_rate
            print(f"audio: {audio_sr}Hz")

        # export
        encode_video(
            video=video_tensor,
            fps=24,
            audio=audio,
            audio_sample_rate=audio_sr,
            output_path=str(output_path)
        )

        # save metadata next to the video so the clip can be reproduced
        meta_path = output_path.with_suffix(".json")
        metadata = {
            "prompt": prompt,
            "negative_prompt": negative_prompt,
            "preset": preset,
            "settings": settings,
            "seed": seed,
            "timestamp": datetime.now().isoformat(),
            "output": str(output_path),
        }
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

        print(f"\n{'='*60}")
        print("done!")
        print(f"video: {output_path}")
        print(f"metadata: {meta_path}")
        print(f"seed: {seed} (use --seed {seed} to reproduce)")
        print(f"{'='*60}\n")
        return output_path

    def batch_generate(self, prompts_file, preset="standard"):
        """generate multiple clips from a file

        File format: prompts are separated by a line containing only ``---``;
        the lines of one prompt are joined with single spaces; blank lines and
        lines starting with ``#`` are ignored.
        """
        prompts_path = Path(prompts_file)
        if not prompts_path.exists():
            print(f"file not found: {prompts_file}")
            return

        prompts = []
        current_prompt = []
        # read as utf-8 explicitly so prompts with non-ascii characters do not
        # depend on the platform's default encoding
        with open(prompts_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line == "---":  # separator between prompts
                    if current_prompt:
                        prompts.append(" ".join(current_prompt))
                        current_prompt = []
                elif line and not line.startswith("#"):  # skip blanks and comments
                    current_prompt.append(line)
        # the last prompt does not need a trailing separator
        if current_prompt:
            prompts.append(" ".join(current_prompt))

        print(f"found {len(prompts)} prompts in {prompts_file}")
        print(f"preset: {preset}")
        print()
        for i, prompt in enumerate(prompts):
            print(f"\n[{i+1}/{len(prompts)}]")
            self.generate(prompt, preset=preset)

    def interactive(self):
        """interactive prompt mode

        Every input line is either a command (``/preset <name>``, ``/tips``,
        ``/quit``) or a prompt that is generated with the current preset.
        """
        print("\n" + "="*60)
        print("clipmaker interactive mode")
        print("="*60)
        print(PROMPT_TIPS)
        print("\npresets:", ", ".join(PRESETS.keys()))
        print("commands: /preset <name>, /tips, /quit\n")
        current_preset = "standard"
        while True:
            try:
                prompt = input(f"[{current_preset}] > ").strip()
            except (EOFError, KeyboardInterrupt):
                print("\nbye!")
                break
            if not prompt:
                continue
            if prompt == "/quit":
                print("bye!")
                break
            if prompt == "/tips":
                print(PROMPT_TIPS)
                continue
            if prompt.startswith("/"):
                parts = prompt.split()
                if parts[0] == "/preset":
                    if len(parts) > 1 and parts[1] in PRESETS:
                        current_preset = parts[1]
                        print(f"preset: {current_preset} - {PRESETS[current_preset]['description']}")
                    else:
                        print(f"presets: {', '.join(PRESETS.keys())}")
                else:
                    # a mistyped command must not be sent to the model as a
                    # prompt: that would start a multi-minute generation
                    print(f"unknown command: {parts[0]}")
                    print("commands: /preset <name>, /tips, /quit")
                continue
            self.generate(prompt, preset=current_preset)
def main():
    """parse the command line and run tips, interactive, batch or single-clip mode"""
    epilog_text = f"""
presets:
preview - {PRESETS['preview']['description']}
standard - {PRESETS['standard']['description']}
hq - {PRESETS['hq']['description']}
max - {PRESETS['max']['description']}
cinematic - {PRESETS['cinematic']['description']}
examples:
clipmaker "a cat sleeping on a couch"
clipmaker "epic sunset timelapse" --preset hq
clipmaker --batch prompts.txt --preset standard
clipmaker --interactive
"""
    cli = argparse.ArgumentParser(
        description="clipmaker - hq video clip generator",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog_text,
    )
    cli.add_argument("prompt", nargs="?", help="video prompt")
    cli.add_argument("--preset", "-p", default="standard",
                     choices=PRESETS.keys(), help="quality preset")
    cli.add_argument("--output", "-o", help="output path")
    cli.add_argument("--seed", "-s", type=int, help="random seed")
    cli.add_argument("--negative", "-n", help="negative prompt")
    cli.add_argument("--no-audio", action="store_true", help="disable audio")
    cli.add_argument("--batch", "-b", help="batch generate from file")
    cli.add_argument("--interactive", "-i", action="store_true",
                     help="interactive mode")
    cli.add_argument("--output-dir", default="~/Desktop/clips",
                     help="output directory")
    cli.add_argument("--tips", action="store_true", help="show prompt tips")
    opts = cli.parse_args()

    # --tips only prints text and needs no ClipMaker (and so no output directory)
    if opts.tips:
        print(PROMPT_TIPS)
        return

    clip_maker = ClipMaker(output_dir=opts.output_dir)

    # mode precedence: interactive, then batch, then a single prompt
    if opts.interactive:
        clip_maker.interactive()
        return
    if opts.batch:
        clip_maker.batch_generate(opts.batch, preset=opts.preset)
        return
    if not opts.prompt:
        # nothing to do: show usage
        cli.print_help()
        return
    clip_maker.generate(
        prompt=opts.prompt,
        preset=opts.preset,
        negative_prompt=opts.negative,
        seed=opts.seed,
        output_path=opts.output,
        no_audio=opts.no_audio,
    )


if __name__ == "__main__":
    main()

43
sample_prompts.txt Normal file
View File

@@ -0,0 +1,43 @@
# sample prompts for clipmaker batch mode
# separate prompts with ---
# lines starting with # are comments
# cinematic nature
A breathtaking cinematic wide shot of a lone wolf walking through a snowy forest at dusk. The camera slowly tracks alongside the wolf as it moves gracefully between snow-covered pine trees. Soft blue twilight illuminates the scene, with the wolf's breath visible in the cold air. Ambient sounds of gentle wind and distant owls. Shallow depth of field, film grain, atmospheric and moody.
---
# urban timelapse style
A hyperlapse shot moving through neon-lit streets of Tokyo at night. The camera glides forward at street level as crowds of people blur past on either side. Bright signs in Japanese reflect off wet pavement after rain. Electronic ambient sounds and city noise create an immersive soundscape. Cyberpunk aesthetic, vibrant colors, motion blur on pedestrians.
---
# dramatic portrait
A cinematic close-up of an elderly fisherman's weathered face, deep wrinkles telling stories of decades at sea. He looks out at the ocean with knowing eyes, salt-and-pepper beard moving slightly in the wind. Golden hour sunlight creates a warm rim light around his profile. The sound of waves and seagulls in the distance. Shallow depth of field, intimate and contemplative.
---
# fantasy scene
A mystical wide shot of an ancient stone temple overgrown with glowing bioluminescent vines, deep in an enchanted forest. Magical particles float through the air as moonlight streams through gaps in the canopy above. The camera slowly pushes in toward the temple entrance. Ethereal ambient music and forest sounds create a sense of wonder. Fantasy aesthetic, volumetric lighting, dreamlike atmosphere.
---
# action sequence
An intense tracking shot following a motorcycle speeding through desert canyons at sunset. The camera moves alongside the rider as dust kicks up behind the wheels. Orange and red rock formations blur past. Engine roar and wind create an exhilarating soundscape. The rider leans into a sharp curve, sunlight flaring across the lens. Cinematic, dynamic, adrenaline-pumping.
---
# cozy interior
A warm, intimate shot inside a rustic cabin during a rainstorm. The camera slowly pans across a crackling fireplace, past stacked books and a steaming mug of tea on a wooden table. Rain streaks down the window, lightning briefly illuminates the room. Sounds of rain, thunder, and fire crackling. Shallow depth of field, amber lighting, hygge aesthetic.
---
# underwater world
A mesmerizing underwater shot following a sea turtle gliding through crystal clear tropical waters. Sunbeams pierce the surface above, creating dancing light patterns on the sandy ocean floor. Colorful fish swim past as the turtle moves gracefully through a coral reef. Ambient underwater sounds and gentle current noise. Nature documentary style, serene and meditative.
---
# vintage nostalgia
A dreamy medium shot of a woman in a 1960s summer dress riding a vintage bicycle down a tree-lined country road. The camera tracks alongside her as dappled sunlight filters through the leaves above. Her hair and dress flutter in the breeze. Nostalgic soundtrack with birds singing. Soft film grain, warm vintage color grading, golden hour lighting.
---
# sci-fi atmosphere
A sweeping establishing shot of a massive space station orbiting Earth, the blue planet glowing in the background. The camera slowly arcs around the station as small spacecraft dock and depart. Solar panels catch the sunlight. Ambient electronic hum and distant radio chatter. Hard science fiction aesthetic, realistic lighting, epic scale.
---
# food cinematography
An elegant overhead shot slowly descending toward a beautifully plated gourmet dish on a dark marble surface. Steam rises gently from the food as the camera pushes in. A chef's hand enters frame to add a final garnish. Soft ambient restaurant sounds and gentle plating noises. Shallow depth of field, dramatic side lighting, food photography style.
---

164
story_cinematic.py Normal file
View File

@@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""
the mountain guardian - a cinematic short film
generated with ltx-2 using proper prompting techniques
"""
import os
import numpy as np
import torch
from PIL import Image
from diffusers import LTX2Pipeline, LTX2ImageToVideoPipeline
from diffusers.pipelines.ltx2.export_utils import encode_video
# cinematic story prompts following ltx-2 prompting guide
# One prompt string per scene, in playback order. main() renders the first
# entry with the text-to-video pipeline and every later entry with the
# image-to-video pipeline, seeded with the last frame of the previous scene.
SCENES = [
    # scene 1: opening - sunrise
    """EXT. SWISS ALPS - DAWN. A cinematic wide establishing shot of snow-covered mountain peaks as golden sunrise light spills over the ridgeline. The camera slowly pans right, revealing a vast alpine valley blanketed in fresh powder snow. Wisps of morning mist drift between the pine trees below. The warm orange glow gradually illuminates the pristine white landscape. Ambient sounds of gentle wind and distant bird calls fill the air. The shot lingers on the majestic scenery, peaceful and untouched.""",
    # scene 2: hero introduction
    """EXT. MOUNTAIN RIDGE - DAWN. The camera pushes in slowly on a magnificent white Swiss shepherd dog standing proudly on a snowy ridge, silhouetted against the golden sunrise. The dog's thick fur ruffles gently in the cold mountain breeze, breath visible in the frigid air. Medium shot, shallow depth of field with the valley soft in the background. The shepherd surveys the landscape below with alert, intelligent eyes, ears perked forward. Soft ambient wind and the dog's quiet breathing create an intimate atmosphere. Cinematic warm backlighting creates a heroic golden rim around the dog's form.""",
    # scene 3: the patrol begins
    """EXT. SNOWY FOREST - MORNING. Tracking shot following the white Swiss shepherd as it walks purposefully through deep powder snow, each step sending up small puffs of white. Pine trees tower on either side, their branches heavy with snow. The camera moves alongside the dog at eye level, handheld style with subtle movement. Morning light filters through the forest canopy in soft rays. The sound of snow crunching under paws and the dog's steady breathing. The shepherd moves with determination, nose low, following an invisible trail through the wilderness.""",
    # scene 4: something's wrong
    """EXT. FOREST CLEARING - MORNING. The Swiss shepherd stops abruptly mid-stride, head snapping to the right, ears rotating forward. Close-up on the dog's face showing intense focus, nostrils flaring as it catches a scent. The camera slowly pushes in on the shepherd's alert expression. A beat of tense silence, then a faint, distant bleating sound echoes through the trees. The dog's eyes widen slightly with recognition. The ambient forest sounds fade as the shepherd locks onto the direction of the cry. Shallow depth of field isolates the dog's concentrated expression.""",
    # scene 5: discovery
    """EXT. SNOWY HOLLOW - MORNING. Wide shot revealing a small white lamb huddled alone in a depression in the snow, shivering visibly, its wool matted and wet. The lamb lets out weak, frightened bleats, breath coming in short visible puffs. The camera slowly dollies forward, keeping the vulnerable creature centered. Snow continues to fall gently around it. In the background, barely visible through the snow, the Swiss shepherd appears at the edge of the clearing. Soft, melancholic ambient tones underscore the lamb's distress. The scene conveys isolation and vulnerability.""",
    # scene 6: gentle approach
    """EXT. SNOWY HOLLOW - MORNING. Medium shot as the Swiss shepherd approaches the frightened lamb with slow, deliberate steps, body low and non-threatening. The lamb looks up with wide, fearful eyes but doesn't flee. The dog pauses, then takes another careful step forward. The camera tracks alongside at ground level. Soft snow crunches beneath careful paws. The shepherd's expression is gentle, reassuring. Warm morning light breaks through the clouds above. The tension slowly dissolves as the lamb recognizes help has arrived. Ambient sounds of gentle wind and soft animal breathing.""",
    # scene 7: comfort
    """EXT. SNOWY HOLLOW - MORNING. Close-up intimate shot as the Swiss shepherd gently nuzzles the shivering lamb, warm breath creating a soft cloud between them. The lamb presses against the dog's thick fur, seeking warmth. The camera holds on this tender moment, shallow depth of field blurring the snowy background. The shepherd's eyes close briefly in a gesture of comfort. Soft, warm lighting wraps around both animals. The lamb's frightened bleating quiets to soft sounds of relief. A moment of connection between two creatures in the vast wilderness. Heartwarming and genuine.""",
    # scene 8: the journey begins
    """EXT. ALPINE MEADOW - MIDDAY. Wide cinematic shot as the Swiss shepherd leads the small lamb across a vast snowy meadow, the dog walking protectively alongside its small companion. Mountain peaks rise majestically in the background under a pale blue sky. The camera slowly cranes up to reveal the epic scale of their journey ahead. Both animals leave a trail of footprints in the pristine snow. Soft orchestral tones suggest hope and determination. The shepherd occasionally glances back to check on the lamb, who follows trustingly. Golden sunlight illuminates the pair as they traverse the white expanse.""",
    # scene 9: village in sight
    """EXT. MOUNTAIN OVERLOOK - LATE AFTERNOON. The camera pushes forward as the Swiss shepherd and lamb crest a snowy hill, revealing a picturesque Swiss village nestled in the valley below. Warm lights glow from windows of wooden chalets, smoke rising from chimneys into the golden hour sky. The shepherd pauses, tail wagging slightly at the sight. The lamb stands close beside, tired but hopeful. A sense of relief and accomplishment fills the frame. Church bells chime faintly in the distance. The camera slowly zooms toward the welcoming village as the sun sets behind the mountains.""",
    # scene 10: finale
    """EXT. VILLAGE BARN - DUSK. Medium shot as a farmer in traditional Swiss clothing opens a wooden barn door, warm golden light spilling out into the blue twilight. His weathered face shows surprise, then breaks into a warm smile as he sees the shepherd with the lost lamb. He kneels down, arms open. The lamb bounds forward into the warm barn interior where other sheep can be seen. The farmer reaches out to pat the shepherd's head gratefully, saying softly "Good dog... good dog." The shepherd sits proudly, breath visible in the cold air, mission complete. Warm interior light contrasts with the cold blue dusk outside.""",
]
def _frames_to_tensor(frames):
    """stack the frames (each converted with np.array) into one tensor for encode_video"""
    return torch.stack([torch.from_numpy(np.array(frame)) for frame in frames])


def _skip_leading_audio(audio, sample_rate, fps, frames_skipped=1):
    """drop the samples that span the first `frames_skipped` video frames

    the last dimension of `audio` is treated as the sample axis, the same axis
    the scenes are concatenated along when the full film is assembled.
    """
    return audio[..., round(sample_rate * frames_skipped / fps):]


def main():
    """render every scene in SCENES, save each as its own clip and as one film

    scene 1 is generated from text alone; each following scene is generated
    with the image-to-video pipeline from the last frame of the previous
    scene so the shots connect visually. output goes to
    ~/Desktop/mountain_guardian.
    """
    output_dir = os.path.expanduser("~/Desktop/mountain_guardian")
    os.makedirs(output_dir, exist_ok=True)
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"using {device}")
    print(f"\n{'='*60}")
    print("THE MOUNTAIN GUARDIAN")
    print("a cinematic short film")
    print(f"{'='*60}\n")

    # load pipelines
    print("loading text-to-video pipeline...")
    t2v_pipe = LTX2Pipeline.from_pretrained(
        "Lightricks/LTX-2",
        torch_dtype=torch.bfloat16
    )
    t2v_pipe.to(device)
    print("loading image-to-video pipeline...")
    # build the image-to-video pipeline from the components that are already
    # loaded instead of loading a second full copy of the same checkpoint.
    # torch_dtype is passed explicitly so the shared modules are not re-cast.
    i2v_pipe = LTX2ImageToVideoPipeline.from_pipe(
        t2v_pipe,
        torch_dtype=torch.bfloat16
    )
    i2v_pipe.to(device)

    # settings - 768x448, 97 frames (~4 sec per scene)
    width, height = 768, 448
    frames = 97
    steps = 25  # higher for better quality
    fps = 24
    neg_prompt = "blurry, low quality, distorted, deformed, ugly, bad anatomy, text, watermark, signature"

    all_frames = []
    all_audio = []
    last_frame = None
    for i, prompt in enumerate(SCENES):
        print(f"\n{'='*60}")
        print(f"SCENE {i+1}/{len(SCENES)}")
        print(f"{'='*60}")
        # show first 100 chars of prompt
        print(f"{prompt[:100]}...")
        print()

        if last_frame is None:
            # first scene: text-to-video
            result = t2v_pipe(
                prompt=prompt,
                negative_prompt=neg_prompt,
                width=width,
                height=height,
                num_frames=frames,
                num_inference_steps=steps,
                guidance_scale=4.0,
            )
        else:
            # subsequent scenes: image-to-video for continuity
            result = i2v_pipe(
                image=last_frame,
                prompt=prompt,
                negative_prompt=neg_prompt,
                width=width,
                height=height,
                num_frames=frames,
                num_inference_steps=steps,
                guidance_scale=4.0,
            )

        video_frames = result.frames[0]
        last_frame = video_frames[-1]
        scene_audio = result.audio[0] if result.audio is not None else None
        audio_sr = t2v_pipe.vocoder.config.output_sampling_rate if scene_audio is not None else None

        # collect frames (skip first for scenes 2+ to avoid duplicate)
        if i == 0:
            all_frames.extend(video_frames)
        else:
            all_frames.extend(video_frames[1:])

        # collect audio; for scenes 2+ also drop the audio belonging to the
        # skipped first frame, otherwise the film's audio runs one frame longer
        # than its video per scene and drifts out of sync.
        # assumes each scene's audio covers the same time span as its frames
        # - TODO confirm against the pipeline output
        if scene_audio is not None:
            if i == 0:
                all_audio.append(scene_audio)
            else:
                all_audio.append(_skip_leading_audio(scene_audio, audio_sr, fps))

        # save individual scene (untrimmed: the clip keeps all of its frames)
        scene_path = os.path.join(output_dir, f"scene_{i+1:02d}.mp4")
        audio = scene_audio.float().cpu() if scene_audio is not None else None
        encode_video(_frames_to_tensor(video_frames), fps=fps, audio=audio,
                     audio_sample_rate=audio_sr, output_path=scene_path)
        print(f"saved: {scene_path}")

    # create full film
    print(f"\n{'='*60}")
    print("ASSEMBLING FINAL FILM...")
    print(f"{'='*60}\n")
    full_path = os.path.join(output_dir, "the_mountain_guardian.mp4")
    if all_audio:
        full_audio = torch.cat(all_audio, dim=-1).float().cpu()
        audio_sr = t2v_pipe.vocoder.config.output_sampling_rate
    else:
        full_audio = None
        audio_sr = None
    encode_video(_frames_to_tensor(all_frames), fps=fps, audio=full_audio,
                 audio_sample_rate=audio_sr, output_path=full_path)

    duration = len(all_frames) / fps
    print(f"\n{'='*60}")
    print("PRODUCTION COMPLETE")
    print(f"{'='*60}")
    print(f"total frames: {len(all_frames)}")
    print(f"duration: {duration:.1f} seconds ({duration/60:.1f} minutes)")
    print(f"output: {full_path}")
    print(f"{'='*60}\n")


if __name__ == "__main__":
    main()

134
story_continuous.py Normal file
View File

@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""
generate a continuous story video with ltx-2
uses image-to-video to maintain visual continuity between scenes
"""
import os
import subprocess
import numpy as np
import torch
from PIL import Image
from diffusers import LTX2Pipeline, LTX2ImageToVideoPipeline
from diffusers.pipelines.ltx2.export_utils import encode_video
# story scenes
# One prompt per scene, in playback order. main() renders the first entry with
# the text-to-video pipeline and each later entry with the image-to-video
# pipeline, starting from the last frame of the previous scene.
SCENES = [
    "Sunrise over the Swiss Alps, snow-covered peaks glowing orange and pink, a white Swiss shepherd dog standing on a ridge, peaceful morning, cinematic wide shot",
    "The Swiss shepherd walking through deep powder snow, determined stride, pine trees, snow particles in air, morning light, tracking shot following the dog",
    "The shepherd stops suddenly, ears perked, alert pose, listening intently, snowy forest, something caught its attention",
    "A small white lamb alone in the snow, shivering, lost and scared, the shepherd approaches gently in the background",
    "The Swiss shepherd nuzzling the scared lamb, comforting gesture, warm breath visible, tender moment, shallow depth of field",
    "The shepherd leading the lamb through snowy alpine meadow, protective stance, walking together, mountains in background, golden hour",
    "Wide shot of dog and lamb crossing a snowy hill, vast white landscape, beautiful alpine scenery, afternoon light",
    "A cozy Swiss mountain village appearing in the distance, warm lights glowing, smoke from chimneys, dusk, hopeful atmosphere",
    "The shepherd and lamb arriving at a wooden barn, warm light spilling out, welcoming atmosphere, journey's end",
    "Night sky over the Alps with stars, the shepherd dog silhouette on a ridge, majestic ending, peaceful, cinematic finale",
]
def _frames_to_tensor(frames):
    """stack the frames (each converted with np.array) into one tensor for encode_video"""
    return torch.stack([torch.from_numpy(np.array(frame)) for frame in frames])


def _skip_leading_audio(audio, sample_rate, fps, frames_skipped=1):
    """drop the samples that span the first `frames_skipped` video frames

    the last dimension of `audio` is treated as the sample axis, the same axis
    the scenes are concatenated along when the full video is assembled.
    """
    return audio[..., round(sample_rate * frames_skipped / fps):]


def main():
    """render every scene in SCENES as one continuous story

    the first scene is generated from text alone; each following scene is
    generated with the image-to-video pipeline from the last frame of the
    previous scene. every scene is saved on its own and all scenes are joined
    into one video in ~/Desktop/mountain_guardian.
    """
    output_dir = os.path.expanduser("~/Desktop/mountain_guardian")
    os.makedirs(output_dir, exist_ok=True)
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"using {device}")

    # load both pipelines
    print("loading text-to-video pipeline...")
    t2v_pipe = LTX2Pipeline.from_pretrained(
        "Lightricks/LTX-2",
        torch_dtype=torch.bfloat16
    )
    t2v_pipe.to(device)
    print("loading image-to-video pipeline...")
    # build the image-to-video pipeline from the components that are already
    # loaded instead of loading a second full copy of the same checkpoint.
    # torch_dtype is passed explicitly so the shared modules are not re-cast.
    i2v_pipe = LTX2ImageToVideoPipeline.from_pipe(
        t2v_pipe,
        torch_dtype=torch.bfloat16
    )
    i2v_pipe.to(device)

    width, height = 768, 448
    frames = 97  # ~4 seconds per scene
    steps = 20
    fps = 24
    negative = "blurry, low quality, distorted, deformed"

    all_video_frames = []
    all_audio = []
    last_frame = None
    for i, prompt in enumerate(SCENES):
        print(f"\n{'='*60}")
        print(f"scene {i+1}/{len(SCENES)}")
        print(f"prompt: {prompt[:60]}...")
        print(f"{'='*60}\n")

        if i == 0:
            # first scene: text-to-video
            result = t2v_pipe(
                prompt=prompt,
                negative_prompt=negative,
                width=width,
                height=height,
                num_frames=frames,
                num_inference_steps=steps,
            )
        else:
            # subsequent scenes: image-to-video from last frame
            result = i2v_pipe(
                image=last_frame,
                prompt=prompt,
                negative_prompt=negative,
                width=width,
                height=height,
                num_frames=frames,
                num_inference_steps=steps,
            )

        # get frames
        video_frames = result.frames[0]
        # save last frame for next scene
        last_frame = video_frames[-1]
        scene_audio = result.audio[0] if result.audio is not None else None
        audio_sr = t2v_pipe.vocoder.config.output_sampling_rate if scene_audio is not None else None

        # collect frames (skip first frame for scenes 2+ to avoid duplicate)
        if i == 0:
            all_video_frames.extend(video_frames)
        else:
            all_video_frames.extend(video_frames[1:])  # skip first frame (duplicate of last)

        # collect audio; for scenes 2+ also drop the audio belonging to the
        # skipped first frame, otherwise the full video's audio runs one frame
        # longer than its picture per scene and drifts out of sync.
        # assumes each scene's audio covers the same time span as its frames
        # - TODO confirm against the pipeline output
        if scene_audio is not None:
            if i == 0:
                all_audio.append(scene_audio)
            else:
                all_audio.append(_skip_leading_audio(scene_audio, audio_sr, fps))

        # save individual scene (untrimmed: the clip keeps all of its frames)
        scene_path = os.path.join(output_dir, f"scene_{i+1:02d}.mp4")
        audio = scene_audio.float().cpu() if scene_audio is not None else None
        encode_video(_frames_to_tensor(video_frames), fps=fps, audio=audio,
                     audio_sample_rate=audio_sr, output_path=scene_path)
        print(f"saved: {scene_path}")

    # save full video
    print("\ncreating full video...")
    full_path = os.path.join(output_dir, "mountain_guardian_full.mp4")
    # concatenate audio
    if all_audio:
        full_audio = torch.cat(all_audio, dim=-1).float().cpu()
        audio_sr = t2v_pipe.vocoder.config.output_sampling_rate
    else:
        full_audio = None
        audio_sr = None
    encode_video(_frames_to_tensor(all_video_frames), fps=fps, audio=full_audio,
                 audio_sample_rate=audio_sr, output_path=full_path)

    print(f"\n{'='*60}")
    print("done!")
    print(f"total frames: {len(all_video_frames)}")
    print(f"duration: ~{len(all_video_frames)/fps:.1f} seconds")
    print(f"saved to: {full_path}")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()

161
story_generator.py Executable file
View File

@@ -0,0 +1,161 @@
#!/usr/bin/env python3
"""
generate a multi-scene story video with ltx-2
"""
import os
import subprocess
import sys
# story scenes - each generates ~4 seconds
# Each entry has:
#   "prompt" - the text handed to generate.py as its positional argument
#   "name"   - the clip's file name without extension; the numeric prefix
#              keeps the list in story order
SCENES = [
    # Act 1: Introduction
    {
        "prompt": "Sunrise over the Swiss Alps, snow-covered peaks glowing orange and pink, peaceful morning atmosphere, cinematic wide shot, gentle wind blowing snow, 4K quality",
        "name": "01_sunrise"
    },
    {
        "prompt": "A majestic white Swiss shepherd dog standing on a snowy ridge overlooking a mountain valley, morning light, heroic pose, wind ruffling fur, cinematic portrait shot",
        "name": "02_hero_intro"
    },
    {
        "prompt": "Close-up of the Swiss shepherd's face, alert eyes scanning the horizon, breath visible in cold air, morning sunlight on fur, shallow depth of field",
        "name": "03_closeup"
    },
    {
        "prompt": "The Swiss shepherd walking through deep powder snow in the Alps, determined stride, pine trees in background, snow particles in air, tracking shot",
        "name": "04_walking"
    },
    # Act 2: Discovery
    {
        "prompt": "The Swiss shepherd stops suddenly, ears perked up, alert pose, listening intently, snowy forest background, dramatic lighting, tension building",
        "name": "05_alert"
    },
    {
        "prompt": "A small white lamb alone in the snow, shivering, lost and scared, soft snowfall, vulnerable, wide snowy landscape, emotional scene",
        "name": "06_lost_lamb"
    },
    {
        "prompt": "The Swiss shepherd approaching the lamb gently, careful steps through snow, compassionate body language, soft winter light, heartwarming moment",
        "name": "07_approach"
    },
    {
        "prompt": "Close-up of the shepherd dog nuzzling the scared lamb, comforting gesture, warm breath visible, tender moment, shallow depth of field, emotional",
        "name": "08_comfort"
    },
    # Act 3: Journey home
    {
        "prompt": "The Swiss shepherd leading the small lamb through snowy alpine meadow, protective stance, walking together, mountains in background, golden hour light",
        "name": "09_leading"
    },
    {
        "prompt": "Wide shot of dog and lamb crossing a snowy hill together, tiny figures in vast white landscape, beautiful alpine scenery, cinematic composition",
        "name": "10_journey"
    },
    {
        "prompt": "The shepherd and lamb walking past snow-covered pine trees, gentle snowfall, peaceful atmosphere, soft afternoon light filtering through branches",
        "name": "11_forest_path"
    },
    {
        "prompt": "A cozy Swiss mountain village appearing in the distance, warm lights glowing from windows, smoke from chimneys, dusk setting in, hopeful atmosphere",
        "name": "12_village_sight"
    },
    # Act 4: Reunion
    {
        "prompt": "The Swiss shepherd and lamb arriving at a wooden barn door, warm light spilling out, welcoming atmosphere, end of journey, relief",
        "name": "13_arrival"
    },
    {
        "prompt": "A farmer in traditional Swiss clothing opening the barn door, surprised and grateful expression, warm interior light, emotional reunion moment",
        "name": "14_farmer"
    },
    {
        "prompt": "The lamb running to join other sheep in a warm barn, happy reunion, straw on floor, cozy interior, heartwarming resolution",
        "name": "15_reunion"
    },
    {
        "prompt": "The Swiss shepherd sitting proudly outside the barn, farmer patting its head gratefully, twilight sky, village lights twinkling, satisfied hero",
        "name": "16_reward"
    },
    # Finale
    {
        "prompt": "Night sky over the Swiss Alps with stars and northern lights, the shepherd dog silhouette on a ridge, majestic ending, peaceful, cinematic wide shot",
        "name": "17_finale"
    },
]
def generate_scene(scene, output_dir, width=768, height=448, frames=97, steps=20):
    """generate a single scene

    runs generate.py (looked up in this script's directory) in a subprocess
    with the scene's prompt and the given size, frame count and step count.
    a scene whose clip already exists in `output_dir` is skipped, so an
    interrupted run can be resumed.

    Args:
        scene: dict with "prompt" (text prompt) and "name" (clip file stem).
        output_dir: directory the clip ``<name>.mp4`` is written to.
        width, height, frames, steps: forwarded to generate.py.

    Returns:
        the clip's path. it is returned even when generation failed; a
        warning is printed to stderr in that case, so callers that need the
        file should check that it exists.
    """
    output_path = os.path.join(output_dir, f"{scene['name']}.mp4")
    if os.path.exists(output_path):
        print(f"skipping {scene['name']} (already exists)")
        return output_path
    cmd = [
        sys.executable, "generate.py",
        scene["prompt"],
        "-o", output_path,
        "--width", str(width),
        "--height", str(height),
        "--frames", str(frames),
        "--steps", str(steps),
        "-n", "blurry, low quality, distorted, deformed, ugly, bad anatomy"
    ]
    print(f"\n{'='*60}")
    print(f"generating: {scene['name']}")
    print(f"prompt: {scene['prompt'][:80]}...")
    print(f"{'='*60}\n")
    completed = subprocess.run(cmd, cwd=os.path.dirname(os.path.abspath(__file__)))
    # make failures visible instead of silently handing back a path to a clip
    # that was never written
    if completed.returncode != 0:
        print(
            f"warning: generating {scene['name']} failed "
            f"(exit code {completed.returncode})",
            file=sys.stderr,
        )
    elif not os.path.exists(output_path):
        print(
            f"warning: generating {scene['name']} produced no file at {output_path}",
            file=sys.stderr,
        )
    return output_path
def concatenate_videos(video_files, output_path):
    """concatenate all videos using ffmpeg

    writes a temporary list file for ffmpeg's concat demuxer, re-encodes the
    listed clips into `output_path` (libx264 video, aac audio) and removes the
    list file again.

    Args:
        video_files: clip paths in playback order.
        output_path: path of the combined video.

    Returns:
        None. success or failure of ffmpeg is reported on stdout / stderr.
    """
    import tempfile  # local import: not part of this file's import block

    if not video_files:
        # with nothing to join, ffmpeg would only fail on an empty list
        print("no clips to concatenate")
        return

    # create file list
    # - absolute paths, because the concat demuxer resolves relative entries
    #   against the list file's directory, not the current directory
    # - inside the single-quoted entry a literal quote is written as '\''
    entries = []
    for video in video_files:
        quoted = os.path.abspath(video).replace("'", "'\\''")
        entries.append(f"file '{quoted}'\n")

    # a unique temporary file avoids clashing with another run that would
    # otherwise share one fixed list path
    with tempfile.NamedTemporaryFile(
        "w", suffix=".txt", delete=False, encoding="utf-8"
    ) as f:
        f.writelines(entries)
        list_path = f.name

    cmd = [
        "ffmpeg", "-y", "-f", "concat", "-safe", "0",
        "-i", list_path,
        "-c:v", "libx264", "-crf", "18",
        "-c:a", "aac", "-b:a", "192k",
        output_path
    ]
    print(f"\nconcatenating {len(video_files)} clips...")
    try:
        completed = subprocess.run(cmd)
    finally:
        os.remove(list_path)

    # only claim success when ffmpeg actually succeeded
    if completed.returncode == 0:
        print(f"saved to: {output_path}")
    else:
        print(
            f"ffmpeg failed (exit code {completed.returncode}); "
            f"{output_path} was not created correctly",
            file=sys.stderr,
        )
def main():
    """generate every scene in SCENES and join the finished clips into one film

    clips are written to ~/Desktop/mountain_guardian. scenes whose clip is
    missing after generation are reported and left out of the final film, so
    one failed scene does not make the ffmpeg concat step fail for all of them.
    """
    output_dir = os.path.expanduser("~/Desktop/mountain_guardian")
    os.makedirs(output_dir, exist_ok=True)
    print(f"generating {len(SCENES)} scenes for 'The Mountain Guardian'")
    print(f"output directory: {output_dir}")
    print(f"estimated time: ~{len(SCENES) * 10} minutes\n")

    video_files = []
    missing = []
    for i, scene in enumerate(SCENES):
        print(f"\n[{i+1}/{len(SCENES)}]")
        video_path = generate_scene(scene, output_dir)
        # generate_scene hands back the target path whether or not the clip
        # was written, so check the file itself
        if os.path.exists(video_path):
            video_files.append(video_path)
        else:
            missing.append(scene["name"])

    if missing:
        print(f"\nwarning: {len(missing)} scene(s) missing: {', '.join(missing)}")
    if not video_files:
        print("no clips were generated; nothing to concatenate")
        return

    # concatenate all videos
    final_path = os.path.join(output_dir, "mountain_guardian_full.mp4")
    concatenate_videos(video_files, final_path)
    print(f"\n{'='*60}")
    print(f"done! final video: {final_path}")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()