#!/bin/bash
# clipmaker - quick wrapper
#
# activates the ltx virtualenv and forwards every argument to clipmaker.py.
# the caller's working directory is deliberately left untouched so relative
# paths given on the command line (--batch, --output, --output-dir) resolve
# against the directory the user ran the command from, not the script's.

# stop if the virtualenv cannot be activated instead of silently running
# clipmaker.py with whatever "python" happens to be first on PATH
set -e

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

source ~/ltx-venv/bin/activate

# exec so signals (ctrl-c) and the exit status belong to the python process
exec python "$script_dir/clipmaker.py" "$@"
+ "description": "high quality (~15 min)" + }, + "max": { + "width": 1024, + "height": 576, + "frames": 161, + "steps": 30, + "guidance": 4.0, + "description": "maximum quality (~30 min)" + }, + "cinematic": { + "width": 1280, + "height": 720, + "frames": 97, + "steps": 30, + "guidance": 4.5, + "description": "cinematic 720p (~25 min)" + }, +} + +# default negative prompt based on ltx-2 guide +DEFAULT_NEGATIVE = "blurry, low quality, distorted, deformed, ugly, bad anatomy, text, watermark, signature, out of frame" + +# prompt enhancement tips +PROMPT_TIPS = """ +prompt tips (from ltx-2 guide): + - write as flowing paragraph, 4-8 sentences + - include: shot type, lighting, action, camera movement, audio + - use cinematography terms: dolly, pan, track, handheld, close-up + - describe sounds and dialogue in "quotes" + - use present tense for actions + +example: + "A cinematic medium shot of a coffee cup on a wooden table, steam rising + gently in soft morning light. The camera slowly pushes in as a hand + reaches into frame to lift the cup. Warm ambient cafe sounds and soft + jazz play in the background. Shallow depth of field, golden hour lighting." 
class ClipMaker:
    """generate ltx-2 video clips from text prompts and save them to disk.

    the diffusers pipeline is loaded lazily by load_model(), so constructing
    a ClipMaker only creates the output directory.
    """

    def __init__(self, output_dir="~/Desktop/clips"):
        """create the output directory (``~`` is expanded) if it is missing."""
        self.output_dir = Path(output_dir).expanduser()
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.pipe = None      # set by load_model()
        self.device = None    # "mps" or "cpu", set by load_model()

    def load_model(self):
        """load the ltx-2 pipeline once; later calls return immediately."""
        if self.pipe is not None:
            return

        from diffusers import LTX2Pipeline

        self.device = "mps" if torch.backends.mps.is_available() else "cpu"
        print(f"device: {self.device}")

        print("loading ltx-2 model...")
        self.pipe = LTX2Pipeline.from_pretrained(
            "Lightricks/LTX-2",
            torch_dtype=torch.bfloat16
        )
        self.pipe.to(self.device)
        print("model ready\n")

    def generate(self, prompt, preset="standard", negative_prompt=None,
                 seed=None, output_path=None, no_audio=False):
        """generate one clip and write it plus a json metadata file to disk.

        Args:
            prompt: text description of the clip.
            preset: key into PRESETS (size, frame count, steps, guidance).
            negative_prompt: text to steer away from; DEFAULT_NEGATIVE is
                used when None.
            seed: integer seed; a random one is drawn and printed when None.
            output_path: destination .mp4 (``~`` is expanded); defaults to a
                timestamped file inside ``self.output_dir``.
            no_audio: when True the clip is exported without an audio track.

        Returns:
            the Path of the written video, or None when ``preset`` is unknown.
        """
        # validate first: a typo in the preset must not cost a full model load
        if preset not in PRESETS:
            print(f"unknown preset: {preset}")
            print(f"available: {', '.join(PRESETS.keys())}")
            return None

        from diffusers.pipelines.ltx2.export_utils import encode_video

        self.load_model()
        settings = PRESETS[preset]

        # generate output path if not provided
        if output_path is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_path = self.output_dir / f"clip_{timestamp}.mp4"
        else:
            output_path = Path(output_path).expanduser()
        # create the target directory now; otherwise a missing directory is
        # only noticed by encode_video after the whole generation has run
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # set seed (the generator lives on cpu regardless of the model device)
        if seed is None:
            seed = torch.randint(0, 2**31, (1,)).item()
        generator = torch.Generator(device="cpu")
        generator.manual_seed(seed)

        # use default negative if not provided
        if negative_prompt is None:
            negative_prompt = DEFAULT_NEGATIVE

        print(f"{'='*60}")
        print(f"generating clip")
        print(f"{'='*60}")
        print(f"preset: {preset} ({settings['description']})")
        print(f"size: {settings['width']}x{settings['height']}")
        print(f"frames: {settings['frames']} (~{settings['frames']/24:.1f}s)")
        print(f"steps: {settings['steps']}")
        print(f"seed: {seed}")
        print(f"audio: {'no' if no_audio else 'yes'}")
        print(f"output: {output_path}")
        print(f"\nprompt: {prompt[:100]}{'...' if len(prompt) > 100 else ''}")
        print(f"{'='*60}\n")

        # generate
        result = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=settings["width"],
            height=settings["height"],
            num_frames=settings["frames"],
            num_inference_steps=settings["steps"],
            guidance_scale=settings["guidance"],
            generator=generator,
        )

        # get video frames as one stacked tensor
        video_frames = result.frames[0]
        video_tensor = torch.stack([torch.from_numpy(np.array(f)) for f in video_frames])

        # get audio (skipped entirely when no_audio is set)
        audio = None
        audio_sr = None
        if not no_audio and result.audio is not None:
            audio = result.audio[0].float().cpu()
            audio_sr = self.pipe.vocoder.config.output_sampling_rate
            print(f"audio: {audio_sr}Hz")

        # export
        encode_video(
            video=video_tensor,
            fps=24,
            audio=audio,
            audio_sample_rate=audio_sr,
            output_path=str(output_path)
        )

        # save metadata next to the video so a clip can be reproduced later
        meta_path = output_path.with_suffix(".json")
        metadata = {
            "prompt": prompt,
            "negative_prompt": negative_prompt,
            "preset": preset,
            "settings": settings,
            "seed": seed,
            "timestamp": datetime.now().isoformat(),
            "output": str(output_path),
        }
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

        print(f"\n{'='*60}")
        print(f"done!")
        print(f"video: {output_path}")
        print(f"metadata: {meta_path}")
        print(f"seed: {seed} (use --seed {seed} to reproduce)")
        print(f"{'='*60}\n")

        return output_path

    @staticmethod
    def _parse_prompts(lines):
        """split an iterable of text lines into a list of prompts.

        a line that is exactly ``---`` (after stripping) ends the current
        prompt; blank lines and lines starting with ``#`` are ignored; the
        remaining lines of one prompt are joined with single spaces.
        """
        prompts = []
        current_prompt = []
        for raw_line in lines:
            line = raw_line.strip()
            if line == "---":  # separator between prompts
                if current_prompt:
                    prompts.append(" ".join(current_prompt))
                    current_prompt = []
            elif line and not line.startswith("#"):  # skip blanks and comments
                current_prompt.append(line)
        # the last prompt does not need a trailing separator
        if current_prompt:
            prompts.append(" ".join(current_prompt))
        return prompts

    def batch_generate(self, prompts_file, preset="standard"):
        """generate one clip per prompt found in ``prompts_file``.

        the file format is the one _parse_prompts() understands. prints a
        message and returns when the file does not exist.
        """
        prompts_path = Path(prompts_file)
        if not prompts_path.exists():
            print(f"file not found: {prompts_file}")
            return

        # explicit encoding: prompts are free text and must not depend on the
        # platform's default locale encoding
        with open(prompts_path, encoding="utf-8") as f:
            prompts = self._parse_prompts(f)

        print(f"found {len(prompts)} prompts in {prompts_file}")
        print(f"preset: {preset}")
        print()

        for i, prompt in enumerate(prompts):
            print(f"\n[{i+1}/{len(prompts)}]")
            self.generate(prompt, preset=preset)

    def interactive(self):
        """read prompts from stdin and generate a clip for each one.

        supports the commands ``/preset NAME``, ``/tips`` and ``/quit``;
        ctrl-d / ctrl-c at the input prompt also leave the loop.
        """
        print("\n" + "="*60)
        print("clipmaker interactive mode")
        print("="*60)
        print(PROMPT_TIPS)
        print("\npresets:", ", ".join(PRESETS.keys()))
        print("commands: /preset NAME, /tips, /quit\n")

        current_preset = "standard"

        while True:
            try:
                prompt = input(f"[{current_preset}] > ").strip()
            except (EOFError, KeyboardInterrupt):
                print("\nbye!")
                break

            if not prompt:
                continue
            elif prompt == "/quit":
                print("bye!")
                break
            elif prompt == "/tips":
                print(PROMPT_TIPS)
            elif prompt.startswith("/preset"):
                parts = prompt.split()
                if len(parts) > 1 and parts[1] in PRESETS:
                    current_preset = parts[1]
                    print(f"preset: {current_preset} - {PRESETS[current_preset]['description']}")
                else:
                    print(f"presets: {', '.join(PRESETS.keys())}")
            elif prompt.startswith("/"):
                # a mistyped command must not start a multi-minute generation
                print(f"unknown command: {prompt.split()[0]}")
                print("commands: /preset NAME, /tips, /quit")
            else:
                self.generate(prompt, preset=current_preset)
parser.add_argument("--seed", "-s", type=int, help="random seed") + parser.add_argument("--negative", "-n", help="negative prompt") + parser.add_argument("--no-audio", action="store_true", help="disable audio") + parser.add_argument("--batch", "-b", help="batch generate from file") + parser.add_argument("--interactive", "-i", action="store_true", + help="interactive mode") + parser.add_argument("--output-dir", default="~/Desktop/clips", + help="output directory") + parser.add_argument("--tips", action="store_true", help="show prompt tips") + + args = parser.parse_args() + + if args.tips: + print(PROMPT_TIPS) + return + + maker = ClipMaker(output_dir=args.output_dir) + + if args.interactive: + maker.interactive() + elif args.batch: + maker.batch_generate(args.batch, preset=args.preset) + elif args.prompt: + maker.generate( + prompt=args.prompt, + preset=args.preset, + negative_prompt=args.negative, + seed=args.seed, + output_path=args.output, + no_audio=args.no_audio, + ) + else: + parser.print_help() + + +if __name__ == "__main__": + main() diff --git a/sample_prompts.txt b/sample_prompts.txt new file mode 100644 index 0000000..10aec9a --- /dev/null +++ b/sample_prompts.txt @@ -0,0 +1,43 @@ +# sample prompts for clipmaker batch mode +# separate prompts with --- +# lines starting with # are comments + +# cinematic nature +A breathtaking cinematic wide shot of a lone wolf walking through a snowy forest at dusk. The camera slowly tracks alongside the wolf as it moves gracefully between snow-covered pine trees. Soft blue twilight illuminates the scene, with the wolf's breath visible in the cold air. Ambient sounds of gentle wind and distant owls. Shallow depth of field, film grain, atmospheric and moody. +--- + +# urban timelapse style +A hyperlapse shot moving through neon-lit streets of Tokyo at night. The camera glides forward at street level as crowds of people blur past on either side. Bright signs in Japanese reflect off wet pavement after rain. 
Electronic ambient sounds and city noise create an immersive soundscape. Cyberpunk aesthetic, vibrant colors, motion blur on pedestrians. +--- + +# dramatic portrait +A cinematic close-up of an elderly fisherman's weathered face, deep wrinkles telling stories of decades at sea. He looks out at the ocean with knowing eyes, salt-and-pepper beard moving slightly in the wind. Golden hour sunlight creates a warm rim light around his profile. The sound of waves and seagulls in the distance. Shallow depth of field, intimate and contemplative. +--- + +# fantasy scene +A mystical wide shot of an ancient stone temple overgrown with glowing bioluminescent vines, deep in an enchanted forest. Magical particles float through the air as moonlight streams through gaps in the canopy above. The camera slowly pushes in toward the temple entrance. Ethereal ambient music and forest sounds create a sense of wonder. Fantasy aesthetic, volumetric lighting, dreamlike atmosphere. +--- + +# action sequence +An intense tracking shot following a motorcycle speeding through desert canyons at sunset. The camera moves alongside the rider as dust kicks up behind the wheels. Orange and red rock formations blur past. Engine roar and wind create an exhilarating soundscape. The rider leans into a sharp curve, sunlight flaring across the lens. Cinematic, dynamic, adrenaline-pumping. +--- + +# cozy interior +A warm, intimate shot inside a rustic cabin during a rainstorm. The camera slowly pans across a crackling fireplace, past stacked books and a steaming mug of tea on a wooden table. Rain streaks down the window, lightning briefly illuminates the room. Sounds of rain, thunder, and fire crackling. Shallow depth of field, amber lighting, hygge aesthetic. +--- + +# underwater world +A mesmerizing underwater shot following a sea turtle gliding through crystal clear tropical waters. Sunbeams pierce the surface above, creating dancing light patterns on the sandy ocean floor. 
Colorful fish swim past as the turtle moves gracefully through a coral reef. Ambient underwater sounds and gentle current noise. Nature documentary style, serene and meditative. +--- + +# vintage nostalgia +A dreamy medium shot of a woman in a 1960s summer dress riding a vintage bicycle down a tree-lined country road. The camera tracks alongside her as dappled sunlight filters through the leaves above. Her hair and dress flutter in the breeze. Nostalgic soundtrack with birds singing. Soft film grain, warm vintage color grading, golden hour lighting. +--- + +# sci-fi atmosphere +A sweeping establishing shot of a massive space station orbiting Earth, the blue planet glowing in the background. The camera slowly arcs around the station as small spacecraft dock and depart. Solar panels catch the sunlight. Ambient electronic hum and distant radio chatter. Hard science fiction aesthetic, realistic lighting, epic scale. +--- + +# food cinematography +An elegant overhead shot slowly descending toward a beautifully plated gourmet dish on a dark marble surface. Steam rises gently from the food as the camera pushes in. A chef's hand enters frame to add a final garnish. Soft ambient restaurant sounds and gentle plating noises. Shallow depth of field, dramatic side lighting, food photography style. +--- diff --git a/story_cinematic.py b/story_cinematic.py new file mode 100644 index 0000000..c42fbdf --- /dev/null +++ b/story_cinematic.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +""" +the mountain guardian - a cinematic short film +generated with ltx-2 using proper prompting techniques +""" + +import os +import numpy as np +import torch +from PIL import Image +from diffusers import LTX2Pipeline, LTX2ImageToVideoPipeline +from diffusers.pipelines.ltx2.export_utils import encode_video + +# cinematic story prompts following ltx-2 prompting guide +SCENES = [ + # scene 1: opening - sunrise + """EXT. SWISS ALPS - DAWN. 
A cinematic wide establishing shot of snow-covered mountain peaks as golden sunrise light spills over the ridgeline. The camera slowly pans right, revealing a vast alpine valley blanketed in fresh powder snow. Wisps of morning mist drift between the pine trees below. The warm orange glow gradually illuminates the pristine white landscape. Ambient sounds of gentle wind and distant bird calls fill the air. The shot lingers on the majestic scenery, peaceful and untouched.""", + + # scene 2: hero introduction + """EXT. MOUNTAIN RIDGE - DAWN. The camera pushes in slowly on a magnificent white Swiss shepherd dog standing proudly on a snowy ridge, silhouetted against the golden sunrise. The dog's thick fur ruffles gently in the cold mountain breeze, breath visible in the frigid air. Medium shot, shallow depth of field with the valley soft in the background. The shepherd surveys the landscape below with alert, intelligent eyes, ears perked forward. Soft ambient wind and the dog's quiet breathing create an intimate atmosphere. Cinematic warm backlighting creates a heroic golden rim around the dog's form.""", + + # scene 3: the patrol begins + """EXT. SNOWY FOREST - MORNING. Tracking shot following the white Swiss shepherd as it walks purposefully through deep powder snow, each step sending up small puffs of white. Pine trees tower on either side, their branches heavy with snow. The camera moves alongside the dog at eye level, handheld style with subtle movement. Morning light filters through the forest canopy in soft rays. The sound of snow crunching under paws and the dog's steady breathing. The shepherd moves with determination, nose low, following an invisible trail through the wilderness.""", + + # scene 4: something's wrong + """EXT. FOREST CLEARING - MORNING. The Swiss shepherd stops abruptly mid-stride, head snapping to the right, ears rotating forward. Close-up on the dog's face showing intense focus, nostrils flaring as it catches a scent. 
The camera slowly pushes in on the shepherd's alert expression. A beat of tense silence, then a faint, distant bleating sound echoes through the trees. The dog's eyes widen slightly with recognition. The ambient forest sounds fade as the shepherd locks onto the direction of the cry. Shallow depth of field isolates the dog's concentrated expression.""", + + # scene 5: discovery + """EXT. SNOWY HOLLOW - MORNING. Wide shot revealing a small white lamb huddled alone in a depression in the snow, shivering visibly, its wool matted and wet. The lamb lets out weak, frightened bleats, breath coming in short visible puffs. The camera slowly dollies forward, keeping the vulnerable creature centered. Snow continues to fall gently around it. In the background, barely visible through the snow, the Swiss shepherd appears at the edge of the clearing. Soft, melancholic ambient tones underscore the lamb's distress. The scene conveys isolation and vulnerability.""", + + # scene 6: gentle approach + """EXT. SNOWY HOLLOW - MORNING. Medium shot as the Swiss shepherd approaches the frightened lamb with slow, deliberate steps, body low and non-threatening. The lamb looks up with wide, fearful eyes but doesn't flee. The dog pauses, then takes another careful step forward. The camera tracks alongside at ground level. Soft snow crunches beneath careful paws. The shepherd's expression is gentle, reassuring. Warm morning light breaks through the clouds above. The tension slowly dissolves as the lamb recognizes help has arrived. Ambient sounds of gentle wind and soft animal breathing.""", + + # scene 7: comfort + """EXT. SNOWY HOLLOW - MORNING. Close-up intimate shot as the Swiss shepherd gently nuzzles the shivering lamb, warm breath creating a soft cloud between them. The lamb presses against the dog's thick fur, seeking warmth. The camera holds on this tender moment, shallow depth of field blurring the snowy background. The shepherd's eyes close briefly in a gesture of comfort. 
Soft, warm lighting wraps around both animals. The lamb's frightened bleating quiets to soft sounds of relief. A moment of connection between two creatures in the vast wilderness. Heartwarming and genuine.""", + + # scene 8: the journey begins + """EXT. ALPINE MEADOW - MIDDAY. Wide cinematic shot as the Swiss shepherd leads the small lamb across a vast snowy meadow, the dog walking protectively alongside its small companion. Mountain peaks rise majestically in the background under a pale blue sky. The camera slowly cranes up to reveal the epic scale of their journey ahead. Both animals leave a trail of footprints in the pristine snow. Soft orchestral tones suggest hope and determination. The shepherd occasionally glances back to check on the lamb, who follows trustingly. Golden sunlight illuminates the pair as they traverse the white expanse.""", + + # scene 9: village in sight + """EXT. MOUNTAIN OVERLOOK - LATE AFTERNOON. The camera pushes forward as the Swiss shepherd and lamb crest a snowy hill, revealing a picturesque Swiss village nestled in the valley below. Warm lights glow from windows of wooden chalets, smoke rising from chimneys into the golden hour sky. The shepherd pauses, tail wagging slightly at the sight. The lamb stands close beside, tired but hopeful. A sense of relief and accomplishment fills the frame. Church bells chime faintly in the distance. The camera slowly zooms toward the welcoming village as the sun sets behind the mountains.""", + + # scene 10: finale + """EXT. VILLAGE BARN - DUSK. Medium shot as a farmer in traditional Swiss clothing opens a wooden barn door, warm golden light spilling out into the blue twilight. His weathered face shows surprise, then breaks into a warm smile as he sees the shepherd with the lost lamb. He kneels down, arms open. The lamb bounds forward into the warm barn interior where other sheep can be seen. The farmer reaches out to pat the shepherd's head gratefully, saying softly "Good dog... good dog." 
def trim_leading_frame_audio(audio, sample_rate, fps=24):
    """drop one video frame's worth of samples from the start of ``audio``.

    used for every scene after the first: those scenes lose their first video
    frame when the film is assembled, so the matching stretch of audio has to
    go too or the soundtrack drifts one frame later per scene.

    Args:
        audio: tensor whose last dimension is time (samples).
        sample_rate: audio samples per second.
        fps: video frames per second.

    Returns:
        a view of ``audio`` without its first ``round(sample_rate / fps)``
        samples (empty along the last dimension if the clip is shorter).
    """
    samples_per_frame = int(round(sample_rate / fps))
    return audio[..., samples_per_frame:]


def main():
    """render every scene in SCENES and assemble them into one film.

    scene 1 is generated from text only; every later scene is generated by
    the image-to-video pipeline from the last frame of the previous scene.
    each scene is saved as scene_NN.mp4 and the joined result as
    the_mountain_guardian.mp4 under ~/Desktop/mountain_guardian.
    """
    output_dir = os.path.expanduser("~/Desktop/mountain_guardian")
    os.makedirs(output_dir, exist_ok=True)

    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"using {device}")
    print(f"\n{'='*60}")
    print("THE MOUNTAIN GUARDIAN")
    print("a cinematic short film")
    print(f"{'='*60}\n")

    # load pipelines
    print("loading text-to-video pipeline...")
    t2v_pipe = LTX2Pipeline.from_pretrained(
        "Lightricks/LTX-2",
        torch_dtype=torch.bfloat16
    )
    t2v_pipe.to(device)

    print("loading image-to-video pipeline...")
    # reuse the already-loaded components instead of loading the same
    # checkpoint a second time, which doubled memory use. torch_dtype is
    # passed explicitly so the shared weights stay in bfloat16.
    # NOTE(review): assumes both pipelines take the same components, as the
    # original loading both from "Lightricks/LTX-2" suggests - confirm.
    i2v_pipe = LTX2ImageToVideoPipeline.from_pipe(t2v_pipe, torch_dtype=torch.bfloat16)
    i2v_pipe.to(device)

    # settings - 768x448, 97 frames (~4 sec per scene)
    fps = 24
    width, height = 768, 448
    frames = 97
    steps = 25  # higher for better quality
    neg_prompt = "blurry, low quality, distorted, deformed, ugly, bad anatomy, text, watermark, signature"

    all_frames = []
    all_audio = []
    last_frame = None

    for i, prompt in enumerate(SCENES):
        print(f"\n{'='*60}")
        print(f"SCENE {i+1}/{len(SCENES)}")
        print(f"{'='*60}")
        # show first 100 chars of prompt
        print(f"{prompt[:100]}...")
        print()

        shared_args = dict(
            prompt=prompt,
            negative_prompt=neg_prompt,
            width=width,
            height=height,
            num_frames=frames,
            num_inference_steps=steps,
            guidance_scale=4.0,
        )
        if last_frame is None:
            # first scene: text-to-video
            result = t2v_pipe(**shared_args)
        else:
            # subsequent scenes: image-to-video for continuity
            result = i2v_pipe(image=last_frame, **shared_args)

        video_frames = result.frames[0]
        last_frame = video_frames[-1]
        is_first_scene = i == 0

        # collect frames (skip first for scenes 2+; it is assumed to repeat
        # the conditioning frame, i.e. the previous scene's last frame)
        all_frames.extend(video_frames if is_first_scene else video_frames[1:])

        # collect audio, trimmed the same way as the frames so the joined
        # soundtrack stays aligned with the joined video
        audio = None
        audio_sr = None
        if result.audio is not None:
            audio = result.audio[0].float().cpu()
            audio_sr = t2v_pipe.vocoder.config.output_sampling_rate
            all_audio.append(
                audio if is_first_scene else trim_leading_frame_audio(audio, audio_sr, fps)
            )

        # save individual scene (untrimmed: it keeps all of its own frames)
        scene_path = os.path.join(output_dir, f"scene_{i+1:02d}.mp4")
        video_tensor = torch.stack([torch.from_numpy(np.array(f)) for f in video_frames])
        encode_video(video_tensor, fps=fps, audio=audio, audio_sample_rate=audio_sr, output_path=scene_path)
        print(f"saved: {scene_path}")

    # create full film
    print(f"\n{'='*60}")
    print("ASSEMBLING FINAL FILM...")
    print(f"{'='*60}\n")

    full_path = os.path.join(output_dir, "the_mountain_guardian.mp4")
    video_tensor = torch.stack([torch.from_numpy(np.array(f)) for f in all_frames])

    if all_audio:
        full_audio = torch.cat(all_audio, dim=-1)
        audio_sr = t2v_pipe.vocoder.config.output_sampling_rate
    else:
        full_audio = None
        audio_sr = None

    encode_video(video_tensor, fps=fps, audio=full_audio, audio_sample_rate=audio_sr, output_path=full_path)

    duration = len(all_frames) / fps
    print(f"\n{'='*60}")
    print("PRODUCTION COMPLETE")
    print(f"{'='*60}")
    print(f"total frames: {len(all_frames)}")
    print(f"duration: {duration:.1f} seconds ({duration/60:.1f} minutes)")
    print(f"output: {full_path}")
    print(f"{'='*60}\n")
def trim_leading_frame_audio(audio, sample_rate, fps=24):
    """drop one video frame's worth of samples from the start of ``audio``.

    scenes after the first lose their first video frame when the full video
    is assembled; removing the matching stretch of audio keeps the joined
    soundtrack from drifting one frame later per scene.

    Args:
        audio: tensor whose last dimension is time (samples).
        sample_rate: audio samples per second.
        fps: video frames per second.

    Returns:
        a view of ``audio`` without its first ``round(sample_rate / fps)``
        samples.
    """
    samples_per_frame = int(round(sample_rate / fps))
    return audio[..., samples_per_frame:]


def main():
    """generate all SCENES as one continuous video.

    the first scene is text-to-video; each following scene is image-to-video
    seeded with the previous scene's last frame. every scene is written to
    scene_NN.mp4 and the joined result to mountain_guardian_full.mp4 under
    ~/Desktop/mountain_guardian.
    """
    output_dir = os.path.expanduser("~/Desktop/mountain_guardian")
    os.makedirs(output_dir, exist_ok=True)

    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"using {device}")

    # load both pipelines
    print("loading text-to-video pipeline...")
    t2v_pipe = LTX2Pipeline.from_pretrained(
        "Lightricks/LTX-2",
        torch_dtype=torch.bfloat16
    )
    t2v_pipe.to(device)

    print("loading image-to-video pipeline...")
    # build the second pipeline from the components that are already in
    # memory rather than loading the same checkpoint twice; torch_dtype is
    # given explicitly so the shared weights stay in bfloat16.
    # NOTE(review): assumes both pipelines take the same components - confirm.
    i2v_pipe = LTX2ImageToVideoPipeline.from_pipe(t2v_pipe, torch_dtype=torch.bfloat16)
    i2v_pipe.to(device)

    fps = 24
    width, height = 768, 448
    frames = 97  # ~4 seconds per scene
    steps = 20
    negative = "blurry, low quality, distorted, deformed"

    all_video_frames = []
    all_audio = []
    last_frame = None

    for i, prompt in enumerate(SCENES):
        print(f"\n{'='*60}")
        print(f"scene {i+1}/{len(SCENES)}")
        print(f"prompt: {prompt[:60]}...")
        print(f"{'='*60}\n")

        is_first_scene = i == 0
        shared_args = dict(
            prompt=prompt,
            negative_prompt=negative,
            width=width,
            height=height,
            num_frames=frames,
            num_inference_steps=steps,
        )
        if is_first_scene:
            # first scene: text-to-video
            result = t2v_pipe(**shared_args)
        else:
            # subsequent scenes: image-to-video from last frame
            result = i2v_pipe(image=last_frame, **shared_args)

        # get frames and remember the last one as the seed for the next scene
        video_frames = result.frames[0]
        last_frame = video_frames[-1]

        # collect frames (skip first frame for scenes 2+; it is assumed to
        # duplicate the previous scene's last frame)
        all_video_frames.extend(video_frames if is_first_scene else video_frames[1:])

        # collect audio, trimmed like the frames so audio and video stay in sync
        audio = None
        audio_sr = None
        if result.audio is not None:
            audio = result.audio[0].float().cpu()
            audio_sr = t2v_pipe.vocoder.config.output_sampling_rate
            all_audio.append(
                audio if is_first_scene else trim_leading_frame_audio(audio, audio_sr, fps)
            )

        # save individual scene (with all of its own frames and audio)
        scene_path = os.path.join(output_dir, f"scene_{i+1:02d}.mp4")
        video_tensor = torch.stack([torch.from_numpy(np.array(f)) for f in video_frames])
        encode_video(video_tensor, fps=fps, audio=audio, audio_sample_rate=audio_sr, output_path=scene_path)
        print(f"saved: {scene_path}")

    # save full video
    print("\ncreating full video...")
    full_path = os.path.join(output_dir, "mountain_guardian_full.mp4")
    video_tensor = torch.stack([torch.from_numpy(np.array(f)) for f in all_video_frames])

    # concatenate audio along the time dimension
    if all_audio:
        full_audio = torch.cat(all_audio, dim=-1)
        audio_sr = t2v_pipe.vocoder.config.output_sampling_rate
    else:
        full_audio = None
        audio_sr = None

    encode_video(video_tensor, fps=fps, audio=full_audio, audio_sample_rate=audio_sr, output_path=full_path)

    print(f"\n{'='*60}")
    print(f"done!")
    print(f"total frames: {len(all_video_frames)}")
    print(f"duration: ~{len(all_video_frames)/24:.1f} seconds")
    print(f"saved to: {full_path}")
    print(f"{'='*60}")
emotional scene", + "name": "06_lost_lamb" + }, + { + "prompt": "The Swiss shepherd approaching the lamb gently, careful steps through snow, compassionate body language, soft winter light, heartwarming moment", + "name": "07_approach" + }, + { + "prompt": "Close-up of the shepherd dog nuzzling the scared lamb, comforting gesture, warm breath visible, tender moment, shallow depth of field, emotional", + "name": "08_comfort" + }, + + # Act 3: Journey home + { + "prompt": "The Swiss shepherd leading the small lamb through snowy alpine meadow, protective stance, walking together, mountains in background, golden hour light", + "name": "09_leading" + }, + { + "prompt": "Wide shot of dog and lamb crossing a snowy hill together, tiny figures in vast white landscape, beautiful alpine scenery, cinematic composition", + "name": "10_journey" + }, + { + "prompt": "The shepherd and lamb walking past snow-covered pine trees, gentle snowfall, peaceful atmosphere, soft afternoon light filtering through branches", + "name": "11_forest_path" + }, + { + "prompt": "A cozy Swiss mountain village appearing in the distance, warm lights glowing from windows, smoke from chimneys, dusk setting in, hopeful atmosphere", + "name": "12_village_sight" + }, + + # Act 4: Reunion + { + "prompt": "The Swiss shepherd and lamb arriving at a wooden barn door, warm light spilling out, welcoming atmosphere, end of journey, relief", + "name": "13_arrival" + }, + { + "prompt": "A farmer in traditional Swiss clothing opening the barn door, surprised and grateful expression, warm interior light, emotional reunion moment", + "name": "14_farmer" + }, + { + "prompt": "The lamb running to join other sheep in a warm barn, happy reunion, straw on floor, cozy interior, heartwarming resolution", + "name": "15_reunion" + }, + { + "prompt": "The Swiss shepherd sitting proudly outside the barn, farmer patting its head gratefully, twilight sky, village lights twinkling, satisfied hero", + "name": "16_reward" + }, + + # 
def generate_scene(scene, output_dir, width=768, height=448, frames=97, steps=20):
    """generate a single scene by running generate.py as a subprocess.

    Args:
        scene: dict with a "prompt" and a "name"; the clip is written to
            ``<output_dir>/<name>.mp4``.
        output_dir: directory the clip is written to.
        width, height, frames, steps: forwarded to generate.py.

    Returns:
        the clip's path. the path is returned even when generation failed
        (a warning is printed), so callers should check that it exists.
    """
    output_path = os.path.join(output_dir, f"{scene['name']}.mp4")

    # makes reruns resumable: finished clips are not generated again
    if os.path.exists(output_path):
        print(f"skipping {scene['name']} (already exists)")
        return output_path

    # generate.py is looked up next to this file (see cwd below).
    # NOTE(review): generate.py is not added by this change - confirm it exists.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    cmd = [
        sys.executable, "generate.py",
        scene["prompt"],
        # absolute: the child runs in script_dir, not in the caller's cwd
        "-o", os.path.abspath(output_path),
        "--width", str(width),
        "--height", str(height),
        "--frames", str(frames),
        "--steps", str(steps),
        "-n", "blurry, low quality, distorted, deformed, ugly, bad anatomy"
    ]

    print(f"\n{'='*60}")
    print(f"generating: {scene['name']}")
    print(f"prompt: {scene['prompt'][:80]}...")
    print(f"{'='*60}\n")

    completed = subprocess.run(cmd, cwd=script_dir)
    if completed.returncode != 0:
        print(f"warning: generating {scene['name']} failed (exit status {completed.returncode})")
    return output_path


def ffmpeg_concat_entry(video_path):
    """return one ``file '...'`` line for an ffmpeg concat list.

    the path is made absolute because ffmpeg resolves relative entries
    against the list file's directory, and single quotes are escaped as
    ``'\\''`` so a quote in a file name cannot end the quoted path early.
    """
    escaped = os.path.abspath(video_path).replace("'", "'\\''")
    return f"file '{escaped}'\n"


def concatenate_videos(video_files, output_path):
    """concatenate all videos into ``output_path`` using ffmpeg.

    the clips are re-encoded (libx264 crf 18, aac 192k). does nothing when
    ``video_files`` is empty; prints an error instead of "saved to" when
    ffmpeg exits with a non-zero status.
    """
    import tempfile

    video_files = list(video_files)
    if not video_files:
        print("no clips to concatenate")
        return

    # create file list in a unique temp file: a fixed /tmp name collides
    # between concurrent runs and does not exist on every platform
    with tempfile.NamedTemporaryFile(
        "w", suffix=".txt", delete=False, encoding="utf-8"
    ) as f:
        f.writelines(ffmpeg_concat_entry(video) for video in video_files)
        list_path = f.name

    cmd = [
        "ffmpeg", "-y", "-f", "concat", "-safe", "0",
        "-i", list_path,
        "-c:v", "libx264", "-crf", "18",
        "-c:a", "aac", "-b:a", "192k",
        output_path
    ]

    print(f"\nconcatenating {len(video_files)} clips...")
    try:
        completed = subprocess.run(cmd)
    finally:
        os.remove(list_path)

    if completed.returncode != 0:
        print(f"ffmpeg failed (exit status {completed.returncode}); {output_path} was not written correctly")
        return
    print(f"saved to: {output_path}")


def main():
    """generate every scene in SCENES and join the clips into one video."""
    output_dir = os.path.expanduser("~/Desktop/mountain_guardian")
    os.makedirs(output_dir, exist_ok=True)

    print(f"generating {len(SCENES)} scenes for 'The Mountain Guardian'")
    print(f"output directory: {output_dir}")
    print(f"estimated time: ~{len(SCENES) * 10} minutes\n")

    video_files = []
    for i, scene in enumerate(SCENES):
        print(f"\n[{i+1}/{len(SCENES)}]")
        video_path = generate_scene(scene, output_dir)
        # only clips that really exist may go into the concat list, otherwise
        # ffmpeg aborts on the first missing file
        if os.path.exists(video_path):
            video_files.append(video_path)
        else:
            print(f"warning: {scene['name']} is missing and will be left out of the final video")

    # concatenate all videos
    final_path = os.path.join(output_dir, "mountain_guardian_full.mp4")
    concatenate_videos(video_files, final_path)

    print(f"\n{'='*60}")
    print(f"done! final video: {final_path}")
    print(f"{'='*60}")