Added story generator

2026-01-11 20:01:20 +01:00
parent fc95099449
commit 8174da7490
6 changed files with 846 additions and 0 deletions
--- a/story_continuous.py
+++ b/story_continuous.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+"""
+generate a continuous story video with ltx-2
+uses image-to-video to maintain visual continuity between scenes
+"""
+
+import os
+import subprocess
+import numpy as np
+import torch
+from PIL import Image
+from diffusers import LTX2Pipeline, LTX2ImageToVideoPipeline
+from diffusers.pipelines.ltx2.export_utils import encode_video
+
+# story scenes, in playback order: one text prompt per generated clip.
+# main() renders the first prompt with the text-to-video pipeline and every
+# later prompt with the image-to-video pipeline, starting from the last frame
+# of the clip before it.
+SCENES = [
+    "Sunrise over the Swiss Alps, snow-covered peaks glowing orange and pink, a white Swiss shepherd dog standing on a ridge, peaceful morning, cinematic wide shot",
+    "The Swiss shepherd walking through deep powder snow, determined stride, pine trees, snow particles in air, morning light, tracking shot following the dog",
+    "The shepherd stops suddenly, ears perked, alert pose, listening intently, snowy forest, something caught its attention",
+    "A small white lamb alone in the snow, shivering, lost and scared, the shepherd approaches gently in the background",
+    "The Swiss shepherd nuzzling the scared lamb, comforting gesture, warm breath visible, tender moment, shallow depth of field",
+    "The shepherd leading the lamb through snowy alpine meadow, protective stance, walking together, mountains in background, golden hour",
+    "Wide shot of dog and lamb crossing a snowy hill, vast white landscape, beautiful alpine scenery, afternoon light",
+    "A cozy Swiss mountain village appearing in the distance, warm lights glowing, smoke from chimneys, dusk, hopeful atmosphere",
+    "The shepherd and lamb arriving at a wooden barn, warm light spilling out, welcoming atmosphere, journey's end",
+    "Night sky over the Alps with stars, the shepherd dog silhouette on a ridge, majestic ending, peaceful, cinematic finale",
+]
+
+def main():
+    output_dir = os.path.expanduser("~/Desktop/mountain_guardian")
+    os.makedirs(output_dir, exist_ok=True)
+
+    device = "mps" if torch.backends.mps.is_available() else "cpu"
+    print(f"using {device}")
+
+    # load both pipelines
+    print("loading text-to-video pipeline...")
+    t2v_pipe = LTX2Pipeline.from_pretrained(
+        "Lightricks/LTX-2",
+        torch_dtype=torch.bfloat16
+    )
+    t2v_pipe.to(device)
+
+    print("loading image-to-video pipeline...")
+    i2v_pipe = LTX2ImageToVideoPipeline.from_pretrained(
+        "Lightricks/LTX-2",
+        torch_dtype=torch.bfloat16
+    )
+    i2v_pipe.to(device)
+
+    width, height = 768, 448
+    frames = 97  # ~4 seconds per scene
+    steps = 20
+
+    all_video_frames = []
+    all_audio = []
+    last_frame = None
+
+    for i, prompt in enumerate(SCENES):
+        print(f"\n{'='*60}")
+        print(f"scene {i+1}/{len(SCENES)}")
+        print(f"prompt: {prompt[:60]}...")
+        print(f"{'='*60}\n")
+
+        if i == 0:
+            # first scene: text-to-video
+            result = t2v_pipe(
+                prompt=prompt,
+                negative_prompt="blurry, low quality, distorted, deformed",
+                width=width,
+                height=height,
+                num_frames=frames,
+                num_inference_steps=steps,
+            )
+        else:
+            # subsequent scenes: image-to-video from last frame
+            result = i2v_pipe(
+                image=last_frame,
+                prompt=prompt,
+                negative_prompt="blurry, low quality, distorted, deformed",
+                width=width,
+                height=height,
+                num_frames=frames,
+                num_inference_steps=steps,
+            )
+
+        # get frames
+        video_frames = result.frames[0]
+
+        # save last frame for next scene
+        last_frame = video_frames[-1]
+
+        # collect frames (skip first frame for scenes 2+ to avoid duplicate)
+        if i == 0:
+            all_video_frames.extend(video_frames)
+        else:
+            all_video_frames.extend(video_frames[1:])  # skip first frame (duplicate of last)
+
+        # collect audio
+        if result.audio is not None:
+            all_audio.append(result.audio[0])
+
+        # save individual scene
+        scene_path = os.path.join(output_dir, f"scene_{i+1:02d}.mp4")
+        video_tensor = torch.stack([torch.from_numpy(np.array(f)) for f in video_frames])
+        audio = result.audio[0].float().cpu() if result.audio is not None else None
+        audio_sr = t2v_pipe.vocoder.config.output_sampling_rate if audio is not None else None
+        encode_video(video_tensor, fps=24, audio=audio, audio_sample_rate=audio_sr, output_path=scene_path)
+        print(f"saved: {scene_path}")
+
+    # save full video
+    print("\ncreating full video...")
+    full_path = os.path.join(output_dir, "mountain_guardian_full.mp4")
+    video_tensor = torch.stack([torch.from_numpy(np.array(f)) for f in all_video_frames])
+
+    # concatenate audio
+    if all_audio:
+        full_audio = torch.cat(all_audio, dim=-1).float().cpu()
+        audio_sr = t2v_pipe.vocoder.config.output_sampling_rate
+    else:
+        full_audio = None
+        audio_sr = None
+
+    encode_video(video_tensor, fps=24, audio=full_audio, audio_sample_rate=audio_sr, output_path=full_path)
+
+    print(f"\n{'='*60}")
+    print(f"done!")
+    print(f"total frames: {len(all_video_frames)}")
+    print(f"duration: ~{len(all_video_frames)/24:.1f} seconds")
+    print(f"saved to: {full_path}")
+    print(f"{'='*60}")
+
+# run the generator only when this file is executed as a script, not on import
+if __name__ == "__main__":
+    main()