From 02b8c27835c219bfbca8b9d2e8ede852a0faae98 Mon Sep 17 00:00:00 2001
From: Norbert Schmidt <github@ddq.nl>
Date: Tue, 31 Mar 2026 13:55:39 +0200
Subject: [PATCH] Upgrade to LTX-2.3 with audio generation

- Switch from mlx_video.generate_av to mlx_video.models.ltx_2.generate
- Use prince-canuma/LTX-2.3-distilled model with google/gemma-3-12b-it text encoder
- Add --audio flag for joint audio-video generation
- Add auto-background execution with nohup logging
- Add CLAUDE.md and test stories

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 CLAUDE.md                 | 61 +++++++++++++++++++++++++++++++++++++++
 generate_story.sh         | 32 ++++++++++++++++++--
 stories/local_runners.txt | 12 ++++++++
 stories/test_ltx23.txt    |  8 +++++
 stories/test_person.txt   |  7 +++++
 5 files changed, 117 insertions(+), 3 deletions(-)
 create mode 100644 CLAUDE.md
 create mode 100644 stories/local_runners.txt
 create mode 100644 stories/test_ltx23.txt
 create mode 100644 stories/test_person.txt

diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..66715d4
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,61 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
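+mlx-video-maker is a toolkit for generating multi-scene AI videos with seamless transitions on Apple Silicon Macs. It uses LTX-2.3 (the `prince-canuma/LTX-2.3-distilled` DiT model with the `google/gemma-3-12b-it` text encoder, generating audio jointly with video) via the MLX framework for native inference on M1/M2/M3/M4 chips.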
+
+## How It Works
+
+The core technique is **I2V chaining**: Scene 1 is generated as text-to-video, then the last frame is extracted and used as the input image for scene 2 (image-to-video), and so on. All scenes are concatenated into a final movie with ffmpeg.
+
+Pipeline: T2V (scene 1) → extract last frame → I2V (scene 2+) → repeat → ffmpeg concat
+
+## Running
+
+```bash
+# Generate a movie from a story file
+./generate_story.sh stories/my_story.txt [output_dir]
+
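+# Long runs: when started from a terminal the script re-launches itself under nohup and returns immediately
+tail -f "$HOME/Nextcloud/Documents/mlx-video-stories/generation.log"   # or <output_dir>/generation.log if you passed one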
+```
+
+### Key parameters (all optional)
+- `--width 1920` (must be divisible by 64)
+- `--height 1088` (must be divisible by 64)
+- `--frames 121` (must satisfy 1 + 8*k)
+- `--strength 0.7` (I2V conditioning, 0.0-1.0; 0.7 is the sweet spot)
+- `--fps 24`
+- `--python ./venv/bin/python`
+
+## Architecture
+
+- **`generate_story.sh`** — Single bash script that orchestrates everything. Handles auto-backgrounding under nohup, argument parsing, scene generation (calling `mlx_video.models.ltx_2.generate` with `--audio`), frame extraction via ffprobe/ffmpeg, and final concatenation.
+- **`stories/`** — Story files: plain text, one prompt per line, `#` for comments, empty lines ignored.
+- **`promptguide.md`** — Comprehensive LTX-2 prompt engineering guide covering shot establishment, camera movement, audio description, and multi-shot continuity techniques.
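+- **Output directory** (default `$HOME/Nextcloud/Documents/mlx-video-stories`, or the `[output_dir]` argument such as `output/`) — Generated artifacts: `scene{N}.mp4`, `scene{N}_lastframe.jpg`, `concat_list.txt`, `generation.log` (when auto-backgrounded), final movie (the repo-local `output/` is gitignored).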
+
+## Key Design Decisions
+
+- **Resumable**: Script skips scenes that already exist in the output directory.
+- **No Python package structure**: Direct script execution, single pip dependency (`mlx-video` from GitHub).
+- **No build/test/lint**: Manual testing by reviewing generated video output.
+
+## Requirements
+
+- Apple Silicon Mac with 64GB+ RAM (32GB minimum at lower resolution)
+- Python 3.11+, ffmpeg, ffprobe
+- `pip install git+https://github.com/Blaizzy/mlx-video.git`
+
+## Story File Format
+
+```text
+# Comments start with #
+A cinematic wide shot of a mountain landscape at golden hour...
+
+A close-up tracking shot follows a character walking through fog...
+```
+
+Each non-empty, non-comment line is one scene prompt. Prompts should be flowing narrative paragraphs (not keyword lists) following the six elements in promptguide.md: shot establishment, scene setting, action, character definition, camera movement, audio description.
diff --git a/generate_story.sh b/generate_story.sh
index bf73ec2..abf464a 100755
--- a/generate_story.sh
+++ b/generate_story.sh
@@ -7,6 +7,26 @@
 
 set -e
 
+# Auto-background: when started from a terminal (stdin is a TTY) and not already re-launched (_MLX_BG unset), re-exec under nohup and exit
+if [ -z "$_MLX_BG" ] && [ -t 0 ]; then
+    export _MLX_BG=1  # inherited by the re-launched child so it skips this block
+    # Log dir: the default output dir, or the 2nd positional arg when it appears before any --option
+    _LOG_DIR="$HOME/Nextcloud/Documents/mlx-video-stories"
+    _SEEN_STORY=  # reset so a value inherited from the environment cannot skew the scan
+    for _arg in "$@"; do
+        [[ "$_arg" == --* ]] && break  # only leading positionals are inspected
+        [ "$_SEEN_STORY" = "1" ] && _LOG_DIR="$_arg" && break  # 2nd positional = output dir
+        _SEEN_STORY=1  # 1st positional = story file
+    done
+    mkdir -p "$_LOG_DIR"
+    _LOG="$_LOG_DIR/generation.log"
+    echo "Running in background. Log: $_LOG"
+    echo "Follow with: tail -f $_LOG"
+    nohup "$0" "$@" > "$_LOG" 2>&1 &  # '>' truncates the log on every launch
+    echo "PID: $!"
+    exit 0
+fi
+
 # Default settings
 WIDTH=1920
 HEIGHT=1088
@@ -14,7 +34,7 @@ FRAMES=121
 STRENGTH=0.7
 FPS=24
 VENV_PYTHON="${VENV_PYTHON:-./venv/bin/python}"
-OUTPUT_DIR="./output"
+OUTPUT_DIR="$HOME/Nextcloud/Documents/mlx-video-stories"
 
 # Colors
 RED='\033[0;31m'
@@ -136,13 +156,16 @@ for i in $(seq 1 $NUM_SCENES); do
 
     if [ $i -eq 1 ]; then
         # First scene: Text-to-Video
-        $VENV_PYTHON -m mlx_video.generate_av \
+        $VENV_PYTHON -m mlx_video.models.ltx_2.generate \
             --prompt "$PROMPT" \
+            --model-repo prince-canuma/LTX-2.3-distilled \
+            --text-encoder-repo google/gemma-3-12b-it \
             --height $HEIGHT \
             --width $WIDTH \
             --num-frames $FRAMES \
             --fps $FPS \
             --seed $((42 + i)) \
+            --audio \
             --output-path "$SCENE_FILE"
     else
         # Subsequent scenes: Image-to-Video
@@ -162,8 +185,10 @@ for i in $(seq 1 $NUM_SCENES); do
         fi
 
         # Generate with I2V
-        $VENV_PYTHON -m mlx_video.generate_av \
+        $VENV_PYTHON -m mlx_video.models.ltx_2.generate \
             --prompt "$PROMPT" \
+            --model-repo prince-canuma/LTX-2.3-distilled \
+            --text-encoder-repo google/gemma-3-12b-it \
             --image "$LAST_FRAME" \
             --image-strength $STRENGTH \
             --height $HEIGHT \
@@ -171,6 +196,7 @@ for i in $(seq 1 $NUM_SCENES); do
             --num-frames $FRAMES \
             --fps $FPS \
             --seed $((42 + i)) \
+            --audio \
             --output-path "$SCENE_FILE"
     fi
 
diff --git a/stories/local_runners.txt b/stories/local_runners.txt
new file mode 100644
index 0000000..31dff14
--- /dev/null
+++ b/stories/local_runners.txt
@@ -0,0 +1,12 @@
+# Local Runners - MLX community showcase
+# Theme: devs generating AI video locally, raw energy, no cloud needed
+
+A cinematic wide shot of a dimly lit room filled with glowing Apple laptops on a long wooden table. Multiple developers sit intensely focused, screens reflecting off their faces in blue and purple light. The camera slowly dollies forward between the rows. Sound of mechanical keyboards clicking rapidly and a low electronic hum building tension.
+
+A close-up tracking shot moves across laptop screens showing colorful terminal output scrolling rapidly with progress bars and neural network visualizations. Code and numbers cascade down the displays. Green text on black backgrounds. The camera glides smoothly left to right. Sound of digital processing tones and a pulsing synthetic beat growing stronger.
+
+A medium shot of a developer leaning back in their chair with a confident grin as a fully rendered AI video plays on their MacBook screen. The room behind them is dark with ambient RGB lighting. They tap the spacebar triumphantly. The camera slowly pushes in on their satisfied expression. Sound of a cinematic bass drop and the crowd murmuring in amazement.
+
+A dramatic wide aerial shot pulling back to reveal an entire warehouse space filled with hundreds of developers at glowing workstations, all generating video simultaneously. Streams of light rise from each screen into the air like digital aurora borealis. The camera rises and tilts upward. Sound of an epic orchestral swell mixed with electronic beats reaching a powerful crescendo.
+
+A slow motion close-up of a MacBook Pro with the Apple logo glowing. The screen displays a beautiful AI generated landscape video playing flawlessly. Binary code and particles of light float upward from the keyboard like embers from a fire. The camera holds steady with shallow depth of field. Sound of a deep resonant tone fading into silence with a final satisfying click.
diff --git a/stories/test_ltx23.txt b/stories/test_ltx23.txt
new file mode 100644
index 0000000..1380de1
--- /dev/null
+++ b/stories/test_ltx23.txt
@@ -0,0 +1,8 @@
+# Quick LTX 2.3 test - 3 scenes, low res
+# Testing: T2V, I2V continuity, audio consistency
+
+A wide establishing shot of a quiet cobblestone alley in a European village at dawn. Warm golden light spills between old brick buildings with wooden shutters. A tabby cat sits on a windowsill grooming itself. The camera slowly pushes forward through the alley. Soft ambient sound of distant church bells and birdsong.
+
+A medium tracking shot follows the tabby cat as it leaps from the windowsill and trots along the cobblestones. Morning light casts long shadows across the wet stones. The camera tracks alongside the cat at ground level. Sound of soft paws on stone and gentle wind rustling through hanging laundry.
+
+A close-up shot of the tabby cat stopping at a wooden door and looking up expectantly. The door creaks open revealing a warm interior with a fireplace glow. The cat slips inside. The camera holds steady at the doorway. Sound of a creaking door hinge and a crackling fireplace inside.
diff --git a/stories/test_person.txt b/stories/test_person.txt
new file mode 100644
index 0000000..1c29d12
--- /dev/null
+++ b/stories/test_person.txt
@@ -0,0 +1,7 @@
+# LTX 2.3 test - person speaking, testing audio speech quality
+
+A medium shot of a young woman standing in a sunlit kitchen. She looks directly at the camera and says hello, welcome to my cooking show. Her voice is clear and warm. The kitchen has white tiles and wooden countertops. The camera is static. Natural indoor ambient sound with her speaking voice.
+
+A close-up of the woman chopping vegetables on a wooden cutting board. She explains that today we are making a simple pasta dish. Her hands move confidently with the knife. Sound of chopping and her narrating voice clearly audible.
+
+A wide shot of the woman stirring a pot on the stove. Steam rises from the pot. She turns to the camera and says this is the secret ingredient, and holds up a jar of spices with a smile. The camera slowly zooms in. Sound of bubbling water and her clear speaking voice.