Added audio

This commit is contained in:
Norbert Schmidt
2026-01-11 15:37:27 +01:00
parent 1366941d39
commit fc95099449
3 changed files with 23 additions and 35 deletions

View File

@@ -1,6 +1,6 @@
# ltx2-mps
run [LTX-2](https://huggingface.co/Lightricks/LTX-2) video generation on mac using MPS (metal).
run [LTX-2](https://huggingface.co/Lightricks/LTX-2) video + audio generation on mac using MPS (Metal Performance Shaders).
## what's this about
@@ -53,6 +53,7 @@ python generate.py "a cat walking through grass" -o output.mp4
| `--fps` | 24 | output fps |
| `--seed` | random | seed for reproducibility |
| `-n` | "" | negative prompt |
| `--no-audio` | false | disable audio generation |
### examples

View File

@@ -8,10 +8,10 @@ usage: python generate.py "your prompt" -o output.mp4
import argparse
import sys
import imageio
import numpy as np
import torch
from diffusers import LTX2Pipeline
from diffusers.pipelines.ltx2.export_utils import encode_video
def main():
@@ -26,8 +26,7 @@ def main():
parser.add_argument("--frames", type=int, default=25, help="frame count")
parser.add_argument("--fps", type=int, default=24, help="output fps")
parser.add_argument("--seed", type=int, default=None, help="random seed")
parser.add_argument("--crf", type=int, default=10, help="video quality (0-51, lower=better)")
parser.add_argument("--prores", action="store_true", help="use prores codec (large files, best quality)")
parser.add_argument("--no-audio", action="store_true", help="disable audio generation")
args = parser.parse_args()
@@ -63,7 +62,7 @@ def main():
generator = torch.Generator(device="cpu")
generator.manual_seed(args.seed)
print(f"\ngenerating...")
print(f"\ngenerating{'...' if args.no_audio else ' with audio...'}")
print(f" prompt: {args.prompt}")
print(f" size: {args.width}x{args.height}, {args.frames} frames")
print(f" steps: {args.steps}, guidance: {args.guidance}")
@@ -81,39 +80,26 @@ def main():
generator=generator,
)
# get video frames as tensor
video_frames = result.frames[0]
video_tensor = torch.stack([torch.from_numpy(np.array(f)) for f in video_frames])
# convert to uint8 numpy arrays
frames = []
for frame in video_frames:
frame = np.array(frame, dtype=np.uint8)
frames.append(frame)
# get audio if available
audio = None
audio_sample_rate = None
if not args.no_audio and result.audio is not None:
audio = result.audio[0].float().cpu()
audio_sample_rate = pipe.vocoder.config.output_sampling_rate
print(f"audio generated ({audio_sample_rate}Hz)")
# export video
if args.prores:
output_path = args.output.replace('.mp4', '.mov') if args.output.endswith('.mp4') else args.output
writer = imageio.get_writer(
output_path,
fps=args.fps,
codec='prores_ks',
pixelformat='yuv422p10le',
output_params=['-profile:v', '3'] # prores hq
)
else:
output_path = args.output
writer = imageio.get_writer(
output_path,
fps=args.fps,
codec='libx264',
quality=None,
pixelformat='yuv420p',
output_params=['-crf', str(args.crf), '-preset', 'slow']
)
for frame in frames:
writer.append_data(frame)
writer.close()
args.output = output_path
# export with audio
encode_video(
video=video_tensor,
fps=args.fps,
audio=audio,
audio_sample_rate=audio_sample_rate,
output_path=args.output
)
print(f"\nsaved to: {args.output}")
print(f"seed: {args.seed}")

View File

@@ -7,5 +7,6 @@ safetensors>=0.4.0
sentencepiece>=0.1.99
imageio>=2.30.0
imageio-ffmpeg>=0.4.9
av>=10.0.0
# Install diffusers from git for LTX2Pipeline:
# pip install git+https://github.com/huggingface/diffusers.git