Added audio

This commit is contained in:
Norbert Schmidt
2026-01-11 15:37:27 +01:00
parent 1366941d39
commit fc95099449
3 changed files with 23 additions and 35 deletions

View File

@@ -1,6 +1,6 @@
# ltx2-mps # ltx2-mps
run [LTX-2](https://huggingface.co/Lightricks/LTX-2) video generation on mac using MPS (metal). run [LTX-2](https://huggingface.co/Lightricks/LTX-2) video + audio generation on mac using MPS (metal).
## what's this about ## what's this about
@@ -53,6 +53,7 @@ python generate.py "a cat walking through grass" -o output.mp4
| `--fps` | 24 | output fps | | `--fps` | 24 | output fps |
| `--seed` | random | seed for reproducibility | | `--seed` | random | seed for reproducibility |
| `-n` | "" | negative prompt | | `-n` | "" | negative prompt |
| `--no-audio` | false | disable audio generation |
### examples ### examples

View File

@@ -8,10 +8,10 @@ usage: python generate.py "your prompt" -o output.mp4
import argparse import argparse
import sys import sys
import imageio
import numpy as np import numpy as np
import torch import torch
from diffusers import LTX2Pipeline from diffusers import LTX2Pipeline
from diffusers.pipelines.ltx2.export_utils import encode_video
def main(): def main():
@@ -26,8 +26,7 @@ def main():
parser.add_argument("--frames", type=int, default=25, help="frame count") parser.add_argument("--frames", type=int, default=25, help="frame count")
parser.add_argument("--fps", type=int, default=24, help="output fps") parser.add_argument("--fps", type=int, default=24, help="output fps")
parser.add_argument("--seed", type=int, default=None, help="random seed") parser.add_argument("--seed", type=int, default=None, help="random seed")
parser.add_argument("--crf", type=int, default=10, help="video quality (0-51, lower=better)") parser.add_argument("--no-audio", action="store_true", help="disable audio generation")
parser.add_argument("--prores", action="store_true", help="use prores codec (large files, best quality)")
args = parser.parse_args() args = parser.parse_args()
@@ -63,7 +62,7 @@ def main():
generator = torch.Generator(device="cpu") generator = torch.Generator(device="cpu")
generator.manual_seed(args.seed) generator.manual_seed(args.seed)
print(f"\ngenerating...") print(f"\ngenerating{'...' if args.no_audio else ' with audio...'}")
print(f" prompt: {args.prompt}") print(f" prompt: {args.prompt}")
print(f" size: {args.width}x{args.height}, {args.frames} frames") print(f" size: {args.width}x{args.height}, {args.frames} frames")
print(f" steps: {args.steps}, guidance: {args.guidance}") print(f" steps: {args.steps}, guidance: {args.guidance}")
@@ -81,40 +80,27 @@ def main():
generator=generator, generator=generator,
) )
# get video frames as tensor
video_frames = result.frames[0] video_frames = result.frames[0]
video_tensor = torch.stack([torch.from_numpy(np.array(f)) for f in video_frames])
# convert to uint8 numpy arrays # get audio if available
frames = [] audio = None
for frame in video_frames: audio_sample_rate = None
frame = np.array(frame, dtype=np.uint8) if not args.no_audio and result.audio is not None:
frames.append(frame) audio = result.audio[0].float().cpu()
audio_sample_rate = pipe.vocoder.config.output_sampling_rate
print(f"audio generated ({audio_sample_rate}Hz)")
# export video # export with audio
if args.prores: encode_video(
output_path = args.output.replace('.mp4', '.mov') if args.output.endswith('.mp4') else args.output video=video_tensor,
writer = imageio.get_writer(
output_path,
fps=args.fps, fps=args.fps,
codec='prores_ks', audio=audio,
pixelformat='yuv422p10le', audio_sample_rate=audio_sample_rate,
output_params=['-profile:v', '3'] # prores hq
)
else:
output_path=args.output output_path=args.output
writer = imageio.get_writer(
output_path,
fps=args.fps,
codec='libx264',
quality=None,
pixelformat='yuv420p',
output_params=['-crf', str(args.crf), '-preset', 'slow']
) )
for frame in frames:
writer.append_data(frame)
writer.close()
args.output = output_path
print(f"\nsaved to: {args.output}") print(f"\nsaved to: {args.output}")
print(f"seed: {args.seed}") print(f"seed: {args.seed}")

View File

@@ -7,5 +7,6 @@ safetensors>=0.4.0
sentencepiece>=0.1.99 sentencepiece>=0.1.99
imageio>=2.30.0 imageio>=2.30.0
imageio-ffmpeg>=0.4.9 imageio-ffmpeg>=0.4.9
av>=10.0.0
# Install diffusers from git for LTX2Pipeline: # Install diffusers from git for LTX2Pipeline:
# pip install git+https://github.com/huggingface/diffusers.git # pip install git+https://github.com/huggingface/diffusers.git