# reachy-mlx-vlm/record_voice.py

"""
Record a voice reference clip for Spark-TTS cloning.

Usage:
    .venv/bin/python record_voice.py [seconds] [output.wav]

Defaults: 12 seconds, output to voice_ref.wav

Tips:
- Quiet room, no background noise
- Natural speaking pace, not robotic
- Read a passage of real sentences (not word lists)
- 5-15 seconds works best
"""
import sys
import wave

import numpy as np
import sounddevice as sd

# Sample rate in Hz, used for both the capture stream and the WAV header.
SAMPLE_RATE = 16000


def _peak_amplitude(audio):
    """Return the largest absolute sample value in *audio* as a Python int."""
    # np.abs() on int16 wraps -32768 back to -32768, which would hide
    # full-scale negative clipping. Taking min/max separately and negating
    # as Python ints cannot overflow.
    return max(int(audio.max()), -int(audio.min()))


def main():
    """Record a mono 16-bit clip from the default input and save it as WAV.

    Command-line arguments (both optional, positional):
        sys.argv[1]: recording length in seconds (default 12).
        sys.argv[2]: output WAV path (default "voice_ref.wav").

    After saving, prints the peak sample amplitude and warns when the
    recording looks too quiet or clipped.

    Raises:
        SystemExit: if the duration is not a finite positive number, or is
            too short to contain a single sample.
    """
    usage = "Usage: record_voice.py [seconds] [output.wav]"

    try:
        duration = float(sys.argv[1]) if len(sys.argv) > 1 else 12.0
    except ValueError:
        sys.exit(f"Invalid duration {sys.argv[1]!r}: expected a number. {usage}")
    # One chained comparison rejects zero, negatives, NaN and infinity.
    if not 0 < duration < float("inf"):
        sys.exit(f"Invalid duration {duration}: must be a positive number. {usage}")
    outpath = sys.argv[2] if len(sys.argv) > 2 else "voice_ref.wav"

    frames = int(duration * SAMPLE_RATE)
    if frames < 1:
        # A zero-frame recording would produce an empty WAV and an empty
        # array whose .max() raises.
        sys.exit(f"Duration {duration}s is too short to record any samples.")

    print(f"Recording {duration:.0f}s to {outpath}")
    print("Speak naturally. Starting in 3...")
    # One-second pause before each remaining countdown step.
    for step in ("2...", "1...", "GO — speak now"):
        sd.sleep(1000)
        print(step)

    audio = sd.rec(
        frames,
        samplerate=SAMPLE_RATE,
        channels=1,
        dtype="int16",
    )
    # sd.rec() returns immediately; block until the buffer is filled.
    sd.wait()
    print("Done.")

    with wave.open(outpath, "wb") as f:
        f.setnchannels(1)
        f.setsampwidth(2)  # 2 bytes per sample == int16
        f.setframerate(SAMPLE_RATE)
        f.writeframes(audio.tobytes())

    peak = _peak_amplitude(audio)
    print(f"Saved {outpath} — peak={peak} (healthy range: 5000–30000)")
    if peak < 2000:
        print("WARNING: very quiet recording. Speak louder or move closer to mic.")
    elif peak > 32000:
        print("WARNING: clipping. Move further from mic or lower input volume.")


# Run the recorder only when executed as a script, not when imported.
if __name__ == "__main__":
    main()