""" Record a voice reference clip for Spark-TTS cloning. Usage: .venv/bin/python record_voice.py [seconds] [output.wav] Defaults: 12 seconds, output to voice_ref.wav Tips: - Quiet room, no background noise - Natural speaking pace, not robotic - Read a passage of real sentences (not word lists) - 5-15 seconds works best """ import sys import wave import numpy as np import sounddevice as sd SAMPLE_RATE = 16000 def main(): duration = float(sys.argv[1]) if len(sys.argv) > 1 else 12.0 outpath = sys.argv[2] if len(sys.argv) > 2 else "voice_ref.wav" print(f"Recording {duration:.0f}s to {outpath}") print("Speak naturally. Starting in 3...") sd.sleep(1000) print("2...") sd.sleep(1000) print("1...") sd.sleep(1000) print("GO — speak now") audio = sd.rec( int(duration * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=1, dtype="int16", ) sd.wait() print("Done.") with wave.open(outpath, "wb") as f: f.setnchannels(1) f.setsampwidth(2) f.setframerate(SAMPLE_RATE) f.writeframes(audio.tobytes()) peak = np.abs(audio).max() print(f"Saved {outpath} — peak={peak} (healthy range: 5000–30000)") if peak < 2000: print("WARNING: very quiet recording. Speak louder or move closer to mic.") elif peak > 32000: print("WARNING: clipping. Move further from mic or lower input volume.") if __name__ == "__main__": main()