Local STT (Qwen3-ASR), VLM (Gemma 4 26B-A4B), and TTS (Spark-TTS) running on Apple Silicon via MLX, with bracket-tag action system for nod, shake, wiggle, dance, photo, and pre-recorded emotions.
62 lines
1.5 KiB
Python
62 lines
1.5 KiB
Python
"""
|
||
Record a voice reference clip for Spark-TTS cloning.
|
||
|
||
Usage:
|
||
.venv/bin/python record_voice.py [seconds] [output.wav]
|
||
|
||
Defaults: 12 seconds, output to voice_ref.wav
|
||
|
||
Tips:
|
||
- Quiet room, no background noise
|
||
- Natural speaking pace, not robotic
|
||
- Read a passage of real sentences (not word lists)
|
||
- 5-15 seconds works best
|
||
"""
|
||
import sys
|
||
import wave
|
||
|
||
import numpy as np
|
||
import sounddevice as sd
|
||
|
||
# Sample rate in Hz, shared by the capture call and the WAV header in main().
SAMPLE_RATE = 16000
|
||
|
||
|
||
def _peak_amplitude(samples):
    """Return the largest absolute sample value in *samples* as a Python int.

    The samples are widened to int32 before taking the absolute value: in
    int16, ``abs(-32768)`` wraps back to -32768, which would hide a negative
    full-scale (clipped) sample. An empty array yields 0 instead of raising.
    """
    if samples.size == 0:
        return 0
    return int(np.abs(samples.astype(np.int32)).max())


def main():
    """Record a mono 16-bit clip via sounddevice and save it as a WAV file.

    Command-line arguments (both optional):
        sys.argv[1]: recording length in seconds (default 12).
        sys.argv[2]: output path (default "voice_ref.wav").

    Prints a three-second countdown, records, writes the file, then reports
    the peak sample value and warns when the take looks too quiet or clipped.

    Raises:
        ValueError: if the duration is not a positive, finite number.
    """
    duration = float(sys.argv[1]) if len(sys.argv) > 1 else 12.0
    outpath = sys.argv[2] if len(sys.argv) > 2 else "voice_ref.wav"

    # Reject zero, negative, NaN and infinite durations up front; otherwise
    # they surface later as opaque errors from the frame-count conversion
    # or from the audio backend.
    if not 0 < duration < float("inf"):
        raise ValueError(
            f"duration must be a positive number of seconds, got {duration!r}"
        )

    print(f"Recording {duration:.0f}s to {outpath}")
    print("Speak naturally. Starting in 3...")
    sd.sleep(1000)
    print("2...")
    sd.sleep(1000)
    print("1...")
    sd.sleep(1000)
    print("GO — speak now")

    audio = sd.rec(
        int(duration * SAMPLE_RATE),
        samplerate=SAMPLE_RATE,
        channels=1,
        dtype="int16",
    )
    # sd.rec() returns immediately; wait() blocks until the buffer is full.
    sd.wait()
    print("Done.")

    with wave.open(outpath, "wb") as f:
        f.setnchannels(1)
        f.setsampwidth(2)  # 2 bytes per sample, matching dtype="int16"
        f.setframerate(SAMPLE_RATE)
        f.writeframes(audio.tobytes())

    peak = _peak_amplitude(audio)
    print(f"Saved {outpath} — peak={peak} (healthy range: 5000–30000)")
    if peak < 2000:
        print("WARNING: very quiet recording. Speak louder or move closer to mic.")
    elif peak > 32000:
        print("WARNING: clipping. Move further from mic or lower input volume.")
|
||
|
||
|
||
# Run the recorder only when executed as a script, never on import.
if __name__ == "__main__":
    main()
|