Add fully local conversational AI pipeline for Reachy Mini

Local STT (Qwen3-ASR), VLM (Gemma 4 26B-A4B), and TTS (Spark-TTS) running on Apple Silicon via MLX, with bracket-tag action system for nod, shake, wiggle, dance, photo, and pre-recorded emotions.
2026-05-12 09:24:02 +02:00
parent 3a8a8e3145
commit 5a04a7133a
12 changed files with 4074 additions and 0 deletions
--- a/record_voice.py
+++ b/record_voice.py
@@ -0,0 +1,61 @@
+"""
+Record a voice reference clip for Spark-TTS cloning.
+
+Usage:
+    .venv/bin/python record_voice.py [seconds] [output.wav]
+
+Defaults: 12 seconds, output to voice_ref.wav
+
+Tips:
+- Quiet room, no background noise
+- Natural speaking pace, not robotic
+- Read a passage of real sentences (not word lists)
+- 5-15 seconds works best
+"""
+import sys
+import wave
+
+import numpy as np
+import sounddevice as sd
+
+SAMPLE_RATE = 16000  # Hz; used for both the capture and the WAV header
+
+
+def main():
+    duration = float(sys.argv[1]) if len(sys.argv) > 1 else 12.0
+    outpath = sys.argv[2] if len(sys.argv) > 2 else "voice_ref.wav"
+
+    print(f"Recording {duration:.0f}s to {outpath}")
+    print("Speak naturally. Starting in 3...")
+    sd.sleep(1000)
+    print("2...")
+    sd.sleep(1000)
+    print("1...")
+    sd.sleep(1000)
+    print("GO — speak now")
+
+    audio = sd.rec(
+        int(duration * SAMPLE_RATE),
+        samplerate=SAMPLE_RATE,
+        channels=1,
+        dtype="int16",
+    )
+    sd.wait()
+    print("Done.")
+
+    with wave.open(outpath, "wb") as f:
+        f.setnchannels(1)
+        f.setsampwidth(2)
+        f.setframerate(SAMPLE_RATE)
+        f.writeframes(audio.tobytes())
+
+    peak = np.abs(audio).max()
+    print(f"Saved {outpath} — peak={peak} (healthy range: 5000–30000)")
+    if peak < 2000:
+        print("WARNING: very quiet recording. Speak louder or move closer to mic.")
+    elif peak > 32000:
+        print("WARNING: clipping. Move further from mic or lower input volume.")
+
+
+if __name__ == "__main__":  # record only when run as a script, not on import
+    main()