Add fully local conversational AI pipeline for Reachy Mini

Local STT (Qwen3-ASR), VLM (Gemma 4 26B-A4B), and TTS (Spark-TTS) running
on Apple Silicon via MLX, with bracket-tag action system for nod, shake,
wiggle, dance, photo, and pre-recorded emotions.
This commit is contained in:
Norbert Schmidt
2026-05-12 09:24:02 +02:00
parent 3a8a8e3145
commit 5a04a7133a
12 changed files with 4074 additions and 0 deletions

61
record_voice.py Normal file
View File

@@ -0,0 +1,61 @@
"""
Record a voice reference clip for Spark-TTS cloning.
Usage:
.venv/bin/python record_voice.py [seconds] [output.wav]
Defaults: 12 seconds, output to voice_ref.wav
Tips:
- Quiet room, no background noise
- Natural speaking pace, not robotic
- Read a passage of real sentences (not word lists)
- 5-15 seconds works best
"""
import sys
import wave
import numpy as np
import sounddevice as sd
# Sample rate in Hz: main() records at this rate and writes it into the WAV header.
SAMPLE_RATE = 16000
def main():
    """Record a mono 16-bit clip from the default input device and save it as WAV.

    Command-line arguments (both optional, positional):
        sys.argv[1]: recording length in seconds (default 12.0).
        sys.argv[2]: output WAV path (default "voice_ref.wav").

    After a three-second countdown the clip is captured at SAMPLE_RATE,
    written to disk, and its peak sample magnitude is printed together with
    a warning when the level looks too quiet or close to clipping.

    Raises:
        ValueError: if the duration is not a number, is not finite and
            positive, or is too short to contain a single sample.
    """
    duration = float(sys.argv[1]) if len(sys.argv) > 1 else 12.0
    outpath = sys.argv[2] if len(sys.argv) > 2 else "voice_ref.wav"

    # Reject NaN, infinity, zero and negatives up front instead of failing
    # later inside the audio backend with a less helpful message.
    if not 0 < duration < float("inf"):
        raise ValueError(
            f"duration must be a positive number of seconds, got {duration!r}"
        )
    frame_count = int(duration * SAMPLE_RATE)
    if frame_count < 1:
        raise ValueError(
            f"duration {duration!r}s is shorter than one sample at {SAMPLE_RATE} Hz"
        )

    print(f"Recording {duration:.0f}s to {outpath}")
    print("Speak naturally. Starting in 3...")
    # One-second pause before each remaining countdown cue.
    for cue in ("2...", "1...", "GO — speak now"):
        sd.sleep(1000)
        print(cue)

    audio = sd.rec(
        frame_count,
        samplerate=SAMPLE_RATE,
        channels=1,
        dtype="int16",
    )
    sd.wait()  # block until the recording buffer is completely filled
    print("Done.")

    with wave.open(outpath, "wb") as f:
        f.setnchannels(1)
        f.setsampwidth(2)  # 2 bytes per sample == int16
        f.setframerate(SAMPLE_RATE)
        f.writeframes(audio.tobytes())

    # Widen before abs(): in int16, abs(-32768) wraps back to -32768, which
    # would make a hard-clipped recording look silent.
    peak = int(np.abs(audio.astype(np.int32)).max())
    print(f"Saved {outpath} — peak={peak} (healthy range: 5000–30000)")
    if peak < 2000:
        print("WARNING: very quiet recording. Speak louder or move closer to mic.")
    elif peak > 32000:
        print("WARNING: clipping. Move further from mic or lower input volume.")
# Start a recording only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()