From 2b5fbef6cd31db950fcf41012fa2e7b4c95c00b3 Mon Sep 17 00:00:00 2001
From: syui <syui@syui.ai>
Date: Tue, 31 Mar 2026 08:47:16 +0900
Subject: [PATCH] feat(voice): add ai.json voice presets, pitch shift, and
 config path fix

- Voice settings per model in src/voice/ai.json (v2/v3 presets)
- Pitch shift via rodio speed() for higher-pitched voice (default 1.35, overridable with the TTS_PITCH env var)
- Load .env only from the fixed path $cfg/ai.syui.log/.env (no longer searches cwd or CARGO_MANIFEST_DIR)
- Remove Google STT dependency, ElevenLabs-only

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/voice/ai.json | 16 ++++++++++++++++
 src/voice/mod.rs  | 27 ++++++++++++---------------
 src/voice/tts.rs  | 32 ++++++++++++++++++++++++++------
 3 files changed, 54 insertions(+), 21 deletions(-)
 create mode 100644 src/voice/ai.json
diff --git a/src/voice/ai.json b/src/voice/ai.json
new file mode 100644
index 0000000..2f77a08
--- /dev/null
+++ b/src/voice/ai.json
@@ -0,0 +1,16 @@
+{
+  "eleven_multilingual_v2": {
+    "stability": 0.5,
+    "similarity_boost": 0.8,
+    "style": 0.2,
+    "speed": 0.85,
+    "pitch": 1.35
+  },
+  "eleven_v3": {
+    "stability": 0.5,
+    "similarity_boost": 0.8,
+    "style": 0.2,
+    "speed": 0.75,
+    "pitch": 1.35
+  }
+}
diff --git a/src/voice/mod.rs b/src/voice/mod.rs
index 42732a5..3ba7792 100644
--- a/src/voice/mod.rs
+++ b/src/voice/mod.rs
@@ -1,23 +1,20 @@
 pub mod tts;
 pub mod stt;
 
-/// Load .env file from cwd, setting vars that aren't already set.
+/// Load .env from $cfg/ai.syui.log/.env, setting only vars that aren't already set.
 fn load_dotenv() {
-    for dir in &[".", env!("CARGO_MANIFEST_DIR")] {
-        let path = std::path::Path::new(dir).join(".env");
-        if let Ok(content) = std::fs::read_to_string(&path) {
-            for line in content.lines() {
-                let line = line.trim();
-                if line.is_empty() || line.starts_with('#') { continue; }
-                if let Some((key, val)) = line.split_once('=') {
-                    let key = key.trim();
-                    let val = val.trim();
-                    if std::env::var(key).is_err() {
-                        std::env::set_var(key, val);
-                    }
+    let path = format!("{}/ai.syui.log/.env", crate::config::config_dir());
+    if let Ok(content) = std::fs::read_to_string(&path) {
+        for line in content.lines() {
+            let line = line.trim();
+            if line.is_empty() || line.starts_with('#') { continue; }
+            if let Some((key, val)) = line.split_once('=') {
+                let key = key.trim();
+                let val = val.trim();
+                if std::env::var(key).is_err() {
+                    std::env::set_var(key, val);
                 }
             }
-            break;
         }
     }
 }
@@ -68,7 +65,7 @@ impl VoiceSystem {
             Ok(data) => data,
             Err(e) => { eprintln!("tts error: {e}"); return; }
         };
-        if let Err(e) = tts::play_audio(&audio) {
+        if let Err(e) = tts::play_audio(&audio, &self.config.tts_model) {
             eprintln!("audio play error: {e}");
         }
     }
diff --git a/src/voice/tts.rs b/src/voice/tts.rs
index 1befe41..456341f 100644
--- a/src/voice/tts.rs
+++ b/src/voice/tts.rs
@@ -1,8 +1,19 @@
-use std::io::{Cursor, Read};
+use std::io::Cursor;
+use rodio::Source;
 use crate::voice::VoiceConfig;
 
+const VOICE_SETTINGS: &str = include_str!("ai.json");
+
+/// Load voice settings for `model` from ai.json, falling back to the "eleven_v3" preset.
+fn load_settings(model: &str) -> serde_json::Value {
+    let all: serde_json::Value = serde_json::from_str(VOICE_SETTINGS).unwrap_or_default();
+    all.get(model).cloned().unwrap_or_else(|| all["eleven_v3"].clone())
+}
+
 /// Synthesize text to audio bytes via ElevenLabs API.
 pub fn synthesize(config: &VoiceConfig, text: &str) -> Result<Vec<u8>, String> {
+    let settings = load_settings(&config.tts_model);
+
     let url = format!(
         "https://api.elevenlabs.io/v1/text-to-speech/{}",
         config.tts_voice_id
@@ -12,8 +23,10 @@ pub fn synthesize(config: &VoiceConfig, text: &str) -> Result<Vec<u8>, String> {
         "text": text,
         "model_id": config.tts_model,
         "voice_settings": {
-            "stability": 0.5,
-            "similarity_boost": 0.75
+            "stability": settings["stability"],
+            "similarity_boost": settings["similarity_boost"],
+            "style": settings["style"],
+            "speed": settings["speed"]
         }
     });
 
@@ -37,8 +50,10 @@ pub fn synthesize(config: &VoiceConfig, text: &str) -> Result<Vec<u8>, String> {
         .map_err(|e| format!("TTS read error: {e}"))
 }
 
-/// Play audio bytes (MP3) using rodio.
-pub fn play_audio(data: &[u8]) -> Result<(), String> {
+/// Play audio bytes (MP3) using rodio; pitch comes from TTS_PITCH, else ai.json "pitch", else 1.35.
+pub fn play_audio(data: &[u8], model: &str) -> Result<(), String> {
+    let settings = load_settings(model);
+
     let (_stream, handle) = rodio::OutputStream::try_default()
         .map_err(|e| format!("audio output error: {e}"))?;
     let sink = rodio::Sink::try_new(&handle)
@@ -48,7 +63,12 @@ pub fn play_audio(data: &[u8]) -> Result<(), String> {
     let source = rodio::Decoder::new(cursor)
         .map_err(|e| format!("audio decode error: {e}"))?;
 
-    sink.append(source);
+    let pitch = std::env::var("TTS_PITCH")
+        .ok()
+        .and_then(|v| v.parse::<f32>().ok())
+        .unwrap_or_else(|| settings["pitch"].as_f64().unwrap_or(1.35) as f32);
+
+    sink.append(source.speed(pitch));
     sink.sleep_until_end();
     Ok(())
 }