feat(voice): add ai.json voice presets, pitch shift, and config path fix

- Voice settings per model in src/voice/ai.json (v2/v3 presets)
- Pitch shift via rodio speed() for higher-pitched voice (default 1.35)
- Load .env from $cfg/ai.syui.log/.env (fixed path)
- Remove Google STT dependency, ElevenLabs-only

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 08:47:16 +09:00
parent aaebea081d
commit 2b5fbef6cd
3 changed files with 54 additions and 21 deletions
--- a/src/voice/ai.json
+++ b/src/voice/ai.json
@@ -0,0 +1,16 @@
 {
  "eleven_multilingual_v2": {
    "stability": 0.5,
    "similarity_boost": 0.8,
    "style": 0.2,
    "speed": 0.85,
    "pitch": 1.35
  },
  "eleven_v3": {
    "stability": 0.5,
    "similarity_boost": 0.8,
    "style": 0.2,
    "speed": 0.75,
    "pitch": 1.35
  }
 }
--- a/src/voice/mod.rs
+++ b/src/voice/mod.rs
@@ -1,23 +1,20 @@
 pub mod tts;
 pub mod stt;
-/// Load .env file from cwd, setting vars that aren't already set.
+/// Load .env file from $cfg/ai.syui.log/.env
 fn load_dotenv() {
-    for dir in &[".", env!("CARGO_MANIFEST_DIR")] {
+    let path = format!("{}/ai.syui.log/.env", crate::config::config_dir());
-        let path = std::path::Path::new(dir).join(".env");
+    if let Ok(content) = std::fs::read_to_string(&path) {
-        if let Ok(content) = std::fs::read_to_string(&path) {
+        for line in content.lines() {
-            for line in content.lines() {
+            let line = line.trim();
-                let line = line.trim();
+            if line.is_empty() || line.starts_with('#') { continue; }
-                if line.is_empty() || line.starts_with('#') { continue; }
+            if let Some((key, val)) = line.split_once('=') {
-                if let Some((key, val)) = line.split_once('=') {
+                let key = key.trim();
-                    let key = key.trim();
+                let val = val.trim();
-                    let val = val.trim();
+                if std::env::var(key).is_err() {
-                    if std::env::var(key).is_err() {
+                    std::env::set_var(key, val);
                        std::env::set_var(key, val);
                    }
                }
            }
            break;
        }
    }
 }
@@ -68,7 +65,7 @@ impl VoiceSystem {
            Ok(data) => data,
            Err(e) => { eprintln!("tts error: {e}"); return; }
        };
-        if let Err(e) = tts::play_audio(&audio) {
+        if let Err(e) = tts::play_audio(&audio, &self.config.tts_model) {
            eprintln!("audio play error: {e}");
        }
    }
--- a/src/voice/tts.rs
+++ b/src/voice/tts.rs
@@ -1,8 +1,19 @@
-use std::io::{Cursor, Read};
+use std::io::Cursor;
 use rodio::Source;
 use crate::voice::VoiceConfig;
 const VOICE_SETTINGS: &str = include_str!("ai.json");
 /// Look up the voice preset for `model` in the embedded `ai.json`.
 ///
 /// The embedded JSON is parsed on every call; if parsing fails the document
 /// is treated as `Value::Null`. When `model` has no entry, the `"eleven_v3"`
 /// preset is returned instead (which is itself `Value::Null` if that key is
 /// absent). The returned value is an owned clone of the preset.
 fn load_settings(model: &str) -> serde_json::Value {
    let presets = serde_json::from_str::<serde_json::Value>(VOICE_SETTINGS).unwrap_or_default();
    match presets.get(model) {
        Some(preset) => preset.clone(),
        // Unknown model: fall back to the v3 preset.
        None => presets["eleven_v3"].clone(),
    }
 }
 /// Synthesize text to audio bytes via ElevenLabs API.
 pub fn synthesize(config: &VoiceConfig, text: &str) -> Result<Vec<u8>, String> {
    let settings = load_settings(&config.tts_model);
    let url = format!(
        "https://api.elevenlabs.io/v1/text-to-speech/{}",
        config.tts_voice_id
@@ -12,8 +23,10 @@ pub fn synthesize(config: &VoiceConfig, text: &str) -> Result<Vec<u8>, String> {
        "text": text,
        "model_id": config.tts_model,
        "voice_settings": {
-            "stability": 0.5,
+            "stability": settings["stability"],
-            "similarity_boost": 0.75
+            "similarity_boost": settings["similarity_boost"],
            "style": settings["style"],
            "speed": settings["speed"]
        }
    });
@@ -37,8 +50,10 @@ pub fn synthesize(config: &VoiceConfig, text: &str) -> Result<Vec<u8>, String> {
        .map_err(|e| format!("TTS read error: {e}"))
 }
-/// Play audio bytes (MP3) using rodio.
+/// Play audio bytes (MP3) using rodio with pitch shift from ai.json.
-pub fn play_audio(data: &[u8]) -> Result<(), String> {
+pub fn play_audio(data: &[u8], model: &str) -> Result<(), String> {
    let settings = load_settings(model);
    let (_stream, handle) = rodio::OutputStream::try_default()
        .map_err(|e| format!("audio output error: {e}"))?;
    let sink = rodio::Sink::try_new(&handle)
@@ -48,7 +63,12 @@ pub fn play_audio(data: &[u8]) -> Result<(), String> {
    let source = rodio::Decoder::new(cursor)
        .map_err(|e| format!("audio decode error: {e}"))?;
-    sink.append(source);
+    let pitch = std::env::var("TTS_PITCH")
        .ok()
        .and_then(|v| v.parse::<f32>().ok())
        .unwrap_or_else(|| settings["pitch"].as_f64().unwrap_or(1.35) as f32);
    sink.append(source.speed(pitch));
    sink.sleep_until_end();
    Ok(())
 }