From 2b5fbef6cd31db950fcf41012fa2e7b4c95c00b3 Mon Sep 17 00:00:00 2001 From: syui Date: Tue, 31 Mar 2026 08:47:16 +0900 Subject: [PATCH] feat(voice): add ai.json voice presets, pitch shift, and config path fix - Voice settings per model in src/voice/ai.json (v2/v3 presets) - Pitch shift via rodio speed() for higher-pitched voice (default 1.35) - Load .env from $cfg/ai.syui.log/.env (fixed path) - Remove Google STT dependency, ElevenLabs-only Co-Authored-By: Claude Opus 4.6 (1M context) --- src/voice/ai.json | 16 ++++++++++++++++ src/voice/mod.rs | 27 ++++++++++++--------------- src/voice/tts.rs | 32 ++++++++++++++++++++++++++------ 3 files changed, 54 insertions(+), 21 deletions(-) create mode 100644 src/voice/ai.json diff --git a/src/voice/ai.json b/src/voice/ai.json new file mode 100644 index 0000000..2f77a08 --- /dev/null +++ b/src/voice/ai.json @@ -0,0 +1,16 @@ +{ + "eleven_multilingual_v2": { + "stability": 0.5, + "similarity_boost": 0.8, + "style": 0.2, + "speed": 0.85, + "pitch": 1.35 + }, + "eleven_v3": { + "stability": 0.5, + "similarity_boost": 0.8, + "style": 0.2, + "speed": 0.75, + "pitch": 1.35 + } +} diff --git a/src/voice/mod.rs b/src/voice/mod.rs index 42732a5..3ba7792 100644 --- a/src/voice/mod.rs +++ b/src/voice/mod.rs @@ -1,23 +1,20 @@ pub mod tts; pub mod stt; -/// Load .env file from cwd, setting vars that aren't already set. 
+/// Load .env file from $cfg/ai.syui.log/.env fn load_dotenv() { - for dir in &[".", env!("CARGO_MANIFEST_DIR")] { - let path = std::path::Path::new(dir).join(".env"); - if let Ok(content) = std::fs::read_to_string(&path) { - for line in content.lines() { - let line = line.trim(); - if line.is_empty() || line.starts_with('#') { continue; } - if let Some((key, val)) = line.split_once('=') { - let key = key.trim(); - let val = val.trim(); - if std::env::var(key).is_err() { - std::env::set_var(key, val); - } + let path = format!("{}/ai.syui.log/.env", crate::config::config_dir()); + if let Ok(content) = std::fs::read_to_string(&path) { + for line in content.lines() { + let line = line.trim(); + if line.is_empty() || line.starts_with('#') { continue; } + if let Some((key, val)) = line.split_once('=') { + let key = key.trim(); + let val = val.trim(); + if std::env::var(key).is_err() { + std::env::set_var(key, val); } } - break; } } } @@ -68,7 +65,7 @@ impl VoiceSystem { Ok(data) => data, Err(e) => { eprintln!("tts error: {e}"); return; } }; - if let Err(e) = tts::play_audio(&audio) { + if let Err(e) = tts::play_audio(&audio, &self.config.tts_model) { eprintln!("audio play error: {e}"); } } diff --git a/src/voice/tts.rs b/src/voice/tts.rs index 1befe41..456341f 100644 --- a/src/voice/tts.rs +++ b/src/voice/tts.rs @@ -1,8 +1,19 @@ -use std::io::{Cursor, Read}; +use std::io::Cursor; +use rodio::Source; use crate::voice::VoiceConfig; +const VOICE_SETTINGS: &str = include_str!("ai.json"); + +/// Load voice settings for the current model from ai.json. +fn load_settings(model: &str) -> serde_json::Value { + let all: serde_json::Value = serde_json::from_str(VOICE_SETTINGS).unwrap_or_default(); + all.get(model).cloned().unwrap_or_else(|| all["eleven_v3"].clone()) +} + /// Synthesize text to audio bytes via ElevenLabs API. 
pub fn synthesize(config: &VoiceConfig, text: &str) -> Result<Vec<u8>, String> { + let settings = load_settings(&config.tts_model); + let url = format!( "https://api.elevenlabs.io/v1/text-to-speech/{}", config.tts_voice_id @@ -12,8 +23,10 @@ pub fn synthesize(config: &VoiceConfig, text: &str) -> Result<Vec<u8>, String> { "text": text, "model_id": config.tts_model, "voice_settings": { - "stability": 0.5, - "similarity_boost": 0.75 + "stability": settings["stability"], + "similarity_boost": settings["similarity_boost"], + "style": settings["style"], + "speed": settings["speed"] } }); @@ -37,8 +50,10 @@ pub fn synthesize(config: &VoiceConfig, text: &str) -> Result<Vec<u8>, String> { .map_err(|e| format!("TTS read error: {e}")) } -/// Play audio bytes (MP3) using rodio. -pub fn play_audio(data: &[u8]) -> Result<(), String> { +/// Play audio bytes (MP3) using rodio with pitch shift from ai.json. +pub fn play_audio(data: &[u8], model: &str) -> Result<(), String> { + let settings = load_settings(model); + let (_stream, handle) = rodio::OutputStream::try_default() .map_err(|e| format!("audio output error: {e}"))?; let sink = rodio::Sink::try_new(&handle) @@ -48,7 +63,12 @@ pub fn play_audio(data: &[u8]) -> Result<(), String> { let source = rodio::Decoder::new(cursor) .map_err(|e| format!("audio decode error: {e}"))?; - sink.append(source); + let pitch = std::env::var("TTS_PITCH") + .ok() + .and_then(|v| v.parse::<f32>().ok()) + .unwrap_or_else(|| settings["pitch"].as_f64().unwrap_or(1.35) as f32); + + sink.append(source.speed(pitch)); sink.sleep_until_end(); Ok(()) }