feat(voice): add voice input/output foundation with rodio, cpal, and webrtc-vad

2026-03-26 07:06:37 +09:00
parent c4dcac1d95
commit 2df28d13b2
10 changed files with 430 additions and 1 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,3 +20,8 @@ libc = "0.2"
 notify = { version = "7", features = ["macos_fsevent"] }
 ratatui = "0.29"
 crossterm = "0.28"
 reqwest = { version = "0.12", features = ["json", "blocking"] }
 rodio = "0.19"
 base64 = "0.22"
 cpal = "0.15"
 webrtc-vad = "0.4"
--- a/examples/voice_test.rs
+++ b/examples/voice_test.rs
@@ -0,0 +1,20 @@
 /// Manual smoke test for the voice pipeline: listen for one utterance and
 /// speak the recognized text back.
 fn main() {
    eprintln!("Voice test: initializing...");

    // Without a usable voice system there is nothing to test.
    let voice = if let Some(system) = aishell::voice::VoiceSystem::new() {
        system
    } else {
        eprintln!("Voice system not available. Check ELEVENLABS_API_KEY in .env");
        return;
    };

    eprintln!("Voice test: listening (speak now)...");
    if let Some(text) = voice.listen() {
        eprintln!("Recognized: {text}");
        eprintln!("Speaking back...");
        voice.speak(&text);
    } else {
        eprintln!("No speech detected.");
    }
 }
--- a/src/agent.rs
+++ b/src/agent.rs
@@ -179,7 +179,10 @@ impl Agent {
    pub fn stop(&mut self) {
        if self.is_running() && !self.stopped {
            self.stopped = true;
            #[cfg(unix)]
            unsafe { libc::kill(self.pid as i32, libc::SIGTERM); }
            #[cfg(windows)]
            { let _ = std::process::Command::new("taskkill").args(["/F", "/PID", &self.pid.to_string()]).output(); }
            self.status = AgentStatus::Error("stopped".to_string());
            self.dirty = true;
            self.log("stopped", "");
@@ -291,7 +294,10 @@ impl Drop for Agent {
    /// Asks the agent's process to terminate when the agent is dropped without
    /// `stopped` having been set beforehand.
    fn drop(&mut self) {
        // Nothing to do when `stopped` is already set.
        if self.stopped {
            return;
        }
        self.stopped = true;

        // SAFETY: `kill` takes plain integers only; no memory is shared with C.
        // NOTE(review): assumes `self.pid` still names the agent's process — confirm.
        #[cfg(unix)]
        unsafe {
            libc::kill(self.pid as i32, libc::SIGTERM);
        }

        // Windows has no SIGTERM equivalent here; the process is force-killed.
        #[cfg(windows)]
        {
            let pid_arg = self.pid.to_string();
            let _ = std::process::Command::new("taskkill")
                .args(["/F", "/PID", pid_arg.as_str()])
                .output();
        }
    }
 }
--- a/src/ai.rs
+++ b/src/ai.rs
@@ -44,7 +44,10 @@ impl ClaudeManager {
    /// Cancels the current request: marks the manager idle, interrupts the
    /// child process, and throws away whatever is still queued on `output_rx`.
    pub fn cancel(&mut self) {
        self.status = StatusKind::Idle;

        // Unix sends SIGINT (an interrupt, not a termination request).
        // SAFETY: `kill` takes plain integers only; no memory is shared with C.
        #[cfg(unix)]
        unsafe {
            libc::kill(self.child_pid as i32, libc::SIGINT);
        }

        // Windows falls back to a forced `taskkill`.
        #[cfg(windows)]
        {
            let pid_arg = self.child_pid.to_string();
            let _ = std::process::Command::new("taskkill")
                .args(["/F", "/PID", pid_arg.as_str()])
                .output();
        }

        // Drain pending output so it is not attributed to a later request.
        loop {
            if self.output_rx.try_recv().is_err() {
                break;
            }
        }
    }
@@ -59,6 +62,9 @@ impl ClaudeManager {
 /// Terminates the child process when the manager goes out of scope.
 impl Drop for ClaudeManager {
    fn drop(&mut self) {
        // SAFETY: `kill` takes plain integers only; no memory is shared with C.
        // NOTE(review): assumes `child_pid` still names the child process — confirm.
        #[cfg(unix)]
        unsafe {
            libc::kill(self.child_pid as i32, libc::SIGTERM);
        }

        // Windows: force-kill via `taskkill`; the outcome is deliberately ignored.
        #[cfg(windows)]
        {
            let pid_arg = self.child_pid.to_string();
            let _ = std::process::Command::new("taskkill")
                .args(["/F", "/PID", pid_arg.as_str()])
                .output();
        }
    }
 }
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -8,3 +8,4 @@ pub mod agent;
 pub mod tui;
 pub mod headless;
 pub mod watch;
 pub mod voice;
--- a/src/main.rs
+++ b/src/main.rs
@@ -26,7 +26,7 @@ fn main() {
            println!("{}", env!("CARGO_PKG_VERSION"));
        }
        Some("help" | "--help" | "-h") => print_help(),
-        None | Some("tui") => {
+        None | Some("tui") | Some("--voice") => {
            // Show logo before entering alternate screen
            eprintln!("\x1b[38;5;226m{}\x1b[0m\n\x1b[1m  aishell\x1b[0m v{}\n",
                LOGO, env!("CARGO_PKG_VERSION"));
--- a/src/tui.rs
+++ b/src/tui.rs
@@ -51,6 +51,9 @@ pub struct App {
    cmd_cache: CommandCache,
    watch_rx: Option<std::sync::mpsc::Receiver<Vec<String>>>,
    voice: Option<std::sync::Arc<crate::voice::VoiceSystem>>,
    voice_input_tx: Option<std::sync::mpsc::Sender<String>>,
    voice_input_rx: Option<std::sync::mpsc::Receiver<String>>,
    mode: Mode,
    input: String,
    input_task: String,
@@ -82,6 +85,13 @@ impl App {
            agent_scroll: 0,
            cmd_cache: CommandCache::new(),
            watch_rx: None,
            voice: if std::env::args().any(|a| a == "--voice") {
                crate::voice::VoiceSystem::new().map(std::sync::Arc::new)
            } else {
                None
            },
            voice_input_tx: None,
            voice_input_rx: None,
            mode: Mode::Ai,
            input: String::new(),
            input_task: String::new(),
@@ -89,6 +99,13 @@ impl App {
            should_quit: false,
        };
        // Setup voice input channel
        if app.voice.is_some() {
            let (tx, rx) = std::sync::mpsc::channel();
            app.voice_input_tx = Some(tx);
            app.voice_input_rx = Some(rx);
        }
        // Send protocol + identity + project context
        if let Some(ref mut claude) = app.claude {
            let cwd = std::env::current_dir()
@@ -128,6 +145,17 @@ impl App {
    }
    fn poll_all(&mut self) {
        // Check for voice input
        if let Some(ref rx) = self.voice_input_rx {
            if let Ok(text) = rx.try_recv() {
                self.ai_output.push_str(&format!("\n---\n[voice] {text}\n"));
                self.ai_scroll = u16::MAX;
                if let Some(ref mut claude) = self.claude {
                    claude.send(&text);
                }
            }
        }
        let mut stream_ended = false;
        if let Some(ref mut claude) = self.claude {
@@ -152,6 +180,24 @@ impl App {
                            &format!("{STATE_DIR}/ai.txt"),
                            self.ai_output.as_bytes(),
                        );
                        // Voice: speak response, then listen for next input
                        if let Some(ref voice) = self.voice {
                            let last = self.ai_output.rsplit("---").next().unwrap_or(&self.ai_output);
                            let text = last.trim().to_string();
                            if !text.is_empty() {
                                let v = voice.clone();
                                let tx = self.voice_input_tx.clone();
                                std::thread::spawn(move || {
                                    v.speak(&text);
                                    // After speaking, listen for user's response
                                    if let Some(heard) = v.listen() {
                                        if let Some(tx) = tx {
                                            let _ = tx.send(heard);
                                        }
                                    }
                                });
                            }
                        }
                        stream_ended = true;
                    }
                }
--- a/src/voice/mod.rs
+++ b/src/voice/mod.rs
@@ -0,0 +1,83 @@
 pub mod tts;
 pub mod stt;
 /// Load a `.env` file, setting only the variables that are not already set.
 ///
 /// The current directory is tried first, then the crate's build-time
 /// `CARGO_MANIFEST_DIR`; only the first file that can be read is applied.
 /// Lines that [`parse_dotenv_line`] rejects are skipped silently.
 fn load_dotenv() {
    for dir in &[".", env!("CARGO_MANIFEST_DIR")] {
        let path = std::path::Path::new(dir).join(".env");
        let content = match std::fs::read_to_string(&path) {
            Ok(content) => content,
            Err(_) => continue,
        };
        for (key, val) in content.lines().filter_map(parse_dotenv_line) {
            // The real environment always wins over the file. `var_os` is used
            // so that a variable holding non-Unicode data still counts as set.
            if std::env::var_os(key).is_none() {
                std::env::set_var(key, val);
            }
        }
        break;
    }
 }

 /// Parse one `.env` line into a trimmed `(key, value)` pair.
 ///
 /// Returns `None` for blank lines, `#` comments, lines without `=`, and
 /// entries that `std::env::set_var` would panic on (empty key, or a NUL
 /// character in the key or value). One matching pair of surrounding single or
 /// double quotes is removed from the value.
 fn parse_dotenv_line(line: &str) -> Option<(&str, &str)> {
    let line = line.trim();
    if line.is_empty() || line.starts_with('#') {
        return None;
    }
    let (key, val) = line.split_once('=')?;
    let key = key.trim();
    let val = val.trim();
    // `KEY="value"` is the common way to write values; keep only the inside.
    let val = ['"', '\'']
        .iter()
        .find_map(|&quote| val.strip_prefix(quote)?.strip_suffix(quote))
        .unwrap_or(val);
    if key.is_empty() || key.contains('\0') || val.contains('\0') {
        return None;
    }
    Some((key, val))
 }
 /// Settings for the voice pipeline, filled from environment variables by
 /// `VoiceConfig::load`.
 pub struct VoiceConfig {
    /// ElevenLabs API key (`ELEVENLABS_API_KEY`); sent as the `xi-api-key` header.
    pub tts_api_key: String,
    /// ElevenLabs voice id (`ELEVENLABS_VOICE_ID`); empty when the variable is unset.
    pub tts_voice_id: String,
    /// ElevenLabs model id (`ELEVENLABS_MODEL_ID`); defaults to `eleven_multilingual_v2`.
    pub tts_model: String,
    /// Language code sent to speech recognition (`STT_LANGUAGE`); defaults to `ja-JP`.
    pub stt_language: String,
 }
 impl VoiceConfig {
    /// Builds the configuration from the process environment.
    ///
    /// A `.env` file is applied first via `load_dotenv`. Returns `None` when
    /// `ELEVENLABS_API_KEY` cannot be read; every other setting falls back to
    /// a default.
    pub fn load() -> Option<Self> {
        // Pull in `.env` values so the lookups below can see them.
        load_dotenv();

        let env_or = |name: &str, fallback: &str| {
            std::env::var(name).unwrap_or_else(|_| fallback.to_string())
        };

        let tts_api_key = std::env::var("ELEVENLABS_API_KEY").ok()?;
        let tts_voice_id = env_or("ELEVENLABS_VOICE_ID", "");
        let tts_model = env_or("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2");
        let stt_language = env_or("STT_LANGUAGE", "ja-JP");

        Some(Self {
            tts_api_key,
            tts_voice_id,
            tts_model,
            stt_language,
        })
    }

    /// Reports whether a non-empty TTS API key is configured.
    pub fn is_available(&self) -> bool {
        let key = self.tts_api_key.as_str();
        !key.is_empty()
    }
 }
 /// Entry point for voice features: speech output via `speak` and speech
 /// input via `listen`.
 pub struct VoiceSystem {
    /// Settings passed to the TTS and STT back ends on every call.
    pub config: VoiceConfig,
 }
 impl VoiceSystem {
    /// Creates the voice system, or returns `None` when the configuration
    /// cannot be loaded or reports itself as unavailable.
    pub fn new() -> Option<Self> {
        VoiceConfig::load()
            .filter(|config| config.is_available())
            .map(|config| Self { config })
    }

    /// Synthesizes `text` and plays it, blocking until playback finishes.
    ///
    /// Whitespace-only text is ignored. Failures are reported on stderr and
    /// otherwise swallowed.
    pub fn speak(&self, text: &str) {
        if text.trim().is_empty() {
            return;
        }
        match tts::synthesize(&self.config, text) {
            Err(e) => eprintln!("tts error: {e}"),
            Ok(audio) => {
                if let Err(e) = tts::play_audio(&audio) {
                    eprintln!("audio play error: {e}");
                }
            }
        }
    }

    /// Records one utterance and returns its transcript.
    ///
    /// Returns `None` when the transcript is empty or recognition failed; a
    /// failure is additionally reported on stderr.
    pub fn listen(&self) -> Option<String> {
        stt::recognize(&self.config)
            .map_err(|e| eprintln!("stt error: {e}"))
            .ok()
            .filter(|text| !text.is_empty())
    }
 }
--- a/src/voice/stt.rs
+++ b/src/voice/stt.rs
@@ -0,0 +1,208 @@
 use crate::voice::VoiceConfig;
 use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
 use std::sync::{Arc, Mutex, mpsc};
 /// Capture one utterance with VAD-based recording and transcribe it via
 /// Google Cloud STT.
 ///
 /// Returns an empty string when the recording contains no audio. Recording
 /// failures are prefixed with `recording: `.
 pub fn recognize(config: &VoiceConfig) -> Result<String, String> {
    match record_vad() {
        Err(e) => Err(format!("recording: {e}")),
        // Nothing was captured, so there is nothing to send for transcription.
        Ok(samples) if samples.is_empty() => Ok(String::new()),
        Ok(samples) => transcribe(config, &samples),
    }
 }
 /// Record from the default input device until the speaker stops talking.
 ///
 /// Captured audio is mixed down to mono, amplified, and classified in 30 ms
 /// frames by WebRTC VAD. Recording starts at the first voiced frame and ends
 /// after roughly 500 ms of trailing silence (once roughly 200 ms of speech has
 /// been seen), when the 8 s frame budget is used up, or when the wall-clock
 /// deadline passes.
 ///
 /// Returns the utterance resampled to 16 kHz mono, or an empty vector when no
 /// speech was detected.
 ///
 /// # Errors
 /// Returns a message when there is no input device, its default config cannot
 /// be queried, or the stream cannot be built or started.
 fn record_vad() -> Result<Vec<i16>, String> {
    let host = cpal::default_host();
    let device = host.default_input_device()
        .ok_or("No input device")?;
    let default_config = device.default_input_config()
        .map_err(|e| format!("input config: {e}"))?;
    let device_rate = default_config.sample_rate().0;
    let device_channels = default_config.channels() as usize;
    let config = cpal::StreamConfig {
        channels: default_config.channels(),
        sample_rate: cpal::SampleRate(device_rate),
        buffer_size: cpal::BufferSize::Default,
    };
    let (tx, rx) = mpsc::channel::<Vec<i16>>();
    // Shared flag telling the audio callback to stop forwarding samples.
    let done = Arc::new(Mutex::new(false));
    let done_clone = done.clone();
    // Fixed software gain applied before conversion to i16.
    let gain: f32 = 8.0;
    // NOTE(review): the callback is typed for f32 samples; a device whose default
    // format is not f32 presumably fails at `build_input_stream` — confirm.
    let stream = device.build_input_stream(
        &config,
        move |data: &[f32], _: &cpal::InputCallbackInfo| {
            if *done_clone.lock().unwrap() { return; }
            // Average the interleaved channels of each sample frame into one mono sample.
            let mono: Vec<i16> = data.chunks(device_channels)
                .map(|ch| {
                    let avg = ch.iter().sum::<f32>() / device_channels as f32;
                    let amplified = (avg * gain).clamp(-1.0, 1.0);
                    (amplified * 32767.0) as i16
                })
                .collect();
            let _ = tx.send(mono);
        },
        |err| eprintln!("audio error: {err}"),
        None,
    ).map_err(|e| format!("build stream: {e}"))?;
    stream.play().map_err(|e| format!("play: {e}"))?;
    let frame_ms: u32 = 30;
    let device_frame_size = (device_rate * frame_ms / 1000) as usize;
    let vad_frame_size: usize = 480; // 30ms @ 16kHz
    let silence_frames: u32 = 500 / frame_ms;
    let min_speech_frames: u32 = 200 / frame_ms;
    let max_ms: u32 = 8000; // 8 second max
    let max_frames: u32 = max_ms / frame_ms;
    // The frame budget only advances while the device delivers samples, so a
    // stalled stream would otherwise keep this function waiting forever. The
    // extra two seconds keep the frame budget the primary limit in normal use.
    let deadline = std::time::Instant::now()
        + std::time::Duration::from_millis(u64::from(max_ms) + 2000);
    let mut vad = webrtc_vad::Vad::new_with_rate_and_mode(
        webrtc_vad::SampleRate::Rate16kHz,
        webrtc_vad::VadMode::Quality,
    );
    let mut frame_buf: Vec<i16> = Vec::with_capacity(device_frame_size);
    let mut audio_buf: Vec<i16> = Vec::new();
    let mut recording = false;
    let mut silence_count: u32 = 0;
    let mut speech_count: u32 = 0;
    let mut total_frames: u32 = 0;
    eprintln!("  listening...");
    loop {
        match rx.recv_timeout(std::time::Duration::from_millis(100)) {
            Ok(samples) => {
                for sample in samples {
                    frame_buf.push(sample);
                    if frame_buf.len() >= device_frame_size {
                        total_frames += 1;
                        // The VAD runs at 16 kHz; classify a resampled copy of the frame
                        // but keep the device-rate samples for the final recording.
                        let vad_frame = resample(&frame_buf, device_rate, 16000, vad_frame_size);
                        let is_speech = vad.is_voice_segment(&vad_frame).unwrap_or(false);
                        if is_speech {
                            recording = true;
                            silence_count = 0;
                            speech_count += 1;
                            audio_buf.extend_from_slice(&frame_buf);
                        } else if recording {
                            silence_count += 1;
                            audio_buf.extend_from_slice(&frame_buf);
                            if silence_count >= silence_frames && speech_count >= min_speech_frames {
                                *done.lock().unwrap() = true;
                                break;
                            }
                        }
                        if total_frames >= max_frames {
                            *done.lock().unwrap() = true;
                            break;
                        }
                        frame_buf.clear();
                    }
                }
            }
            Err(mpsc::RecvTimeoutError::Timeout) => {}
            Err(mpsc::RecvTimeoutError::Disconnected) => break,
        }
        if *done.lock().unwrap() { break; }
        if std::time::Instant::now() >= deadline { break; }
    }
    drop(stream);
    if audio_buf.is_empty() {
        return Ok(Vec::new());
    }
    let output_len = (audio_buf.len() as f32 * 16000.0 / device_rate as f32) as usize;
    Ok(resample(&audio_buf, device_rate, 16000, output_len))
 }
 /// Send 16 kHz mono PCM to Google Cloud STT and return the transcript.
 ///
 /// The samples are wrapped in a WAV container, base64-encoded, and posted to
 /// the synchronous `speech:recognize` endpoint using `config.stt_language` as
 /// the language code. The top alternative of every returned result is
 /// concatenated; an empty string means nothing was recognized.
 ///
 /// # Errors
 /// Returns a message when `GOOGLE_API_KEY` is not set, the request fails, the
 /// API answers with a non-success status, or the response is not valid JSON.
 fn transcribe(config: &VoiceConfig, audio: &[i16]) -> Result<String, String> {
    let api_key = std::env::var("GOOGLE_API_KEY")
        .map_err(|_| "GOOGLE_API_KEY not set".to_string())?;
    // Convert i16 samples to WAV bytes
    let wav_data = encode_wav(audio, 16000);
    let encoded = base64::Engine::encode(&base64::engine::general_purpose::STANDARD, &wav_data);
    let body = serde_json::json!({
        "config": {
            "encoding": "LINEAR16",
            "sampleRateHertz": 16000,
            "languageCode": config.stt_language,
        },
        "audio": {
            "content": encoded
        }
    });
    // The key travels in a header rather than the query string: request errors
    // are printed to stderr and their text includes the URL.
    let client = reqwest::blocking::Client::new();
    let resp = client.post("https://speech.googleapis.com/v1/speech:recognize")
        .header("x-goog-api-key", api_key.as_str())
        .json(&body)
        .send()
        .map_err(|e| format!("STT request: {e}"))?;
    if !resp.status().is_success() {
        let status = resp.status();
        let body = resp.text().unwrap_or_default();
        return Err(format!("STT API error {status}: {body}"));
    }
    let json: serde_json::Value = resp.json()
        .map_err(|e| format!("STT parse: {e}"))?;
    // Each result covers a consecutive portion of the audio; reading only the
    // first one would drop the rest of a longer utterance.
    let transcript = json["results"]
        .as_array()
        .map(|results| {
            results.iter()
                .filter_map(|result| result["alternatives"][0]["transcript"].as_str())
                .collect::<String>()
        })
        .unwrap_or_default();
    Ok(transcript)
 }
 /// Wrap 16-bit mono PCM samples in a 44-byte RIFF/WAVE header.
 ///
 /// All multi-byte fields and samples are written little-endian.
 fn encode_wav(samples: &[i16], sample_rate: u32) -> Vec<u8> {
    const PCM_FORMAT: u16 = 1;
    const CHANNELS: u16 = 1;
    const BLOCK_ALIGN: u16 = 2;
    const BITS_PER_SAMPLE: u16 = 16;
    const FMT_CHUNK_LEN: u32 = 16;

    let payload_len = (samples.len() * 2) as u32;
    // RIFF size field: everything after the first 8 bytes of the file.
    let riff_len = 36 + payload_len;
    let byte_rate = sample_rate * 2;

    let mut wav = Vec::with_capacity(riff_len as usize + 8);

    // RIFF container header.
    wav.extend_from_slice(b"RIFF");
    wav.extend_from_slice(&riff_len.to_le_bytes());
    wav.extend_from_slice(b"WAVE");

    // Format chunk.
    wav.extend_from_slice(b"fmt ");
    wav.extend_from_slice(&FMT_CHUNK_LEN.to_le_bytes());
    wav.extend_from_slice(&PCM_FORMAT.to_le_bytes());
    wav.extend_from_slice(&CHANNELS.to_le_bytes());
    wav.extend_from_slice(&sample_rate.to_le_bytes());
    wav.extend_from_slice(&byte_rate.to_le_bytes());
    wav.extend_from_slice(&BLOCK_ALIGN.to_le_bytes());
    wav.extend_from_slice(&BITS_PER_SAMPLE.to_le_bytes());

    // Data chunk: length followed by the raw samples.
    wav.extend_from_slice(b"data");
    wav.extend_from_slice(&payload_len.to_le_bytes());
    wav.extend(samples.iter().flat_map(|sample| sample.to_le_bytes()));

    wav
 }
 /// Convert `input` from `from_rate` to `to_rate` by picking the nearest earlier
 /// source sample for each output position.
 ///
 /// When both rates are equal the input is returned unchanged and `output_len`
 /// is ignored. Otherwise exactly `output_len` samples are produced; positions
 /// that fall past the end of `input` are filled with zero.
 fn resample(input: &[i16], from_rate: u32, to_rate: u32, output_len: usize) -> Vec<i16> {
    if from_rate == to_rate {
        return input.to_vec();
    }

    // Number of source samples advanced per output sample.
    let step = from_rate as f64 / to_rate as f64;

    let mut out = Vec::with_capacity(output_len);
    for out_idx in 0..output_len {
        let src_idx = (out_idx as f64 * step) as usize;
        let sample = match input.get(src_idx) {
            Some(&value) => value,
            None => 0,
        };
        out.push(sample);
    }
    out
 }
--- a/src/voice/tts.rs
+++ b/src/voice/tts.rs
@@ -0,0 +1,54 @@
 use std::io::{Cursor, Read};
 use crate::voice::VoiceConfig;
 /// Synthesize text to audio bytes via ElevenLabs API.
 ///
 /// Posts `text` with `config.tts_model` to the text-to-speech endpoint of the
 /// voice named by `config.tts_voice_id`, requesting `audio/mpeg`, and returns
 /// the raw response body.
 ///
 /// # Errors
 /// Returns a message when the voice id is empty, the request fails, the API
 /// answers with a non-success status, or the body cannot be read.
 pub fn synthesize(config: &VoiceConfig, text: &str) -> Result<Vec<u8>, String> {
    // The voice id defaults to an empty string when its variable is unset. That
    // would produce a request to `.../text-to-speech/` and an opaque API error,
    // so fail early with a message that names the setting.
    let voice_id = config.tts_voice_id.trim();
    if voice_id.is_empty() {
        return Err("TTS voice id not set (ELEVENLABS_VOICE_ID)".to_string());
    }
    let url = format!(
        "https://api.elevenlabs.io/v1/text-to-speech/{}",
        voice_id
    );
    let body = serde_json::json!({
        "text": text,
        "model_id": config.tts_model,
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.75
        }
    });
    let client = reqwest::blocking::Client::new();
    let resp = client.post(&url)
        .header("xi-api-key", &config.tts_api_key)
        .header("Content-Type", "application/json")
        .header("Accept", "audio/mpeg")
        .json(&body)
        .send()
        .map_err(|e| format!("TTS request failed: {e}"))?;
    if !resp.status().is_success() {
        let status = resp.status();
        let body = resp.text().unwrap_or_default();
        return Err(format!("TTS API error {status}: {body}"));
    }
    resp.bytes()
        .map(|b| b.to_vec())
        .map_err(|e| format!("TTS read error: {e}"))
 }
 /// Decode encoded audio bytes (MP3) and play them on the default output
 /// device with rodio, blocking until playback has finished.
 ///
 /// # Errors
 /// Returns a message when the output stream or sink cannot be opened, or the
 /// data cannot be decoded.
 pub fn play_audio(data: &[u8]) -> Result<(), String> {
    // The stream half is bound to a name so it lives until the function returns.
    let (_output, handle) = match rodio::OutputStream::try_default() {
        Ok(pair) => pair,
        Err(e) => return Err(format!("audio output error: {e}")),
    };
    let sink = match rodio::Sink::try_new(&handle) {
        Ok(sink) => sink,
        Err(e) => return Err(format!("audio sink error: {e}")),
    };

    // The decoder takes an owned reader, so the bytes are copied.
    let reader = Cursor::new(data.to_vec());
    let decoded = match rodio::Decoder::new(reader) {
        Ok(decoded) => decoded,
        Err(e) => return Err(format!("audio decode error: {e}")),
    };

    sink.append(decoded);
    sink.sleep_until_end();
    Ok(())
 }