From aaebea081d5ffa0fb9b7c02560b6126b7713b4a6 Mon Sep 17 00:00:00 2001 From: syui Date: Sun, 29 Mar 2026 22:33:38 +0900 Subject: [PATCH] rm gcloud --- .gitignore | 1 + Cargo.toml | 2 +- src/voice/mod.rs | 2 +- src/voice/stt.rs | 47 ++++++++++++++++++----------------------------- 4 files changed, 21 insertions(+), 31 deletions(-) diff --git a/.gitignore b/.gitignore index 03377c6..fb77546 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ Thumbs.db /example /config /model +/examples diff --git a/Cargo.toml b/Cargo.toml index 620f597..0aea9d0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ libc = "0.2" notify = { version = "7", features = ["macos_fsevent"] } ratatui = "0.29" crossterm = "0.28" -reqwest = { version = "0.12", features = ["json", "blocking"] } +reqwest = { version = "0.12", features = ["json", "blocking", "multipart"] } rodio = "0.19" base64 = "0.22" cpal = "0.15" diff --git a/src/voice/mod.rs b/src/voice/mod.rs index c3e0150..42732a5 100644 --- a/src/voice/mod.rs +++ b/src/voice/mod.rs @@ -40,7 +40,7 @@ impl VoiceConfig { tts_api_key, tts_voice_id: std::env::var("ELEVENLABS_VOICE_ID").unwrap_or_default(), tts_model: std::env::var("ELEVENLABS_MODEL_ID").unwrap_or_else(|_| "eleven_multilingual_v2".into()), - stt_language: std::env::var("STT_LANGUAGE").unwrap_or_else(|_| "ja-JP".into()), + stt_language: std::env::var("STT_LANGUAGE").unwrap_or_else(|_| "ja".into()), }) } diff --git a/src/voice/stt.rs b/src/voice/stt.rs index 1841db0..d71ee7a 100644 --- a/src/voice/stt.rs +++ b/src/voice/stt.rs @@ -2,7 +2,7 @@ use crate::voice::VoiceConfig; use cpal::traits::{DeviceTrait, HostTrait, StreamTrait}; use std::sync::{Arc, Mutex, mpsc}; -/// Record audio via VAD and recognize speech via Google Cloud STT. +/// Record audio via VAD and recognize speech via ElevenLabs STT. pub fn recognize(config: &VoiceConfig) -> Result { let audio = record_vad().map_err(|e| format!("recording: {e}"))?; if audio.is_empty() { @@ -121,32 +121,24 @@ fn record_vad() -> Result, String> { Ok(resample(&audio_buf, device_rate, 16000, output_len)) } -/// Send audio to Google Cloud STT and return transcript. +/// Send audio to ElevenLabs STT and return transcript. fn transcribe(config: &VoiceConfig, audio: &[i16]) -> Result { - let api_key = std::env::var("GOOGLE_API_KEY") - .map_err(|_| "GOOGLE_API_KEY not set".to_string())?; - - // Convert i16 samples to WAV bytes let wav_data = encode_wav(audio, 16000); - let encoded = base64::Engine::encode(&base64::engine::general_purpose::STANDARD, &wav_data); - let body = serde_json::json!({ - "config": { - "encoding": "LINEAR16", - "sampleRateHertz": 16000, - "languageCode": config.stt_language, - }, - "audio": { - "content": encoded - } - }); + let part = reqwest::blocking::multipart::Part::bytes(wav_data) + .file_name("audio.wav") + .mime_str("audio/wav") + .map_err(|e| format!("mime: {e}"))?; - let url = "https://speech.googleapis.com/v1/speech:recognize"; + let form = reqwest::blocking::multipart::Form::new() + .text("model_id", "scribe_v1") + .text("language_code", config.stt_language.clone()) + .part("file", part); let client = reqwest::blocking::Client::new(); - let resp = client.post(url) - .header("x-goog-api-key", &api_key) - .json(&body) + let resp = client.post("https://api.elevenlabs.io/v1/speech-to-text") + .header("xi-api-key", &config.tts_api_key) + .multipart(form) .send() .map_err(|e| format!("STT request: {e}"))?; @@ -159,7 +151,7 @@ fn transcribe(config: &VoiceConfig, audio: &[i16]) -> Result { let json: serde_json::Value = resp.json() .map_err(|e| format!("STT parse: {e}"))?; - let transcript = json["results"][0]["alternatives"][0]["transcript"] + let transcript = json["text"] .as_str() .unwrap_or("") .to_string(); @@ -173,20 +165,17 @@ fn encode_wav(samples: &[i16], sample_rate: u32) -> Vec { let file_len = 36 + data_len; let mut buf = Vec::with_capacity(file_len as usize + 8); - // RIFF header buf.extend_from_slice(b"RIFF"); buf.extend_from_slice(&file_len.to_le_bytes()); buf.extend_from_slice(b"WAVE"); - // fmt chunk buf.extend_from_slice(b"fmt "); - buf.extend_from_slice(&16u32.to_le_bytes()); // chunk size + buf.extend_from_slice(&16u32.to_le_bytes()); buf.extend_from_slice(&1u16.to_le_bytes()); // PCM buf.extend_from_slice(&1u16.to_le_bytes()); // mono buf.extend_from_slice(&sample_rate.to_le_bytes()); - buf.extend_from_slice(&(sample_rate * 2).to_le_bytes()); // byte rate - buf.extend_from_slice(&2u16.to_le_bytes()); // block align - buf.extend_from_slice(&16u16.to_le_bytes()); // bits per sample - // data chunk + buf.extend_from_slice(&(sample_rate * 2).to_le_bytes()); + buf.extend_from_slice(&2u16.to_le_bytes()); + buf.extend_from_slice(&16u16.to_le_bytes()); buf.extend_from_slice(b"data"); buf.extend_from_slice(&data_len.to_le_bytes()); for &s in samples {