From aaebea081d5ffa0fb9b7c02560b6126b7713b4a6 Mon Sep 17 00:00:00 2001
From: syui <syui@syui.ai>
Date: Sun, 29 Mar 2026 22:33:38 +0900
Subject: [PATCH] Replace Google Cloud STT with ElevenLabs STT

---
 .gitignore       |  1 +
 Cargo.toml       |  2 +-
 src/voice/mod.rs |  2 +-
 src/voice/stt.rs | 47 ++++++++++++++++++-----------------------------
 4 files changed, 21 insertions(+), 31 deletions(-)
diff --git a/.gitignore b/.gitignore
index 03377c6..fb77546 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@ Thumbs.db
 /example
 /config
 /model
+/examples
diff --git a/Cargo.toml b/Cargo.toml
index 620f597..0aea9d0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,7 +20,7 @@ libc = "0.2"
 notify = { version = "7", features = ["macos_fsevent"] }
 ratatui = "0.29"
 crossterm = "0.28"
-reqwest = { version = "0.12", features = ["json", "blocking"] }
+reqwest = { version = "0.12", features = ["json", "blocking", "multipart"] }
 rodio = "0.19"
 base64 = "0.22"
 cpal = "0.15"
diff --git a/src/voice/mod.rs b/src/voice/mod.rs
index c3e0150..42732a5 100644
--- a/src/voice/mod.rs
+++ b/src/voice/mod.rs
@@ -40,7 +40,7 @@ impl VoiceConfig {
             tts_api_key,
             tts_voice_id: std::env::var("ELEVENLABS_VOICE_ID").unwrap_or_default(),
             tts_model: std::env::var("ELEVENLABS_MODEL_ID").unwrap_or_else(|_| "eleven_multilingual_v2".into()),
-            stt_language: std::env::var("STT_LANGUAGE").unwrap_or_else(|_| "ja-JP".into()),
+            stt_language: std::env::var("STT_LANGUAGE").unwrap_or_else(|_| "ja".into()),
         })
     }
 
diff --git a/src/voice/stt.rs b/src/voice/stt.rs
index 1841db0..d71ee7a 100644
--- a/src/voice/stt.rs
+++ b/src/voice/stt.rs
@@ -2,7 +2,7 @@ use crate::voice::VoiceConfig;
 use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
 use std::sync::{Arc, Mutex, mpsc};
 
-/// Record audio via VAD and recognize speech via Google Cloud STT.
+/// Record audio via VAD and recognize speech via ElevenLabs STT.
 pub fn recognize(config: &VoiceConfig) -> Result<String, String> {
     let audio = record_vad().map_err(|e| format!("recording: {e}"))?;
     if audio.is_empty() {
@@ -121,32 +121,24 @@ fn record_vad() -> Result<Vec<i16>, String> {
     Ok(resample(&audio_buf, device_rate, 16000, output_len))
 }
 
-/// Send audio to Google Cloud STT and return transcript.
+/// Send audio to ElevenLabs STT and return transcript.
 fn transcribe(config: &VoiceConfig, audio: &[i16]) -> Result<String, String> {
-    let api_key = std::env::var("GOOGLE_API_KEY")
-        .map_err(|_| "GOOGLE_API_KEY not set".to_string())?;
-
-    // Convert i16 samples to WAV bytes
     let wav_data = encode_wav(audio, 16000);
-    let encoded = base64::Engine::encode(&base64::engine::general_purpose::STANDARD, &wav_data);
 
-    let body = serde_json::json!({
-        "config": {
-            "encoding": "LINEAR16",
-            "sampleRateHertz": 16000,
-            "languageCode": config.stt_language,
-        },
-        "audio": {
-            "content": encoded
-        }
-    });
+    let part = reqwest::blocking::multipart::Part::bytes(wav_data)
+        .file_name("audio.wav")
+        .mime_str("audio/wav")
+        .map_err(|e| format!("mime: {e}"))?;
 
-    let url = "https://speech.googleapis.com/v1/speech:recognize";
+    let form = reqwest::blocking::multipart::Form::new()
+        .text("model_id", "scribe_v1")
+        .text("language_code", config.stt_language.clone())
+        .part("file", part);
 
     let client = reqwest::blocking::Client::new();
-    let resp = client.post(url)
-        .header("x-goog-api-key", &api_key)
-        .json(&body)
+    let resp = client.post("https://api.elevenlabs.io/v1/speech-to-text")
+        .header("xi-api-key", &config.tts_api_key)
+        .multipart(form)
         .send()
         .map_err(|e| format!("STT request: {e}"))?;
 
@@ -159,7 +151,7 @@ fn transcribe(config: &VoiceConfig, audio: &[i16]) -> Result<String, String> {
     let json: serde_json::Value = resp.json()
         .map_err(|e| format!("STT parse: {e}"))?;
 
-    let transcript = json["results"][0]["alternatives"][0]["transcript"]
+    let transcript = json["text"]
         .as_str()
         .unwrap_or("")
         .to_string();
@@ -173,20 +165,17 @@ fn encode_wav(samples: &[i16], sample_rate: u32) -> Vec<u8> {
     let file_len = 36 + data_len;
     let mut buf = Vec::with_capacity(file_len as usize + 8);
 
-    // RIFF header
     buf.extend_from_slice(b"RIFF");
     buf.extend_from_slice(&file_len.to_le_bytes());
     buf.extend_from_slice(b"WAVE");
-    // fmt chunk
     buf.extend_from_slice(b"fmt ");
-    buf.extend_from_slice(&16u32.to_le_bytes()); // chunk size
+    buf.extend_from_slice(&16u32.to_le_bytes());
     buf.extend_from_slice(&1u16.to_le_bytes());  // PCM
     buf.extend_from_slice(&1u16.to_le_bytes());  // mono
     buf.extend_from_slice(&sample_rate.to_le_bytes());
-    buf.extend_from_slice(&(sample_rate * 2).to_le_bytes()); // byte rate
-    buf.extend_from_slice(&2u16.to_le_bytes());  // block align
-    buf.extend_from_slice(&16u16.to_le_bytes()); // bits per sample
-    // data chunk
+    buf.extend_from_slice(&(sample_rate * 2).to_le_bytes());
+    buf.extend_from_slice(&2u16.to_le_bytes());
+    buf.extend_from_slice(&16u16.to_le_bytes());
     buf.extend_from_slice(b"data");
     buf.extend_from_slice(&data_len.to_le_bytes());
     for &s in samples {