rm gcloud
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -26,3 +26,4 @@ Thumbs.db
|
|||||||
/example
|
/example
|
||||||
/config
|
/config
|
||||||
/model
|
/model
|
||||||
|
/examples
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ libc = "0.2"
|
|||||||
notify = { version = "7", features = ["macos_fsevent"] }
|
notify = { version = "7", features = ["macos_fsevent"] }
|
||||||
ratatui = "0.29"
|
ratatui = "0.29"
|
||||||
crossterm = "0.28"
|
crossterm = "0.28"
|
||||||
reqwest = { version = "0.12", features = ["json", "blocking"] }
|
reqwest = { version = "0.12", features = ["json", "blocking", "multipart"] }
|
||||||
rodio = "0.19"
|
rodio = "0.19"
|
||||||
base64 = "0.22"
|
base64 = "0.22"
|
||||||
cpal = "0.15"
|
cpal = "0.15"
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ impl VoiceConfig {
|
|||||||
tts_api_key,
|
tts_api_key,
|
||||||
tts_voice_id: std::env::var("ELEVENLABS_VOICE_ID").unwrap_or_default(),
|
tts_voice_id: std::env::var("ELEVENLABS_VOICE_ID").unwrap_or_default(),
|
||||||
tts_model: std::env::var("ELEVENLABS_MODEL_ID").unwrap_or_else(|_| "eleven_multilingual_v2".into()),
|
tts_model: std::env::var("ELEVENLABS_MODEL_ID").unwrap_or_else(|_| "eleven_multilingual_v2".into()),
|
||||||
stt_language: std::env::var("STT_LANGUAGE").unwrap_or_else(|_| "ja-JP".into()),
|
stt_language: std::env::var("STT_LANGUAGE").unwrap_or_else(|_| "ja".into()),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ use crate::voice::VoiceConfig;
|
|||||||
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
|
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
|
||||||
use std::sync::{Arc, Mutex, mpsc};
|
use std::sync::{Arc, Mutex, mpsc};
|
||||||
|
|
||||||
/// Record audio via VAD and recognize speech via Google Cloud STT.
|
/// Record audio via VAD and recognize speech via ElevenLabs STT.
|
||||||
pub fn recognize(config: &VoiceConfig) -> Result<String, String> {
|
pub fn recognize(config: &VoiceConfig) -> Result<String, String> {
|
||||||
let audio = record_vad().map_err(|e| format!("recording: {e}"))?;
|
let audio = record_vad().map_err(|e| format!("recording: {e}"))?;
|
||||||
if audio.is_empty() {
|
if audio.is_empty() {
|
||||||
@@ -121,32 +121,24 @@ fn record_vad() -> Result<Vec<i16>, String> {
|
|||||||
Ok(resample(&audio_buf, device_rate, 16000, output_len))
|
Ok(resample(&audio_buf, device_rate, 16000, output_len))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Send audio to Google Cloud STT and return transcript.
|
/// Send audio to ElevenLabs STT and return transcript.
|
||||||
fn transcribe(config: &VoiceConfig, audio: &[i16]) -> Result<String, String> {
|
fn transcribe(config: &VoiceConfig, audio: &[i16]) -> Result<String, String> {
|
||||||
let api_key = std::env::var("GOOGLE_API_KEY")
|
|
||||||
.map_err(|_| "GOOGLE_API_KEY not set".to_string())?;
|
|
||||||
|
|
||||||
// Convert i16 samples to WAV bytes
|
|
||||||
let wav_data = encode_wav(audio, 16000);
|
let wav_data = encode_wav(audio, 16000);
|
||||||
let encoded = base64::Engine::encode(&base64::engine::general_purpose::STANDARD, &wav_data);
|
|
||||||
|
|
||||||
let body = serde_json::json!({
|
let part = reqwest::blocking::multipart::Part::bytes(wav_data)
|
||||||
"config": {
|
.file_name("audio.wav")
|
||||||
"encoding": "LINEAR16",
|
.mime_str("audio/wav")
|
||||||
"sampleRateHertz": 16000,
|
.map_err(|e| format!("mime: {e}"))?;
|
||||||
"languageCode": config.stt_language,
|
|
||||||
},
|
|
||||||
"audio": {
|
|
||||||
"content": encoded
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
let url = "https://speech.googleapis.com/v1/speech:recognize";
|
let form = reqwest::blocking::multipart::Form::new()
|
||||||
|
.text("model_id", "scribe_v1")
|
||||||
|
.text("language_code", config.stt_language.clone())
|
||||||
|
.part("file", part);
|
||||||
|
|
||||||
let client = reqwest::blocking::Client::new();
|
let client = reqwest::blocking::Client::new();
|
||||||
let resp = client.post(url)
|
let resp = client.post("https://api.elevenlabs.io/v1/speech-to-text")
|
||||||
.header("x-goog-api-key", &api_key)
|
.header("xi-api-key", &config.tts_api_key)
|
||||||
.json(&body)
|
.multipart(form)
|
||||||
.send()
|
.send()
|
||||||
.map_err(|e| format!("STT request: {e}"))?;
|
.map_err(|e| format!("STT request: {e}"))?;
|
||||||
|
|
||||||
@@ -159,7 +151,7 @@ fn transcribe(config: &VoiceConfig, audio: &[i16]) -> Result<String, String> {
|
|||||||
let json: serde_json::Value = resp.json()
|
let json: serde_json::Value = resp.json()
|
||||||
.map_err(|e| format!("STT parse: {e}"))?;
|
.map_err(|e| format!("STT parse: {e}"))?;
|
||||||
|
|
||||||
let transcript = json["results"][0]["alternatives"][0]["transcript"]
|
let transcript = json["text"]
|
||||||
.as_str()
|
.as_str()
|
||||||
.unwrap_or("")
|
.unwrap_or("")
|
||||||
.to_string();
|
.to_string();
|
||||||
@@ -173,20 +165,17 @@ fn encode_wav(samples: &[i16], sample_rate: u32) -> Vec<u8> {
|
|||||||
let file_len = 36 + data_len;
|
let file_len = 36 + data_len;
|
||||||
let mut buf = Vec::with_capacity(file_len as usize + 8);
|
let mut buf = Vec::with_capacity(file_len as usize + 8);
|
||||||
|
|
||||||
// RIFF header
|
|
||||||
buf.extend_from_slice(b"RIFF");
|
buf.extend_from_slice(b"RIFF");
|
||||||
buf.extend_from_slice(&file_len.to_le_bytes());
|
buf.extend_from_slice(&file_len.to_le_bytes());
|
||||||
buf.extend_from_slice(b"WAVE");
|
buf.extend_from_slice(b"WAVE");
|
||||||
// fmt chunk
|
|
||||||
buf.extend_from_slice(b"fmt ");
|
buf.extend_from_slice(b"fmt ");
|
||||||
buf.extend_from_slice(&16u32.to_le_bytes()); // chunk size
|
buf.extend_from_slice(&16u32.to_le_bytes());
|
||||||
buf.extend_from_slice(&1u16.to_le_bytes()); // PCM
|
buf.extend_from_slice(&1u16.to_le_bytes()); // PCM
|
||||||
buf.extend_from_slice(&1u16.to_le_bytes()); // mono
|
buf.extend_from_slice(&1u16.to_le_bytes()); // mono
|
||||||
buf.extend_from_slice(&sample_rate.to_le_bytes());
|
buf.extend_from_slice(&sample_rate.to_le_bytes());
|
||||||
buf.extend_from_slice(&(sample_rate * 2).to_le_bytes()); // byte rate
|
buf.extend_from_slice(&(sample_rate * 2).to_le_bytes());
|
||||||
buf.extend_from_slice(&2u16.to_le_bytes()); // block align
|
buf.extend_from_slice(&2u16.to_le_bytes());
|
||||||
buf.extend_from_slice(&16u16.to_le_bytes()); // bits per sample
|
buf.extend_from_slice(&16u16.to_le_bytes());
|
||||||
// data chunk
|
|
||||||
buf.extend_from_slice(b"data");
|
buf.extend_from_slice(b"data");
|
||||||
buf.extend_from_slice(&data_len.to_le_bytes());
|
buf.extend_from_slice(&data_len.to_le_bytes());
|
||||||
for &s in samples {
|
for &s in samples {
|
||||||
|
|||||||
Reference in New Issue
Block a user