update gpt

2025-06-06 03:18:20 +09:00
parent a9dca2fe38
commit c0e4dc63ea
18 changed files with 2827 additions and 51 deletions
--- a/src/translator/markdown_parser.rs
+++ b/src/translator/markdown_parser.rs
@@ -0,0 +1,253 @@
+use anyhow::Result;
+use regex::Regex;
+use super::MarkdownSection;
+
+/// Line-oriented Markdown splitter used by the translator.
+///
+/// Holds one pre-compiled pattern per Markdown construct it looks for, so
+/// parsing never has to compile a regex.
+#[derive(Debug, Clone)]
+pub struct MarkdownParser {
+    /// A complete fenced code block: optional language tag plus body.
+    code_block_regex: Regex,
+    /// An ATX header line: one to six `#`, whitespace, then the title.
+    header_regex: Regex,
+    /// An inline link `[text](url)`.
+    link_regex: Regex,
+    /// An inline image `![alt](url)`.
+    image_regex: Regex,
+    /// A table row: a line that starts and ends with `|`.
+    table_regex: Regex,
+    /// A bullet list item introduced by `-`, `*` or `+`.
+    list_regex: Regex,
+    /// A block-quote line introduced by `>`.
+    quote_regex: Regex,
+}
+
+impl MarkdownParser {
+    /// Creates a parser with every Markdown pattern compiled up front.
+    ///
+    /// # Panics
+    ///
+    /// Panics if one of the patterns fails to compile; they are all fixed
+    /// literals.
+    pub fn new() -> Self {
+        let compile = |pattern: &str| Regex::new(pattern).unwrap();
+
+        let code_block_regex = compile(r"```([a-zA-Z0-9]*)\n([\s\S]*?)\n```");
+        let header_regex = compile(r"^(#{1,6})\s+(.+)$");
+        let link_regex = compile(r"\[([^\]]+)\]\(([^)]+)\)");
+        let image_regex = compile(r"!\[([^\]]*)\]\(([^)]+)\)");
+        let table_regex = compile(r"^\|.*\|$");
+        let list_regex = compile(r"^[\s]*[-*+]\s+(.+)$");
+        let quote_regex = compile(r"^>\s+(.+)$");
+
+        Self {
+            code_block_regex,
+            header_regex,
+            link_regex,
+            image_regex,
+            table_regex,
+            list_regex,
+            quote_regex,
+        }
+    }
+    
+    /// Splits `content` into an ordered list of [`MarkdownSection`]s.
+    ///
+    /// The input is scanned line by line. Fenced code blocks, headers, table
+    /// rows, block quotes and list items each become their own section. Any
+    /// other line is buffered and handed to `parse_text_sections` as soon as
+    /// the next block-level element, or the end of the input, is reached.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error returned by the code-block, table or text helpers.
+    pub fn parse_markdown(&self, content: &str) -> Result<Vec<MarkdownSection>> {
+        let mut sections = Vec::new();
+        let mut current_text = String::new();
+        let lines: Vec<&str> = content.lines().collect();
+        let mut i = 0;
+
+        while i < lines.len() {
+            let line = lines[i];
+
+            // Each branch yields how many input lines it consumed.
+            let lines_consumed = if line.starts_with("```") {
+                self.flush_text(&mut current_text, &mut sections)?;
+                let (code_section, consumed) = self.parse_code_block(&lines[i..])?;
+                sections.push(code_section);
+                consumed
+            } else if let Some(caps) = self.header_regex.captures(line) {
+                self.flush_text(&mut current_text, &mut sections)?;
+                // The pattern limits the run of `#` to 1..=6, so this fits in a u8.
+                let level = caps[1].len() as u8;
+                sections.push(MarkdownSection::Header(caps[2].to_string(), level));
+                1
+            } else if self.table_regex.is_match(line) {
+                self.flush_text(&mut current_text, &mut sections)?;
+                let (table_section, consumed) = self.parse_table(&lines[i..])?;
+                sections.push(table_section);
+                consumed
+            } else if let Some(caps) = self.quote_regex.captures(line) {
+                self.flush_text(&mut current_text, &mut sections)?;
+                sections.push(MarkdownSection::Quote(caps[1].to_string()));
+                1
+            } else if let Some(caps) = self.list_regex.captures(line) {
+                self.flush_text(&mut current_text, &mut sections)?;
+                sections.push(MarkdownSection::List(caps[1].to_string()));
+                1
+            } else {
+                // Plain text: keep buffering until a block element shows up.
+                current_text.push_str(line);
+                current_text.push('\n');
+                1
+            };
+
+            i += lines_consumed;
+        }
+
+        // Whatever is still buffered belongs to the trailing paragraph.
+        self.flush_text(&mut current_text, &mut sections)?;
+
+        Ok(sections)
+    }
+
+    /// Parses the buffered plain text into sections and empties the buffer.
+    ///
+    /// A buffer holding only whitespace is left untouched, so leading blank
+    /// lines stay attached to the text that follows them.
+    fn flush_text(&self, buffer: &mut String, sections: &mut Vec<MarkdownSection>) -> Result<()> {
+        if !buffer.trim().is_empty() {
+            sections.extend(self.parse_text_sections(buffer.as_str())?);
+            buffer.clear();
+        }
+        Ok(())
+    }
+    
+    /// Parses the fenced code block that starts at `lines[0]`.
+    ///
+    /// Returns the `Code` section together with the number of input lines it
+    /// spans (opening fence, body and closing fence). When no closing fence
+    /// exists, the block runs to the end of the input and every remaining
+    /// line is reported as consumed, so the caller does not parse the same
+    /// lines a second time as ordinary text.
+    ///
+    /// The language is whatever follows the opening fence, trimmed; an empty
+    /// or whitespace-only tag yields `None`.
+    ///
+    /// # Errors
+    ///
+    /// Fails when `lines` is empty or its first line is not a fence.
+    fn parse_code_block(&self, lines: &[&str]) -> Result<(MarkdownSection, usize)> {
+        let (opening, body) = match lines.split_first() {
+            Some((first, rest)) if first.starts_with("```") => (*first, rest),
+            _ => anyhow::bail!("Not a code block"),
+        };
+
+        // The fence is three ASCII bytes, so slicing at 3 is always on a char boundary.
+        let info = opening[3..].trim();
+        let language = if info.is_empty() {
+            None
+        } else {
+            Some(info.to_string())
+        };
+
+        let closing = body.iter().position(|line| line.starts_with("```"));
+        let (content_lines, lines_consumed) = match closing {
+            // +1 for the opening fence, +1 for the closing fence.
+            Some(index) => (&body[..index], index + 2),
+            // Unterminated fence: the rest of the input is code.
+            None => (body, lines.len()),
+        };
+
+        let content = content_lines.join("\n");
+
+        Ok((MarkdownSection::Code(content, language), lines_consumed))
+    }
+    
+    /// Collects the run of consecutive table rows at the start of `lines`.
+    ///
+    /// Returns a `Table` section holding those rows joined with `\n`, plus
+    /// the number of rows taken. Scanning stops at the first line that is not
+    /// a table row.
+    fn parse_table(&self, lines: &[&str]) -> Result<(MarkdownSection, usize)> {
+        let rows: Vec<&str> = lines
+            .iter()
+            .copied()
+            .take_while(|row| self.table_regex.is_match(row))
+            .collect();
+        let row_count = rows.len();
+
+        Ok((MarkdownSection::Table(rows.join("\n")), row_count))
+    }
+    
+    /// Splits a run of plain text into `Text`, `Link` and `Image` sections,
+    /// in document order.
+    ///
+    /// Images are located first so that the `[alt](url)` part of an image is
+    /// never mistaken for a link. The text before each image, and the text
+    /// after the last one, is then scanned for links, so a link is extracted
+    /// whether it appears before or after an image.
+    /// Whitespace-only fragments between matches are dropped.
+    fn parse_text_sections(&self, text: &str) -> Result<Vec<MarkdownSection>> {
+        let mut sections = Vec::new();
+        let mut remaining = text;
+
+        while let Some(caps) = self.image_regex.captures(remaining) {
+            let whole = caps.get(0).unwrap();
+
+            self.push_text_with_links(&remaining[..whole.start()], &mut sections);
+            sections.push(MarkdownSection::Image(
+                caps[1].to_string(),
+                caps[2].to_string(),
+            ));
+
+            remaining = &remaining[whole.end()..];
+        }
+
+        self.push_text_with_links(remaining, &mut sections);
+
+        Ok(sections)
+    }
+
+    /// Appends `text` to `sections`, turning every `[text](url)` into a `Link`
+    /// and the fragments around it into `Text`.
+    fn push_text_with_links(&self, text: &str, sections: &mut Vec<MarkdownSection>) {
+        let mut remaining = text;
+
+        while let Some(caps) = self.link_regex.captures(remaining) {
+            let whole = caps.get(0).unwrap();
+            let before = &remaining[..whole.start()];
+
+            if !before.trim().is_empty() {
+                sections.push(MarkdownSection::Text(before.to_string()));
+            }
+            sections.push(MarkdownSection::Link(
+                caps[1].to_string(),
+                caps[2].to_string(),
+            ));
+
+            // Advance by slicing; no copy of the tail is needed.
+            remaining = &remaining[whole.end()..];
+        }
+
+        if !remaining.trim().is_empty() {
+            sections.push(MarkdownSection::Text(remaining.to_string()));
+        }
+    }
+    
+    /// Serialises `sections` back into a single Markdown string.
+    ///
+    /// Text, links and images are emitted inline with nothing added after
+    /// them. Code blocks, headers, tables, list items and quotes are each
+    /// followed by a newline. List items are always written with a `-`
+    /// marker.
+    pub fn rebuild_markdown(&self, sections: Vec<MarkdownSection>) -> String {
+        let mut output = String::new();
+
+        for section in sections {
+            match section {
+                MarkdownSection::Text(body) => output.push_str(&body),
+                MarkdownSection::Code(body, language) => {
+                    output.push_str("```");
+                    if let Some(tag) = language {
+                        output.push_str(&tag);
+                    }
+                    output.push('\n');
+                    output.push_str(&body);
+                    output.push_str("\n```\n");
+                }
+                MarkdownSection::Header(title, level) => {
+                    output.push_str(&"#".repeat(level as usize));
+                    output.push(' ');
+                    output.push_str(&title);
+                    output.push('\n');
+                }
+                MarkdownSection::Link(label, target) => {
+                    output.push('[');
+                    output.push_str(&label);
+                    output.push_str("](");
+                    output.push_str(&target);
+                    output.push(')');
+                }
+                MarkdownSection::Image(alt, target) => {
+                    output.push_str("![");
+                    output.push_str(&alt);
+                    output.push_str("](");
+                    output.push_str(&target);
+                    output.push(')');
+                }
+                MarkdownSection::Table(rows) => {
+                    output.push_str(&rows);
+                    output.push('\n');
+                }
+                MarkdownSection::List(item) => {
+                    output.push_str("- ");
+                    output.push_str(&item);
+                    output.push('\n');
+                }
+                MarkdownSection::Quote(quoted) => {
+                    output.push_str("> ");
+                    output.push_str(&quoted);
+                    output.push('\n');
+                }
+            }
+        }
+
+        output
+    }
+}
--- a/src/translator/mod.rs
+++ b/src/translator/mod.rs
@@ -0,0 +1,123 @@
+pub mod ollama_translator;
+pub mod markdown_parser;
+
+use anyhow::Result;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+
+/// Settings for one translation run.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TranslationConfig {
+    /// Code of the language the input is written in (e.g. `"ja"`).
+    pub source_lang: String,
+    /// Code of the language to translate into (e.g. `"en"`).
+    pub target_lang: String,
+    /// Base URL of the Ollama server.
+    pub ollama_endpoint: String,
+    /// Name of the Ollama model to use.
+    pub model: String,
+    /// Whether code blocks should be kept untranslated.
+    pub preserve_code: bool,
+    /// Whether links should be kept untranslated.
+    pub preserve_links: bool,
+}
+
+impl Default for TranslationConfig {
+    /// Japanese to English via `qwen2.5:latest` on a local Ollama server,
+    /// preserving both code and links.
+    fn default() -> Self {
+        let source_lang = String::from("ja");
+        let target_lang = String::from("en");
+        let ollama_endpoint = String::from("http://localhost:11434");
+        let model = String::from("qwen2.5:latest");
+
+        Self {
+            source_lang,
+            target_lang,
+            ollama_endpoint,
+            model,
+            preserve_code: true,
+            preserve_links: true,
+        }
+    }
+}
+
+/// One structural piece of a Markdown document.
+///
+/// `PartialEq`/`Eq` are derived so that parser output can be compared
+/// directly, for example with `assert_eq!`.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum MarkdownSection {
+    /// Plain text.
+    Text(String),
+    /// Code block: content, then the optional language.
+    Code(String, Option<String>),
+    /// Header: content, then the level (1-6).
+    Header(String, u8),
+    /// Link: text, then URL.
+    Link(String, String),
+    /// Image: alt text, then URL.
+    Image(String, String),
+    /// Table content.
+    Table(String),
+    /// List item content.
+    List(String),
+    /// Quote content.
+    Quote(String),
+}
+
+/// Asynchronous translation interface.
+///
+/// Every method takes the [`TranslationConfig`] to use for that call and
+/// returns the translated output or an error.
+pub trait Translator {
+    /// Translates `content` and returns the translated string.
+    async fn translate(&self, content: &str, config: &TranslationConfig) -> Result<String>;
+    /// Translates a Markdown document given as a string and returns the
+    /// resulting Markdown.
+    async fn translate_markdown(&self, content: &str, config: &TranslationConfig) -> Result<String>;
+    /// Translates already-parsed sections and returns the resulting sections.
+    async fn translate_sections(&self, sections: Vec<MarkdownSection>, config: &TranslationConfig) -> Result<Vec<MarkdownSection>>;
+}
+
+/// Outcome of a translation together with its inputs and metrics.
+///
+/// NOTE(review): nothing in the visible code constructs this type yet; the
+/// field descriptions below follow the field names and should be confirmed.
+#[derive(Debug, Clone)]
+pub struct TranslationResult {
+    /// Text before translation.
+    pub original: String,
+    /// Text after translation.
+    pub translated: String,
+    /// Source language code.
+    pub source_lang: String,
+    /// Target language code.
+    pub target_lang: String,
+    /// Model that produced the translation.
+    pub model: String,
+    /// Measurements collected for the run.
+    pub metrics: TranslationMetrics,
+}
+
+/// Counters describing a translation run; all fields default to zero.
+///
+/// NOTE(review): no visible code populates these yet, so the per-field
+/// notes follow the field names only. Confirm once a producer exists.
+#[derive(Debug, Clone, Default)]
+pub struct TranslationMetrics {
+    // Presumably the number of characters processed.
+    pub character_count: usize,
+    // Presumably the number of words processed.
+    pub word_count: usize,
+    // Presumably the elapsed translation time, in milliseconds.
+    pub translation_time_ms: u64,
+    // Presumably the number of sections sent for translation.
+    pub sections_translated: usize,
+    // Presumably the number of sections passed through untouched.
+    pub sections_preserved: usize,
+}
+
+/// Lookup table from language code to [`LanguageInfo`].
+#[derive(Debug, Clone)]
+pub struct LanguageMapping {
+    /// Supported languages, keyed by language code.
+    pub mappings: HashMap<String, LanguageInfo>,
+}
+
+/// Description of one supported language.
+#[derive(Debug, Clone)]
+pub struct LanguageInfo {
+    // English display name, e.g. "Japanese".
+    pub name: String,
+    // Language code, e.g. "ja".
+    pub code: String,
+    // Persona sentence placed at the top of prompts sent to Ollama.
+    pub ollama_prompt: String,
+}
+
+impl LanguageMapping {
+    /// Builds the mapping for the main supported languages: Japanese,
+    /// English, Chinese, Korean and Spanish.
+    pub fn new() -> Self {
+        // Main languages as (code, English display name).
+        const LANGUAGES: [(&str, &str); 5] = [
+            ("ja", "Japanese"),
+            ("en", "English"),
+            ("zh", "Chinese"),
+            ("ko", "Korean"),
+            ("es", "Spanish"),
+        ];
+
+        let mappings = LANGUAGES
+            .iter()
+            .map(|&(code, name)| {
+                let info = LanguageInfo {
+                    name: name.to_string(),
+                    code: code.to_string(),
+                    ollama_prompt: format!(
+                        "You are a professional {} translator specializing in technical documentation.",
+                        name
+                    ),
+                };
+                (code.to_string(), info)
+            })
+            .collect();
+
+        Self { mappings }
+    }
+
+    /// Returns the entry for `code`, or `None` when the language is not
+    /// supported.
+    pub fn get_language_info(&self, code: &str) -> Option<&LanguageInfo> {
+        self.mappings.get(code)
+    }
+
+    /// Returns the supported language codes in ascending order.
+    ///
+    /// The codes are sorted because `HashMap` iteration order is unspecified
+    /// and would otherwise vary between runs.
+    pub fn get_supported_languages(&self) -> Vec<String> {
+        let mut codes: Vec<String> = self.mappings.keys().cloned().collect();
+        codes.sort_unstable();
+        codes
+    }
+}
--- a/src/translator/ollama_translator.rs
+++ b/src/translator/ollama_translator.rs
@@ -0,0 +1,214 @@
+use anyhow::Result;
+use reqwest::Client;
+use serde_json::json;
+use std::time::Instant;
+use super::*;
+use crate::translator::markdown_parser::MarkdownParser;
+
+/// [`Translator`] implementation that sends prompts to an Ollama server over
+/// HTTP.
+pub struct OllamaTranslator {
+    // HTTP client used for the requests to Ollama.
+    client: Client,
+    // Language names and prompt personas, looked up by language code.
+    language_mapping: LanguageMapping,
+    // Splits Markdown into sections and reassembles them afterwards.
+    parser: MarkdownParser,
+}
+
+impl OllamaTranslator {
+    /// Creates a translator with a fresh HTTP client, the built-in language
+    /// table and a new Markdown parser.
+    pub fn new() -> Self {
+        let client = Client::new();
+        let language_mapping = LanguageMapping::new();
+        let parser = MarkdownParser::new();
+
+        Self {
+            client,
+            language_mapping,
+            parser,
+        }
+    }
+    
+    /// Sends `prompt` to the `/api/generate` endpoint of the configured
+    /// Ollama server and returns the text in the `response` field of the
+    /// reply.
+    ///
+    /// The request is non-streaming and uses fixed sampling options.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the request cannot be sent, the server answers with a
+    /// non-success status, the body is not valid JSON, or the JSON has no
+    /// string `response` field.
+    async fn call_ollama(&self, prompt: &str, config: &TranslationConfig) -> Result<String> {
+        let request_body = json!({
+            "model": config.model,
+            "prompt": prompt,
+            "stream": false,
+            "options": {
+                "temperature": 0.3,
+                "top_p": 0.9,
+                "top_k": 40
+            }
+        });
+
+        // Accept an endpoint configured with a trailing slash; otherwise the
+        // request path would become `//api/generate`.
+        let url = format!(
+            "{}/api/generate",
+            config.ollama_endpoint.trim_end_matches('/')
+        );
+
+        let response = self.client
+            .post(&url)
+            .json(&request_body)
+            .send()
+            .await?;
+
+        if !response.status().is_success() {
+            anyhow::bail!("Ollama API request failed: {}", response.status());
+        }
+
+        let response_text = response.text().await?;
+        let response_json: serde_json::Value = serde_json::from_str(&response_text)?;
+
+        let translated = response_json
+            .get("response")
+            .and_then(|v| v.as_str())
+            .ok_or_else(|| anyhow::anyhow!("Invalid response from Ollama"))?;
+
+        Ok(translated.to_string())
+    }
+    
+    /// Builds the prompt used to translate a whole text in one request.
+    ///
+    /// The prompt opens with the target language's persona sentence, states
+    /// the source and target language names, lists the formatting rules and
+    /// ends with `text`.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the source or target language code is not in the language
+    /// mapping.
+    fn build_translation_prompt(&self, text: &str, config: &TranslationConfig) -> Result<String> {
+        let source_info = match self.language_mapping.get_language_info(&config.source_lang) {
+            Some(info) => info,
+            None => anyhow::bail!("Unsupported source language: {}", config.source_lang),
+        };
+
+        let target_info = match self.language_mapping.get_language_info(&config.target_lang) {
+            Some(info) => info,
+            None => anyhow::bail!("Unsupported target language: {}", config.target_lang),
+        };
+
+        Ok(format!(
+            r#"{system_prompt}
+
+Translate the following text from {source_lang} to {target_lang}.
+
+IMPORTANT RULES:
+1. Preserve all Markdown formatting (headers, links, code blocks, etc.)
+2. Do NOT translate content within code blocks (```)
+3. Do NOT translate URLs or file paths
+4. Preserve technical terms when appropriate
+5. Maintain the original structure and formatting
+6. Only output the translated text, no explanations
+
+Original text ({source_code}):
+{text}
+
+Translated text ({target_code}):"#,
+            text = text,
+            system_prompt = target_info.ollama_prompt,
+            source_lang = source_info.name,
+            source_code = source_info.code,
+            target_lang = target_info.name,
+            target_code = target_info.code
+        ))
+    }
+    
+    /// Builds the prompt used to translate one section.
+    ///
+    /// Only `Text`, `Header`, `Quote` and `List` sections are translatable;
+    /// for any other variant an empty string is returned.
+    ///
+    /// Languages are named in the prompt by display name (e.g. "Japanese"),
+    /// as in `build_translation_prompt`. A source language that is missing
+    /// from the mapping is named by its raw code instead of being rejected,
+    /// so such configurations keep working.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the target language code is not in the language mapping.
+    fn build_section_translation_prompt(&self, section: &MarkdownSection, config: &TranslationConfig) -> Result<String> {
+        let target_info = self.language_mapping.get_language_info(&config.target_lang)
+            .ok_or_else(|| anyhow::anyhow!("Unsupported target language: {}", config.target_lang))?;
+
+        let source_name = self.language_mapping.get_language_info(&config.source_lang)
+            .map(|info| info.name.as_str())
+            .unwrap_or(config.source_lang.as_str());
+
+        // Every variant is listed so that adding one forces a decision here.
+        let (content, section_type) = match section {
+            MarkdownSection::Text(text) => (text.as_str(), "text"),
+            MarkdownSection::Header(text, _) => (text.as_str(), "header"),
+            MarkdownSection::Quote(text) => (text.as_str(), "quote"),
+            MarkdownSection::List(text) => (text.as_str(), "list"),
+            MarkdownSection::Code(..)
+            | MarkdownSection::Link(..)
+            | MarkdownSection::Image(..)
+            | MarkdownSection::Table(_) => return Ok(String::new()),
+        };
+
+        let prompt = format!(
+            r#"{system_prompt}
+
+Translate this {section_type} from {source_lang} to {target_lang}.
+
+RULES:
+- Only translate the text content
+- Preserve formatting symbols (*, #, >, etc.)
+- Keep technical terms when appropriate
+- Output only the translated text
+
+Text to translate:
+{content}
+
+Translation:"#,
+            system_prompt = target_info.ollama_prompt,
+            section_type = section_type,
+            source_lang = source_name,
+            target_lang = target_info.name,
+            content = content
+        );
+
+        Ok(prompt)
+    }
+}
+
+impl Translator for OllamaTranslator {
+    /// Translates `content` as a single text with one request to Ollama.
+    async fn translate(&self, content: &str, config: &TranslationConfig) -> Result<String> {
+        let request_prompt = self.build_translation_prompt(content, config)?;
+        let translated = self.call_ollama(&request_prompt, config).await;
+        translated
+    }
+    
+    /// Translates a Markdown document section by section.
+    ///
+    /// The document is parsed into sections, the sections are translated,
+    /// and the result is rebuilt into Markdown. Progress is printed to
+    /// stdout.
+    async fn translate_markdown(&self, content: &str, config: &TranslationConfig) -> Result<String> {
+        println!("🔄 Parsing markdown content...");
+        let parsed = self.parser.parse_markdown(content)?;
+
+        println!("📝 Found {} sections to process", parsed.len());
+        let translated = self.translate_sections(parsed, config).await?;
+
+        println!("✅ Rebuilding markdown from translated sections...");
+        Ok(self.parser.rebuild_markdown(translated))
+    }
+    
+    /// Translates `sections` one at a time, keeping their order.
+    ///
+    /// Code blocks and images are passed through unchanged. Links are passed
+    /// through when `config.preserve_links` is set; otherwise only the link
+    /// text is translated. Tables, text, headers, quotes and list items are
+    /// sent to Ollama.
+    ///
+    /// The model output is trimmed. For `Text` sections the leading and
+    /// trailing whitespace of the original is put back afterwards, so the
+    /// spaces and newlines that separate the text from neighbouring links,
+    /// images and block elements survive the round trip.
+    ///
+    /// Progress is printed to stdout, and a 100 ms pause follows each
+    /// section.
+    ///
+    /// # Errors
+    ///
+    /// Stops at, and returns, the first prompt-building or Ollama error.
+    async fn translate_sections(&self, sections: Vec<MarkdownSection>, config: &TranslationConfig) -> Result<Vec<MarkdownSection>> {
+        /// Wraps the trimmed `translated` text in the leading and trailing
+        /// whitespace of `original`.
+        fn restore_whitespace(original: &str, translated: &str) -> String {
+            let core = translated.trim();
+            let leading_len = original.len() - original.trim_start().len();
+            let trailing_start = original.trim_end().len();
+
+            // Empty or whitespace-only original: there is no content to wrap.
+            if trailing_start <= leading_len {
+                return core.to_string();
+            }
+
+            format!(
+                "{}{}{}",
+                &original[..leading_len],
+                core,
+                &original[trailing_start..]
+            )
+        }
+
+        let mut translated_sections = Vec::with_capacity(sections.len());
+        let start_time = Instant::now();
+
+        for (index, section) in sections.into_iter().enumerate() {
+            println!("  🔤 Processing section {}", index + 1);
+
+            let translated_section = match section {
+                section @ MarkdownSection::Code(..) => {
+                    // Code is never sent to the model; for now the flag only
+                    // decides whether that is logged.
+                    if config.preserve_code {
+                        println!("    ⏭️  Preserving code block");
+                    }
+                    section
+                }
+                MarkdownSection::Link(text, url) => {
+                    if config.preserve_links {
+                        println!("    ⏭️  Preserving link");
+                        MarkdownSection::Link(text, url)
+                    } else {
+                        // Translate the link text only; the URL stays as it is.
+                        let link_text = MarkdownSection::Text(text);
+                        let prompt = self.build_section_translation_prompt(&link_text, config)?;
+                        let translated_text = self.call_ollama(&prompt, config).await?;
+                        MarkdownSection::Link(translated_text.trim().to_string(), url)
+                    }
+                }
+                section @ MarkdownSection::Image(..) => {
+                    println!("    🖼️  Preserving image");
+                    section
+                }
+                MarkdownSection::Table(content) => {
+                    println!("    📊 Translating table content");
+                    let table_text = MarkdownSection::Text(content);
+                    let prompt = self.build_section_translation_prompt(&table_text, config)?;
+                    let translated_content = self.call_ollama(&prompt, config).await?;
+                    MarkdownSection::Table(translated_content.trim().to_string())
+                }
+                section @ (MarkdownSection::Text(_)
+                | MarkdownSection::Header(..)
+                | MarkdownSection::Quote(_)
+                | MarkdownSection::List(_)) => {
+                    println!("    🔤 Translating text");
+                    let prompt = self.build_section_translation_prompt(&section, config)?;
+                    let translated_text = self.call_ollama(&prompt, config).await?;
+                    let translated_text = translated_text.trim();
+
+                    match section {
+                        MarkdownSection::Text(original) => {
+                            MarkdownSection::Text(restore_whitespace(&original, translated_text))
+                        }
+                        MarkdownSection::Header(_, level) => {
+                            MarkdownSection::Header(translated_text.to_string(), level)
+                        }
+                        MarkdownSection::Quote(_) => MarkdownSection::Quote(translated_text.to_string()),
+                        MarkdownSection::List(_) => MarkdownSection::List(translated_text.to_string()),
+                        // Not reachable: the outer pattern admits only the four variants above.
+                        other => other,
+                    }
+                }
+            };
+
+            translated_sections.push(translated_section);
+
+            // Add small delay to avoid overwhelming Ollama
+            tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+        }
+
+        let elapsed = start_time.elapsed();
+        println!("⏱️  Translation completed in {:.2}s", elapsed.as_secs_f64());
+
+        Ok(translated_sections)
+    }
+}