update gpt

This commit is contained in:
2025-06-06 03:18:20 +09:00
parent a9dca2fe38
commit c0e4dc63ea
18 changed files with 2827 additions and 51 deletions

View File

@ -0,0 +1,253 @@
use anyhow::Result;
use regex::Regex;
use super::MarkdownSection;
/// Markdown splitter backed by a set of regexes that are compiled once and
/// then reused for every document.
pub struct MarkdownParser {
    /// Fenced code block with an optional alphanumeric language tag.
    code_block_regex: Regex,
    /// ATX header line: one to six `#` characters, whitespace, then the title.
    header_regex: Regex,
    /// Inline link of the form `[text](url)`.
    link_regex: Regex,
    /// Inline image of the form `![alt](url)`.
    image_regex: Regex,
    /// Table row: a line that starts and ends with `|`.
    table_regex: Regex,
    /// Bullet list item introduced by `-`, `*` or `+`.
    list_regex: Regex,
    /// Block quote line introduced by `>` followed by whitespace.
    quote_regex: Regex,
}
impl MarkdownParser {
/// Creates a parser with every pattern compiled up front.
///
/// # Panics
///
/// Panics if one of the built-in patterns fails to compile.
pub fn new() -> Self {
    let code_block_regex = Regex::new(r"```([a-zA-Z0-9]*)\n([\s\S]*?)\n```").unwrap();
    let header_regex = Regex::new(r"^(#{1,6})\s+(.+)$").unwrap();
    let link_regex = Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap();
    let image_regex = Regex::new(r"!\[([^\]]*)\]\(([^)]+)\)").unwrap();
    let table_regex = Regex::new(r"^\|.*\|$").unwrap();
    let list_regex = Regex::new(r"^[\s]*[-*+]\s+(.+)$").unwrap();
    let quote_regex = Regex::new(r"^>\s+(.+)$").unwrap();

    Self {
        code_block_regex,
        header_regex,
        link_regex,
        image_regex,
        table_regex,
        list_regex,
        quote_regex,
    }
}
/// Splits `content` into an ordered list of [`MarkdownSection`]s.
///
/// Each line is classified in a fixed priority order: fenced code block,
/// header, table row, quote, list item. Lines that match none of these are
/// buffered as plain text; the buffer is handed to `parse_text_sections`
/// whenever a block-level construct starts and once more at end of input.
///
/// # Errors
///
/// Propagates any error reported by the code-block, table or text helpers.
pub fn parse_markdown(&self, content: &str) -> Result<Vec<MarkdownSection>> {
    let lines: Vec<&str> = content.lines().collect();
    let mut sections = Vec::new();
    let mut pending = String::new();
    let mut cursor = 0;

    while let Some(&line) = lines.get(cursor) {
        // A block-level construct yields its section plus the number of
        // input lines it occupies; plain text yields `None`.
        let block: Option<(MarkdownSection, usize)> = if line.starts_with("```") {
            Some(self.parse_code_block(&lines[cursor..])?)
        } else if let Some(caps) = self.header_regex.captures(line) {
            let level = caps[1].len() as u8;
            Some((MarkdownSection::Header(caps[2].to_string(), level), 1))
        } else if self.table_regex.is_match(line) {
            Some(self.parse_table(&lines[cursor..])?)
        } else if let Some(caps) = self.quote_regex.captures(line) {
            Some((MarkdownSection::Quote(caps[1].to_string()), 1))
        } else if let Some(caps) = self.list_regex.captures(line) {
            Some((MarkdownSection::List(caps[1].to_string()), 1))
        } else {
            None
        };

        match block {
            Some((section, consumed)) => {
                // Buffered text precedes the block in the output. A buffer
                // holding only whitespace is intentionally left untouched.
                if !pending.trim().is_empty() {
                    sections.extend(self.parse_text_sections(&pending)?);
                    pending.clear();
                }
                sections.push(section);
                cursor += consumed;
            }
            None => {
                pending.push_str(line);
                pending.push('\n');
                cursor += 1;
            }
        }
    }

    // Whatever text is still buffered forms the tail of the document.
    if !pending.trim().is_empty() {
        sections.extend(self.parse_text_sections(&pending)?);
    }

    Ok(sections)
}
fn parse_code_block(&self, lines: &[&str]) -> Result<(MarkdownSection, usize)> {
if lines.is_empty() || !lines[0].starts_with("```") {
anyhow::bail!("Not a code block");
}
let first_line = lines[0];
let language = if first_line.len() > 3 {
Some(first_line[3..].trim().to_string())
} else {
None
};
let mut content = String::new();
let mut end_index = 1;
for (i, &line) in lines[1..].iter().enumerate() {
if line.starts_with("```") {
end_index = i + 2; // +1 for slice offset, +1 for closing line
break;
}
if i > 0 {
content.push('\n');
}
content.push_str(line);
}
Ok((MarkdownSection::Code(content, language), end_index))
}
/// Collects the leading run of table rows from `lines`.
///
/// Returns a `Table` section holding the rows joined with `\n`, together
/// with the number of rows taken. Collection stops at the first line that
/// does not match the table-row pattern.
fn parse_table(&self, lines: &[&str]) -> Result<(MarkdownSection, usize)> {
    let rows: Vec<&str> = lines
        .iter()
        .copied()
        .take_while(|row| self.table_regex.is_match(row))
        .collect();

    let consumed = rows.len();
    Ok((MarkdownSection::Table(rows.join("\n")), consumed))
}
/// Splits a run of plain text into `Text`, `Image` and `Link` sections.
///
/// Images are located first. Every stretch of text between images, as well
/// as the tail after the last image, is then scanned for links, so a link
/// is extracted no matter where it sits relative to an image.
/// Whitespace-only fragments are dropped.
fn parse_text_sections(&self, text: &str) -> Result<Vec<MarkdownSection>> {
    let mut sections = Vec::new();
    let mut remaining = text;

    while let Some(caps) = self.image_regex.captures(remaining) {
        let full_match = caps.get(0).unwrap();

        self.push_text_with_links(&remaining[..full_match.start()], &mut sections);
        sections.push(MarkdownSection::Image(
            caps[1].to_string(),
            caps[2].to_string(),
        ));

        remaining = &remaining[full_match.end()..];
    }

    self.push_text_with_links(remaining, &mut sections);
    Ok(sections)
}

/// Appends `text` to `sections`, emitting a `Link` for every inline link and
/// a `Text` for each non-blank fragment around the links.
fn push_text_with_links(&self, text: &str, sections: &mut Vec<MarkdownSection>) {
    let mut rest = text;

    while let Some(caps) = self.link_regex.captures(rest) {
        let full_match = caps.get(0).unwrap();

        let before = &rest[..full_match.start()];
        if !before.trim().is_empty() {
            sections.push(MarkdownSection::Text(before.to_string()));
        }
        sections.push(MarkdownSection::Link(
            caps[1].to_string(),
            caps[2].to_string(),
        ));

        // Advance by slicing instead of re-allocating the remaining text.
        rest = &rest[full_match.end()..];
    }

    if !rest.trim().is_empty() {
        sections.push(MarkdownSection::Text(rest.to_string()));
    }
}
pub fn rebuild_markdown(&self, sections: Vec<MarkdownSection>) -> String {
let mut result = String::new();
for section in sections {
match section {
MarkdownSection::Text(text) => {
result.push_str(&text);
}
MarkdownSection::Code(content, Some(lang)) => {
result.push_str(&format!("```{}\n{}\n```\n", lang, content));
}
MarkdownSection::Code(content, None) => {
result.push_str(&format!("```\n{}\n```\n", content));
}
MarkdownSection::Header(text, level) => {
let hashes = "#".repeat(level as usize);
result.push_str(&format!("{} {}\n", hashes, text));
}
MarkdownSection::Link(text, url) => {
result.push_str(&format!("[{}]({})", text, url));
}
MarkdownSection::Image(alt, url) => {
result.push_str(&format!("![{}]({})", alt, url));
}
MarkdownSection::Table(content) => {
result.push_str(&content);
result.push('\n');
}
MarkdownSection::List(text) => {
result.push_str(&format!("- {}\n", text));
}
MarkdownSection::Quote(text) => {
result.push_str(&format!("> {}\n", text));
}
}
}
result
}
}

123
src/translator/mod.rs Normal file
View File

@ -0,0 +1,123 @@
pub mod ollama_translator;
pub mod markdown_parser;
use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Settings that control a single translation run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranslationConfig {
    /// Language code of the input text (for example `"ja"`).
    pub source_lang: String,
    /// Language code the text is translated into (for example `"en"`).
    pub target_lang: String,
    /// Base URL of the Ollama server.
    pub ollama_endpoint: String,
    /// Name of the Ollama model that performs the translation.
    pub model: String,
    /// Whether code blocks are kept untouched.
    pub preserve_code: bool,
    /// Whether links are kept untouched.
    pub preserve_links: bool,
}
impl Default for TranslationConfig {
    /// Japanese-to-English translation against a local Ollama instance, with
    /// code blocks and links preserved.
    fn default() -> Self {
        let source_lang = String::from("ja");
        let target_lang = String::from("en");
        let ollama_endpoint = String::from("http://localhost:11434");
        let model = String::from("qwen2.5:latest");

        Self {
            source_lang,
            target_lang,
            ollama_endpoint,
            model,
            preserve_code: true,
            preserve_links: true,
        }
    }
}
/// One structural piece of a Markdown document.
///
/// Sections compare by value, which lets callers and tests check parser
/// output directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MarkdownSection {
    /// Plain text.
    Text(String),
    /// Fenced code block: content, optional language tag.
    Code(String, Option<String>),
    /// Header: title text, level (1-6).
    Header(String, u8),
    /// Inline link: link text, URL.
    Link(String, String),
    /// Inline image: alt text, URL.
    Image(String, String),
    /// Table, stored as its raw rows.
    Table(String),
    /// Text of a single list item, without its marker.
    List(String),
    /// Text of a single quote line, without the leading `>`.
    Quote(String),
}
/// Asynchronous translation backend.
pub trait Translator {
    /// Translates `content` as a single piece of text.
    async fn translate(
        &self,
        content: &str,
        config: &TranslationConfig,
    ) -> Result<String>;

    /// Translates a Markdown document and returns the translated document.
    async fn translate_markdown(
        &self,
        content: &str,
        config: &TranslationConfig,
    ) -> Result<String>;

    /// Translates already-parsed sections, returning them in the same order.
    async fn translate_sections(
        &self,
        sections: Vec<MarkdownSection>,
        config: &TranslationConfig,
    ) -> Result<Vec<MarkdownSection>>;
}
/// Outcome of a translation run, pairing the input with its translation and
/// the settings and metrics that describe the run.
#[derive(Debug, Clone)]
pub struct TranslationResult {
    /// Text as it was before translation.
    pub original: String,
    /// Translated text.
    pub translated: String,
    /// Language code of `original`.
    pub source_lang: String,
    /// Language code of `translated`.
    pub target_lang: String,
    /// Name of the model that produced the translation.
    pub model: String,
    /// Counters and timing collected for the run.
    pub metrics: TranslationMetrics,
}
/// Counters and timing for a translation run. All values default to zero.
#[derive(Debug, Clone, Default)]
pub struct TranslationMetrics {
    /// Number of characters counted for the run.
    pub character_count: usize,
    /// Number of words counted for the run.
    pub word_count: usize,
    /// Translation time in milliseconds.
    pub translation_time_ms: u64,
    /// Number of sections that were translated.
    pub sections_translated: usize,
    /// Number of sections that were preserved.
    pub sections_preserved: usize,
}
/// Registry of the languages known to the translator.
pub struct LanguageMapping {
    /// Language details keyed by language code.
    pub mappings: HashMap<String, LanguageInfo>,
}
/// Details of a single language.
#[derive(Debug, Clone)]
pub struct LanguageInfo {
    /// English name of the language (for example `"Japanese"`).
    pub name: String,
    /// Language code (for example `"ja"`).
    pub code: String,
    /// Instruction text used as the opening of prompts sent to Ollama.
    pub ollama_prompt: String,
}
impl LanguageMapping {
pub fn new() -> Self {
let mut mappings = HashMap::new();
// 主要言語の設定
mappings.insert("ja".to_string(), LanguageInfo {
name: "Japanese".to_string(),
code: "ja".to_string(),
ollama_prompt: "You are a professional Japanese translator specializing in technical documentation.".to_string(),
});
mappings.insert("en".to_string(), LanguageInfo {
name: "English".to_string(),
code: "en".to_string(),
ollama_prompt: "You are a professional English translator specializing in technical documentation.".to_string(),
});
mappings.insert("zh".to_string(), LanguageInfo {
name: "Chinese".to_string(),
code: "zh".to_string(),
ollama_prompt: "You are a professional Chinese translator specializing in technical documentation.".to_string(),
});
mappings.insert("ko".to_string(), LanguageInfo {
name: "Korean".to_string(),
code: "ko".to_string(),
ollama_prompt: "You are a professional Korean translator specializing in technical documentation.".to_string(),
});
mappings.insert("es".to_string(), LanguageInfo {
name: "Spanish".to_string(),
code: "es".to_string(),
ollama_prompt: "You are a professional Spanish translator specializing in technical documentation.".to_string(),
});
Self { mappings }
}
pub fn get_language_info(&self, code: &str) -> Option<&LanguageInfo> {
self.mappings.get(code)
}
pub fn get_supported_languages(&self) -> Vec<String> {
self.mappings.keys().cloned().collect()
}
}

View File

@ -0,0 +1,214 @@
use anyhow::Result;
use reqwest::Client;
use serde_json::json;
use std::time::Instant;
use super::*;
use crate::translator::markdown_parser::MarkdownParser;
/// [`Translator`] implementation that sends prompts to an Ollama server.
pub struct OllamaTranslator {
    /// HTTP client used for every request.
    client: Client,
    /// Registry used to resolve language codes when building prompts.
    language_mapping: LanguageMapping,
    /// Parser used to split Markdown into sections and to rebuild it.
    parser: MarkdownParser,
}
impl OllamaTranslator {
/// Creates a translator with a fresh HTTP client, the built-in language
/// registry and a new Markdown parser.
pub fn new() -> Self {
    let client = Client::new();
    let language_mapping = LanguageMapping::new();
    let parser = MarkdownParser::new();

    Self {
        client,
        language_mapping,
        parser,
    }
}
/// Sends `prompt` to the `/api/generate` endpoint of the configured Ollama
/// server (non-streaming) and returns the text of its `response` field.
///
/// A trailing `/` on `config.ollama_endpoint` is tolerated, so both
/// `http://host:11434` and `http://host:11434/` resolve to the same URL.
///
/// # Errors
///
/// Fails when the request cannot be sent, when the server answers with a
/// non-success status (the response body, if any, is included in the
/// message), when the body is not valid JSON, or when the JSON has no
/// string `response` field.
async fn call_ollama(&self, prompt: &str, config: &TranslationConfig) -> Result<String> {
    let request_body = json!({
        "model": config.model,
        "prompt": prompt,
        "stream": false,
        "options": {
            "temperature": 0.3,
            "top_p": 0.9,
            "top_k": 40
        }
    });

    // Avoid producing `//api/generate` when the endpoint ends with a slash.
    let url = format!(
        "{}/api/generate",
        config.ollama_endpoint.trim_end_matches('/')
    );

    let response = self.client
        .post(&url)
        .json(&request_body)
        .send()
        .await?;

    let status = response.status();
    if !status.is_success() {
        // Best effort: the body usually explains the failure (e.g. unknown model).
        let detail = response.text().await.unwrap_or_default();
        let detail = detail.trim();
        if detail.is_empty() {
            anyhow::bail!("Ollama API request failed: {}", status);
        }
        anyhow::bail!("Ollama API request failed: {}: {}", status, detail);
    }

    let response_text = response.text().await?;
    let response_json: serde_json::Value = serde_json::from_str(&response_text)?;

    let translated = response_json
        .get("response")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("Invalid response from Ollama"))?;

    Ok(translated.to_string())
}
/// Builds the prompt used to translate a whole piece of text.
///
/// The prompt opens with the target language's instruction text, names
/// both languages, lists the formatting rules and ends with the text to
/// translate.
///
/// # Errors
///
/// Fails when the source or the target language code is not registered.
fn build_translation_prompt(&self, text: &str, config: &TranslationConfig) -> Result<String> {
    let source = match self.language_mapping.get_language_info(&config.source_lang) {
        Some(info) => info,
        None => {
            return Err(anyhow::anyhow!("Unsupported source language: {}", config.source_lang));
        }
    };
    let target = match self.language_mapping.get_language_info(&config.target_lang) {
        Some(info) => info,
        None => {
            return Err(anyhow::anyhow!("Unsupported target language: {}", config.target_lang));
        }
    };

    Ok(format!(
        r#"{system_prompt}
Translate the following text from {source_lang} to {target_lang}.
IMPORTANT RULES:
1. Preserve all Markdown formatting (headers, links, code blocks, etc.)
2. Do NOT translate content within code blocks (```)
3. Do NOT translate URLs or file paths
4. Preserve technical terms when appropriate
5. Maintain the original structure and formatting
6. Only output the translated text, no explanations
Original text ({source_code}):
{text}
Translated text ({target_code}):"#,
        system_prompt = target.ollama_prompt,
        source_lang = source.name,
        target_lang = target.name,
        source_code = source.code,
        target_code = target.code,
        text = text
    ))
}
/// Builds the prompt used to translate a single section.
///
/// Only `Text`, `Header`, `Quote` and `List` sections are translatable; for
/// any other section an empty string is returned.
///
/// Languages are named in the prompt by their full name (for example
/// "Japanese"), as `build_translation_prompt` does. A source language that
/// is not registered is not an error here: its raw code is used instead.
///
/// # Errors
///
/// Fails when the target language code is not registered.
fn build_section_translation_prompt(&self, section: &MarkdownSection, config: &TranslationConfig) -> Result<String> {
    let target_info = self.language_mapping.get_language_info(&config.target_lang)
        .ok_or_else(|| anyhow::anyhow!("Unsupported target language: {}", config.target_lang))?;

    // Prefer the human-readable name; fall back to the code when unknown.
    let source_lang = self.language_mapping.get_language_info(&config.source_lang)
        .map_or(config.source_lang.as_str(), |info| info.name.as_str());

    let (content, section_type) = match section {
        MarkdownSection::Text(text) => (text.as_str(), "text"),
        MarkdownSection::Header(text, _) => (text.as_str(), "header"),
        MarkdownSection::Quote(text) => (text.as_str(), "quote"),
        MarkdownSection::List(text) => (text.as_str(), "list"),
        _ => return Ok(String::new()), // Skip translation for code, links, etc.
    };

    let prompt = format!(
        r#"{system_prompt}
Translate this {section_type} from {source_lang} to {target_lang}.
RULES:
- Only translate the text content
- Preserve formatting symbols (*, #, >, etc.)
- Keep technical terms when appropriate
- Output only the translated text
Text to translate:
{content}
Translation:"#,
        system_prompt = target_info.ollama_prompt,
        section_type = section_type,
        source_lang = source_lang,
        target_lang = target_info.name,
        content = content
    );

    Ok(prompt)
}
}
impl Translator for OllamaTranslator {
/// Translates `content` with a single prompt covering the whole text.
async fn translate(&self, content: &str, config: &TranslationConfig) -> Result<String> {
    let request = self.build_translation_prompt(content, config)?;
    let reply = self.call_ollama(request.as_str(), config).await?;
    Ok(reply)
}
/// Translates a Markdown document section by section: parse, translate
/// each section, then rebuild. Progress is printed to standard output.
async fn translate_markdown(&self, content: &str, config: &TranslationConfig) -> Result<String> {
    println!("🔄 Parsing markdown content...");
    let parsed = self.parser.parse_markdown(content)?;

    println!("📝 Found {} sections to process", parsed.len());
    let translated = self.translate_sections(parsed, config).await?;

    println!("✅ Rebuilding markdown from translated sections...");
    Ok(self.parser.rebuild_markdown(translated))
}
/// Translates `sections` one at a time, keeping their order.
///
/// * Code blocks and images are always returned unchanged.
/// * Links are returned unchanged when `config.preserve_links` is set;
///   otherwise only the link text is translated.
/// * Tables, text, headers, quotes and list items are translated.
///
/// The whitespace that surrounds a `Text` section (including its trailing
/// newline) is re-attached to the translation, so the text does not get
/// glued onto neighbouring links, images or blocks when the document is
/// rebuilt.
///
/// A 100 ms pause follows every section to avoid overwhelming Ollama.
/// Progress and total time are printed to standard output.
///
/// # Errors
///
/// Stops at, and returns, the first prompt-building or Ollama error.
async fn translate_sections(&self, sections: Vec<MarkdownSection>, config: &TranslationConfig) -> Result<Vec<MarkdownSection>> {
    // Wraps the trimmed translation in the leading/trailing whitespace of
    // the original text.
    fn restore_padding(original: &str, translated: &str) -> String {
        let translated = translated.trim();
        if original.trim().is_empty() {
            return translated.to_string();
        }
        let lead_len = original.len() - original.trim_start().len();
        let trail_start = original.trim_end().len();
        format!("{}{}{}", &original[..lead_len], translated, &original[trail_start..])
    }

    let mut translated_sections = Vec::with_capacity(sections.len());
    let start_time = Instant::now();

    for (index, section) in sections.into_iter().enumerate() {
        println!(" 🔤 Processing section {}", index + 1);

        let translated_section = match section {
            code @ MarkdownSection::Code(..) => {
                // Code is preserved regardless of the flag for now; the flag
                // only controls whether the fact is reported.
                if config.preserve_code {
                    println!(" ⏭️ Preserving code block");
                }
                code
            }
            MarkdownSection::Link(text, url) => {
                if config.preserve_links {
                    println!(" ⏭️ Preserving link");
                    MarkdownSection::Link(text, url)
                } else {
                    // Translate the link text only; the URL stays as-is.
                    let link_text = MarkdownSection::Text(text);
                    let prompt = self.build_section_translation_prompt(&link_text, config)?;
                    let translated_text = self.call_ollama(&prompt, config).await?;
                    MarkdownSection::Link(translated_text.trim().to_string(), url)
                }
            }
            image @ MarkdownSection::Image(..) => {
                println!(" 🖼️ Preserving image");
                image
            }
            MarkdownSection::Table(content) => {
                println!(" 📊 Translating table content");
                let table_text = MarkdownSection::Text(content);
                let prompt = self.build_section_translation_prompt(&table_text, config)?;
                let translated_content = self.call_ollama(&prompt, config).await?;
                MarkdownSection::Table(translated_content.trim().to_string())
            }
            other => {
                // Text, headers, quotes and list items.
                println!(" 🔤 Translating text");
                let prompt = self.build_section_translation_prompt(&other, config)?;
                let translated_text = self.call_ollama(&prompt, config).await?;
                let trimmed = translated_text.trim();
                match other {
                    MarkdownSection::Text(original) => {
                        MarkdownSection::Text(restore_padding(&original, trimmed))
                    }
                    MarkdownSection::Header(_, level) => {
                        MarkdownSection::Header(trimmed.to_string(), level)
                    }
                    MarkdownSection::Quote(_) => MarkdownSection::Quote(trimmed.to_string()),
                    MarkdownSection::List(_) => MarkdownSection::List(trimmed.to_string()),
                    untouched => untouched,
                }
            }
        };

        translated_sections.push(translated_section);

        // Add small delay to avoid overwhelming Ollama
        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
    }

    let elapsed = start_time.elapsed();
    println!("⏱️ Translation completed in {:.2}s", elapsed.as_secs_f64());

    Ok(translated_sections)
}
}