phase 0
This commit is contained in:
@@ -0,0 +1,107 @@
|
||||
"""
|
||||
translator.py — Chinese to English part name translation via Claude API.
|
||||
|
||||
Detects CJK unicode range. Batches all names in a single API call per file.
|
||||
Flags uncertain translations in the notes column.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
import pandas as pd
|
||||
|
||||
# Module-level logger. The name is a fixed string rather than __name__.
logger = logging.getLogger("step_processor.translator")

# Matches a single character from the CJK Unified Ideographs range or the
# CJK Extension A range; used to decide whether a name needs translation.
CJK_PATTERN = re.compile(r'[一-鿿㐀-䶿]')

# System prompt sent with every translation request. The fragments are joined
# with no separator, so each one carries its own trailing space or newline.
SYSTEM_PROMPT = "".join([
    "You are a mechanical engineering translator specializing in Chinese ",
    "manufacturing CAD files for display and enclosure products. ",
    "Translate the following part names from Chinese to English. ",
    "Preserve technical precision. Use standard hardware/manufacturing terminology. ",
    "Output ONLY a JSON object mapping original Chinese to translated English, nothing else.\n",
    'Example: {"安装支架": "Mounting Bracket", "螺钉M4": "M4 Screw", "前面板": "Front Panel"}',
])
|
||||
|
||||
|
||||
def has_chinese(text: str) -> bool:
    """Report whether *text* contains at least one CJK character.

    The argument is converted with ``str()`` before matching, so non-string
    values are searched via their string form.
    """
    cjk_match = CJK_PATTERN.search(str(text))
    return cjk_match is not None
|
||||
|
||||
|
||||
def _append_note(existing, tag: str) -> str:
    """Return *existing* notes with *tag* appended after a "; " separator."""
    # Missing cells (None / NaN / pd.NA) count as empty. NaN is a truthy
    # float, so a plain truthiness check would attempt float + str.
    if not isinstance(existing, str):
        existing = "" if pd.isna(existing) else str(existing)
    return f"{existing}; {tag}" if existing else tag


def translate_bom(df: pd.DataFrame, model_name: str = "") -> pd.DataFrame:
    """Detect Chinese part names and translate them via the Claude API.

    Rows whose ``part_name_original`` contains CJK characters are translated
    in one batched API call. For those rows ``part_name_english`` is set to
    the translation (or to the original name when none was returned) and a
    tag is appended to ``notes``: ``machine-translated``,
    ``ambiguous-translation`` (translation contains ``[?]``),
    ``translation-missing`` or, when the whole call failed,
    ``translation-failed``.

    Args:
        df: BOM table with ``part_name_original`` and ``notes`` columns.
            It is modified in place.
        model_name: Optional product/model name forwarded to the API call
            as context.

    Returns:
        The same ``df`` object that was passed in.
    """
    # astype(bool) keeps the mask usable as a boolean indexer even when the
    # frame is empty and apply() yields an object-dtype Series.
    needs_translation = df["part_name_original"].apply(has_chinese).astype(bool)
    chinese_names = df.loc[needs_translation, "part_name_original"].unique().tolist()
    if not chinese_names:
        logger.info("No Chinese part names detected — translation skipped")
        return df

    logger.info(f"Translating {len(chinese_names)} Chinese part names...")
    translation_map = _call_claude_api(chinese_names, model_name)
    existing_notes = df.loc[needs_translation, "notes"]

    if not isinstance(translation_map, dict) or not translation_map:
        logger.warning("Translation API returned no results — retaining original names")
        df.loc[needs_translation, "notes"] = [
            _append_note(note, "translation-failed") for note in existing_notes
        ]
        return df

    english_names = []
    updated_notes = []
    translated_count = 0
    originals = df.loc[needs_translation, "part_name_original"]
    for original, note in zip(originals, existing_notes):
        translated = translation_map.get(original)
        # Only a non-blank string counts as a translation; anything else the
        # model may have returned (null, number, list) is treated as missing.
        if isinstance(translated, str) and translated.strip():
            english_names.append(translated)
            note_tag = "ambiguous-translation" if "[?]" in translated else "machine-translated"
            translated_count += 1
        else:
            english_names.append(original)
            note_tag = "translation-missing"
        updated_notes.append(_append_note(note, note_tag))

    # Assign through the boolean mask (positional) rather than per index
    # label, so frames with duplicate index labels are updated row by row.
    df.loc[needs_translation, "part_name_english"] = english_names
    df.loc[needs_translation, "notes"] = updated_notes

    logger.info(f"Translated {translated_count} parts")
    return df
|
||||
|
||||
|
||||
def get_translation_map(df: pd.DataFrame) -> dict:
    """Return a dict of original → English name for all translated rows.

    A row counts as translated when both ``part_name_original`` and
    ``part_name_english`` are present and differ from each other.

    Args:
        df: Table with ``part_name_original`` and ``part_name_english``
            columns.

    Returns:
        Mapping from original name to English name. When an original name
        occurs more than once, the last occurrence wins.
    """
    original = df["part_name_original"]
    english = df["part_name_english"]
    # Missing values never compare equal, so without the notna() guards a row
    # lacking an English name would be reported as "translated" to NaN.
    translated = original.notna() & english.notna() & (original != english)
    return dict(zip(original[translated], english[translated]))
|
||||
|
||||
|
||||
def _call_claude_api(names: list, model_name: str = "") -> dict:
|
||||
"""Single batched Claude API call. Returns original→translated dict."""
|
||||
api_key = os.environ.get("ANTHROPIC_API_KEY")
|
||||
if not api_key:
|
||||
logger.error("ANTHROPIC_API_KEY not set — translation unavailable")
|
||||
return {}
|
||||
try:
|
||||
import anthropic
|
||||
except ImportError:
|
||||
logger.error("anthropic package not installed — pip install anthropic")
|
||||
return {}
|
||||
names_json = json.dumps(names, ensure_ascii=False)
|
||||
user_msg = f"Translate these part names from Chinese to English:\n{names_json}"
|
||||
if model_name:
|
||||
user_msg += f"\n\nContext: Parts from a {model_name} display enclosure assembly."
|
||||
try:
|
||||
client = anthropic.Anthropic(api_key=api_key)
|
||||
response = client.messages.create(
|
||||
model="claude-haiku-4-5-20251001",
|
||||
max_tokens=2048,
|
||||
system=SYSTEM_PROMPT,
|
||||
messages=[{"role": "user", "content": user_msg}],
|
||||
)
|
||||
text = response.content[0].text.strip()
|
||||
json_match = re.search(r'\{.*\}', text, re.DOTALL)
|
||||
if json_match:
|
||||
text = json_match.group(0)
|
||||
result = json.loads(text)
|
||||
logger.info(f"API returned {len(result)} translations")
|
||||
return result
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"Translation API JSON parse error: {e}")
|
||||
return {}
|
||||
except Exception as e:
|
||||
logger.error(f"Translation API error: {type(e).__name__}: {e}")
|
||||
return {}
|
||||
Reference in New Issue
Block a user