108 lines
4.3 KiB
Python
108 lines
4.3 KiB
Python
"""
|
|
translator.py — Chinese to English part name translation via Claude API.
|
|
|
|
Detects CJK unicode range. Batches all names in a single API call per file.
|
|
Flags uncertain translations in the notes column.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
|
|
import pandas as pd
|
|
|
|
# Module logger; the name is fixed so it nests under the "step_processor" logger.
logger = logging.getLogger("step_processor.translator")

# Matches a single character from CJK Unified Ideographs (U+4E00–U+9FFF) or
# CJK Unified Ideographs Extension A (U+3400–U+4DBF).
CJK_PATTERN = re.compile(r'[一-鿿㐀-䶿]')

# System prompt for the batched translation request.
# The " [?]" instruction matters: translate_bom() tags a translation as
# "ambiguous-translation" only when the returned text contains "[?]", so the
# model has to be told to emit that marker. The verbatim-key instruction
# matters because results are looked up by the exact original name.
SYSTEM_PROMPT = (
    "You are a mechanical engineering translator specializing in Chinese "
    "manufacturing CAD files for display and enclosure products. "
    "Translate the following part names from Chinese to English. "
    "Preserve technical precision. Use standard hardware/manufacturing terminology. "
    'If you are unsure of a translation, still give your best guess and append " [?]" to it. '
    "Use each input name, exactly as given, as the JSON key. "
    "Output ONLY a JSON object mapping original Chinese to translated English, nothing else.\n"
    'Example: {"安装支架": "Mounting Bracket", "螺钉M4": "M4 Screw", "前面板": "Front Panel"}'
)
|
|
|
|
|
|
def has_chinese(text: str) -> bool:
    """Report whether *text* contains at least one character matched by CJK_PATTERN.

    The argument is passed through ``str()`` first, so non-string values
    are checked against their string form.
    """
    as_text = str(text)
    found = CJK_PATTERN.search(as_text)
    return found is not None
|
|
|
|
|
|
def _append_note(existing, tag: str) -> str:
    """Return *existing* notes with *tag* appended, treating None/NaN as empty."""
    # Missing cells (None / NaN / pd.NA) cannot be concatenated with a str,
    # and NaN is truthy, so a plain truthiness check is not enough here.
    if existing is None or (pd.api.types.is_scalar(existing) and pd.isna(existing)):
        return tag
    existing = str(existing)
    return f"{existing}; {tag}" if existing else tag


def translate_bom(df: pd.DataFrame, model_name: str = "") -> pd.DataFrame:
    """Detect Chinese part names and translate them via the Claude API.

    Reads the ``part_name_original`` and ``notes`` columns and writes the
    ``part_name_english`` and ``notes`` columns. ``df`` is modified in place
    and also returned.

    Every row whose original name contains CJK characters gets one tag
    appended to its notes (joined to existing notes with ``"; "``):

    * ``machine-translated``    — a translation was returned and applied.
    * ``ambiguous-translation`` — the returned translation contains ``"[?]"``.
    * ``translation-missing``   — no usable translation for this name; the
      original name is copied into ``part_name_english``.
    * ``translation-failed``    — the API call produced no results at all;
      ``part_name_english`` is left untouched.

    Args:
        df: BOM table with the columns named above.
        model_name: Optional product/model name forwarded to the API call
            as context.

    Returns:
        The same DataFrame object that was passed in.
    """
    needs_translation = df["part_name_original"].apply(has_chinese).astype(bool)
    chinese_names = df.loc[needs_translation, "part_name_original"].unique().tolist()
    if not chinese_names:
        logger.info("No Chinese part names detected — translation skipped")
        return df

    logger.info(f"Translating {len(chinese_names)} Chinese part names...")
    translation_map = _call_claude_api(chinese_names, model_name)

    # Work positionally through a boolean array so that rows are updated one
    # by one even when the index contains duplicate labels.
    mask = needs_translation.to_numpy()
    originals = df.loc[mask, "part_name_original"].tolist()
    existing_notes = df.loc[mask, "notes"].tolist()

    # Anything other than a non-empty dict counts as "no results".
    if not isinstance(translation_map, dict) or not translation_map:
        logger.warning("Translation API returned no results — retaining original names")
        df.loc[mask, "notes"] = [
            _append_note(note, "translation-failed") for note in existing_notes
        ]
        return df

    english_names = []
    updated_notes = []
    translated_count = 0
    for original, existing in zip(originals, existing_notes):
        translated = translation_map.get(original)
        # Only a non-blank string is a usable translation; anything else
        # (None, numbers, nested objects) is treated as missing.
        if isinstance(translated, str) and translated.strip():
            english = translated
            note_tag = "ambiguous-translation" if "[?]" in translated else "machine-translated"
            translated_count += 1
        else:
            english = original
            note_tag = "translation-missing"
        english_names.append(english)
        updated_notes.append(_append_note(existing, note_tag))

    df.loc[mask, "part_name_english"] = english_names
    df.loc[mask, "notes"] = updated_notes

    logger.info(f"Translated {translated_count} of {len(originals)} parts")
    return df
|
|
|
|
|
|
def get_translation_map(df: pd.DataFrame) -> dict:
    """Build an original → english mapping from rows whose two name columns differ.

    Rows where ``part_name_original`` equals ``part_name_english`` are left
    out. If the same original name appears more than once, the last row wins.
    """
    originals = df["part_name_original"]
    english = df["part_name_english"]
    differs = originals != english
    return {
        source: target
        for source, target in zip(originals[differs], english[differs])
    }
|
|
|
|
|
|
def _call_claude_api(names: list, model_name: str = "") -> dict:
|
|
"""Single batched Claude API call. Returns original→translated dict."""
|
|
api_key = os.environ.get("ANTHROPIC_API_KEY")
|
|
if not api_key:
|
|
logger.error("ANTHROPIC_API_KEY not set — translation unavailable")
|
|
return {}
|
|
try:
|
|
import anthropic
|
|
except ImportError:
|
|
logger.error("anthropic package not installed — pip install anthropic")
|
|
return {}
|
|
names_json = json.dumps(names, ensure_ascii=False)
|
|
user_msg = f"Translate these part names from Chinese to English:\n{names_json}"
|
|
if model_name:
|
|
user_msg += f"\n\nContext: Parts from a {model_name} display enclosure assembly."
|
|
try:
|
|
client = anthropic.Anthropic(api_key=api_key)
|
|
response = client.messages.create(
|
|
model="claude-haiku-4-5-20251001",
|
|
max_tokens=2048,
|
|
system=SYSTEM_PROMPT,
|
|
messages=[{"role": "user", "content": user_msg}],
|
|
)
|
|
text = response.content[0].text.strip()
|
|
json_match = re.search(r'\{.*\}', text, re.DOTALL)
|
|
if json_match:
|
|
text = json_match.group(0)
|
|
result = json.loads(text)
|
|
logger.info(f"API returned {len(result)} translations")
|
|
return result
|
|
except json.JSONDecodeError as e:
|
|
logger.error(f"Translation API JSON parse error: {e}")
|
|
return {}
|
|
except Exception as e:
|
|
logger.error(f"Translation API error: {type(e).__name__}: {e}")
|
|
return {}
|