Files
Jason Stedwell c1abe36822 phase 0
2026-06-17 16:03:26 -05:00

108 lines
4.3 KiB
Python

"""
translator.py — Chinese to English part name translation via Claude API.
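Detects names containing CJK ideographs (U+4E00–U+9FFF, U+3400–U+4DBF).
Batches all names in a single API call per file.
Tags each affected row in the notes column: machine-translated,
ambiguous-translation, translation-missing, or translation-failed.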
"""
import json
import logging
import os
import re
import pandas as pd
logger = logging.getLogger("step_processor.translator")

# Matches a single character from CJK Unified Ideographs (U+4E00–U+9FFF)
# or CJK Unified Ideographs Extension A (U+3400–U+4DBF).
CJK_PATTERN = re.compile(r'[一-鿿㐀-䶿]')

# System prompt for the translation request. The " [?]" instruction matters:
# translate_bom() tags a row "ambiguous-translation" only when the returned
# English text contains "[?]", so the model has to be told to emit that marker.
SYSTEM_PROMPT = (
    "You are a mechanical engineering translator specializing in Chinese "
    "manufacturing CAD files for display and enclosure products. "
    "Translate the following part names from Chinese to English. "
    "Preserve technical precision. Use standard hardware/manufacturing terminology. "
    'If a translation is uncertain or ambiguous, give your best guess and append " [?]" '
    "to the English text. "
    "Output ONLY a JSON object mapping original Chinese to translated English, nothing else.\n"
    'Example: {"安装支架": "Mounting Bracket", "螺钉M4": "M4 Screw", "前面板": "Front Panel"}'
)
def has_chinese(text: str) -> bool:
    """Report whether *text*, coerced with ``str()``, matches CJK_PATTERN anywhere."""
    as_string = str(text)
    return CJK_PATTERN.search(as_string) is not None
def _append_note(existing, tag: str) -> str:
    """Return *existing* with *tag* appended after "; ".

    None, NaN/NA and empty values count as "no existing note", in which case
    the result is just *tag*.
    """
    is_missing = existing is None or (
        pd.api.types.is_scalar(existing) and pd.isna(existing)
    )
    if is_missing or not str(existing):
        return tag
    return f"{existing}; {tag}"


def translate_bom(df: pd.DataFrame, model_name: str = "") -> pd.DataFrame:
    """Translate Chinese part names in *df* via the Claude API.

    Every row whose ``part_name_original`` contains CJK characters gets a tag
    appended to its ``notes`` cell:

    * ``machine-translated``    - a translation was returned and stored in
      ``part_name_english``.
    * ``ambiguous-translation`` - as above, but the translation contains "[?]".
    * ``translation-missing``   - the API answered but had no usable entry for
      this name; ``part_name_english`` is set to the original name.
    * ``translation-failed``    - the API returned nothing at all;
      ``part_name_english`` is left untouched.

    Args:
        df: Frame with ``part_name_original`` and ``notes`` columns.
            ``part_name_english`` is created if it does not exist.
        model_name: Passed through to the API call as prompt context.

    Returns:
        The same ``df`` object, modified in place.
    """
    needs_translation = df["part_name_original"].apply(has_chinese)
    # Work by position rather than by index label so that frames with
    # duplicate index labels are updated one row at a time.
    mask = needs_translation.to_numpy(dtype=bool)
    flagged_rows = [pos for pos, flagged in enumerate(mask) if flagged]
    if not flagged_rows:
        logger.info("No Chinese part names detected — translation skipped")
        return df

    chinese_names = df.loc[mask, "part_name_original"].unique().tolist()
    logger.info("Translating %d Chinese part names...", len(chinese_names))
    translation_map = _call_claude_api(chinese_names, model_name)

    # Columns are rebuilt as plain lists and assigned back whole; this avoids
    # writing strings cell-by-cell into a column that may be all-NaN floats.
    notes = df["notes"].tolist()

    if not isinstance(translation_map, dict) or not translation_map:
        logger.warning("Translation API returned no results — retaining original names")
        for pos in flagged_rows:
            notes[pos] = _append_note(notes[pos], "translation-failed")
        df["notes"] = notes
        return df

    originals = df["part_name_original"].tolist()
    if "part_name_english" in df.columns:
        english = df["part_name_english"].tolist()
    else:
        english = [None] * len(df)

    translated_count = 0
    for pos in flagged_rows:
        original = originals[pos]
        candidate = translation_map.get(original)
        # Only a non-blank string is a usable translation; the model may
        # return null, numbers or nested structures for some keys.
        if isinstance(candidate, str) and candidate.strip():
            english[pos] = candidate
            note_tag = "ambiguous-translation" if "[?]" in candidate else "machine-translated"
            translated_count += 1
        else:
            english[pos] = original
            note_tag = "translation-missing"
        notes[pos] = _append_note(notes[pos], note_tag)

    df["part_name_english"] = english
    df["notes"] = notes

    missing_count = len(flagged_rows) - translated_count
    if missing_count:
        logger.warning(
            "%d parts had no usable translation — original names kept", missing_count
        )
    logger.info("Translated %d parts", translated_count)
    return df
def get_translation_map(df: pd.DataFrame) -> dict:
    """Return a dict of original name -> English name for translated rows.

    A row counts as translated when both ``part_name_original`` and
    ``part_name_english`` are present and differ. Rows where either name is
    missing (None/NaN) are skipped: a plain ``!=`` comparison treats a missing
    value as "different", which would put ``original -> NaN`` entries in the map.
    If the same original name appears more than once, the last row wins.
    """
    originals = df["part_name_original"]
    english = df["part_name_english"]
    translated = originals.notna() & english.notna() & (originals != english)
    return dict(zip(originals[translated], english[translated]))
def _call_claude_api(names: list, model_name: str = "") -> dict:
    """Translate *names* with a single batched Claude API call.

    Args:
        names: Chinese part names to translate.
        model_name: Optional product model name; when given it is added to
            the user message as context.

    Returns:
        Dict mapping original name -> English translation. The dict is empty
        when there is nothing to translate, ``ANTHROPIC_API_KEY`` is unset,
        the ``anthropic`` package is not installed, the request fails, or the
        reply is not a JSON object. Entries whose value is not a non-blank
        string are dropped. This function never raises for API problems;
        they are logged instead.
    """
    if not names:
        # Nothing to translate: skip the network round trip.
        return {}

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        logger.error("ANTHROPIC_API_KEY not set — translation unavailable")
        return {}
    try:
        import anthropic
    except ImportError:
        logger.error("anthropic package not installed — pip install anthropic")
        return {}

    names_json = json.dumps(names, ensure_ascii=False)
    user_msg = f"Translate these part names from Chinese to English:\n{names_json}"
    if model_name:
        user_msg += f"\n\nContext: Parts from a {model_name} display enclosure assembly."

    try:
        client = anthropic.Anthropic(api_key=api_key)
        response = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=2048,
            system=SYSTEM_PROMPT,
            messages=[{"role": "user", "content": user_msg}],
        )
        if getattr(response, "stop_reason", None) == "max_tokens":
            # A truncated reply usually fails JSON parsing below; say why.
            logger.warning(
                "Translation response hit max_tokens — output is likely truncated"
            )
        # Concatenate every text block instead of assuming content[0] exists
        # and is text.
        text = "".join(
            getattr(block, "text", "") or "" for block in response.content
        ).strip()
        # The model may wrap the JSON in prose or a code fence; keep only the
        # span from the first "{" to the last "}".
        json_match = re.search(r'\{.*\}', text, re.DOTALL)
        if json_match:
            text = json_match.group(0)
        result = json.loads(text)
    except json.JSONDecodeError as e:
        logger.error(f"Translation API JSON parse error: {e}")
        return {}
    except Exception as e:
        # Deliberate best-effort boundary: any API/network failure degrades
        # to "no translations" rather than aborting the caller.
        logger.error(f"Translation API error: {type(e).__name__}: {e}")
        return {}

    if not isinstance(result, dict):
        logger.error(
            "Translation API returned a JSON %s, expected an object",
            type(result).__name__,
        )
        return {}

    translations = {
        original: english
        for original, english in result.items()
        if isinstance(english, str) and english.strip()
    }
    dropped = len(result) - len(translations)
    if dropped:
        logger.warning("Ignored %d non-text or empty translation entries", dropped)
    logger.info(f"API returned {len(translations)} translations")
    return translations