""" translator.py — Chinese to English part name translation via Claude API. Detects CJK unicode range. Batches all names in a single API call per file. Flags uncertain translations in the notes column. """ import json import logging import os import re import pandas as pd logger = logging.getLogger("step_processor.translator") CJK_PATTERN = re.compile(r'[一-鿿㐀-䶿]') SYSTEM_PROMPT = ( "You are a mechanical engineering translator specializing in Chinese " "manufacturing CAD files for display and enclosure products. " "Translate the following part names from Chinese to English. " "Preserve technical precision. Use standard hardware/manufacturing terminology. " "Output ONLY a JSON object mapping original Chinese to translated English, nothing else.\n" 'Example: {"安装支架": "Mounting Bracket", "螺钉M4": "M4 Screw", "前面板": "Front Panel"}' ) def has_chinese(text: str) -> bool: """Return True if text contains CJK characters.""" return bool(CJK_PATTERN.search(str(text))) def translate_bom(df: pd.DataFrame, model_name: str = "") -> pd.DataFrame: """Detect Chinese part names and translate via Claude API.""" needs_translation = df["part_name_original"].apply(has_chinese) chinese_names = df.loc[needs_translation, "part_name_original"].unique().tolist() if not chinese_names: logger.info("No Chinese part names detected — translation skipped") return df logger.info(f"Translating {len(chinese_names)} Chinese part names...") translation_map = _call_claude_api(chinese_names, model_name) if not translation_map: logger.warning("Translation API returned no results — retaining original names") df.loc[needs_translation, "notes"] = ( df.loc[needs_translation, "notes"].apply( lambda n: (n + "; " if n else "") + "translation-failed")) return df for idx, row in df.iterrows(): original = row["part_name_original"] if has_chinese(original): translated = translation_map.get(original) if translated: df.at[idx, "part_name_english"] = translated note_tag = "ambiguous-translation" if "[?]" in translated else "machine-translated" else: df.at[idx, "part_name_english"] = original note_tag = "translation-missing" existing = row["notes"] df.at[idx, "notes"] = (existing + "; " if existing else "") + note_tag logger.info(f"Translated {needs_translation.sum()} parts") return df def get_translation_map(df: pd.DataFrame) -> dict: """Return dict of original → english for all translated rows.""" mask = df["part_name_original"] != df["part_name_english"] return dict(zip(df.loc[mask, "part_name_original"], df.loc[mask, "part_name_english"])) def _call_claude_api(names: list, model_name: str = "") -> dict: """Single batched Claude API call. Returns original→translated dict.""" api_key = os.environ.get("ANTHROPIC_API_KEY") if not api_key: logger.error("ANTHROPIC_API_KEY not set — translation unavailable") return {} try: import anthropic except ImportError: logger.error("anthropic package not installed — pip install anthropic") return {} names_json = json.dumps(names, ensure_ascii=False) user_msg = f"Translate these part names from Chinese to English:\n{names_json}" if model_name: user_msg += f"\n\nContext: Parts from a {model_name} display enclosure assembly." try: client = anthropic.Anthropic(api_key=api_key) response = client.messages.create( model="claude-haiku-4-5-20251001", max_tokens=2048, system=SYSTEM_PROMPT, messages=[{"role": "user", "content": user_msg}], ) text = response.content[0].text.strip() json_match = re.search(r'\{.*\}', text, re.DOTALL) if json_match: text = json_match.group(0) result = json.loads(text) logger.info(f"API returned {len(result)} translations") return result except json.JSONDecodeError as e: logger.error(f"Translation API JSON parse error: {e}") return {} except Exception as e: logger.error(f"Translation API error: {type(e).__name__}: {e}") return {}