""" rewriter.py — STEP label rewriter for Chinese→English translation. Produces {stem}_EN.step — NEVER modifies source file. Targets only PRODUCT entity name strings. Validates entity count before/after to ensure file integrity. """ import logging import re from pathlib import Path logger = logging.getLogger("step_processor.rewriter") # Targets both quoted strings in: #N = PRODUCT('id', 'name', 'description', ...) # ISO 10303-21 PRODUCT has two name fields; CAD viewers typically display the second. # Chinese CAD exports set both to the same Chinese string, so both must be replaced. # Groups: (prefix) (id) (sep) (name) (suffix-quote) PRODUCT_PATTERN = re.compile( r"(#\d+\s*=\s*PRODUCT\s*\(\s*')([^']*)(',\s*')([^']*)(')", re.IGNORECASE ) ENTITY_PATTERN = re.compile(r"^#\d+\s*=\s*\S+\s*\(", re.MULTILINE) def _read_step_for_rewrite(source_path: Path) -> str: """Read STEP file with GBK-aware encoding detection. STEP files from Chinese CAD tools embed raw GBK bytes in PRODUCT name strings. Reading as UTF-8 turns those bytes into replacement characters (U+FFFD), which makes the Chinese→English lookup fail. We try GBK when UTF-8 produces replacement chars so the regex substitution can actually find and replace the Chinese strings. """ for enc in ('utf-8', 'gbk'): try: text = source_path.read_text(encoding=enc) if enc == 'utf-8' and '�' in text: continue # has replacement chars — retry as GBK return text except (UnicodeDecodeError, LookupError): continue return source_path.read_text(encoding='latin-1', errors='replace') def rewrite_step(source_path: Path, translation_map: dict): """ Produce English-labeled copy of the STEP file. Returns output Path or None if no rewrite needed or failed. """ if not translation_map: logger.info("No translations to apply — _EN.step skipped") return None try: source_text = _read_step_for_rewrite(source_path) except Exception as e: logger.error(f"Could not read source STEP: {e}") return None original_count = len(ENTITY_PATTERN.findall(source_text)) if not any(orig in source_text for orig in translation_map): logger.info("No Chinese labels in STEP text — _EN.step skipped") return None lines = source_text.splitlines(keepends=True) replaced_count = 0 output_lines = [] for line in lines: new_line, count = _replace_product_names(line, translation_map) replaced_count += count output_lines.append(new_line) output_text = "".join(output_lines) new_count = len(ENTITY_PATTERN.findall(output_text)) if new_count != original_count: logger.error( f"Entity count mismatch: {original_count} → {new_count}. " "Aborting — source file untouched.") return None if replaced_count == 0: logger.info("No PRODUCT entities matched — _EN.step skipped") return None out_path = source_path.parent / f"{source_path.stem}_EN.step" try: out_path.write_text(output_text, encoding="utf-8") logger.info(f"_EN.step written: {out_path.name} ({replaced_count} labels replaced)") return out_path except Exception as e: logger.error(f"Failed to write _EN.step: {e}") out_path.unlink(missing_ok=True) return None def _replace_product_names(line: str, translation_map: dict): count = 0 def replacer(m): nonlocal count # Try id field first (group 2), fall back to name field (group 4) # Both are Chinese in Chinese CAD exports; replace both with English. translated = translation_map.get(m.group(2)) or translation_map.get(m.group(4)) if translated: count += 1 # Replace both the id field and the name field return m.group(1) + translated + m.group(3) + translated + m.group(5) return m.group(0) new_line = PRODUCT_PATTERN.sub(replacer, line) return new_line, count