Files
Jason Stedwell c1abe36822 phase 0
2026-06-17 16:03:26 -05:00

104 lines
4.0 KiB
Python

"""
rewriter.py — STEP label rewriter for Chinese→English translation.
Produces {stem}_EN.step — NEVER modifies source file.
Targets only the id and name string fields of PRODUCT entities.
Validates entity count before/after to ensure file integrity.
"""
import logging
import re
from pathlib import Path
logger = logging.getLogger("step_processor.rewriter")

# Matches the first two quoted arguments of a PRODUCT instance, i.e. the
# 'id' and 'name' in:
#     #N = PRODUCT('id', 'name', 'description', ...)
# Chinese CAD exports set both to the same Chinese string and CAD viewers
# typically display the second one, so the rewriter needs access to both.
# Capture groups:
#   1  everything up to and including the opening quote of the id
#   2  the id text
#   3  closing quote of the id, the comma, opening quote of the name
#   4  the name text
#   5  closing quote of the name
PRODUCT_PATTERN = re.compile(
    r"(#\d+\s*=\s*PRODUCT\s*\(\s*')([^']*)(',\s*')([^']*)(')",
    flags=re.IGNORECASE,
)

# Matches the start of an entity instance ("#N = NAME(") at the beginning of
# a line; the number of matches is compared before/after a rewrite.
ENTITY_PATTERN = re.compile(
    r"^#\d+\s*=\s*\S+\s*\(",
    flags=re.MULTILINE,
)
def _read_step_for_rewrite(source_path: Path) -> str:
    """Return the text of a STEP file, trying UTF-8, then GBK, then Latin-1.

    STEP files from Chinese CAD tools may embed raw GBK bytes in PRODUCT
    name strings. Such bytes either fail to decode as UTF-8 or, if the file
    was already mangled upstream, show up as U+FFFD replacement characters.
    Both cases make the Chinese→English lookup fail, so either one causes a
    retry with GBK.

    Args:
        source_path: Path of the STEP file to read.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    for enc in ("utf-8", "gbk"):
        try:
            text = source_path.read_text(encoding=enc)
        except (UnicodeDecodeError, LookupError):
            continue
        # Spell the replacement character as an escape so the check cannot
        # silently degrade to `'' in text` (always true) if the source file
        # is re-encoded or sanitised.
        if enc == "utf-8" and "\ufffd" in text:
            continue  # has replacement chars — retry as GBK
        return text
    # Last resort: Latin-1 maps every byte to a code point, so this cannot
    # raise a decode error.
    return source_path.read_text(encoding="latin-1", errors="replace")
def rewrite_step(source_path: Path, translation_map: dict) -> "Path | None":
    """Produce an English-labelled copy of a STEP file.

    The copy is written as UTF-8 to ``{stem}_EN.step`` in the same directory
    as the source. The source file itself is only read, never written.

    Args:
        source_path: Path of the STEP file to translate.
        translation_map: Mapping of original (Chinese) label -> English
            label.

    Returns:
        The path of the written ``_EN.step`` file, or ``None`` when there is
        nothing to translate, nothing was replaced, the entity count changed
        during rewriting, or reading/writing failed.
    """
    if not translation_map:
        logger.info("No translations to apply — _EN.step skipped")
        return None

    try:
        source_text = _read_step_for_rewrite(source_path)
    except Exception as e:
        logger.error(f"Could not read source STEP: {e}")
        return None

    # Cheap pre-check: skip the regex pass when no key occurs anywhere.
    if not any(orig in source_text for orig in translation_map):
        logger.info("No Chinese labels in STEP text — _EN.step skipped")
        return None

    # Substitute over the whole text rather than line by line: the PRODUCT
    # pattern accepts whitespace (including newlines) between tokens, so this
    # also covers records that an exporter wrapped across several lines.
    output_text, replaced_count = _replace_product_names(
        source_text, translation_map
    )

    # Integrity check: the rewrite must not add or remove entity instances.
    original_count = len(ENTITY_PATTERN.findall(source_text))
    new_count = len(ENTITY_PATTERN.findall(output_text))
    if new_count != original_count:
        logger.error(
            f"Entity count mismatch: {original_count} -> {new_count}. "
            "Aborting — source file untouched."
        )
        return None

    if replaced_count == 0:
        logger.info("No PRODUCT entities matched — _EN.step skipped")
        return None

    out_path = source_path.parent / f"{source_path.stem}_EN.step"
    try:
        out_path.write_text(output_text, encoding="utf-8")
    except Exception as e:
        logger.error(f"Failed to write _EN.step: {e}")
        # Best-effort removal of a partially written file; a failure here
        # must not escape, since callers expect None on any write failure.
        try:
            out_path.unlink(missing_ok=True)
        except OSError:
            pass
        return None

    logger.info(f"_EN.step written: {out_path.name} ({replaced_count} labels replaced)")
    return out_path
def _replace_product_names(line: str, translation_map: dict) -> "tuple[str, int]":
    """Replace the id and name strings of PRODUCT records found in *line*.

    For each PRODUCT match, the id field is looked up in *translation_map*
    first and the name field second; the first non-empty translation found
    is written into BOTH fields. Matches with no translation are left
    unchanged.

    Args:
        line: Text to process (a single line or a larger chunk of a STEP
            file).
        translation_map: Mapping of original label -> English label.

    Returns:
        A ``(new_text, count)`` tuple where ``count`` is the number of
        PRODUCT records that were rewritten.
    """
    count = 0

    def replacer(m):
        nonlocal count
        # Try id field first (group 2), fall back to name field (group 4).
        # NOTE(review): when id and name differ, both still receive the same
        # translation — confirm this is intended for non-identical fields.
        translated = translation_map.get(m.group(2)) or translation_map.get(m.group(4))
        if not translated:
            return m.group(0)
        count += 1
        # The text is inserted inside a '...' STEP string literal, where an
        # apostrophe must be written as two apostrophes; inserting it raw
        # would terminate the literal early and corrupt the record.
        safe = translated.replace("'", "''")
        # Replace both the id field and the name field.
        return m.group(1) + safe + m.group(3) + safe + m.group(5)

    new_line = PRODUCT_PATTERN.sub(replacer, line)
    return new_line, count