104 lines
4.0 KiB
Python
104 lines
4.0 KiB
Python
"""
rewriter.py — STEP label rewriter for Chinese→English translation.

Produces {stem}_EN.step — NEVER modifies the source file.
Targets only the id/name strings of PRODUCT entities.
Validates the entity count before/after to ensure file integrity.
"""
|
|
import logging
|
|
import re
|
|
from pathlib import Path
|
|
|
|
logger = logging.getLogger("step_processor.rewriter")

# Matches the first two quoted strings of:
#     #N = PRODUCT('id', 'name', 'description', ...)
# The first string is the product id and the second the product name; CAD
# viewers typically display the second. Chinese CAD exports set both to the
# same Chinese string, so both must be replaced.
# Groups: (1) prefix up to and including the opening quote of the id,
#         (2) id text, (3) separator "', '", (4) name text,
#         (5) closing quote of the name.
# Limitation: [^']* stops at the first apostrophe, so a string containing an
# escaped apostrophe ('') is not matched and is left untouched.
PRODUCT_PATTERN = re.compile(
    r"(#\d+\s*=\s*PRODUCT\s*\(\s*')([^']*)(',\s*')([^']*)(')",
    re.IGNORECASE,
)

# Matches the start of an entity instance ("#N = KEYWORD(") that begins at the
# first column of a line; used to count entities before/after rewriting.
ENTITY_PATTERN = re.compile(r"^#\d+\s*=\s*\S+\s*\(", re.MULTILINE)
|
|
|
|
|
|
def _read_step_for_rewrite(source_path: Path) -> str:
    """Read a STEP file as text, trying UTF-8 first and GBK second.

    STEP files from Chinese CAD tools embed raw GBK bytes in PRODUCT name
    strings. Decoding is strict, so such bytes make the UTF-8 attempt raise
    UnicodeDecodeError (they are never silently turned into U+FFFD) and the
    file is then re-read as GBK, which lets the Chinese→English lookup find
    the original strings.

    A file that decodes cleanly as UTF-8 is returned as-is, even when it
    contains a literal U+FFFD character: re-decoding valid UTF-8 as GBK
    would only turn its non-ASCII text into mojibake.

    Args:
        source_path: Path of the STEP file to read.

    Returns:
        The decoded file content. If neither UTF-8 nor GBK can decode the
        file, it is decoded as latin-1, which accepts every byte value.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    for enc in ('utf-8', 'gbk'):
        try:
            return source_path.read_text(encoding=enc)
        except (UnicodeDecodeError, LookupError):
            # Wrong (or unavailable) codec for this file — try the next one.
            continue
    # Last resort: latin-1 maps every byte to a code point, so this cannot
    # fail on content; non-ASCII labels will not match the translation map.
    return source_path.read_text(encoding='latin-1', errors='replace')
|
|
|
|
|
|
def rewrite_step(source_path: Path, translation_map: dict):
    """
    Produce an English-labeled copy of the STEP file as ``{stem}_EN.step``.

    The source file is only read, never written. PRODUCT id/name strings
    that appear as keys in ``translation_map`` are replaced by the mapped
    value, and the number of entity instances (as counted by
    ENTITY_PATTERN) is compared before and after as an integrity check.

    Args:
        source_path: Path of the STEP file to read.
        translation_map: Mapping of original label → English label.

    Returns:
        The output Path, or None if no rewrite was needed (empty map, no
        key present in the file, no PRODUCT entity matched) or if it failed
        (read error, entity count mismatch, write error).
    """
    if not translation_map:
        logger.info("No translations to apply — _EN.step skipped")
        return None

    try:
        source_text = _read_step_for_rewrite(source_path)
    except Exception as e:
        logger.error(f"Could not read source STEP: {e}")
        return None

    # Cheap pre-check: skip the regex pass when no key occurs in the file.
    if not any(orig in source_text for orig in translation_map):
        logger.info("No Chinese labels in STEP text — _EN.step skipped")
        return None

    original_count = len(ENTITY_PATTERN.findall(source_text))

    # Substitute over the whole text rather than line by line: STEP is a
    # free-format syntax, so a PRODUCT entity may wrap across lines, and the
    # \s* parts of PRODUCT_PATTERN already span newlines.
    output_text, replaced_count = _replace_product_names(
        source_text, translation_map)

    new_count = len(ENTITY_PATTERN.findall(output_text))
    if new_count != original_count:
        logger.error(
            f"Entity count mismatch: {original_count} → {new_count}. "
            "Aborting — source file untouched.")
        return None

    if replaced_count == 0:
        logger.info("No PRODUCT entities matched — _EN.step skipped")
        return None

    out_path = source_path.parent / f"{source_path.stem}_EN.step"
    try:
        out_path.write_text(output_text, encoding="utf-8")
    except Exception as e:
        logger.error(f"Failed to write _EN.step: {e}")
        # Best-effort removal of a partially written file; a cleanup failure
        # must not escape, since callers expect None on failure.
        try:
            out_path.unlink(missing_ok=True)
        except OSError as cleanup_error:
            logger.warning(f"Could not remove partial _EN.step: {cleanup_error}")
        return None

    logger.info(f"_EN.step written: {out_path.name} ({replaced_count} labels replaced)")
    return out_path
|
|
|
|
|
|
def _replace_product_names(line: str, translation_map: dict):
    """Replace the id and name strings of PRODUCT entities found in *line*.

    For every PRODUCT_PATTERN match, the translation is looked up by the id
    string first and by the name string second. When one is found, BOTH
    strings are replaced with it (Chinese CAD exports set both fields to the
    same label). Matches without a translation are left unchanged.

    Args:
        line: Text to process (a single line or a larger chunk of STEP text).
        translation_map: Mapping of original label → English label.

    Returns:
        A ``(new_text, count)`` tuple, where ``count`` is the number of
        PRODUCT entities whose labels were replaced.
    """
    count = 0

    def replacer(m):
        nonlocal count
        # Try id field first (group 2), fall back to name field (group 4).
        translated = translation_map.get(m.group(2)) or translation_map.get(m.group(4))
        if not translated:
            return m.group(0)
        count += 1
        # A STEP (ISO 10303-21) string encodes an apostrophe as two
        # apostrophes; without this, a label such as "Operator's panel"
        # would terminate the string early and corrupt the entity.
        safe = translated.replace("'", "''")
        # Replace both the id field and the name field.
        return m.group(1) + safe + m.group(3) + safe + m.group(5)

    new_line = PRODUCT_PATTERN.sub(replacer, line)
    return new_line, count
|