phase 0
This commit is contained in:
@@ -0,0 +1,103 @@
|
||||
"""
|
||||
rewriter.py — STEP label rewriter for Chinese→English translation.

Produces {stem}_EN.step next to the source, written as UTF-8 — NEVER modifies the source file.
Targets only the two leading strings (id and name) of PRODUCT entities.
Validates the entity count before/after to ensure file integrity.
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger("step_processor.rewriter")

# Targets both quoted strings in: #N = PRODUCT('id', 'name', 'description', ...)
# ISO 10303-21 PRODUCT has two name fields; CAD viewers typically display the second.
# Chinese CAD exports set both to the same Chinese string, so both must be replaced.
#
# A string body is a run of non-apostrophe characters and doubled apostrophes
# ('' is how Part 21 encodes a literal apostrophe), so a name containing an
# apostrophe is captured whole instead of being cut at the first quote.
# Whitespace (including line breaks) is accepted on both sides of the comma
# that separates the two strings.
#
# Groups: (prefix) (id) (sep) (name) (suffix-quote)
# Concatenating groups 1..5 reproduces the full match.
PRODUCT_PATTERN = re.compile(
    r"(#\d+\s*=\s*PRODUCT\s*\(\s*')((?:[^']|'')*)('\s*,\s*')((?:[^']|'')*)(')",
    re.IGNORECASE,
)

# Matches the start of a line of the form "#N = NAME(".
ENTITY_PATTERN = re.compile(r"^#\d+\s*=\s*\S+\s*\(", re.MULTILINE)
|
||||
|
||||
|
||||
def _read_step_for_rewrite(source_path: Path) -> str:
    """Read a STEP file as text, trying UTF-8 first and then GBK.

    The file is read from disk once and decoded in memory. Decoding is
    strict, so bytes that are not valid UTF-8 raise and GBK is tried next.
    A file that decodes as UTF-8 but already contains U+FFFD is also
    retried as GBK. If that retry fails, the UTF-8 text is returned, because
    a Latin-1 decode of a valid UTF-8 file would garble every non-ASCII
    character. Latin-1 is used only when neither UTF-8 nor GBK can decode
    the bytes at all.

    Line endings (CRLF and lone CR) are normalised to a single line feed,
    matching what text-mode reading with default settings produces.

    Args:
        source_path: Path of the STEP file to read.

    Returns:
        The decoded file content.

    Raises:
        OSError: If the file cannot be read.
    """
    raw = source_path.read_bytes()

    text = None
    utf8_with_replacement = None
    for enc in ("utf-8", "gbk"):
        try:
            decoded = raw.decode(enc)
        except (UnicodeDecodeError, LookupError):
            continue
        if enc == "utf-8" and "\ufffd" in decoded:
            # Already contains replacement chars — remember it, retry as GBK.
            utf8_with_replacement = decoded
            continue
        text = decoded
        break

    if text is None:
        if utf8_with_replacement is not None:
            text = utf8_with_replacement
        else:
            # Latin-1 maps every byte to a code point, so this cannot fail.
            text = raw.decode("latin-1")

    return text.replace("\r\n", "\n").replace("\r", "\n")
|
||||
|
||||
|
||||
def rewrite_step(source_path: Path, translation_map: dict):
    """Produce an English-labeled copy of a STEP file.

    The source file is only read. The copy is written as UTF-8 to
    ``{stem}_EN.step`` in the same directory as the source.

    Substitution runs over the whole text rather than line by line, so a
    PRODUCT record that the exporter wrapped across several lines is still
    matched (the pattern's whitespace classes accept line breaks).

    Args:
        source_path: Path of the STEP file to read.
        translation_map: Mapping of original label text to its replacement.

    Returns:
        Path of the written file, or None when the map is empty, the source
        cannot be read, no map key occurs in the text, the entity count
        differs after substitution, nothing was replaced, or writing fails.
    """
    if not translation_map:
        logger.info("No translations to apply — _EN.step skipped")
        return None

    try:
        source_text = _read_step_for_rewrite(source_path)
    except Exception as e:
        logger.error(f"Could not read source STEP: {e}")
        return None

    # Cheap pre-check: skip all regex work when no key appears anywhere.
    if not any(orig in source_text for orig in translation_map):
        logger.info("No Chinese labels in STEP text — _EN.step skipped")
        return None

    original_count = len(ENTITY_PATTERN.findall(source_text))
    output_text, replaced_count = _replace_product_names(source_text, translation_map)

    # Integrity guard: substitution must not add or remove entity lines.
    new_count = len(ENTITY_PATTERN.findall(output_text))
    if new_count != original_count:
        logger.error(
            f"Entity count mismatch: {original_count} → {new_count}. "
            "Aborting — source file untouched.")
        return None

    if replaced_count == 0:
        logger.info("No PRODUCT entities matched — _EN.step skipped")
        return None

    out_path = source_path.parent / f"{source_path.stem}_EN.step"
    try:
        out_path.write_text(output_text, encoding="utf-8")
    except Exception as e:
        logger.error(f"Failed to write _EN.step: {e}")
        # Best-effort removal of a partial file; a cleanup failure must not
        # escape, since callers expect None on any write failure.
        try:
            out_path.unlink(missing_ok=True)
        except OSError as cleanup_error:
            logger.warning(f"Could not remove partial _EN.step: {cleanup_error}")
        return None

    logger.info(f"_EN.step written: {out_path.name} ({replaced_count} labels replaced)")
    return out_path
|
||||
|
||||
|
||||
def _replace_product_names(line: str, translation_map: dict):
    """Translate the id and name strings of PRODUCT records found in *line*.

    For each PRODUCT_PATTERN match, the translation is looked up by the id
    string first and by the name string second. When one is found, both
    strings are replaced with it; otherwise the match is left unchanged.

    The translation is inserted as a STEP string literal, so every
    apostrophe in it is doubled. Inserting it raw would end the literal
    early and corrupt the record. Other characters, backslashes included,
    are inserted unchanged.

    Args:
        line: Text to process (a single line or a larger chunk of the file).
        translation_map: Mapping of original label text to its replacement.

    Returns:
        Tuple ``(new_text, count)`` where ``count`` is the number of PRODUCT
        records that were rewritten.
    """
    count = 0

    def replacer(m):
        nonlocal count
        # Try id field first (group 2), fall back to name field (group 4).
        translated = translation_map.get(m.group(2)) or translation_map.get(m.group(4))
        if not translated:
            return m.group(0)
        count += 1
        encoded = translated.replace("'", "''")
        # Replace both the id field and the name field.
        return m.group(1) + encoded + m.group(3) + encoded + m.group(5)

    new_line = PRODUCT_PATTERN.sub(replacer, line)
    return new_line, count
|
||||
Reference in New Issue
Block a user