# step-parse/skill.src/modules/rewriter.py

"""
rewriter.py — STEP label rewriter for Chinese→English translation.

Produces {stem}_EN.step — NEVER modifies source file.
Targets only PRODUCT entity name strings.
Validates entity count before/after to ensure file integrity.
"""
import logging
import re
from pathlib import Path

# Module logger, namespaced under the "step_processor" hierarchy.
logger = logging.getLogger("step_processor.rewriter")

# PRODUCT_PATTERN captures the first two quoted strings of a PRODUCT record:
#     #N = PRODUCT('id', 'name', 'description', ...)
# ISO 10303-21 PRODUCT has two name fields; CAD viewers typically display the
# second.  Chinese CAD exports set both to the same Chinese string, so both
# must be replaced.  Matching is case-insensitive.
PRODUCT_PATTERN = re.compile(
    r"(#\d+\s*=\s*PRODUCT\s*\(\s*')"  # group 1: prefix up to the opening quote
    r"([^']*)"                        # group 2: id text
    r"(',\s*')"                       # group 3: quote, comma, quote separator
    r"([^']*)"                        # group 4: name text
    r"(')",                           # group 5: closing quote of the name
    flags=re.IGNORECASE,
)

# ENTITY_PATTERN matches the start of an entity instance at the beginning of a
# line ("#N = KEYWORD("); rewrite_step counts these before and after rewriting.
ENTITY_PATTERN = re.compile(
    r"^#\d+\s*=\s*\S+\s*\(",
    flags=re.MULTILINE,
)


def _read_step_for_rewrite(source_path: Path) -> str:
    """Read a STEP file as text, choosing between UTF-8 and GBK.

    STEP files from Chinese CAD tools may embed raw GBK bytes in PRODUCT name
    strings.  Such bytes are (almost always) invalid UTF-8, so a strict UTF-8
    decode raises ``UnicodeDecodeError`` and GBK is tried next.  If neither
    codec accepts the bytes, latin-1 is used as a last resort because it maps
    every byte to a character and therefore cannot fail.

    Text that decodes cleanly as UTF-8 is accepted as-is, even when it
    contains U+FFFD: strict decoding never inserts that character, so its
    presence means it is really in the file, and re-decoding valid UTF-8 as
    GBK would only garble every correctly encoded label.

    Args:
        source_path: Path of the STEP file to read.

    Returns:
        The decoded file content with ``\\r\\n`` and ``\\r`` line endings
        normalised to ``\\n`` (the same translation ``Path.read_text``
        applies).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Read once and decode in memory rather than re-reading per encoding.
    raw = source_path.read_bytes()
    for enc in ("utf-8", "gbk"):
        try:
            text = raw.decode(enc)
        except (UnicodeDecodeError, LookupError):
            continue
        break
    else:
        # latin-1 accepts any byte sequence, so this decode always succeeds.
        text = raw.decode("latin-1")
    # Keep the universal-newline behaviour callers got from read_text().
    return text.replace("\r\n", "\n").replace("\r", "\n")


def rewrite_step(source_path: Path, translation_map: dict):
    """Write an English-labelled copy of a STEP file next to the original.

    The source file is only read, never modified.  PRODUCT id/name strings
    that appear as keys in ``translation_map`` are replaced by the mapped
    text and the result is written, UTF-8 encoded, to ``{stem}_EN.step`` in
    the source file's directory.

    Args:
        source_path: STEP file to read.
        translation_map: Maps original label text to its replacement text.

    Returns:
        The path of the written ``_EN.step`` file, or ``None`` when the map
        is empty, none of its keys occur in the file, no PRODUCT entity was
        rewritten, the entity-count check failed, or the file could not be
        read or written.
    """
    if not translation_map:
        logger.info("No translations to apply — _EN.step skipped")
        return None
    try:
        source_text = _read_step_for_rewrite(source_path)
    except Exception as e:
        logger.error(f"Could not read source STEP: {e}")
        return None
    # Cheap pre-check: skip the regex pass if no key occurs anywhere.
    if not any(orig in source_text for orig in translation_map):
        logger.info("No Chinese labels in STEP text — _EN.step skipped")
        return None

    # Substitute over the whole text rather than line by line: STEP entities
    # may wrap across physical lines, and the \s* parts of the PRODUCT
    # pattern can only bridge a line break when the text is not pre-split.
    output_text, replaced_count = _replace_product_names(
        source_text, translation_map)

    # Integrity check: rewriting labels must not add or remove entities.
    original_count = len(ENTITY_PATTERN.findall(source_text))
    new_count = len(ENTITY_PATTERN.findall(output_text))
    if new_count != original_count:
        logger.error(
            f"Entity count mismatch: {original_count} → {new_count}. "
            "Aborting — source file untouched.")
        return None
    if replaced_count == 0:
        logger.info("No PRODUCT entities matched — _EN.step skipped")
        return None

    out_path = source_path.parent / f"{source_path.stem}_EN.step"
    try:
        out_path.write_text(output_text, encoding="utf-8")
    except Exception as e:
        logger.error(f"Failed to write _EN.step: {e}")
        # Best-effort removal of a partially written file; a failure here
        # must not escape, because callers rely on the None return.
        try:
            out_path.unlink(missing_ok=True)
        except OSError as cleanup_error:
            logger.warning(
                f"Could not remove incomplete _EN.step: {cleanup_error}")
        return None
    logger.info(f"_EN.step written: {out_path.name} ({replaced_count} labels replaced)")
    return out_path


def _replace_product_names(line: str, translation_map: dict):
    """Replace the id and name strings of PRODUCT entities found in ``line``.

    For each PRODUCT match, the id field is looked up in ``translation_map``
    first and the name field second; the first non-empty translation found is
    written into BOTH fields.  Matches with no translation are left unchanged.

    Args:
        line: Text to process.  Despite the name, any amount of STEP text
            may be passed, not just a single line.
        translation_map: Maps original label text to its replacement text.

    Returns:
        A ``(new_text, count)`` tuple where ``count`` is the number of
        PRODUCT entities that were rewritten.
    """
    count = 0

    def replacer(m):
        nonlocal count
        # Try the id field first (group 2), fall back to the name field
        # (group 4).
        translated = (translation_map.get(m.group(2))
                      or translation_map.get(m.group(4)))
        if not translated:
            return m.group(0)
        count += 1
        # The text lands inside a single-quoted STEP string, where an
        # apostrophe must be written doubled; a bare one would end the
        # string early and corrupt the entity.
        escaped = translated.replace("'", "''")
        # NOTE(review): both fields receive the same translation, so when
        # only one of them is in the map the other one's original text
        # (e.g. a part number) is overwritten — confirm this is intended.
        return m.group(1) + escaped + m.group(3) + escaped + m.group(5)

    new_line = PRODUCT_PATTERN.sub(replacer, line)
    return new_line, count