""" bom.py — BOM extraction from STEP assembly tree. Primary: build123d assembly traversal. Fallback: STEP ISO 10303-21 text parser for PRODUCT entities. Always produces a complete DataFrame; saved as MPM-branded Excel (.xlsx). """ import logging import math import re from collections import Counter from pathlib import Path import pandas as pd from .loader import StepModel logger = logging.getLogger("step_processor.bom") BOM_COLUMNS = [ "part_number", "part_name_original", "part_name_english", "quantity", "level", "parent", "bbox_x_mm", "bbox_y_mm", "bbox_z_mm", "notes" ] # ── Excel output — MPM brand palette (hex, no #) ───────────────────────────── _MPM_DARK_SHADE = "232022" # header background + body text _MPM_LIGHT_SHADE = "F5F1EC" # header text _MPM_WARM_OFF_WHITE = "FAF7F2" # alternating row tint _MPM_MIDDLE_GOLD = "DCBB4F" # accent border under header row # Column rename + reorder for stakeholder-facing Excel output. # Internal processing always uses BOM_COLUMNS names. _XLSX_RENAME = { "part_name_english": "part_description", "part_name_original": "part_name_supplier", } _XLSX_ORDER = [ "part_number", "part_description", "quantity", "level", "parent", "bbox_x_mm", "bbox_y_mm", "bbox_z_mm", "notes", "part_name_supplier", ] _XLSX_HEADERS = { "part_number": "Part #", "part_description": "Part Description", "quantity": "Qty", "level": "Level", "parent": "Parent", "bbox_x_mm": "X (mm)", "bbox_y_mm": "Y (mm)", "bbox_z_mm": "Z (mm)", "notes": "Notes", "part_name_supplier": "Supplier Part Name", } _XLSX_WIDTHS = { "part_number": 12, "part_description": 40, "quantity": 8, "level": 7, "parent": 22, "bbox_x_mm": 11, "bbox_y_mm": 11, "bbox_z_mm": 11, "notes": 34, "part_name_supplier": 40, } def _safe(v): """Convert NaN/None → None so openpyxl writes blank cells.""" if v is None: return None try: if isinstance(v, float) and math.isnan(v): return None except Exception: pass return v def extract_bom(model: StepModel) -> pd.DataFrame: """Extract BOM from a loaded StepModel. Returns DataFrame with BOM_COLUMNS. Name-extraction strategy ------------------------ The STEP text parser is always the primary source for part_name_original. It reads raw bytes with GBK/UTF-8 encoding detection, correctly decoding Chinese CAD part labels. OCC's STEP reader (used by build123d) applies an internal codec that maps each 2-byte GBK sequence to an incorrect Unicode codepoint — the resulting strings cannot be recovered. We therefore never rely on child.label for part names when the file may contain CJK characters. OCC assembly walk (_bom_from_parts) is kept as a fallback only for files where the text parser returns nothing (e.g., non-PRODUCT-entity STEP files). """ rows = [] # Primary: STEP text parser — encoding-aware, correct for ASCII and CJK files rows = _bom_from_step_text(model.path) if not rows and model.backend == "build123d" and model.parts: # Fallback: OCC assembly walk (CJK names will be garbled but structure intact) logger.debug("STEP text parser empty — falling back to OCC assembly walk") rows = _bom_from_parts(model.parts) if not rows: logger.info("No assembly structure — treating as single part") stem = model.path.stem rows = [{"part_number": "001", "part_name_original": stem, "part_name_english": stem, "quantity": 1, "level": 0, "parent": "", "bbox_x_mm": None, "bbox_y_mm": None, "bbox_z_mm": None, "notes": "single-body file"}] df = pd.DataFrame(rows, columns=BOM_COLUMNS) if model.backend == "build123d": df = _enrich_bboxes(model, df) logger.info(f"BOM extracted: {len(df)} parts") return df def _bom_from_parts(parts: list) -> list: name_counts = Counter(p["name"] for p in parts) seen = set() rows = [] for i, p in enumerate(parts): name = p["name"] if name in seen: continue seen.add(name) rows.append({ "part_number": f"{len(rows)+1:03d}", "part_name_original": name, "part_name_english": name, "quantity": name_counts[name], "level": p.get("level", 0), "parent": p.get("parent", ""), "bbox_x_mm": None, "bbox_y_mm": None, "bbox_z_mm": None, "notes": "", }) return rows def _read_step_text(step_path: Path) -> str: """Read STEP file text with CJK-aware encoding detection. STEP files from Chinese manufacturers embed raw GBK bytes in name strings. Strategy: try UTF-8 first (correct for modern files); if replacement chars appear, retry as GBK (covers Chinese CAD exports); fall back to latin-1 which never fails (may contain mojibake, but at least it's readable). """ for enc in ('utf-8', 'gbk'): try: text = step_path.read_text(encoding=enc) if enc == 'utf-8' and '�' in text: # Replacement chars detected — GBK bytes can't be UTF-8 continue return text except (UnicodeDecodeError, LookupError): continue return step_path.read_text(encoding='latin-1', errors='replace') def _bom_from_step_text(step_path: Path) -> list: """Parse STEP ISO 10303-21 PRODUCT entities directly.""" try: text = _read_step_text(step_path) except Exception as e: logger.warning(f"Could not read STEP text: {e}") return [] pattern = re.compile(r"#\d+\s*=\s*PRODUCT\s*\(\s*'([^']*)'", re.IGNORECASE) seen = {} for match in pattern.finditer(text): name = match.group(1).strip() if not name or name.upper() in ("", "NONE"): continue if name in seen: seen[name]["quantity"] += 1 else: seen[name] = { "part_number": f"{len(seen)+1:03d}", "part_name_original": name, "part_name_english": name, "quantity": 1, "level": 0, "parent": "", "bbox_x_mm": None, "bbox_y_mm": None, "bbox_z_mm": None, "notes": "parsed from STEP text", } rows = list(seen.values()) if rows: logger.info(f"STEP text parser found {len(rows)} unique part names") return rows def _enrich_bboxes(model: StepModel, df: pd.DataFrame) -> pd.DataFrame: """Add bounding box dims per part from build123d. Best-effort.""" try: bb = model.shape.bounding_box() if len(df) == 1: df.at[0, "bbox_x_mm"] = round(bb.size.X, 2) df.at[0, "bbox_y_mm"] = round(bb.size.Y, 2) df.at[0, "bbox_z_mm"] = round(bb.size.Z, 2) else: children = getattr(model.shape, "children", []) or [] for i, child in enumerate(children): if i >= len(df): break try: cb = child.bounding_box() df.at[i, "bbox_x_mm"] = round(cb.size.X, 2) df.at[i, "bbox_y_mm"] = round(cb.size.Y, 2) df.at[i, "bbox_z_mm"] = round(cb.size.Z, 2) except Exception: pass except Exception as e: logger.debug(f"bbox enrichment skipped: {e}") return df def save_bom_csv(df: pd.DataFrame, step_path: Path) -> Path: """Write BOM DataFrame to CSV (legacy fallback).""" out_path = step_path.parent / f"{step_path.stem}_bom.csv" df.to_csv(out_path, index=False) logger.info(f"BOM CSV → {out_path.name}") return out_path def save_bom_xlsx(df: pd.DataFrame, step_path: Path) -> Path: """Write BOM DataFrame to an MPM-branded Excel workbook. Column changes vs internal schema (BOM_COLUMNS): part_name_english → Part Description (column 2) part_name_original → Supplier Part Name (last column) Falls back to CSV if openpyxl is unavailable. """ out_path = step_path.parent / f"{step_path.stem}_bom.xlsx" try: from openpyxl import Workbook from openpyxl.styles import Alignment, Border, Font, PatternFill, Side from openpyxl.utils import get_column_letter except ImportError: logger.warning("openpyxl not installed — falling back to CSV") return save_bom_csv(df, step_path) # Build display DataFrame disp = df.rename(columns=_XLSX_RENAME).copy() for col in _XLSX_ORDER: if col not in disp.columns: disp[col] = None disp = disp[_XLSX_ORDER] wb = Workbook() ws = wb.active ws.title = "Bill of Materials" gold_border = Border(bottom=Side(style="medium", color=_MPM_MIDDLE_GOLD)) hdr_fill = PatternFill("solid", fgColor=_MPM_DARK_SHADE) hdr_font = Font(name="Montserrat", bold=True, color=_MPM_LIGHT_SHADE, size=10) hdr_align = Alignment(horizontal="center", vertical="center", wrap_text=True) # Header row for c, col in enumerate(_XLSX_ORDER, 1): cell = ws.cell(row=1, column=c, value=_XLSX_HEADERS.get(col, col)) cell.font = hdr_font cell.fill = hdr_fill cell.alignment = hdr_align cell.border = gold_border ws.column_dimensions[get_column_letter(c)].width = _XLSX_WIDTHS.get(col, 15) ws.row_dimensions[1].height = 28 # Data rows body_font = Font(name="Open Sans", size=10, color=_MPM_DARK_SHADE) body_align = Alignment(horizontal="left", vertical="center") for r, (_, row) in enumerate(disp.iterrows(), 2): fill = PatternFill("solid", fgColor=_MPM_WARM_OFF_WHITE if r % 2 == 0 else "FFFFFF") for c, col in enumerate(_XLSX_ORDER, 1): cell = ws.cell(row=r, column=c, value=_safe(row[col])) cell.font = body_font cell.fill = fill cell.alignment = body_align ws.freeze_panes = "A2" wb.save(str(out_path)) logger.info(f"BOM XLSX → {out_path.name}") return out_path