feat: migrate to unified Parquet format with instant GitHub setup and 4x faster tagging

matt 2025-10-18 21:32:12 -07:00
parent e9e949aae3
commit 8435312c8f
58 changed files with 11921 additions and 3961 deletions

View file

@@ -4,30 +4,21 @@ Phase A refactor: Provides a thin API for building and querying the in-memory
card index keyed by tag/theme. Future enhancements may introduce a persistent
cache layer or precomputed artifact.
M4: Updated to load from all_cards.parquet instead of CSV shards.
Public API:
maybe_build_index() -> None
get_tag_pool(tag: str) -> list[dict]
lookup_commander(name: str) -> dict | None
The index is rebuilt lazily when any of the CSV shard files change mtime.
The index is rebuilt lazily when the Parquet file mtime changes.
"""
from __future__ import annotations
from pathlib import Path
import csv
import os
from typing import Any, Dict, List, Optional
CARD_FILES_GLOB = [
Path("csv_files/blue_cards.csv"),
Path("csv_files/white_cards.csv"),
Path("csv_files/black_cards.csv"),
Path("csv_files/red_cards.csv"),
Path("csv_files/green_cards.csv"),
Path("csv_files/colorless_cards.csv"),
Path("csv_files/cards.csv"), # fallback large file last
]
# M4: No longer need CSV file glob, we load from Parquet
THEME_TAGS_COL = "themeTags"
NAME_COL = "name"
COLOR_IDENTITY_COL = "colorIdentity"
@@ -53,75 +44,63 @@ def _normalize_rarity(raw: str) -> str:
r = (raw or "").strip().lower()
return _RARITY_NORM.get(r, r)
def _resolve_card_files() -> List[Path]:
"""Return base card file list + any extra test files supplied via env.
Environment variable: CARD_INDEX_EXTRA_CSV can contain a comma or semicolon
separated list of additional CSV paths (used by tests to inject synthetic
edge cases without polluting production shards).
"""
files: List[Path] = list(CARD_FILES_GLOB)
extra = os.getenv("CARD_INDEX_EXTRA_CSV")
if extra:
for part in extra.replace(";", ",").split(","):
p = part.strip()
if not p:
continue
path_obj = Path(p)
# Include even if missing; maybe created later in test before build
files.append(path_obj)
return files
def maybe_build_index() -> None:
"""Rebuild the index if any card CSV mtime changed.
"""Rebuild the index if the Parquet file mtime changed.
Incorporates any extra CSVs specified via CARD_INDEX_EXTRA_CSV.
M4: Loads from all_cards.parquet instead of CSV files.
"""
global _CARD_INDEX, _CARD_INDEX_MTIME
latest = 0.0
card_files = _resolve_card_files()
for p in card_files:
if p.exists():
mt = p.stat().st_mtime
if mt > latest:
latest = mt
if _CARD_INDEX and _CARD_INDEX_MTIME and latest <= _CARD_INDEX_MTIME:
return
new_index: Dict[str, List[Dict[str, Any]]] = {}
for p in card_files:
if not p.exists():
continue
try:
with p.open("r", encoding="utf-8", newline="") as fh:
reader = csv.DictReader(fh)
if not reader.fieldnames or THEME_TAGS_COL not in reader.fieldnames:
try:
from path_util import get_processed_cards_path
from deck_builder import builder_utils as bu
parquet_path = Path(get_processed_cards_path())
if not parquet_path.exists():
return
latest = parquet_path.stat().st_mtime
if _CARD_INDEX and _CARD_INDEX_MTIME and latest <= _CARD_INDEX_MTIME:
return
# Load from Parquet
df = bu._load_all_cards_parquet()
if df.empty or THEME_TAGS_COL not in df.columns:
return
new_index: Dict[str, List[Dict[str, Any]]] = {}
for _, row in df.iterrows():
name = row.get(NAME_COL) or row.get("faceName") or ""
tags = row.get(THEME_TAGS_COL)
# Handle tags (already a list after our conversion in builder_utils)
if not tags or not isinstance(tags, list):
continue
color_id = str(row.get(COLOR_IDENTITY_COL) or "").strip()
mana_cost = str(row.get(MANA_COST_COL) or "").strip()
rarity = _normalize_rarity(str(row.get(RARITY_COL) or ""))
for tg in tags:
if not tg:
continue
for row in reader:
name = row.get(NAME_COL) or row.get("faceName") or ""
tags_raw = row.get(THEME_TAGS_COL) or ""
tags = [t.strip(" '[]") for t in tags_raw.split(',') if t.strip()] if tags_raw else []
if not tags:
continue
color_id = (row.get(COLOR_IDENTITY_COL) or "").strip()
mana_cost = (row.get(MANA_COST_COL) or "").strip()
rarity = _normalize_rarity(row.get(RARITY_COL) or "")
for tg in tags:
if not tg:
continue
new_index.setdefault(tg, []).append({
"name": name,
"color_identity": color_id,
"tags": tags,
"mana_cost": mana_cost,
"rarity": rarity,
"color_identity_list": list(color_id) if color_id else [],
"pip_colors": [c for c in mana_cost if c in {"W","U","B","R","G"}],
})
except Exception:
continue
_CARD_INDEX = new_index
_CARD_INDEX_MTIME = latest
new_index.setdefault(tg, []).append({
"name": name,
"color_identity": color_id,
"tags": tags,
"mana_cost": mana_cost,
"rarity": rarity,
"color_identity_list": [c.strip() for c in color_id.split(',') if c.strip()],
"pip_colors": [c for c in mana_cost if c in {"W","U","B","R","G"}],
})
_CARD_INDEX = new_index
_CARD_INDEX_MTIME = latest
except Exception:
# Defensive: if anything fails, leave index unchanged
pass
def get_tag_pool(tag: str) -> List[Dict[str, Any]]:
return _CARD_INDEX.get(tag, [])
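For orientation, the module's public surface after this change is small: build lazily, then query by tag. A minimal usage sketch, assuming the module imports as `card_index` (the actual module path is not shown in this view) and using a placeholder tag; field names come from the dict built in maybe_build_index() above.

# Usage sketch of the Phase A card index API after the Parquet migration.
# Module name and the example tag are assumptions.
import card_index

card_index.maybe_build_index()                 # no-op unless all_cards.parquet mtime changed
pool = card_index.get_tag_pool("Lifegain")     # hypothetical tag, for illustration only
for card in pool[:5]:
    print(card["name"], card["rarity"], card["pip_colors"])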

View file

@@ -247,11 +247,13 @@ class CardSimilarity:
Returns:
Set of theme tag strings
"""
if pd.isna(tags) or not tags:
# M4: Handle both scalar NA (CSV) and array values (Parquet)
if pd.isna(tags) if isinstance(tags, (str, float, int, type(None))) else False:
return set()
if isinstance(tags, list):
return set(tags)
# M4: Parquet format - already a list
return set(tags) if tags else set()
if isinstance(tags, str):
# Handle string representation of list: "['tag1', 'tag2']"

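The hunk above keeps one `_extract_tags` working across both data shapes: Parquet hands the column over as a real list, while legacy CSV rows carried a string such as "['tag1', 'tag2']". A standalone sketch of that normalization, written as a hypothetical helper rather than the class method itself:

# Hypothetical helper mirroring the dual-format handling in _extract_tags.
import ast
import pandas as pd

def normalize_tags(tags) -> set[str]:
    # pd.isna() only makes sense for scalars; list/array inputs skip the check.
    if isinstance(tags, (str, float, int, type(None))) and pd.isna(tags):
        return set()
    # Parquet readers may hand back numpy arrays; coerce list-likes to a plain list.
    if hasattr(tags, "tolist"):
        tags = tags.tolist()
    if isinstance(tags, (list, tuple, set)):
        return {str(t).strip() for t in tags if str(t).strip()}
    if isinstance(tags, str):
        try:
            parsed = ast.literal_eval(tags)     # "['tag1', 'tag2']" -> ['tag1', 'tag2']
            if isinstance(parsed, list):
                return {str(t).strip() for t in parsed if str(t).strip()}
        except (ValueError, SyntaxError):
            pass
        return {t.strip(" '\"[]") for t in tags.split(",") if t.strip(" '\"[]")}
    return set()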
View file

@@ -2,14 +2,14 @@
Responsibilities
================
- Read and normalize `commander_cards.csv` (shared with the deck builder).
- Read and normalize commander data from all_cards.parquet (M4 migration).
- Produce deterministic commander records with rich metadata (slug, colors,
partner/background flags, theme tags, Scryfall image URLs).
- Cache the parsed catalog and invalidate on file timestamp changes.
The loader operates without pandas to keep the web layer light-weight and to
simplify unit testing. It honors the `CSV_FILES_DIR` environment variable via
`path_util.csv_dir()` just like the CLI builder.
M4: Updated to load from all_cards.parquet instead of commander_cards.csv.
The loader uses pandas to filter commanders (isCommander == True) from the
unified Parquet data source.
"""
from __future__ import annotations
@@ -18,12 +18,10 @@ from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Mapping, Optional, Tuple
import ast
import csv
import os
import re
from urllib.parse import quote
from path_util import csv_dir
from deck_builder.partner_background_utils import analyze_partner_background
__all__ = [
@@ -204,9 +202,11 @@ def find_commander_record(name: str | None) -> CommanderRecord | None:
def _resolve_commander_path(source_path: str | os.PathLike[str] | None) -> Path:
"""M4: Resolve Parquet path instead of commander_cards.csv."""
if source_path is not None:
return Path(source_path).resolve()
return (Path(csv_dir()) / "commander_cards.csv").resolve()
from path_util import get_processed_cards_path
return Path(get_processed_cards_path()).resolve()
def _is_cache_valid(path: Path, cached: CommanderCatalog) -> bool:
@@ -221,24 +221,31 @@ def _is_cache_valid(path: Path, cached: CommanderCatalog) -> bool:
def _build_catalog(path: Path) -> CommanderCatalog:
"""M4: Load commanders from Parquet instead of CSV."""
if not path.exists():
raise FileNotFoundError(f"Commander CSV not found at {path}")
raise FileNotFoundError(f"Commander Parquet not found at {path}")
entries: List[CommanderRecord] = []
used_slugs: set[str] = set()
with path.open("r", encoding="utf-8", newline="") as handle:
reader = csv.DictReader(handle)
if reader.fieldnames is None:
raise ValueError("Commander CSV missing header row")
# Load commanders from Parquet (isCommander == True)
from deck_builder import builder_utils as bu
df = bu._load_all_cards_parquet()
if df.empty or 'isCommander' not in df.columns:
raise ValueError("Parquet missing isCommander column")
commanders_df = df[df['isCommander']].copy()
for index, row in enumerate(reader):
try:
record = _row_to_record(row, used_slugs)
except Exception:
continue
entries.append(record)
used_slugs.add(record.slug)
# Convert DataFrame rows to CommanderRecords
for _, row in commanders_df.iterrows():
try:
# Convert row to dict for _row_to_record
row_dict = row.to_dict()
record = _row_to_record(row_dict, used_slugs)
except Exception:
continue
entries.append(record)
used_slugs.add(record.slug)
stat_result = path.stat()
mtime_ns = getattr(stat_result, "st_mtime_ns", int(stat_result.st_mtime * 1_000_000_000))

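The commander catalog now comes straight out of the unified Parquet rather than commander_cards.csv. A minimal sketch of that filter, assuming pandas and the columns visible in this diff (name, isCommander, colorIdentity); any other column use is illustrative only.

# Sketch: pull commander rows from the unified Parquet the way _build_catalog does.
import pandas as pd

df = pd.read_parquet("card_files/processed/all_cards.parquet")
commanders = df[df["isCommander"]].copy()
for _, row in commanders.iterrows():
    row_dict = row.to_dict()        # same shape _row_to_record() consumes above
    print(row_dict["name"], row_dict.get("colorIdentity"))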
View file

@@ -224,10 +224,18 @@ def _maybe_refresh_partner_synergy(out_func=None, *, force: bool = False, root:
if not needs_refresh:
source_times: list[float] = []
candidates = [
root_path / "config" / "themes" / "theme_list.json",
root_path / "csv_files" / "commander_cards.csv",
]
# M4: Check all_cards.parquet instead of commander_cards.csv
try:
from path_util import get_processed_cards_path
parquet_path = Path(get_processed_cards_path())
candidates = [
root_path / "config" / "themes" / "theme_list.json",
parquet_path,
]
except Exception:
candidates = [
root_path / "config" / "themes" / "theme_list.json",
]
for candidate in candidates:
try:
if candidate.exists():
@@ -919,14 +927,16 @@ def _is_truthy_env(name: str, default: str = '1') -> bool:
def is_setup_ready() -> bool:
"""Fast readiness check: required files present and tagging completed.
We consider the system ready if csv_files/cards.csv exists and the
M4: Updated to check for all_cards.parquet instead of cards.csv.
We consider the system ready if card_files/processed/all_cards.parquet exists and the
.tagging_complete.json flag exists. Freshness (mtime) is enforced only
during auto-refresh inside _ensure_setup_ready, not here.
"""
try:
cards_path = os.path.join('csv_files', 'cards.csv')
from path_util import get_processed_cards_path
parquet_path = get_processed_cards_path()
flag_path = os.path.join('csv_files', '.tagging_complete.json')
return os.path.exists(cards_path) and os.path.exists(flag_path)
return os.path.exists(parquet_path) and os.path.exists(flag_path)
except Exception:
return False
@@ -983,20 +993,25 @@ def is_setup_stale() -> bool:
except Exception:
pass
# Fallback: compare cards.csv mtime
cards_path = os.path.join('csv_files', 'cards.csv')
if not os.path.exists(cards_path):
# Fallback: compare all_cards.parquet mtime (M4 update)
try:
from path_util import get_processed_cards_path
parquet_path = get_processed_cards_path()
if not os.path.exists(parquet_path):
return False
age_seconds = time.time() - os.path.getmtime(parquet_path)
return age_seconds > refresh_age_seconds
except Exception:
return False
age_seconds = time.time() - os.path.getmtime(cards_path)
return age_seconds > refresh_age_seconds
except Exception:
return False
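The fallback above reduces to a single age comparison against the Parquet mtime. As a standalone sketch (a hypothetical helper; the real code inlines this inside is_setup_stale):

# Hypothetical helper expressing the staleness predicate used by the fallback.
import os
import time

def _is_file_stale(path: str, max_age_seconds: float) -> bool:
    if not os.path.exists(path):
        return False    # a missing file is a readiness problem, not a staleness one
    return (time.time() - os.path.getmtime(path)) > max_age_seconds
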
def _ensure_setup_ready(out, force: bool = False) -> None:
"""Ensure card CSVs exist and tagging has completed; bootstrap if needed.
"""Ensure card data exists and tagging has completed; bootstrap if needed.
Mirrors the CLI behavior used in build_deck_full: if csv_files/cards.csv is
M4: Updated to check for all_cards.parquet instead of cards.csv.
Mirrors the CLI behavior used in build_deck_full: if the Parquet file is
missing, too old, or the tagging flag is absent, run initial setup and tagging.
"""
# Track whether a theme catalog export actually executed during this invocation
@@ -1201,7 +1216,9 @@ def _ensure_setup_ready(out, force: bool = False) -> None:
pass
try:
cards_path = os.path.join('csv_files', 'cards.csv')
# M4 (Parquet Migration): Check for processed Parquet file instead of CSV
from path_util import get_processed_cards_path # type: ignore
cards_path = get_processed_cards_path()
flag_path = os.path.join('csv_files', '.tagging_complete.json')
auto_setup_enabled = _is_truthy_env('WEB_AUTO_SETUP', '1')
# Allow tuning of time-based refresh; default 7 days
@@ -1215,14 +1232,14 @@ def _ensure_setup_ready(out, force: bool = False) -> None:
_write_status({"running": True, "phase": "setup", "message": "Forcing full setup and tagging...", "started_at": _dt.now().isoformat(timespec='seconds'), "percent": 0})
if not os.path.exists(cards_path):
out("cards.csv not found. Running initial setup and tagging...")
out(f"Processed Parquet not found ({cards_path}). Running initial setup and tagging...")
_write_status({"running": True, "phase": "setup", "message": "Preparing card database (initial setup)...", "started_at": _dt.now().isoformat(timespec='seconds'), "percent": 0})
refresh_needed = True
else:
try:
age_seconds = time.time() - os.path.getmtime(cards_path)
if age_seconds > refresh_age_seconds and not force:
out("cards.csv is older than 7 days. Refreshing data (setup + tagging)...")
out(f"Processed Parquet is older than {days} days. Refreshing data (setup + tagging)...")
_write_status({"running": True, "phase": "setup", "message": "Refreshing card database (initial setup)...", "started_at": _dt.now().isoformat(timespec='seconds'), "percent": 0})
refresh_needed = True
except Exception:
@@ -1239,6 +1256,55 @@ def _ensure_setup_ready(out, force: bool = False) -> None:
out("Setup/tagging required, but WEB_AUTO_SETUP=0. Please run Setup from the UI.")
_write_status({"running": False, "phase": "requires_setup", "message": "Setup required (auto disabled)."})
return
# Try downloading pre-tagged data from GitHub first (faster than local build)
try:
import urllib.request
import urllib.error
out("[SETUP] Attempting to download pre-tagged data from GitHub...")
_write_status({"running": True, "phase": "download", "message": "Downloading pre-tagged data from GitHub...", "percent": 5})
base_url = "https://raw.githubusercontent.com/mwisnowski/mtg_python_deckbuilder/similarity-cache-data"
files_to_download = [
("card_files/processed/all_cards.parquet", "card_files/processed/all_cards.parquet"),
("card_files/processed/.tagging_complete.json", "card_files/processed/.tagging_complete.json"),
("card_files/similarity_cache.parquet", "card_files/similarity_cache.parquet"),
("card_files/similarity_cache_metadata.json", "card_files/similarity_cache_metadata.json"),
]
download_success = True
for remote_path, local_path in files_to_download:
try:
remote_url = f"{base_url}/{remote_path}"
os.makedirs(os.path.dirname(local_path), exist_ok=True)
urllib.request.urlretrieve(remote_url, local_path)
out(f"[SETUP] Downloaded: {local_path}")
except urllib.error.HTTPError as e:
if e.code == 404:
out(f"[SETUP] File not available on GitHub (404): {remote_path}")
download_success = False
break
raise
if download_success:
out("[SETUP] ✓ Successfully downloaded pre-tagged data from GitHub. Skipping local setup/tagging.")
_write_status({
"running": False,
"phase": "done",
"message": "Setup complete (downloaded from GitHub)",
"percent": 100,
"finished_at": _dt.now().isoformat(timespec='seconds')
})
# Refresh theme catalog after successful download
_refresh_theme_catalog(out, force=False, fast_path=True)
return
else:
out("[SETUP] GitHub download incomplete. Falling back to local setup/tagging...")
_write_status({"running": True, "phase": "setup", "message": "GitHub download failed, running local setup...", "percent": 0})
except Exception as e:
out(f"[SETUP] GitHub download failed ({e}). Falling back to local setup/tagging...")
_write_status({"running": True, "phase": "setup", "message": "GitHub download failed, running local setup...", "percent": 0})
try:
from file_setup.setup import initial_setup # type: ignore
# Always run initial_setup when forced or when cards are missing/stale
@@ -1247,95 +1313,39 @@ def _ensure_setup_ready(out, force: bool = False) -> None:
out(f"Initial setup failed: {e}")
_write_status({"running": False, "phase": "error", "message": f"Initial setup failed: {e}"})
return
# Tagging with progress; support parallel workers for speed
# M4 (Parquet Migration): Use unified run_tagging with parallel support
try:
from tagging import tagger as _tagger # type: ignore
from settings import COLORS as _COLORS # type: ignore
colors = list(_COLORS)
total = len(colors)
use_parallel = str(os.getenv('WEB_TAG_PARALLEL', '1')).strip().lower() in {"1","true","yes","on"}
max_workers_env = os.getenv('WEB_TAG_WORKERS')
try:
max_workers = int(max_workers_env) if max_workers_env else None
except Exception:
max_workers = None
mode_label = "parallel" if use_parallel else "sequential"
_write_status({
"running": True,
"phase": "tagging",
"message": "Tagging cards (this may take a while)..." if not use_parallel else "Tagging cards in parallel...",
"color": None,
"percent": 0,
"color_idx": 0,
"color_total": total,
"message": f"Tagging all cards ({mode_label} mode)...",
"percent": 10,
"tagging_started_at": _dt.now().isoformat(timespec='seconds')
})
if use_parallel:
try:
import concurrent.futures as _f
completed = 0
with _f.ProcessPoolExecutor(max_workers=max_workers) as ex:
fut_map = {ex.submit(_tagger.load_dataframe, c): c for c in colors}
for fut in _f.as_completed(fut_map):
c = fut_map[fut]
try:
fut.result()
completed += 1
pct = int(completed * 100 / max(1, total))
_write_status({
"running": True,
"phase": "tagging",
"message": f"Tagged {c}",
"color": c,
"percent": pct,
"color_idx": completed,
"color_total": total,
})
except Exception as e:
out(f"Parallel tagging failed for {c}: {e}")
_write_status({"running": False, "phase": "error", "message": f"Tagging {c} failed: {e}", "color": c})
return
except Exception as e:
out(f"Parallel tagging init failed: {e}; falling back to sequential")
use_parallel = False
if not use_parallel:
for idx, _color in enumerate(colors, start=1):
try:
pct = int((idx - 1) * 100 / max(1, total))
# Estimate ETA based on average time per completed color
eta_s = None
try:
from datetime import datetime as __dt
ts = __dt.fromisoformat(json.load(open(os.path.join('csv_files', '.setup_status.json'), 'r', encoding='utf-8')).get('tagging_started_at')) # type: ignore
elapsed = max(0.0, (_dt.now() - ts).total_seconds())
completed = max(0, idx - 1)
if completed > 0:
avg = elapsed / completed
remaining = max(0, total - completed)
eta_s = int(avg * remaining)
except Exception:
eta_s = None
payload = {
"running": True,
"phase": "tagging",
"message": f"Tagging {_color}...",
"color": _color,
"percent": pct,
"color_idx": idx,
"color_total": total,
}
if eta_s is not None:
payload["eta_seconds"] = eta_s
_write_status(payload)
_tagger.load_dataframe(_color)
except Exception as e:
out(f"Tagging {_color} failed: {e}")
_write_status({"running": False, "phase": "error", "message": f"Tagging {_color} failed: {e}", "color": _color})
return
out(f"Starting unified tagging ({mode_label} mode)...")
_tagger.run_tagging(parallel=use_parallel, max_workers=max_workers)
_write_status({
"running": True,
"phase": "tagging",
"message": f"Tagging complete ({mode_label} mode)",
"percent": 90,
})
out(f"✓ Tagging complete ({mode_label} mode)")
except Exception as e:
out(f"Tagging failed to start: {e}")
_write_status({"running": False, "phase": "error", "message": f"Tagging failed to start: {e}"})
out(f"Tagging failed: {e}")
_write_status({"running": False, "phase": "error", "message": f"Tagging failed: {e}"})
return
try:
os.makedirs('csv_files', exist_ok=True)

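The per-color tagging loop is gone; the web setup path now delegates to a single run_tagging call in tagging.tagger, with parallelism controlled by the same environment variables shown above. A minimal invocation sketch (surrounding status reporting omitted):

# Sketch of the unified tagging entry point; signature and env vars taken from the diff.
import os
from tagging import tagger

use_parallel = os.getenv("WEB_TAG_PARALLEL", "1").strip().lower() in {"1", "true", "yes", "on"}
workers_env = os.getenv("WEB_TAG_WORKERS")
try:
    max_workers = int(workers_env) if workers_env else None
except ValueError:
    max_workers = None

# One call tags every card in all_cards.parquet instead of looping over color CSVs.
tagger.run_tagging(parallel=use_parallel, max_workers=max_workers)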
View file

@@ -124,135 +124,74 @@ def add_names(names: Iterable[str]) -> Tuple[int, int]:
def _enrich_from_csvs(target_names: Iterable[str]) -> Dict[str, Dict[str, object]]:
"""Return metadata for target names by scanning csv_files/*_cards.csv.
"""Return metadata for target names by scanning all_cards.parquet (M4).
Output: { Name: { 'tags': [..], 'type': str|None, 'colors': [..] } }
"""
from pathlib import Path
import json as _json
import csv as _csv
base = Path('csv_files')
meta: Dict[str, Dict[str, object]] = {}
want = {str(n).strip().lower() for n in target_names if str(n).strip()}
if not (base.exists() and want):
if not want:
return meta
csv_files = [p for p in base.glob('*_cards.csv') if p.name.lower() not in ('cards.csv', 'commander_cards.csv')]
def _norm(s: str) -> str: return str(s or '').strip().lower()
for path in csv_files:
try:
with path.open('r', encoding='utf-8', errors='ignore') as f:
reader = _csv.DictReader(f)
headers = [h for h in (reader.fieldnames or [])]
name_key = None
tags_key = None
type_key = None
colors_key = None
for h in headers:
hn = _norm(h)
if hn in ('name', 'card', 'cardname', 'card_name'):
name_key = h
if hn in ('tags', 'theme_tags', 'themetags', 'themetagsjson') or hn == 'themetags' or hn == 'themetagsjson':
tags_key = h
if hn in ('type', 'type_line', 'typeline'):
type_key = h
if hn in ('colors', 'coloridentity', 'color_identity', 'color'):
colors_key = h
if not tags_key:
for h in headers:
if h.strip() in ('ThemeTags', 'themeTags'):
tags_key = h
try:
from deck_builder import builder_utils as bu
df = bu._load_all_cards_parquet()
if df.empty:
return meta
# Filter to cards we care about
df['name_lower'] = df['name'].str.lower()
df_filtered = df[df['name_lower'].isin(want)].copy()
for _, row in df_filtered.iterrows():
nm = str(row.get('name') or '').strip()
if not nm:
continue
entry = meta.setdefault(nm, {"tags": [], "type": None, "colors": []})
# Tags (already a list after our conversion in builder_utils)
tags = row.get('themeTags')
if tags and isinstance(tags, list):
existing = entry.get('tags') or []
seen = {str(t).lower() for t in existing}
for t in tags:
t_str = str(t).strip()
if t_str and t_str.lower() not in seen:
existing.append(t_str)
seen.add(t_str.lower())
entry['tags'] = existing
# Type
if not entry.get('type'):
t_raw = str(row.get('type') or '').strip()
if t_raw:
tline = t_raw.split('—')[0].strip() if '—' in t_raw else t_raw
prim = None
for cand in ['Creature','Instant','Sorcery','Artifact','Enchantment','Planeswalker','Land','Battle']:
if cand.lower() in tline.lower():
prim = cand
break
if not colors_key:
for h in headers:
if h.strip() in ('ColorIdentity', 'colorIdentity'):
colors_key = h
break
if not name_key:
continue
for row in reader:
try:
nm = str(row.get(name_key) or '').strip()
if not nm:
continue
low = nm.lower()
if low not in want:
continue
entry = meta.setdefault(nm, {"tags": [], "type": None, "colors": []})
# Tags
if tags_key:
raw = (row.get(tags_key) or '').strip()
vals: List[str] = []
if raw:
if raw.startswith('['):
try:
arr = _json.loads(raw)
if isinstance(arr, list):
vals = [str(x).strip() for x in arr if str(x).strip()]
except Exception:
vals = []
if not vals:
parts = [p.strip() for p in raw.replace(';', ',').split(',')]
vals = [p for p in parts if p]
if vals:
existing = entry.get('tags') or []
seen = {str(t).lower() for t in existing}
for t in vals:
if str(t).lower() not in seen:
existing.append(str(t))
seen.add(str(t).lower())
entry['tags'] = existing
# Type
if type_key and not entry.get('type'):
t_raw = str(row.get(type_key) or '').strip()
if t_raw:
tline = t_raw.split('—')[0].strip() if '—' in t_raw else t_raw
prim = None
for cand in ['Creature','Instant','Sorcery','Artifact','Enchantment','Planeswalker','Land','Battle']:
if cand.lower() in tline.lower():
prim = cand
break
if not prim and tline:
prim = tline.split()[0]
if prim:
entry['type'] = prim
# Colors
if colors_key and not entry.get('colors'):
c_raw = str(row.get(colors_key) or '').strip()
cols: List[str] = []
if c_raw:
if c_raw.startswith('['):
try:
arr = _json.loads(c_raw)
if isinstance(arr, list):
cols = [str(x).strip().upper() for x in arr if str(x).strip()]
except Exception:
cols = []
if not cols:
parts = [p.strip().upper() for p in c_raw.replace(';', ',').replace('[','').replace(']','').replace("'",'').split(',') if p.strip()]
if parts:
cols = parts
if not cols:
for ch in c_raw:
if ch.upper() in ('W','U','B','R','G','C'):
cols.append(ch.upper())
if cols:
seen_c = set()
uniq = []
for c in cols:
if c not in seen_c:
uniq.append(c)
seen_c.add(c)
entry['colors'] = uniq
except Exception:
continue
except Exception:
continue
if not prim and tline:
prim = tline.split()[0]
if prim:
entry['type'] = prim
# Colors
if not entry.get('colors'):
colors_raw = str(row.get('colorIdentity') or '').strip()
if colors_raw:
parts = [c.strip() for c in colors_raw.split(',') if c.strip()]
entry['colors'] = parts
except Exception:
# Defensive: return empty or partial meta
pass
return meta
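The primary-type extraction above splits the MTG type line on its dash separator and then picks the first recognized card type. A compact sketch of that logic as a hypothetical standalone helper:

# Hypothetical helper mirroring the primary-type extraction in _enrich_from_csvs.
# Example type line: "Legendary Creature — Elf Druid" -> "Creature".
_CARD_TYPES = ['Creature', 'Instant', 'Sorcery', 'Artifact', 'Enchantment', 'Planeswalker', 'Land', 'Battle']

def primary_type(type_line: str) -> str | None:
    tline = type_line.split('—')[0].strip() if '—' in type_line else type_line.strip()
    for cand in _CARD_TYPES:
        if cand.lower() in tline.lower():
            return cand
    return tline.split()[0] if tline else None
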
def add_and_enrich(names: Iterable[str]) -> Tuple[int, int]:
"""Add names and enrich their metadata from CSVs in one pass.
"""Add names and enrich their metadata from Parquet (M4).
Returns (added_count, total_after).
"""
data = _load_raw()