feat: migrate to unified Parquet format with instant GitHub setup and 4x faster tagging

matt 2025-10-18 21:32:12 -07:00
parent e9e949aae3
commit 8435312c8f
58 changed files with 11921 additions and 3961 deletions

View file

@@ -4,30 +4,21 @@ Phase A refactor: Provides a thin API for building and querying the in-memory
card index keyed by tag/theme. Future enhancements may introduce a persistent
cache layer or precomputed artifact.
M4: Updated to load from all_cards.parquet instead of CSV shards.
Public API:
maybe_build_index() -> None
get_tag_pool(tag: str) -> list[dict]
lookup_commander(name: str) -> dict | None
The index is rebuilt lazily when any of the CSV shard files change mtime.
The index is rebuilt lazily when the Parquet file mtime changes.
"""
from __future__ import annotations
from pathlib import Path
import csv
import os
from typing import Any, Dict, List, Optional
CARD_FILES_GLOB = [
Path("csv_files/blue_cards.csv"),
Path("csv_files/white_cards.csv"),
Path("csv_files/black_cards.csv"),
Path("csv_files/red_cards.csv"),
Path("csv_files/green_cards.csv"),
Path("csv_files/colorless_cards.csv"),
Path("csv_files/cards.csv"), # fallback large file last
]
# M4: No longer need CSV file glob, we load from Parquet
THEME_TAGS_COL = "themeTags"
NAME_COL = "name"
COLOR_IDENTITY_COL = "colorIdentity"
@@ -53,75 +44,63 @@ def _normalize_rarity(raw: str) -> str:
r = (raw or "").strip().lower()
return _RARITY_NORM.get(r, r)
def _resolve_card_files() -> List[Path]:
"""Return base card file list + any extra test files supplied via env.
Environment variable: CARD_INDEX_EXTRA_CSV can contain a comma or semicolon
separated list of additional CSV paths (used by tests to inject synthetic
edge cases without polluting production shards).
"""
files: List[Path] = list(CARD_FILES_GLOB)
extra = os.getenv("CARD_INDEX_EXTRA_CSV")
if extra:
for part in extra.replace(";", ",").split(","):
p = part.strip()
if not p:
continue
path_obj = Path(p)
# Include even if missing; maybe created later in test before build
files.append(path_obj)
return files
def maybe_build_index() -> None:
"""Rebuild the index if any card CSV mtime changed.
"""Rebuild the index if the Parquet file mtime changed.
Incorporates any extra CSVs specified via CARD_INDEX_EXTRA_CSV.
M4: Loads from all_cards.parquet instead of CSV files.
"""
global _CARD_INDEX, _CARD_INDEX_MTIME
latest = 0.0
card_files = _resolve_card_files()
for p in card_files:
if p.exists():
mt = p.stat().st_mtime
if mt > latest:
latest = mt
if _CARD_INDEX and _CARD_INDEX_MTIME and latest <= _CARD_INDEX_MTIME:
return
new_index: Dict[str, List[Dict[str, Any]]] = {}
for p in card_files:
if not p.exists():
continue
try:
with p.open("r", encoding="utf-8", newline="") as fh:
reader = csv.DictReader(fh)
if not reader.fieldnames or THEME_TAGS_COL not in reader.fieldnames:
try:
from path_util import get_processed_cards_path
from deck_builder import builder_utils as bu
parquet_path = Path(get_processed_cards_path())
if not parquet_path.exists():
return
latest = parquet_path.stat().st_mtime
if _CARD_INDEX and _CARD_INDEX_MTIME and latest <= _CARD_INDEX_MTIME:
return
# Load from Parquet
df = bu._load_all_cards_parquet()
if df.empty or THEME_TAGS_COL not in df.columns:
return
new_index: Dict[str, List[Dict[str, Any]]] = {}
for _, row in df.iterrows():
name = row.get(NAME_COL) or row.get("faceName") or ""
tags = row.get(THEME_TAGS_COL)
# Handle tags (already a list after our conversion in builder_utils)
if not tags or not isinstance(tags, list):
continue
color_id = str(row.get(COLOR_IDENTITY_COL) or "").strip()
mana_cost = str(row.get(MANA_COST_COL) or "").strip()
rarity = _normalize_rarity(str(row.get(RARITY_COL) or ""))
for tg in tags:
if not tg:
continue
for row in reader:
name = row.get(NAME_COL) or row.get("faceName") or ""
tags_raw = row.get(THEME_TAGS_COL) or ""
tags = [t.strip(" '[]") for t in tags_raw.split(',') if t.strip()] if tags_raw else []
if not tags:
continue
color_id = (row.get(COLOR_IDENTITY_COL) or "").strip()
mana_cost = (row.get(MANA_COST_COL) or "").strip()
rarity = _normalize_rarity(row.get(RARITY_COL) or "")
for tg in tags:
if not tg:
continue
new_index.setdefault(tg, []).append({
"name": name,
"color_identity": color_id,
"tags": tags,
"mana_cost": mana_cost,
"rarity": rarity,
"color_identity_list": list(color_id) if color_id else [],
"pip_colors": [c for c in mana_cost if c in {"W","U","B","R","G"}],
})
except Exception:
continue
_CARD_INDEX = new_index
_CARD_INDEX_MTIME = latest
new_index.setdefault(tg, []).append({
"name": name,
"color_identity": color_id,
"tags": tags,
"mana_cost": mana_cost,
"rarity": rarity,
"color_identity_list": [c.strip() for c in color_id.split(',') if c.strip()],
"pip_colors": [c for c in mana_cost if c in {"W","U","B","R","G"}],
})
_CARD_INDEX = new_index
_CARD_INDEX_MTIME = latest
except Exception:
# Defensive: if anything fails, leave index unchanged
pass
def get_tag_pool(tag: str) -> List[Dict[str, Any]]:
return _CARD_INDEX.get(tag, [])
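For orientation, the module's public surface after this change is small: build lazily, then query by tag. A minimal usage sketch, assuming the module imports as `card_index` (the actual module path is not shown in this view) and using a placeholder tag; field names come from the dict built in maybe_build_index() above.

# Usage sketch of the Phase A card index API after the Parquet migration.
# Module name and the example tag are assumptions.
import card_index

card_index.maybe_build_index()                 # no-op unless all_cards.parquet mtime changed
pool = card_index.get_tag_pool("Lifegain")     # hypothetical tag, for illustration only
for card in pool[:5]:
    print(card["name"], card["rarity"], card["pip_colors"])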

View file

@@ -247,11 +247,13 @@ class CardSimilarity:
Returns:
Set of theme tag strings
"""
if pd.isna(tags) or not tags:
# M4: Handle both scalar NA (CSV) and array values (Parquet)
if pd.isna(tags) if isinstance(tags, (str, float, int, type(None))) else False:
return set()
if isinstance(tags, list):
return set(tags)
# M4: Parquet format - already a list
return set(tags) if tags else set()
if isinstance(tags, str):
# Handle string representation of list: "['tag1', 'tag2']"

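The hunk above keeps one `_extract_tags` working across both data shapes: Parquet hands the column over as a real list, while legacy CSV rows carried a string such as "['tag1', 'tag2']". A standalone sketch of that normalization, written as a hypothetical helper rather than the class method itself:

# Hypothetical helper mirroring the dual-format handling in _extract_tags.
import ast
import pandas as pd

def normalize_tags(tags) -> set[str]:
    # pd.isna() only makes sense for scalars; list/array inputs skip the check.
    if isinstance(tags, (str, float, int, type(None))) and pd.isna(tags):
        return set()
    # Parquet readers may hand back numpy arrays; coerce list-likes to a plain list.
    if hasattr(tags, "tolist"):
        tags = tags.tolist()
    if isinstance(tags, (list, tuple, set)):
        return {str(t).strip() for t in tags if str(t).strip()}
    if isinstance(tags, str):
        try:
            parsed = ast.literal_eval(tags)     # "['tag1', 'tag2']" -> ['tag1', 'tag2']
            if isinstance(parsed, list):
                return {str(t).strip() for t in parsed if str(t).strip()}
        except (ValueError, SyntaxError):
            pass
        return {t.strip(" '\"[]") for t in tags.split(",") if t.strip(" '\"[]")}
    return set()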
View file

@@ -2,14 +2,14 @@
Responsibilities
================
- Read and normalize `commander_cards.csv` (shared with the deck builder).
- Read and normalize commander data from all_cards.parquet (M4 migration).
- Produce deterministic commander records with rich metadata (slug, colors,
partner/background flags, theme tags, Scryfall image URLs).
- Cache the parsed catalog and invalidate on file timestamp changes.
The loader operates without pandas to keep the web layer light-weight and to
simplify unit testing. It honors the `CSV_FILES_DIR` environment variable via
`path_util.csv_dir()` just like the CLI builder.
M4: Updated to load from all_cards.parquet instead of commander_cards.csv.
The loader uses pandas to filter commanders (isCommander == True) from the
unified Parquet data source.
"""
from __future__ import annotations
@@ -18,12 +18,10 @@ from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Mapping, Optional, Tuple
import ast
import csv
import os
import re
from urllib.parse import quote
from path_util import csv_dir
from deck_builder.partner_background_utils import analyze_partner_background
__all__ = [
@@ -204,9 +202,11 @@ def find_commander_record(name: str | None) -> CommanderRecord | None:
def _resolve_commander_path(source_path: str | os.PathLike[str] | None) -> Path:
"""M4: Resolve Parquet path instead of commander_cards.csv."""
if source_path is not None:
return Path(source_path).resolve()
return (Path(csv_dir()) / "commander_cards.csv").resolve()
from path_util import get_processed_cards_path
return Path(get_processed_cards_path()).resolve()
def _is_cache_valid(path: Path, cached: CommanderCatalog) -> bool:
@@ -221,24 +221,31 @@ def _is_cache_valid(path: Path, cached: CommanderCatalog) -> bool:
def _build_catalog(path: Path) -> CommanderCatalog:
"""M4: Load commanders from Parquet instead of CSV."""
if not path.exists():
raise FileNotFoundError(f"Commander CSV not found at {path}")
raise FileNotFoundError(f"Commander Parquet not found at {path}")
entries: List[CommanderRecord] = []
used_slugs: set[str] = set()
with path.open("r", encoding="utf-8", newline="") as handle:
reader = csv.DictReader(handle)
if reader.fieldnames is None:
raise ValueError("Commander CSV missing header row")
# Load commanders from Parquet (isCommander == True)
from deck_builder import builder_utils as bu
df = bu._load_all_cards_parquet()
if df.empty or 'isCommander' not in df.columns:
raise ValueError("Parquet missing isCommander column")
commanders_df = df[df['isCommander']].copy()
for index, row in enumerate(reader):
try:
record = _row_to_record(row, used_slugs)
except Exception:
continue
entries.append(record)
used_slugs.add(record.slug)
# Convert DataFrame rows to CommanderRecords
for _, row in commanders_df.iterrows():
try:
# Convert row to dict for _row_to_record
row_dict = row.to_dict()
record = _row_to_record(row_dict, used_slugs)
except Exception:
continue
entries.append(record)
used_slugs.add(record.slug)
stat_result = path.stat()
mtime_ns = getattr(stat_result, "st_mtime_ns", int(stat_result.st_mtime * 1_000_000_000))

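The commander catalog now comes straight out of the unified Parquet rather than commander_cards.csv. A minimal sketch of that filter, assuming pandas and the columns visible in this diff (name, isCommander, colorIdentity); any other column use is illustrative only.

# Sketch: pull commander rows from the unified Parquet the way _build_catalog does.
import pandas as pd

df = pd.read_parquet("card_files/processed/all_cards.parquet")
commanders = df[df["isCommander"]].copy()
for _, row in commanders.iterrows():
    row_dict = row.to_dict()        # same shape _row_to_record() consumes above
    print(row_dict["name"], row_dict.get("colorIdentity"))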
View file

@@ -224,10 +224,18 @@ def _maybe_refresh_partner_synergy(out_func=None, *, force: bool = False, root:
if not needs_refresh:
source_times: list[float] = []
candidates = [
root_path / "config" / "themes" / "theme_list.json",
root_path / "csv_files" / "commander_cards.csv",
]
# M4: Check all_cards.parquet instead of commander_cards.csv
try:
from path_util import get_processed_cards_path
parquet_path = Path(get_processed_cards_path())
candidates = [
root_path / "config" / "themes" / "theme_list.json",
parquet_path,
]
except Exception:
candidates = [
root_path / "config" / "themes" / "theme_list.json",
]
for candidate in candidates:
try:
if candidate.exists():
@@ -919,14 +927,16 @@ def _is_truthy_env(name: str, default: str = '1') -> bool:
def is_setup_ready() -> bool:
"""Fast readiness check: required files present and tagging completed.
We consider the system ready if csv_files/cards.csv exists and the
M4: Updated to check for all_cards.parquet instead of cards.csv.
We consider the system ready if card_files/processed/all_cards.parquet exists and the
.tagging_complete.json flag exists. Freshness (mtime) is enforced only
during auto-refresh inside _ensure_setup_ready, not here.
"""
try:
cards_path = os.path.join('csv_files', 'cards.csv')
from path_util import get_processed_cards_path
parquet_path = get_processed_cards_path()
flag_path = os.path.join('csv_files', '.tagging_complete.json')
return os.path.exists(cards_path) and os.path.exists(flag_path)
return os.path.exists(parquet_path) and os.path.exists(flag_path)
except Exception:
return False
@@ -983,20 +993,25 @@ def is_setup_stale() -> bool:
except Exception:
pass
# Fallback: compare cards.csv mtime
cards_path = os.path.join('csv_files', 'cards.csv')
if not os.path.exists(cards_path):
# Fallback: compare all_cards.parquet mtime (M4 update)
try:
from path_util import get_processed_cards_path
parquet_path = get_processed_cards_path()
if not os.path.exists(parquet_path):
return False
age_seconds = time.time() - os.path.getmtime(parquet_path)
return age_seconds > refresh_age_seconds
except Exception:
return False
age_seconds = time.time() - os.path.getmtime(cards_path)
return age_seconds > refresh_age_seconds
except Exception:
return False
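The fallback above reduces to a single age comparison against the Parquet mtime. As a standalone sketch (a hypothetical helper; the real code inlines this inside is_setup_stale):

# Hypothetical helper expressing the staleness predicate used by the fallback.
import os
import time

def _is_file_stale(path: str, max_age_seconds: float) -> bool:
    if not os.path.exists(path):
        return False    # a missing file is a readiness problem, not a staleness one
    return (time.time() - os.path.getmtime(path)) > max_age_seconds
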
def _ensure_setup_ready(out, force: bool = False) -> None:
"""Ensure card CSVs exist and tagging has completed; bootstrap if needed.
"""Ensure card data exists and tagging has completed; bootstrap if needed.
Mirrors the CLI behavior used in build_deck_full: if csv_files/cards.csv is
M4: Updated to check for all_cards.parquet instead of cards.csv.
Mirrors the CLI behavior used in build_deck_full: if the Parquet file is
missing, too old, or the tagging flag is absent, run initial setup and tagging.
"""
# Track whether a theme catalog export actually executed during this invocation
@@ -1201,7 +1216,9 @@ def _ensure_setup_ready(out, force: bool = False) -> None:
pass
try:
cards_path = os.path.join('csv_files', 'cards.csv')
# M4 (Parquet Migration): Check for processed Parquet file instead of CSV
from path_util import get_processed_cards_path # type: ignore
cards_path = get_processed_cards_path()
flag_path = os.path.join('csv_files', '.tagging_complete.json')
auto_setup_enabled = _is_truthy_env('WEB_AUTO_SETUP', '1')
# Allow tuning of time-based refresh; default 7 days
@@ -1215,14 +1232,14 @@ def _ensure_setup_ready(out, force: bool = False) -> None:
_write_status({"running": True, "phase": "setup", "message": "Forcing full setup and tagging...", "started_at": _dt.now().isoformat(timespec='seconds'), "percent": 0})
if not os.path.exists(cards_path):
out("cards.csv not found. Running initial setup and tagging...")
out(f"Processed Parquet not found ({cards_path}). Running initial setup and tagging...")
_write_status({"running": True, "phase": "setup", "message": "Preparing card database (initial setup)...", "started_at": _dt.now().isoformat(timespec='seconds'), "percent": 0})
refresh_needed = True
else:
try:
age_seconds = time.time() - os.path.getmtime(cards_path)
if age_seconds > refresh_age_seconds and not force:
out("cards.csv is older than 7 days. Refreshing data (setup + tagging)...")
out(f"Processed Parquet is older than {days} days. Refreshing data (setup + tagging)...")
_write_status({"running": True, "phase": "setup", "message": "Refreshing card database (initial setup)...", "started_at": _dt.now().isoformat(timespec='seconds'), "percent": 0})
refresh_needed = True
except Exception:
@@ -1239,6 +1256,55 @@ def _ensure_setup_ready(out, force: bool = False) -> None:
out("Setup/tagging required, but WEB_AUTO_SETUP=0. Please run Setup from the UI.")
_write_status({"running": False, "phase": "requires_setup", "message": "Setup required (auto disabled)."})
return
# Try downloading pre-tagged data from GitHub first (faster than local build)
try:
import urllib.request
import urllib.error
out("[SETUP] Attempting to download pre-tagged data from GitHub...")
_write_status({"running": True, "phase": "download", "message": "Downloading pre-tagged data from GitHub...", "percent": 5})
base_url = "https://raw.githubusercontent.com/mwisnowski/mtg_python_deckbuilder/similarity-cache-data"
files_to_download = [
("card_files/processed/all_cards.parquet", "card_files/processed/all_cards.parquet"),
("card_files/processed/.tagging_complete.json", "card_files/processed/.tagging_complete.json"),
("card_files/similarity_cache.parquet", "card_files/similarity_cache.parquet"),
("card_files/similarity_cache_metadata.json", "card_files/similarity_cache_metadata.json"),
]
download_success = True
for remote_path, local_path in files_to_download:
try:
remote_url = f"{base_url}/{remote_path}"
os.makedirs(os.path.dirname(local_path), exist_ok=True)
urllib.request.urlretrieve(remote_url, local_path)
out(f"[SETUP] Downloaded: {local_path}")
except urllib.error.HTTPError as e:
if e.code == 404:
out(f"[SETUP] File not available on GitHub (404): {remote_path}")
download_success = False
break
raise
if download_success:
out("[SETUP] ✓ Successfully downloaded pre-tagged data from GitHub. Skipping local setup/tagging.")
_write_status({
"running": False,
"phase": "done",
"message": "Setup complete (downloaded from GitHub)",
"percent": 100,
"finished_at": _dt.now().isoformat(timespec='seconds')
})
# Refresh theme catalog after successful download
_refresh_theme_catalog(out, force=False, fast_path=True)
return
else:
out("[SETUP] GitHub download incomplete. Falling back to local setup/tagging...")
_write_status({"running": True, "phase": "setup", "message": "GitHub download failed, running local setup...", "percent": 0})
except Exception as e:
out(f"[SETUP] GitHub download failed ({e}). Falling back to local setup/tagging...")
_write_status({"running": True, "phase": "setup", "message": "GitHub download failed, running local setup...", "percent": 0})
try:
from file_setup.setup import initial_setup # type: ignore
# Always run initial_setup when forced or when cards are missing/stale
@@ -1247,95 +1313,39 @@ def _ensure_setup_ready(out, force: bool = False) -> None:
out(f"Initial setup failed: {e}")
_write_status({"running": False, "phase": "error", "message": f"Initial setup failed: {e}"})
return
# Tagging with progress; support parallel workers for speed
# M4 (Parquet Migration): Use unified run_tagging with parallel support
try:
from tagging import tagger as _tagger # type: ignore
from settings import COLORS as _COLORS # type: ignore
colors = list(_COLORS)
total = len(colors)
use_parallel = str(os.getenv('WEB_TAG_PARALLEL', '1')).strip().lower() in {"1","true","yes","on"}
max_workers_env = os.getenv('WEB_TAG_WORKERS')
try:
max_workers = int(max_workers_env) if max_workers_env else None
except Exception:
max_workers = None
mode_label = "parallel" if use_parallel else "sequential"
_write_status({
"running": True,
"phase": "tagging",
"message": "Tagging cards (this may take a while)..." if not use_parallel else "Tagging cards in parallel...",
"color": None,
"percent": 0,
"color_idx": 0,
"color_total": total,
"message": f"Tagging all cards ({mode_label} mode)...",
"percent": 10,
"tagging_started_at": _dt.now().isoformat(timespec='seconds')
})
if use_parallel:
try:
import concurrent.futures as _f
completed = 0
with _f.ProcessPoolExecutor(max_workers=max_workers) as ex:
fut_map = {ex.submit(_tagger.load_dataframe, c): c for c in colors}
for fut in _f.as_completed(fut_map):
c = fut_map[fut]
try:
fut.result()
completed += 1
pct = int(completed * 100 / max(1, total))
_write_status({
"running": True,
"phase": "tagging",
"message": f"Tagged {c}",
"color": c,
"percent": pct,
"color_idx": completed,
"color_total": total,
})
except Exception as e:
out(f"Parallel tagging failed for {c}: {e}")
_write_status({"running": False, "phase": "error", "message": f"Tagging {c} failed: {e}", "color": c})
return
except Exception as e:
out(f"Parallel tagging init failed: {e}; falling back to sequential")
use_parallel = False
if not use_parallel:
for idx, _color in enumerate(colors, start=1):
try:
pct = int((idx - 1) * 100 / max(1, total))
# Estimate ETA based on average time per completed color
eta_s = None
try:
from datetime import datetime as __dt
ts = __dt.fromisoformat(json.load(open(os.path.join('csv_files', '.setup_status.json'), 'r', encoding='utf-8')).get('tagging_started_at')) # type: ignore
elapsed = max(0.0, (_dt.now() - ts).total_seconds())
completed = max(0, idx - 1)
if completed > 0:
avg = elapsed / completed
remaining = max(0, total - completed)
eta_s = int(avg * remaining)
except Exception:
eta_s = None
payload = {
"running": True,
"phase": "tagging",
"message": f"Tagging {_color}...",
"color": _color,
"percent": pct,
"color_idx": idx,
"color_total": total,
}
if eta_s is not None:
payload["eta_seconds"] = eta_s
_write_status(payload)
_tagger.load_dataframe(_color)
except Exception as e:
out(f"Tagging {_color} failed: {e}")
_write_status({"running": False, "phase": "error", "message": f"Tagging {_color} failed: {e}", "color": _color})
return
out(f"Starting unified tagging ({mode_label} mode)...")
_tagger.run_tagging(parallel=use_parallel, max_workers=max_workers)
_write_status({
"running": True,
"phase": "tagging",
"message": f"Tagging complete ({mode_label} mode)",
"percent": 90,
})
out(f"✓ Tagging complete ({mode_label} mode)")
except Exception as e:
out(f"Tagging failed to start: {e}")
_write_status({"running": False, "phase": "error", "message": f"Tagging failed to start: {e}"})
out(f"Tagging failed: {e}")
_write_status({"running": False, "phase": "error", "message": f"Tagging failed: {e}"})
return
try:
os.makedirs('csv_files', exist_ok=True)

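The per-color tagging loop is gone; the web setup path now delegates to a single run_tagging call in tagging.tagger, with parallelism controlled by the same environment variables shown above. A minimal invocation sketch (surrounding status reporting omitted):

# Sketch of the unified tagging entry point; signature and env vars taken from the diff.
import os
from tagging import tagger

use_parallel = os.getenv("WEB_TAG_PARALLEL", "1").strip().lower() in {"1", "true", "yes", "on"}
workers_env = os.getenv("WEB_TAG_WORKERS")
try:
    max_workers = int(workers_env) if workers_env else None
except ValueError:
    max_workers = None

# One call tags every card in all_cards.parquet instead of looping over color CSVs.
tagger.run_tagging(parallel=use_parallel, max_workers=max_workers)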
View file

@@ -124,135 +124,74 @@ def add_names(names: Iterable[str]) -> Tuple[int, int]:
def _enrich_from_csvs(target_names: Iterable[str]) -> Dict[str, Dict[str, object]]:
"""Return metadata for target names by scanning csv_files/*_cards.csv.
"""Return metadata for target names by scanning all_cards.parquet (M4).
Output: { Name: { 'tags': [..], 'type': str|None, 'colors': [..] } }
"""
from pathlib import Path
import json as _json
import csv as _csv
base = Path('csv_files')
meta: Dict[str, Dict[str, object]] = {}
want = {str(n).strip().lower() for n in target_names if str(n).strip()}
if not (base.exists() and want):
if not want:
return meta
csv_files = [p for p in base.glob('*_cards.csv') if p.name.lower() not in ('cards.csv', 'commander_cards.csv')]
def _norm(s: str) -> str: return str(s or '').strip().lower()
for path in csv_files:
try:
with path.open('r', encoding='utf-8', errors='ignore') as f:
reader = _csv.DictReader(f)
headers = [h for h in (reader.fieldnames or [])]
name_key = None
tags_key = None
type_key = None
colors_key = None
for h in headers:
hn = _norm(h)
if hn in ('name', 'card', 'cardname', 'card_name'):
name_key = h
if hn in ('tags', 'theme_tags', 'themetags', 'themetagsjson') or hn == 'themetags' or hn == 'themetagsjson':
tags_key = h
if hn in ('type', 'type_line', 'typeline'):
type_key = h
if hn in ('colors', 'coloridentity', 'color_identity', 'color'):
colors_key = h
if not tags_key:
for h in headers:
if h.strip() in ('ThemeTags', 'themeTags'):
tags_key = h
try:
from deck_builder import builder_utils as bu
df = bu._load_all_cards_parquet()
if df.empty:
return meta
# Filter to cards we care about
df['name_lower'] = df['name'].str.lower()
df_filtered = df[df['name_lower'].isin(want)].copy()
for _, row in df_filtered.iterrows():
nm = str(row.get('name') or '').strip()
if not nm:
continue
entry = meta.setdefault(nm, {"tags": [], "type": None, "colors": []})
# Tags (already a list after our conversion in builder_utils)
tags = row.get('themeTags')
if tags and isinstance(tags, list):
existing = entry.get('tags') or []
seen = {str(t).lower() for t in existing}
for t in tags:
t_str = str(t).strip()
if t_str and t_str.lower() not in seen:
existing.append(t_str)
seen.add(t_str.lower())
entry['tags'] = existing
# Type
if not entry.get('type'):
t_raw = str(row.get('type') or '').strip()
if t_raw:
tline = t_raw.split('—')[0].strip() if '—' in t_raw else t_raw
prim = None
for cand in ['Creature','Instant','Sorcery','Artifact','Enchantment','Planeswalker','Land','Battle']:
if cand.lower() in tline.lower():
prim = cand
break
if not colors_key:
for h in headers:
if h.strip() in ('ColorIdentity', 'colorIdentity'):
colors_key = h
break
if not name_key:
continue
for row in reader:
try:
nm = str(row.get(name_key) or '').strip()
if not nm:
continue
low = nm.lower()
if low not in want:
continue
entry = meta.setdefault(nm, {"tags": [], "type": None, "colors": []})
# Tags
if tags_key:
raw = (row.get(tags_key) or '').strip()
vals: List[str] = []
if raw:
if raw.startswith('['):
try:
arr = _json.loads(raw)
if isinstance(arr, list):
vals = [str(x).strip() for x in arr if str(x).strip()]
except Exception:
vals = []
if not vals:
parts = [p.strip() for p in raw.replace(';', ',').split(',')]
vals = [p for p in parts if p]
if vals:
existing = entry.get('tags') or []
seen = {str(t).lower() for t in existing}
for t in vals:
if str(t).lower() not in seen:
existing.append(str(t))
seen.add(str(t).lower())
entry['tags'] = existing
# Type
if type_key and not entry.get('type'):
t_raw = str(row.get(type_key) or '').strip()
if t_raw:
tline = t_raw.split('—')[0].strip() if '—' in t_raw else t_raw
prim = None
for cand in ['Creature','Instant','Sorcery','Artifact','Enchantment','Planeswalker','Land','Battle']:
if cand.lower() in tline.lower():
prim = cand
break
if not prim and tline:
prim = tline.split()[0]
if prim:
entry['type'] = prim
# Colors
if colors_key and not entry.get('colors'):
c_raw = str(row.get(colors_key) or '').strip()
cols: List[str] = []
if c_raw:
if c_raw.startswith('['):
try:
arr = _json.loads(c_raw)
if isinstance(arr, list):
cols = [str(x).strip().upper() for x in arr if str(x).strip()]
except Exception:
cols = []
if not cols:
parts = [p.strip().upper() for p in c_raw.replace(';', ',').replace('[','').replace(']','').replace("'",'').split(',') if p.strip()]
if parts:
cols = parts
if not cols:
for ch in c_raw:
if ch.upper() in ('W','U','B','R','G','C'):
cols.append(ch.upper())
if cols:
seen_c = set()
uniq = []
for c in cols:
if c not in seen_c:
uniq.append(c)
seen_c.add(c)
entry['colors'] = uniq
except Exception:
continue
except Exception:
continue
if not prim and tline:
prim = tline.split()[0]
if prim:
entry['type'] = prim
# Colors
if not entry.get('colors'):
colors_raw = str(row.get('colorIdentity') or '').strip()
if colors_raw:
parts = [c.strip() for c in colors_raw.split(',') if c.strip()]
entry['colors'] = parts
except Exception:
# Defensive: return empty or partial meta
pass
return meta
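The primary-type extraction above splits the MTG type line on its dash separator and then picks the first recognized card type. A compact sketch of that logic as a hypothetical standalone helper:

# Hypothetical helper mirroring the primary-type extraction in _enrich_from_csvs.
# Example type line: "Legendary Creature — Elf Druid" -> "Creature".
_CARD_TYPES = ['Creature', 'Instant', 'Sorcery', 'Artifact', 'Enchantment', 'Planeswalker', 'Land', 'Battle']

def primary_type(type_line: str) -> str | None:
    tline = type_line.split('—')[0].strip() if '—' in type_line else type_line.strip()
    for cand in _CARD_TYPES:
        if cand.lower() in tline.lower():
            return cand
    return tline.split()[0] if tline else None
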
def add_and_enrich(names: Iterable[str]) -> Tuple[int, int]:
"""Add names and enrich their metadata from CSVs in one pass.
"""Add names and enrich their metadata from Parquet (M4).
Returns (added_count, total_after).
"""
data = _load_raw()