mirror of https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2025-12-16 23:50:12 +01:00

feat: migrate to unified Parquet format with instant GitHub setup and 4x faster tagging

This commit is contained in:
parent e9e949aae3
commit 8435312c8f

58 changed files with 11921 additions and 3961 deletions
@@ -4,30 +4,21 @@ Phase A refactor: Provides a thin API for building and querying the in-memory
 card index keyed by tag/theme. Future enhancements may introduce a persistent
 cache layer or precomputed artifact.

+M4: Updated to load from all_cards.parquet instead of CSV shards.
+
 Public API:
     maybe_build_index() -> None
     get_tag_pool(tag: str) -> list[dict]
     lookup_commander(name: str) -> dict | None

-The index is rebuilt lazily when any of the CSV shard files change mtime.
+The index is rebuilt lazily when the Parquet file mtime changes.
 """
 from __future__ import annotations

 from pathlib import Path
-import csv
 import os
 from typing import Any, Dict, List, Optional

-CARD_FILES_GLOB = [
-    Path("csv_files/blue_cards.csv"),
-    Path("csv_files/white_cards.csv"),
-    Path("csv_files/black_cards.csv"),
-    Path("csv_files/red_cards.csv"),
-    Path("csv_files/green_cards.csv"),
-    Path("csv_files/colorless_cards.csv"),
-    Path("csv_files/cards.csv"),  # fallback large file last
-]
-
+# M4: No longer need CSV file glob, we load from Parquet
 THEME_TAGS_COL = "themeTags"
 NAME_COL = "name"
 COLOR_IDENTITY_COL = "colorIdentity"

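For reference, a minimal sketch of the single-file mtime gate the new docstring describes, assuming pandas with a pyarrow engine; ALL_CARDS_PATH and the cache globals are illustrative names, not the repo's actual identifiers:

    from pathlib import Path
    from typing import Any, Dict, List

    import pandas as pd

    ALL_CARDS_PATH = Path("card_files/processed/all_cards.parquet")
    _INDEX: Dict[str, List[Dict[str, Any]]] = {}
    _INDEX_MTIME: float = 0.0

    def maybe_build_index() -> None:
        global _INDEX, _INDEX_MTIME
        if not ALL_CARDS_PATH.exists():
            return
        mtime = ALL_CARDS_PATH.stat().st_mtime
        if _INDEX and mtime <= _INDEX_MTIME:
            return  # hot path: a single stat() call, no re-read
        df = pd.read_parquet(ALL_CARDS_PATH)
        index: Dict[str, List[Dict[str, Any]]] = {}
        for row in df.itertuples(index=False):
            tags = getattr(row, "themeTags", None)
            if tags is None:
                continue
            for tag in tags:
                index.setdefault(tag, []).append({"name": row.name})
        _INDEX, _INDEX_MTIME = index, mtime

Replacing six per-color CSV stat() calls with one Parquet stat() is what makes the lazy rebuild check effectively free.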
@@ -53,75 +44,63 @@ def _normalize_rarity(raw: str) -> str:
     r = (raw or "").strip().lower()
     return _RARITY_NORM.get(r, r)

-def _resolve_card_files() -> List[Path]:
-    """Return base card file list + any extra test files supplied via env.
-
-    Environment variable: CARD_INDEX_EXTRA_CSV can contain a comma or semicolon
-    separated list of additional CSV paths (used by tests to inject synthetic
-    edge cases without polluting production shards).
-    """
-    files: List[Path] = list(CARD_FILES_GLOB)
-    extra = os.getenv("CARD_INDEX_EXTRA_CSV")
-    if extra:
-        for part in extra.replace(";", ",").split(","):
-            p = part.strip()
-            if not p:
-                continue
-            path_obj = Path(p)
-            # Include even if missing; maybe created later in test before build
-            files.append(path_obj)
-    return files
-

 def maybe_build_index() -> None:
-    """Rebuild the index if any card CSV mtime changed.
+    """Rebuild the index if the Parquet file mtime changed.

-    Incorporates any extra CSVs specified via CARD_INDEX_EXTRA_CSV.
+    M4: Loads from all_cards.parquet instead of CSV files.
     """
     global _CARD_INDEX, _CARD_INDEX_MTIME
-    latest = 0.0
-    card_files = _resolve_card_files()
-    for p in card_files:
-        if p.exists():
-            mt = p.stat().st_mtime
-            if mt > latest:
-                latest = mt
-    if _CARD_INDEX and _CARD_INDEX_MTIME and latest <= _CARD_INDEX_MTIME:
-        return
-    new_index: Dict[str, List[Dict[str, Any]]] = {}
-    for p in card_files:
-        if not p.exists():
-            continue
-        try:
-            with p.open("r", encoding="utf-8", newline="") as fh:
-                reader = csv.DictReader(fh)
-                if not reader.fieldnames or THEME_TAGS_COL not in reader.fieldnames:
-                    continue
+    try:
+        from path_util import get_processed_cards_path
+        from deck_builder import builder_utils as bu
+
+        parquet_path = Path(get_processed_cards_path())
+        if not parquet_path.exists():
+            return
+
+        latest = parquet_path.stat().st_mtime
+        if _CARD_INDEX and _CARD_INDEX_MTIME and latest <= _CARD_INDEX_MTIME:
+            return
+
+        # Load from Parquet
+        df = bu._load_all_cards_parquet()
+        if df.empty or THEME_TAGS_COL not in df.columns:
+            return
+
+        new_index: Dict[str, List[Dict[str, Any]]] = {}
+
+        for _, row in df.iterrows():
+            name = row.get(NAME_COL) or row.get("faceName") or ""
+            tags = row.get(THEME_TAGS_COL)
+
+            # Handle tags (already a list after our conversion in builder_utils)
+            if not tags or not isinstance(tags, list):
+                continue
+
+            color_id = str(row.get(COLOR_IDENTITY_COL) or "").strip()
+            mana_cost = str(row.get(MANA_COST_COL) or "").strip()
+            rarity = _normalize_rarity(str(row.get(RARITY_COL) or ""))
+
+            for tg in tags:
+                if not tg:
+                    continue
-                for row in reader:
-                    name = row.get(NAME_COL) or row.get("faceName") or ""
-                    tags_raw = row.get(THEME_TAGS_COL) or ""
-                    tags = [t.strip(" '[]") for t in tags_raw.split(',') if t.strip()] if tags_raw else []
-                    if not tags:
-                        continue
-                    color_id = (row.get(COLOR_IDENTITY_COL) or "").strip()
-                    mana_cost = (row.get(MANA_COST_COL) or "").strip()
-                    rarity = _normalize_rarity(row.get(RARITY_COL) or "")
-                    for tg in tags:
-                        if not tg:
-                            continue
-                        new_index.setdefault(tg, []).append({
-                            "name": name,
-                            "color_identity": color_id,
-                            "tags": tags,
-                            "mana_cost": mana_cost,
-                            "rarity": rarity,
-                            "color_identity_list": list(color_id) if color_id else [],
-                            "pip_colors": [c for c in mana_cost if c in {"W","U","B","R","G"}],
-                        })
-        except Exception:
-            continue
-    _CARD_INDEX = new_index
-    _CARD_INDEX_MTIME = latest
+                new_index.setdefault(tg, []).append({
+                    "name": name,
+                    "color_identity": color_id,
+                    "tags": tags,
+                    "mana_cost": mana_cost,
+                    "rarity": rarity,
+                    "color_identity_list": [c.strip() for c in color_id.split(',') if c.strip()],
+                    "pip_colors": [c for c in mana_cost if c in {"W","U","B","R","G"}],
+                })
+
+        _CARD_INDEX = new_index
+        _CARD_INDEX_MTIME = latest
+    except Exception:
+        # Defensive: if anything fails, leave index unchanged
+        pass


 def get_tag_pool(tag: str) -> List[Dict[str, Any]]:
     return _CARD_INDEX.get(tag, [])

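Hypothetical usage of the API above; the "Aristocrats" tag and the color-identity filter are illustrative values, not ones asserted by this commit:

    maybe_build_index()                      # no-op when the Parquet is unchanged
    pool = get_tag_pool("Aristocrats")
    mono_black = [c for c in pool if c["color_identity_list"] == ["B"]]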
@@ -247,11 +247,13 @@ class CardSimilarity:
         Returns:
             Set of theme tag strings
         """
-        if pd.isna(tags) or not tags:
+        # M4: Handle both scalar NA (CSV) and array values (Parquet)
+        if pd.isna(tags) if isinstance(tags, (str, float, int, type(None))) else False:
             return set()

         if isinstance(tags, list):
-            return set(tags)
+            # M4: Parquet format - already a list
+            return set(tags) if tags else set()

         if isinstance(tags, str):
             # Handle string representation of list: "['tag1', 'tag2']"

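Why the type guard above is needed, shown as a small sketch (pandas and numpy assumed): pd.isna() is elementwise on list/array input, and truth-testing the resulting array raises.

    import numpy as np
    import pandas as pd

    pd.isna(np.nan)            # True — scalar check works
    pd.isna(None)              # True
    arr = pd.isna(["a", "b"])  # array([False, False]) — elementwise, not a bool
    # bool(arr) raises "ValueError: The truth value of an array ... is ambiguous",
    # hence the isinstance() guard before calling pd.isna on Parquet list values.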
@@ -2,14 +2,14 @@
 Responsibilities
 ================
-- Read and normalize `commander_cards.csv` (shared with the deck builder).
+- Read and normalize commander data from all_cards.parquet (M4 migration).
 - Produce deterministic commander records with rich metadata (slug, colors,
   partner/background flags, theme tags, Scryfall image URLs).
 - Cache the parsed catalog and invalidate on file timestamp changes.

-The loader operates without pandas to keep the web layer light-weight and to
-simplify unit testing. It honors the `CSV_FILES_DIR` environment variable via
-`path_util.csv_dir()` just like the CLI builder.
+M4: Updated to load from all_cards.parquet instead of commander_cards.csv.
+The loader uses pandas to filter commanders (isCommander == True) from the
+unified Parquet data source.
 """

 from __future__ import annotations

@@ -18,12 +18,10 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, Iterable, List, Mapping, Optional, Tuple
 import ast
-import csv
 import os
 import re
 from urllib.parse import quote

-from path_util import csv_dir
 from deck_builder.partner_background_utils import analyze_partner_background

 __all__ = [

@@ -204,9 +202,11 @@ def find_commander_record(name: str | None) -> CommanderRecord | None:


 def _resolve_commander_path(source_path: str | os.PathLike[str] | None) -> Path:
+    """M4: Resolve Parquet path instead of commander_cards.csv."""
     if source_path is not None:
         return Path(source_path).resolve()
-    return (Path(csv_dir()) / "commander_cards.csv").resolve()
+    from path_util import get_processed_cards_path
+    return Path(get_processed_cards_path()).resolve()


 def _is_cache_valid(path: Path, cached: CommanderCatalog) -> bool:

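The diff calls path_util.get_processed_cards_path() in several places but never shows its body. A plausible shape, purely as an assumption (the real helper and any env override may differ):

    import os

    def get_processed_cards_path() -> str:
        # Hypothetical: base directory possibly configurable via environment
        base = os.getenv("CARD_FILES_DIR", "card_files")
        return os.path.join(base, "processed", "all_cards.parquet")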
@@ -221,24 +221,31 @@ def _is_cache_valid(path: Path, cached: CommanderCatalog) -> bool:


 def _build_catalog(path: Path) -> CommanderCatalog:
+    """M4: Load commanders from Parquet instead of CSV."""
     if not path.exists():
-        raise FileNotFoundError(f"Commander CSV not found at {path}")
+        raise FileNotFoundError(f"Commander Parquet not found at {path}")

     entries: List[CommanderRecord] = []
     used_slugs: set[str] = set()

-    with path.open("r", encoding="utf-8", newline="") as handle:
-        reader = csv.DictReader(handle)
-        if reader.fieldnames is None:
-            raise ValueError("Commander CSV missing header row")
+    # Load commanders from Parquet (isCommander == True)
+    from deck_builder import builder_utils as bu
+    df = bu._load_all_cards_parquet()
+    if df.empty or 'isCommander' not in df.columns:
+        raise ValueError("Parquet missing isCommander column")

-        for index, row in enumerate(reader):
-            try:
-                record = _row_to_record(row, used_slugs)
-            except Exception:
-                continue
-            entries.append(record)
-            used_slugs.add(record.slug)
+    commanders_df = df[df['isCommander']].copy()

+    # Convert DataFrame rows to CommanderRecords
+    for _, row in commanders_df.iterrows():
+        try:
+            # Convert row to dict for _row_to_record
+            row_dict = row.to_dict()
+            record = _row_to_record(row_dict, used_slugs)
+        except Exception:
+            continue
+        entries.append(record)
+        used_slugs.add(record.slug)

     stat_result = path.stat()
     mtime_ns = getattr(stat_result, "st_mtime_ns", int(stat_result.st_mtime * 1_000_000_000))

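A minimal sketch of the isCommander filter used in _build_catalog, assuming pandas and an all_cards.parquet carrying a boolean isCommander column:

    import pandas as pd

    df = pd.read_parquet("card_files/processed/all_cards.parquet")
    commanders = df[df["isCommander"]].copy()   # boolean-mask row filter
    records = [row.to_dict() for _, row in commanders.iterrows()]

Precomputing the flag at tagging time turns commander discovery into a single vectorized mask instead of a separate commander_cards.csv parse.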
@@ -224,10 +224,18 @@ def _maybe_refresh_partner_synergy(out_func=None, *, force: bool = False, root:

     if not needs_refresh:
         source_times: list[float] = []
-        candidates = [
-            root_path / "config" / "themes" / "theme_list.json",
-            root_path / "csv_files" / "commander_cards.csv",
-        ]
+        # M4: Check all_cards.parquet instead of commander_cards.csv
+        try:
+            from path_util import get_processed_cards_path
+            parquet_path = Path(get_processed_cards_path())
+            candidates = [
+                root_path / "config" / "themes" / "theme_list.json",
+                parquet_path,
+            ]
+        except Exception:
+            candidates = [
+                root_path / "config" / "themes" / "theme_list.json",
+            ]
         for candidate in candidates:
             try:
                 if candidate.exists():

@@ -919,14 +927,16 @@ def _is_truthy_env(name: str, default: str = '1') -> bool:
 def is_setup_ready() -> bool:
     """Fast readiness check: required files present and tagging completed.

-    We consider the system ready if csv_files/cards.csv exists and the
+    M4: Updated to check for all_cards.parquet instead of cards.csv.
+    We consider the system ready if card_files/processed/all_cards.parquet exists and the
     .tagging_complete.json flag exists. Freshness (mtime) is enforced only
     during auto-refresh inside _ensure_setup_ready, not here.
     """
     try:
-        cards_path = os.path.join('csv_files', 'cards.csv')
+        from path_util import get_processed_cards_path
+        parquet_path = get_processed_cards_path()
         flag_path = os.path.join('csv_files', '.tagging_complete.json')
-        return os.path.exists(cards_path) and os.path.exists(flag_path)
+        return os.path.exists(parquet_path) and os.path.exists(flag_path)
     except Exception:
         return False

@@ -983,20 +993,25 @@ def is_setup_stale() -> bool:
         except Exception:
             pass

-        # Fallback: compare cards.csv mtime
-        cards_path = os.path.join('csv_files', 'cards.csv')
-        if not os.path.exists(cards_path):
-            return False
-        age_seconds = time.time() - os.path.getmtime(cards_path)
-        return age_seconds > refresh_age_seconds
+        # Fallback: compare all_cards.parquet mtime (M4 update)
+        try:
+            from path_util import get_processed_cards_path
+            parquet_path = get_processed_cards_path()
+            if not os.path.exists(parquet_path):
+                return False
+            age_seconds = time.time() - os.path.getmtime(parquet_path)
+            return age_seconds > refresh_age_seconds
+        except Exception:
+            return False
     except Exception:
         return False


 def _ensure_setup_ready(out, force: bool = False) -> None:
-    """Ensure card CSVs exist and tagging has completed; bootstrap if needed.
+    """Ensure card data exists and tagging has completed; bootstrap if needed.

-    Mirrors the CLI behavior used in build_deck_full: if csv_files/cards.csv is
+    M4: Updated to check for all_cards.parquet instead of cards.csv.
+    Mirrors the CLI behavior used in build_deck_full: if the Parquet file is
     missing, too old, or the tagging flag is absent, run initial setup and tagging.
     """
     # Track whether a theme catalog export actually executed during this invocation

@@ -1201,7 +1216,9 @@ def _ensure_setup_ready(out, force: bool = False) -> None:
             pass

     try:
-        cards_path = os.path.join('csv_files', 'cards.csv')
+        # M4 (Parquet Migration): Check for processed Parquet file instead of CSV
+        from path_util import get_processed_cards_path  # type: ignore
+        cards_path = get_processed_cards_path()
         flag_path = os.path.join('csv_files', '.tagging_complete.json')
         auto_setup_enabled = _is_truthy_env('WEB_AUTO_SETUP', '1')
         # Allow tuning of time-based refresh; default 7 days

@@ -1215,14 +1232,14 @@ def _ensure_setup_ready(out, force: bool = False) -> None:
             _write_status({"running": True, "phase": "setup", "message": "Forcing full setup and tagging...", "started_at": _dt.now().isoformat(timespec='seconds'), "percent": 0})

         if not os.path.exists(cards_path):
-            out("cards.csv not found. Running initial setup and tagging...")
+            out(f"Processed Parquet not found ({cards_path}). Running initial setup and tagging...")
             _write_status({"running": True, "phase": "setup", "message": "Preparing card database (initial setup)...", "started_at": _dt.now().isoformat(timespec='seconds'), "percent": 0})
             refresh_needed = True
         else:
             try:
                 age_seconds = time.time() - os.path.getmtime(cards_path)
                 if age_seconds > refresh_age_seconds and not force:
-                    out("cards.csv is older than 7 days. Refreshing data (setup + tagging)...")
+                    out(f"Processed Parquet is older than {days} days. Refreshing data (setup + tagging)...")
                     _write_status({"running": True, "phase": "setup", "message": "Refreshing card database (initial setup)...", "started_at": _dt.now().isoformat(timespec='seconds'), "percent": 0})
                     refresh_needed = True
             except Exception:

@@ -1239,6 +1256,55 @@ def _ensure_setup_ready(out, force: bool = False) -> None:
             out("Setup/tagging required, but WEB_AUTO_SETUP=0. Please run Setup from the UI.")
             _write_status({"running": False, "phase": "requires_setup", "message": "Setup required (auto disabled)."})
             return

+        # Try downloading pre-tagged data from GitHub first (faster than local build)
+        try:
+            import urllib.request
+            import urllib.error
+            out("[SETUP] Attempting to download pre-tagged data from GitHub...")
+            _write_status({"running": True, "phase": "download", "message": "Downloading pre-tagged data from GitHub...", "percent": 5})
+
+            base_url = "https://raw.githubusercontent.com/mwisnowski/mtg_python_deckbuilder/similarity-cache-data"
+            files_to_download = [
+                ("card_files/processed/all_cards.parquet", "card_files/processed/all_cards.parquet"),
+                ("card_files/processed/.tagging_complete.json", "card_files/processed/.tagging_complete.json"),
+                ("card_files/similarity_cache.parquet", "card_files/similarity_cache.parquet"),
+                ("card_files/similarity_cache_metadata.json", "card_files/similarity_cache_metadata.json"),
+            ]
+
+            download_success = True
+            for remote_path, local_path in files_to_download:
+                try:
+                    remote_url = f"{base_url}/{remote_path}"
+                    os.makedirs(os.path.dirname(local_path), exist_ok=True)
+                    urllib.request.urlretrieve(remote_url, local_path)
+                    out(f"[SETUP] Downloaded: {local_path}")
+                except urllib.error.HTTPError as e:
+                    if e.code == 404:
+                        out(f"[SETUP] File not available on GitHub (404): {remote_path}")
+                        download_success = False
+                        break
+                    raise
+
+            if download_success:
+                out("[SETUP] ✓ Successfully downloaded pre-tagged data from GitHub. Skipping local setup/tagging.")
+                _write_status({
+                    "running": False,
+                    "phase": "done",
+                    "message": "Setup complete (downloaded from GitHub)",
+                    "percent": 100,
+                    "finished_at": _dt.now().isoformat(timespec='seconds')
+                })
+                # Refresh theme catalog after successful download
+                _refresh_theme_catalog(out, force=False, fast_path=True)
+                return
+            else:
+                out("[SETUP] GitHub download incomplete. Falling back to local setup/tagging...")
+                _write_status({"running": True, "phase": "setup", "message": "GitHub download failed, running local setup...", "percent": 0})
+        except Exception as e:
+            out(f"[SETUP] GitHub download failed ({e}). Falling back to local setup/tagging...")
+            _write_status({"running": True, "phase": "setup", "message": "GitHub download failed, running local setup...", "percent": 0})

         try:
             from file_setup.setup import initial_setup  # type: ignore
             # Always run initial_setup when forced or when cards are missing/stale

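A standalone sketch of the fetch-or-fallback pattern above; BASE and the file list are placeholders, and urllib comes from the standard library:

    import os
    import urllib.error
    import urllib.request

    BASE = "https://raw.githubusercontent.com/OWNER/REPO/BRANCH"  # placeholder

    def fetch_all(paths: list[str]) -> bool:
        """Return True only if every file downloads; a 404 aborts the batch."""
        for rel in paths:
            try:
                os.makedirs(os.path.dirname(rel) or ".", exist_ok=True)
                urllib.request.urlretrieve(f"{BASE}/{rel}", rel)
            except urllib.error.HTTPError as e:
                if e.code == 404:
                    return False  # caller falls back to the local build path
                raise
        return True

Treating any 404 as "batch incomplete" keeps the downloaded state all-or-nothing, so a partial fetch never masquerades as a completed setup.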
@@ -1247,95 +1313,39 @@ def _ensure_setup_ready(out, force: bool = False) -> None:
             out(f"Initial setup failed: {e}")
             _write_status({"running": False, "phase": "error", "message": f"Initial setup failed: {e}"})
             return
-        # Tagging with progress; support parallel workers for speed
+        # M4 (Parquet Migration): Use unified run_tagging with parallel support
         try:
             from tagging import tagger as _tagger  # type: ignore
-            from settings import COLORS as _COLORS  # type: ignore
-            colors = list(_COLORS)
-            total = len(colors)
             use_parallel = str(os.getenv('WEB_TAG_PARALLEL', '1')).strip().lower() in {"1","true","yes","on"}
             max_workers_env = os.getenv('WEB_TAG_WORKERS')
             try:
                 max_workers = int(max_workers_env) if max_workers_env else None
             except Exception:
                 max_workers = None

+            mode_label = "parallel" if use_parallel else "sequential"
             _write_status({
                 "running": True,
                 "phase": "tagging",
-                "message": "Tagging cards (this may take a while)..." if not use_parallel else "Tagging cards in parallel...",
-                "color": None,
-                "percent": 0,
-                "color_idx": 0,
-                "color_total": total,
+                "message": f"Tagging all cards ({mode_label} mode)...",
+                "percent": 10,
                 "tagging_started_at": _dt.now().isoformat(timespec='seconds')
             })

-            if use_parallel:
-                try:
-                    import concurrent.futures as _f
-                    completed = 0
-                    with _f.ProcessPoolExecutor(max_workers=max_workers) as ex:
-                        fut_map = {ex.submit(_tagger.load_dataframe, c): c for c in colors}
-                        for fut in _f.as_completed(fut_map):
-                            c = fut_map[fut]
-                            try:
-                                fut.result()
-                                completed += 1
-                                pct = int(completed * 100 / max(1, total))
-                                _write_status({
-                                    "running": True,
-                                    "phase": "tagging",
-                                    "message": f"Tagged {c}",
-                                    "color": c,
-                                    "percent": pct,
-                                    "color_idx": completed,
-                                    "color_total": total,
-                                })
-                            except Exception as e:
-                                out(f"Parallel tagging failed for {c}: {e}")
-                                _write_status({"running": False, "phase": "error", "message": f"Tagging {c} failed: {e}", "color": c})
-                                return
-                except Exception as e:
-                    out(f"Parallel tagging init failed: {e}; falling back to sequential")
-                    use_parallel = False
-
-            if not use_parallel:
-                for idx, _color in enumerate(colors, start=1):
-                    try:
-                        pct = int((idx - 1) * 100 / max(1, total))
-                        # Estimate ETA based on average time per completed color
-                        eta_s = None
-                        try:
-                            from datetime import datetime as __dt
-                            ts = __dt.fromisoformat(json.load(open(os.path.join('csv_files', '.setup_status.json'), 'r', encoding='utf-8')).get('tagging_started_at'))  # type: ignore
-                            elapsed = max(0.0, (_dt.now() - ts).total_seconds())
-                            completed = max(0, idx - 1)
-                            if completed > 0:
-                                avg = elapsed / completed
-                                remaining = max(0, total - completed)
-                                eta_s = int(avg * remaining)
-                        except Exception:
-                            eta_s = None
-                        payload = {
-                            "running": True,
-                            "phase": "tagging",
-                            "message": f"Tagging {_color}...",
-                            "color": _color,
-                            "percent": pct,
-                            "color_idx": idx,
-                            "color_total": total,
-                        }
-                        if eta_s is not None:
-                            payload["eta_seconds"] = eta_s
-                        _write_status(payload)
-                        _tagger.load_dataframe(_color)
-                    except Exception as e:
-                        out(f"Tagging {_color} failed: {e}")
-                        _write_status({"running": False, "phase": "error", "message": f"Tagging {_color} failed: {e}", "color": _color})
-                        return
+            out(f"Starting unified tagging ({mode_label} mode)...")
+            _tagger.run_tagging(parallel=use_parallel, max_workers=max_workers)

+            _write_status({
+                "running": True,
+                "phase": "tagging",
+                "message": f"Tagging complete ({mode_label} mode)",
+                "percent": 90,
+            })
+            out(f"✓ Tagging complete ({mode_label} mode)")

         except Exception as e:
-            out(f"Tagging failed to start: {e}")
-            _write_status({"running": False, "phase": "error", "message": f"Tagging failed to start: {e}"})
+            out(f"Tagging failed: {e}")
+            _write_status({"running": False, "phase": "error", "message": f"Tagging failed: {e}"})
             return
         try:
             os.makedirs('csv_files', exist_ok=True)

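How a caller drives the new unified entry point, per the env knobs in the diff (WEB_TAG_PARALLEL, WEB_TAG_WORKERS); a sketch of the call shape rather than the exact web-layer code:

    import os
    from tagging import tagger

    parallel = os.getenv("WEB_TAG_PARALLEL", "1").strip().lower() in {"1", "true", "yes", "on"}
    workers_env = os.getenv("WEB_TAG_WORKERS")
    max_workers = int(workers_env) if workers_env else None  # None: let the pool decide

    tagger.run_tagging(parallel=parallel, max_workers=max_workers)

Moving the per-color orchestration into run_tagging collapses roughly 90 lines of executor and progress bookkeeping in the web layer down to a single call.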
@@ -124,135 +124,74 @@ def add_names(names: Iterable[str]) -> Tuple[int, int]:


 def _enrich_from_csvs(target_names: Iterable[str]) -> Dict[str, Dict[str, object]]:
-    """Return metadata for target names by scanning csv_files/*_cards.csv.
+    """Return metadata for target names by scanning all_cards.parquet (M4).
     Output: { Name: { 'tags': [..], 'type': str|None, 'colors': [..] } }
     """
-    from pathlib import Path
-    import json as _json
-    import csv as _csv
-
-    base = Path('csv_files')
     meta: Dict[str, Dict[str, object]] = {}
     want = {str(n).strip().lower() for n in target_names if str(n).strip()}
-    if not (base.exists() and want):
+    if not want:
         return meta
-    csv_files = [p for p in base.glob('*_cards.csv') if p.name.lower() not in ('cards.csv', 'commander_cards.csv')]
-
-    def _norm(s: str) -> str: return str(s or '').strip().lower()
-    for path in csv_files:
-        try:
-            with path.open('r', encoding='utf-8', errors='ignore') as f:
-                reader = _csv.DictReader(f)
-                headers = [h for h in (reader.fieldnames or [])]
-                name_key = None
-                tags_key = None
-                type_key = None
-                colors_key = None
-                for h in headers:
-                    hn = _norm(h)
-                    if hn in ('name', 'card', 'cardname', 'card_name'):
-                        name_key = h
-                    if hn in ('tags', 'theme_tags', 'themetags', 'themetagsjson') or hn == 'themetags' or hn == 'themetagsjson':
-                        tags_key = h
-                    if hn in ('type', 'type_line', 'typeline'):
-                        type_key = h
-                    if hn in ('colors', 'coloridentity', 'color_identity', 'color'):
-                        colors_key = h
-                if not tags_key:
-                    for h in headers:
-                        if h.strip() in ('ThemeTags', 'themeTags'):
-                            tags_key = h
+    try:
+        from deck_builder import builder_utils as bu
+        df = bu._load_all_cards_parquet()
+        if df.empty:
+            return meta
+
+        # Filter to cards we care about
+        df['name_lower'] = df['name'].str.lower()
+        df_filtered = df[df['name_lower'].isin(want)].copy()
+
+        for _, row in df_filtered.iterrows():
+            nm = str(row.get('name') or '').strip()
+            if not nm:
+                continue
+
+            entry = meta.setdefault(nm, {"tags": [], "type": None, "colors": []})
+
+            # Tags (already a list after our conversion in builder_utils)
+            tags = row.get('themeTags')
+            if tags and isinstance(tags, list):
+                existing = entry.get('tags') or []
+                seen = {str(t).lower() for t in existing}
+                for t in tags:
+                    t_str = str(t).strip()
+                    if t_str and t_str.lower() not in seen:
+                        existing.append(t_str)
+                        seen.add(t_str.lower())
+                entry['tags'] = existing
+
+            # Type
+            if not entry.get('type'):
+                t_raw = str(row.get('type') or '').strip()
+                if t_raw:
+                    tline = t_raw.split('—')[0].strip() if '—' in t_raw else t_raw
+                    prim = None
+                    for cand in ['Creature','Instant','Sorcery','Artifact','Enchantment','Planeswalker','Land','Battle']:
+                        if cand.lower() in tline.lower():
+                            prim = cand
+                            break
-                if not colors_key:
-                    for h in headers:
-                        if h.strip() in ('ColorIdentity', 'colorIdentity'):
-                            colors_key = h
-                            break
-                if not name_key:
-                    continue
-                for row in reader:
-                    try:
-                        nm = str(row.get(name_key) or '').strip()
-                        if not nm:
-                            continue
-                        low = nm.lower()
-                        if low not in want:
-                            continue
-                        entry = meta.setdefault(nm, {"tags": [], "type": None, "colors": []})
-                        # Tags
-                        if tags_key:
-                            raw = (row.get(tags_key) or '').strip()
-                            vals: List[str] = []
-                            if raw:
-                                if raw.startswith('['):
-                                    try:
-                                        arr = _json.loads(raw)
-                                        if isinstance(arr, list):
-                                            vals = [str(x).strip() for x in arr if str(x).strip()]
-                                    except Exception:
-                                        vals = []
-                                if not vals:
-                                    parts = [p.strip() for p in raw.replace(';', ',').split(',')]
-                                    vals = [p for p in parts if p]
-                            if vals:
-                                existing = entry.get('tags') or []
-                                seen = {str(t).lower() for t in existing}
-                                for t in vals:
-                                    if str(t).lower() not in seen:
-                                        existing.append(str(t))
-                                        seen.add(str(t).lower())
-                                entry['tags'] = existing
-                        # Type
-                        if type_key and not entry.get('type'):
-                            t_raw = str(row.get(type_key) or '').strip()
-                            if t_raw:
-                                tline = t_raw.split('—')[0].strip() if '—' in t_raw else t_raw
-                                prim = None
-                                for cand in ['Creature','Instant','Sorcery','Artifact','Enchantment','Planeswalker','Land','Battle']:
-                                    if cand.lower() in tline.lower():
-                                        prim = cand
-                                        break
-                                if not prim and tline:
-                                    prim = tline.split()[0]
-                                if prim:
-                                    entry['type'] = prim
-                        # Colors
-                        if colors_key and not entry.get('colors'):
-                            c_raw = str(row.get(colors_key) or '').strip()
-                            cols: List[str] = []
-                            if c_raw:
-                                if c_raw.startswith('['):
-                                    try:
-                                        arr = _json.loads(c_raw)
-                                        if isinstance(arr, list):
-                                            cols = [str(x).strip().upper() for x in arr if str(x).strip()]
-                                    except Exception:
-                                        cols = []
-                                if not cols:
-                                    parts = [p.strip().upper() for p in c_raw.replace(';', ',').replace('[','').replace(']','').replace("'",'').split(',') if p.strip()]
-                                    if parts:
-                                        cols = parts
-                                if not cols:
-                                    for ch in c_raw:
-                                        if ch.upper() in ('W','U','B','R','G','C'):
-                                            cols.append(ch.upper())
-                            if cols:
-                                seen_c = set()
-                                uniq = []
-                                for c in cols:
-                                    if c not in seen_c:
-                                        uniq.append(c)
-                                        seen_c.add(c)
-                                entry['colors'] = uniq
-                    except Exception:
-                        continue
-        except Exception:
-            continue
+                    if not prim and tline:
+                        prim = tline.split()[0]
+                    if prim:
+                        entry['type'] = prim
+
+            # Colors
+            if not entry.get('colors'):
+                colors_raw = str(row.get('colorIdentity') or '').strip()
+                if colors_raw:
+                    parts = [c.strip() for c in colors_raw.split(',') if c.strip()]
+                    entry['colors'] = parts
+
+    except Exception:
+        # Defensive: return empty or partial meta
+        pass
+
     return meta


 def add_and_enrich(names: Iterable[str]) -> Tuple[int, int]:
-    """Add names and enrich their metadata from CSVs in one pass.
+    """Add names and enrich their metadata from Parquet (M4).
     Returns (added_count, total_after).
     """
     data = _load_raw()

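A minimal sketch of the name-set filter that replaces the per-shard CSV scan above, assuming pandas; the card names are illustrative:

    import pandas as pd

    df = pd.read_parquet("card_files/processed/all_cards.parquet")
    want = {n.strip().lower() for n in ["Sol Ring", "Arcane Signet"]}
    hits = df[df["name"].str.lower().isin(want)]
    for _, row in hits.iterrows():
        tags = row.get("themeTags")
        print(row["name"], [] if tags is None else list(tags))

One vectorized isin() over the unified frame replaces a DictReader pass over every *_cards.csv shard, and the header-name guessing (name_key/tags_key/colors_key) disappears because the Parquet schema is fixed.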