Mirror of https://github.com/mwisnowski/mtg_python_deckbuilder.git (synced 2025-12-16 23:50:12 +01:00)
feat: migrate to unified Parquet format with instant GitHub setup and 4x faster tagging
commit 8435312c8f (parent e9e949aae3)
58 changed files with 11921 additions and 3961 deletions
@@ -9,7 +9,7 @@ from pathlib import Path
 import re
 from typing import Mapping, Tuple

-from code.logging_util import get_logger
+from logging_util import get_logger
 from deck_builder.partner_background_utils import analyze_partner_background
 from path_util import csv_dir

@@ -154,28 +154,33 @@ class DeckBuilder(
         start_ts = datetime.datetime.now()
         logger.info("=== Deck Build: BEGIN ===")
         try:
-            # Ensure CSVs exist and are tagged before starting any deck build logic
+            # M4: Ensure Parquet file exists and is tagged before starting any deck build logic
             try:
                 import time as _time
                 import json as _json
                 from datetime import datetime as _dt
-                cards_path = os.path.join(CSV_DIRECTORY, 'cards.csv')
+                from code.path_util import get_processed_cards_path
+
+                parquet_path = get_processed_cards_path()
                 flag_path = os.path.join(CSV_DIRECTORY, '.tagging_complete.json')
                 refresh_needed = False
-                if not os.path.exists(cards_path):
-                    logger.info("cards.csv not found. Running initial setup and tagging before deck build...")
+                if not os.path.exists(parquet_path):
+                    logger.info("all_cards.parquet not found. Running initial setup and tagging before deck build...")
                     refresh_needed = True
                 else:
                     try:
-                        age_seconds = _time.time() - os.path.getmtime(cards_path)
+                        age_seconds = _time.time() - os.path.getmtime(parquet_path)
                         if age_seconds > 7 * 24 * 60 * 60:
-                            logger.info("cards.csv is older than 7 days. Refreshing data before deck build...")
+                            logger.info("all_cards.parquet is older than 7 days. Refreshing data before deck build...")
                             refresh_needed = True
                     except Exception:
                         pass

                 if not os.path.exists(flag_path):
                     logger.info("Tagging completion flag not found. Performing full tagging before deck build...")
                     refresh_needed = True

                 if refresh_needed:
                     initial_setup()
                     from tagging import tagger as _tagger
@@ -187,7 +192,7 @@ class DeckBuilder(
                 except Exception:
                     logger.warning("Failed to write tagging completion flag (non-fatal).")
             except Exception as e:
-                logger.error(f"Failed ensuring CSVs before deck build: {e}")
+                logger.error(f"Failed ensuring Parquet file before deck build: {e}")
             self.run_initial_setup()
             self.run_deck_build_step1()
             self.run_deck_build_step2()
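Note on the refresh gate above: setup and tagging re-run only when the Parquet file is missing, older than seven days, or the tagging-completion flag is absent. A minimal standalone sketch of that check (function and variable names here are illustrative, not from the diff):

import os
import time

REFRESH_AFTER_SECONDS = 7 * 24 * 60 * 60  # one-week window, matching the diff

def needs_refresh(parquet_path: str, flag_path: str) -> bool:
    """Return True when card data is missing, stale, or not yet tagged."""
    if not os.path.exists(parquet_path):
        return True  # never built: run initial setup and tagging
    if time.time() - os.path.getmtime(parquet_path) > REFRESH_AFTER_SECONDS:
        return True  # older than seven days: refresh the data
    return not os.path.exists(flag_path)  # tagging never completed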
@@ -832,14 +837,25 @@ class DeckBuilder(
     def load_commander_data(self) -> pd.DataFrame:
         if self._commander_df is not None:
             return self._commander_df
-        df = pd.read_csv(
-            bc.COMMANDER_CSV_PATH,
-            converters=getattr(bc, "COMMANDER_CONVERTERS", None)
-        )
+        # M4: Load commanders from Parquet instead of CSV
+        from deck_builder import builder_utils as bu
+        from deck_builder import builder_constants as bc
+
+        all_cards_df = bu._load_all_cards_parquet()
+        if all_cards_df.empty:
+            # Fallback to empty DataFrame with expected columns
+            return pd.DataFrame(columns=['name', 'themeTags', 'creatureTypes'])
+
+        # Filter to only commander-eligible cards
+        df = bc.get_commanders(all_cards_df)
+
+        # Ensure required columns exist with proper defaults
+        if "themeTags" not in df.columns:
+            df["themeTags"] = [[] for _ in range(len(df))]
+        if "creatureTypes" not in df.columns:
+            df["creatureTypes"] = [[] for _ in range(len(df))]
+
         self._commander_df = df
         return df
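Note: the column-defaulting step above protects against Parquet files written before the tag columns existed. A small sketch of the same idiom (helper name is hypothetical):

import pandas as pd

def ensure_list_columns(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """Add each missing column as a column of independent empty lists."""
    for col in columns:
        if col not in df.columns:
            # one fresh list per row; a shared [] would alias across rows
            df[col] = [[] for _ in range(len(df))]
    return df

df = ensure_list_columns(pd.DataFrame({"name": ["Atraxa, Praetors' Voice"]}),
                         ["themeTags", "creatureTypes"])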
@@ -1125,9 +1141,9 @@ class DeckBuilder(
         return full, load_files

     def setup_dataframes(self) -> pd.DataFrame:
-        """Load all csv files for current color identity into one combined DataFrame.
+        """Load cards from all_cards.parquet and filter by current color identity.

-        Each file stem in files_to_load corresponds to csv_files/{stem}_cards.csv.
+        M4: Migrated from CSV to Parquet. Filters by color identity using colorIdentity column.
         The result is cached and returned. Minimal validation only (non-empty, required columns exist if known).
         """
         if self._combined_cards_df is not None:
@@ -1135,37 +1151,53 @@ class DeckBuilder(
         if not self.files_to_load:
             # Attempt to determine if not yet done
             self.determine_color_identity()
-        dfs = []
-        required = getattr(bc, 'CSV_REQUIRED_COLUMNS', [])
-        from path_util import csv_dir as _csv_dir
-        base = _csv_dir()
-
-        # Define converters for list columns (same as tagger.py)
-        converters = {
-            'themeTags': pd.eval,
-            'creatureTypes': pd.eval,
-            'metadataTags': pd.eval  # M2: Parse metadataTags column
-        }
+        # M4: Load from Parquet instead of CSV files
+        from deck_builder import builder_utils as bu
+        all_cards_df = bu._load_all_cards_parquet()
+
+        if all_cards_df is None or all_cards_df.empty:
+            raise RuntimeError("Failed to load all_cards.parquet or file is empty.")
+
+        # M4: Filter by color identity instead of loading multiple CSVs
+        # Get the colors from self.color_identity (e.g., {'W', 'U', 'B', 'G'})
+        if hasattr(self, 'color_identity') and self.color_identity:
+            # Determine which cards can be played in this color identity
+            # A card can be played if its color identity is a subset of the commander's color identity
+            def card_matches_identity(card_colors):
+                """Check if card's color identity is legal in commander's identity."""
+                if card_colors is None or (isinstance(card_colors, float) and pd.isna(card_colors)):
+                    # Colorless cards can go in any deck
+                    return True
+                if isinstance(card_colors, str):
+                    # Handle string format like "B, G, R, U" (note the spaces after commas)
+                    card_colors = {c.strip() for c in card_colors.split(',')} if card_colors else set()
+                elif isinstance(card_colors, list):
+                    card_colors = set(card_colors)
+                else:
+                    # Unknown format, be permissive
+                    return True
+                # Card is legal if its colors are a subset of commander colors
+                return card_colors.issubset(self.color_identity)
+
+            if 'colorIdentity' in all_cards_df.columns:
+                mask = all_cards_df['colorIdentity'].apply(card_matches_identity)
+                combined = all_cards_df[mask].copy()
+                logger.info(f"M4 COLOR_FILTER: Filtered {len(all_cards_df)} cards to {len(combined)} cards for identity {sorted(self.color_identity)}")
+            else:
+                logger.warning("M4 COLOR_FILTER: colorIdentity column missing, using all cards")
+                combined = all_cards_df.copy()
+        else:
+            # No color identity set, use all cards
+            logger.warning("M4 COLOR_FILTER: No color identity set, using all cards")
+            combined = all_cards_df.copy()
+
-        for stem in self.files_to_load:
-            path = f"{base}/{stem}_cards.csv"
-            try:
-                df = pd.read_csv(path, converters=converters)
-                if required:
-                    missing = [c for c in required if c not in df.columns]
-                    if missing:
-                        # Skip or still keep with warning; choose to warn
-                        self.output_func(f"Warning: {path} missing columns: {missing}")
-                dfs.append(df)
-            except FileNotFoundError:
-                self.output_func(f"Warning: CSV file not found: {path}")
-                continue
-        if not dfs:
-            raise RuntimeError("No CSV files loaded for color identity.")
-        combined = pd.concat(dfs, axis=0, ignore_index=True)
         # Drop duplicate rows by 'name' if column exists
         if 'name' in combined.columns:
+            before_dedup = len(combined)
             combined = combined.drop_duplicates(subset='name', keep='first')
+            if len(combined) < before_dedup:
+                logger.info(f"M4 DEDUP: Removed {before_dedup - len(combined)} duplicate names")
         # If owned-only mode, filter combined pool to owned names (case-insensitive)
         if self.use_owned_only:
             try:
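Note: card_matches_identity above encodes the Commander legality rule, a card fits when its color identity is a subset of the commander's. A condensed sketch of the same rule (the diff's NaN handling is omitted; the function name is illustrative):

def card_fits_identity(card_colors, commander_colors: set[str]) -> bool:
    """Subset rule: colorless always fits; otherwise compare color sets."""
    if card_colors is None:
        return True  # colorless cards can go in any deck
    if isinstance(card_colors, str):
        # accept "B, G, R, U" style strings, as stored in the data
        card_colors = {c.strip() for c in card_colors.split(",") if c.strip()}
    return set(card_colors) <= commander_colors

assert card_fits_identity("B, G", {"W", "U", "B", "G"})   # Golgari card: legal
assert not card_fits_identity("R", {"W", "U", "B", "G"})  # red card: illegal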
@@ -1951,10 +1983,10 @@ class DeckBuilder(
             return
         block = self._format_commander_pretty(self.commander_row)
         self.output_func("\n" + block)
-        # New: show which CSV files (stems) were loaded for this color identity
-        if self.files_to_load:
-            file_list = ", ".join(f"{stem}_cards.csv" for stem in self.files_to_load)
-            self.output_func(f"Card Pool Files: {file_list}")
+        # M4: Show that we're loading from unified Parquet file
+        if hasattr(self, 'color_identity') and self.color_identity:
+            colors = ', '.join(sorted(self.color_identity))
+            self.output_func(f"Card Pool: all_cards.parquet (filtered to {colors} identity)")
         # Owned-only status
         if getattr(self, 'use_owned_only', False):
             try:
@@ -1,9 +1,12 @@
 from typing import Dict, List, Final, Tuple, Union, Callable, Any as _Any
 from settings import CARD_DATA_COLUMNS as CSV_REQUIRED_COLUMNS  # unified
 from path_util import csv_dir
+import pandas as pd

 __all__ = [
-    'CSV_REQUIRED_COLUMNS'
+    'CSV_REQUIRED_COLUMNS',
+    'get_commanders',
+    'get_backgrounds',
 ]
 import ast
@@ -14,8 +17,10 @@ MAX_FUZZY_CHOICES: Final[int] = 5  # Maximum number of fuzzy match choices

 # Commander-related constants
 DUPLICATE_CARD_FORMAT: Final[str] = '{card_name} x {count}'
+# M4: Deprecated - use Parquet loading instead
 COMMANDER_CSV_PATH: Final[str] = f"{csv_dir()}/commander_cards.csv"
 DECK_DIRECTORY = '../deck_files'
+# M4: Deprecated - Parquet handles types natively (no converters needed)
 COMMANDER_CONVERTERS: Final[Dict[str, str]] = {
     'themeTags': ast.literal_eval,
     'creatureTypes': ast.literal_eval,
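Note: the two deprecation comments above capture the core of the migration. CSV stringifies list-valued columns, so every load needs ast.literal_eval converters; Parquet stores typed lists and round-trips them natively. A small demonstration (file names invented; assumes pyarrow or fastparquet is installed):

import ast
import pandas as pd

df = pd.DataFrame({"themeTags": [["Lifegain", "Tokens"]]})

# CSV: the list comes back as a string unless a converter re-parses it
df.to_csv("cards_demo.csv", index=False)
back = pd.read_csv("cards_demo.csv", converters={"themeTags": ast.literal_eval})
assert back["themeTags"].iloc[0] == ["Lifegain", "Tokens"]

# Parquet: the list type survives the round-trip, no converter needed
df.to_parquet("cards_demo.parquet")
back = pd.read_parquet("cards_demo.parquet")
assert list(back["themeTags"].iloc[0]) == ["Lifegain", "Tokens"]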
@@ -918,3 +923,36 @@ ICONIC_CARDS: Final[set[str]] = {
     'Vampiric Tutor', 'Mystical Tutor', 'Enlightened Tutor', 'Worldly Tutor',
     'Eternal Witness', 'Solemn Simulacrum', 'Consecrated Sphinx', 'Avenger of Zendikar',
 }
+
+
+# M4: Parquet filtering helpers
+def get_commanders(df: pd.DataFrame) -> pd.DataFrame:
+    """Filter DataFrame to only commander-legal cards using isCommander flag.
+
+    M4: Replaces CSV-based commander filtering with Parquet boolean flag.
+
+    Args:
+        df: DataFrame with 'isCommander' column
+
+    Returns:
+        Filtered DataFrame containing only commanders
+    """
+    if 'isCommander' not in df.columns:
+        return pd.DataFrame()
+    return df[df['isCommander'] == True].copy()  # noqa: E712
+
+
+def get_backgrounds(df: pd.DataFrame) -> pd.DataFrame:
+    """Filter DataFrame to only background cards using isBackground flag.
+
+    M4: Replaces CSV-based background filtering with Parquet boolean flag.
+
+    Args:
+        df: DataFrame with 'isBackground' column
+
+    Returns:
+        Filtered DataFrame containing only backgrounds
+    """
+    if 'isBackground' not in df.columns:
+        return pd.DataFrame()
+    return df[df['isBackground'] == True].copy()  # noqa: E712
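Note: both helpers reduce to a boolean-mask filter. A tiny usage example (sample rows invented; assumes the flag columns hold real booleans, which is also why the diff keeps the explicit == True spelling, with its noqa: E712, as a guard for less clean dtypes):

import pandas as pd

cards = pd.DataFrame({
    "name": ["Atraxa, Praetors' Voice", "Sol Ring", "Scion of Halaster"],
    "isCommander": [True, False, False],
    "isBackground": [False, False, True],
})

commanders = cards[cards["isCommander"]]    # one row: Atraxa
backgrounds = cards[cards["isBackground"]]  # one row: Scion of Halaster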
@@ -71,16 +71,56 @@ def _resolved_csv_dir(base_dir: str | None = None) -> str:
     return base_dir or csv_dir()


+def _load_all_cards_parquet() -> pd.DataFrame:
+    """Load all cards from the unified Parquet file.
+
+    M4: Centralized Parquet loading for deck builder.
+    Returns empty DataFrame on error (defensive).
+    Converts numpy arrays to Python lists for compatibility with existing code.
+    """
+    try:
+        from code.path_util import get_processed_cards_path
+        from code.file_setup.data_loader import DataLoader
+        import numpy as np
+
+        parquet_path = get_processed_cards_path()
+        if not Path(parquet_path).exists():
+            return pd.DataFrame()
+
+        data_loader = DataLoader()
+        df = data_loader.read_cards(parquet_path, format="parquet")
+
+        # M4: Convert numpy arrays to Python lists for compatibility
+        # Parquet stores lists as numpy arrays, but existing code expects Python lists
+        list_columns = ['themeTags', 'creatureTypes', 'metadataTags', 'keywords']
+        for col in list_columns:
+            if col in df.columns:
+                df[col] = df[col].apply(lambda x: x.tolist() if isinstance(x, np.ndarray) else x)
+
+        return df
+    except Exception:
+        return pd.DataFrame()
+
+
 @lru_cache(maxsize=None)
 def _load_multi_face_land_map(base_dir: str) -> Dict[str, Dict[str, Any]]:
-    """Load mapping of multi-faced cards that have at least one land face."""
+    """Load mapping of multi-faced cards that have at least one land face.
+
+    M4: Migrated to use Parquet loading. base_dir parameter kept for
+    backward compatibility but now only used as cache key.
+    """
     try:
-        base_path = Path(base_dir)
-        csv_path = base_path / 'cards.csv'
-        if not csv_path.exists():
+        # M4: Load from Parquet instead of CSV
+        df = _load_all_cards_parquet()
+        if df.empty:
             return {}
+
         # Select only needed columns
         usecols = ['name', 'layout', 'side', 'type', 'text', 'manaCost', 'manaValue', 'faceName']
-        df = pd.read_csv(csv_path, usecols=usecols, low_memory=False)
+        available_cols = [col for col in usecols if col in df.columns]
+        if not available_cols:
+            return {}
+        df = df[available_cols].copy()
     except Exception:
         return {}
     if df.empty or 'layout' not in df.columns or 'type' not in df.columns:
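Note on the conversion loop in _load_all_cards_parquet: pandas reads Parquet list columns back as numpy arrays, so downstream code comparing against [] or concatenating lists would break. A self-contained sketch of the round-trip fix (helper name is illustrative):

import numpy as np
import pandas as pd

def lists_from_parquet(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """Normalize list-valued columns read from Parquet to plain Python lists."""
    for col in columns:
        if col in df.columns:
            df[col] = df[col].apply(
                lambda x: x.tolist() if isinstance(x, np.ndarray) else x
            )
    return df

df = pd.DataFrame({"themeTags": [np.array(["Counters", "Proliferate"]), ["Lifegain"]]})
df = lists_from_parquet(df, ["themeTags"])
assert df["themeTags"].iloc[0] == ["Counters", "Proliferate"]  # a plain list again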
@@ -170,7 +210,13 @@ def parse_theme_tags(val) -> list[str]:
         ['Tag1', 'Tag2']
         "['Tag1', 'Tag2']"
         Tag1, Tag2
+        numpy.ndarray (from Parquet)
     Returns list of stripped string tags (may be empty)."""
+    # M4: Handle numpy arrays from Parquet
+    import numpy as np
+    if isinstance(val, np.ndarray):
+        return [str(x).strip() for x in val.tolist() if x and str(x).strip()]
+
     if isinstance(val, list):
         flat: list[str] = []
         for v in val:
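Note: with the ndarray branch added, parse_theme_tags accepts four input shapes. A simplified stand-in showing all four (not the real implementation, which also flattens nested values):

import ast
import numpy as np

def parse_tags(val) -> list[str]:
    """Accept ndarray (from Parquet), list, repr-string, or comma-separated string."""
    if isinstance(val, np.ndarray):
        return [str(x).strip() for x in val.tolist() if str(x).strip()]
    if isinstance(val, list):
        return [str(x).strip() for x in val if str(x).strip()]
    if isinstance(val, str):
        try:
            parsed = ast.literal_eval(val)  # "['Tag1', 'Tag2']" -> list
            if isinstance(parsed, list):
                return [str(x).strip() for x in parsed if str(x).strip()]
        except (ValueError, SyntaxError):
            pass
        return [t.strip() for t in val.split(",") if t.strip()]  # "Tag1, Tag2"
    return []

assert parse_tags(np.array(["Tag1", "Tag2"])) == ["Tag1", "Tag2"]
assert parse_tags("['Tag1', 'Tag2']") == ["Tag1", "Tag2"]
assert parse_tags("Tag1, Tag2") == ["Tag1", "Tag2"]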
@@ -7,8 +7,8 @@ from typing import Iterable, Sequence, Tuple

 from exceptions import CommanderPartnerError

-from code.deck_builder.partner_background_utils import analyze_partner_background
-from code.deck_builder.color_identity_utils import canon_color_code, color_label_from_code
+from .partner_background_utils import analyze_partner_background
+from .color_identity_utils import canon_color_code, color_label_from_code

 _WUBRG_ORDER: Tuple[str, ...] = ("W", "U", "B", "R", "G", "C")
 _COLOR_PRIORITY = {color: index for index, color in enumerate(_WUBRG_ORDER)}
@@ -7,9 +7,9 @@ import datetime as _dt
 import re as _re
 import logging_util

-from code.deck_builder.summary_telemetry import record_land_summary, record_theme_summary, record_partner_summary
-from code.deck_builder.color_identity_utils import normalize_colors, canon_color_code, color_label_from_code
-from code.deck_builder.shared_copy import build_land_headline, dfc_card_note
+from ..summary_telemetry import record_land_summary, record_theme_summary, record_partner_summary
+from ..color_identity_utils import normalize_colors, canon_color_code, color_label_from_code
+from ..shared_copy import build_land_headline, dfc_card_note

 logger = logging_util.logging.getLogger(__name__)
@@ -425,12 +425,20 @@ class RandomBuildResult:


 def _load_commanders_df() -> pd.DataFrame:
-    """Load commander CSV using the same path/converters as the builder.
+    """Load commanders from Parquet using isCommander boolean flag.

-    Uses bc.COMMANDER_CSV_PATH and bc.COMMANDER_CONVERTERS for consistency.
+    M4: Migrated from CSV to Parquet loading with boolean filtering.
     """
-    df = pd.read_csv(bc.COMMANDER_CSV_PATH, converters=getattr(bc, "COMMANDER_CONVERTERS", None))
-    return _ensure_theme_tag_cache(df)
+    from . import builder_utils as bu
+
+    # Load all cards from Parquet
+    df = bu._load_all_cards_parquet()
+    if df.empty:
+        return pd.DataFrame()
+
+    # Filter to commanders using boolean flag
+    commanders_df = bc.get_commanders(df)
+    return _ensure_theme_tag_cache(commanders_df)


 def _ensure_theme_tag_cache(df: pd.DataFrame) -> pd.DataFrame:
@@ -9,9 +9,9 @@ from functools import lru_cache
 from pathlib import Path
 from typing import Iterable, Tuple

-from code.logging_util import get_logger
+import logging_util

-LOGGER = get_logger(__name__)
+LOGGER = logging_util.get_logger(__name__)

 ROOT = Path(__file__).resolve().parents[2]
 DEFAULT_CATALOG_PATH = ROOT / "config" / "themes" / "theme_catalog.csv"
@@ -7,7 +7,7 @@ from dataclasses import dataclass
 from functools import lru_cache
 from typing import Iterable, List, Sequence

-from code.deck_builder.theme_catalog_loader import ThemeCatalogEntry
+from .theme_catalog_loader import ThemeCatalogEntry

 __all__ = [
     "normalize_theme",