feat: migrate to unified Parquet format with instant GitHub setup and 4x faster tagging

This commit is contained in:
matt 2025-10-18 21:32:12 -07:00
parent e9e949aae3
commit 8435312c8f
58 changed files with 11921 additions and 3961 deletions

View file

@ -1,362 +1,374 @@
"""MTG Python Deckbuilder setup module.
"""Parquet-based setup for MTG Python Deckbuilder.
This module provides the main setup functionality for the MTG Python Deckbuilder
application. It handles initial setup tasks such as downloading card data,
creating color-filtered card lists, and gener logger.info(f'Downloading latest card data for {color} cards')
download_cards_csv(MTGJSON_API_URL, f'{CSV_DIRECTORY}/cards.csv')
This module handles downloading and processing MTGJSON Parquet data for the
MTG Python Deckbuilder. It replaces the old CSV-based multi-file approach
with a single-file Parquet workflow.
logger.info('Loading and processing card data')
try:
df = pd.read_csv(f'{CSV_DIRECTORY}/cards.csv', low_memory=False)
except pd.errors.ParserError as e:
logger.warning(f'CSV parsing error encountered: {e}. Retrying with error handling...')
df = pd.read_csv(
f'{CSV_DIRECTORY}/cards.csv',
low_memory=False,
on_bad_lines='warn', # Warn about malformed rows but continue
encoding_errors='replace' # Replace bad encoding chars
)
logger.info('Successfully loaded card data with error handling (some rows may have been skipped)')
Key Changes from CSV approach:
- Single all_cards.parquet file instead of 18+ color-specific CSVs
- Downloads from MTGJSON Parquet API (faster, smaller)
- Adds isCommander and isBackground boolean flags
- Filters to essential columns only (14 base + 4 custom = 18 total)
- Uses DataLoader abstraction for format flexibility
logger.info(f'Regenerating {color} cards CSV')der-eligible card lists.
Key Features:
- Initial setup and configuration
- Card data download and processing
- Color-based card filtering
- Commander card list generation
- CSV file management and validation
The module works in conjunction with setup_utils.py for utility functions and
exceptions.py for error handling.
Introduced in v3.0.0 as part of the CSV → Parquet migration.
"""
from __future__ import annotations
# Standard library imports
from enum import Enum
import os
from typing import List, Dict, Any
# Third-party imports (optional)
try:
import inquirer # type: ignore
except Exception:
inquirer = None # Fallback to simple input-based menu when unavailable
import pandas as pd
import requests
from tqdm import tqdm
# Local imports
from .data_loader import DataLoader, validate_schema
from .setup_constants import (
CSV_PROCESSING_COLUMNS,
CARD_TYPES_TO_EXCLUDE,
NON_LEGAL_SETS,
BANNED_CARDS,
FILTER_CONFIG,
SORT_CONFIG,
)
import logging_util
from settings import CSV_DIRECTORY
from .setup_constants import BANNED_CARDS, SETUP_COLORS, COLOR_ABRV, MTGJSON_API_URL
from .setup_utils import (
download_cards_csv,
filter_dataframe,
process_legendary_cards,
check_csv_exists,
save_color_filtered_csvs,
enrich_commander_rows_with_tags,
)
from exceptions import (
CSVFileNotFoundError,
CommanderValidationError,
MTGJSONDownloadError
)
from scripts import generate_background_cards as background_cards_script
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
from path_util import card_files_raw_dir, get_processed_cards_path
import settings
logger = logging_util.get_logger(__name__)
# MTGJSON Parquet API URL
MTGJSON_PARQUET_URL = "https://mtgjson.com/api/v5/parquet/cards.parquet"
def _generate_background_catalog(cards_path: str, output_path: str) -> None:
    """Regenerate ``background_cards.csv`` from the latest cards dataset.

    Calls the ``generate_background_cards`` script's ``main`` with
    ``--source`` / ``--output`` command-line style arguments.

    Args:
        cards_path: Path handed to the script as ``--source``.
        output_path: Path handed to the script as ``--output``.

    Raises:
        Exception: Anything the script raises is logged with its traceback
            and re-raised unchanged.
    """
    logger.info('Generating background cards catalog')
    script_args = ['--source', cards_path, '--output', output_path]
    try:
        background_cards_script.main(script_args)
    except Exception:  # pragma: no cover - surfaced to caller/test
        logger.exception('Failed to generate background catalog')
        raise
    # Only reached when the script returned without raising.
    logger.info('Background cards catalog generated successfully')
# Create logger for this module and attach the shared file/stream handlers
logger = logging_util.logging.getLogger(__name__)
logger.setLevel(logging_util.LOG_LEVEL)
logger.addHandler(logging_util.file_handler)
logger.addHandler(logging_util.stream_handler)

# Create CSV directory if it doesn't exist.
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(CSV_DIRECTORY, exist_ok=True)
## Note: using shared check_csv_exists from setup_utils to avoid duplication
def initial_setup() -> None:
    """Perform initial setup by downloading card data and creating filtered CSV files.

    Ensures ``cards.csv`` exists in ``CSV_DIRECTORY`` (downloading it from
    MTGJSON when it is missing), writes the color-identity filtered CSVs via
    ``save_color_filtered_csvs`` and then builds the commander list via
    ``determine_commanders``.

    Raises:
        Exception: Any failure in download, loading or processing is logged
            and re-raised unchanged.
    """
    logger.info('Checking for cards.csv file')
    try:
        cards_file = f'{CSV_DIRECTORY}/cards.csv'

        # Probe by opening the file: only a missing file triggers a download,
        # other I/O errors propagate to the outer handler.
        try:
            with open(cards_file, 'r', encoding='utf-8'):
                logger.info('cards.csv exists')
        except FileNotFoundError:
            logger.info('cards.csv not found, downloading from mtgjson')
            download_cards_csv(MTGJSON_API_URL, cards_file)

        df = pd.read_csv(cards_file, low_memory=False)

        logger.info('Checking for color identity sorted files')
        # Generate color-identity filtered CSVs in one pass
        save_color_filtered_csvs(df, CSV_DIRECTORY)

        # Generate commander list
        determine_commanders()
    except Exception as e:
        logger.error(f'Error during initial setup: {str(e)}')
        raise
## Removed local filter_by_color in favor of setup_utils.save_color_filtered_csvs
def determine_commanders() -> None:
    """Generate commander_cards.csv containing all cards eligible to be commanders.

    Loads ``cards.csv`` (downloading it first when ``check_csv_exists`` reports
    it missing), passes it through ``process_legendary_cards`` and
    ``filter_dataframe`` with the banned list, enriches the rows with tags,
    writes ``commander_cards.csv`` and finally regenerates
    ``background_cards.csv`` from the same cards file.

    Raises:
        CSVFileNotFoundError: Logged as a file operation error and re-raised.
        MTGJSONDownloadError: Logged as a file operation error and re-raised.
        CommanderValidationError: Logged once and re-raised.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    logger.info('Starting commander card generation process')

    try:
        # Make sure the source data is present
        cards_file = f'{CSV_DIRECTORY}/cards.csv'
        if not check_csv_exists(cards_file):
            logger.info('cards.csv not found, initiating download')
            download_cards_csv(MTGJSON_API_URL, cards_file)
        else:
            logger.info('cards.csv found, proceeding with processing')

        # Load and process cards data
        logger.info('Loading card data from CSV')
        df = pd.read_csv(cards_file, low_memory=False)

        # Process legendary cards with validation. A CommanderValidationError
        # raised here is logged by the handler below; a nested handler would
        # only log the same failure a second time.
        logger.info('Processing and validating legendary cards')
        filtered_df = process_legendary_cards(df)

        # Apply standard filters
        logger.info('Applying standard card filters')
        filtered_df = filter_dataframe(filtered_df, BANNED_CARDS)

        logger.info('Enriching commander metadata with theme and creature tags')
        filtered_df = enrich_commander_rows_with_tags(filtered_df, CSV_DIRECTORY)

        # Save commander cards
        logger.info('Saving validated commander cards')
        commander_path = f'{CSV_DIRECTORY}/commander_cards.csv'
        filtered_df.to_csv(commander_path, index=False)

        background_output = f'{CSV_DIRECTORY}/background_cards.csv'
        _generate_background_catalog(cards_file, background_output)

        logger.info('Commander card generation completed successfully')

    except (CSVFileNotFoundError, MTGJSONDownloadError) as e:
        logger.error(f'File operation error: {str(e)}')
        raise
    except CommanderValidationError as e:
        logger.error(f'Commander validation error: {str(e)}')
        raise
    except Exception as e:
        logger.error(f'Unexpected error during commander generation: {str(e)}')
        raise
def regenerate_csvs_all() -> None:
    """Regenerate all color-filtered CSV files from latest card data.

    Always downloads a fresh ``cards.csv``, rewrites every color-identity CSV
    and then rebuilds the commander list. If pandas cannot parse the CSV
    strictly, the load is retried in a tolerant mode that warns on malformed
    rows and replaces undecodable characters.

    Raises:
        Exception: Any failure in download, loading or processing is logged
            and re-raised unchanged.
    """
    cards_file = f'{CSV_DIRECTORY}/cards.csv'
    try:
        logger.info('Downloading latest card data from MTGJSON')
        download_cards_csv(MTGJSON_API_URL, cards_file)

        logger.info('Loading and processing card data')
        try:
            df = pd.read_csv(cards_file, low_memory=False)
        except pd.errors.ParserError as e:
            logger.warning(f'CSV parsing error encountered: {e}. Retrying with error handling...')
            df = pd.read_csv(
                cards_file,
                low_memory=False,
                on_bad_lines='warn',  # Warn about malformed rows but continue
                encoding_errors='replace',  # Replace bad encoding chars
            )
            logger.info('Successfully loaded card data with error handling (some rows may have been skipped)')

        logger.info('Regenerating color identity sorted files')
        save_color_filtered_csvs(df, CSV_DIRECTORY)

        logger.info('Regenerating commander cards')
        determine_commanders()

        logger.info('Card database regeneration complete')
    except Exception as e:
        logger.error(f'Failed to regenerate card database: {str(e)}')
        raise
# Once files are regenerated, create a new legendary list (already executed in try)
def regenerate_csv_by_color(color: str) -> None:
    """Regenerate CSV file for a specific color identity.

    Downloads a fresh ``cards.csv``, applies the shared base filter (honoring
    the banned list) and writes only the rows whose ``colorIdentity`` equals
    the abbreviation matching ``color``.

    Args:
        color: Color name to regenerate CSV for (e.g. 'white', 'blue')

    Raises:
        ValueError: If color is not one of ``SETUP_COLORS``
        Exception: Any download or processing failure is logged and re-raised
    """
    try:
        if color not in SETUP_COLORS:
            raise ValueError(f'Invalid color: {color}')

        # SETUP_COLORS and COLOR_ABRV are indexed in parallel
        color_abv = COLOR_ABRV[SETUP_COLORS.index(color)]

        logger.info(f'Downloading latest card data for {color} cards')
        download_cards_csv(MTGJSON_API_URL, f'{CSV_DIRECTORY}/cards.csv')

        logger.info('Loading and processing card data')
        df = pd.read_csv(
            f'{CSV_DIRECTORY}/cards.csv',
            low_memory=False,
            on_bad_lines='skip',  # Skip malformed rows (MTGJSON CSV has escaping issues)
            encoding_errors='replace'  # Replace bad encoding chars
        )

        logger.info(f'Regenerating {color} cards CSV')
        # Use shared utilities to base-filter once then slice color, honoring bans
        base_df = filter_dataframe(df, BANNED_CARDS)
        base_df[base_df['colorIdentity'] == color_abv].to_csv(
            f'{CSV_DIRECTORY}/{color}_cards.csv', index=False
        )

        logger.info(f'Successfully regenerated {color} cards database')

    except Exception as e:
        logger.error(f'Failed to regenerate {color} cards: {str(e)}')
        raise


def download_parquet_from_mtgjson(output_path: str) -> None:
    """Download MTGJSON cards.parquet file.

    The payload is streamed into ``<output_path>.part`` and moved onto
    ``output_path`` only after the whole body has been written, so an
    interrupted download never leaves a truncated file at the final path.

    Args:
        output_path: Where to save the downloaded Parquet file

    Raises:
        requests.RequestException: If download fails
        IOError: If file cannot be written
    """
    logger.info(f"Downloading MTGJSON Parquet from {MTGJSON_PARQUET_URL}")

    partial_path = f"{output_path}.part"
    try:
        # The context manager releases the streamed connection on every exit path
        with requests.get(MTGJSON_PARQUET_URL, stream=True, timeout=60) as response:
            response.raise_for_status()

            # Get file size for progress bar (0 when the header is absent)
            total_size = int(response.headers.get('content-length', 0))

            # Ensure output directory exists; dirname is '' for a bare filename
            # and os.makedirs('') would raise
            output_dir = os.path.dirname(output_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            # Download with progress bar
            bytes_written = 0
            with open(partial_path, 'wb') as f, tqdm(
                total=total_size or None,  # unknown size -> indeterminate bar
                unit='B',
                unit_scale=True,
                desc='Downloading cards.parquet'
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    if not chunk:
                        continue
                    f.write(chunk)
                    bytes_written += len(chunk)
                    pbar.update(len(chunk))

        os.replace(partial_path, output_path)
        # Report what was actually written rather than the advertised length
        logger.info(f"✓ Downloaded {bytes_written / (1024**2):.2f} MB to {output_path}")

    # RequestException derives from IOError, so it must be handled first
    except requests.RequestException as e:
        logger.error(f"Failed to download MTGJSON Parquet: {e}")
        raise
    except IOError as e:
        logger.error(f"Failed to write Parquet file: {e}")
        raise
    finally:
        # After a successful os.replace the temp file is gone; anything still
        # here is a leftover from a failed attempt.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                logger.warning(f"Could not remove partial download: {partial_path}")
class SetupOption(Enum):
    """Enum for setup menu options.

    Each member's value is the label shown to the user in the setup menu.
    """

    INITIAL_SETUP = 'Initial Setup'
    REGENERATE_CSV = 'Regenerate CSV Files'
    BACK = 'Back'
def _display_setup_menu() -> SetupOption:
"""Display the setup menu and return the selected option.
def is_valid_commander(row: pd.Series) -> bool:
"""Determine if a card can be a commander.
Returns:
SetupOption: The selected menu option
"""
if inquirer is not None:
question: List[Dict[str, Any]] = [
inquirer.List(
'menu',
choices=[option.value for option in SetupOption],
carousel=True)]
answer = inquirer.prompt(question)
return SetupOption(answer['menu'])
# Simple fallback when inquirer isn't installed (e.g., headless/container)
options = list(SetupOption)
print("\nSetup Menu:")
for idx, opt in enumerate(options, start=1):
print(f" {idx}) {opt.value}")
while True:
try:
sel = input("Select an option [1]: ").strip() or "1"
i = int(sel)
if 1 <= i <= len(options):
return options[i - 1]
except KeyboardInterrupt:
print("")
return SetupOption.BACK
except Exception:
pass
print("Invalid selection. Please try again.")
def setup() -> bool:
"""Run the setup process for the MTG Python Deckbuilder.
Criteria:
- Legendary Creature
- OR: Has "can be your commander" in text
- OR: Background (Partner with Background)
This function provides a menu-driven interface to:
1. Perform initial setup by downloading and processing card data
2. Regenerate CSV files with updated card data
3. Perform all tagging processes on the color-sorted csv files
The function handles errors gracefully and provides feedback through logging.
Returns:
bool: True if setup completed successfully, False otherwise
"""
try:
print('Which setup operation would you like to perform?\n'
'If this is your first time setting up, do the initial setup.\n'
'If you\'ve done the basic setup before, you can regenerate the CSV files\n')
Args:
row: DataFrame row with card data
choice = _display_setup_menu()
if choice == SetupOption.INITIAL_SETUP:
logger.info('Starting initial setup')
initial_setup()
logger.info('Initial setup completed successfully')
return True
elif choice == SetupOption.REGENERATE_CSV:
logger.info('Starting CSV regeneration')
regenerate_csvs_all()
logger.info('CSV regeneration completed successfully')
return True
elif choice == SetupOption.BACK:
logger.info('Setup cancelled by user')
return False
except Exception as e:
logger.error(f'Error during setup: {e}')
raise
Returns:
True if card can be a commander
"""
type_line = str(row.get('type', ''))
text = str(row.get('text', '')).lower()
# Legendary Creature
if 'Legendary' in type_line and 'Creature' in type_line:
return True
# Special text (e.g., "can be your commander")
if 'can be your commander' in text:
return True
# Backgrounds can be commanders (with Choose a Background)
if 'Background' in type_line:
return True
return False
def is_background(row: pd.Series) -> bool:
    """Determine if a card is a Background.

    Args:
        row: DataFrame row with card data; only its ``type`` entry is read,
            and a missing entry counts as empty

    Returns:
        True if the type line contains "Background" (case-sensitive)
    """
    type_line = str(row.get('type', ''))
    return 'Background' in type_line
def extract_creature_types(row: pd.Series) -> str:
    """Extract creature types from type line.

    Args:
        row: DataFrame row with card data; only its ``type`` entry is read

    Returns:
        The subtype text that follows the first em dash of a creature's type
        line, stripped of surrounding whitespace (e.g. ``'Human Wizard'``),
        or an empty string when the card is not a creature or has no
        subtypes.
    """
    type_line = str(row.get('type', ''))

    # Check if it's a creature
    if 'Creature' not in type_line:
        return ''

    # Split on the em dash to get subtypes. The separator is written as an
    # escape so it survives tools that drop non-ASCII characters; an empty
    # separator would make str.split raise ValueError.
    em_dash = '\u2014'
    parts = type_line.split(em_dash)
    if len(parts) < 2:
        return ''

    # Everything between the first and second dash, whitespace stripped
    return parts[1].strip()
def process_raw_parquet(raw_path: str, output_path: str) -> pd.DataFrame:
    """Process raw MTGJSON Parquet into processed all_cards.parquet.

    This function:
    1. Loads the raw Parquet through ``DataLoader``
    2. Fills NA values per ``settings.FILL_NA_COLUMNS`` (``faceName`` falls
       back to ``name``)
    3. Applies ``FILTER_CONFIG`` exclude/require rules, then removes illegal
       sets, banned cards and excluded card types
    4. Narrows to ``CSV_PROCESSING_COLUMNS``
    5. Sorts per ``SORT_CONFIG`` and deduplicates by ``faceName`` (keeps the
       first row per face name)
    6. Adds custom columns: creatureTypes, themeTags, isCommander, isBackground
    7. Reorders to ``settings.CARD_DATA_COLUMNS`` plus the two flags
    8. Validates the schema and writes to ``output_path``

    Args:
        raw_path: Path to raw cards.parquet from MTGJSON
        output_path: Path to save processed all_cards.parquet

    Returns:
        Processed DataFrame

    Raises:
        ValueError: If schema validation fails
    """
    logger.info(f"Processing {raw_path}")

    # Load raw Parquet with DataLoader
    loader = DataLoader()
    df = loader.read_cards(raw_path)

    logger.info(f"Loaded {len(df)} cards with {len(df.columns)} columns")

    # Step 1: Fill NA values
    logger.info("Filling NA values")
    for col, fill_value in settings.FILL_NA_COLUMNS.items():
        if col in df.columns:
            if col == 'faceName':
                # Single-faced cards have no faceName; use the card name
                df[col] = df[col].fillna(df['name'])
            else:
                df[col] = df[col].fillna(fill_value)

    # Step 2: Apply configuration-based filters (FILTER_CONFIG)
    logger.info("Applying configuration filters")
    for field, rules in FILTER_CONFIG.items():
        if field not in df.columns:
            logger.warning(f"Skipping filter for missing field: {field}")
            continue

        for rule_type, values in rules.items():
            if not values:
                continue

            if rule_type == 'exclude':
                for value in values:
                    mask = df[field].astype(str).str.contains(value, case=False, na=False, regex=False)
                    before = len(df)
                    df = df[~mask]
                    logger.debug(f"Excluded {field} containing '{value}': {before - len(df)} removed")
            elif rule_type == 'require':
                for value in values:
                    mask = df[field].astype(str).str.contains(value, case=False, na=False, regex=False)
                    before = len(df)
                    df = df[mask]
                    logger.debug(f"Required {field} containing '{value}': {before - len(df)} removed")

    # Step 3: Remove illegal sets
    # NOTE(review): assumes 'printings' holds strings and set codes contain no
    # regex metacharacters (str.contains defaults to regex matching) - confirm.
    if 'printings' in df.columns:
        logger.info("Removing illegal sets")
        for set_code in NON_LEGAL_SETS:
            before = len(df)
            df = df[~df['printings'].str.contains(set_code, na=False)]
            if len(df) < before:
                logger.debug(f"Removed set {set_code}: {before - len(df)} cards")

    # Step 4: Remove banned cards (case-insensitive on both name and faceName)
    logger.info("Removing banned cards")
    banned_set = {b.casefold() for b in BANNED_CARDS}
    name_lc = df['name'].astype(str).str.casefold()
    face_lc = df['faceName'].astype(str).str.casefold() if 'faceName' in df.columns else name_lc
    mask = ~(name_lc.isin(banned_set) | face_lc.isin(banned_set))
    before = len(df)
    df = df[mask]
    logger.debug(f"Removed banned cards: {before - len(df)} filtered out")

    # Step 5: Remove special card types
    logger.info("Removing special card types")
    for card_type in CARD_TYPES_TO_EXCLUDE:
        before = len(df)
        df = df[~df['type'].str.contains(card_type, na=False)]
        if len(df) < before:
            logger.debug(f"Removed type {card_type}: {before - len(df)} cards")

    # Step 6: Filter to essential columns only
    logger.info(f"Filtering to {len(CSV_PROCESSING_COLUMNS)} essential columns")
    df = df[CSV_PROCESSING_COLUMNS]

    # Step 7: Sort and deduplicate (CRITICAL: keeps only one printing per unique card)
    logger.info("Sorting and deduplicating cards")
    df = df.sort_values(
        by=SORT_CONFIG['columns'],
        key=lambda col: col.str.lower() if not SORT_CONFIG['case_sensitive'] else col
    )
    before = len(df)
    # .copy() detaches the result from the filtered frames above so the column
    # assignments in Step 8 write to an independent DataFrame.
    df = df.drop_duplicates(subset='faceName', keep='first').copy()
    logger.info(f"Deduplicated: {before} → {len(df)} cards ({before - len(df)} duplicate printings removed)")

    # Step 8: Add custom columns
    logger.info("Adding custom columns: creatureTypes, themeTags, isCommander, isBackground")

    # creatureTypes: extracted from type line
    df['creatureTypes'] = df.apply(extract_creature_types, axis=1)

    # themeTags: empty placeholder (filled during tagging)
    df['themeTags'] = ''

    # isCommander: boolean flag
    df['isCommander'] = df.apply(is_valid_commander, axis=1)

    # isBackground: boolean flag
    df['isBackground'] = df.apply(is_background, axis=1)

    # Reorder columns to settings.CARD_DATA_COLUMNS, with isCommander and
    # isBackground appended at the end
    final_columns = settings.CARD_DATA_COLUMNS + ['isCommander', 'isBackground']

    # Ensure all columns exist
    for col in final_columns:
        if col not in df.columns:
            logger.warning(f"Column {col} missing, adding empty column")
            df[col] = ''

    df = df[final_columns]

    logger.info(f"Final dataset: {len(df)} cards, {len(df.columns)} columns")
    logger.info(f"Commanders: {df['isCommander'].sum()}")
    logger.info(f"Backgrounds: {df['isBackground'].sum()}")

    # Validate schema (check required columns present)
    try:
        validate_schema(df)
        logger.info("✓ Schema validation passed")
    except ValueError as e:
        logger.error(f"Schema validation failed: {e}")
        raise

    # Write to processed directory
    logger.info(f"Writing processed Parquet to {output_path}")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    loader.write_cards(df, output_path)

    logger.info(f"✓ Created {output_path}")

    return df
def initial_setup() -> None:
    """Download and process MTGJSON Parquet data.

    Modern Parquet-based setup workflow (replaces legacy CSV approach).

    Workflow:
    1. Download cards.parquet from MTGJSON → card_files/raw/cards.parquet
       (skipped when the raw file already exists)
    2. Process and filter → card_files/processed/all_cards.parquet
    3. No color-specific files (filter at query time instead)

    Raises:
        Various exceptions from download/processing steps
    """
    logger.info("=" * 80)
    logger.info("Starting Parquet-based initial setup")
    logger.info("=" * 80)

    # Step 1: Download raw Parquet
    raw_dir = card_files_raw_dir()
    raw_path = os.path.join(raw_dir, "cards.parquet")

    if os.path.exists(raw_path):
        logger.info(f"Raw Parquet already exists: {raw_path}")
        logger.info("Skipping download (delete file to re-download)")
    else:
        download_parquet_from_mtgjson(raw_path)

    # Step 2: Process raw → processed
    processed_path = get_processed_cards_path()
    logger.info(f"Processing raw Parquet → {processed_path}")

    process_raw_parquet(raw_path, processed_path)

    logger.info("=" * 80)
    logger.info("✓ Parquet setup complete")
    logger.info(f" Raw: {raw_path}")
    logger.info(f" Processed: {processed_path}")
    logger.info("=" * 80)
def regenerate_processed_parquet() -> None:
    """Regenerate processed Parquet from existing raw file.

    Useful when:
    - Column processing logic changes
    - Adding new custom columns
    - Testing without re-downloading

    Raises:
        FileNotFoundError: If the raw cards.parquet has not been downloaded
    """
    logger.info("Regenerating processed Parquet from raw file")

    raw_path = os.path.join(card_files_raw_dir(), "cards.parquet")

    if not os.path.exists(raw_path):
        logger.error(f"Raw Parquet not found: {raw_path}")
        # Point at the function that actually performs the download
        logger.error("Run initial_setup() first to download")
        raise FileNotFoundError(f"Raw Parquet not found: {raw_path}")

    processed_path = get_processed_cards_path()
    process_raw_parquet(raw_path, processed_path)

    logger.info(f"✓ Regenerated {processed_path}")