mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2025-12-16 23:50:12 +01:00
feat: migrate to unified Parquet format with instant GitHub setup and 4x faster tagging
This commit is contained in:
parent
e9e949aae3
commit
8435312c8f
58 changed files with 11921 additions and 3961 deletions
@@ -1,362 +1,374 @@
"""MTG Python Deckbuilder setup module.
|
||||
"""Parquet-based setup for MTG Python Deckbuilder.
|
||||
|
||||
This module provides the main setup functionality for the MTG Python Deckbuilder
|
||||
application. It handles initial setup tasks such as downloading card data,
|
||||
creating color-filtered card lists, and gener logger.info(f'Downloading latest card data for {color} cards')
|
||||
download_cards_csv(MTGJSON_API_URL, f'{CSV_DIRECTORY}/cards.csv')
|
||||
This module handles downloading and processing MTGJSON Parquet data for the
|
||||
MTG Python Deckbuilder. It replaces the old CSV-based multi-file approach
|
||||
with a single-file Parquet workflow.
|
||||
|
||||
logger.info('Loading and processing card data')
|
||||
try:
|
||||
df = pd.read_csv(f'{CSV_DIRECTORY}/cards.csv', low_memory=False)
|
||||
except pd.errors.ParserError as e:
|
||||
logger.warning(f'CSV parsing error encountered: {e}. Retrying with error handling...')
|
||||
df = pd.read_csv(
|
||||
f'{CSV_DIRECTORY}/cards.csv',
|
||||
low_memory=False,
|
||||
on_bad_lines='warn', # Warn about malformed rows but continue
|
||||
encoding_errors='replace' # Replace bad encoding chars
|
||||
)
|
||||
logger.info('Successfully loaded card data with error handling (some rows may have been skipped)')
|
||||
Key Changes from CSV approach:
|
||||
- Single all_cards.parquet file instead of 18+ color-specific CSVs
|
||||
- Downloads from MTGJSON Parquet API (faster, smaller)
|
||||
- Adds isCommander and isBackground boolean flags
|
||||
- Filters to essential columns only (14 base + 4 custom = 18 total)
|
||||
- Uses DataLoader abstraction for format flexibility
|
||||
|
||||
logger.info(f'Regenerating {color} cards CSV')der-eligible card lists.
|
||||
|
||||
Key Features:
|
||||
- Initial setup and configuration
|
||||
- Card data download and processing
|
||||
- Color-based card filtering
|
||||
- Commander card list generation
|
||||
- CSV file management and validation
|
||||
|
||||
The module works in conjunction with setup_utils.py for utility functions and
|
||||
exceptions.py for error handling.
|
||||
Introduced in v3.0.0 as part of CSV→Parquet migration.
|
||||
"""
|
||||
|
||||
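Since color filtering now happens at query time rather than at setup time, here is a minimal consumer-side sketch (not part of this commit; assumes pandas with pyarrow installed, the all_cards.parquet layout described above, and a mono-color 'W' identity value matching the equality check used by the legacy color-slicing code):

import pandas as pd

# Load the single unified dataset (path is illustrative).
df = pd.read_parquet('card_files/processed/all_cards.parquet')

# Slice by color identity at query time instead of maintaining per-color CSVs.
white_cards = df[df['colorIdentity'] == 'W']

# The new boolean flags make commander/background lookups one-liners.
commanders = df[df['isCommander']]
backgrounds = df[df['isBackground']]
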
from __future__ import annotations

# Standard library imports
from enum import Enum
import os
from typing import List, Dict, Any

# Third-party imports (optional)
try:
    import inquirer  # type: ignore
except Exception:
    inquirer = None  # Fallback to simple input-based menu when unavailable
import pandas as pd
import requests
from tqdm import tqdm

# Local imports
from .data_loader import DataLoader, validate_schema
from .setup_constants import (
    CSV_PROCESSING_COLUMNS,
    CARD_TYPES_TO_EXCLUDE,
    NON_LEGAL_SETS,
    BANNED_CARDS,
    FILTER_CONFIG,
    SORT_CONFIG,
)
import logging_util
from settings import CSV_DIRECTORY
from .setup_constants import BANNED_CARDS, SETUP_COLORS, COLOR_ABRV, MTGJSON_API_URL
from .setup_utils import (
    download_cards_csv,
    filter_dataframe,
    process_legendary_cards,
    check_csv_exists,
    save_color_filtered_csvs,
    enrich_commander_rows_with_tags,
)
from exceptions import (
    CSVFileNotFoundError,
    CommanderValidationError,
    MTGJSONDownloadError
)
from scripts import generate_background_cards as background_cards_script
from path_util import card_files_raw_dir, get_processed_cards_path
import settings

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

logger = logging_util.get_logger(__name__)

# MTGJSON Parquet API URL
MTGJSON_PARQUET_URL = "https://mtgjson.com/api/v5/parquet/cards.parquet"

def _generate_background_catalog(cards_path: str, output_path: str) -> None:
    """Regenerate ``background_cards.csv`` from the latest cards dataset."""

    logger.info('Generating background cards catalog')
    args = [
        '--source', cards_path,
        '--output', output_path,
    ]
    try:
        background_cards_script.main(args)
    except Exception:  # pragma: no cover - surfaced to caller/test
        logger.exception('Failed to generate background catalog')
        raise
    else:
        logger.info('Background cards catalog generated successfully')

# Create logger for this module
logger = logging_util.logging.getLogger(__name__)
logger.setLevel(logging_util.LOG_LEVEL)
logger.addHandler(logging_util.file_handler)
logger.addHandler(logging_util.stream_handler)

# Create CSV directory if it doesn't exist
if not os.path.exists(CSV_DIRECTORY):
    os.makedirs(CSV_DIRECTORY)

## Note: using shared check_csv_exists from setup_utils to avoid duplication

def initial_setup() -> None:
    """Perform initial setup by downloading card data and creating filtered CSV files.

    Downloads the latest card data from MTGJSON if needed, creates color-filtered CSV files,
    and generates the commander-eligible cards list. Uses utility functions from setup_utils.py
    for file operations and data processing.

    Raises:
        CSVFileNotFoundError: If required CSV files cannot be found
        MTGJSONDownloadError: If card data download fails
        DataFrameProcessingError: If data processing fails
        ColorFilterError: If color filtering fails
    """
    logger.info('Checking for cards.csv file')

    try:
        cards_file = f'{CSV_DIRECTORY}/cards.csv'
        try:
            with open(cards_file, 'r', encoding='utf-8'):
                logger.info('cards.csv exists')
        except FileNotFoundError:
            logger.info('cards.csv not found, downloading from mtgjson')
            download_cards_csv(MTGJSON_API_URL, cards_file)

        df = pd.read_csv(cards_file, low_memory=False)

        logger.info('Checking for color identity sorted files')
        # Generate color-identity filtered CSVs in one pass
        save_color_filtered_csvs(df, CSV_DIRECTORY)

        # Generate commander list
        determine_commanders()

    except Exception as e:
        logger.error(f'Error during initial setup: {str(e)}')
        raise

## Removed local filter_by_color in favor of setup_utils.save_color_filtered_csvs

def determine_commanders() -> None:
    """Generate commander_cards.csv containing all cards eligible to be commanders.

    This function processes the card database to identify and validate commander-eligible cards,
    applying comprehensive validation steps and filtering criteria.

    Raises:
        CSVFileNotFoundError: If cards.csv is missing and cannot be downloaded
        MTGJSONDownloadError: If downloading cards data fails
        CommanderValidationError: If commander validation fails
        DataFrameProcessingError: If data processing operations fail
    """
    logger.info('Starting commander card generation process')

    try:
        # Check for cards.csv with progress tracking
        cards_file = f'{CSV_DIRECTORY}/cards.csv'
        if not check_csv_exists(cards_file):
            logger.info('cards.csv not found, initiating download')
            download_cards_csv(MTGJSON_API_URL, cards_file)
        else:
            logger.info('cards.csv found, proceeding with processing')

        # Load and process cards data
        logger.info('Loading card data from CSV')
        df = pd.read_csv(cards_file, low_memory=False)

        # Process legendary cards with validation
        logger.info('Processing and validating legendary cards')
        try:
            filtered_df = process_legendary_cards(df)
        except CommanderValidationError as e:
            logger.error(f'Commander validation failed: {str(e)}')
            raise

        # Apply standard filters
        logger.info('Applying standard card filters')
        filtered_df = filter_dataframe(filtered_df, BANNED_CARDS)

        logger.info('Enriching commander metadata with theme and creature tags')
        filtered_df = enrich_commander_rows_with_tags(filtered_df, CSV_DIRECTORY)

        # Save commander cards
        logger.info('Saving validated commander cards')
        commander_path = f'{CSV_DIRECTORY}/commander_cards.csv'
        filtered_df.to_csv(commander_path, index=False)

        background_output = f'{CSV_DIRECTORY}/background_cards.csv'
        _generate_background_catalog(cards_file, background_output)

        logger.info('Commander card generation completed successfully')

    except (CSVFileNotFoundError, MTGJSONDownloadError) as e:
        logger.error(f'File operation error: {str(e)}')
        raise
    except CommanderValidationError as e:
        logger.error(f'Commander validation error: {str(e)}')
        raise
    except Exception as e:
        logger.error(f'Unexpected error during commander generation: {str(e)}')
        raise

def regenerate_csvs_all() -> None:
    """Regenerate all color-filtered CSV files from latest card data.

    Downloads fresh card data and recreates all color-filtered CSV files.
    Useful for updating the card database when new sets are released.

    Raises:
        MTGJSONDownloadError: If card data download fails
        DataFrameProcessingError: If data processing fails
        ColorFilterError: If color filtering fails
    """
    try:
        logger.info('Downloading latest card data from MTGJSON')
        download_cards_csv(MTGJSON_API_URL, f'{CSV_DIRECTORY}/cards.csv')

        logger.info('Loading and processing card data')
        try:
            df = pd.read_csv(f'{CSV_DIRECTORY}/cards.csv', low_memory=False)
        except pd.errors.ParserError as e:
            logger.warning(f'CSV parsing error encountered: {e}. Retrying with error handling...')
            df = pd.read_csv(
                f'{CSV_DIRECTORY}/cards.csv',
                low_memory=False,
                on_bad_lines='warn',       # Warn about malformed rows but continue
                encoding_errors='replace'  # Replace bad encoding chars
            )
            logger.info('Successfully loaded card data with error handling (some rows may have been skipped)')

        logger.info('Regenerating color identity sorted files')
        save_color_filtered_csvs(df, CSV_DIRECTORY)

        logger.info('Regenerating commander cards')
        determine_commanders()

        logger.info('Card database regeneration complete')

    except Exception as e:
        logger.error(f'Failed to regenerate card database: {str(e)}')
        raise
    # Once files are regenerated, create a new legendary list (already executed in try)

def regenerate_csv_by_color(color: str) -> None:
    """Regenerate CSV file for a specific color identity.

    Args:
        color: Color name to regenerate CSV for (e.g. 'white', 'blue')

    Raises:
        ValueError: If color is not valid
        MTGJSONDownloadError: If card data download fails
        DataFrameProcessingError: If data processing fails
        ColorFilterError: If color filtering fails
    """
    try:
        if color not in SETUP_COLORS:
            raise ValueError(f'Invalid color: {color}')

        color_abv = COLOR_ABRV[SETUP_COLORS.index(color)]

        logger.info(f'Downloading latest card data for {color} cards')
        download_cards_csv(MTGJSON_API_URL, f'{CSV_DIRECTORY}/cards.csv')

        logger.info('Loading and processing card data')
        df = pd.read_csv(
            f'{CSV_DIRECTORY}/cards.csv',
            low_memory=False,
            on_bad_lines='skip',       # Skip malformed rows (MTGJSON CSV has escaping issues)
            encoding_errors='replace'  # Replace bad encoding chars
        )

        logger.info(f'Regenerating {color} cards CSV')
        # Use shared utilities to base-filter once then slice color, honoring bans
        base_df = filter_dataframe(df, BANNED_CARDS)
        base_df[base_df['colorIdentity'] == color_abv].to_csv(
            f'{CSV_DIRECTORY}/{color}_cards.csv', index=False
        )

        logger.info(f'Successfully regenerated {color} cards database')

    except Exception as e:
        logger.error(f'Failed to regenerate {color} cards: {str(e)}')
        raise


def download_parquet_from_mtgjson(output_path: str) -> None:
    """Download MTGJSON cards.parquet file.

    Args:
        output_path: Where to save the downloaded Parquet file

    Raises:
        requests.RequestException: If download fails
        IOError: If file cannot be written
    """
    logger.info(f"Downloading MTGJSON Parquet from {MTGJSON_PARQUET_URL}")

    try:
        response = requests.get(MTGJSON_PARQUET_URL, stream=True, timeout=60)
        response.raise_for_status()

        # Get file size for progress bar
        total_size = int(response.headers.get('content-length', 0))

        # Ensure output directory exists
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Download with progress bar
        with open(output_path, 'wb') as f, tqdm(
            total=total_size,
            unit='B',
            unit_scale=True,
            desc='Downloading cards.parquet'
        ) as pbar:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
                pbar.update(len(chunk))

        logger.info(f"✓ Downloaded {total_size / (1024**2):.2f} MB to {output_path}")

    except requests.RequestException as e:
        logger.error(f"Failed to download MTGJSON Parquet: {e}")
        raise
    except IOError as e:
        logger.error(f"Failed to write Parquet file: {e}")
        raise

class SetupOption(Enum):
    """Enum for setup menu options."""
    INITIAL_SETUP = 'Initial Setup'
    REGENERATE_CSV = 'Regenerate CSV Files'
    BACK = 'Back'


def _display_setup_menu() -> SetupOption:
    """Display the setup menu and return the selected option.

    Returns:
        SetupOption: The selected menu option
    """
    if inquirer is not None:
        question: List[Dict[str, Any]] = [
            inquirer.List(
                'menu',
                choices=[option.value for option in SetupOption],
                carousel=True)]
        answer = inquirer.prompt(question)
        return SetupOption(answer['menu'])

    # Simple fallback when inquirer isn't installed (e.g., headless/container)
    options = list(SetupOption)
    print("\nSetup Menu:")
    for idx, opt in enumerate(options, start=1):
        print(f"  {idx}) {opt.value}")
    while True:
        try:
            sel = input("Select an option [1]: ").strip() or "1"
            i = int(sel)
            if 1 <= i <= len(options):
                return options[i - 1]
        except KeyboardInterrupt:
            print("")
            return SetupOption.BACK
        except Exception:
            pass
        print("Invalid selection. Please try again.")

def setup() -> bool:
    """Run the setup process for the MTG Python Deckbuilder.

    This function provides a menu-driven interface to:
    1. Perform initial setup by downloading and processing card data
    2. Regenerate CSV files with updated card data
    3. Perform all tagging processes on the color-sorted csv files

    The function handles errors gracefully and provides feedback through logging.

    Returns:
        bool: True if setup completed successfully, False otherwise
    """
    try:
        print('Which setup operation would you like to perform?\n'
              'If this is your first time setting up, do the initial setup.\n'
              'If you\'ve done the basic setup before, you can regenerate the CSV files\n')

        choice = _display_setup_menu()

        if choice == SetupOption.INITIAL_SETUP:
            logger.info('Starting initial setup')
            initial_setup()
            logger.info('Initial setup completed successfully')
            return True

        elif choice == SetupOption.REGENERATE_CSV:
            logger.info('Starting CSV regeneration')
            regenerate_csvs_all()
            logger.info('CSV regeneration completed successfully')
            return True

        elif choice == SetupOption.BACK:
            logger.info('Setup cancelled by user')
            return False

    except Exception as e:
        logger.error(f'Error during setup: {e}')
        raise

def is_valid_commander(row: pd.Series) -> bool:
    """Determine if a card can be a commander.

    Criteria:
    - Legendary Creature
    - OR: Has "can be your commander" in text
    - OR: Background (Partner with Background)

    Args:
        row: DataFrame row with card data

    Returns:
        True if card can be a commander
    """
    type_line = str(row.get('type', ''))
    text = str(row.get('text', '')).lower()

    # Legendary Creature
    if 'Legendary' in type_line and 'Creature' in type_line:
        return True

    # Special text (e.g., "can be your commander")
    if 'can be your commander' in text:
        return True

    # Backgrounds can be commanders (with Choose a Background)
    if 'Background' in type_line:
        return True

    return False

def is_background(row: pd.Series) -> bool:
    """Determine if a card is a Background.

    Args:
        row: DataFrame row with card data

    Returns:
        True if card has Background type
    """
    type_line = str(row.get('type', ''))
    return 'Background' in type_line

def extract_creature_types(row: pd.Series) -> str:
    """Extract creature types from type line.

    Args:
        row: DataFrame row with card data

    Returns:
        Comma-separated creature types or empty string
    """
    type_line = str(row.get('type', ''))

    # Check if it's a creature
    if 'Creature' not in type_line:
        return ''

    # Split on — to get subtypes
    if '—' in type_line:
        parts = type_line.split('—')
        if len(parts) >= 2:
            # Get everything after the dash, strip whitespace
            subtypes = parts[1].strip()
            return subtypes

    return ''

def process_raw_parquet(raw_path: str, output_path: str) -> pd.DataFrame:
    """Process raw MTGJSON Parquet into processed all_cards.parquet.

    This function:
    1. Loads raw Parquet (all ~82 columns)
    2. Applies standard filtering (banned cards, illegal sets, special types)
    3. Filters to essential columns (CSV_PROCESSING_COLUMNS)
    4. Deduplicates by faceName (keeps first printing only)
    5. Adds custom columns: creatureTypes, themeTags, isCommander, isBackground
    6. Validates schema
    7. Writes to processed directory

    Args:
        raw_path: Path to raw cards.parquet from MTGJSON
        output_path: Path to save processed all_cards.parquet

    Returns:
        Processed DataFrame

    Raises:
        ValueError: If schema validation fails
    """
    logger.info(f"Processing {raw_path}")

    # Load raw Parquet with DataLoader
    loader = DataLoader()
    df = loader.read_cards(raw_path)

    logger.info(f"Loaded {len(df)} cards with {len(df.columns)} columns")

    # Step 1: Fill NA values
    logger.info("Filling NA values")
    for col, fill_value in settings.FILL_NA_COLUMNS.items():
        if col in df.columns:
            if col == 'faceName':
                df[col] = df[col].fillna(df['name'])
            else:
                df[col] = df[col].fillna(fill_value)

    # Step 2: Apply configuration-based filters (FILTER_CONFIG)
    logger.info("Applying configuration filters")
    for field, rules in FILTER_CONFIG.items():
        if field not in df.columns:
            logger.warning(f"Skipping filter for missing field: {field}")
            continue

        for rule_type, values in rules.items():
            if not values:
                continue

            if rule_type == 'exclude':
                for value in values:
                    mask = df[field].astype(str).str.contains(value, case=False, na=False, regex=False)
                    before = len(df)
                    df = df[~mask]
                    logger.debug(f"Excluded {field} containing '{value}': {before - len(df)} removed")
            elif rule_type == 'require':
                for value in values:
                    mask = df[field].astype(str).str.contains(value, case=False, na=False, regex=False)
                    before = len(df)
                    df = df[mask]
                    logger.debug(f"Required {field} containing '{value}': {before - len(df)} removed")

    # Step 3: Remove illegal sets
    if 'printings' in df.columns:
        logger.info("Removing illegal sets")
        for set_code in NON_LEGAL_SETS:
            before = len(df)
            df = df[~df['printings'].str.contains(set_code, na=False)]
            if len(df) < before:
                logger.debug(f"Removed set {set_code}: {before - len(df)} cards")

    # Step 4: Remove banned cards
    logger.info("Removing banned cards")
    banned_set = {b.casefold() for b in BANNED_CARDS}
    name_lc = df['name'].astype(str).str.casefold()
    face_lc = df['faceName'].astype(str).str.casefold() if 'faceName' in df.columns else name_lc
    mask = ~(name_lc.isin(banned_set) | face_lc.isin(banned_set))
    before = len(df)
    df = df[mask]
    logger.debug(f"Removed banned cards: {before - len(df)} filtered out")

    # Step 5: Remove special card types
    logger.info("Removing special card types")
    for card_type in CARD_TYPES_TO_EXCLUDE:
        before = len(df)
        df = df[~df['type'].str.contains(card_type, na=False)]
        if len(df) < before:
            logger.debug(f"Removed type {card_type}: {before - len(df)} cards")

    # Step 6: Filter to essential columns only (reduce from ~82 to 14)
    logger.info(f"Filtering to {len(CSV_PROCESSING_COLUMNS)} essential columns")
    df = df[CSV_PROCESSING_COLUMNS]

    # Step 7: Sort and deduplicate (CRITICAL: keeps only one printing per unique card)
    logger.info("Sorting and deduplicating cards")
    df = df.sort_values(
        by=SORT_CONFIG['columns'],
        key=lambda col: col.str.lower() if not SORT_CONFIG['case_sensitive'] else col
    )
    before = len(df)
    df = df.drop_duplicates(subset='faceName', keep='first')
    logger.info(f"Deduplicated: {before} → {len(df)} cards ({before - len(df)} duplicate printings removed)")

    # Step 8: Add custom columns
    logger.info("Adding custom columns: creatureTypes, themeTags, isCommander, isBackground")

    # creatureTypes: extracted from type line
    df['creatureTypes'] = df.apply(extract_creature_types, axis=1)

    # themeTags: empty placeholder (filled during tagging)
    df['themeTags'] = ''

    # isCommander: boolean flag
    df['isCommander'] = df.apply(is_valid_commander, axis=1)

    # isBackground: boolean flag
    df['isBackground'] = df.apply(is_background, axis=1)

    # Reorder columns to match CARD_DATA_COLUMNS
    # CARD_DATA_COLUMNS has: name, faceName, edhrecRank, colorIdentity, colors,
    #                        manaCost, manaValue, type, creatureTypes, text,
    #                        power, toughness, keywords, themeTags, layout, side
    # We need to add isCommander and isBackground at the end
    final_columns = settings.CARD_DATA_COLUMNS + ['isCommander', 'isBackground']

    # Ensure all columns exist
    for col in final_columns:
        if col not in df.columns:
            logger.warning(f"Column {col} missing, adding empty column")
            df[col] = ''

    df = df[final_columns]

    logger.info(f"Final dataset: {len(df)} cards, {len(df.columns)} columns")
    logger.info(f"Commanders: {df['isCommander'].sum()}")
    logger.info(f"Backgrounds: {df['isBackground'].sum()}")

    # Validate schema (check required columns present)
    try:
        validate_schema(df)
        logger.info("✓ Schema validation passed")
    except ValueError as e:
        logger.error(f"Schema validation failed: {e}")
        raise

    # Write to processed directory
    logger.info(f"Writing processed Parquet to {output_path}")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    loader.write_cards(df, output_path)

    logger.info(f"✓ Created {output_path}")

    return df

def initial_setup() -> None:
    """Download and process MTGJSON Parquet data.

    Modern Parquet-based setup workflow (replaces legacy CSV approach).

    Workflow:
    1. Download cards.parquet from MTGJSON → card_files/raw/cards.parquet
    2. Process and filter → card_files/processed/all_cards.parquet
    3. No color-specific files (filter at query time instead)

    Raises:
        Various exceptions from download/processing steps
    """
    logger.info("=" * 80)
    logger.info("Starting Parquet-based initial setup")
    logger.info("=" * 80)

    # Step 1: Download raw Parquet
    raw_dir = card_files_raw_dir()
    raw_path = os.path.join(raw_dir, "cards.parquet")

    if os.path.exists(raw_path):
        logger.info(f"Raw Parquet already exists: {raw_path}")
        logger.info("Skipping download (delete file to re-download)")
    else:
        download_parquet_from_mtgjson(raw_path)

    # Step 2: Process raw → processed
    processed_path = get_processed_cards_path()

    logger.info(f"Processing raw Parquet → {processed_path}")
    process_raw_parquet(raw_path, processed_path)

    logger.info("=" * 80)
    logger.info("✓ Parquet setup complete")
    logger.info(f"  Raw: {raw_path}")
    logger.info(f"  Processed: {processed_path}")
    logger.info("=" * 80)

def regenerate_processed_parquet() -> None:
    """Regenerate processed Parquet from existing raw file.

    Useful when:
    - Column processing logic changes
    - Adding new custom columns
    - Testing without re-downloading
    """
    logger.info("Regenerating processed Parquet from raw file")

    raw_path = os.path.join(card_files_raw_dir(), "cards.parquet")

    if not os.path.exists(raw_path):
        logger.error(f"Raw Parquet not found: {raw_path}")
        logger.error("Run initial_setup() first to download")
        raise FileNotFoundError(f"Raw Parquet not found: {raw_path}")

    processed_path = get_processed_cards_path()
    process_raw_parquet(raw_path, processed_path)

    logger.info(f"✓ Regenerated {processed_path}")