"""Parquet-based setup for MTG Python Deckbuilder. This module handles downloading and processing MTGJSON Parquet data for the MTG Python Deckbuilder. It replaces the old CSV-based multi-file approach with a single-file Parquet workflow. Key Changes from CSV approach: - Single all_cards.parquet file instead of 18+ color-specific CSVs - Downloads from MTGJSON Parquet API (faster, smaller) - Adds isCommander and isBackground boolean flags - Filters to essential columns only (14 base + 4 custom = 18 total) - Uses DataLoader abstraction for format flexibility Introduced in v3.0.0 as part of CSV→Parquet migration. """ from __future__ import annotations import os import pandas as pd import requests from tqdm import tqdm from .data_loader import DataLoader, validate_schema from .setup_constants import ( CSV_PROCESSING_COLUMNS, CARD_TYPES_TO_EXCLUDE, NON_LEGAL_SETS, BANNED_CARDS, FILTER_CONFIG, SORT_CONFIG, ) import logging_util from path_util import card_files_raw_dir, get_processed_cards_path import settings logger = logging_util.get_logger(__name__) # MTGJSON Parquet API URL MTGJSON_PARQUET_URL = "https://mtgjson.com/api/v5/parquet/cards.parquet" def download_parquet_from_mtgjson(output_path: str) -> None: """Download MTGJSON cards.parquet file. Args: output_path: Where to save the downloaded Parquet file Raises: requests.RequestException: If download fails IOError: If file cannot be written """ logger.info(f"Downloading MTGJSON Parquet from {MTGJSON_PARQUET_URL}") try: response = requests.get(MTGJSON_PARQUET_URL, stream=True, timeout=60) response.raise_for_status() # Get file size for progress bar total_size = int(response.headers.get('content-length', 0)) # Ensure output directory exists os.makedirs(os.path.dirname(output_path), exist_ok=True) # Download with progress bar with open(output_path, 'wb') as f, tqdm( total=total_size, unit='B', unit_scale=True, desc='Downloading cards.parquet' ) as pbar: for chunk in response.iter_content(chunk_size=8192): f.write(chunk) pbar.update(len(chunk)) logger.info(f"✓ Downloaded {total_size / (1024**2):.2f} MB to {output_path}") except requests.RequestException as e: logger.error(f"Failed to download MTGJSON Parquet: {e}") raise except IOError as e: logger.error(f"Failed to write Parquet file: {e}") raise def is_valid_commander(row: pd.Series) -> bool: """Determine if a card can be a commander. Criteria: - Legendary Creature - OR: Has "can be your commander" in text - OR: Background (Partner with Background) Args: row: DataFrame row with card data Returns: True if card can be a commander """ type_line = str(row.get('type', '')) text = str(row.get('text', '')).lower() # Legendary Creature if 'Legendary' in type_line and 'Creature' in type_line: return True # Special text (e.g., "can be your commander") if 'can be your commander' in text: return True # Backgrounds can be commanders (with Choose a Background) if 'Background' in type_line: return True return False def is_background(row: pd.Series) -> bool: """Determine if a card is a Background. Args: row: DataFrame row with card data Returns: True if card has Background type """ type_line = str(row.get('type', '')) return 'Background' in type_line def extract_creature_types(row: pd.Series) -> str: """Extract creature types from type line. 


def process_raw_parquet(raw_path: str, output_path: str) -> pd.DataFrame:
    """Process raw MTGJSON Parquet into the processed all_cards.parquet.

    This function:
    1. Loads the raw Parquet (all ~82 columns)
    2. Filters to essential columns (CSV_PROCESSING_COLUMNS)
    3. Applies standard filtering (banned cards, illegal sets, special types)
    4. Deduplicates by faceName (keeps the first printing only)
    5. Adds custom columns: creatureTypes, themeTags, isCommander, isBackground
    6. Validates the schema
    7. Writes to the processed directory

    Args:
        raw_path: Path to the raw cards.parquet from MTGJSON
        output_path: Path to save the processed all_cards.parquet

    Returns:
        Processed DataFrame

    Raises:
        ValueError: If schema validation fails
    """
    logger.info(f"Processing {raw_path}")

    # Load raw Parquet with DataLoader
    loader = DataLoader()
    df = loader.read_cards(raw_path)
    logger.info(f"Loaded {len(df)} cards with {len(df.columns)} columns")

    # Step 1: Fill NA values
    logger.info("Filling NA values")
    for col, fill_value in settings.FILL_NA_COLUMNS.items():
        if col in df.columns:
            if col == 'faceName':
                df[col] = df[col].fillna(df['name'])
            else:
                df[col] = df[col].fillna(fill_value)

    # Step 2: Apply configuration-based filters (FILTER_CONFIG)
    logger.info("Applying configuration filters")
    for field, rules in FILTER_CONFIG.items():
        if field not in df.columns:
            logger.warning(f"Skipping filter for missing field: {field}")
            continue

        for rule_type, values in rules.items():
            if not values:
                continue

            if rule_type == 'exclude':
                for value in values:
                    mask = df[field].astype(str).str.contains(value, case=False, na=False, regex=False)
                    before = len(df)
                    df = df[~mask]
                    logger.debug(f"Excluded {field} containing '{value}': {before - len(df)} removed")
            elif rule_type == 'require':
                for value in values:
                    mask = df[field].astype(str).str.contains(value, case=False, na=False, regex=False)
                    before = len(df)
                    df = df[mask]
                    logger.debug(f"Required {field} containing '{value}': {before - len(df)} removed")
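
    # Note (editor sketch): the loop above assumes FILTER_CONFIG is a nested
    # mapping of field name → rule type → list of substrings, applied
    # case-insensitively as literal (non-regex) matches. A hypothetical entry
    # might look like:
    #
    #     FILTER_CONFIG = {
    #         'availability': {'require': ['paper'], 'exclude': []},
    #         'layout': {'require': [], 'exclude': ['reversible_card']},
    #     }
    #
    # The actual field names and values live in setup_constants.FILTER_CONFIG.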

    # Step 3: Remove illegal sets
    if 'printings' in df.columns:
        logger.info("Removing illegal sets")
        for set_code in NON_LEGAL_SETS:
            before = len(df)
            df = df[~df['printings'].str.contains(set_code, na=False)]
            if len(df) < before:
                logger.debug(f"Removed set {set_code}: {before - len(df)} cards")

    # Step 4: Remove banned cards
    logger.info("Removing banned cards")
    banned_set = {b.casefold() for b in BANNED_CARDS}
    name_lc = df['name'].astype(str).str.casefold()
    face_lc = df['faceName'].astype(str).str.casefold() if 'faceName' in df.columns else name_lc
    mask = ~(name_lc.isin(banned_set) | face_lc.isin(banned_set))
    before = len(df)
    df = df[mask]
    logger.debug(f"Removed banned cards: {before - len(df)} filtered out")

    # Step 5: Remove special card types
    logger.info("Removing special card types")
    for card_type in CARD_TYPES_TO_EXCLUDE:
        before = len(df)
        df = df[~df['type'].str.contains(card_type, na=False)]
        if len(df) < before:
            logger.debug(f"Removed type {card_type}: {before - len(df)} cards")

    # Step 6: Filter to essential columns only (reduce from ~82 to 14)
    logger.info(f"Filtering to {len(CSV_PROCESSING_COLUMNS)} essential columns")
    df = df[CSV_PROCESSING_COLUMNS]

    # Step 7: Sort and deduplicate (CRITICAL: keeps only one printing per unique card)
    logger.info("Sorting and deduplicating cards")
    df = df.sort_values(
        by=SORT_CONFIG['columns'],
        key=lambda col: col.str.lower() if not SORT_CONFIG['case_sensitive'] else col
    )
    before = len(df)
    df = df.drop_duplicates(subset='faceName', keep='first')
    logger.info(f"Deduplicated: {before} → {len(df)} cards ({before - len(df)} duplicate printings removed)")

    # Step 8: Add custom columns
    logger.info("Adding custom columns: creatureTypes, themeTags, isCommander, isBackground")

    # creatureTypes: extracted from the type line
    df['creatureTypes'] = df.apply(extract_creature_types, axis=1)

    # themeTags: empty placeholder (filled during tagging)
    df['themeTags'] = ''

    # isCommander: boolean flag
    df['isCommander'] = df.apply(is_valid_commander, axis=1)

    # isBackground: boolean flag
    df['isBackground'] = df.apply(is_background, axis=1)

    # Reorder columns to match CARD_DATA_COLUMNS.
    # CARD_DATA_COLUMNS has: name, faceName, edhrecRank, colorIdentity, colors,
    #                        manaCost, manaValue, type, creatureTypes, text,
    #                        power, toughness, keywords, themeTags, layout, side
    # We need to add isCommander and isBackground at the end.
    final_columns = settings.CARD_DATA_COLUMNS + ['isCommander', 'isBackground']

    # Ensure all columns exist
    for col in final_columns:
        if col not in df.columns:
            logger.warning(f"Column {col} missing, adding empty column")
            df[col] = ''

    df = df[final_columns]

    logger.info(f"Final dataset: {len(df)} cards, {len(df.columns)} columns")
    logger.info(f"Commanders: {df['isCommander'].sum()}")
    logger.info(f"Backgrounds: {df['isBackground'].sum()}")

    # Validate schema (check required columns present)
    try:
        validate_schema(df)
        logger.info("✓ Schema validation passed")
    except ValueError as e:
        logger.error(f"Schema validation failed: {e}")
        raise

    # Write to the processed directory
    logger.info(f"Writing processed Parquet to {output_path}")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    loader.write_cards(df, output_path)
    logger.info(f"✓ Created {output_path}")

    return df


def initial_setup() -> None:
    """Download and process MTGJSON Parquet data.

    Modern Parquet-based setup workflow (replaces the legacy CSV approach).

    Workflow:
    1. Download cards.parquet from MTGJSON → card_files/raw/cards.parquet
    2. Process and filter → card_files/processed/all_cards.parquet
    3. No color-specific files (filter at query time instead)

    Raises:
        Various exceptions from download/processing steps
    """
    logger.info("=" * 80)
    logger.info("Starting Parquet-based initial setup")
    logger.info("=" * 80)

    # Step 1: Download raw Parquet
    raw_dir = card_files_raw_dir()
    raw_path = os.path.join(raw_dir, "cards.parquet")

    if os.path.exists(raw_path):
        logger.info(f"Raw Parquet already exists: {raw_path}")
        logger.info("Skipping download (delete file to re-download)")
    else:
        download_parquet_from_mtgjson(raw_path)

    # Step 2: Process raw → processed
    processed_path = get_processed_cards_path()
    logger.info(f"Processing raw Parquet → {processed_path}")
    process_raw_parquet(raw_path, processed_path)

    logger.info("=" * 80)
    logger.info("✓ Parquet setup complete")
    logger.info(f"  Raw: {raw_path}")
    logger.info(f"  Processed: {processed_path}")
    logger.info("=" * 80)

    # Step 3: Optional image caching (if enabled)
    try:
        from code.file_setup.image_cache import ImageCache

        cache = ImageCache()
        if cache.is_enabled():
            logger.info("=" * 80)
            logger.info("Card image caching enabled - starting download")
            logger.info("=" * 80)

            # Download bulk data
            logger.info("Downloading Scryfall bulk data...")
            cache.download_bulk_data()

            # Download images
            logger.info("Downloading card images (this may take 1-2 hours)...")

            def progress(current, total, card_name):
                if current % 100 == 0:  # Log every 100 cards
                    pct = (current / total) * 100
                    logger.info(f"  Progress: {current}/{total} ({pct:.1f}%) - {card_name}")

            stats = cache.download_images(progress_callback=progress)

            logger.info("=" * 80)
            logger.info("✓ Image cache complete")
            logger.info(f"  Downloaded: {stats['downloaded']}")
            logger.info(f"  Skipped: {stats['skipped']}")
            logger.info(f"  Failed: {stats['failed']}")
            logger.info("=" * 80)
        else:
            logger.info("Card image caching disabled (CACHE_CARD_IMAGES=0)")
            logger.info("Images will be fetched from Scryfall API on demand")
    except Exception as e:
        logger.error(f"Failed to cache images (continuing anyway): {e}")
        logger.error("Images will be fetched from Scryfall API on demand")
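

# Query-time filtering sketch (editor addition): because there are no
# color-specific files, consumers are expected to load the single processed
# Parquet and filter in memory. A minimal example with plain pandas (the
# project itself goes through DataLoader; the path comes from
# get_processed_cards_path()):
#
#     >>> import pandas as pd
#     >>> cards = pd.read_parquet(get_processed_cards_path())
#     >>> commanders = cards[cards['isCommander']]
#     >>> mono_green = cards[cards['colorIdentity'].astype(str) == 'G']
#
# The exact colorIdentity encoding (single string vs. list) depends on the
# MTGJSON schema, so treat the last line as illustrative only.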


def regenerate_processed_parquet() -> None:
    """Regenerate the processed Parquet from an existing raw file.

    Useful when:
    - Column processing logic changes
    - Adding new custom columns
    - Testing without re-downloading

    Raises:
        FileNotFoundError: If the raw Parquet has not been downloaded yet
    """
    logger.info("Regenerating processed Parquet from raw file")

    raw_path = os.path.join(card_files_raw_dir(), "cards.parquet")
    if not os.path.exists(raw_path):
        logger.error(f"Raw Parquet not found: {raw_path}")
        logger.error("Run initial_setup() first to download")
        raise FileNotFoundError(f"Raw Parquet not found: {raw_path}")

    processed_path = get_processed_cards_path()
    process_raw_parquet(raw_path, processed_path)

    logger.info(f"✓ Regenerated {processed_path}")
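

# Editor sketch (assumption, not in the original module): a guarded entry point
# so the setup can be run directly, e.g. via `python -m <package>.<this_module>`
# (the -m form is needed because of the relative imports above). Remove if the
# project drives setup exclusively through its own CLI or menus.
if __name__ == "__main__":
    initial_setup()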