feat: migrate to unified Parquet format with instant GitHub setup and 4x faster tagging

2026-03-08 06:32:37 +01:00 · 2025-10-18 21:32:12 -07:00 · 2025-10-18 21:32:12 -07:00 · 8435312c8f
commit 8435312c8f
parent e9e949aae3
58 changed files with 11921 additions and 3961 deletions
--- a/code/file_setup/setup.py
+++ b/code/file_setup/setup.py
@ -1,362 +1,374 @@
-"""MTG Python Deckbuilder setup module.
+"""Parquet-based setup for MTG Python Deckbuilder.

-This module provides the main setup functionality for the MTG Python Deckbuilder
-application. It handles initial setup tasks such as downloading card data,
-creating color-filtered card lists, and gener        logger.info(f'Downloading latest card data for {color} cards')
-        download_cards_csv(MTGJSON_API_URL, f'{CSV_DIRECTORY}/cards.csv')
+This module handles downloading and processing MTGJSON Parquet data for the
+MTG Python Deckbuilder. It replaces the old CSV-based multi-file approach
+with a single-file Parquet workflow.

-        logger.info('Loading and processing card data')
-        try:
-            df = pd.read_csv(f'{CSV_DIRECTORY}/cards.csv', low_memory=False)
-        except pd.errors.ParserError as e:
-            logger.warning(f'CSV parsing error encountered: {e}. Retrying with error handling...')
-            df = pd.read_csv(
-                f'{CSV_DIRECTORY}/cards.csv',
-                low_memory=False,
-                on_bad_lines='warn',  # Warn about malformed rows but continue
-                encoding_errors='replace'  # Replace bad encoding chars
-            )
-            logger.info('Successfully loaded card data with error handling (some rows may have been skipped)')
+Key Changes from CSV approach:
+- Single all_cards.parquet file instead of 18+ color-specific CSVs
+- Downloads from MTGJSON Parquet API (faster, smaller)
+- Adds isCommander and isBackground boolean flags
+- Filters to essential columns only (14 base + 4 custom = 18 total)
+- Uses DataLoader abstraction for format flexibility

-        logger.info(f'Regenerating {color} cards CSV')der-eligible card lists.
-
-Key Features:
-    - Initial setup and configuration
-    - Card data download and processing
-    - Color-based card filtering
-    - Commander card list generation
-    - CSV file management and validation
-
-The module works in conjunction with setup_utils.py for utility functions and
-exceptions.py for error handling.
+Introduced in v3.0.0 as part of CSV→Parquet migration.
 """

 from __future__ import annotations

-# Standard library imports
-from enum import Enum
 import os
-from typing import List, Dict, Any

-# Third-party imports (optional)
-try:
-    import inquirer  # type: ignore
-except Exception:
-    inquirer = None  # Fallback to simple input-based menu when unavailable
 import pandas as pd
+import requests
+from tqdm import tqdm

-# Local imports
+from .data_loader import DataLoader, validate_schema
+from .setup_constants import (
+    CSV_PROCESSING_COLUMNS,
+    CARD_TYPES_TO_EXCLUDE,
+    NON_LEGAL_SETS,
+    BANNED_CARDS,
+    FILTER_CONFIG,
+    SORT_CONFIG,
+)
 import logging_util
-from settings import CSV_DIRECTORY
-from .setup_constants import BANNED_CARDS, SETUP_COLORS, COLOR_ABRV, MTGJSON_API_URL
-from .setup_utils import (
-    download_cards_csv,
-    filter_dataframe,
-    process_legendary_cards,
-    check_csv_exists,
-    save_color_filtered_csvs,
-    enrich_commander_rows_with_tags,
-)
-from exceptions import (
-    CSVFileNotFoundError,
-    CommanderValidationError,
-    MTGJSONDownloadError
-)
-from scripts import generate_background_cards as background_cards_script
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
+from path_util import card_files_raw_dir, get_processed_cards_path
+import settings
+
+# Module-level logger, obtained through the project's logging_util helper
+logger = logging_util.get_logger(__name__)
+
+# MTGJSON Parquet API URL; the single source fetched by
+# download_parquet_from_mtgjson()
+MTGJSON_PARQUET_URL = "https://mtgjson.com/api/v5/parquet/cards.parquet"


-def _generate_background_catalog(cards_path: str, output_path: str) -> None:
-    """Regenerate ``background_cards.csv`` from the latest cards dataset."""
-
-    logger.info('Generating background cards catalog')
-    args = [
-        '--source', cards_path,
-        '--output', output_path,
-    ]
-    try:
-        background_cards_script.main(args)
-    except Exception:  # pragma: no cover - surfaced to caller/test
-        logger.exception('Failed to generate background catalog')
-        raise
-    else:
-        logger.info('Background cards catalog generated successfully')
-
-# Create logger for this module
-logger = logging_util.logging.getLogger(__name__)
-logger.setLevel(logging_util.LOG_LEVEL)
-logger.addHandler(logging_util.file_handler)
-logger.addHandler(logging_util.stream_handler)
-
-# Create CSV directory if it doesn't exist
-if not os.path.exists(CSV_DIRECTORY):
-    os.makedirs(CSV_DIRECTORY)
-
-## Note: using shared check_csv_exists from setup_utils to avoid duplication
-
-def initial_setup() -> None:
-    """Perform initial setup by downloading card data and creating filtered CSV files.
-    
-    Downloads the latest card data from MTGJSON if needed, creates color-filtered CSV files,
-    and generates commander-eligible cards list. Uses utility functions from setup_utils.py
-    for file operations and data processing.
-    
-    Raises:
-        CSVFileNotFoundError: If required CSV files cannot be found
-        MTGJSONDownloadError: If card data download fails
-        DataFrameProcessingError: If data processing fails
-        ColorFilterError: If color filtering fails
-    """
-    logger.info('Checking for cards.csv file')
-    
-    try:
-        cards_file = f'{CSV_DIRECTORY}/cards.csv'
-        try:
-            with open(cards_file, 'r', encoding='utf-8'):
-                logger.info('cards.csv exists')
-        except FileNotFoundError:
-            logger.info('cards.csv not found, downloading from mtgjson')
-            download_cards_csv(MTGJSON_API_URL, cards_file)
-        
-        df = pd.read_csv(cards_file, low_memory=False)
-        
-        logger.info('Checking for color identity sorted files')
-        # Generate color-identity filtered CSVs in one pass
-        save_color_filtered_csvs(df, CSV_DIRECTORY)
-        
-        # Generate commander list
-        determine_commanders()
-
-    except Exception as e:
-        logger.error(f'Error during initial setup: {str(e)}')
-        raise
-
-## Removed local filter_by_color in favor of setup_utils.save_color_filtered_csvs
-
-def determine_commanders() -> None:
-    """Generate commander_cards.csv containing all cards eligible to be commanders.
-    
-    This function processes the card database to identify and validate commander-eligible cards,
-    applying comprehensive validation steps and filtering criteria.
-    
-    Raises:
-        CSVFileNotFoundError: If cards.csv is missing and cannot be downloaded
-        MTGJSONDownloadError: If downloading cards data fails
-        CommanderValidationError: If commander validation fails
-        DataFrameProcessingError: If data processing operations fail
-    """
-    logger.info('Starting commander card generation process')
-    
-    try:
-        # Check for cards.csv with progress tracking
-        cards_file = f'{CSV_DIRECTORY}/cards.csv'
-        if not check_csv_exists(cards_file):
-            logger.info('cards.csv not found, initiating download')
-            download_cards_csv(MTGJSON_API_URL, cards_file)
-        else:
-            logger.info('cards.csv found, proceeding with processing')
-        
-        # Load and process cards data
-        logger.info('Loading card data from CSV')
-        df = pd.read_csv(cards_file, low_memory=False)
-        
-        # Process legendary cards with validation
-        logger.info('Processing and validating legendary cards')
-        try:
-            filtered_df = process_legendary_cards(df)
-        except CommanderValidationError as e:
-            logger.error(f'Commander validation failed: {str(e)}')
-            raise
-        
-        # Apply standard filters
-        logger.info('Applying standard card filters')
-        filtered_df = filter_dataframe(filtered_df, BANNED_CARDS)
-        
-        logger.info('Enriching commander metadata with theme and creature tags')
-        filtered_df = enrich_commander_rows_with_tags(filtered_df, CSV_DIRECTORY)
-
-        # Save commander cards
-        logger.info('Saving validated commander cards')
-        commander_path = f'{CSV_DIRECTORY}/commander_cards.csv'
-        filtered_df.to_csv(commander_path, index=False)
-
-        background_output = f'{CSV_DIRECTORY}/background_cards.csv'
-        _generate_background_catalog(cards_file, background_output)
-
-        logger.info('Commander card generation completed successfully')
-        
-    except (CSVFileNotFoundError, MTGJSONDownloadError) as e:
-        logger.error(f'File operation error: {str(e)}')
-        raise
-    except CommanderValidationError as e:
-        logger.error(f'Commander validation error: {str(e)}')
-        raise
-    except Exception as e:
-        logger.error(f'Unexpected error during commander generation: {str(e)}')
-        raise
-    
-def regenerate_csvs_all() -> None:
-    """Regenerate all color-filtered CSV files from latest card data.
-    
-    Downloads fresh card data and recreates all color-filtered CSV files.
-    Useful for updating the card database when new sets are released.
-    
-    Raises:
-        MTGJSONDownloadError: If card data download fails
-        DataFrameProcessingError: If data processing fails
-        ColorFilterError: If color filtering fails
-    """
-    try:
-        logger.info('Downloading latest card data from MTGJSON')
-        download_cards_csv(MTGJSON_API_URL, f'{CSV_DIRECTORY}/cards.csv')
-        
-        logger.info('Loading and processing card data')
-        try:
-            df = pd.read_csv(f'{CSV_DIRECTORY}/cards.csv', low_memory=False)
-        except pd.errors.ParserError as e:
-            logger.warning(f'CSV parsing error encountered: {e}. Retrying with error handling...')
-            df = pd.read_csv(
-                f'{CSV_DIRECTORY}/cards.csv',
-                low_memory=False,
-                on_bad_lines='warn',  # Warn about malformed rows but continue
-                encoding_errors='replace'  # Replace bad encoding chars
-            )
-            logger.info(f'Successfully loaded card data with error handling (some rows may have been skipped)')
-        
-        logger.info('Regenerating color identity sorted files')
-        save_color_filtered_csvs(df, CSV_DIRECTORY)
-            
-        logger.info('Regenerating commander cards')
-        determine_commanders()
-        
-        logger.info('Card database regeneration complete')
-        
-    except Exception as e:
-        logger.error(f'Failed to regenerate card database: {str(e)}')
-        raise
-    # Once files are regenerated, create a new legendary list (already executed in try)
-
-def regenerate_csv_by_color(color: str) -> None:
-    """Regenerate CSV file for a specific color identity.
+def download_parquet_from_mtgjson(output_path: str) -> None:
+    """Download the MTGJSON cards.parquet file to ``output_path``.
+
+    The payload is streamed into ``<output_path>.part`` and only renamed to
+    ``output_path`` after the transfer completed. initial_setup() skips the
+    download whenever ``output_path`` exists, so a truncated file must never
+    be left at the final location.
    
    Args:
-        color: Color name to regenerate CSV for (e.g. 'white', 'blue')
+        output_path: Where to save the downloaded Parquet file
        
    Raises:
-        ValueError: If color is not valid
-        MTGJSONDownloadError: If card data download fails
-        DataFrameProcessingError: If data processing fails
-        ColorFilterError: If color filtering fails
+        requests.RequestException: If download fails
+        IOError: If file cannot be written
    """
+    logger.info(f"Downloading MTGJSON Parquet from {MTGJSON_PARQUET_URL}")
+    
    try:
-        if color not in SETUP_COLORS:
-            raise ValueError(f'Invalid color: {color}')
-
-        color_abv = COLOR_ABRV[SETUP_COLORS.index(color)]
-
-        logger.info(f'Downloading latest card data for {color} cards')
-        download_cards_csv(MTGJSON_API_URL, f'{CSV_DIRECTORY}/cards.csv')
-
-        logger.info('Loading and processing card data')
-        df = pd.read_csv(
-            f'{CSV_DIRECTORY}/cards.csv',
-            low_memory=False,
-            on_bad_lines='skip',  # Skip malformed rows (MTGJSON CSV has escaping issues)
-            encoding_errors='replace'  # Replace bad encoding chars
-        )
-
-        logger.info(f'Regenerating {color} cards CSV')
-        # Use shared utilities to base-filter once then slice color, honoring bans
-        base_df = filter_dataframe(df, BANNED_CARDS)
-        base_df[base_df['colorIdentity'] == color_abv].to_csv(
-            f'{CSV_DIRECTORY}/{color}_cards.csv', index=False
-        )
-
-        logger.info(f'Successfully regenerated {color} cards database')
-
-    except Exception as e:
-        logger.error(f'Failed to regenerate {color} cards: {str(e)}')
+        # Temporary sibling file; promoted to output_path only on success
+        partial_path = f"{output_path}.part"
+        bytes_written = 0
+        
+        try:
+            # The context manager releases the streamed connection on every path
+            with requests.get(MTGJSON_PARQUET_URL, stream=True, timeout=60) as response:
+                response.raise_for_status()
+                
+                # Get file size for progress bar (0 when no Content-Length is sent)
+                total_size = int(response.headers.get('content-length', 0))
+                
+                # Ensure output directory exists. A bare filename has no
+                # directory part and os.makedirs('') would raise.
+                output_dir = os.path.dirname(output_path)
+                if output_dir:
+                    os.makedirs(output_dir, exist_ok=True)
+                
+                # Download with progress bar
+                with open(partial_path, 'wb') as f, tqdm(
+                    total=total_size,
+                    unit='B',
+                    unit_scale=True,
+                    desc='Downloading cards.parquet'
+                ) as pbar:
+                    for chunk in response.iter_content(chunk_size=8192):
+                        f.write(chunk)
+                        bytes_written += len(chunk)
+                        pbar.update(len(chunk))
+            
+            # Atomic promotion of the completed download
+            os.replace(partial_path, output_path)
+        finally:
+            # After a successful replace the partial file is gone; otherwise
+            # discard the incomplete download
+            if os.path.exists(partial_path):
+                os.remove(partial_path)
+        
+        # Report the bytes actually received rather than the header value
+        logger.info(f"✓ Downloaded {bytes_written / (1024**2):.2f} MB to {output_path}")
+        
+    except requests.RequestException as e:
+        logger.error(f"Failed to download MTGJSON Parquet: {e}")
+        raise
+    except IOError as e:
+        logger.error(f"Failed to write Parquet file: {e}")
        raise

-class SetupOption(Enum):
-    """Enum for setup menu options."""
-    INITIAL_SETUP = 'Initial Setup'
-    REGENERATE_CSV = 'Regenerate CSV Files'
-    BACK = 'Back'

-def _display_setup_menu() -> SetupOption:
-    """Display the setup menu and return the selected option.
+def is_valid_commander(row: pd.Series) -> bool:
+    """Determine if a card can be a commander.
    
-    Returns:
-        SetupOption: The selected menu option
-    """
-    if inquirer is not None:
-        question: List[Dict[str, Any]] = [
-            inquirer.List(
-                'menu',
-                choices=[option.value for option in SetupOption],
-                carousel=True)]
-        answer = inquirer.prompt(question)
-        return SetupOption(answer['menu'])
-
-    # Simple fallback when inquirer isn't installed (e.g., headless/container)
-    options = list(SetupOption)
-    print("\nSetup Menu:")
-    for idx, opt in enumerate(options, start=1):
-        print(f"  {idx}) {opt.value}")
-    while True:
-        try:
-            sel = input("Select an option [1]: ").strip() or "1"
-            i = int(sel)
-            if 1 <= i <= len(options):
-                return options[i - 1]
-        except KeyboardInterrupt:
-            print("")
-            return SetupOption.BACK
-        except Exception:
-            pass
-        print("Invalid selection. Please try again.")
-
-def setup() -> bool:
-    """Run the setup process for the MTG Python Deckbuilder.
+    Criteria (checked in this order, first match wins):
+    - Type line contains both "Legendary" and "Creature" (case-sensitive)
+    - OR: Has "can be your commander" in text (case-insensitive)
+    - OR: Type line contains "Background" (case-sensitive)
    
-    This function provides a menu-driven interface to:
-    1. Perform initial setup by downloading and processing card data
-    2. Regenerate CSV files with updated card data
-    3. Perform all tagging processes on the color-sorted csv files
-    
-    The function handles errors gracefully and provides feedback through logging.
-    
-    Returns:
-        bool: True if setup completed successfully, False otherwise
-    """
-    try:
-        print('Which setup operation would you like to perform?\n'
-              'If this is your first time setting up, do the initial setup.\n'
-              'If you\'ve done the basic setup before, you can regenerate the CSV files\n')
+    Args:
+        row: DataFrame row with card data; only the 'type' and 'text'
+            entries are read, and both may be absent
        
-        choice = _display_setup_menu()
-        
-        if choice == SetupOption.INITIAL_SETUP:
-            logger.info('Starting initial setup')
-            initial_setup()
-            logger.info('Initial setup completed successfully')
-            return True
-            
-        elif choice == SetupOption.REGENERATE_CSV:
-            logger.info('Starting CSV regeneration')
-            regenerate_csvs_all()
-            logger.info('CSV regeneration completed successfully')
-            return True
-            
-        elif choice == SetupOption.BACK:
-            logger.info('Setup cancelled by user')
-            return False
-            
-    except Exception as e:
-        logger.error(f'Error during setup: {e}')
-        raise
+    Returns:
+        True if card can be a commander
+    """
+    # str() makes the substring checks safe for missing values: None/NaN
+    # become 'None'/'nan', which match none of the markers below
+    type_line = str(row.get('type', ''))
+    text = str(row.get('text', '')).lower()
+    
+    # Legendary Creature
+    if 'Legendary' in type_line and 'Creature' in type_line:
+        return True
+    
+    # Special text (e.g., "can be your commander")
+    if 'can be your commander' in text:
+        return True
+    
+    # Backgrounds can be commanders (with Choose a Background)
+    if 'Background' in type_line:
+        return True
    
    return False
+
+
+def is_background(row: pd.Series) -> bool:
+    """Report whether a card's type line marks it as a Background.
+
+    Args:
+        row: DataFrame row with card data; only the 'type' entry is read
+
+    Returns:
+        True when the stringified 'type' value contains "Background"
+        (case-sensitive); False otherwise, including when 'type' is absent
+    """
+    card_type = row.get('type', '')
+    return 'Background' in str(card_type)
+
+
+def extract_creature_types(row: pd.Series) -> str:
+    """Pull the subtype text out of a creature's type line.
+
+    Args:
+        row: DataFrame row with card data; only the 'type' entry is read
+
+    Returns:
+        The text between the first and second em dash of the type line,
+        stripped of surrounding whitespace (e.g. 'Human Wizard' for
+        'Legendary Creature — Human Wizard'). The subtypes are returned
+        exactly as written in the type line, not re-delimited. Empty string
+        when the card is not a creature or has no em dash in its type line.
+    """
+    card_type = str(row.get('type', ''))
+
+    # Non-creatures never carry creature types
+    if 'Creature' not in card_type:
+        return ''
+
+    # Everything before the first — is supertypes/types; the next segment
+    # holds the subtypes
+    segments = card_type.split('—')
+    return segments[1].strip() if len(segments) > 1 else ''
+
+
+def process_raw_parquet(raw_path: str, output_path: str) -> pd.DataFrame:
+    """Process raw MTGJSON Parquet into processed all_cards.parquet.
+    
+    This function, in order:
+    1. Loads the raw Parquet through DataLoader
+    2. Fills NA values per settings.FILL_NA_COLUMNS (faceName falls back to name)
+    3. Applies the exclude/require substring rules in FILTER_CONFIG
+    4. Drops rows whose printings match any NON_LEGAL_SETS entry
+    5. Drops rows whose name or faceName is in BANNED_CARDS (case-insensitive)
+    6. Drops rows whose type matches any CARD_TYPES_TO_EXCLUDE entry
+    7. Keeps only the CSV_PROCESSING_COLUMNS columns
+    8. Sorts per SORT_CONFIG and deduplicates by faceName (first row kept)
+    9. Adds custom columns: creatureTypes, themeTags, isCommander, isBackground
+    10. Reorders to settings.CARD_DATA_COLUMNS + the two flags, validates the
+        schema and writes the result to output_path
+    
+    Args:
+        raw_path: Path to raw cards.parquet from MTGJSON
+        output_path: Path to save processed all_cards.parquet
+        
+    Returns:
+        Processed DataFrame
+        
+    Raises:
+        ValueError: If schema validation fails
+    """
+    logger.info(f"Processing {raw_path}")
+    
+    # Load raw Parquet with DataLoader
+    loader = DataLoader()
+    df = loader.read_cards(raw_path)
+    
+    logger.info(f"Loaded {len(df)} cards with {len(df.columns)} columns")
+    
+    # Step 1: Fill NA values
+    logger.info("Filling NA values")
+    for col, fill_value in settings.FILL_NA_COLUMNS.items():
+        if col in df.columns:
+            if col == 'faceName':
+                # faceName ignores the configured fill value and falls back
+                # to the card's own name instead
+                df[col] = df[col].fillna(df['name'])
+            else:
+                df[col] = df[col].fillna(fill_value)
+    
+    # Step 2: Apply configuration-based filters (FILTER_CONFIG)
+    logger.info("Applying configuration filters")
+    for field, rules in FILTER_CONFIG.items():
+        if field not in df.columns:
+            logger.warning(f"Skipping filter for missing field: {field}")
+            continue
+        
+        for rule_type, values in rules.items():
+            if not values:
+                continue
+            
+            # Both rule types use literal (regex=False), case-insensitive
+            # substring matching on the stringified column; rule types other
+            # than 'exclude'/'require' are silently ignored
+            if rule_type == 'exclude':
+                for value in values:
+                    mask = df[field].astype(str).str.contains(value, case=False, na=False, regex=False)
+                    before = len(df)
+                    df = df[~mask]
+                    logger.debug(f"Excluded {field} containing '{value}': {before - len(df)} removed")
+            elif rule_type == 'require':
+                # Successive 'require' values narrow the frame cumulatively,
+                # so a row must contain every listed value to survive
+                for value in values:
+                    mask = df[field].astype(str).str.contains(value, case=False, na=False, regex=False)
+                    before = len(df)
+                    df = df[mask]
+                    logger.debug(f"Required {field} containing '{value}': {before - len(df)} removed")
+    
+    # Step 3: Remove illegal sets
+    # NOTE(review): unlike Step 2, str.contains runs in its default regex,
+    # case-sensitive mode here and without astype(str) — confirm that
+    # NON_LEGAL_SETS holds plain set codes and 'printings' is a string column
+    if 'printings' in df.columns:
+        logger.info("Removing illegal sets")
+        for set_code in NON_LEGAL_SETS:
+            before = len(df)
+            df = df[~df['printings'].str.contains(set_code, na=False)]
+            if len(df) < before:
+                logger.debug(f"Removed set {set_code}: {before - len(df)} cards")
+    
+    # Step 4: Remove banned cards
+    logger.info("Removing banned cards")
+    # casefold() on both sides makes the ban comparison case-insensitive;
+    # a row is dropped when either its name or its faceName is banned
+    banned_set = {b.casefold() for b in BANNED_CARDS}
+    name_lc = df['name'].astype(str).str.casefold()
+    face_lc = df['faceName'].astype(str).str.casefold() if 'faceName' in df.columns else name_lc
+    mask = ~(name_lc.isin(banned_set) | face_lc.isin(banned_set))
+    before = len(df)
+    df = df[mask]
+    logger.debug(f"Removed banned cards: {before - len(df)} filtered out")
+    
+    # Step 5: Remove special card types
+    # NOTE(review): default regex, case-sensitive matching again — confirm
+    # CARD_TYPES_TO_EXCLUDE entries contain no regex metacharacters
+    logger.info("Removing special card types")
+    for card_type in CARD_TYPES_TO_EXCLUDE:
+        before = len(df)
+        df = df[~df['type'].str.contains(card_type, na=False)]
+        if len(df) < before:
+            logger.debug(f"Removed type {card_type}: {before - len(df)} cards")
+    
+    # Step 6: Filter to essential columns only (reduce from ~82 to 14)
+    # Runs after the filters above because they read columns such as
+    # 'printings', which is only guarded by a presence check
+    logger.info(f"Filtering to {len(CSV_PROCESSING_COLUMNS)} essential columns")
+    df = df[CSV_PROCESSING_COLUMNS]
+    
+    # Step 7: Sort and deduplicate (CRITICAL: keeps only one printing per unique card)
+    logger.info("Sorting and deduplicating cards")
+    # The key callable is applied to every sort column; .str.lower() presumes
+    # string columns — TODO confirm SORT_CONFIG['columns'] lists only those
+    df = df.sort_values(
+        by=SORT_CONFIG['columns'],
+        key=lambda col: col.str.lower() if not SORT_CONFIG['case_sensitive'] else col
+    )
+    before = len(df)
+    # keep='first' means the sort order above decides which row survives
+    df = df.drop_duplicates(subset='faceName', keep='first')
+    logger.info(f"Deduplicated: {before} → {len(df)} cards ({before - len(df)} duplicate printings removed)")
+    
+    # Step 8: Add custom columns
+    logger.info("Adding custom columns: creatureTypes, themeTags, isCommander, isBackground")
+    
+    # creatureTypes: extracted from type line
+    df['creatureTypes'] = df.apply(extract_creature_types, axis=1)
+    
+    # themeTags: empty placeholder (filled during tagging)
+    df['themeTags'] = ''
+    
+    # isCommander: boolean flag
+    df['isCommander'] = df.apply(is_valid_commander, axis=1)
+    
+    # isBackground: boolean flag
+    df['isBackground'] = df.apply(is_background, axis=1)
+    
+    # Reorder columns to match CARD_DATA_COLUMNS
+    # CARD_DATA_COLUMNS has: name, faceName, edhrecRank, colorIdentity, colors,
+    #                        manaCost, manaValue, type, creatureTypes, text,
+    #                        power, toughness, keywords, themeTags, layout, side
+    # We need to add isCommander and isBackground at the end
+    final_columns = settings.CARD_DATA_COLUMNS + ['isCommander', 'isBackground']
+    
+    # Ensure all columns exist
+    for col in final_columns:
+        if col not in df.columns:
+            # Missing columns are padded with empty strings so the column
+            # selection below cannot fail
+            logger.warning(f"Column {col} missing, adding empty column")
+            df[col] = ''
+    
+    # Selecting final_columns also drops any column not listed there
+    df = df[final_columns]
+    
+    logger.info(f"Final dataset: {len(df)} cards, {len(df.columns)} columns")
+    logger.info(f"Commanders: {df['isCommander'].sum()}")
+    logger.info(f"Backgrounds: {df['isBackground'].sum()}")
+    
+    # Validate schema (check required columns present)
+    try:
+        validate_schema(df)
+        logger.info("✓ Schema validation passed")
+    except ValueError as e:
+        # Logged for context, then re-raised unchanged for the caller
+        logger.error(f"Schema validation failed: {e}")
+        raise
+    
+    # Write to processed directory
+    logger.info(f"Writing processed Parquet to {output_path}")
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    loader.write_cards(df, output_path)
+    
+    logger.info(f"✓ Created {output_path}")
+    
+    return df
+
+
+def initial_setup() -> None:
+    """Run the Parquet-based setup: fetch the raw data, then process it.
+
+    Replaces the legacy CSV approach.
+
+    Workflow:
+    1. Download cards.parquet from MTGJSON into the raw card-files directory,
+       unless a file is already present there
+    2. Process and filter it into the processed all_cards Parquet file
+    3. No color-specific files are produced (filter at query time instead)
+
+    Raises:
+        Various exceptions from download/processing steps
+    """
+    divider = "=" * 80
+    logger.info(divider)
+    logger.info("Starting Parquet-based initial setup")
+    logger.info(divider)
+
+    # Step 1: make sure the raw Parquet is on disk
+    source_file = os.path.join(card_files_raw_dir(), "cards.parquet")
+    if not os.path.exists(source_file):
+        download_parquet_from_mtgjson(source_file)
+    else:
+        logger.info(f"Raw Parquet already exists: {source_file}")
+        logger.info("Skipping download (delete file to re-download)")
+
+    # Step 2: raw -> processed
+    target_file = get_processed_cards_path()
+    logger.info(f"Processing raw Parquet → {target_file}")
+    process_raw_parquet(source_file, target_file)
+
+    # Closing summary, emitted line by line in the original order
+    summary = (
+        divider,
+        "✓ Parquet setup complete",
+        f"  Raw: {source_file}",
+        f"  Processed: {target_file}",
+        divider,
+    )
+    for line in summary:
+        logger.info(line)
+
+
+def regenerate_processed_parquet() -> None:
+    """Regenerate processed Parquet from existing raw file.
+    
+    Useful when:
+    - Column processing logic changes
+    - Adding new custom columns
+    - Testing without re-downloading
+    
+    Raises:
+        FileNotFoundError: If the raw cards.parquet has not been downloaded
+    """
+    logger.info("Regenerating processed Parquet from raw file")
+    
+    raw_path = os.path.join(card_files_raw_dir(), "cards.parquet")
+    
+    if not os.path.exists(raw_path):
+        logger.error(f"Raw Parquet not found: {raw_path}")
+        # Point at the function that actually exists in this module; the
+        # previous hint named a non-existent initial_setup_parquet()
+        logger.error("Run initial_setup() first to download")
+        raise FileNotFoundError(f"Raw Parquet not found: {raw_path}")
+    
+    processed_path = get_processed_cards_path()
+    process_raw_parquet(raw_path, processed_path)
+    
+    logger.info(f"✓ Regenerated {processed_path}")