mtg_python_deckbuilder/code/file_setup/setup.py

"""Parquet-based setup for MTG Python Deckbuilder.
This module handles downloading and processing MTGJSON Parquet data for the
MTG Python Deckbuilder. It replaces the old CSV-based multi-file approach
with a single-file Parquet workflow.
Key Changes from CSV approach:
- Single all_cards.parquet file instead of 18+ color-specific CSVs
- Downloads from MTGJSON Parquet API (faster, smaller)
- Adds isCommander and isBackground boolean flags
- Filters to essential columns only (14 base + 4 custom = 18 total)
- Uses DataLoader abstraction for format flexibility
Introduced in v3.0.0 as part of CSV→Parquet migration.
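
Typical usage (a minimal sketch; the import path follows the
code.file_setup package layout used elsewhere in this file):

    from code.file_setup.setup import initial_setup

    initial_setup()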
"""
from __future__ import annotations

import os

import pandas as pd
import requests
from tqdm import tqdm

from .data_loader import DataLoader, validate_schema
from .setup_constants import (
    CSV_PROCESSING_COLUMNS,
    CARD_TYPES_TO_EXCLUDE,
    NON_LEGAL_SETS,
    BANNED_CARDS,
    FILTER_CONFIG,
    SORT_CONFIG,
)

import logging_util
from path_util import card_files_raw_dir, get_processed_cards_path
import settings

logger = logging_util.get_logger(__name__)

# MTGJSON Parquet API URL
MTGJSON_PARQUET_URL = "https://mtgjson.com/api/v5/parquet/cards.parquet"


def download_parquet_from_mtgjson(output_path: str) -> None:
    """Download MTGJSON cards.parquet file.

    Args:
        output_path: Where to save the downloaded Parquet file

    Raises:
        requests.RequestException: If download fails
        IOError: If file cannot be written
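
    Example (illustrative; the destination path mirrors the layout
    used by initial_setup):
        >>> download_parquet_from_mtgjson("card_files/raw/cards.parquet")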
"""
logger.info(f"Downloading MTGJSON Parquet from {MTGJSON_PARQUET_URL}")
try:
response = requests.get(MTGJSON_PARQUET_URL, stream=True, timeout=60)
response.raise_for_status()
# Get file size for progress bar
total_size = int(response.headers.get('content-length', 0))
# Ensure output directory exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Download with progress bar
with open(output_path, 'wb') as f, tqdm(
total=total_size,
unit='B',
unit_scale=True,
desc='Downloading cards.parquet'
) as pbar:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
pbar.update(len(chunk))
logger.info(f"✓ Downloaded {total_size / (1024**2):.2f} MB to {output_path}")
except requests.RequestException as e:
logger.error(f"Failed to download MTGJSON Parquet: {e}")
raise
except IOError as e:
logger.error(f"Failed to write Parquet file: {e}")
raise


def is_valid_commander(row: pd.Series) -> bool:
    """Determine if a card can be a commander.

    Criteria:
    - Legendary Creature
    - OR: Has "can be your commander" in text
    - OR: Background (Partner with Background)

    Args:
        row: DataFrame row with card data

    Returns:
        True if card can be a commander
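
    Example (illustrative):
        >>> is_valid_commander(pd.Series({'type': 'Legendary Creature — Human Wizard', 'text': ''}))
        True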
"""
type_line = str(row.get('type', ''))
text = str(row.get('text', '')).lower()
# Legendary Creature
if 'Legendary' in type_line and 'Creature' in type_line:
return True
# Special text (e.g., "can be your commander")
if 'can be your commander' in text:
return True
# Backgrounds can be commanders (with Choose a Background)
if 'Background' in type_line:
return True
return False


def is_background(row: pd.Series) -> bool:
    """Determine if a card is a Background.

    Args:
        row: DataFrame row with card data

    Returns:
        True if card has Background type
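
    Example (illustrative):
        >>> is_background(pd.Series({'type': 'Enchantment — Background'}))
        True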
"""
type_line = str(row.get('type', ''))
return 'Background' in type_line


def extract_creature_types(row: pd.Series) -> str:
    """Extract creature types from type line.

    Args:
        row: DataFrame row with card data

    Returns:
        Space-separated creature subtypes as printed on the type line,
        or empty string for non-creatures
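
    Example (illustrative):
        >>> extract_creature_types(pd.Series({'type': 'Creature — Elf Druid'}))
        'Elf Druid'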
"""
type_line = str(row.get('type', ''))
# Check if it's a creature
if 'Creature' not in type_line:
return ''
# Split on — to get subtypes
if '' in type_line:
parts = type_line.split('')
if len(parts) >= 2:
# Get everything after the dash, strip whitespace
subtypes = parts[1].strip()
return subtypes
return ''


def process_raw_parquet(raw_path: str, output_path: str) -> pd.DataFrame:
    """Process raw MTGJSON Parquet into processed all_cards.parquet.

    This function:
    1. Loads raw Parquet (all ~82 columns)
    2. Filters to essential columns (CSV_PROCESSING_COLUMNS)
    3. Applies standard filtering (banned cards, illegal sets, special types)
    4. Deduplicates by faceName (keep first printing only)
    5. Adds custom columns: creatureTypes, themeTags, isCommander, isBackground
    6. Validates schema
    7. Writes to processed directory

    Args:
        raw_path: Path to raw cards.parquet from MTGJSON
        output_path: Path to save processed all_cards.parquet

    Returns:
        Processed DataFrame

    Raises:
        ValueError: If schema validation fails
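
    Example (illustrative; paths mirror the layout used by initial_setup):
        >>> df = process_raw_parquet(
        ...     "card_files/raw/cards.parquet",
        ...     "card_files/processed/all_cards.parquet",
        ... )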
"""
logger.info(f"Processing {raw_path}")
# Load raw Parquet with DataLoader
loader = DataLoader()
df = loader.read_cards(raw_path)
logger.info(f"Loaded {len(df)} cards with {len(df.columns)} columns")
# Step 1: Fill NA values
logger.info("Filling NA values")
for col, fill_value in settings.FILL_NA_COLUMNS.items():
if col in df.columns:
if col == 'faceName':
df[col] = df[col].fillna(df['name'])
else:
df[col] = df[col].fillna(fill_value)
# Step 2: Apply configuration-based filters (FILTER_CONFIG)
logger.info("Applying configuration filters")
    for field, rules in FILTER_CONFIG.items():
        if field not in df.columns:
            logger.warning(f"Skipping filter for missing field: {field}")
            continue
        for rule_type, values in rules.items():
            if not values:
                continue
            if rule_type == 'exclude':
                for value in values:
                    mask = df[field].astype(str).str.contains(value, case=False, na=False, regex=False)
                    before = len(df)
                    df = df[~mask]
                    logger.debug(f"Excluded {field} containing '{value}': {before - len(df)} removed")
            elif rule_type == 'require':
                for value in values:
                    mask = df[field].astype(str).str.contains(value, case=False, na=False, regex=False)
                    before = len(df)
                    df = df[mask]
                    logger.debug(f"Required {field} containing '{value}': {before - len(df)} removed")

    # Step 3: Remove illegal sets
    if 'printings' in df.columns:
        logger.info("Removing illegal sets")
        for set_code in NON_LEGAL_SETS:
            before = len(df)
            df = df[~df['printings'].str.contains(set_code, na=False)]
            if len(df) < before:
                logger.debug(f"Removed set {set_code}: {before - len(df)} cards")

    # Step 4: Remove banned cards
    logger.info("Removing banned cards")
    banned_set = {b.casefold() for b in BANNED_CARDS}
    name_lc = df['name'].astype(str).str.casefold()
    face_lc = df['faceName'].astype(str).str.casefold() if 'faceName' in df.columns else name_lc
    mask = ~(name_lc.isin(banned_set) | face_lc.isin(banned_set))
    before = len(df)
    df = df[mask]
    logger.debug(f"Removed banned cards: {before - len(df)} filtered out")

    # Step 5: Remove special card types
    logger.info("Removing special card types")
    for card_type in CARD_TYPES_TO_EXCLUDE:
        before = len(df)
        df = df[~df['type'].str.contains(card_type, na=False)]
        if len(df) < before:
            logger.debug(f"Removed type {card_type}: {before - len(df)} cards")

    # Step 6: Filter to essential columns only (reduce from ~82 to 14)
    logger.info(f"Filtering to {len(CSV_PROCESSING_COLUMNS)} essential columns")
    df = df[CSV_PROCESSING_COLUMNS]

    # Step 7: Sort and deduplicate (CRITICAL: keeps only one printing per unique card)
    logger.info("Sorting and deduplicating cards")
    df = df.sort_values(
        by=SORT_CONFIG['columns'],
        key=lambda col: col.str.lower() if not SORT_CONFIG['case_sensitive'] else col
    )
    before = len(df)
    df = df.drop_duplicates(subset='faceName', keep='first')
    logger.info(f"Deduplicated: {before} → {len(df)} cards ({before - len(df)} duplicate printings removed)")

    # Step 8: Add custom columns
    logger.info("Adding custom columns: creatureTypes, themeTags, isCommander, isBackground")

    # creatureTypes: extracted from type line
    df['creatureTypes'] = df.apply(extract_creature_types, axis=1)

    # themeTags: empty placeholder (filled during tagging)
    df['themeTags'] = ''

    # isCommander: boolean flag
    df['isCommander'] = df.apply(is_valid_commander, axis=1)

    # isBackground: boolean flag
    df['isBackground'] = df.apply(is_background, axis=1)

    # Reorder columns to match CARD_DATA_COLUMNS
    # CARD_DATA_COLUMNS has: name, faceName, edhrecRank, colorIdentity, colors,
    #                        manaCost, manaValue, type, creatureTypes, text,
    #                        power, toughness, keywords, themeTags, layout, side
    # We need to add isCommander and isBackground at the end
    final_columns = settings.CARD_DATA_COLUMNS + ['isCommander', 'isBackground']

    # Ensure all columns exist
    for col in final_columns:
        if col not in df.columns:
            logger.warning(f"Column {col} missing, adding empty column")
            df[col] = ''
    df = df[final_columns]

    logger.info(f"Final dataset: {len(df)} cards, {len(df.columns)} columns")
    logger.info(f"Commanders: {df['isCommander'].sum()}")
    logger.info(f"Backgrounds: {df['isBackground'].sum()}")

    # Validate schema (check required columns present)
    try:
        validate_schema(df)
        logger.info("✓ Schema validation passed")
    except ValueError as e:
        logger.error(f"Schema validation failed: {e}")
        raise

    # Write to processed directory
    logger.info(f"Writing processed Parquet to {output_path}")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    loader.write_cards(df, output_path)
    logger.info(f"✓ Created {output_path}")

    return df


def initial_setup() -> None:
    """Download and process MTGJSON Parquet data.

    Modern Parquet-based setup workflow (replaces legacy CSV approach).

    Workflow:
    1. Download cards.parquet from MTGJSON → card_files/raw/cards.parquet
    2. Process and filter → card_files/processed/all_cards.parquet
    3. No color-specific files (filter at query time instead)

    Raises:
        Various exceptions from download/processing steps
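
    Example (illustrative; run once before building decks):
        >>> initial_setup()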
"""
logger.info("=" * 80)
logger.info("Starting Parquet-based initial setup")
logger.info("=" * 80)
# Step 1: Download raw Parquet
raw_dir = card_files_raw_dir()
raw_path = os.path.join(raw_dir, "cards.parquet")
if os.path.exists(raw_path):
logger.info(f"Raw Parquet already exists: {raw_path}")
logger.info("Skipping download (delete file to re-download)")
else:
download_parquet_from_mtgjson(raw_path)
# Step 2: Process raw → processed
processed_path = get_processed_cards_path()
logger.info(f"Processing raw Parquet → {processed_path}")
process_raw_parquet(raw_path, processed_path)
logger.info("=" * 80)
logger.info("✓ Parquet setup complete")
logger.info(f" Raw: {raw_path}")
logger.info(f" Processed: {processed_path}")
logger.info("=" * 80)
# Step 3: Optional image caching (if enabled)
try:
from code.file_setup.image_cache import ImageCache
cache = ImageCache()
if cache.is_enabled():
logger.info("=" * 80)
logger.info("Card image caching enabled - starting download")
logger.info("=" * 80)
# Download bulk data
logger.info("Downloading Scryfall bulk data...")
cache.download_bulk_data()
# Download images
logger.info("Downloading card images (this may take 1-2 hours)...")
def progress(current, total, card_name):
if current % 100 == 0: # Log every 100 cards
pct = (current / total) * 100
logger.info(f" Progress: {current}/{total} ({pct:.1f}%) - {card_name}")
stats = cache.download_images(progress_callback=progress)
logger.info("=" * 80)
logger.info("✓ Image cache complete")
logger.info(f" Downloaded: {stats['downloaded']}")
logger.info(f" Skipped: {stats['skipped']}")
logger.info(f" Failed: {stats['failed']}")
logger.info("=" * 80)
else:
logger.info("Card image caching disabled (CACHE_CARD_IMAGES=0)")
logger.info("Images will be fetched from Scryfall API on demand")
except Exception as e:
logger.error(f"Failed to cache images (continuing anyway): {e}")
logger.error("Images will be fetched from Scryfall API on demand")


def regenerate_processed_parquet() -> None:
    """Regenerate processed Parquet from existing raw file.

    Useful when:
    - Column processing logic changes
    - Adding new custom columns
    - Testing without re-downloading
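
    Example (illustrative; requires card_files/raw/cards.parquet to exist):
        >>> regenerate_processed_parquet()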
"""
logger.info("Regenerating processed Parquet from raw file")
raw_path = os.path.join(card_files_raw_dir(), "cards.parquet")
if not os.path.exists(raw_path):
logger.error(f"Raw Parquet not found: {raw_path}")
logger.error("Run initial_setup_parquet() first to download")
raise FileNotFoundError(f"Raw Parquet not found: {raw_path}")
processed_path = get_processed_cards_path()
process_raw_parquet(raw_path, processed_path)
logger.info(f"✓ Regenerated {processed_path}")