mtg_python_deckbuilder/code/scripts/benchmark_parquet.py

"""Benchmark Parquet vs CSV performance."""

import pandas as pd
import time
import os

def benchmark_full_load():
    """Benchmark loading full dataset."""
    csv_path = 'csv_files/cards.csv'
    parquet_path = 'csv_files/cards_parquet_test.parquet'

    print("=== FULL LOAD BENCHMARK ===\n")

    # CSV load
    print("Loading CSV...")
    start = time.time()
    df_csv = pd.read_csv(csv_path, low_memory=False)
    csv_time = time.time() - start
    csv_rows = len(df_csv)
    csv_memory = df_csv.memory_usage(deep=True).sum() / 1024 / 1024
    print(f"  Time: {csv_time:.3f}s")
    print(f"  Rows: {csv_rows:,}")
    print(f"  Memory: {csv_memory:.2f} MB")

    # Parquet load
    print("\nLoading Parquet...")
    start = time.time()
    df_parquet = pd.read_parquet(parquet_path)
    parquet_time = time.time() - start
    parquet_rows = len(df_parquet)
    parquet_memory = df_parquet.memory_usage(deep=True).sum() / 1024 / 1024
    print(f"  Time: {parquet_time:.3f}s")
    print(f"  Rows: {parquet_rows:,}")
    print(f"  Memory: {parquet_memory:.2f} MB")

    # Comparison
    speedup = csv_time / parquet_time
    memory_reduction = (1 - parquet_memory / csv_memory) * 100
    print(f"\n📊 Results:")
    print(f"  Speedup: {speedup:.2f}x faster")
    print(f"  Memory: {memory_reduction:.1f}% less")

    return df_csv, df_parquet

def benchmark_column_selection():
    """Benchmark loading with column selection (Parquet optimization)."""
    parquet_path = 'csv_files/cards_parquet_test.parquet'

    print("\n\n=== COLUMN SELECTION BENCHMARK (Parquet only) ===\n")

    # Essential columns for deck building
    essential_columns = ['name', 'colorIdentity', 'type', 'types', 'manaValue',
                         'manaCost', 'power', 'toughness', 'text', 'rarity']

    # Full load
    print("Loading all columns...")
    start = time.time()
    df_full = pd.read_parquet(parquet_path)
    full_time = time.time() - start
    full_memory = df_full.memory_usage(deep=True).sum() / 1024 / 1024
    print(f"  Time: {full_time:.3f}s")
    print(f"  Columns: {len(df_full.columns)}")
    print(f"  Memory: {full_memory:.2f} MB")

    # Selective load
    print(f"\nLoading {len(essential_columns)} essential columns...")
    start = time.time()
    df_selective = pd.read_parquet(parquet_path, columns=essential_columns)
    selective_time = time.time() - start
    selective_memory = df_selective.memory_usage(deep=True).sum() / 1024 / 1024
    print(f"  Time: {selective_time:.3f}s")
    print(f"  Columns: {len(df_selective.columns)}")
    print(f"  Memory: {selective_memory:.2f} MB")

    # Comparison
    speedup = full_time / selective_time
    memory_reduction = (1 - selective_memory / full_memory) * 100
    print(f"\n📊 Results:")
    print(f"  Speedup: {speedup:.2f}x faster")
    print(f"  Memory: {memory_reduction:.1f}% less")

def benchmark_filtering():
    """Benchmark filtering by colorIdentity (single file approach)."""
    parquet_path = 'csv_files/cards_parquet_test.parquet'

    print("\n\n=== COLOR IDENTITY FILTERING BENCHMARK ===\n")

    # Load data
    print("Loading Parquet with essential columns...")
    essential_columns = ['name', 'colorIdentity', 'type', 'manaValue']
    start = time.time()
    df = pd.read_parquet(parquet_path, columns=essential_columns)
    load_time = time.time() - start
    print(f"  Load time: {load_time:.3f}s")
    print(f"  Total cards: {len(df):,}")

    # Test different color identities
    test_cases = [
        ("Colorless (C)", ["C", ""]),
        ("Mono-White (W)", ["W", "C", ""]),
        ("Bant (GUW)", ["C", "", "G", "U", "W", "G,U", "G,W", "U,W", "G,U,W"]),
        ("5-Color (WUBRG)", ["C", "", "W", "U", "B", "R", "G",
                             "W,U", "W,B", "W,R", "W,G", "U,B", "U,R", "U,G", "B,R", "B,G", "R,G",
                             "W,U,B", "W,U,R", "W,U,G", "W,B,R", "W,B,G", "W,R,G", "U,B,R", "U,B,G", "U,R,G", "B,R,G",
                             "W,U,B,R", "W,U,B,G", "W,U,R,G", "W,B,R,G", "U,B,R,G",
                             "W,U,B,R,G"]),
    ]

    for test_name, valid_identities in test_cases:
        print(f"\n{test_name}:")
        start = time.time()
        filtered = df[df['colorIdentity'].isin(valid_identities)]
        filter_time = (time.time() - start) * 1000  # Convert to ms
        print(f"  Filter time: {filter_time:.1f}ms")
        print(f"  Cards found: {len(filtered):,}")
        print(f"  % of total: {len(filtered) / len(df) * 100:.1f}%")

def benchmark_data_types():
    """Check data types and list handling."""
    parquet_path = 'csv_files/cards_parquet_test.parquet'

    print("\n\n=== DATA TYPE ANALYSIS ===\n")

    df = pd.read_parquet(parquet_path)

    # Check list-type columns
    list_cols = []
    for col in df.columns:
        sample = df[col].dropna().iloc[0] if df[col].notna().any() else None
        if isinstance(sample, (list, tuple)):
            list_cols.append(col)

    print(f"Columns stored as lists: {len(list_cols)}")
    for col in list_cols:
        sample = df[col].dropna().iloc[0]
        print(f"  {col}: {sample}")

    # Check critical columns for deck building
    critical_cols = ['name', 'colorIdentity', 'type', 'types', 'subtypes',
                     'manaValue', 'manaCost', 'text', 'keywords']

    print(f"\n✓ Critical columns for deck building:")
    for col in critical_cols:
        if col in df.columns:
            dtype = str(df[col].dtype)
            null_pct = (df[col].isna().sum() / len(df)) * 100
            sample = df[col].dropna().iloc[0] if df[col].notna().any() else None
            sample_type = type(sample).__name__
            print(f"  {col:20s} dtype={dtype:10s} null={null_pct:5.1f}% sample_type={sample_type}")

if __name__ == "__main__":
    # Run benchmarks
    df_csv, df_parquet = benchmark_full_load()
    benchmark_column_selection()
    benchmark_filtering()
    benchmark_data_types()

    print("\n\n=== SUMMARY ===")
    print("✅ All benchmarks complete!")
    print("📁 File size: 77.2% smaller (88.94 MB → 20.27 MB)")