# mtg_python_deckbuilder/code/scripts/benchmark_parquet.py
#
# 160 lines
# 6 KiB
# Python

"""Benchmark Parquet vs CSV performance."""
import pandas as pd
import time
import os
def benchmark_full_load(csv_path='csv_files/cards.csv',
                        parquet_path='csv_files/cards_parquet_test.parquet'):
    """Benchmark loading the full dataset from CSV and from Parquet.

    Loads each file completely, prints the load time, row count and deep
    memory usage for both, then prints the relative speedup and memory
    reduction of Parquet versus CSV.

    Args:
        csv_path: Path of the CSV file to load.
        parquet_path: Path of the Parquet file to load.

    Returns:
        A ``(df_csv, df_parquet)`` tuple holding the two loaded DataFrames.
    """
    print("=== FULL LOAD BENCHMARK ===\n")

    # CSV load
    print("Loading CSV...")
    # perf_counter() is the clock meant for measuring short intervals; it is
    # monotonic, so an elapsed time cannot go negative or jump with the wall
    # clock.
    start = time.perf_counter()
    df_csv = pd.read_csv(csv_path, low_memory=False)
    csv_time = time.perf_counter() - start
    csv_rows = len(df_csv)
    # bytes -> MiB; deep=True also counts the Python objects inside object columns
    csv_memory = df_csv.memory_usage(deep=True).sum() / 1024 / 1024
    print(f" Time: {csv_time:.3f}s")
    print(f" Rows: {csv_rows:,}")
    print(f" Memory: {csv_memory:.2f} MB")

    # Parquet load
    print("\nLoading Parquet...")
    start = time.perf_counter()
    df_parquet = pd.read_parquet(parquet_path)
    parquet_time = time.perf_counter() - start
    parquet_rows = len(df_parquet)
    parquet_memory = df_parquet.memory_usage(deep=True).sum() / 1024 / 1024
    print(f" Time: {parquet_time:.3f}s")
    print(f" Rows: {parquet_rows:,}")
    print(f" Memory: {parquet_memory:.2f} MB")

    # Comparison
    # Guard the ratio so a zero elapsed reading reports "inf" instead of
    # raising ZeroDivisionError after both loads already succeeded.
    speedup = csv_time / parquet_time if parquet_time > 0 else float('inf')
    memory_reduction = (1 - parquet_memory / csv_memory) * 100
    print("\n📊 Results:")
    print(f" Speedup: {speedup:.2f}x faster")
    print(f" Memory: {memory_reduction:.1f}% less")

    return df_csv, df_parquet
def benchmark_column_selection(parquet_path='csv_files/cards_parquet_test.parquet'):
    """Benchmark a full Parquet load against loading a subset of columns.

    Reads the file twice, once with every column and once with only the
    columns listed in ``essential_columns``, and prints time, column count
    and deep memory usage for each, followed by the relative speedup and
    memory reduction.

    Args:
        parquet_path: Path of the Parquet file to load.

    Returns:
        None. Results are printed to stdout.
    """
    print("\n\n=== COLUMN SELECTION BENCHMARK (Parquet only) ===\n")

    # Essential columns for deck building
    essential_columns = ['name', 'colorIdentity', 'type', 'types', 'manaValue',
                         'manaCost', 'power', 'toughness', 'text', 'rarity']

    # Full load
    print("Loading all columns...")
    # perf_counter() is the clock meant for measuring short intervals.
    start = time.perf_counter()
    df_full = pd.read_parquet(parquet_path)
    full_time = time.perf_counter() - start
    # bytes -> MiB
    full_memory = df_full.memory_usage(deep=True).sum() / 1024 / 1024
    print(f" Time: {full_time:.3f}s")
    print(f" Columns: {len(df_full.columns)}")
    print(f" Memory: {full_memory:.2f} MB")

    # Selective load
    print(f"\nLoading {len(essential_columns)} essential columns...")
    start = time.perf_counter()
    df_selective = pd.read_parquet(parquet_path, columns=essential_columns)
    selective_time = time.perf_counter() - start
    selective_memory = df_selective.memory_usage(deep=True).sum() / 1024 / 1024
    print(f" Time: {selective_time:.3f}s")
    print(f" Columns: {len(df_selective.columns)}")
    print(f" Memory: {selective_memory:.2f} MB")

    # Comparison
    # Guard the ratio so a zero elapsed reading reports "inf" instead of
    # raising ZeroDivisionError.
    speedup = full_time / selective_time if selective_time > 0 else float('inf')
    memory_reduction = (1 - selective_memory / full_memory) * 100
    print("\n📊 Results:")
    print(f" Speedup: {speedup:.2f}x faster")
    print(f" Memory: {memory_reduction:.1f}% less")
def benchmark_filtering(parquet_path='csv_files/cards_parquet_test.parquet'):
    """Benchmark filtering by colorIdentity (single file approach).

    Loads four columns from the Parquet file, then for each test case times
    an exact-match ``isin`` filter on the ``colorIdentity`` column and prints
    the filter time, the number of matching rows and their share of the
    total.

    Args:
        parquet_path: Path of the Parquet file to load.

    Returns:
        None. Results are printed to stdout.
    """
    print("\n\n=== COLOR IDENTITY FILTERING BENCHMARK ===\n")

    # Load data
    print("Loading Parquet with essential columns...")
    essential_columns = ['name', 'colorIdentity', 'type', 'manaValue']
    # perf_counter() is the clock meant for measuring short intervals, which
    # matters for the millisecond-scale filter timings below.
    start = time.perf_counter()
    df = pd.read_parquet(parquet_path, columns=essential_columns)
    load_time = time.perf_counter() - start
    total_cards = len(df)
    print(f" Load time: {load_time:.3f}s")
    print(f" Total cards: {total_cards:,}")

    # Test different color identities
    # NOTE(review): isin() matches these strings exactly. The Bant entries are
    # written alphabetically ("G,U") while the 5-Color entries are written in
    # WUBRG order ("W,U"); confirm which ordering and separator the file
    # actually stores, since only one of the two conventions can match.
    test_cases = [
        ("Colorless (C)", ["C", ""]),
        ("Mono-White (W)", ["W", "C", ""]),
        ("Bant (GUW)", ["C", "", "G", "U", "W", "G,U", "G,W", "U,W", "G,U,W"]),
        ("5-Color (WUBRG)", ["C", "", "W", "U", "B", "R", "G",
                             "W,U", "W,B", "W,R", "W,G", "U,B", "U,R", "U,G", "B,R", "B,G", "R,G",
                             "W,U,B", "W,U,R", "W,U,G", "W,B,R", "W,B,G", "W,R,G", "U,B,R", "U,B,G", "U,R,G", "B,R,G",
                             "W,U,B,R", "W,U,B,G", "W,U,R,G", "W,B,R,G", "U,B,R,G",
                             "W,U,B,R,G"]),
    ]

    for test_name, valid_identities in test_cases:
        print(f"\n{test_name}:")
        start = time.perf_counter()
        filtered = df[df['colorIdentity'].isin(valid_identities)]
        filter_time = (time.perf_counter() - start) * 1000  # Convert to ms
        found = len(filtered)
        # An empty file would otherwise raise ZeroDivisionError here.
        share = found / total_cards * 100 if total_cards else 0.0
        print(f" Filter time: {filter_time:.1f}ms")
        print(f" Cards found: {found:,}")
        print(f" % of total: {share:.1f}%")
def _first_non_null(series):
    """Return the first non-null value of *series*, or None if it has none."""
    present = series.dropna()
    return present.iloc[0] if len(present) else None


def benchmark_data_types(parquet_path='csv_files/cards_parquet_test.parquet'):
    """Check data types and list handling.

    Loads the whole Parquet file, reports which columns hold sequence values
    (judged from each column's first non-null value), and prints dtype, null
    percentage and sample value type for a fixed set of critical columns.

    Args:
        parquet_path: Path of the Parquet file to load.

    Returns:
        None. Results are printed to stdout.
    """
    # numpy is a hard dependency of pandas, so importing it adds no new
    # requirement.
    import numpy as np

    print("\n\n=== DATA TYPE ANALYSIS ===\n")
    df = pd.read_parquet(parquet_path)

    # One representative value per column, computed once and reused below.
    samples = {col: _first_non_null(df[col]) for col in df.columns}

    # Check list-type columns
    # pandas' pyarrow engine hands Parquet list values back as numpy arrays,
    # so ndarray has to be accepted alongside list/tuple or list columns are
    # never reported.
    sequence_types = (list, tuple, np.ndarray)
    list_cols = [col for col, sample in samples.items()
                 if isinstance(sample, sequence_types)]
    print(f"Columns stored as lists: {len(list_cols)}")
    for col in list_cols:
        print(f" {col}: {samples[col]}")

    # Check critical columns for deck building
    critical_cols = ['name', 'colorIdentity', 'type', 'types', 'subtypes',
                     'manaValue', 'manaCost', 'text', 'keywords']
    print("\n✓ Critical columns for deck building:")
    for col in critical_cols:
        if col not in df.columns:
            # Columns missing from the file are skipped without a message.
            continue
        dtype = str(df[col].dtype)
        null_pct = (df[col].isna().sum() / len(df)) * 100
        sample_type = type(samples[col]).__name__
        print(f" {col:20s} dtype={dtype:10s} null={null_pct:5.1f}% sample_type={sample_type}")
if __name__ == "__main__":
    # Run benchmarks. Each one prints its own report; the DataFrames returned
    # by the full-load benchmark are not needed afterwards, so they are not
    # kept alive while the remaining benchmarks run.
    benchmark_full_load()
    benchmark_column_selection()
    benchmark_filtering()
    benchmark_data_types()

    print("\n\n=== SUMMARY ===")
    print("✅ All benchmarks complete!")

    # Report the on-disk sizes of the files that were just benchmarked rather
    # than fixed numbers from an earlier run. These are the same paths the
    # benchmark functions load by default.
    csv_file = 'csv_files/cards.csv'
    parquet_file = 'csv_files/cards_parquet_test.parquet'
    csv_mb = os.path.getsize(csv_file) / 1024 / 1024
    parquet_mb = os.path.getsize(parquet_file) / 1024 / 1024
    shrink = (1 - parquet_mb / csv_mb) * 100 if csv_mb else 0.0
    print(f"📁 File size: {shrink:.1f}% smaller ({csv_mb:.2f} MB → {parquet_mb:.2f} MB)")