mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2025-12-16 15:40:12 +01:00
160 lines
6 KiB
Python
160 lines
6 KiB
Python
"""Benchmark Parquet vs CSV performance."""
|
|
|
|
import pandas as pd
|
|
import time
|
|
import os
|
|
|
|
def benchmark_full_load():
|
|
"""Benchmark loading full dataset."""
|
|
csv_path = 'csv_files/cards.csv'
|
|
parquet_path = 'csv_files/cards_parquet_test.parquet'
|
|
|
|
print("=== FULL LOAD BENCHMARK ===\n")
|
|
|
|
# CSV load
|
|
print("Loading CSV...")
|
|
start = time.time()
|
|
df_csv = pd.read_csv(csv_path, low_memory=False)
|
|
csv_time = time.time() - start
|
|
csv_rows = len(df_csv)
|
|
csv_memory = df_csv.memory_usage(deep=True).sum() / 1024 / 1024
|
|
print(f" Time: {csv_time:.3f}s")
|
|
print(f" Rows: {csv_rows:,}")
|
|
print(f" Memory: {csv_memory:.2f} MB")
|
|
|
|
# Parquet load
|
|
print("\nLoading Parquet...")
|
|
start = time.time()
|
|
df_parquet = pd.read_parquet(parquet_path)
|
|
parquet_time = time.time() - start
|
|
parquet_rows = len(df_parquet)
|
|
parquet_memory = df_parquet.memory_usage(deep=True).sum() / 1024 / 1024
|
|
print(f" Time: {parquet_time:.3f}s")
|
|
print(f" Rows: {parquet_rows:,}")
|
|
print(f" Memory: {parquet_memory:.2f} MB")
|
|
|
|
# Comparison
|
|
speedup = csv_time / parquet_time
|
|
memory_reduction = (1 - parquet_memory / csv_memory) * 100
|
|
print(f"\n📊 Results:")
|
|
print(f" Speedup: {speedup:.2f}x faster")
|
|
print(f" Memory: {memory_reduction:.1f}% less")
|
|
|
|
return df_csv, df_parquet
|
|
|
|
def benchmark_column_selection():
|
|
"""Benchmark loading with column selection (Parquet optimization)."""
|
|
parquet_path = 'csv_files/cards_parquet_test.parquet'
|
|
|
|
print("\n\n=== COLUMN SELECTION BENCHMARK (Parquet only) ===\n")
|
|
|
|
# Essential columns for deck building
|
|
essential_columns = ['name', 'colorIdentity', 'type', 'types', 'manaValue',
|
|
'manaCost', 'power', 'toughness', 'text', 'rarity']
|
|
|
|
# Full load
|
|
print("Loading all columns...")
|
|
start = time.time()
|
|
df_full = pd.read_parquet(parquet_path)
|
|
full_time = time.time() - start
|
|
full_memory = df_full.memory_usage(deep=True).sum() / 1024 / 1024
|
|
print(f" Time: {full_time:.3f}s")
|
|
print(f" Columns: {len(df_full.columns)}")
|
|
print(f" Memory: {full_memory:.2f} MB")
|
|
|
|
# Selective load
|
|
print(f"\nLoading {len(essential_columns)} essential columns...")
|
|
start = time.time()
|
|
df_selective = pd.read_parquet(parquet_path, columns=essential_columns)
|
|
selective_time = time.time() - start
|
|
selective_memory = df_selective.memory_usage(deep=True).sum() / 1024 / 1024
|
|
print(f" Time: {selective_time:.3f}s")
|
|
print(f" Columns: {len(df_selective.columns)}")
|
|
print(f" Memory: {selective_memory:.2f} MB")
|
|
|
|
# Comparison
|
|
speedup = full_time / selective_time
|
|
memory_reduction = (1 - selective_memory / full_memory) * 100
|
|
print(f"\n📊 Results:")
|
|
print(f" Speedup: {speedup:.2f}x faster")
|
|
print(f" Memory: {memory_reduction:.1f}% less")
|
|
|
|
def benchmark_filtering():
|
|
"""Benchmark filtering by colorIdentity (single file approach)."""
|
|
parquet_path = 'csv_files/cards_parquet_test.parquet'
|
|
|
|
print("\n\n=== COLOR IDENTITY FILTERING BENCHMARK ===\n")
|
|
|
|
# Load data
|
|
print("Loading Parquet with essential columns...")
|
|
essential_columns = ['name', 'colorIdentity', 'type', 'manaValue']
|
|
start = time.time()
|
|
df = pd.read_parquet(parquet_path, columns=essential_columns)
|
|
load_time = time.time() - start
|
|
print(f" Load time: {load_time:.3f}s")
|
|
print(f" Total cards: {len(df):,}")
|
|
|
|
# Test different color identities
|
|
test_cases = [
|
|
("Colorless (C)", ["C", ""]),
|
|
("Mono-White (W)", ["W", "C", ""]),
|
|
("Bant (GUW)", ["C", "", "G", "U", "W", "G,U", "G,W", "U,W", "G,U,W"]),
|
|
("5-Color (WUBRG)", ["C", "", "W", "U", "B", "R", "G",
|
|
"W,U", "W,B", "W,R", "W,G", "U,B", "U,R", "U,G", "B,R", "B,G", "R,G",
|
|
"W,U,B", "W,U,R", "W,U,G", "W,B,R", "W,B,G", "W,R,G", "U,B,R", "U,B,G", "U,R,G", "B,R,G",
|
|
"W,U,B,R", "W,U,B,G", "W,U,R,G", "W,B,R,G", "U,B,R,G",
|
|
"W,U,B,R,G"]),
|
|
]
|
|
|
|
for test_name, valid_identities in test_cases:
|
|
print(f"\n{test_name}:")
|
|
start = time.time()
|
|
filtered = df[df['colorIdentity'].isin(valid_identities)]
|
|
filter_time = (time.time() - start) * 1000 # Convert to ms
|
|
print(f" Filter time: {filter_time:.1f}ms")
|
|
print(f" Cards found: {len(filtered):,}")
|
|
print(f" % of total: {len(filtered) / len(df) * 100:.1f}%")
|
|
|
|
def benchmark_data_types():
|
|
"""Check data types and list handling."""
|
|
parquet_path = 'csv_files/cards_parquet_test.parquet'
|
|
|
|
print("\n\n=== DATA TYPE ANALYSIS ===\n")
|
|
|
|
df = pd.read_parquet(parquet_path)
|
|
|
|
# Check list-type columns
|
|
list_cols = []
|
|
for col in df.columns:
|
|
sample = df[col].dropna().iloc[0] if df[col].notna().any() else None
|
|
if isinstance(sample, (list, tuple)):
|
|
list_cols.append(col)
|
|
|
|
print(f"Columns stored as lists: {len(list_cols)}")
|
|
for col in list_cols:
|
|
sample = df[col].dropna().iloc[0]
|
|
print(f" {col}: {sample}")
|
|
|
|
# Check critical columns for deck building
|
|
critical_cols = ['name', 'colorIdentity', 'type', 'types', 'subtypes',
|
|
'manaValue', 'manaCost', 'text', 'keywords']
|
|
|
|
print(f"\n✓ Critical columns for deck building:")
|
|
for col in critical_cols:
|
|
if col in df.columns:
|
|
dtype = str(df[col].dtype)
|
|
null_pct = (df[col].isna().sum() / len(df)) * 100
|
|
sample = df[col].dropna().iloc[0] if df[col].notna().any() else None
|
|
sample_type = type(sample).__name__
|
|
print(f" {col:20s} dtype={dtype:10s} null={null_pct:5.1f}% sample_type={sample_type}")
|
|
|
|
if __name__ == "__main__":
|
|
# Run benchmarks
|
|
df_csv, df_parquet = benchmark_full_load()
|
|
benchmark_column_selection()
|
|
benchmark_filtering()
|
|
benchmark_data_types()
|
|
|
|
print("\n\n=== SUMMARY ===")
|
|
print("✅ All benchmarks complete!")
|
|
print("📁 File size: 77.2% smaller (88.94 MB → 20.27 MB)")
|