feat: migrate to unified Parquet format with instant GitHub setup and 4x faster tagging

This commit is contained in:
matt 2025-10-18 21:32:12 -07:00
parent e9e949aae3
commit 8435312c8f
58 changed files with 11921 additions and 3961 deletions

View file

@@ -0,0 +1,160 @@
"""Benchmark Parquet vs CSV performance."""
import pandas as pd
import time
import os
def benchmark_full_load(csv_path='csv_files/cards.csv',
                        parquet_path='csv_files/cards_parquet_test.parquet'):
    """Benchmark loading the full card dataset from CSV vs Parquet.

    Args:
        csv_path: Path to the CSV copy of the card database.
        parquet_path: Path to the Parquet copy of the same data.

    Returns:
        Tuple ``(df_csv, df_parquet)`` of the two loaded DataFrames, so
        callers can reuse them without re-reading from disk.
    """
    print("=== FULL LOAD BENCHMARK ===\n")
    # CSV load
    print("Loading CSV...")
    start = time.time()
    df_csv = pd.read_csv(csv_path, low_memory=False)
    csv_time = time.time() - start
    csv_rows = len(df_csv)
    # deep=True so object (string) columns are measured at their real size
    csv_memory = df_csv.memory_usage(deep=True).sum() / 1024 / 1024
    print(f" Time: {csv_time:.3f}s")
    print(f" Rows: {csv_rows:,}")
    print(f" Memory: {csv_memory:.2f} MB")
    # Parquet load
    print("\nLoading Parquet...")
    start = time.time()
    df_parquet = pd.read_parquet(parquet_path)
    parquet_time = time.time() - start
    parquet_rows = len(df_parquet)
    parquet_memory = df_parquet.memory_usage(deep=True).sum() / 1024 / 1024
    print(f" Time: {parquet_time:.3f}s")
    print(f" Rows: {parquet_rows:,}")
    print(f" Memory: {parquet_memory:.2f} MB")
    # Comparison — guard against a ~0s Parquet load raising ZeroDivisionError
    speedup = csv_time / parquet_time if parquet_time else float('inf')
    memory_reduction = (1 - parquet_memory / csv_memory) * 100
    print(f"\n📊 Results:")
    print(f" Speedup: {speedup:.2f}x faster")
    print(f" Memory: {memory_reduction:.1f}% less")
    return df_csv, df_parquet
def benchmark_column_selection(parquet_path='csv_files/cards_parquet_test.parquet',
                               essential_columns=None):
    """Benchmark a full Parquet load vs loading only selected columns.

    Column pruning is a Parquet-specific optimization: the reader can skip
    entire column chunks on disk, so fewer columns means less I/O and memory.

    Args:
        parquet_path: Path to the Parquet file to benchmark.
        essential_columns: Columns to load in the selective pass; defaults to
            the core deck-building fields. (None-default avoids sharing a
            mutable list across calls.)
    """
    print("\n\n=== COLUMN SELECTION BENCHMARK (Parquet only) ===\n")
    # Essential columns for deck building
    if essential_columns is None:
        essential_columns = ['name', 'colorIdentity', 'type', 'types', 'manaValue',
                             'manaCost', 'power', 'toughness', 'text', 'rarity']
    # Full load
    print("Loading all columns...")
    start = time.time()
    df_full = pd.read_parquet(parquet_path)
    full_time = time.time() - start
    full_memory = df_full.memory_usage(deep=True).sum() / 1024 / 1024
    print(f" Time: {full_time:.3f}s")
    print(f" Columns: {len(df_full.columns)}")
    print(f" Memory: {full_memory:.2f} MB")
    # Selective load
    print(f"\nLoading {len(essential_columns)} essential columns...")
    start = time.time()
    df_selective = pd.read_parquet(parquet_path, columns=essential_columns)
    selective_time = time.time() - start
    selective_memory = df_selective.memory_usage(deep=True).sum() / 1024 / 1024
    print(f" Time: {selective_time:.3f}s")
    print(f" Columns: {len(df_selective.columns)}")
    print(f" Memory: {selective_memory:.2f} MB")
    # Comparison — guard against a ~0s selective load raising ZeroDivisionError
    speedup = full_time / selective_time if selective_time else float('inf')
    memory_reduction = (1 - selective_memory / full_memory) * 100
    print(f"\n📊 Results:")
    print(f" Speedup: {speedup:.2f}x faster")
    print(f" Memory: {memory_reduction:.1f}% less")
def benchmark_filtering():
"""Benchmark filtering by colorIdentity (single file approach)."""
parquet_path = 'csv_files/cards_parquet_test.parquet'
print("\n\n=== COLOR IDENTITY FILTERING BENCHMARK ===\n")
# Load data
print("Loading Parquet with essential columns...")
essential_columns = ['name', 'colorIdentity', 'type', 'manaValue']
start = time.time()
df = pd.read_parquet(parquet_path, columns=essential_columns)
load_time = time.time() - start
print(f" Load time: {load_time:.3f}s")
print(f" Total cards: {len(df):,}")
# Test different color identities
test_cases = [
("Colorless (C)", ["C", ""]),
("Mono-White (W)", ["W", "C", ""]),
("Bant (GUW)", ["C", "", "G", "U", "W", "G,U", "G,W", "U,W", "G,U,W"]),
("5-Color (WUBRG)", ["C", "", "W", "U", "B", "R", "G",
"W,U", "W,B", "W,R", "W,G", "U,B", "U,R", "U,G", "B,R", "B,G", "R,G",
"W,U,B", "W,U,R", "W,U,G", "W,B,R", "W,B,G", "W,R,G", "U,B,R", "U,B,G", "U,R,G", "B,R,G",
"W,U,B,R", "W,U,B,G", "W,U,R,G", "W,B,R,G", "U,B,R,G",
"W,U,B,R,G"]),
]
for test_name, valid_identities in test_cases:
print(f"\n{test_name}:")
start = time.time()
filtered = df[df['colorIdentity'].isin(valid_identities)]
filter_time = (time.time() - start) * 1000 # Convert to ms
print(f" Filter time: {filter_time:.1f}ms")
print(f" Cards found: {len(filtered):,}")
print(f" % of total: {len(filtered) / len(df) * 100:.1f}%")
def benchmark_data_types(parquet_path='csv_files/cards_parquet_test.parquet', df=None):
    """Report column dtypes and identify list-valued columns.

    Args:
        parquet_path: Parquet file to load when *df* is not supplied.
        df: Optional pre-loaded DataFrame (useful for tests/reuse).

    Returns:
        List of column names whose cells hold sequence values.
    """
    import numpy as np  # local: only needed for the ndarray check below

    print("\n\n=== DATA TYPE ANALYSIS ===\n")
    if df is None:
        df = pd.read_parquet(parquet_path)
    # Cache the first non-null sample per column so each column is scanned once
    # (the original recomputed dropna().iloc[0] in three places).
    samples = {}
    list_cols = []
    for col in df.columns:
        non_null = df[col].dropna()
        sample = non_null.iloc[0] if len(non_null) else None
        samples[col] = sample
        # pyarrow loads Parquet list columns as numpy arrays, not Python
        # lists, so include np.ndarray in the check.
        if isinstance(sample, (list, tuple, np.ndarray)):
            list_cols.append(col)
    print(f"Columns stored as lists: {len(list_cols)}")
    for col in list_cols:
        print(f" {col}: {samples[col]}")
    # Check critical columns for deck building
    critical_cols = ['name', 'colorIdentity', 'type', 'types', 'subtypes',
                     'manaValue', 'manaCost', 'text', 'keywords']
    print(f"\n✓ Critical columns for deck building:")
    for col in critical_cols:
        if col in df.columns:
            dtype = str(df[col].dtype)
            # Guard against division by zero on an empty frame
            null_pct = (df[col].isna().sum() / len(df)) * 100 if len(df) else 0.0
            sample_type = type(samples[col]).__name__
            print(f" {col:20s} dtype={dtype:10s} null={null_pct:5.1f}% sample_type={sample_type}")
    return list_cols
if __name__ == "__main__":
# Run benchmarks
df_csv, df_parquet = benchmark_full_load()
benchmark_column_selection()
benchmark_filtering()
benchmark_data_types()
print("\n\n=== SUMMARY ===")
print("✅ All benchmarks complete!")
print("📁 File size: 77.2% smaller (88.94 MB → 20.27 MB)")

View file

@@ -0,0 +1,104 @@
"""Inspect MTGJSON Parquet file schema and compare to CSV."""
import pandas as pd
import os
import sys
def inspect_parquet(parquet_path='csv_files/cards_parquet_test.parquet'):
    """Load a Parquet file and print its schema, null rates, and a sample row.

    Args:
        parquet_path: Path of the Parquet file to inspect.

    Returns:
        The loaded DataFrame, or ``None`` if the file does not exist.
    """
    if not os.path.exists(parquet_path):
        print(f"Error: {parquet_path} not found")
        return None
    print("Loading Parquet file...")
    df = pd.read_parquet(parquet_path)
    print("\n=== PARQUET FILE INFO ===")
    print(f"Rows: {len(df):,}")
    print(f"Columns: {len(df.columns)}")
    print(f"File size: {os.path.getsize(parquet_path) / 1024 / 1024:.2f} MB")
    print("\n=== PARQUET COLUMNS AND TYPES ===")
    for col in sorted(df.columns):
        dtype = str(df[col].dtype)
        non_null = df[col].notna().sum()
        null_pct = (1 - non_null / len(df)) * 100
        print(f" {col:30s} {dtype:15s} ({null_pct:5.1f}% null)")
    # Only sample a row when there is one — df.iloc[0] raises on an empty frame
    if len(df):
        print("\n=== SAMPLE DATA (first card) ===")
        first_card = df.iloc[0].to_dict()
        for key, value in sorted(first_card.items()):
            if isinstance(value, (list, dict)):
                print(f" {key}: {type(value).__name__} with {len(value)} items")
            else:
                value_str = str(value)[:80]  # truncate long card text
                print(f" {key}: {value_str}")
    return df
def compare_to_csv(csv_path='csv_files/cards.csv',
                   parquet_path='csv_files/cards_parquet_test.parquet'):
    """Compare CSV vs Parquet schemas (columns, dtypes) and file sizes.

    Args:
        csv_path: Path to the CSV copy of the card database.
        parquet_path: Path to the Parquet copy of the same data.

    Returns:
        ``None``. Skips with a note when either file is missing.
    """
    if not os.path.exists(csv_path):
        print(f"\nNote: {csv_path} not found, skipping comparison")
        return
    # Original crashed here with FileNotFoundError when only the Parquet
    # file was absent; skip gracefully instead.
    if not os.path.exists(parquet_path):
        print(f"\nNote: {parquet_path} not found, skipping comparison")
        return
    print("\n\n=== CSV FILE INFO ===")
    print("Loading CSV file...")
    # nrows=1: only the header/dtypes are needed, not the full data
    df_csv = pd.read_csv(csv_path, low_memory=False, nrows=1)
    csv_size = os.path.getsize(csv_path) / 1024 / 1024
    print(f"File size: {csv_size:.2f} MB")
    print(f"Columns: {len(df_csv.columns)}")
    print("\n=== CSV COLUMNS ===")
    csv_cols = set(df_csv.columns)
    for col in sorted(df_csv.columns):
        print(f" {col}")
    # Load parquet columns
    df_parquet = pd.read_parquet(parquet_path)
    parquet_cols = set(df_parquet.columns)
    print("\n\n=== SCHEMA COMPARISON ===")
    # Columns in both — flag dtype mismatches explicitly
    common = csv_cols & parquet_cols
    print(f"\n✓ Columns in both (n={len(common)}):")
    for col in sorted(common):
        csv_type = str(df_csv[col].dtype)
        parquet_type = str(df_parquet[col].dtype)
        if csv_type != parquet_type:
            print(f" {col:30s} CSV: {csv_type:15s} Parquet: {parquet_type}")
        else:
            print(f" {col:30s} {csv_type}")
    # CSV only
    csv_only = csv_cols - parquet_cols
    if csv_only:
        print(f"\n⚠ Columns only in CSV (n={len(csv_only)}):")
        for col in sorted(csv_only):
            print(f" {col}")
    # Parquet only
    parquet_only = parquet_cols - csv_cols
    if parquet_only:
        print(f"\n✓ Columns only in Parquet (n={len(parquet_only)}):")
        for col in sorted(parquet_only):
            print(f" {col}")
    # File size comparison — guard against a zero-byte CSV
    parquet_size = os.path.getsize(parquet_path) / 1024 / 1024
    size_reduction = (1 - parquet_size / csv_size) * 100 if csv_size else 0.0
    print(f"\n=== FILE SIZE COMPARISON ===")
    print(f"CSV: {csv_size:.2f} MB")
    print(f"Parquet: {parquet_size:.2f} MB")
    print(f"Savings: {size_reduction:.1f}%")
if __name__ == "__main__":
df = inspect_parquet()
compare_to_csv()