feat: migrate to unified Parquet format with instant GitHub setup and 4x faster tagging

This commit is contained in:
matt 2025-10-18 21:32:12 -07:00
parent e9e949aae3
commit 8435312c8f
58 changed files with 11921 additions and 3961 deletions

View file

@@ -0,0 +1,160 @@
"""Benchmark Parquet vs CSV performance."""
import pandas as pd
import time
import os
def benchmark_full_load(csv_path='csv_files/cards.csv',
                        parquet_path='csv_files/cards_parquet_test.parquet'):
    """Benchmark loading the full card dataset from CSV vs Parquet.

    Args:
        csv_path: Path to the CSV copy of the card database.
        parquet_path: Path to the Parquet copy of the same data.

    Returns:
        Tuple ``(df_csv, df_parquet)`` of the two loaded DataFrames, so
        callers can reuse them without re-reading from disk.
    """
    print("=== FULL LOAD BENCHMARK ===\n")
    # CSV load
    print("Loading CSV...")
    start = time.time()
    df_csv = pd.read_csv(csv_path, low_memory=False)
    csv_time = time.time() - start
    csv_rows = len(df_csv)
    # deep=True so object (string) columns are measured at their real size
    csv_memory = df_csv.memory_usage(deep=True).sum() / 1024 / 1024
    print(f" Time: {csv_time:.3f}s")
    print(f" Rows: {csv_rows:,}")
    print(f" Memory: {csv_memory:.2f} MB")
    # Parquet load
    print("\nLoading Parquet...")
    start = time.time()
    df_parquet = pd.read_parquet(parquet_path)
    parquet_time = time.time() - start
    parquet_rows = len(df_parquet)
    parquet_memory = df_parquet.memory_usage(deep=True).sum() / 1024 / 1024
    print(f" Time: {parquet_time:.3f}s")
    print(f" Rows: {parquet_rows:,}")
    print(f" Memory: {parquet_memory:.2f} MB")
    # Comparison — guard against a ~0s Parquet load raising ZeroDivisionError
    speedup = csv_time / parquet_time if parquet_time else float('inf')
    memory_reduction = (1 - parquet_memory / csv_memory) * 100
    print(f"\n📊 Results:")
    print(f" Speedup: {speedup:.2f}x faster")
    print(f" Memory: {memory_reduction:.1f}% less")
    return df_csv, df_parquet
def benchmark_column_selection(parquet_path='csv_files/cards_parquet_test.parquet',
                               essential_columns=None):
    """Benchmark a full Parquet load vs loading only selected columns.

    Column pruning is a Parquet-specific optimization: the reader can skip
    entire column chunks on disk, so fewer columns means less I/O and memory.

    Args:
        parquet_path: Path to the Parquet file to benchmark.
        essential_columns: Columns to load in the selective pass; defaults to
            the core deck-building fields. (None-default avoids sharing a
            mutable list across calls.)
    """
    print("\n\n=== COLUMN SELECTION BENCHMARK (Parquet only) ===\n")
    # Essential columns for deck building
    if essential_columns is None:
        essential_columns = ['name', 'colorIdentity', 'type', 'types', 'manaValue',
                             'manaCost', 'power', 'toughness', 'text', 'rarity']
    # Full load
    print("Loading all columns...")
    start = time.time()
    df_full = pd.read_parquet(parquet_path)
    full_time = time.time() - start
    full_memory = df_full.memory_usage(deep=True).sum() / 1024 / 1024
    print(f" Time: {full_time:.3f}s")
    print(f" Columns: {len(df_full.columns)}")
    print(f" Memory: {full_memory:.2f} MB")
    # Selective load
    print(f"\nLoading {len(essential_columns)} essential columns...")
    start = time.time()
    df_selective = pd.read_parquet(parquet_path, columns=essential_columns)
    selective_time = time.time() - start
    selective_memory = df_selective.memory_usage(deep=True).sum() / 1024 / 1024
    print(f" Time: {selective_time:.3f}s")
    print(f" Columns: {len(df_selective.columns)}")
    print(f" Memory: {selective_memory:.2f} MB")
    # Comparison — guard against a ~0s selective load raising ZeroDivisionError
    speedup = full_time / selective_time if selective_time else float('inf')
    memory_reduction = (1 - selective_memory / full_memory) * 100
    print(f"\n📊 Results:")
    print(f" Speedup: {speedup:.2f}x faster")
    print(f" Memory: {memory_reduction:.1f}% less")
def benchmark_filtering():
"""Benchmark filtering by colorIdentity (single file approach)."""
parquet_path = 'csv_files/cards_parquet_test.parquet'
print("\n\n=== COLOR IDENTITY FILTERING BENCHMARK ===\n")
# Load data
print("Loading Parquet with essential columns...")
essential_columns = ['name', 'colorIdentity', 'type', 'manaValue']
start = time.time()
df = pd.read_parquet(parquet_path, columns=essential_columns)
load_time = time.time() - start
print(f" Load time: {load_time:.3f}s")
print(f" Total cards: {len(df):,}")
# Test different color identities
test_cases = [
("Colorless (C)", ["C", ""]),
("Mono-White (W)", ["W", "C", ""]),
("Bant (GUW)", ["C", "", "G", "U", "W", "G,U", "G,W", "U,W", "G,U,W"]),
("5-Color (WUBRG)", ["C", "", "W", "U", "B", "R", "G",
"W,U", "W,B", "W,R", "W,G", "U,B", "U,R", "U,G", "B,R", "B,G", "R,G",
"W,U,B", "W,U,R", "W,U,G", "W,B,R", "W,B,G", "W,R,G", "U,B,R", "U,B,G", "U,R,G", "B,R,G",
"W,U,B,R", "W,U,B,G", "W,U,R,G", "W,B,R,G", "U,B,R,G",
"W,U,B,R,G"]),
]
for test_name, valid_identities in test_cases:
print(f"\n{test_name}:")
start = time.time()
filtered = df[df['colorIdentity'].isin(valid_identities)]
filter_time = (time.time() - start) * 1000 # Convert to ms
print(f" Filter time: {filter_time:.1f}ms")
print(f" Cards found: {len(filtered):,}")
print(f" % of total: {len(filtered) / len(df) * 100:.1f}%")
def benchmark_data_types(parquet_path='csv_files/cards_parquet_test.parquet', df=None):
    """Report column dtypes and identify list-valued columns.

    Args:
        parquet_path: Parquet file to load when *df* is not supplied.
        df: Optional pre-loaded DataFrame (useful for tests/reuse).

    Returns:
        List of column names whose cells hold sequence values.
    """
    import numpy as np  # local: only needed for the ndarray check below

    print("\n\n=== DATA TYPE ANALYSIS ===\n")
    if df is None:
        df = pd.read_parquet(parquet_path)
    # Cache the first non-null sample per column so each column is scanned once
    # (the original recomputed dropna().iloc[0] in three places).
    samples = {}
    list_cols = []
    for col in df.columns:
        non_null = df[col].dropna()
        sample = non_null.iloc[0] if len(non_null) else None
        samples[col] = sample
        # pyarrow loads Parquet list columns as numpy arrays, not Python
        # lists, so include np.ndarray in the check.
        if isinstance(sample, (list, tuple, np.ndarray)):
            list_cols.append(col)
    print(f"Columns stored as lists: {len(list_cols)}")
    for col in list_cols:
        print(f" {col}: {samples[col]}")
    # Check critical columns for deck building
    critical_cols = ['name', 'colorIdentity', 'type', 'types', 'subtypes',
                     'manaValue', 'manaCost', 'text', 'keywords']
    print(f"\n✓ Critical columns for deck building:")
    for col in critical_cols:
        if col in df.columns:
            dtype = str(df[col].dtype)
            # Guard against division by zero on an empty frame
            null_pct = (df[col].isna().sum() / len(df)) * 100 if len(df) else 0.0
            sample_type = type(samples[col]).__name__
            print(f" {col:20s} dtype={dtype:10s} null={null_pct:5.1f}% sample_type={sample_type}")
    return list_cols
if __name__ == "__main__":
# Run benchmarks
df_csv, df_parquet = benchmark_full_load()
benchmark_column_selection()
benchmark_filtering()
benchmark_data_types()
print("\n\n=== SUMMARY ===")
print("✅ All benchmarks complete!")
print("📁 File size: 77.2% smaller (88.94 MB → 20.27 MB)")

View file

@@ -0,0 +1,104 @@
"""Inspect MTGJSON Parquet file schema and compare to CSV."""
import pandas as pd
import os
import sys
def inspect_parquet(parquet_path='csv_files/cards_parquet_test.parquet'):
    """Load a Parquet file and print its schema, null rates, and a sample row.

    Args:
        parquet_path: Path of the Parquet file to inspect.

    Returns:
        The loaded DataFrame, or ``None`` if the file does not exist.
    """
    if not os.path.exists(parquet_path):
        print(f"Error: {parquet_path} not found")
        return None
    print("Loading Parquet file...")
    df = pd.read_parquet(parquet_path)
    print("\n=== PARQUET FILE INFO ===")
    print(f"Rows: {len(df):,}")
    print(f"Columns: {len(df.columns)}")
    print(f"File size: {os.path.getsize(parquet_path) / 1024 / 1024:.2f} MB")
    print("\n=== PARQUET COLUMNS AND TYPES ===")
    for col in sorted(df.columns):
        dtype = str(df[col].dtype)
        non_null = df[col].notna().sum()
        null_pct = (1 - non_null / len(df)) * 100
        print(f" {col:30s} {dtype:15s} ({null_pct:5.1f}% null)")
    # Only sample a row when there is one — df.iloc[0] raises on an empty frame
    if len(df):
        print("\n=== SAMPLE DATA (first card) ===")
        first_card = df.iloc[0].to_dict()
        for key, value in sorted(first_card.items()):
            if isinstance(value, (list, dict)):
                print(f" {key}: {type(value).__name__} with {len(value)} items")
            else:
                value_str = str(value)[:80]  # truncate long card text
                print(f" {key}: {value_str}")
    return df
def compare_to_csv(csv_path='csv_files/cards.csv',
                   parquet_path='csv_files/cards_parquet_test.parquet'):
    """Compare CSV vs Parquet schemas (columns, dtypes) and file sizes.

    Args:
        csv_path: Path to the CSV copy of the card database.
        parquet_path: Path to the Parquet copy of the same data.

    Returns:
        ``None``. Skips with a note when either file is missing.
    """
    if not os.path.exists(csv_path):
        print(f"\nNote: {csv_path} not found, skipping comparison")
        return
    # Original crashed here with FileNotFoundError when only the Parquet
    # file was absent; skip gracefully instead.
    if not os.path.exists(parquet_path):
        print(f"\nNote: {parquet_path} not found, skipping comparison")
        return
    print("\n\n=== CSV FILE INFO ===")
    print("Loading CSV file...")
    # nrows=1: only the header/dtypes are needed, not the full data
    df_csv = pd.read_csv(csv_path, low_memory=False, nrows=1)
    csv_size = os.path.getsize(csv_path) / 1024 / 1024
    print(f"File size: {csv_size:.2f} MB")
    print(f"Columns: {len(df_csv.columns)}")
    print("\n=== CSV COLUMNS ===")
    csv_cols = set(df_csv.columns)
    for col in sorted(df_csv.columns):
        print(f" {col}")
    # Load parquet columns
    df_parquet = pd.read_parquet(parquet_path)
    parquet_cols = set(df_parquet.columns)
    print("\n\n=== SCHEMA COMPARISON ===")
    # Columns in both — flag dtype mismatches explicitly
    common = csv_cols & parquet_cols
    print(f"\n✓ Columns in both (n={len(common)}):")
    for col in sorted(common):
        csv_type = str(df_csv[col].dtype)
        parquet_type = str(df_parquet[col].dtype)
        if csv_type != parquet_type:
            print(f" {col:30s} CSV: {csv_type:15s} Parquet: {parquet_type}")
        else:
            print(f" {col:30s} {csv_type}")
    # CSV only
    csv_only = csv_cols - parquet_cols
    if csv_only:
        print(f"\n⚠ Columns only in CSV (n={len(csv_only)}):")
        for col in sorted(csv_only):
            print(f" {col}")
    # Parquet only
    parquet_only = parquet_cols - csv_cols
    if parquet_only:
        print(f"\n✓ Columns only in Parquet (n={len(parquet_only)}):")
        for col in sorted(parquet_only):
            print(f" {col}")
    # File size comparison — guard against a zero-byte CSV
    parquet_size = os.path.getsize(parquet_path) / 1024 / 1024
    size_reduction = (1 - parquet_size / csv_size) * 100 if csv_size else 0.0
    print(f"\n=== FILE SIZE COMPARISON ===")
    print(f"CSV: {csv_size:.2f} MB")
    print(f"Parquet: {parquet_size:.2f} MB")
    print(f"Savings: {size_reduction:.1f}%")
if __name__ == "__main__":
df = inspect_parquet()
compare_to_csv()