"""Inspect MTGJSON Parquet file schema and compare to CSV."""

import os

import pandas as pd

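# Note: pandas' read_parquet/to_parquet require a Parquet engine (pyarrow or
# fastparquet) to be installed alongside pandas.
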
def inspect_parquet():
    """Load and inspect Parquet file."""
    parquet_path = 'csv_files/cards_parquet_test.parquet'

    if not os.path.exists(parquet_path):
        print(f"Error: {parquet_path} not found")
        return

    print("Loading Parquet file...")
    df = pd.read_parquet(parquet_path)

    print("\n=== PARQUET FILE INFO ===")
    print(f"Rows: {len(df):,}")
    print(f"Columns: {len(df.columns)}")
    print(f"File size: {os.path.getsize(parquet_path) / 1024 / 1024:.2f} MB")

    print("\n=== PARQUET COLUMNS AND TYPES ===")
    for col in sorted(df.columns):
        dtype = str(df[col].dtype)
        non_null = df[col].notna().sum()
        # Percentage of rows where this column is null.
        null_pct = (1 - non_null / len(df)) * 100
        print(f" {col:30s} {dtype:15s} ({null_pct:5.1f}% null)")

    print("\n=== SAMPLE DATA (first card) ===")
    first_card = df.iloc[0].to_dict()
    for key, value in sorted(first_card.items()):
        # Summarize nested values (lists/dicts) instead of printing them in full.
        if isinstance(value, (list, dict)):
            print(f" {key}: {type(value).__name__} with {len(value)} items")
        else:
            value_str = str(value)[:80]
            print(f" {key}: {value_str}")

    return df


def compare_to_csv():
    """Compare Parquet columns to CSV columns."""
    csv_path = 'csv_files/cards.csv'
    parquet_path = 'csv_files/cards_parquet_test.parquet'

    if not os.path.exists(csv_path):
        print(f"\nNote: {csv_path} not found, skipping comparison")
        return

    if not os.path.exists(parquet_path):
        print(f"\nNote: {parquet_path} not found, skipping comparison")
        return

    print("\n\n=== CSV FILE INFO ===")
    print("Loading CSV file...")
    # Only the header and one row are needed for the schema comparison, so
    # avoid loading the full CSV. CSV dtypes are inferred from this single
    # row, so the type mismatches reported below are approximate.
    df_csv = pd.read_csv(csv_path, low_memory=False, nrows=1)

    csv_size = os.path.getsize(csv_path) / 1024 / 1024
    print(f"File size: {csv_size:.2f} MB")
    print(f"Columns: {len(df_csv.columns)}")

    print("\n=== CSV COLUMNS ===")
    csv_cols = set(df_csv.columns)
    for col in sorted(df_csv.columns):
        print(f" {col}")

    # Load parquet columns
    df_parquet = pd.read_parquet(parquet_path)
    parquet_cols = set(df_parquet.columns)

    print("\n\n=== SCHEMA COMPARISON ===")

    # Columns in both
    common = csv_cols & parquet_cols
    print(f"\n✓ Columns in both (n={len(common)}):")
    for col in sorted(common):
        csv_type = str(df_csv[col].dtype)
        parquet_type = str(df_parquet[col].dtype)
        if csv_type != parquet_type:
            print(f" {col:30s} CSV: {csv_type:15s} Parquet: {parquet_type}")
        else:
            print(f" {col:30s} {csv_type}")

    # CSV only
    csv_only = csv_cols - parquet_cols
    if csv_only:
        print(f"\n⚠ Columns only in CSV (n={len(csv_only)}):")
        for col in sorted(csv_only):
            print(f" {col}")

    # Parquet only
    parquet_only = parquet_cols - csv_cols
    if parquet_only:
        print(f"\n✓ Columns only in Parquet (n={len(parquet_only)}):")
        for col in sorted(parquet_only):
            print(f" {col}")

    # File size comparison
    parquet_size = os.path.getsize(parquet_path) / 1024 / 1024
    size_reduction = (1 - parquet_size / csv_size) * 100
    print("\n=== FILE SIZE COMPARISON ===")
    print(f"CSV: {csv_size:.2f} MB")
    print(f"Parquet: {parquet_size:.2f} MB")
    print(f"Savings: {size_reduction:.1f}%")


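# A minimal sketch (hypothetical helper, not part of the original script) of
# how the test Parquet file inspected above could be regenerated from the CSV,
# assuming a Parquet engine such as pyarrow is installed:
def _convert_csv_to_parquet(csv_path='csv_files/cards.csv',
                            parquet_path='csv_files/cards_parquet_test.parquet'):
    """Hypothetical helper: write the cards CSV back out as Parquet."""
    df = pd.read_csv(csv_path, low_memory=False)
    df.to_parquet(parquet_path, index=False)

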
if __name__ == "__main__":
    inspect_parquet()
    compare_to_csv()
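
# Running this file directly prints the Parquet schema summary followed by the
# CSV comparison; both inputs are expected under csv_files/ relative to the
# current working directory.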