"""Data loader abstraction for CSV and Parquet formats.

This module provides a unified interface for reading and writing card data
in both CSV and Parquet formats. It handles format detection, conversion,
and schema validation.

Introduced in v3.0.0 as part of the Parquet migration.
"""

from __future__ import annotations

import os
from pathlib import Path
from typing import List, Optional

import pandas as pd

from logging_util import get_logger
from path_util import card_files_processed_dir

logger = get_logger(__name__)


# Required columns for deck building
REQUIRED_COLUMNS = [
    "name",
    "colorIdentity",
    "type",  # MTGJSON uses 'type' not 'types'
    "keywords",
    "manaValue",
    "text",
    "power",
    "toughness",
]


def validate_schema(df: pd.DataFrame, required: Optional[List[str]] = None) -> None:
    """Validate that DataFrame contains required columns.

    Args:
        df: DataFrame to validate
        required: List of required columns (uses REQUIRED_COLUMNS if None)

    Raises:
        ValueError: If required columns are missing
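
    Example:
        A minimal sketch using a hypothetical two-column frame (not real card data):

        >>> frame = pd.DataFrame({"name": ["Sol Ring"], "manaValue": [1.0]})
        >>> validate_schema(frame, required=["name", "manaValue"])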
    """
    required = required or REQUIRED_COLUMNS
    missing = [col for col in required if col not in df.columns]

    if missing:
        raise ValueError(
            f"Schema validation failed: missing required columns {missing}. "
            f"Available columns: {list(df.columns)}"
        )

    logger.debug(f"✓ Schema validation passed ({len(required)} required columns present)")


class DataLoader:
    """Unified data loading interface supporting CSV and Parquet formats.

    This class provides transparent access to card data regardless of the
    underlying storage format. It automatically detects the format based on
    file extensions and provides conversion utilities.

    Examples:
        >>> loader = DataLoader()
        >>> df = loader.read_cards("card_files/processed/all_cards.parquet")
        >>> loader.write_cards(df, "output.parquet")
        >>> loader.convert("input.csv", "output.parquet")
    """

    def __init__(self, format: str = "auto"):
        """Initialize the data loader.

        Args:
            format: Format preference - "csv", "parquet", or "auto" (default: auto).
                "auto" detects format from file extension.
        """
        self.format = format.lower()
        if self.format not in ("csv", "parquet", "auto"):
            raise ValueError(f"Unsupported format: {format}. Use 'csv', 'parquet', or 'auto'.")

    def read_cards(
        self,
        path: str,
        columns: Optional[List[str]] = None,
        format: Optional[str] = None
    ) -> pd.DataFrame:
        """Load card data from a file.

        Args:
            path: File path (e.g., "card_files/processed/all_cards.parquet")
            columns: Optional list of columns to load (Parquet optimization)
            format: Override format detection (uses self.format if None)

        Returns:
            DataFrame with card data

        Raises:
            FileNotFoundError: If the file doesn't exist
            ValueError: If format is unsupported
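
        Example:
            A sketch of column projection (column names taken from REQUIRED_COLUMNS):

            >>> loader = DataLoader()
            >>> df = loader.read_cards(
            ...     "card_files/processed/all_cards.parquet",
            ...     columns=["name", "manaValue", "colorIdentity"],
            ... )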
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Card data file not found: {path}")

        detected_format = format or self._detect_format(path)

        logger.debug(f"Loading card data from {path} (format: {detected_format})")

        if detected_format == "csv":
            return self._read_csv(path, columns)
        elif detected_format == "parquet":
            return self._read_parquet(path, columns)
        else:
            raise ValueError(f"Unsupported format: {detected_format}")

    def write_cards(
        self,
        df: pd.DataFrame,
        path: str,
        format: Optional[str] = None,
        index: bool = False
    ) -> None:
        """Save card data to a file.

        Args:
            df: DataFrame to save
            path: Output file path
            format: Force format (overrides auto-detection)
            index: Whether to write DataFrame index (default: False)

        Raises:
            ValueError: If format is unsupported
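
        Example:
            A sketch; "exports/cards.csv" is a hypothetical destination path:

            >>> loader.write_cards(df, "card_files/processed/all_cards.parquet")
            >>> loader.write_cards(df, "exports/cards.csv", format="csv")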
        """
        detected_format = format or self._detect_format(path)

        # Ensure output directory exists
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)

        logger.debug(f"Writing card data to {path} (format: {detected_format}, rows: {len(df)})")

        if detected_format == "csv":
            self._write_csv(df, path, index)
        elif detected_format == "parquet":
            self._write_parquet(df, path, index)
        else:
            raise ValueError(f"Unsupported format: {detected_format}")

    def convert(
        self,
        src_path: str,
        dst_path: str,
        columns: Optional[List[str]] = None
    ) -> None:
        """Convert between CSV and Parquet formats.

        Args:
            src_path: Source file path
            dst_path: Destination file path
            columns: Optional list of columns to include (all if None)

        Examples:
            >>> loader.convert("cards.csv", "cards.parquet")
            >>> loader.convert("cards.parquet", "cards.csv", columns=["name", "type"])
        """
        logger.info(f"Converting {src_path} → {dst_path}")
        df = self.read_cards(src_path, columns=columns)
        self.write_cards(df, dst_path)
        logger.info(f"✓ Converted {len(df)} cards")

    def _read_csv(self, path: str, columns: Optional[List[str]] = None) -> pd.DataFrame:
        """Read CSV file."""
        try:
            return pd.read_csv(path, usecols=columns, low_memory=False)
        except Exception as e:
            logger.error(f"Failed to read CSV from {path}: {e}")
            raise

    def _read_parquet(self, path: str, columns: Optional[List[str]] = None) -> pd.DataFrame:
        """Read Parquet file."""
        try:
            return pd.read_parquet(path, columns=columns)
        except Exception as e:
            logger.error(f"Failed to read Parquet from {path}: {e}")
            raise

    def _write_csv(self, df: pd.DataFrame, path: str, index: bool) -> None:
        """Write CSV file."""
        try:
            df.to_csv(path, index=index)
        except Exception as e:
            logger.error(f"Failed to write CSV to {path}: {e}")
            raise

    def _write_parquet(self, df: pd.DataFrame, path: str, index: bool) -> None:
        """Write Parquet file with Snappy compression."""
        try:
            df.to_parquet(path, index=index, compression="snappy", engine="pyarrow")
        except Exception as e:
            logger.error(f"Failed to write Parquet to {path}: {e}")
            raise

    def _detect_format(self, path: str) -> str:
        """Detect file format from extension.

        Args:
            path: File path to analyze

        Returns:
            Format string: "csv" or "parquet"

        Raises:
            ValueError: If format cannot be determined
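
        Example:
            Illustrative only; these calls exercise the extension check and the
            forced-format path:

            >>> DataLoader()._detect_format("cards.csv")
            'csv'
            >>> DataLoader(format="parquet")._detect_format("cards")
            'parquet'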
        """
        if self.format != "auto":
            return self.format

        # Check file extension
        if path.endswith(".csv"):
            return "csv"
        elif path.endswith(".parquet"):
            return "parquet"

        # Try to infer from existing files (no extension provided)
        if os.path.exists(f"{path}.parquet"):
            return "parquet"
        elif os.path.exists(f"{path}.csv"):
            return "csv"

        raise ValueError(
            f"Cannot determine format for '{path}'. "
            "Use .csv or .parquet extension, or specify format explicitly."
        )

    def write_batch_parquet(
        self,
        df: pd.DataFrame,
        batch_id: int,
        tag: str = "",
        batches_dir: Optional[str] = None
    ) -> str:
        """Write a batch Parquet file (used during tagging).

        Args:
            df: DataFrame to save as a batch
            batch_id: Unique batch identifier (e.g., 0, 1, 2...)
            tag: Optional tag to include in filename (e.g., "white", "commander")
            batches_dir: Directory for batch files (defaults to card_files/processed/batches)

        Returns:
            Path to the written batch file

        Example:
            >>> loader.write_batch_parquet(white_df, batch_id=0, tag="white")
            'card_files/processed/batches/batch_0_white.parquet'
        """
        if batches_dir is None:
            batches_dir = os.path.join(card_files_processed_dir(), "batches")

        os.makedirs(batches_dir, exist_ok=True)

        # Build filename: batch_{id}_{tag}.parquet or batch_{id}.parquet
        filename = f"batch_{batch_id}_{tag}.parquet" if tag else f"batch_{batch_id}.parquet"
        path = os.path.join(batches_dir, filename)

        logger.debug(f"Writing batch {batch_id} ({tag or 'no tag'}): {len(df)} cards → {path}")
        self.write_cards(df, path, format="parquet")

        return path

    def merge_batches(
        self,
        output_path: Optional[str] = None,
        batches_dir: Optional[str] = None,
        cleanup: bool = True
    ) -> pd.DataFrame:
        """Merge all batch Parquet files into a single output file.

        Args:
            output_path: Path for merged output (defaults to card_files/processed/all_cards.parquet)
            batches_dir: Directory containing batch files (defaults to card_files/processed/batches)
            cleanup: Whether to delete batch files after merging (default: True)

        Returns:
            Merged DataFrame

        Raises:
            FileNotFoundError: If no batch files found

        Example:
            >>> loader.merge_batches()  # Merges all batches → all_cards.parquet
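
            A fuller sketch of the tagging workflow (white_df and blue_df are
            hypothetical per-batch frames):

            >>> loader.write_batch_parquet(white_df, batch_id=0, tag="white")
            'card_files/processed/batches/batch_0_white.parquet'
            >>> loader.write_batch_parquet(blue_df, batch_id=1, tag="blue")
            'card_files/processed/batches/batch_1_blue.parquet'
            >>> merged = loader.merge_batches(cleanup=False)  # keep the batch files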
        """
        if batches_dir is None:
            batches_dir = os.path.join(card_files_processed_dir(), "batches")

        if output_path is None:
            from code.path_util import get_processed_cards_path
            output_path = get_processed_cards_path()

        # Find all batch files
        batch_files = sorted(Path(batches_dir).glob("batch_*.parquet"))

        if not batch_files:
            raise FileNotFoundError(f"No batch files found in {batches_dir}")

        logger.info(f"Merging {len(batch_files)} batch files from {batches_dir}")

        # Read and concatenate all batches
        dfs = []
        for batch_file in batch_files:
            logger.debug(f"Reading batch: {batch_file.name}")
            df = self.read_cards(str(batch_file), format="parquet")
            dfs.append(df)

        # Merge all batches
        merged_df = pd.concat(dfs, ignore_index=True)
        logger.info(f"Merged {len(merged_df)} total cards from {len(dfs)} batches")

        # Write merged output
        self.write_cards(merged_df, output_path, format="parquet")
        logger.info(f"✓ Wrote merged data to {output_path}")

        # Cleanup batch files if requested
        if cleanup:
            logger.debug(f"Cleaning up {len(batch_files)} batch files")
            for batch_file in batch_files:
                batch_file.unlink()

            # Remove batches directory if empty
            try:
                Path(batches_dir).rmdir()
                logger.debug(f"Removed empty batches directory: {batches_dir}")
            except OSError:
                pass  # Directory not empty, keep it

        return merged_df