mtg_python_deckbuilder/code/file_setup/data_loader.py

"""Data loader abstraction for CSV and Parquet formats.
This module provides a unified interface for reading and writing card data
in both CSV and Parquet formats. It handles format detection, conversion,
and schema validation.
Introduced in v3.0.0 as part of the Parquet migration.
"""
from __future__ import annotations

import os
from pathlib import Path
from typing import List, Optional

import pandas as pd

from logging_util import get_logger
from path_util import card_files_processed_dir

logger = get_logger(__name__)

# Required columns for deck building
REQUIRED_COLUMNS = [
    "name",
    "colorIdentity",
    "type",  # MTGJSON uses 'type' not 'types'
    "keywords",
    "manaValue",
    "text",
    "power",
    "toughness",
]


def validate_schema(df: pd.DataFrame, required: Optional[List[str]] = None) -> None:
    """Validate that DataFrame contains required columns.

    Args:
        df: DataFrame to validate
        required: List of required columns (uses REQUIRED_COLUMNS if None)

    Raises:
        ValueError: If required columns are missing
    """
    required = required or REQUIRED_COLUMNS
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(
            f"Schema validation failed: missing required columns {missing}. "
            f"Available columns: {list(df.columns)}"
        )
    logger.debug(f"✓ Schema validation passed ({len(required)} required columns present)")

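# Illustrative usage of validate_schema (a sketch only; the file path comes
# from the DataLoader examples below and the column list from REQUIRED_COLUMNS):
#
#     loader = DataLoader()
#     df = loader.read_cards("card_files/processed/all_cards.parquet")
#     validate_schema(df)                      # checks all REQUIRED_COLUMNS
#     validate_schema(df, required=["name"])   # or a caller-supplied subset

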
class DataLoader:
    """Unified data loading interface supporting CSV and Parquet formats.

    This class provides transparent access to card data regardless of the
    underlying storage format. It automatically detects the format based on
    file extensions and provides conversion utilities.

    Examples:
        >>> loader = DataLoader()
        >>> df = loader.read_cards("card_files/processed/all_cards.parquet")
        >>> loader.write_cards(df, "output.parquet")
        >>> loader.convert("input.csv", "output.parquet")
    """

    def __init__(self, format: str = "auto"):
        """Initialize the data loader.

        Args:
            format: Format preference - "csv", "parquet", or "auto" (default: auto)
                "auto" detects format from file extension
        """
        self.format = format.lower()
        if self.format not in ("csv", "parquet", "auto"):
            raise ValueError(f"Unsupported format: {format}. Use 'csv', 'parquet', or 'auto'.")

    def read_cards(
        self,
        path: str,
        columns: Optional[List[str]] = None,
        format: Optional[str] = None
    ) -> pd.DataFrame:
        """Load card data from a file.

        Args:
            path: File path (e.g., "card_files/processed/all_cards.parquet")
            columns: Optional list of columns to load (Parquet optimization)
            format: Override format detection (uses self.format if None)

        Returns:
            DataFrame with card data

        Raises:
            FileNotFoundError: If the file doesn't exist
            ValueError: If format is unsupported
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Card data file not found: {path}")
        detected_format = format or self._detect_format(path)
        logger.debug(f"Loading card data from {path} (format: {detected_format})")
        if detected_format == "csv":
            return self._read_csv(path, columns)
        elif detected_format == "parquet":
            return self._read_parquet(path, columns)
        else:
            raise ValueError(f"Unsupported format: {detected_format}")

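    # Illustrative column projection (a sketch; reading only the columns a stage
    # needs avoids loading the full table when the file is Parquet, per the
    # `columns` note above):
    #
    #     loader = DataLoader()
    #     slim = loader.read_cards(
    #         "card_files/processed/all_cards.parquet",
    #         columns=["name", "manaValue", "colorIdentity"],
    #     )
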
    def write_cards(
        self,
        df: pd.DataFrame,
        path: str,
        format: Optional[str] = None,
        index: bool = False
    ) -> None:
        """Save card data to a file.

        Args:
            df: DataFrame to save
            path: Output file path
            format: Force format (overrides auto-detection)
            index: Whether to write DataFrame index (default: False)

        Raises:
            ValueError: If format is unsupported
        """
        detected_format = format or self._detect_format(path)
        # Ensure output directory exists
        os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)
        logger.debug(f"Writing card data to {path} (format: {detected_format}, rows: {len(df)})")
        if detected_format == "csv":
            self._write_csv(df, path, index)
        elif detected_format == "parquet":
            self._write_parquet(df, path, index)
        else:
            raise ValueError(f"Unsupported format: {detected_format}")

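    # Illustrative format override (a sketch; the extensionless output path is
    # hypothetical and only works because `format` bypasses auto-detection):
    #
    #     loader = DataLoader()
    #     loader.write_cards(df, "card_files/processed/export_snapshot", format="csv")
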
    def convert(
        self,
        src_path: str,
        dst_path: str,
        columns: Optional[List[str]] = None
    ) -> None:
        """Convert between CSV and Parquet formats.

        Args:
            src_path: Source file path
            dst_path: Destination file path
            columns: Optional list of columns to include (all if None)

        Examples:
            >>> loader.convert("cards.csv", "cards.parquet")
            >>> loader.convert("cards.parquet", "cards.csv", columns=["name", "type"])
        """
        logger.info(f"Converting {src_path} → {dst_path}")
        df = self.read_cards(src_path, columns=columns)
        self.write_cards(df, dst_path)
        logger.info(f"✓ Converted {len(df)} cards")

    def _read_csv(self, path: str, columns: Optional[List[str]] = None) -> pd.DataFrame:
        """Read CSV file."""
        try:
            return pd.read_csv(path, usecols=columns, low_memory=False)
        except Exception as e:
            logger.error(f"Failed to read CSV from {path}: {e}")
            raise

    def _read_parquet(self, path: str, columns: Optional[List[str]] = None) -> pd.DataFrame:
        """Read Parquet file."""
        try:
            return pd.read_parquet(path, columns=columns)
        except Exception as e:
            logger.error(f"Failed to read Parquet from {path}: {e}")
            raise

    def _write_csv(self, df: pd.DataFrame, path: str, index: bool) -> None:
        """Write CSV file."""
        try:
            df.to_csv(path, index=index)
        except Exception as e:
            logger.error(f"Failed to write CSV to {path}: {e}")
            raise

    def _write_parquet(self, df: pd.DataFrame, path: str, index: bool) -> None:
        """Write Parquet file with Snappy compression."""
        try:
            df.to_parquet(path, index=index, compression="snappy", engine="pyarrow")
        except Exception as e:
            logger.error(f"Failed to write Parquet to {path}: {e}")
            raise

    def _detect_format(self, path: str) -> str:
        """Detect file format from extension.

        Args:
            path: File path to analyze

        Returns:
            Format string: "csv" or "parquet"

        Raises:
            ValueError: If format cannot be determined
        """
        if self.format != "auto":
            return self.format
        # Check file extension
        if path.endswith(".csv"):
            return "csv"
        elif path.endswith(".parquet"):
            return "parquet"
        # Try to infer from existing files (no extension provided)
        if os.path.exists(f"{path}.parquet"):
            return "parquet"
        elif os.path.exists(f"{path}.csv"):
            return "csv"
        raise ValueError(
            f"Cannot determine format for '{path}'. "
            "Use .csv or .parquet extension, or specify format explicitly."
        )

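    # Detection rules, illustratively (a sketch of the logic above, not a doctest;
    # an extensionless path resolves only if the sibling .parquet or .csv file
    # actually exists on disk):
    #
    #     DataLoader()._detect_format("cards.csv")          -> "csv"
    #     DataLoader()._detect_format("cards.parquet")      -> "parquet"
    #     DataLoader(format="csv")._detect_format("cards")  -> "csv"  (preference wins)
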
    def write_batch_parquet(
        self,
        df: pd.DataFrame,
        batch_id: int,
        tag: str = "",
        batches_dir: Optional[str] = None
    ) -> str:
        """Write a batch Parquet file (used during tagging).

        Args:
            df: DataFrame to save as a batch
            batch_id: Unique batch identifier (e.g., 0, 1, 2...)
            tag: Optional tag to include in filename (e.g., "white", "commander")
            batches_dir: Directory for batch files (defaults to card_files/processed/batches)

        Returns:
            Path to the written batch file

        Example:
            >>> loader.write_batch_parquet(white_df, batch_id=0, tag="white")
            'card_files/processed/batches/batch_0_white.parquet'
        """
        if batches_dir is None:
            batches_dir = os.path.join(card_files_processed_dir(), "batches")
        os.makedirs(batches_dir, exist_ok=True)
        # Build filename: batch_{id}_{tag}.parquet or batch_{id}.parquet
        filename = f"batch_{batch_id}_{tag}.parquet" if tag else f"batch_{batch_id}.parquet"
        path = os.path.join(batches_dir, filename)
        logger.debug(f"Writing batch {batch_id} ({tag or 'no tag'}): {len(df)} cards → {path}")
        self.write_cards(df, path, format="parquet")
        return path

    def merge_batches(
        self,
        output_path: Optional[str] = None,
        batches_dir: Optional[str] = None,
        cleanup: bool = True
    ) -> pd.DataFrame:
        """Merge all batch Parquet files into a single output file.

        Args:
            output_path: Path for merged output (defaults to card_files/processed/all_cards.parquet)
            batches_dir: Directory containing batch files (defaults to card_files/processed/batches)
            cleanup: Whether to delete batch files after merging (default: True)

        Returns:
            Merged DataFrame

        Raises:
            FileNotFoundError: If no batch files found

        Example:
            >>> loader.merge_batches()  # Merges all batches → all_cards.parquet
        """
        if batches_dir is None:
            batches_dir = os.path.join(card_files_processed_dir(), "batches")
        if output_path is None:
            from code.path_util import get_processed_cards_path
            output_path = get_processed_cards_path()
        # Find all batch files
        batch_files = sorted(Path(batches_dir).glob("batch_*.parquet"))
        if not batch_files:
            raise FileNotFoundError(f"No batch files found in {batches_dir}")
        logger.info(f"Merging {len(batch_files)} batch files from {batches_dir}")
        # Read and concatenate all batches
        dfs = []
        for batch_file in batch_files:
            logger.debug(f"Reading batch: {batch_file.name}")
            df = self.read_cards(str(batch_file), format="parquet")
            dfs.append(df)
        # Merge all batches
        merged_df = pd.concat(dfs, ignore_index=True)
        logger.info(f"Merged {len(merged_df)} total cards from {len(dfs)} batches")
        # Write merged output
        self.write_cards(merged_df, output_path, format="parquet")
        logger.info(f"✓ Wrote merged data to {output_path}")
        # Cleanup batch files if requested
        if cleanup:
            logger.debug(f"Cleaning up {len(batch_files)} batch files")
            for batch_file in batch_files:
                batch_file.unlink()
            # Remove batches directory if empty
            try:
                Path(batches_dir).rmdir()
                logger.debug(f"Removed empty batches directory: {batches_dir}")
            except OSError:
                pass  # Directory not empty, keep it
        return merged_df
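

if __name__ == "__main__":
    # Minimal end-to-end sketch of the batch workflow (illustrative only: the
    # source CSV path below is hypothetical and must already exist locally).
    loader = DataLoader()
    cards = loader.read_cards("card_files/raw/cards.csv")
    validate_schema(cards)
    # Two arbitrary halves stand in for the per-stage batches written during tagging.
    half = len(cards) // 2
    loader.write_batch_parquet(cards.iloc[:half], batch_id=0, tag="first_half")
    loader.write_batch_parquet(cards.iloc[half:], batch_id=1, tag="second_half")
    merged = loader.merge_batches()  # writes the merged Parquet file and cleans up batches
    print(f"Merged {len(merged)} cards")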