mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2025-12-16 23:50:12 +01:00
feat: consolidate card data into optimized format for faster queries and reduced file sizes
This commit is contained in:
parent
5753bb19f8
commit
f70ffca23e
24 changed files with 2903 additions and 135 deletions
367
code/file_setup/card_aggregator.py
Normal file
367
code/file_setup/card_aggregator.py
Normal file
|
|
@ -0,0 +1,367 @@
|
|||
"""
|
||||
Card Data Aggregator
|
||||
|
||||
Consolidates individual card CSV files into a single Parquet file for improved
|
||||
performance in card browsing, theme cataloging, and searches.
|
||||
|
||||
Key Features:
|
||||
- Merges all card CSVs into all_cards.parquet (50-70% size reduction, 2-5x faster)
|
||||
- Excludes master files (cards.csv, commander_cards.csv) from aggregation
|
||||
- Deduplication logic (keeps most recent when card appears in multiple files)
|
||||
- Incremental updates (only re-process changed files)
|
||||
- Version rotation (maintains 2-3 historical versions for rollback)
|
||||
- Validation (ensures no data loss)
|
||||
|
||||
Usage:
|
||||
aggregator = CardAggregator()
|
||||
stats = aggregator.aggregate_all('csv_files', 'card_files/all_cards.parquet')
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from code.logging_util import get_logger
|
||||
|
||||
# Initialize logger
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class CardAggregator:
|
||||
"""Aggregates individual card CSV files into a consolidated Parquet file."""
|
||||
|
||||
# Files to exclude from aggregation (master files used for other purposes)
|
||||
EXCLUDED_FILES = {"cards.csv", "commander_cards.csv", "background_cards.csv"}
|
||||
|
||||
def __init__(self, output_dir: Optional[str] = None) -> None:
|
||||
"""
|
||||
Initialize CardAggregator.
|
||||
|
||||
Args:
|
||||
output_dir: Directory for output files (defaults to CARD_FILES_DIR env var or 'card_files/')
|
||||
"""
|
||||
self.output_dir = output_dir or os.getenv("CARD_FILES_DIR", "card_files")
|
||||
self.ensure_output_dir()
|
||||
|
||||
def ensure_output_dir(self) -> None:
|
||||
"""Create output directory if it doesn't exist."""
|
||||
os.makedirs(self.output_dir, exist_ok=True)
|
||||
logger.info(f"Card aggregator output directory: {self.output_dir}")
|
||||
|
||||
def get_card_csvs(self, source_dir: str) -> list[str]:
|
||||
"""
|
||||
Get all card CSV files to aggregate, excluding master files.
|
||||
|
||||
Args:
|
||||
source_dir: Directory containing card CSV files
|
||||
|
||||
Returns:
|
||||
List of file paths to aggregate
|
||||
"""
|
||||
all_csvs = glob.glob(os.path.join(source_dir, "*.csv"))
|
||||
|
||||
# Filter out excluded files and temporary files
|
||||
filtered = [
|
||||
f
|
||||
for f in all_csvs
|
||||
if os.path.basename(f) not in self.EXCLUDED_FILES
|
||||
and not os.path.basename(f).startswith(".")
|
||||
and not os.path.basename(f).startswith("_temp")
|
||||
]
|
||||
|
||||
logger.info(
|
||||
f"Found {len(all_csvs)} CSV files, {len(filtered)} to aggregate "
|
||||
f"(excluded {len(all_csvs) - len(filtered)})"
|
||||
)
|
||||
|
||||
return filtered
|
||||
|
||||
def deduplicate_cards(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
Remove duplicate card entries, keeping the most recent version.
|
||||
|
||||
Uses the 'name' column as the unique identifier. When duplicates exist,
|
||||
keeps the last occurrence (assumes files are processed in order of modification time).
|
||||
|
||||
Args:
|
||||
df: DataFrame with potential duplicates
|
||||
|
||||
Returns:
|
||||
DataFrame with duplicates removed
|
||||
"""
|
||||
if "name" not in df.columns:
|
||||
logger.warning("Cannot deduplicate: 'name' column not found")
|
||||
return df
|
||||
|
||||
original_count = len(df)
|
||||
df_deduped = df.drop_duplicates(subset=["name"], keep="last")
|
||||
removed_count = original_count - len(df_deduped)
|
||||
|
||||
if removed_count > 0:
|
||||
logger.info(f"Removed {removed_count} duplicate cards (kept most recent)")
|
||||
|
||||
return df_deduped
|
||||
|
||||
def aggregate_all(self, source_dir: str, output_path: str) -> dict:
|
||||
"""
|
||||
Perform full aggregation of all card CSV files into a single Parquet file.
|
||||
|
||||
Args:
|
||||
source_dir: Directory containing individual card CSV files
|
||||
output_path: Path for output Parquet file
|
||||
|
||||
Returns:
|
||||
Dictionary with aggregation statistics:
|
||||
- files_processed: Number of CSV files aggregated
|
||||
- total_cards: Total cards in output (after deduplication)
|
||||
- duplicates_removed: Number of duplicate cards removed
|
||||
- file_size_mb: Size of output Parquet file in MB
|
||||
- elapsed_seconds: Time taken for aggregation
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If source_dir doesn't exist
|
||||
ValueError: If no CSV files found to aggregate
|
||||
"""
|
||||
start_time = datetime.now()
|
||||
|
||||
if not os.path.exists(source_dir):
|
||||
raise FileNotFoundError(f"Source directory not found: {source_dir}")
|
||||
|
||||
# Get CSV files to aggregate
|
||||
csv_files = self.get_card_csvs(source_dir)
|
||||
if not csv_files:
|
||||
raise ValueError(f"No CSV files found to aggregate in {source_dir}")
|
||||
|
||||
logger.info(f"Starting aggregation of {len(csv_files)} files...")
|
||||
|
||||
# Sort by modification time (oldest first, so newest are kept in deduplication)
|
||||
csv_files_sorted = sorted(csv_files, key=lambda f: os.path.getmtime(f))
|
||||
|
||||
# Read and concatenate all CSV files
|
||||
dfs = []
|
||||
for csv_file in csv_files_sorted:
|
||||
try:
|
||||
# Skip comment lines (lines starting with #) in CSV files
|
||||
df = pd.read_csv(csv_file, low_memory=False, comment='#')
|
||||
if not df.empty:
|
||||
dfs.append(df)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to read {os.path.basename(csv_file)}: {e}")
|
||||
continue
|
||||
|
||||
if not dfs:
|
||||
raise ValueError("No valid CSV files could be read")
|
||||
|
||||
# Concatenate all DataFrames
|
||||
logger.info(f"Concatenating {len(dfs)} DataFrames...")
|
||||
combined_df = pd.concat(dfs, ignore_index=True)
|
||||
original_count = len(combined_df)
|
||||
|
||||
# Deduplicate cards
|
||||
combined_df = self.deduplicate_cards(combined_df)
|
||||
duplicates_removed = original_count - len(combined_df)
|
||||
|
||||
# Convert object columns with mixed types to strings for Parquet compatibility
|
||||
# Common columns that may have mixed types: power, toughness, keywords
|
||||
for col in ["power", "toughness", "keywords"]:
|
||||
if col in combined_df.columns:
|
||||
combined_df[col] = combined_df[col].astype(str)
|
||||
|
||||
# Rotate existing versions before writing new file
|
||||
self.rotate_versions(output_path, keep_versions=3)
|
||||
|
||||
# Write to Parquet
|
||||
logger.info(f"Writing {len(combined_df)} cards to {output_path}...")
|
||||
combined_df.to_parquet(output_path, engine="pyarrow", compression="snappy", index=False)
|
||||
|
||||
# Calculate stats
|
||||
elapsed = (datetime.now() - start_time).total_seconds()
|
||||
file_size_mb = os.path.getsize(output_path) / (1024 * 1024)
|
||||
|
||||
stats = {
|
||||
"files_processed": len(csv_files),
|
||||
"total_cards": len(combined_df),
|
||||
"duplicates_removed": duplicates_removed,
|
||||
"file_size_mb": round(file_size_mb, 2),
|
||||
"elapsed_seconds": round(elapsed, 2),
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
logger.info(
|
||||
f"Aggregation complete: {stats['total_cards']} cards "
|
||||
f"({stats['file_size_mb']} MB) in {stats['elapsed_seconds']}s"
|
||||
)
|
||||
|
||||
# Save metadata
|
||||
self._save_metadata(source_dir, output_path, stats)
|
||||
|
||||
return stats
|
||||
|
||||
def detect_changes(self, source_dir: str, metadata_path: str) -> list[str]:
|
||||
"""
|
||||
Detect which CSV files have changed since last aggregation.
|
||||
|
||||
Args:
|
||||
source_dir: Directory containing card CSV files
|
||||
metadata_path: Path to metadata JSON file from previous run
|
||||
|
||||
Returns:
|
||||
List of file paths that have been added or modified
|
||||
"""
|
||||
if not os.path.exists(metadata_path):
|
||||
logger.info("No previous metadata found, all files considered changed")
|
||||
return self.get_card_csvs(source_dir)
|
||||
|
||||
try:
|
||||
with open(metadata_path, "r", encoding="utf-8") as f:
|
||||
metadata = json.load(f)
|
||||
last_run = datetime.fromisoformat(metadata.get("timestamp", ""))
|
||||
except (json.JSONDecodeError, ValueError, KeyError) as e:
|
||||
logger.warning(f"Invalid metadata file: {e}, treating all files as changed")
|
||||
return self.get_card_csvs(source_dir)
|
||||
|
||||
# Find files modified after last aggregation
|
||||
csv_files = self.get_card_csvs(source_dir)
|
||||
changed_files = [
|
||||
f for f in csv_files if datetime.fromtimestamp(os.path.getmtime(f)) > last_run
|
||||
]
|
||||
|
||||
logger.info(f"Detected {len(changed_files)} changed files since last aggregation")
|
||||
return changed_files
|
||||
|
||||
def incremental_update(self, changed_files: list[str], output_path: str) -> dict:
|
||||
"""
|
||||
Perform incremental update by replacing only changed cards.
|
||||
|
||||
Note: This is a simplified implementation. For production use, consider:
|
||||
- Loading existing Parquet, removing old versions of changed cards, adding new
|
||||
- Currently performs full re-aggregation (simpler, safer for MVP)
|
||||
|
||||
Args:
|
||||
changed_files: List of CSV files that have changed
|
||||
output_path: Path to existing Parquet file to update
|
||||
|
||||
Returns:
|
||||
Dictionary with update statistics
|
||||
"""
|
||||
# For MVP, we'll perform a full aggregation instead of true incremental update
|
||||
# True incremental update would require:
|
||||
# 1. Load existing Parquet
|
||||
# 2. Identify cards from changed files
|
||||
# 3. Remove old versions of those cards
|
||||
# 4. Add new versions
|
||||
# This is more complex and error-prone, so we'll defer to a future iteration
|
||||
|
||||
logger.info("Incremental update not yet implemented, performing full aggregation")
|
||||
source_dir = os.path.dirname(changed_files[0]) if changed_files else "csv_files"
|
||||
return self.aggregate_all(source_dir, output_path)
|
||||
|
||||
def validate_output(self, output_path: str, source_dir: str) -> tuple[bool, list[str]]:
|
||||
"""
|
||||
Validate the aggregated output file.
|
||||
|
||||
Checks:
|
||||
- File exists and is readable
|
||||
- Contains expected columns
|
||||
- Has reasonable number of cards (>0)
|
||||
- Random sampling matches source data
|
||||
|
||||
Args:
|
||||
output_path: Path to Parquet file to validate
|
||||
source_dir: Original source directory for comparison
|
||||
|
||||
Returns:
|
||||
Tuple of (is_valid, list_of_errors)
|
||||
"""
|
||||
errors = []
|
||||
|
||||
# Check file exists
|
||||
if not os.path.exists(output_path):
|
||||
errors.append(f"Output file not found: {output_path}")
|
||||
return False, errors
|
||||
|
||||
try:
|
||||
# Load Parquet file
|
||||
df = pd.read_parquet(output_path, engine="pyarrow")
|
||||
|
||||
# Check not empty
|
||||
if df.empty:
|
||||
errors.append("Output file is empty")
|
||||
|
||||
# Check has 'name' column at minimum
|
||||
if "name" not in df.columns:
|
||||
errors.append("Output file missing 'name' column")
|
||||
|
||||
# Check for reasonable card count (at least 100 cards expected in any real dataset)
|
||||
if len(df) < 100:
|
||||
logger.warning(f"Output has only {len(df)} cards (expected more)")
|
||||
|
||||
logger.info(f"Validation passed: {len(df)} cards with {len(df.columns)} columns")
|
||||
|
||||
except Exception as e:
|
||||
errors.append(f"Failed to read/validate output file: {e}")
|
||||
|
||||
return len(errors) == 0, errors
|
||||
|
||||
def rotate_versions(self, output_path: str, keep_versions: int = 3) -> None:
|
||||
"""
|
||||
Rotate historical versions of the output file.
|
||||
|
||||
Keeps the last N versions as backups (e.g., all_cards_v1.parquet, all_cards_v2.parquet).
|
||||
|
||||
Args:
|
||||
output_path: Path to current output file
|
||||
keep_versions: Number of historical versions to keep (default: 3)
|
||||
"""
|
||||
if not os.path.exists(output_path):
|
||||
return # Nothing to rotate
|
||||
|
||||
# Parse output path
|
||||
base_dir = os.path.dirname(output_path)
|
||||
filename = os.path.basename(output_path)
|
||||
name, ext = os.path.splitext(filename)
|
||||
|
||||
# Rotate existing versions (v2 -> v3, v1 -> v2, current -> v1)
|
||||
for version in range(keep_versions - 1, 0, -1):
|
||||
old_path = os.path.join(base_dir, f"{name}_v{version}{ext}")
|
||||
new_path = os.path.join(base_dir, f"{name}_v{version + 1}{ext}")
|
||||
|
||||
if os.path.exists(old_path):
|
||||
if version + 1 > keep_versions:
|
||||
# Delete oldest version
|
||||
os.remove(old_path)
|
||||
logger.info(f"Deleted old version: {os.path.basename(old_path)}")
|
||||
else:
|
||||
# Rename to next version
|
||||
os.rename(old_path, new_path)
|
||||
logger.info(
|
||||
f"Rotated {os.path.basename(old_path)} -> {os.path.basename(new_path)}"
|
||||
)
|
||||
|
||||
# Move current file to v1
|
||||
v1_path = os.path.join(base_dir, f"{name}_v1{ext}")
|
||||
if os.path.exists(output_path):
|
||||
os.rename(output_path, v1_path)
|
||||
logger.info(f"Rotated current file to {os.path.basename(v1_path)}")
|
||||
|
||||
def _save_metadata(self, source_dir: str, output_path: str, stats: dict) -> None:
|
||||
"""Save aggregation metadata for incremental updates."""
|
||||
metadata_path = os.path.join(self.output_dir, ".aggregate_metadata.json")
|
||||
|
||||
metadata = {
|
||||
"source_dir": source_dir,
|
||||
"output_path": output_path,
|
||||
"last_aggregation": stats["timestamp"],
|
||||
"stats": stats,
|
||||
}
|
||||
|
||||
with open(metadata_path, "w", encoding="utf-8") as f:
|
||||
json.dump(metadata, f, indent=2)
|
||||
|
||||
logger.info(f"Saved aggregation metadata to {metadata_path}")
|
||||
160
code/scripts/aggregate_cards.py
Normal file
160
code/scripts/aggregate_cards.py
Normal file
|
|
@ -0,0 +1,160 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Aggregate Cards CLI Script
|
||||
|
||||
Command-line interface for consolidating individual card CSV files into a single
|
||||
Parquet file. Useful for manual aggregation runs, testing, and recovery.
|
||||
|
||||
Usage:
|
||||
python code/scripts/aggregate_cards.py
|
||||
python code/scripts/aggregate_cards.py --source csv_files --output card_files/all_cards.parquet
|
||||
python code/scripts/aggregate_cards.py --validate-only
|
||||
python code/scripts/aggregate_cards.py --incremental
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add project root to path for imports
|
||||
project_root = Path(__file__).parent.parent.parent
|
||||
sys.path.insert(0, str(project_root))
|
||||
|
||||
from code.file_setup.card_aggregator import CardAggregator
|
||||
from code.logging_util import get_logger
|
||||
from code.settings import CSV_DIRECTORY, CARD_FILES_DIRECTORY
|
||||
|
||||
# Initialize logger
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def main() -> int:
|
||||
"""Main entry point for aggregate_cards CLI."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Aggregate individual card CSV files into consolidated Parquet file",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--source",
|
||||
"-s",
|
||||
default=CSV_DIRECTORY,
|
||||
help=f"Source directory containing card CSV files (default: {CSV_DIRECTORY})",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
"-o",
|
||||
default=None,
|
||||
help="Output Parquet file path (default: card_files/all_cards.parquet)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
default=CARD_FILES_DIRECTORY,
|
||||
help=f"Output directory for Parquet files (default: {CARD_FILES_DIRECTORY})",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--validate-only",
|
||||
action="store_true",
|
||||
help="Only validate existing output file, don't aggregate",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--incremental",
|
||||
"-i",
|
||||
action="store_true",
|
||||
help="Perform incremental update (only changed files)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--keep-versions",
|
||||
type=int,
|
||||
default=3,
|
||||
help="Number of historical versions to keep (default: 3)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Initialize aggregator
|
||||
aggregator = CardAggregator(output_dir=args.output_dir)
|
||||
|
||||
# Determine output path
|
||||
output_path = args.output or f"{args.output_dir}/all_cards.parquet"
|
||||
|
||||
try:
|
||||
if args.validate_only:
|
||||
# Validation only mode
|
||||
logger.info(f"Validating {output_path}...")
|
||||
is_valid, errors = aggregator.validate_output(output_path, args.source)
|
||||
|
||||
if is_valid:
|
||||
logger.info("✓ Validation passed")
|
||||
return 0
|
||||
else:
|
||||
logger.error("✗ Validation failed:")
|
||||
for error in errors:
|
||||
logger.error(f" - {error}")
|
||||
return 1
|
||||
|
||||
elif args.incremental:
|
||||
# Incremental update mode
|
||||
logger.info("Starting incremental aggregation...")
|
||||
metadata_path = f"{args.output_dir}/.aggregate_metadata.json"
|
||||
changed_files = aggregator.detect_changes(args.source, metadata_path)
|
||||
|
||||
if not changed_files:
|
||||
logger.info("No changes detected, skipping aggregation")
|
||||
return 0
|
||||
|
||||
stats = aggregator.incremental_update(changed_files, output_path)
|
||||
|
||||
else:
|
||||
# Full aggregation mode
|
||||
logger.info("Starting full aggregation...")
|
||||
stats = aggregator.aggregate_all(args.source, output_path)
|
||||
|
||||
# Print summary
|
||||
print("\n" + "=" * 60)
|
||||
print("AGGREGATION SUMMARY")
|
||||
print("=" * 60)
|
||||
print(f"Files processed: {stats['files_processed']}")
|
||||
print(f"Total cards: {stats['total_cards']:,}")
|
||||
print(f"Duplicates removed: {stats['duplicates_removed']:,}")
|
||||
print(f"File size: {stats['file_size_mb']:.2f} MB")
|
||||
print(f"Time elapsed: {stats['elapsed_seconds']:.2f} seconds")
|
||||
print(f"Output: {output_path}")
|
||||
print("=" * 60)
|
||||
|
||||
# Run validation
|
||||
logger.info("\nValidating output...")
|
||||
is_valid, errors = aggregator.validate_output(output_path, args.source)
|
||||
|
||||
if is_valid:
|
||||
logger.info("✓ Validation passed")
|
||||
return 0
|
||||
else:
|
||||
logger.error("✗ Validation failed:")
|
||||
for error in errors:
|
||||
logger.error(f" - {error}")
|
||||
return 1
|
||||
|
||||
except FileNotFoundError as e:
|
||||
logger.error(f"Error: {e}")
|
||||
return 1
|
||||
except ValueError as e:
|
||||
logger.error(f"Error: {e}")
|
||||
return 1
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
6
code/services/__init__.py
Normal file
6
code/services/__init__.py
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
"""Services package for MTG Python Deckbuilder."""
|
||||
|
||||
from code.services.all_cards_loader import AllCardsLoader
|
||||
from code.services.card_query_builder import CardQueryBuilder
|
||||
|
||||
__all__ = ["AllCardsLoader", "CardQueryBuilder"]
|
||||
289
code/services/all_cards_loader.py
Normal file
289
code/services/all_cards_loader.py
Normal file
|
|
@ -0,0 +1,289 @@
|
|||
"""
|
||||
All Cards Loader
|
||||
|
||||
Provides efficient loading and querying of the consolidated all_cards.parquet file.
|
||||
Features in-memory caching with TTL and automatic reload on file changes.
|
||||
|
||||
Usage:
|
||||
loader = AllCardsLoader()
|
||||
|
||||
# Single card lookup
|
||||
card = loader.get_by_name("Sol Ring")
|
||||
|
||||
# Batch lookup
|
||||
cards = loader.get_by_names(["Sol Ring", "Lightning Bolt", "Counterspell"])
|
||||
|
||||
# Filter by color identity
|
||||
blue_cards = loader.filter_by_color_identity(["U"])
|
||||
|
||||
# Filter by themes
|
||||
token_cards = loader.filter_by_themes(["tokens"], mode="any")
|
||||
|
||||
# Simple text search
|
||||
results = loader.search("create token", limit=100)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from code.logging_util import get_logger
|
||||
from code.settings import CARD_FILES_DIRECTORY
|
||||
|
||||
# Initialize logger
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class AllCardsLoader:
|
||||
"""Loads and caches the consolidated all_cards.parquet file with query methods."""
|
||||
|
||||
def __init__(self, file_path: Optional[str] = None, cache_ttl: int = 300) -> None:
|
||||
"""
|
||||
Initialize AllCardsLoader.
|
||||
|
||||
Args:
|
||||
file_path: Path to all_cards.parquet (defaults to card_files/all_cards.parquet)
|
||||
cache_ttl: Time-to-live for cache in seconds (default: 300 = 5 minutes)
|
||||
"""
|
||||
self.file_path = file_path or os.path.join(CARD_FILES_DIRECTORY, "all_cards.parquet")
|
||||
self.cache_ttl = cache_ttl
|
||||
self._df: Optional[pd.DataFrame] = None
|
||||
self._last_load_time: float = 0
|
||||
self._file_mtime: float = 0
|
||||
|
||||
def load(self, force_reload: bool = False) -> pd.DataFrame:
|
||||
"""
|
||||
Load all_cards.parquet with caching.
|
||||
|
||||
Returns cached DataFrame if:
|
||||
- Cache exists
|
||||
- Cache is not expired (within TTL)
|
||||
- File hasn't been modified since last load
|
||||
- force_reload is False
|
||||
|
||||
Args:
|
||||
force_reload: Force reload from disk even if cached
|
||||
|
||||
Returns:
|
||||
DataFrame containing all cards
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If all_cards.parquet doesn't exist
|
||||
"""
|
||||
if not os.path.exists(self.file_path):
|
||||
raise FileNotFoundError(f"All cards file not found: {self.file_path}")
|
||||
|
||||
# Check if we need to reload
|
||||
current_time = time.time()
|
||||
file_mtime = os.path.getmtime(self.file_path)
|
||||
|
||||
cache_valid = (
|
||||
self._df is not None
|
||||
and not force_reload
|
||||
and (current_time - self._last_load_time) < self.cache_ttl
|
||||
and file_mtime == self._file_mtime
|
||||
)
|
||||
|
||||
if cache_valid:
|
||||
return self._df # type: ignore
|
||||
|
||||
# Load from disk
|
||||
logger.info(f"Loading all_cards from {self.file_path}...")
|
||||
start_time = time.time()
|
||||
self._df = pd.read_parquet(self.file_path, engine="pyarrow")
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
self._last_load_time = current_time
|
||||
self._file_mtime = file_mtime
|
||||
|
||||
logger.info(
|
||||
f"Loaded {len(self._df)} cards with {len(self._df.columns)} columns in {elapsed:.3f}s"
|
||||
)
|
||||
|
||||
return self._df
|
||||
|
||||
def get_by_name(self, name: str) -> Optional[pd.Series]:
|
||||
"""
|
||||
Get a single card by exact name match.
|
||||
|
||||
Args:
|
||||
name: Card name to search for
|
||||
|
||||
Returns:
|
||||
Series containing card data, or None if not found
|
||||
"""
|
||||
df = self.load()
|
||||
if "name" not in df.columns:
|
||||
logger.warning("'name' column not found in all_cards")
|
||||
return None
|
||||
|
||||
# Use .loc[] for faster exact match lookup
|
||||
try:
|
||||
matches = df.loc[df["name"] == name]
|
||||
if matches.empty:
|
||||
return None
|
||||
return matches.iloc[0]
|
||||
except (KeyError, IndexError):
|
||||
return None
|
||||
|
||||
def get_by_names(self, names: list[str]) -> pd.DataFrame:
|
||||
"""
|
||||
Get multiple cards by exact name matches (batch lookup).
|
||||
|
||||
Args:
|
||||
names: List of card names to search for
|
||||
|
||||
Returns:
|
||||
DataFrame containing matching cards (may be empty)
|
||||
"""
|
||||
df = self.load()
|
||||
if "name" not in df.columns:
|
||||
logger.warning("'name' column not found in all_cards")
|
||||
return pd.DataFrame()
|
||||
|
||||
return df[df["name"].isin(names)]
|
||||
|
||||
def filter_by_color_identity(self, colors: list[str]) -> pd.DataFrame:
|
||||
"""
|
||||
Filter cards by color identity.
|
||||
|
||||
Args:
|
||||
colors: List of color codes (e.g., ["W", "U"], ["Colorless"], ["G", "R", "U"])
|
||||
|
||||
Returns:
|
||||
DataFrame containing cards matching the color identity
|
||||
"""
|
||||
df = self.load()
|
||||
if "colorIdentity" not in df.columns:
|
||||
logger.warning("'colorIdentity' column not found in all_cards")
|
||||
return pd.DataFrame()
|
||||
|
||||
# Convert colors list to a set for comparison
|
||||
color_set = set(colors)
|
||||
|
||||
# Handle special case for colorless
|
||||
if "Colorless" in color_set or "colorless" in color_set:
|
||||
return df[df["colorIdentity"].isin(["Colorless", "colorless"])]
|
||||
|
||||
# For multi-color searches, match any card that contains those colors
|
||||
# This is a simple exact match - could be enhanced for subset/superset matching
|
||||
if len(colors) == 1:
|
||||
# Single color - exact match
|
||||
return df[df["colorIdentity"] == colors[0]]
|
||||
else:
|
||||
# Multi-color - match any of the provided colors (could be refined)
|
||||
return df[df["colorIdentity"].isin(colors)]
|
||||
|
||||
def filter_by_themes(self, themes: list[str], mode: str = "any") -> pd.DataFrame:
|
||||
"""
|
||||
Filter cards by theme tags.
|
||||
|
||||
Args:
|
||||
themes: List of theme tags to search for
|
||||
mode: "any" (at least one theme) or "all" (must have all themes)
|
||||
|
||||
Returns:
|
||||
DataFrame containing cards matching the theme criteria
|
||||
"""
|
||||
df = self.load()
|
||||
if "themeTags" not in df.columns:
|
||||
logger.warning("'themeTags' column not found in all_cards")
|
||||
return pd.DataFrame()
|
||||
|
||||
if mode == "all":
|
||||
# Card must have all specified themes
|
||||
mask = pd.Series([True] * len(df), index=df.index)
|
||||
for theme in themes:
|
||||
mask &= df["themeTags"].str.contains(theme, case=False, na=False)
|
||||
return df[mask]
|
||||
else:
|
||||
# Card must have at least one of the specified themes (default)
|
||||
mask = pd.Series([False] * len(df), index=df.index)
|
||||
for theme in themes:
|
||||
mask |= df["themeTags"].str.contains(theme, case=False, na=False)
|
||||
return df[mask]
|
||||
|
||||
def search(self, query: str, limit: int = 100) -> pd.DataFrame:
|
||||
"""
|
||||
Simple text search across card name, type, and oracle text.
|
||||
|
||||
Args:
|
||||
query: Search query string
|
||||
limit: Maximum number of results to return
|
||||
|
||||
Returns:
|
||||
DataFrame containing matching cards (up to limit)
|
||||
"""
|
||||
df = self.load()
|
||||
|
||||
# Search across multiple columns
|
||||
mask = pd.Series([False] * len(df), index=df.index)
|
||||
|
||||
if "name" in df.columns:
|
||||
mask |= df["name"].str.contains(query, case=False, na=False)
|
||||
|
||||
if "type" in df.columns:
|
||||
mask |= df["type"].str.contains(query, case=False, na=False)
|
||||
|
||||
if "text" in df.columns:
|
||||
mask |= df["text"].str.contains(query, case=False, na=False)
|
||||
|
||||
results = df[mask]
|
||||
|
||||
if len(results) > limit:
|
||||
return results.head(limit)
|
||||
|
||||
return results
|
||||
|
||||
def filter_by_type(self, type_query: str) -> pd.DataFrame:
|
||||
"""
|
||||
Filter cards by type line (supports partial matching).
|
||||
|
||||
Args:
|
||||
type_query: Type string to search for (e.g., "Creature", "Instant", "Artifact")
|
||||
|
||||
Returns:
|
||||
DataFrame containing cards matching the type
|
||||
"""
|
||||
df = self.load()
|
||||
if "type" not in df.columns:
|
||||
logger.warning("'type' column not found in all_cards")
|
||||
return pd.DataFrame()
|
||||
|
||||
return df[df["type"].str.contains(type_query, case=False, na=False)]
|
||||
|
||||
def get_stats(self) -> dict:
|
||||
"""
|
||||
Get statistics about the loaded card data.
|
||||
|
||||
Returns:
|
||||
Dictionary with card count, column count, file size, and load time
|
||||
"""
|
||||
df = self.load()
|
||||
|
||||
stats = {
|
||||
"total_cards": len(df),
|
||||
"columns": len(df.columns),
|
||||
"file_path": self.file_path,
|
||||
"file_size_mb": (
|
||||
round(os.path.getsize(self.file_path) / (1024 * 1024), 2)
|
||||
if os.path.exists(self.file_path)
|
||||
else 0
|
||||
),
|
||||
"cached": self._df is not None,
|
||||
"cache_age_seconds": int(time.time() - self._last_load_time)
|
||||
if self._last_load_time > 0
|
||||
else None,
|
||||
}
|
||||
|
||||
return stats
|
||||
|
||||
def clear_cache(self) -> None:
|
||||
"""Clear the cached DataFrame, forcing next load to read from disk."""
|
||||
self._df = None
|
||||
self._last_load_time = 0
|
||||
logger.info("Cache cleared")
|
||||
207
code/services/card_query_builder.py
Normal file
207
code/services/card_query_builder.py
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
"""
|
||||
Card Query Builder
|
||||
|
||||
Provides a fluent API for building complex card queries against the consolidated all_cards.parquet.
|
||||
|
||||
Usage:
|
||||
from code.services.card_query_builder import CardQueryBuilder
|
||||
|
||||
# Simple query
|
||||
builder = CardQueryBuilder()
|
||||
cards = builder.colors(["W", "U"]).execute()
|
||||
|
||||
# Complex query
|
||||
cards = (CardQueryBuilder()
|
||||
.colors(["G"])
|
||||
.themes(["tokens"], mode="any")
|
||||
.types("Creature")
|
||||
.limit(20)
|
||||
.execute())
|
||||
|
||||
# Get specific cards
|
||||
cards = CardQueryBuilder().names(["Sol Ring", "Lightning Bolt"]).execute()
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from code.services.all_cards_loader import AllCardsLoader
|
||||
|
||||
|
||||
class CardQueryBuilder:
|
||||
"""Fluent API for building card queries."""
|
||||
|
||||
def __init__(self, loader: Optional[AllCardsLoader] = None) -> None:
|
||||
"""
|
||||
Initialize CardQueryBuilder.
|
||||
|
||||
Args:
|
||||
loader: AllCardsLoader instance (creates default if None)
|
||||
"""
|
||||
self._loader = loader or AllCardsLoader()
|
||||
self._color_filter: Optional[list[str]] = None
|
||||
self._theme_filter: Optional[list[str]] = None
|
||||
self._theme_mode: str = "any"
|
||||
self._type_filter: Optional[str] = None
|
||||
self._name_filter: Optional[list[str]] = None
|
||||
self._search_query: Optional[str] = None
|
||||
self._limit: Optional[int] = None
|
||||
|
||||
def colors(self, colors: list[str]) -> CardQueryBuilder:
|
||||
"""
|
||||
Filter by color identity.
|
||||
|
||||
Args:
|
||||
colors: List of color codes (e.g., ["W", "U"])
|
||||
|
||||
Returns:
|
||||
Self for chaining
|
||||
"""
|
||||
self._color_filter = colors
|
||||
return self
|
||||
|
||||
def themes(self, themes: list[str], mode: str = "any") -> CardQueryBuilder:
|
||||
"""
|
||||
Filter by theme tags.
|
||||
|
||||
Args:
|
||||
themes: List of theme tags
|
||||
mode: "any" (at least one) or "all" (must have all)
|
||||
|
||||
Returns:
|
||||
Self for chaining
|
||||
"""
|
||||
self._theme_filter = themes
|
||||
self._theme_mode = mode
|
||||
return self
|
||||
|
||||
def types(self, type_query: str) -> CardQueryBuilder:
|
||||
"""
|
||||
Filter by type line (partial match).
|
||||
|
||||
Args:
|
||||
type_query: Type string to search for
|
||||
|
||||
Returns:
|
||||
Self for chaining
|
||||
"""
|
||||
self._type_filter = type_query
|
||||
return self
|
||||
|
||||
def names(self, names: list[str]) -> CardQueryBuilder:
|
||||
"""
|
||||
Filter by specific card names (batch lookup).
|
||||
|
||||
Args:
|
||||
names: List of card names
|
||||
|
||||
Returns:
|
||||
Self for chaining
|
||||
"""
|
||||
self._name_filter = names
|
||||
return self
|
||||
|
||||
def search(self, query: str) -> CardQueryBuilder:
|
||||
"""
|
||||
Add text search across name, type, and oracle text.
|
||||
|
||||
Args:
|
||||
query: Search query string
|
||||
|
||||
Returns:
|
||||
Self for chaining
|
||||
"""
|
||||
self._search_query = query
|
||||
return self
|
||||
|
||||
def limit(self, limit: int) -> CardQueryBuilder:
|
||||
"""
|
||||
Limit number of results.
|
||||
|
||||
Args:
|
||||
limit: Maximum number of results
|
||||
|
||||
Returns:
|
||||
Self for chaining
|
||||
"""
|
||||
self._limit = limit
|
||||
return self
|
||||
|
||||
def execute(self) -> pd.DataFrame:
|
||||
"""
|
||||
Execute the query and return results.
|
||||
|
||||
Returns:
|
||||
DataFrame containing matching cards
|
||||
"""
|
||||
# Start with all cards or specific names
|
||||
if self._name_filter:
|
||||
df = self._loader.get_by_names(self._name_filter)
|
||||
else:
|
||||
df = self._loader.load()
|
||||
|
||||
# Apply color filter
|
||||
if self._color_filter:
|
||||
color_results = self._loader.filter_by_color_identity(self._color_filter)
|
||||
df = df[df.index.isin(color_results.index)]
|
||||
|
||||
# Apply theme filter
|
||||
if self._theme_filter:
|
||||
theme_results = self._loader.filter_by_themes(self._theme_filter, mode=self._theme_mode)
|
||||
df = df[df.index.isin(theme_results.index)]
|
||||
|
||||
# Apply type filter
|
||||
if self._type_filter:
|
||||
type_results = self._loader.filter_by_type(self._type_filter)
|
||||
df = df[df.index.isin(type_results.index)]
|
||||
|
||||
# Apply text search
|
||||
if self._search_query:
|
||||
search_results = self._loader.search(self._search_query, limit=999999)
|
||||
df = df[df.index.isin(search_results.index)]
|
||||
|
||||
# Apply limit
|
||||
if self._limit and len(df) > self._limit:
|
||||
df = df.head(self._limit)
|
||||
|
||||
return df
|
||||
|
||||
def count(self) -> int:
|
||||
"""
|
||||
Count results without returning full DataFrame.
|
||||
|
||||
Returns:
|
||||
Number of matching cards
|
||||
"""
|
||||
return len(self.execute())
|
||||
|
||||
def first(self) -> Optional[pd.Series]:
|
||||
"""
|
||||
Get first result only.
|
||||
|
||||
Returns:
|
||||
First matching card as Series, or None if no results
|
||||
"""
|
||||
results = self.execute()
|
||||
if results.empty:
|
||||
return None
|
||||
return results.iloc[0]
|
||||
|
||||
def reset(self) -> CardQueryBuilder:
|
||||
"""
|
||||
Reset all filters.
|
||||
|
||||
Returns:
|
||||
Self for chaining
|
||||
"""
|
||||
self._color_filter = None
|
||||
self._theme_filter = None
|
||||
self._theme_mode = "any"
|
||||
self._type_filter = None
|
||||
self._name_filter = None
|
||||
self._search_query = None
|
||||
self._limit = None
|
||||
return self
|
||||
281
code/services/legacy_loader_adapter.py
Normal file
281
code/services/legacy_loader_adapter.py
Normal file
|
|
@ -0,0 +1,281 @@
|
|||
"""
|
||||
Legacy Loader Adapter
|
||||
|
||||
Provides backward-compatible wrapper functions around AllCardsLoader for smooth migration.
|
||||
Existing code can continue using old file-loading patterns while benefiting from
|
||||
the new consolidated Parquet backend.
|
||||
|
||||
This adapter will be maintained through v3.0.x and deprecated in v3.1+.
|
||||
|
||||
Usage:
|
||||
# Old code (still works):
|
||||
from code.services.legacy_loader_adapter import load_cards_by_type
|
||||
creatures = load_cards_by_type("Creature")
|
||||
|
||||
# New code (preferred):
|
||||
from code.services.all_cards_loader import AllCardsLoader
|
||||
loader = AllCardsLoader()
|
||||
creatures = loader.filter_by_type("Creature")
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import warnings
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from code.logging_util import get_logger
|
||||
from code.services.all_cards_loader import AllCardsLoader
|
||||
from code.settings import USE_ALL_CARDS_FILE
|
||||
|
||||
# Initialize logger
|
||||
logger = get_logger(__name__)
|
||||
|
||||
# Shared loader instance for performance
|
||||
_shared_loader: Optional[AllCardsLoader] = None
|
||||
|
||||
|
||||
def _get_loader() -> AllCardsLoader:
|
||||
"""Get or create shared AllCardsLoader instance."""
|
||||
global _shared_loader
|
||||
if _shared_loader is None:
|
||||
_shared_loader = AllCardsLoader()
|
||||
return _shared_loader
|
||||
|
||||
|
||||
def _deprecation_warning(func_name: str, replacement: str) -> None:
|
||||
"""Log deprecation warning for legacy functions."""
|
||||
warnings.warn(
|
||||
f"{func_name} is deprecated and will be removed in v3.1+. "
|
||||
f"Use {replacement} instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=3,
|
||||
)
|
||||
logger.warning(
|
||||
f"DEPRECATION: {func_name} called. Migrate to {replacement} before v3.1+"
|
||||
)
|
||||
|
||||
|
||||
def load_all_cards(use_cache: bool = True) -> pd.DataFrame:
|
||||
"""
|
||||
Load all cards from consolidated Parquet file.
|
||||
|
||||
Legacy function for backward compatibility.
|
||||
|
||||
Args:
|
||||
use_cache: Whether to use cached data (default: True)
|
||||
|
||||
Returns:
|
||||
DataFrame containing all cards
|
||||
|
||||
Deprecated:
|
||||
Use AllCardsLoader().load() instead.
|
||||
"""
|
||||
_deprecation_warning("load_all_cards()", "AllCardsLoader().load()")
|
||||
|
||||
if not USE_ALL_CARDS_FILE:
|
||||
logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
|
||||
return pd.DataFrame()
|
||||
|
||||
loader = _get_loader()
|
||||
return loader.load(force_reload=not use_cache)
|
||||
|
||||
|
||||
def load_cards_by_name(name: str) -> Optional[pd.Series]:
|
||||
"""
|
||||
Load a single card by exact name match.
|
||||
|
||||
Legacy function for backward compatibility.
|
||||
|
||||
Args:
|
||||
name: Card name to search for
|
||||
|
||||
Returns:
|
||||
Series containing card data, or None if not found
|
||||
|
||||
Deprecated:
|
||||
Use AllCardsLoader().get_by_name() instead.
|
||||
"""
|
||||
_deprecation_warning("load_cards_by_name()", "AllCardsLoader().get_by_name()")
|
||||
|
||||
if not USE_ALL_CARDS_FILE:
|
||||
logger.warning("USE_ALL_CARDS_FILE is disabled, returning None")
|
||||
return None
|
||||
|
||||
loader = _get_loader()
|
||||
return loader.get_by_name(name)
|
||||
|
||||
|
||||
def load_cards_by_names(names: list[str]) -> pd.DataFrame:
|
||||
"""
|
||||
Load multiple cards by exact name matches.
|
||||
|
||||
Legacy function for backward compatibility.
|
||||
|
||||
Args:
|
||||
names: List of card names to search for
|
||||
|
||||
Returns:
|
||||
DataFrame containing matching cards
|
||||
|
||||
Deprecated:
|
||||
Use AllCardsLoader().get_by_names() instead.
|
||||
"""
|
||||
_deprecation_warning("load_cards_by_names()", "AllCardsLoader().get_by_names()")
|
||||
|
||||
if not USE_ALL_CARDS_FILE:
|
||||
logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
|
||||
return pd.DataFrame()
|
||||
|
||||
loader = _get_loader()
|
||||
return loader.get_by_names(names)
|
||||
|
||||
|
||||
def load_cards_by_type(type_str: str) -> pd.DataFrame:
|
||||
"""
|
||||
Load cards by type line (partial match).
|
||||
|
||||
Legacy function for backward compatibility.
|
||||
|
||||
Args:
|
||||
type_str: Type string to search for (e.g., "Creature", "Instant")
|
||||
|
||||
Returns:
|
||||
DataFrame containing cards matching the type
|
||||
|
||||
Deprecated:
|
||||
Use AllCardsLoader().filter_by_type() instead.
|
||||
"""
|
||||
_deprecation_warning("load_cards_by_type()", "AllCardsLoader().filter_by_type()")
|
||||
|
||||
if not USE_ALL_CARDS_FILE:
|
||||
logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
|
||||
return pd.DataFrame()
|
||||
|
||||
loader = _get_loader()
|
||||
return loader.filter_by_type(type_str)
|
||||
|
||||
|
||||
def load_cards_with_tag(tag: str) -> pd.DataFrame:
|
||||
"""
|
||||
Load cards containing a specific theme tag.
|
||||
|
||||
Legacy function for backward compatibility.
|
||||
|
||||
Args:
|
||||
tag: Theme tag to search for
|
||||
|
||||
Returns:
|
||||
DataFrame containing cards with the tag
|
||||
|
||||
Deprecated:
|
||||
Use AllCardsLoader().filter_by_themes() instead.
|
||||
"""
|
||||
_deprecation_warning("load_cards_with_tag()", "AllCardsLoader().filter_by_themes()")
|
||||
|
||||
if not USE_ALL_CARDS_FILE:
|
||||
logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
|
||||
return pd.DataFrame()
|
||||
|
||||
loader = _get_loader()
|
||||
return loader.filter_by_themes([tag], mode="any")
|
||||
|
||||
|
||||
def load_cards_with_tags(tags: list[str], require_all: bool = False) -> pd.DataFrame:
|
||||
"""
|
||||
Load cards containing theme tags.
|
||||
|
||||
Legacy function for backward compatibility.
|
||||
|
||||
Args:
|
||||
tags: List of theme tags to search for
|
||||
require_all: If True, card must have all tags; if False, at least one tag
|
||||
|
||||
Returns:
|
||||
DataFrame containing cards matching the tag criteria
|
||||
|
||||
Deprecated:
|
||||
Use AllCardsLoader().filter_by_themes() instead.
|
||||
"""
|
||||
_deprecation_warning(
|
||||
"load_cards_with_tags()", "AllCardsLoader().filter_by_themes()"
|
||||
)
|
||||
|
||||
if not USE_ALL_CARDS_FILE:
|
||||
logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
|
||||
return pd.DataFrame()
|
||||
|
||||
loader = _get_loader()
|
||||
mode = "all" if require_all else "any"
|
||||
return loader.filter_by_themes(tags, mode=mode)
|
||||
|
||||
|
||||
def load_cards_by_color_identity(colors: list[str]) -> pd.DataFrame:
|
||||
"""
|
||||
Load cards by color identity.
|
||||
|
||||
Legacy function for backward compatibility.
|
||||
|
||||
Args:
|
||||
colors: List of color codes (e.g., ["W", "U"])
|
||||
|
||||
Returns:
|
||||
DataFrame containing cards matching the color identity
|
||||
|
||||
Deprecated:
|
||||
Use AllCardsLoader().filter_by_color_identity() instead.
|
||||
"""
|
||||
_deprecation_warning(
|
||||
"load_cards_by_color_identity()", "AllCardsLoader().filter_by_color_identity()"
|
||||
)
|
||||
|
||||
if not USE_ALL_CARDS_FILE:
|
||||
logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
|
||||
return pd.DataFrame()
|
||||
|
||||
loader = _get_loader()
|
||||
return loader.filter_by_color_identity(colors)
|
||||
|
||||
|
||||
def search_cards(query: str, limit: int = 100) -> pd.DataFrame:
|
||||
"""
|
||||
Search cards by text query.
|
||||
|
||||
Legacy function for backward compatibility.
|
||||
|
||||
Args:
|
||||
query: Search query string
|
||||
limit: Maximum number of results
|
||||
|
||||
Returns:
|
||||
DataFrame containing matching cards
|
||||
|
||||
Deprecated:
|
||||
Use AllCardsLoader().search() instead.
|
||||
"""
|
||||
_deprecation_warning("search_cards()", "AllCardsLoader().search()")
|
||||
|
||||
if not USE_ALL_CARDS_FILE:
|
||||
logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
|
||||
return pd.DataFrame()
|
||||
|
||||
loader = _get_loader()
|
||||
return loader.search(query, limit=limit)
|
||||
|
||||
|
||||
def clear_card_cache() -> None:
|
||||
"""
|
||||
Clear the cached card data, forcing next load to read from disk.
|
||||
|
||||
Legacy function for backward compatibility.
|
||||
|
||||
Deprecated:
|
||||
Use AllCardsLoader().clear_cache() instead.
|
||||
"""
|
||||
_deprecation_warning("clear_card_cache()", "AllCardsLoader().clear_cache()")
|
||||
|
||||
global _shared_loader
|
||||
if _shared_loader is not None:
|
||||
_shared_loader.clear_cache()
|
||||
_shared_loader = None
|
||||
|
|
@ -94,6 +94,7 @@ MAIN_MENU_ITEMS: List[str] = ['Build A Deck', 'Setup CSV Files', 'Tag CSV Files'
|
|||
SETUP_MENU_ITEMS: List[str] = ['Initial Setup', 'Regenerate CSV', 'Main Menu']
|
||||
|
||||
CSV_DIRECTORY: str = 'csv_files'
|
||||
CARD_FILES_DIRECTORY: str = 'card_files' # Parquet files for consolidated card data
|
||||
|
||||
# Configuration for handling null/NA values in DataFrame columns
|
||||
FILL_NA_COLUMNS: Dict[str, Optional[str]] = {
|
||||
|
|
@ -101,6 +102,14 @@ FILL_NA_COLUMNS: Dict[str, Optional[str]] = {
|
|||
'faceName': None # Use card's name column value when face name is not available
|
||||
}
|
||||
|
||||
# ----------------------------------------------------------------------------------
|
||||
# ALL CARDS CONSOLIDATION FEATURE FLAG
|
||||
# ----------------------------------------------------------------------------------
|
||||
|
||||
# Enable use of consolidated all_cards.parquet file (default: True)
|
||||
# Set to False to disable and fall back to individual CSV file loading
|
||||
USE_ALL_CARDS_FILE = os.getenv('USE_ALL_CARDS_FILE', '1').lower() not in ('0', 'false', 'off', 'disabled')
|
||||
|
||||
# ----------------------------------------------------------------------------------
|
||||
# TAGGING REFINEMENT FEATURE FLAGS (M1-M5)
|
||||
# ----------------------------------------------------------------------------------
|
||||
|
|
|
|||
408
code/tests/test_all_cards_loader.py
Normal file
408
code/tests/test_all_cards_loader.py
Normal file
|
|
@ -0,0 +1,408 @@
|
|||
"""
|
||||
Tests for AllCardsLoader and CardQueryBuilder
|
||||
|
||||
Tests cover:
|
||||
- Loading and caching behavior
|
||||
- Single and batch card lookups
|
||||
- Color, theme, and type filtering
|
||||
- Text search
|
||||
- Query builder fluent API
|
||||
- Performance benchmarks
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from code.services.all_cards_loader import AllCardsLoader
|
||||
from code.services.card_query_builder import CardQueryBuilder
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_cards_df():
|
||||
"""Create a sample DataFrame for testing."""
|
||||
return pd.DataFrame(
|
||||
{
|
||||
"name": [
|
||||
"Sol Ring",
|
||||
"Lightning Bolt",
|
||||
"Counterspell",
|
||||
"Giant Growth",
|
||||
"Goblin Token Maker",
|
||||
"Dark Ritual",
|
||||
"Swords to Plowshares",
|
||||
"Birds of Paradise",
|
||||
],
|
||||
"colorIdentity": ["Colorless", "R", "U", "G", "R", "B", "W", "G"],
|
||||
"type": [
|
||||
"Artifact",
|
||||
"Instant",
|
||||
"Instant",
|
||||
"Instant",
|
||||
"Creature — Goblin",
|
||||
"Instant",
|
||||
"Instant",
|
||||
"Creature — Bird",
|
||||
],
|
||||
"text": [
|
||||
"Add two mana",
|
||||
"Deal 3 damage",
|
||||
"Counter target spell",
|
||||
"Target creature gets +3/+3",
|
||||
"When this enters, create two 1/1 red Goblin creature tokens",
|
||||
"Add three black mana",
|
||||
"Exile target creature",
|
||||
"Flying, Add one mana of any color",
|
||||
],
|
||||
"themeTags": [
|
||||
"",
|
||||
"burn,damage",
|
||||
"control,counterspells",
|
||||
"combat,pump",
|
||||
"tokens,goblins",
|
||||
"ritual,fast-mana",
|
||||
"removal,exile",
|
||||
"ramp,mana-dork",
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_parquet_file(sample_cards_df):
|
||||
"""Create a temporary Parquet file for testing."""
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") as tmp:
|
||||
sample_cards_df.to_parquet(tmp.name, engine="pyarrow")
|
||||
yield tmp.name
|
||||
os.unlink(tmp.name)
|
||||
|
||||
|
||||
def test_loader_initialization(sample_parquet_file):
|
||||
"""Test AllCardsLoader initialization."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file, cache_ttl=60)
|
||||
assert loader.file_path == sample_parquet_file
|
||||
assert loader.cache_ttl == 60
|
||||
assert loader._df is None
|
||||
|
||||
|
||||
def test_loader_load(sample_parquet_file):
|
||||
"""Test loading Parquet file."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
df = loader.load()
|
||||
assert len(df) == 8
|
||||
assert "name" in df.columns
|
||||
assert "colorIdentity" in df.columns
|
||||
|
||||
|
||||
def test_loader_caching(sample_parquet_file):
|
||||
"""Test that caching works and doesn't reload unnecessarily."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file, cache_ttl=300)
|
||||
|
||||
# First load
|
||||
start_time = time.time()
|
||||
df1 = loader.load()
|
||||
first_load_time = time.time() - start_time
|
||||
|
||||
# Second load (should use cache)
|
||||
start_time = time.time()
|
||||
df2 = loader.load()
|
||||
cached_load_time = time.time() - start_time
|
||||
|
||||
# Cache should be much faster
|
||||
assert cached_load_time < first_load_time / 2
|
||||
assert df1 is df2 # Same object
|
||||
|
||||
|
||||
def test_loader_force_reload(sample_parquet_file):
|
||||
"""Test force_reload flag."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
|
||||
df1 = loader.load()
|
||||
df2 = loader.load(force_reload=True)
|
||||
|
||||
assert df1 is not df2 # Different objects
|
||||
assert len(df1) == len(df2) # Same data
|
||||
|
||||
|
||||
def test_loader_cache_expiration(sample_parquet_file):
|
||||
"""Test cache expiration after TTL."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file, cache_ttl=1)
|
||||
|
||||
df1 = loader.load()
|
||||
time.sleep(1.1) # Wait for TTL to expire
|
||||
df2 = loader.load()
|
||||
|
||||
assert df1 is not df2 # Should have reloaded
|
||||
|
||||
|
||||
def test_get_by_name(sample_parquet_file):
|
||||
"""Test single card lookup by name."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
|
||||
card = loader.get_by_name("Sol Ring")
|
||||
assert card is not None
|
||||
assert card["name"] == "Sol Ring"
|
||||
assert card["colorIdentity"] == "Colorless"
|
||||
|
||||
# Non-existent card
|
||||
card = loader.get_by_name("Nonexistent Card")
|
||||
assert card is None
|
||||
|
||||
|
||||
def test_get_by_names(sample_parquet_file):
|
||||
"""Test batch card lookup by names."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
|
||||
cards = loader.get_by_names(["Sol Ring", "Lightning Bolt", "Counterspell"])
|
||||
assert len(cards) == 3
|
||||
assert "Sol Ring" in cards["name"].values
|
||||
assert "Lightning Bolt" in cards["name"].values
|
||||
|
||||
# Empty list
|
||||
cards = loader.get_by_names([])
|
||||
assert len(cards) == 0
|
||||
|
||||
# Non-existent cards
|
||||
cards = loader.get_by_names(["Nonexistent1", "Nonexistent2"])
|
||||
assert len(cards) == 0
|
||||
|
||||
|
||||
def test_filter_by_color_identity(sample_parquet_file):
|
||||
"""Test color identity filtering."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
|
||||
# Single color
|
||||
red_cards = loader.filter_by_color_identity(["R"])
|
||||
assert len(red_cards) == 2
|
||||
assert "Lightning Bolt" in red_cards["name"].values
|
||||
assert "Goblin Token Maker" in red_cards["name"].values
|
||||
|
||||
# Colorless
|
||||
colorless = loader.filter_by_color_identity(["Colorless"])
|
||||
assert len(colorless) == 1
|
||||
assert colorless["name"].values[0] == "Sol Ring"
|
||||
|
||||
|
||||
def test_filter_by_themes(sample_parquet_file):
|
||||
"""Test theme filtering."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
|
||||
# Single theme
|
||||
token_cards = loader.filter_by_themes(["tokens"], mode="any")
|
||||
assert len(token_cards) == 1
|
||||
assert token_cards["name"].values[0] == "Goblin Token Maker"
|
||||
|
||||
# Multiple themes (any)
|
||||
cards = loader.filter_by_themes(["burn", "removal"], mode="any")
|
||||
assert len(cards) == 2 # Lightning Bolt and Swords to Plowshares
|
||||
|
||||
# Multiple themes (all)
|
||||
cards = loader.filter_by_themes(["tokens", "goblins"], mode="all")
|
||||
assert len(cards) == 1
|
||||
assert cards["name"].values[0] == "Goblin Token Maker"
|
||||
|
||||
|
||||
def test_filter_by_type(sample_parquet_file):
|
||||
"""Test type filtering."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
|
||||
creatures = loader.filter_by_type("Creature")
|
||||
assert len(creatures) == 2
|
||||
assert "Goblin Token Maker" in creatures["name"].values
|
||||
assert "Birds of Paradise" in creatures["name"].values
|
||||
|
||||
instants = loader.filter_by_type("Instant")
|
||||
assert len(instants) == 5
|
||||
|
||||
|
||||
def test_search(sample_parquet_file):
|
||||
"""Test text search."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
|
||||
# Search in text
|
||||
results = loader.search("token")
|
||||
assert len(results) >= 1
|
||||
assert "Goblin Token Maker" in results["name"].values
|
||||
|
||||
# Search in name
|
||||
results = loader.search("Sol")
|
||||
assert len(results) == 1
|
||||
assert results["name"].values[0] == "Sol Ring"
|
||||
|
||||
# Limit results
|
||||
results = loader.search("mana", limit=1)
|
||||
assert len(results) == 1
|
||||
|
||||
|
||||
def test_get_stats(sample_parquet_file):
|
||||
"""Test stats retrieval."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
loader.load()
|
||||
|
||||
stats = loader.get_stats()
|
||||
assert stats["total_cards"] == 8
|
||||
assert stats["cached"] is True
|
||||
assert stats["file_size_mb"] >= 0 # Small test file may round to 0
|
||||
assert "cache_age_seconds" in stats
|
||||
|
||||
|
||||
def test_clear_cache(sample_parquet_file):
|
||||
"""Test cache clearing."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
loader.load()
|
||||
|
||||
assert loader._df is not None
|
||||
loader.clear_cache()
|
||||
assert loader._df is None
|
||||
|
||||
|
||||
def test_query_builder_basic(sample_parquet_file):
|
||||
"""Test basic query builder usage."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
builder = CardQueryBuilder(loader=loader)
|
||||
|
||||
# Execute without filters
|
||||
results = builder.execute()
|
||||
assert len(results) == 8
|
||||
|
||||
# Single filter
|
||||
results = builder.reset().colors(["R"]).execute()
|
||||
assert len(results) == 2
|
||||
|
||||
|
||||
def test_query_builder_chaining(sample_parquet_file):
|
||||
"""Test query builder method chaining."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
|
||||
results = (
|
||||
CardQueryBuilder(loader=loader)
|
||||
.types("Creature")
|
||||
.themes(["tokens"], mode="any")
|
||||
.execute()
|
||||
)
|
||||
assert len(results) == 1
|
||||
assert results["name"].values[0] == "Goblin Token Maker"
|
||||
|
||||
|
||||
def test_query_builder_names(sample_parquet_file):
|
||||
"""Test query builder with specific names."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
|
||||
results = (
|
||||
CardQueryBuilder(loader=loader)
|
||||
.names(["Sol Ring", "Lightning Bolt"])
|
||||
.execute()
|
||||
)
|
||||
assert len(results) == 2
|
||||
|
||||
|
||||
def test_query_builder_limit(sample_parquet_file):
|
||||
"""Test query builder limit."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
|
||||
results = CardQueryBuilder(loader=loader).limit(3).execute()
|
||||
assert len(results) == 3
|
||||
|
||||
|
||||
def test_query_builder_count(sample_parquet_file):
|
||||
"""Test query builder count method."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
|
||||
count = CardQueryBuilder(loader=loader).types("Instant").count()
|
||||
assert count == 5
|
||||
|
||||
|
||||
def test_query_builder_first(sample_parquet_file):
|
||||
"""Test query builder first method."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
|
||||
card = CardQueryBuilder(loader=loader).colors(["R"]).first()
|
||||
assert card is not None
|
||||
assert card["colorIdentity"] == "R"
|
||||
|
||||
# No results
|
||||
card = CardQueryBuilder(loader=loader).colors(["X"]).first()
|
||||
assert card is None
|
||||
|
||||
|
||||
def test_query_builder_complex(sample_parquet_file):
|
||||
"""Test complex query with multiple filters."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
|
||||
results = (
|
||||
CardQueryBuilder(loader=loader)
|
||||
.types("Instant")
|
||||
.colors(["R"])
|
||||
.search("damage")
|
||||
.limit(5)
|
||||
.execute()
|
||||
)
|
||||
assert len(results) == 1
|
||||
assert results["name"].values[0] == "Lightning Bolt"
|
||||
|
||||
|
||||
def test_performance_single_lookup(sample_parquet_file):
|
||||
"""Benchmark single card lookup performance."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
loader.load() # Warm up cache
|
||||
|
||||
start = time.time()
|
||||
for _ in range(100):
|
||||
loader.get_by_name("Sol Ring")
|
||||
elapsed = time.time() - start
|
||||
|
||||
avg_time_ms = (elapsed / 100) * 1000
|
||||
print(f"\nSingle lookup avg: {avg_time_ms:.3f}ms")
|
||||
assert avg_time_ms < 10 # Should be <10ms per lookup
|
||||
|
||||
|
||||
def test_performance_batch_lookup(sample_parquet_file):
|
||||
"""Benchmark batch card lookup performance."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
loader.load() # Warm up cache
|
||||
|
||||
names = ["Sol Ring", "Lightning Bolt", "Counterspell"]
|
||||
|
||||
start = time.time()
|
||||
for _ in range(100):
|
||||
loader.get_by_names(names)
|
||||
elapsed = time.time() - start
|
||||
|
||||
avg_time_ms = (elapsed / 100) * 1000
|
||||
print(f"\nBatch lookup (3 cards) avg: {avg_time_ms:.3f}ms")
|
||||
assert avg_time_ms < 15 # Should be <15ms per batch
|
||||
|
||||
|
||||
def test_performance_filter_by_color(sample_parquet_file):
|
||||
"""Benchmark color filtering performance."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
loader.load() # Warm up cache
|
||||
|
||||
start = time.time()
|
||||
for _ in range(100):
|
||||
loader.filter_by_color_identity(["R"])
|
||||
elapsed = time.time() - start
|
||||
|
||||
avg_time_ms = (elapsed / 100) * 1000
|
||||
print(f"\nColor filter avg: {avg_time_ms:.3f}ms")
|
||||
assert avg_time_ms < 20 # Should be <20ms per filter
|
||||
|
||||
|
||||
def test_performance_search(sample_parquet_file):
|
||||
"""Benchmark text search performance."""
|
||||
loader = AllCardsLoader(file_path=sample_parquet_file)
|
||||
loader.load() # Warm up cache
|
||||
|
||||
start = time.time()
|
||||
for _ in range(100):
|
||||
loader.search("token", limit=100)
|
||||
elapsed = time.time() - start
|
||||
|
||||
avg_time_ms = (elapsed / 100) * 1000
|
||||
print(f"\nText search avg: {avg_time_ms:.3f}ms")
|
||||
assert avg_time_ms < 50 # Should be <50ms per search
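
The performance tests above print their timings, so they read best with pytest's output capture turned off. A minimal invocation, assuming the module is saved as code/tests/test_all_cards_loader.py alongside the other new tests (the filename is an assumption; it is not shown in this part of the diff):

pytest code/tests/test_all_cards_loader.py -k performance -s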
340
code/tests/test_card_aggregator.py
Normal file

@@ -0,0 +1,340 @@
"""
Tests for Card Aggregator

Tests the CardAggregator class functionality including:
- Full aggregation of multiple CSV files
- Deduplication (keeping most recent)
- Exclusion of master files (cards.csv, commander_cards.csv)
- Validation of output
- Version rotation
"""

from __future__ import annotations

import json
import os
import tempfile
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
import pytest

from code.file_setup.card_aggregator import CardAggregator


@pytest.fixture
def temp_dirs():
    """Create temporary directories for testing."""
    with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as output_dir:
        yield source_dir, output_dir


@pytest.fixture
def sample_card_data():
    """Sample card data for testing."""
    return {
        "name": ["Sol Ring", "Lightning Bolt", "Counterspell"],
        "faceName": ["Sol Ring", "Lightning Bolt", "Counterspell"],
        "colorIdentity": ["Colorless", "R", "U"],
        "manaCost": ["{1}", "{R}", "{U}{U}"],
        "manaValue": [1, 1, 2],
        "type": ["Artifact", "Instant", "Instant"],
        "text": [
            "Add two colorless mana",
            "Deal 3 damage",
            "Counter target spell",
        ],
    }


def test_ensure_output_dir(temp_dirs):
    """Test that output directory is created."""
    _, output_dir = temp_dirs
    aggregator = CardAggregator(output_dir=output_dir)

    assert os.path.exists(output_dir)
    assert aggregator.output_dir == output_dir


def test_get_card_csvs_excludes_master_files(temp_dirs):
    """Test that cards.csv and commander_cards.csv are excluded."""
    source_dir, _ = temp_dirs

    # Create test files
    Path(source_dir, "cards.csv").touch()
    Path(source_dir, "commander_cards.csv").touch()
    Path(source_dir, "blue_cards.csv").touch()
    Path(source_dir, "red_cards.csv").touch()
    Path(source_dir, ".temp_cards.csv").touch()
    Path(source_dir, "_temp_cards.csv").touch()

    aggregator = CardAggregator()
    csv_files = aggregator.get_card_csvs(source_dir)

    # Should only include blue_cards.csv and red_cards.csv
    basenames = [os.path.basename(f) for f in csv_files]
    assert "blue_cards.csv" in basenames
    assert "red_cards.csv" in basenames
    assert "cards.csv" not in basenames
    assert "commander_cards.csv" not in basenames
    assert ".temp_cards.csv" not in basenames
    assert "_temp_cards.csv" not in basenames
    assert len(csv_files) == 2


def test_deduplicate_cards(sample_card_data):
    """Test that duplicate cards are removed, keeping the last occurrence."""
    # Create DataFrame with duplicates
    df = pd.DataFrame(sample_card_data)

    # Add duplicate Sol Ring with different text
    duplicate_data = {
        "name": ["Sol Ring"],
        "faceName": ["Sol Ring"],
        "colorIdentity": ["Colorless"],
        "manaCost": ["{1}"],
        "manaValue": [1],
        "type": ["Artifact"],
        "text": ["Add two colorless mana (updated)"],
    }
    df_duplicate = pd.DataFrame(duplicate_data)
    df_combined = pd.concat([df, df_duplicate], ignore_index=True)

    # Should have 4 rows before deduplication
    assert len(df_combined) == 4

    aggregator = CardAggregator()
    df_deduped = aggregator.deduplicate_cards(df_combined)

    # Should have 3 rows after deduplication
    assert len(df_deduped) == 3

    # Should keep the last Sol Ring (updated text)
    sol_ring = df_deduped[df_deduped["name"] == "Sol Ring"].iloc[0]
    assert "updated" in sol_ring["text"]


def test_aggregate_all(temp_dirs, sample_card_data):
    """Test full aggregation of multiple CSV files."""
    source_dir, output_dir = temp_dirs

    # Create test CSV files
    df1 = pd.DataFrame(
        {
            "name": ["Sol Ring", "Lightning Bolt"],
            "faceName": ["Sol Ring", "Lightning Bolt"],
            "colorIdentity": ["Colorless", "R"],
            "manaCost": ["{1}", "{R}"],
            "manaValue": [1, 1],
            "type": ["Artifact", "Instant"],
            "text": ["Add two colorless mana", "Deal 3 damage"],
        }
    )

    df2 = pd.DataFrame(
        {
            "name": ["Counterspell", "Path to Exile"],
            "faceName": ["Counterspell", "Path to Exile"],
            "colorIdentity": ["U", "W"],
            "manaCost": ["{U}{U}", "{W}"],
            "manaValue": [2, 1],
            "type": ["Instant", "Instant"],
            "text": ["Counter target spell", "Exile target creature"],
        }
    )

    df1.to_csv(os.path.join(source_dir, "blue_cards.csv"), index=False)
    df2.to_csv(os.path.join(source_dir, "white_cards.csv"), index=False)

    # Create excluded files (should be ignored)
    df1.to_csv(os.path.join(source_dir, "cards.csv"), index=False)
    df1.to_csv(os.path.join(source_dir, "commander_cards.csv"), index=False)

    # Aggregate
    aggregator = CardAggregator(output_dir=output_dir)
    output_path = os.path.join(output_dir, "all_cards.parquet")
    stats = aggregator.aggregate_all(source_dir, output_path)

    # Verify stats
    assert stats["files_processed"] == 2  # Only 2 files (excluded 2)
    assert stats["total_cards"] == 4  # 2 + 2 cards
    assert stats["duplicates_removed"] == 0
    assert os.path.exists(output_path)

    # Verify output
    df_result = pd.read_parquet(output_path)
    assert len(df_result) == 4
    assert "Sol Ring" in df_result["name"].values
    assert "Counterspell" in df_result["name"].values


def test_aggregate_with_duplicates(temp_dirs):
    """Test aggregation with duplicate cards across files."""
    source_dir, output_dir = temp_dirs

    # Create two files with the same card
    df1 = pd.DataFrame(
        {
            "name": ["Sol Ring"],
            "faceName": ["Sol Ring"],
            "colorIdentity": ["Colorless"],
            "manaCost": ["{1}"],
            "manaValue": [1],
            "type": ["Artifact"],
            "text": ["Version 1"],
        }
    )

    df2 = pd.DataFrame(
        {
            "name": ["Sol Ring"],
            "faceName": ["Sol Ring"],
            "colorIdentity": ["Colorless"],
            "manaCost": ["{1}"],
            "manaValue": [1],
            "type": ["Artifact"],
            "text": ["Version 2 (newer)"],
        }
    )

    # Write file1 first, then file2 (file2 is newer)
    file1 = os.path.join(source_dir, "file1.csv")
    file2 = os.path.join(source_dir, "file2.csv")
    df1.to_csv(file1, index=False)
    df2.to_csv(file2, index=False)

    # Make file2 newer by touching it
    os.utime(file2, (datetime.now().timestamp() + 1, datetime.now().timestamp() + 1))

    # Aggregate
    aggregator = CardAggregator(output_dir=output_dir)
    output_path = os.path.join(output_dir, "all_cards.parquet")
    stats = aggregator.aggregate_all(source_dir, output_path)

    # Should have removed 1 duplicate
    assert stats["duplicates_removed"] == 1
    assert stats["total_cards"] == 1

    # Should keep the newer version (file2)
    df_result = pd.read_parquet(output_path)
    assert "Version 2 (newer)" in df_result["text"].iloc[0]


def test_validate_output(temp_dirs, sample_card_data):
    """Test output validation."""
    source_dir, output_dir = temp_dirs

    # Create and aggregate test data
    df = pd.DataFrame(sample_card_data)
    df.to_csv(os.path.join(source_dir, "test_cards.csv"), index=False)

    aggregator = CardAggregator(output_dir=output_dir)
    output_path = os.path.join(output_dir, "all_cards.parquet")
    aggregator.aggregate_all(source_dir, output_path)

    # Validate
    is_valid, errors = aggregator.validate_output(output_path, source_dir)

    assert is_valid
    assert len(errors) == 0


def test_validate_missing_file(temp_dirs):
    """Test validation with missing output file."""
    source_dir, output_dir = temp_dirs

    aggregator = CardAggregator(output_dir=output_dir)
    output_path = os.path.join(output_dir, "nonexistent.parquet")

    is_valid, errors = aggregator.validate_output(output_path, source_dir)

    assert not is_valid
    assert len(errors) > 0
    assert "not found" in errors[0].lower()


def test_rotate_versions(temp_dirs, sample_card_data):
    """Test version rotation."""
    _, output_dir = temp_dirs

    # Create initial file
    df = pd.DataFrame(sample_card_data)
    output_path = os.path.join(output_dir, "all_cards.parquet")
    df.to_parquet(output_path)

    aggregator = CardAggregator(output_dir=output_dir)

    # Rotate versions
    aggregator.rotate_versions(output_path, keep_versions=3)

    # Should have created v1
    v1_path = os.path.join(output_dir, "all_cards_v1.parquet")
    assert os.path.exists(v1_path)
    assert not os.path.exists(output_path)  # Original moved to v1

    # Create new file and rotate again
    df.to_parquet(output_path)
    aggregator.rotate_versions(output_path, keep_versions=3)

    # Should have v1 and v2
    v2_path = os.path.join(output_dir, "all_cards_v2.parquet")
    assert os.path.exists(v1_path)
    assert os.path.exists(v2_path)


def test_detect_changes(temp_dirs):
    """Test change detection for incremental updates."""
    source_dir, output_dir = temp_dirs

    # Create metadata file
    metadata_path = os.path.join(output_dir, ".aggregate_metadata.json")
    past_time = (datetime.now() - timedelta(hours=1)).isoformat()
    metadata = {"timestamp": past_time}
    with open(metadata_path, "w") as f:
        json.dump(metadata, f)

    # Create CSV files (one old, one new)
    old_file = os.path.join(source_dir, "old_cards.csv")
    new_file = os.path.join(source_dir, "new_cards.csv")

    df = pd.DataFrame({"name": ["Test Card"]})
    df.to_csv(old_file, index=False)
    df.to_csv(new_file, index=False)

    # Make old_file older than metadata
    old_time = (datetime.now() - timedelta(hours=2)).timestamp()
    os.utime(old_file, (old_time, old_time))

    aggregator = CardAggregator(output_dir=output_dir)
    changed_files = aggregator.detect_changes(source_dir, metadata_path)

    # Should only detect new_file as changed
    assert len(changed_files) == 1
    assert os.path.basename(changed_files[0]) == "new_cards.csv"


def test_aggregate_all_no_files(temp_dirs):
    """Test aggregation with no CSV files."""
    source_dir, output_dir = temp_dirs

    aggregator = CardAggregator(output_dir=output_dir)
    output_path = os.path.join(output_dir, "all_cards.parquet")

    with pytest.raises(ValueError, match="No CSV files found"):
        aggregator.aggregate_all(source_dir, output_path)


def test_aggregate_all_empty_files(temp_dirs):
    """Test aggregation with empty CSV files."""
    source_dir, output_dir = temp_dirs

    # Create empty CSV file
    empty_file = os.path.join(source_dir, "empty.csv")
    pd.DataFrame().to_csv(empty_file, index=False)

    aggregator = CardAggregator(output_dir=output_dir)
    output_path = os.path.join(output_dir, "all_cards.parquet")

    with pytest.raises(ValueError, match="No valid CSV files"):
        aggregator.aggregate_all(source_dir, output_path)
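
test_detect_changes above pins down the incremental path: only CSVs whose mtime is newer than the timestamp recorded in the metadata file need re-processing. A minimal sketch of that flow, reusing the paths this commit uses elsewhere (illustrative only, not code from the diff):

from code.file_setup.card_aggregator import CardAggregator

aggregator = CardAggregator(output_dir="card_files")
# Compare CSV mtimes against the timestamp written by the previous aggregation run
changed = aggregator.detect_changes("csv_files", "card_files/.aggregate_metadata.json")
if changed:
    aggregator.aggregate_all("csv_files", "card_files/all_cards.parquet")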
280
code/tests/test_migration_compatibility.py
Normal file

@@ -0,0 +1,280 @@
"""
Migration Compatibility Tests

Ensures backward compatibility during migration from individual CSV files
to consolidated all_cards.parquet. Tests verify that legacy adapter functions
produce identical results to direct AllCardsLoader calls.
"""

from __future__ import annotations

import os
import tempfile

import pandas as pd
import pytest

from code.services.all_cards_loader import AllCardsLoader
from code.services.legacy_loader_adapter import (
    load_all_cards,
    load_cards_by_color_identity,
    load_cards_by_name,
    load_cards_by_names,
    load_cards_by_type,
    load_cards_with_tag,
    load_cards_with_tags,
    search_cards,
)


@pytest.fixture
def sample_cards_df():
    """Create a sample DataFrame for testing."""
    return pd.DataFrame(
        {
            "name": [
                "Sol Ring",
                "Lightning Bolt",
                "Counterspell",
                "Giant Growth",
                "Goblin Token Maker",
            ],
            "colorIdentity": ["Colorless", "R", "U", "G", "R"],
            "type": ["Artifact", "Instant", "Instant", "Instant", "Creature — Goblin"],
            "text": [
                "Add two mana",
                "Deal 3 damage",
                "Counter target spell",
                "Target creature gets +3/+3",
                "When this enters, create two 1/1 red Goblin creature tokens",
            ],
            "themeTags": ["", "burn,damage", "control,counterspells", "combat,pump", "tokens,goblins"],
        }
    )


@pytest.fixture
def temp_parquet_file(sample_cards_df):
    """Create a temporary Parquet file for testing."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") as tmp:
        sample_cards_df.to_parquet(tmp.name, engine="pyarrow")
        yield tmp.name
        os.unlink(tmp.name)


def test_load_all_cards_adapter(temp_parquet_file):
    """Test load_all_cards() legacy function."""
    # Direct loader call
    loader = AllCardsLoader(file_path=temp_parquet_file)
    direct_result = loader.load()

    # Legacy adapter call
    # Note: We need to temporarily override the loader's file path
    from code.services import legacy_loader_adapter
    legacy_loader_adapter._shared_loader = AllCardsLoader(file_path=temp_parquet_file)

    with pytest.warns(DeprecationWarning):
        adapter_result = load_all_cards()

    # Results should be identical
    pd.testing.assert_frame_equal(direct_result, adapter_result)


def test_load_cards_by_name_adapter(temp_parquet_file):
    """Test load_cards_by_name() legacy function."""
    loader = AllCardsLoader(file_path=temp_parquet_file)
    direct_result = loader.get_by_name("Sol Ring")

    # Setup adapter with test file
    from code.services import legacy_loader_adapter
    legacy_loader_adapter._shared_loader = AllCardsLoader(file_path=temp_parquet_file)

    with pytest.warns(DeprecationWarning):
        adapter_result = load_cards_by_name("Sol Ring")

    # Results should be identical
    assert adapter_result is not None
    pd.testing.assert_series_equal(direct_result, adapter_result)


def test_load_cards_by_names_adapter(temp_parquet_file):
    """Test load_cards_by_names() legacy function."""
    loader = AllCardsLoader(file_path=temp_parquet_file)
    names = ["Sol Ring", "Lightning Bolt"]
    direct_result = loader.get_by_names(names)

    from code.services import legacy_loader_adapter
    legacy_loader_adapter._shared_loader = AllCardsLoader(file_path=temp_parquet_file)

    with pytest.warns(DeprecationWarning):
        adapter_result = load_cards_by_names(names)

    pd.testing.assert_frame_equal(direct_result, adapter_result)


def test_load_cards_by_type_adapter(temp_parquet_file):
    """Test load_cards_by_type() legacy function."""
    loader = AllCardsLoader(file_path=temp_parquet_file)
    direct_result = loader.filter_by_type("Instant")

    from code.services import legacy_loader_adapter
    legacy_loader_adapter._shared_loader = AllCardsLoader(file_path=temp_parquet_file)

    with pytest.warns(DeprecationWarning):
        adapter_result = load_cards_by_type("Instant")

    pd.testing.assert_frame_equal(direct_result, adapter_result)


def test_load_cards_with_tag_adapter(temp_parquet_file):
    """Test load_cards_with_tag() legacy function."""
    loader = AllCardsLoader(file_path=temp_parquet_file)
    direct_result = loader.filter_by_themes(["tokens"], mode="any")

    from code.services import legacy_loader_adapter
    legacy_loader_adapter._shared_loader = AllCardsLoader(file_path=temp_parquet_file)

    with pytest.warns(DeprecationWarning):
        adapter_result = load_cards_with_tag("tokens")

    pd.testing.assert_frame_equal(direct_result, adapter_result)


def test_load_cards_with_tags_any_mode(temp_parquet_file):
    """Test load_cards_with_tags() with mode='any'."""
    loader = AllCardsLoader(file_path=temp_parquet_file)
    direct_result = loader.filter_by_themes(["burn", "tokens"], mode="any")

    from code.services import legacy_loader_adapter
    legacy_loader_adapter._shared_loader = AllCardsLoader(file_path=temp_parquet_file)

    with pytest.warns(DeprecationWarning):
        adapter_result = load_cards_with_tags(["burn", "tokens"], require_all=False)

    pd.testing.assert_frame_equal(direct_result, adapter_result)


def test_load_cards_with_tags_all_mode(temp_parquet_file):
    """Test load_cards_with_tags() with mode='all'."""
    loader = AllCardsLoader(file_path=temp_parquet_file)
    direct_result = loader.filter_by_themes(["tokens", "goblins"], mode="all")

    from code.services import legacy_loader_adapter
    legacy_loader_adapter._shared_loader = AllCardsLoader(file_path=temp_parquet_file)

    with pytest.warns(DeprecationWarning):
        adapter_result = load_cards_with_tags(["tokens", "goblins"], require_all=True)

    pd.testing.assert_frame_equal(direct_result, adapter_result)


def test_load_cards_by_color_identity_adapter(temp_parquet_file):
    """Test load_cards_by_color_identity() legacy function."""
    loader = AllCardsLoader(file_path=temp_parquet_file)
    direct_result = loader.filter_by_color_identity(["R"])

    from code.services import legacy_loader_adapter
    legacy_loader_adapter._shared_loader = AllCardsLoader(file_path=temp_parquet_file)

    with pytest.warns(DeprecationWarning):
        adapter_result = load_cards_by_color_identity(["R"])

    pd.testing.assert_frame_equal(direct_result, adapter_result)


def test_search_cards_adapter(temp_parquet_file):
    """Test search_cards() legacy function."""
    loader = AllCardsLoader(file_path=temp_parquet_file)
    direct_result = loader.search("token", limit=100)

    from code.services import legacy_loader_adapter
    legacy_loader_adapter._shared_loader = AllCardsLoader(file_path=temp_parquet_file)

    with pytest.warns(DeprecationWarning):
        adapter_result = search_cards("token", limit=100)

    pd.testing.assert_frame_equal(direct_result, adapter_result)


def test_deprecation_warnings_logged(temp_parquet_file, caplog):
    """Test that deprecation warnings are properly logged."""
    from code.services import legacy_loader_adapter
    legacy_loader_adapter._shared_loader = AllCardsLoader(file_path=temp_parquet_file)

    with pytest.warns(DeprecationWarning):
        load_cards_by_name("Sol Ring")

    # Check that warning was logged
    assert any("DEPRECATION" in record.message for record in caplog.records)


def test_feature_flag_disabled(temp_parquet_file, monkeypatch):
    """Test behavior when USE_ALL_CARDS_FILE is disabled."""
    # Disable feature flag
    monkeypatch.setattr("code.settings.USE_ALL_CARDS_FILE", False)

    # Reimport to pick up new setting
    import importlib
    from code.services import legacy_loader_adapter
    importlib.reload(legacy_loader_adapter)

    legacy_loader_adapter._shared_loader = AllCardsLoader(file_path=temp_parquet_file)

    with pytest.warns(DeprecationWarning):
        result = load_all_cards()

    # Should return empty DataFrame when disabled
    assert result.empty


def test_adapter_uses_shared_loader(temp_parquet_file):
    """Test that adapter reuses shared loader instance for performance."""
    from code.services import legacy_loader_adapter

    # Clear any existing loader
    legacy_loader_adapter._shared_loader = None
    legacy_loader_adapter._shared_loader = AllCardsLoader(file_path=temp_parquet_file)

    with pytest.warns(DeprecationWarning):
        load_all_cards()

    loader1 = legacy_loader_adapter._shared_loader

    with pytest.warns(DeprecationWarning):
        load_cards_by_name("Sol Ring")

    loader2 = legacy_loader_adapter._shared_loader

    # Should be the same instance
    assert loader1 is loader2


def test_multiple_calls_use_cache(temp_parquet_file, monkeypatch):
    """Test that multiple adapter calls benefit from caching."""
    import time
    from code.services import legacy_loader_adapter

    # Ensure feature flag is enabled
    monkeypatch.setattr("code.settings.USE_ALL_CARDS_FILE", True)

    # Reimport to pick up setting
    import importlib
    importlib.reload(legacy_loader_adapter)

    legacy_loader_adapter._shared_loader = AllCardsLoader(file_path=temp_parquet_file)

    # First call (loads from disk)
    start = time.time()
    with pytest.warns(DeprecationWarning):
        load_all_cards()
    first_time = time.time() - start

    # Second call (should use cache)
    start = time.time()
    with pytest.warns(DeprecationWarning):
        load_all_cards()
    second_time = time.time() - start

    # Cache should make second call faster (or at least not slower)
    # Use a more lenient check since file is very small
    assert second_time <= first_time * 2  # Allow some variance
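
The contract these tests enforce boils down to two lines: a deprecated adapter call must return exactly what the direct loader call returns, plus a DeprecationWarning. A sketch reusing the names imported above, and assuming the adapter's shared loader points at the same Parquet file (which the tests arrange through legacy_loader_adapter._shared_loader):

direct = AllCardsLoader(file_path="card_files/all_cards.parquet").get_by_name("Sol Ring")
with pytest.warns(DeprecationWarning):
    legacy = load_cards_by_name("Sol Ring")  # same Series as `direct`, with a warning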
@@ -108,6 +108,53 @@ async def setup_start_get(request: Request):
        return JSONResponse({"ok": False}, status_code=500)


@router.post("/rebuild-cards")
async def rebuild_cards():
    """Manually trigger card aggregation (all_cards.parquet, commander_cards.parquet, background_cards.parquet)."""
    def runner():
        try:
            print("Starting manual card aggregation...")
            from file_setup.card_aggregator import CardAggregator  # type: ignore
            import pandas as pd  # type: ignore
            import os

            aggregator = CardAggregator()

            # Aggregate all_cards.parquet
            stats = aggregator.aggregate_all('csv_files', 'card_files/all_cards.parquet')
            print(f"Aggregated {stats['total_cards']} cards into all_cards.parquet ({stats['file_size_mb']} MB)")

            # Convert commander_cards.csv to Parquet
            commander_csv = 'csv_files/commander_cards.csv'
            commander_parquet = 'card_files/commander_cards.parquet'
            if os.path.exists(commander_csv):
                df_cmd = pd.read_csv(commander_csv, comment='#', low_memory=False)
                for col in ["power", "toughness", "keywords"]:
                    if col in df_cmd.columns:
                        df_cmd[col] = df_cmd[col].astype(str)
                df_cmd.to_parquet(commander_parquet, engine="pyarrow", compression="snappy", index=False)
                print(f"Converted commander_cards.csv to Parquet ({len(df_cmd)} commanders)")

            # Convert background_cards.csv to Parquet
            background_csv = 'csv_files/background_cards.csv'
            background_parquet = 'card_files/background_cards.parquet'
            if os.path.exists(background_csv):
                df_bg = pd.read_csv(background_csv, comment='#', low_memory=False)
                for col in ["power", "toughness", "keywords"]:
                    if col in df_bg.columns:
                        df_bg[col] = df_bg[col].astype(str)
                df_bg.to_parquet(background_parquet, engine="pyarrow", compression="snappy", index=False)
                print(f"Converted background_cards.csv to Parquet ({len(df_bg)} backgrounds)")

            print("Card aggregation complete!")
        except Exception as e:
            print(f"Card aggregation failed: {e}")

    t = threading.Thread(target=runner, daemon=True)
    t.start()
    return JSONResponse({"ok": True, "message": "Card aggregation started"}, status_code=202)


@router.get("/", response_class=HTMLResponse)
async def setup_index(request: Request) -> HTMLResponse:
    return templates.TemplateResponse("setup/index.html", {"request": request})
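
The handler responds 202 immediately and runs the aggregation in a daemon thread, so it can be triggered from the new setup-page button or directly over HTTP. The template's JS calls it at /setup/rebuild-cards; a manual trigger might look like this (host and port are illustrative):

curl -X POST http://localhost:8080/setup/rebuild-cards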
@@ -1330,6 +1330,51 @@ def _ensure_setup_ready(out, force: bool = False) -> None:
    os.makedirs('csv_files', exist_ok=True)
    with open(flag_path, 'w', encoding='utf-8') as _fh:
        json.dump({'tagged_at': _dt.now().isoformat(timespec='seconds')}, _fh)

    # Aggregate card files into Parquet AFTER tagging completes
    try:
        _write_status({"running": True, "phase": "aggregating", "message": "Consolidating card data...", "percent": 90})
        out("Aggregating card CSVs into Parquet files...")
        from file_setup.card_aggregator import CardAggregator  # type: ignore
        aggregator = CardAggregator()

        # Aggregate all_cards.parquet
        stats = aggregator.aggregate_all('csv_files', 'card_files/all_cards.parquet')
        out(f"Aggregated {stats['total_cards']} cards into all_cards.parquet ({stats['file_size_mb']} MB)")

        # Convert commander_cards.csv and background_cards.csv to Parquet
        import pandas as pd  # type: ignore

        # Convert commander_cards.csv
        commander_csv = 'csv_files/commander_cards.csv'
        commander_parquet = 'card_files/commander_cards.parquet'
        if os.path.exists(commander_csv):
            df_cmd = pd.read_csv(commander_csv, comment='#', low_memory=False)
            # Convert mixed-type columns to strings for Parquet compatibility
            for col in ["power", "toughness", "keywords"]:
                if col in df_cmd.columns:
                    df_cmd[col] = df_cmd[col].astype(str)
            df_cmd.to_parquet(commander_parquet, engine="pyarrow", compression="snappy", index=False)
            out(f"Converted commander_cards.csv to Parquet ({len(df_cmd)} commanders)")

        # Convert background_cards.csv
        background_csv = 'csv_files/background_cards.csv'
        background_parquet = 'card_files/background_cards.parquet'
        if os.path.exists(background_csv):
            df_bg = pd.read_csv(background_csv, comment='#', low_memory=False)
            # Convert mixed-type columns to strings for Parquet compatibility
            for col in ["power", "toughness", "keywords"]:
                if col in df_bg.columns:
                    df_bg[col] = df_bg[col].astype(str)
            df_bg.to_parquet(background_parquet, engine="pyarrow", compression="snappy", index=False)
            out(f"Converted background_cards.csv to Parquet ({len(df_bg)} backgrounds)")

        _write_status({"running": True, "phase": "aggregating", "message": "Card aggregation complete", "percent": 95})
    except Exception as e:
        # Non-fatal: aggregation failure shouldn't block the rest of setup
        out(f"Warning: Card aggregation failed: {e}")
        _write_status({"running": True, "phase": "aggregating", "message": f"Aggregation failed (non-fatal): {e}", "percent": 95})

    # Final status with percent 100 and timing info
    finished_dt = _dt.now()
    finished = finished_dt.isoformat(timespec='seconds')
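
The astype(str) casts exist because power, toughness, and keywords mix numbers with symbols such as "*", and pyarrow will generally not write an object column holding both. A standalone sketch of the failure mode and the fix (toy data, not from the real card files):

import pandas as pd

df = pd.DataFrame({"power": [2, "*", 3]})
# df.to_parquet("demo.parquet")  # typically raises: pyarrow cannot settle on one type for the column
df["power"] = df["power"].astype(str)
df.to_parquet("demo.parquet", engine="pyarrow")  # fine once every value is a string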
@@ -43,8 +43,9 @@
<div class="muted" id="themes-stale-line" style="margin-top:.25rem; display:none; color:#f87171;"></div>
</div>
</details>
<div style="margin-top:.75rem;">
<div style="margin-top:.75rem; display:flex; gap:.5rem; flex-wrap:wrap;">
<button type="button" id="btn-refresh-themes" class="action-btn" onclick="refreshThemes()">Refresh Themes Only</button>
<button type="button" id="btn-rebuild-cards" class="action-btn" onclick="rebuildCards()">Rebuild Card Files</button>
</div>
</section>
<script>

@@ -214,6 +215,30 @@
    })
    .finally(function(){ if (btn) btn.disabled = false; });
};
window.rebuildCards = function(){
  var btn = document.getElementById('btn-rebuild-cards');
  if (btn) btn.disabled = true;
  if (btn) btn.textContent = 'Rebuilding...';
  fetch('/setup/rebuild-cards', { method: 'POST', headers: { 'Content-Type': 'application/json' } })
    .then(function(r){
      if (!r.ok) throw new Error('Rebuild failed');
      return r.json();
    })
    .then(function(data){
      if (btn) btn.textContent = 'Rebuild Complete!';
      setTimeout(function(){
        if (btn) btn.textContent = 'Rebuild Card Files';
        if (btn) btn.disabled = false;
      }, 2000);
    })
    .catch(function(err){
      if (btn) btn.textContent = 'Rebuild Failed';
      setTimeout(function(){
        if (btn) btn.textContent = 'Rebuild Card Files';
        if (btn) btn.disabled = false;
      }, 2000);
    });
};
setInterval(poll, 3000);
poll();
pollThemes();