""" Tests for Card Aggregator Tests the CardAggregator class functionality including: - Full aggregation of multiple CSV files - Deduplication (keeping most recent) - Exclusion of master files (cards.csv, commander_cards.csv) - Validation of output - Version rotation """ from __future__ import annotations import json import os import tempfile from datetime import datetime, timedelta from pathlib import Path import pandas as pd import pytest from code.file_setup.card_aggregator import CardAggregator @pytest.fixture def temp_dirs(): """Create temporary directories for testing.""" with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as output_dir: yield source_dir, output_dir @pytest.fixture def sample_card_data(): """Sample card data for testing.""" return { "name": ["Sol Ring", "Lightning Bolt", "Counterspell"], "faceName": ["Sol Ring", "Lightning Bolt", "Counterspell"], "colorIdentity": ["Colorless", "R", "U"], "manaCost": ["{1}", "{R}", "{U}{U}"], "manaValue": [1, 1, 2], "type": ["Artifact", "Instant", "Instant"], "text": [ "Add two colorless mana", "Deal 3 damage", "Counter target spell", ], } def test_ensure_output_dir(temp_dirs): """Test that output directory is created.""" _, output_dir = temp_dirs aggregator = CardAggregator(output_dir=output_dir) assert os.path.exists(output_dir) assert aggregator.output_dir == output_dir def test_get_card_csvs_excludes_master_files(temp_dirs): """Test that cards.csv and commander_cards.csv are excluded.""" source_dir, _ = temp_dirs # Create test files Path(source_dir, "cards.csv").touch() Path(source_dir, "commander_cards.csv").touch() Path(source_dir, "blue_cards.csv").touch() Path(source_dir, "red_cards.csv").touch() Path(source_dir, ".temp_cards.csv").touch() Path(source_dir, "_temp_cards.csv").touch() aggregator = CardAggregator() csv_files = aggregator.get_card_csvs(source_dir) # Should only include blue_cards.csv and red_cards.csv basenames = [os.path.basename(f) for f in csv_files] assert "blue_cards.csv" in basenames assert "red_cards.csv" in basenames assert "cards.csv" not in basenames assert "commander_cards.csv" not in basenames assert ".temp_cards.csv" not in basenames assert "_temp_cards.csv" not in basenames assert len(csv_files) == 2 def test_deduplicate_cards(sample_card_data): """Test that duplicate cards are removed, keeping the last occurrence.""" # Create DataFrame with duplicates df = pd.DataFrame(sample_card_data) # Add duplicate Sol Ring with different text duplicate_data = { "name": ["Sol Ring"], "faceName": ["Sol Ring"], "colorIdentity": ["Colorless"], "manaCost": ["{1}"], "manaValue": [1], "type": ["Artifact"], "text": ["Add two colorless mana (updated)"], } df_duplicate = pd.DataFrame(duplicate_data) df_combined = pd.concat([df, df_duplicate], ignore_index=True) # Should have 4 rows before deduplication assert len(df_combined) == 4 aggregator = CardAggregator() df_deduped = aggregator.deduplicate_cards(df_combined) # Should have 3 rows after deduplication assert len(df_deduped) == 3 # Should keep the last Sol Ring (updated text) sol_ring = df_deduped[df_deduped["name"] == "Sol Ring"].iloc[0] assert "updated" in sol_ring["text"] def test_aggregate_all(temp_dirs, sample_card_data): """Test full aggregation of multiple CSV files.""" source_dir, output_dir = temp_dirs # Create test CSV files df1 = pd.DataFrame( { "name": ["Sol Ring", "Lightning Bolt"], "faceName": ["Sol Ring", "Lightning Bolt"], "colorIdentity": ["Colorless", "R"], "manaCost": ["{1}", "{R}"], "manaValue": [1, 1], "type": ["Artifact", "Instant"], "text": ["Add two colorless mana", "Deal 3 damage"], } ) df2 = pd.DataFrame( { "name": ["Counterspell", "Path to Exile"], "faceName": ["Counterspell", "Path to Exile"], "colorIdentity": ["U", "W"], "manaCost": ["{U}{U}", "{W}"], "manaValue": [2, 1], "type": ["Instant", "Instant"], "text": ["Counter target spell", "Exile target creature"], } ) df1.to_csv(os.path.join(source_dir, "blue_cards.csv"), index=False) df2.to_csv(os.path.join(source_dir, "white_cards.csv"), index=False) # Create excluded files (should be ignored) df1.to_csv(os.path.join(source_dir, "cards.csv"), index=False) df1.to_csv(os.path.join(source_dir, "commander_cards.csv"), index=False) # Aggregate aggregator = CardAggregator(output_dir=output_dir) output_path = os.path.join(output_dir, "all_cards.parquet") stats = aggregator.aggregate_all(source_dir, output_path) # Verify stats assert stats["files_processed"] == 2 # Only 2 files (excluded 2) assert stats["total_cards"] == 4 # 2 + 2 cards assert stats["duplicates_removed"] == 0 assert os.path.exists(output_path) # Verify output df_result = pd.read_parquet(output_path) assert len(df_result) == 4 assert "Sol Ring" in df_result["name"].values assert "Counterspell" in df_result["name"].values def test_aggregate_with_duplicates(temp_dirs): """Test aggregation with duplicate cards across files.""" source_dir, output_dir = temp_dirs # Create two files with the same card df1 = pd.DataFrame( { "name": ["Sol Ring"], "faceName": ["Sol Ring"], "colorIdentity": ["Colorless"], "manaCost": ["{1}"], "manaValue": [1], "type": ["Artifact"], "text": ["Version 1"], } ) df2 = pd.DataFrame( { "name": ["Sol Ring"], "faceName": ["Sol Ring"], "colorIdentity": ["Colorless"], "manaCost": ["{1}"], "manaValue": [1], "type": ["Artifact"], "text": ["Version 2 (newer)"], } ) # Write file1 first, then file2 (file2 is newer) file1 = os.path.join(source_dir, "file1.csv") file2 = os.path.join(source_dir, "file2.csv") df1.to_csv(file1, index=False) df2.to_csv(file2, index=False) # Make file2 newer by touching it os.utime(file2, (datetime.now().timestamp() + 1, datetime.now().timestamp() + 1)) # Aggregate aggregator = CardAggregator(output_dir=output_dir) output_path = os.path.join(output_dir, "all_cards.parquet") stats = aggregator.aggregate_all(source_dir, output_path) # Should have removed 1 duplicate assert stats["duplicates_removed"] == 1 assert stats["total_cards"] == 1 # Should keep the newer version (file2) df_result = pd.read_parquet(output_path) assert "Version 2 (newer)" in df_result["text"].iloc[0] def test_validate_output(temp_dirs, sample_card_data): """Test output validation.""" source_dir, output_dir = temp_dirs # Create and aggregate test data df = pd.DataFrame(sample_card_data) df.to_csv(os.path.join(source_dir, "test_cards.csv"), index=False) aggregator = CardAggregator(output_dir=output_dir) output_path = os.path.join(output_dir, "all_cards.parquet") aggregator.aggregate_all(source_dir, output_path) # Validate is_valid, errors = aggregator.validate_output(output_path, source_dir) assert is_valid assert len(errors) == 0 def test_validate_missing_file(temp_dirs): """Test validation with missing output file.""" source_dir, output_dir = temp_dirs aggregator = CardAggregator(output_dir=output_dir) output_path = os.path.join(output_dir, "nonexistent.parquet") is_valid, errors = aggregator.validate_output(output_path, source_dir) assert not is_valid assert len(errors) > 0 assert "not found" in errors[0].lower() def test_rotate_versions(temp_dirs, sample_card_data): """Test version rotation.""" _, output_dir = temp_dirs # Create initial file df = pd.DataFrame(sample_card_data) output_path = os.path.join(output_dir, "all_cards.parquet") df.to_parquet(output_path) aggregator = CardAggregator(output_dir=output_dir) # Rotate versions aggregator.rotate_versions(output_path, keep_versions=3) # Should have created v1 v1_path = os.path.join(output_dir, "all_cards_v1.parquet") assert os.path.exists(v1_path) assert not os.path.exists(output_path) # Original moved to v1 # Create new file and rotate again df.to_parquet(output_path) aggregator.rotate_versions(output_path, keep_versions=3) # Should have v1 and v2 v2_path = os.path.join(output_dir, "all_cards_v2.parquet") assert os.path.exists(v1_path) assert os.path.exists(v2_path) def test_detect_changes(temp_dirs): """Test change detection for incremental updates.""" source_dir, output_dir = temp_dirs # Create metadata file metadata_path = os.path.join(output_dir, ".aggregate_metadata.json") past_time = (datetime.now() - timedelta(hours=1)).isoformat() metadata = {"timestamp": past_time} with open(metadata_path, "w") as f: json.dump(metadata, f) # Create CSV files (one old, one new) old_file = os.path.join(source_dir, "old_cards.csv") new_file = os.path.join(source_dir, "new_cards.csv") df = pd.DataFrame({"name": ["Test Card"]}) df.to_csv(old_file, index=False) df.to_csv(new_file, index=False) # Make old_file older than metadata old_time = (datetime.now() - timedelta(hours=2)).timestamp() os.utime(old_file, (old_time, old_time)) aggregator = CardAggregator(output_dir=output_dir) changed_files = aggregator.detect_changes(source_dir, metadata_path) # Should only detect new_file as changed assert len(changed_files) == 1 assert os.path.basename(changed_files[0]) == "new_cards.csv" def test_aggregate_all_no_files(temp_dirs): """Test aggregation with no CSV files.""" source_dir, output_dir = temp_dirs aggregator = CardAggregator(output_dir=output_dir) output_path = os.path.join(output_dir, "all_cards.parquet") with pytest.raises(ValueError, match="No CSV files found"): aggregator.aggregate_all(source_dir, output_path) def test_aggregate_all_empty_files(temp_dirs): """Test aggregation with empty CSV files.""" source_dir, output_dir = temp_dirs # Create empty CSV file empty_file = os.path.join(source_dir, "empty.csv") pd.DataFrame().to_csv(empty_file, index=False) aggregator = CardAggregator(output_dir=output_dir) output_path = os.path.join(output_dir, "all_cards.parquet") with pytest.raises(ValueError, match="No valid CSV files"): aggregator.aggregate_all(source_dir, output_path)