mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2025-12-16 15:40:12 +01:00
fix: remove CSV fallback from theme catalog generation, add Parquet debug step
- Remove CSV fallback logic (Parquet-only in M4 migration)
- Add better error messages when Parquet file missing or empty
- Add workflow debug step to inspect Parquet file after tagging
- Simplify build_theme_catalog function signature
This commit is contained in:
parent
9e6c3e66e9
commit
30dfca0b67
2 changed files with 134 additions and 110 deletions
53
.github/workflows/build-similarity-cache.yml
vendored
53
.github/workflows/build-similarity-cache.yml
vendored
|
|
@ -88,13 +88,60 @@ jobs:
|
||||||
echo "ERROR: Tagging completion flag not found"
|
echo "ERROR: Tagging completion flag not found"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
- name: Debug - Inspect Parquet file after tagging
|
||||||
|
if: steps.check_cache.outputs.needs_build == 'true'
|
||||||
|
run: |
|
||||||
|
python -c "
|
||||||
|
import pandas as pd
|
||||||
|
from code.path_util import get_processed_cards_path
|
||||||
|
|
||||||
# Verify theme catalog was generated
|
parquet_path = get_processed_cards_path()
|
||||||
|
print(f'Reading Parquet file: {parquet_path}')
|
||||||
|
print(f'File exists: {parquet_path.exists()}')
|
||||||
|
|
||||||
|
if not parquet_path.exists():
|
||||||
|
raise FileNotFoundError(f'Parquet file not found: {parquet_path}')
|
||||||
|
|
||||||
|
df = pd.read_parquet(parquet_path)
|
||||||
|
print(f'Loaded {len(df)} rows from Parquet file')
|
||||||
|
print(f'Columns: {list(df.columns)}')
|
||||||
|
print('')
|
||||||
|
|
||||||
|
# Show first 10 rows with their themeTags
|
||||||
|
print('First 10 cards with themeTags:')
|
||||||
|
print('=' * 80)
|
||||||
|
for idx, row in df.head(10).iterrows():
|
||||||
|
name = row.get('name', 'UNKNOWN')
|
||||||
|
tags = row.get('themeTags', [])
|
||||||
|
tag_count = len(tags) if isinstance(tags, list) else 0
|
||||||
|
print(f'{idx}: {name}')
|
||||||
|
print(f' Type: {type(tags).__name__}')
|
||||||
|
print(f' Count: {tag_count}')
|
||||||
|
if tag_count > 0:
|
||||||
|
# Show first 5 tags
|
||||||
|
sample = tags[:5] if tag_count > 5 else tags
|
||||||
|
print(f' Tags: {sample}')
|
||||||
|
if tag_count > 5:
|
||||||
|
print(f' ... and {tag_count - 5} more')
|
||||||
|
else:
|
||||||
|
print(f' Tags: (empty)')
|
||||||
|
print('')
|
||||||
|
"
|
||||||
|
|
||||||
|
- name: Generate theme catalog
|
||||||
|
if: steps.check_cache.outputs.needs_build == 'true'
|
||||||
|
run: |
|
||||||
if [ ! -f "config/themes/theme_catalog.csv" ]; then
|
if [ ! -f "config/themes/theme_catalog.csv" ]; then
|
||||||
echo "WARNING: Theme catalog not found, generating..."
|
echo "Theme catalog not found, generating..."
|
||||||
python -m code.scripts.generate_theme_catalog
|
python -m code.scripts.generate_theme_catalog
|
||||||
|
else
|
||||||
|
echo "Theme catalog already exists, skipping generation"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
- name: Verify theme catalog and tag statistics
|
||||||
|
if: steps.check_cache.outputs.needs_build == 'true'
|
||||||
|
run: |
|
||||||
# Detailed check of what tags were actually written
|
# Detailed check of what tags were actually written
|
||||||
python -c "
|
python -c "
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
|
||||||
|
|
@ -111,23 +111,38 @@ def _load_theme_counts_from_parquet(
|
||||||
Counter of theme occurrences
|
Counter of theme occurrences
|
||||||
"""
|
"""
|
||||||
if pd is None:
|
if pd is None:
|
||||||
|
print(" pandas not available, skipping parquet load")
|
||||||
return Counter()
|
return Counter()
|
||||||
|
|
||||||
counts: Counter[str] = Counter()
|
counts: Counter[str] = Counter()
|
||||||
|
|
||||||
if not parquet_path.exists():
|
if not parquet_path.exists():
|
||||||
|
print(f" Parquet file does not exist: {parquet_path}")
|
||||||
return counts
|
return counts
|
||||||
|
|
||||||
# Read only themeTags column for efficiency
|
# Read only themeTags column for efficiency
|
||||||
try:
|
try:
|
||||||
df = pd.read_parquet(parquet_path, columns=["themeTags"])
|
df = pd.read_parquet(parquet_path, columns=["themeTags"])
|
||||||
except Exception:
|
print(f" Loaded {len(df)} rows from parquet")
|
||||||
|
except Exception as e:
|
||||||
# If themeTags column doesn't exist, return empty
|
# If themeTags column doesn't exist, return empty
|
||||||
|
print(f" Failed to read themeTags column: {e}")
|
||||||
return counts
|
return counts
|
||||||
|
|
||||||
# Convert to list for fast iteration (faster than iterrows)
|
# Convert to list for fast iteration (faster than iterrows)
|
||||||
theme_tags_list = df["themeTags"].tolist()
|
theme_tags_list = df["themeTags"].tolist()
|
||||||
|
|
||||||
|
# Debug: check first few entries
|
||||||
|
non_empty_count = 0
|
||||||
|
for i, raw_value in enumerate(theme_tags_list[:10]):
|
||||||
|
if raw_value is not None and not (isinstance(raw_value, float) and pd.isna(raw_value)):
|
||||||
|
non_empty_count += 1
|
||||||
|
if i < 3: # Show first 3 non-empty
|
||||||
|
print(f" Sample tag {i}: {raw_value!r} (type: {type(raw_value).__name__})")
|
||||||
|
|
||||||
|
if non_empty_count == 0:
|
||||||
|
print(" WARNING: No non-empty themeTags found in first 10 rows")
|
||||||
|
|
||||||
for raw_value in theme_tags_list:
|
for raw_value in theme_tags_list:
|
||||||
if raw_value is None or (isinstance(raw_value, float) and pd.isna(raw_value)):
|
if raw_value is None or (isinstance(raw_value, float) and pd.isna(raw_value)):
|
||||||
continue
|
continue
|
||||||
|
|
@ -146,43 +161,11 @@ def _load_theme_counts_from_parquet(
|
||||||
counts[key] += 1
|
counts[key] += 1
|
||||||
theme_variants[key].add(display)
|
theme_variants[key].add(display)
|
||||||
|
|
||||||
|
print(f" Found {len(counts)} unique themes from parquet")
|
||||||
return counts
|
return counts
|
||||||
|
|
||||||
|
|
||||||
def _load_theme_counts(csv_path: Path, theme_variants: Dict[str, set[str]]) -> Counter[str]:
|
# CSV fallback removed in M4 migration - Parquet is now required
|
||||||
"""Load theme counts from CSV file (fallback method).
|
|
||||||
|
|
||||||
Args:
|
|
||||||
csv_path: Path to CSV file
|
|
||||||
theme_variants: Dict to accumulate theme name variants
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Counter of theme occurrences
|
|
||||||
"""
|
|
||||||
counts: Counter[str] = Counter()
|
|
||||||
if not csv_path.exists():
|
|
||||||
return counts
|
|
||||||
with csv_path.open("r", encoding="utf-8-sig", newline="") as handle:
|
|
||||||
reader = csv.DictReader(handle)
|
|
||||||
if not reader.fieldnames or "themeTags" not in reader.fieldnames:
|
|
||||||
return counts
|
|
||||||
for row in reader:
|
|
||||||
raw_value = row.get("themeTags")
|
|
||||||
tags = parse_theme_tags(raw_value)
|
|
||||||
if not tags:
|
|
||||||
continue
|
|
||||||
seen_in_row: set[str] = set()
|
|
||||||
for tag in tags:
|
|
||||||
display = normalize_theme_display(tag)
|
|
||||||
if not display:
|
|
||||||
continue
|
|
||||||
key = canonical_key(display)
|
|
||||||
if key in seen_in_row:
|
|
||||||
continue
|
|
||||||
seen_in_row.add(key)
|
|
||||||
counts[key] += 1
|
|
||||||
theme_variants[key].add(display)
|
|
||||||
return counts
|
|
||||||
|
|
||||||
|
|
||||||
def _select_display_name(options: Sequence[str]) -> str:
|
def _select_display_name(options: Sequence[str]) -> str:
|
||||||
|
|
@ -214,97 +197,91 @@ def build_theme_catalog(
|
||||||
output_path: Path,
|
output_path: Path,
|
||||||
*,
|
*,
|
||||||
generated_at: Optional[datetime] = None,
|
generated_at: Optional[datetime] = None,
|
||||||
commander_filename: str = "commander_cards.csv",
|
|
||||||
cards_filename: str = "cards.csv",
|
|
||||||
logs_directory: Optional[Path] = None,
|
logs_directory: Optional[Path] = None,
|
||||||
use_parquet: bool = True,
|
|
||||||
min_card_count: int = 3,
|
min_card_count: int = 3,
|
||||||
) -> CatalogBuildResult:
|
) -> CatalogBuildResult:
|
||||||
"""Build theme catalog from card data.
|
"""Build theme catalog from Parquet card data.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
csv_directory: Directory containing CSV files (fallback)
|
csv_directory: Base directory (used to locate card_files/processed/all_cards.parquet)
|
||||||
output_path: Where to write the catalog CSV
|
output_path: Where to write the catalog CSV
|
||||||
generated_at: Optional timestamp for generation
|
generated_at: Optional timestamp for generation
|
||||||
commander_filename: Name of commander CSV file
|
|
||||||
cards_filename: Name of cards CSV file
|
|
||||||
logs_directory: Optional directory to copy output to
|
logs_directory: Optional directory to copy output to
|
||||||
use_parquet: If True, try to use all_cards.parquet first (default: True)
|
|
||||||
min_card_count: Minimum number of cards required to include theme (default: 3)
|
min_card_count: Minimum number of cards required to include theme (default: 3)
|
||||||
use_parquet: If True, try to use all_cards.parquet first (default: True)
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
CatalogBuildResult with generated rows and metadata
|
CatalogBuildResult with generated rows and metadata
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RuntimeError: If pandas/pyarrow not available
|
||||||
|
FileNotFoundError: If all_cards.parquet doesn't exist
|
||||||
|
RuntimeError: If no theme tags found in Parquet file
|
||||||
"""
|
"""
|
||||||
csv_directory = csv_directory.resolve()
|
csv_directory = csv_directory.resolve()
|
||||||
output_path = output_path.resolve()
|
output_path = output_path.resolve()
|
||||||
|
|
||||||
theme_variants: Dict[str, set[str]] = defaultdict(set)
|
theme_variants: Dict[str, set[str]] = defaultdict(set)
|
||||||
|
|
||||||
# Try to use parquet file first (much faster)
|
# Parquet-only mode (M4 migration: CSV files removed)
|
||||||
used_parquet = False
|
if not HAS_PARQUET_SUPPORT:
|
||||||
if use_parquet and HAS_PARQUET_SUPPORT:
|
raise RuntimeError(
|
||||||
try:
|
"Pandas is required for theme catalog generation. "
|
||||||
# Use processed parquet files (M4 migration)
|
"Install with: pip install pandas pyarrow"
|
||||||
parquet_dir = csv_directory.parent / "card_files" / "processed"
|
)
|
||||||
|
|
||||||
# Load all card counts from all_cards.parquet (includes commanders)
|
|
||||||
all_cards_parquet = parquet_dir / "all_cards.parquet"
|
|
||||||
card_counts = _load_theme_counts_from_parquet(
|
|
||||||
all_cards_parquet, theme_variants=theme_variants
|
|
||||||
)
|
|
||||||
|
|
||||||
# For commander counts, filter all_cards by is_commander column
|
|
||||||
if all_cards_parquet.exists() and pd is not None:
|
|
||||||
df_commanders = pd.read_parquet(all_cards_parquet)
|
|
||||||
df_commanders = df_commanders[df_commanders.get('is_commander', False)]
|
|
||||||
commander_counts = Counter()
|
|
||||||
for tags in df_commanders['themeTags'].tolist():
|
|
||||||
if tags is None or (isinstance(tags, float) and pd.isna(tags)):
|
|
||||||
continue
|
|
||||||
from code.deck_builder.theme_catalog_loader import parse_theme_tags, normalize_theme_display, canonical_key
|
|
||||||
parsed = parse_theme_tags(tags)
|
|
||||||
if not parsed:
|
|
||||||
continue
|
|
||||||
seen = set()
|
|
||||||
for tag in parsed:
|
|
||||||
display = normalize_theme_display(tag)
|
|
||||||
if not display:
|
|
||||||
continue
|
|
||||||
key = canonical_key(display)
|
|
||||||
if key not in seen:
|
|
||||||
seen.add(key)
|
|
||||||
commander_counts[key] += 1
|
|
||||||
theme_variants[key].add(display)
|
|
||||||
else:
|
|
||||||
commander_counts = Counter()
|
|
||||||
|
|
||||||
used_parquet = True
|
|
||||||
print("✓ Loaded theme data from parquet files")
|
|
||||||
print(f" - Commanders: {len(commander_counts)} themes")
|
|
||||||
print(f" - All cards: {len(card_counts)} themes")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"⚠ Failed to load from parquet: {e}")
|
|
||||||
print(" Falling back to CSV files...")
|
|
||||||
used_parquet = False
|
|
||||||
|
|
||||||
# Fallback to CSV files if parquet not available or failed
|
# Use processed parquet files (M4 migration)
|
||||||
if not used_parquet:
|
parquet_dir = csv_directory.parent / "card_files" / "processed"
|
||||||
commander_counts = _load_theme_counts(csv_directory / commander_filename, theme_variants)
|
all_cards_parquet = parquet_dir / "all_cards.parquet"
|
||||||
|
|
||||||
card_counts: Counter[str] = Counter()
|
print(f"Loading theme data from parquet: {all_cards_parquet}")
|
||||||
cards_path = csv_directory / cards_filename
|
print(f" File exists: {all_cards_parquet.exists()}")
|
||||||
if cards_path.exists():
|
|
||||||
card_counts = _load_theme_counts(cards_path, theme_variants)
|
if not all_cards_parquet.exists():
|
||||||
else:
|
raise FileNotFoundError(
|
||||||
# Fallback: scan all *_cards.csv except commander
|
f"Required Parquet file not found: {all_cards_parquet}\n"
|
||||||
for candidate in csv_directory.glob("*_cards.csv"):
|
f"Run tagging first: python -c \"from code.tagging.tagger import run_tagging; run_tagging()\""
|
||||||
if candidate.name == commander_filename:
|
)
|
||||||
continue
|
|
||||||
card_counts += _load_theme_counts(candidate, theme_variants)
|
# Load all card counts from all_cards.parquet (includes commanders)
|
||||||
|
card_counts = _load_theme_counts_from_parquet(
|
||||||
print("✓ Loaded theme data from CSV files")
|
all_cards_parquet, theme_variants=theme_variants
|
||||||
|
)
|
||||||
|
|
||||||
|
# For commander counts, filter all_cards by is_commander column
|
||||||
|
df_commanders = pd.read_parquet(all_cards_parquet)
|
||||||
|
df_commanders = df_commanders[df_commanders.get('is_commander', False)]
|
||||||
|
commander_counts = Counter()
|
||||||
|
for tags in df_commanders['themeTags'].tolist():
|
||||||
|
if tags is None or (isinstance(tags, float) and pd.isna(tags)):
|
||||||
|
continue
|
||||||
|
from code.deck_builder.theme_catalog_loader import parse_theme_tags, normalize_theme_display, canonical_key
|
||||||
|
parsed = parse_theme_tags(tags)
|
||||||
|
if not parsed:
|
||||||
|
continue
|
||||||
|
seen = set()
|
||||||
|
for tag in parsed:
|
||||||
|
display = normalize_theme_display(tag)
|
||||||
|
if not display:
|
||||||
|
continue
|
||||||
|
key = canonical_key(display)
|
||||||
|
if key not in seen:
|
||||||
|
seen.add(key)
|
||||||
|
commander_counts[key] += 1
|
||||||
|
theme_variants[key].add(display)
|
||||||
|
|
||||||
|
# Verify we found theme tags
|
||||||
|
total_themes_found = len(card_counts) + len(commander_counts)
|
||||||
|
if total_themes_found == 0:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"No theme tags found in {all_cards_parquet}\n"
|
||||||
|
f"The Parquet file exists but contains no themeTags data. "
|
||||||
|
f"This usually means tagging hasn't completed or failed.\n"
|
||||||
|
f"Check that 'themeTags' column exists and is populated."
|
||||||
|
)
|
||||||
|
|
||||||
|
print("✓ Loaded theme data from parquet files")
|
||||||
|
print(f" - Commanders: {len(commander_counts)} themes")
|
||||||
|
print(f" - All cards: {len(card_counts)} themes")
|
||||||
|
|
||||||
keys = sorted(set(card_counts.keys()) | set(commander_counts.keys()))
|
keys = sorted(set(card_counts.keys()) | set(commander_counts.keys()))
|
||||||
generated_at_iso = _derive_generated_at(generated_at)
|
generated_at_iso = _derive_generated_at(generated_at)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue