fix: remove CSV fallback from theme catalog generation, add Parquet debug step

- Remove CSV fallback logic (Parquet-only in M4 migration)
- Add better error messages when the Parquet file is missing or empty
- Add workflow debug step to inspect the Parquet file after tagging
- Simplify build_theme_catalog function signature
2026-03-18 03:06:31 +01:00 · 2025-10-18 22:22:35 -07:00 · 2025-10-18 22:22:35 -07:00 · 30dfca0b67
commit 30dfca0b67
parent 9e6c3e66e9
2 changed files with 134 additions and 110 deletions
--- a/.github/workflows/build-similarity-cache.yml
+++ b/.github/workflows/build-similarity-cache.yml
@ -88,13 +88,60 @@ jobs:
            echo "ERROR: Tagging completion flag not found"
            exit 1
          fi
      - name: Debug - Inspect Parquet file after tagging
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          python -c "
          import pandas as pd
          from code.path_util import get_processed_cards_path
-          # Verify theme catalog was generated
+          parquet_path = get_processed_cards_path()
          print(f'Reading Parquet file: {parquet_path}')
          print(f'File exists: {parquet_path.exists()}')
          if not parquet_path.exists():
              raise FileNotFoundError(f'Parquet file not found: {parquet_path}')
          df = pd.read_parquet(parquet_path)
          print(f'Loaded {len(df)} rows from Parquet file')
          print(f'Columns: {list(df.columns)}')
          print('')
          # Show first 10 rows with their themeTags
          print('First 10 cards with themeTags:')
          print('=' * 80)
          for idx, row in df.head(10).iterrows():
              name = row.get('name', 'UNKNOWN')
              tags = row.get('themeTags', [])
              tag_count = len(tags) if isinstance(tags, list) else 0
              print(f'{idx}: {name}')
              print(f'   Type: {type(tags).__name__}')
              print(f'   Count: {tag_count}')
              if tag_count > 0:
                  # Show first 5 tags
                  sample = tags[:5] if tag_count > 5 else tags
                  print(f'   Tags: {sample}')
                  if tag_count > 5:
                      print(f'   ... and {tag_count - 5} more')
              else:
                  print(f'   Tags: (empty)')
              print('')
          "
      - name: Generate theme catalog
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          if [ ! -f "config/themes/theme_catalog.csv" ]; then
-            echo "WARNING: Theme catalog not found, generating..."
+            echo "Theme catalog not found, generating..."
            python -m code.scripts.generate_theme_catalog
          else
            echo "Theme catalog already exists, skipping generation"
          fi
-          
+      
      - name: Verify theme catalog and tag statistics
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          # Detailed check of what tags were actually written
          python -c "
          import pandas as pd
--- a/code/scripts/generate_theme_catalog.py
+++ b/code/scripts/generate_theme_catalog.py
@ -111,23 +111,38 @@ def _load_theme_counts_from_parquet(
        Counter of theme occurrences
    """
    if pd is None:
        print("  pandas not available, skipping parquet load")
        return Counter()
    counts: Counter[str] = Counter()
    if not parquet_path.exists():
        print(f"  Parquet file does not exist: {parquet_path}")
        return counts
    # Read only themeTags column for efficiency
    try:
        df = pd.read_parquet(parquet_path, columns=["themeTags"])
-    except Exception:
+        print(f"  Loaded {len(df)} rows from parquet")
    except Exception as e:
        # If themeTags column doesn't exist, return empty
        print(f"  Failed to read themeTags column: {e}")
        return counts
    # Convert to list for fast iteration (faster than iterrows)
    theme_tags_list = df["themeTags"].tolist()
    # Debug: check first few entries
    non_empty_count = 0
    for i, raw_value in enumerate(theme_tags_list[:10]):
        if raw_value is not None and not (isinstance(raw_value, float) and pd.isna(raw_value)):
            non_empty_count += 1
            if i < 3:  # Show first 3 non-empty
                print(f"    Sample tag {i}: {raw_value!r} (type: {type(raw_value).__name__})")
    if non_empty_count == 0:
        print("  WARNING: No non-empty themeTags found in first 10 rows")
    for raw_value in theme_tags_list:
        if raw_value is None or (isinstance(raw_value, float) and pd.isna(raw_value)):
            continue
@ -146,43 +161,11 @@ def _load_theme_counts_from_parquet(
            counts[key] += 1
            theme_variants[key].add(display)
    print(f"  Found {len(counts)} unique themes from parquet")
    return counts
-def _load_theme_counts(csv_path: Path, theme_variants: Dict[str, set[str]]) -> Counter[str]:
+# CSV fallback removed in M4 migration - Parquet is now required
    """Load theme counts from CSV file (fallback method).
    Args:
        csv_path: Path to CSV file
        theme_variants: Dict to accumulate theme name variants
    Returns:
        Counter of theme occurrences
    """
    counts: Counter[str] = Counter()
    if not csv_path.exists():
        return counts
    with csv_path.open("r", encoding="utf-8-sig", newline="") as handle:
        reader = csv.DictReader(handle)
        if not reader.fieldnames or "themeTags" not in reader.fieldnames:
            return counts
        for row in reader:
            raw_value = row.get("themeTags")
            tags = parse_theme_tags(raw_value)
            if not tags:
                continue
            seen_in_row: set[str] = set()
            for tag in tags:
                display = normalize_theme_display(tag)
                if not display:
                    continue
                key = canonical_key(display)
                if key in seen_in_row:
                    continue
                seen_in_row.add(key)
                counts[key] += 1
                theme_variants[key].add(display)
    return counts
 def _select_display_name(options: Sequence[str]) -> str:
@ -214,97 +197,91 @@ def build_theme_catalog(
    output_path: Path,
    *,
    generated_at: Optional[datetime] = None,
    commander_filename: str = "commander_cards.csv",
    cards_filename: str = "cards.csv",
    logs_directory: Optional[Path] = None,
    use_parquet: bool = True,
    min_card_count: int = 3,
 ) -> CatalogBuildResult:
-    """Build theme catalog from card data.
+    """Build theme catalog from Parquet card data.
    Args:
-        csv_directory: Directory containing CSV files (fallback)
+        csv_directory: Base directory (used to locate card_files/processed/all_cards.parquet)
        output_path: Where to write the catalog CSV
        generated_at: Optional timestamp for generation
        commander_filename: Name of commander CSV file
        cards_filename: Name of cards CSV file
        logs_directory: Optional directory to copy output to
        use_parquet: If True, try to use all_cards.parquet first (default: True)
        min_card_count: Minimum number of cards required to include theme (default: 3)
        use_parquet: If True, try to use all_cards.parquet first (default: True)
    Returns:
        CatalogBuildResult with generated rows and metadata
    Raises:
        RuntimeError: If pandas/pyarrow not available
        FileNotFoundError: If all_cards.parquet doesn't exist
        RuntimeError: If no theme tags found in Parquet file
    """
    csv_directory = csv_directory.resolve()
    output_path = output_path.resolve()
    theme_variants: Dict[str, set[str]] = defaultdict(set)
-    # Try to use parquet file first (much faster)
+    # Parquet-only mode (M4 migration: CSV files removed)
-    used_parquet = False
+    if not HAS_PARQUET_SUPPORT:
-    if use_parquet and HAS_PARQUET_SUPPORT:
+        raise RuntimeError(
-        try:
+            "Pandas is required for theme catalog generation. "
-            # Use processed parquet files (M4 migration)
+            "Install with: pip install pandas pyarrow"
-            parquet_dir = csv_directory.parent / "card_files" / "processed"
+        )
            # Load all card counts from all_cards.parquet (includes commanders)
            all_cards_parquet = parquet_dir / "all_cards.parquet"
            card_counts = _load_theme_counts_from_parquet(
                all_cards_parquet, theme_variants=theme_variants
            )
            # For commander counts, filter all_cards by is_commander column
            if all_cards_parquet.exists() and pd is not None:
                df_commanders = pd.read_parquet(all_cards_parquet)
                df_commanders = df_commanders[df_commanders.get('is_commander', False)]
                commander_counts = Counter()
                for tags in df_commanders['themeTags'].tolist():
                    if tags is None or (isinstance(tags, float) and pd.isna(tags)):
                        continue
                    from code.deck_builder.theme_catalog_loader import parse_theme_tags, normalize_theme_display, canonical_key
                    parsed = parse_theme_tags(tags)
                    if not parsed:
                        continue
                    seen = set()
                    for tag in parsed:
                        display = normalize_theme_display(tag)
                        if not display:
                            continue
                        key = canonical_key(display)
                        if key not in seen:
                            seen.add(key)
                            commander_counts[key] += 1
                            theme_variants[key].add(display)
            else:
                commander_counts = Counter()
            used_parquet = True
            print("✓ Loaded theme data from parquet files")
            print(f"  - Commanders: {len(commander_counts)} themes")
            print(f"  - All cards: {len(card_counts)} themes")
        except Exception as e:
            print(f"⚠ Failed to load from parquet: {e}")
            print("  Falling back to CSV files...")
            used_parquet = False
-    # Fallback to CSV files if parquet not available or failed
+    # Use processed parquet files (M4 migration)
-    if not used_parquet:
+    parquet_dir = csv_directory.parent / "card_files" / "processed"
-        commander_counts = _load_theme_counts(csv_directory / commander_filename, theme_variants)
+    all_cards_parquet = parquet_dir / "all_cards.parquet"
-
+    
-        card_counts: Counter[str] = Counter()
+    print(f"Loading theme data from parquet: {all_cards_parquet}")
-        cards_path = csv_directory / cards_filename
+    print(f"  File exists: {all_cards_parquet.exists()}")
-        if cards_path.exists():
+    
-            card_counts = _load_theme_counts(cards_path, theme_variants)
+    if not all_cards_parquet.exists():
-        else:
+        raise FileNotFoundError(
-            # Fallback: scan all *_cards.csv except commander
+            f"Required Parquet file not found: {all_cards_parquet}\n"
-            for candidate in csv_directory.glob("*_cards.csv"):
+            f"Run tagging first: python -c \"from code.tagging.tagger import run_tagging; run_tagging()\""
-                if candidate.name == commander_filename:
+        )
-                    continue
+    
-                card_counts += _load_theme_counts(candidate, theme_variants)
+    # Load all card counts from all_cards.parquet (includes commanders)
-        
+    card_counts = _load_theme_counts_from_parquet(
-        print("✓ Loaded theme data from CSV files")
+        all_cards_parquet, theme_variants=theme_variants
    )
    # For commander counts, filter all_cards by is_commander column
    df_commanders = pd.read_parquet(all_cards_parquet)
    df_commanders = df_commanders[df_commanders.get('is_commander', False)]
    commander_counts = Counter()
    for tags in df_commanders['themeTags'].tolist():
        if tags is None or (isinstance(tags, float) and pd.isna(tags)):
            continue
        from code.deck_builder.theme_catalog_loader import parse_theme_tags, normalize_theme_display, canonical_key
        parsed = parse_theme_tags(tags)
        if not parsed:
            continue
        seen = set()
        for tag in parsed:
            display = normalize_theme_display(tag)
            if not display:
                continue
            key = canonical_key(display)
            if key not in seen:
                seen.add(key)
                commander_counts[key] += 1
                theme_variants[key].add(display)
    # Verify we found theme tags
    total_themes_found = len(card_counts) + len(commander_counts)
    if total_themes_found == 0:
        raise RuntimeError(
            f"No theme tags found in {all_cards_parquet}\n"
            f"The Parquet file exists but contains no themeTags data. "
            f"This usually means tagging hasn't completed or failed.\n"
            f"Check that 'themeTags' column exists and is populated."
        )
    print("✓ Loaded theme data from parquet files")
    print(f"  - Commanders: {len(commander_counts)} themes")
    print(f"  - All cards: {len(card_counts)} themes")
    keys = sorted(set(card_counts.keys()) | set(commander_counts.keys()))
    generated_at_iso = _derive_generated_at(generated_at)