fix: add detailed tag validation to CI workflow

2026-01-31 05:35:17 +01:00 · 2025-10-18 21:56:23 -07:00 · 2025-10-18 21:56:23 -07:00 · 8e8b788091
commit 8e8b788091
parent e92f2ccfb4
1 changed files with 45 additions and 2 deletions
--- a/.github/workflows/build-similarity-cache.yml
+++ b/.github/workflows/build-similarity-cache.yml
@ -78,10 +78,53 @@ jobs:
        run: |
          python -c "from code.file_setup.setup import initial_setup; initial_setup()"
-      - name: Run tagging (parallel)
+      - name: Run tagging (serial for CI reliability)
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
-          python -c "from code.tagging.tagger import run_tagging; run_tagging(parallel=True)"
+          python -c "from code.tagging.tagger import run_tagging; run_tagging(parallel=False)"
+          # Verify tagging completed
+          if [ ! -f "card_files/processed/.tagging_complete.json" ]; then
+            echo "ERROR: Tagging completion flag not found"
+            exit 1
+          fi
+          # Detailed check of what tags were actually written.
+          # NOTE: pandas/pyarrow may return list columns as numpy arrays, so tags
+          # are counted by sequence length rather than isinstance(x, list).
+          python -c "
+          import pandas as pd
+          from code.path_util import get_processed_cards_path
+          df = pd.read_parquet(get_processed_cards_path())
+          def tag_count(tags):
+              # Accept list/tuple/ndarray cells; None, NaN and str count as 0
+              if tags is None or isinstance(tags, (str, bytes, float)):
+                  return 0
+              return len(tags) if hasattr(tags, '__len__') else 0
+          # Per-card tag counts, computed once and reused below
+          counts = df['themeTags'].apply(tag_count)
+          # Show the tagged cards among the first 10 rows as a quick sample
+          print('Sample of first 10 cards:')
+          for name, n in zip(df['name'].head(10), counts.head(10)):
+              if n > 0:
+                  print(f'  {name}: {n} tags')
+          # Full count
+          all_tags = int(counts.sum())
+          all_with_tags = int((counts > 0).sum())
+          print('')
+          print(f'Total cards: {len(df):,}')
+          print(f'Cards with tags: {all_with_tags:,}')
+          print(f'Total theme tags: {all_tags:,}')
+          # Fail the step when the overall tag total is implausibly low
+          if all_tags < 10000:
+              raise ValueError(f'Only {all_tags} tags found, expected >10k')
+          "
      - name: Build similarity cache (Parquet) from card_files/processed/all_cards.parquet
        if: steps.check_cache.outputs.needs_build == 'true'