mtg_python_deckbuilder/.github/workflows/build-similarity-cache.yml
matt bff64de370 fix: systematically handle numpy arrays from Parquet files across codebase
- Add ensure_theme_tags_list() utility to builder_utils for simpler numpy array handling
- Update phase3_creatures.py: 6 locations now use bu.ensure_theme_tags_list()
- Update phase4_spells.py: 9 locations now use bu.ensure_theme_tags_list()
- Update tagger.py: 2 locations use hasattr/list() for numpy compatibility
- Update extract_themes.py: 2 locations use hasattr/list() for numpy compatibility
- Fix build-similarity-cache.yml verification script to handle numpy arrays
- Enhance workflow debug output to show complete row data

Parquet files return numpy.ndarray objects for array columns, not Python lists.
The M4 migration added numpy support to the canonical parse_theme_tags() in builder_utils,
but many parts of the codebase still used isinstance(..., list) checks that fail on numpy arrays.
This commit systematically replaces all 19 instances with proper numpy array handling.

Fixes the GitHub Actions workflow error 'RuntimeError: No theme tags found' and the accompanying verification failures.
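A minimal sketch of what such a helper might look like (hypothetical; the real ensure_theme_tags_list() in builder_utils may differ in signature and edge-case handling):

    # Hypothetical sketch -- the actual builder_utils.ensure_theme_tags_list() may differ.
    import numpy as np

    def ensure_theme_tags_list(value):
        """Coerce a themeTags cell to a plain Python list."""
        if value is None:
            return []
        if isinstance(value, np.ndarray):
            return value.tolist()  # Parquet array columns arrive as numpy arrays
        if isinstance(value, (list, tuple, set)):
            return list(value)
        return [value]  # single scalar tag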
2025-10-18 22:47:09 -07:00

name: Build Similarity Cache
# Manual trigger + weekly schedule + callable from other workflows
on:
  workflow_dispatch:
    inputs:
      force_rebuild:
        description: 'Force rebuild even if cache exists'
        required: false
        type: boolean
        default: true
  workflow_call:  # Allow this workflow to be called by other workflows
  schedule:
    # Run every Sunday at 2 AM UTC
    - cron: '0 2 * * 0'
jobs:
  build-cache:
    runs-on: ubuntu-latest
    timeout-minutes: 45
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 1
      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
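      # The Parquet-reading steps below assume requirements.txt pins pandas and
      # a Parquet engine (e.g. pyarrow).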
      - name: Check if cache needs rebuild
        id: check_cache
        run: |
          FORCE="${{ github.event.inputs.force_rebuild }}"
          if [ "$FORCE" = "true" ] || [ ! -f "card_files/similarity_cache.parquet" ]; then
            echo "needs_build=true" >> $GITHUB_OUTPUT
            echo "Cache doesn't exist or force rebuild requested"
          else
            # Check cache age via metadata JSON
            CACHE_AGE_DAYS=$(python -c "
          import json
          from datetime import datetime
          from pathlib import Path
          metadata_path = Path('card_files/similarity_cache_metadata.json')
          if metadata_path.exists():
              with open(metadata_path) as f:
                  data = json.load(f)
              build_date = data.get('build_date')
              if build_date:
                  age = (datetime.now() - datetime.fromisoformat(build_date)).days
                  print(age)
              else:
                  print(999)
          else:
              print(999)
          " || echo "999")
            if [ "$CACHE_AGE_DAYS" -gt 7 ]; then
              echo "needs_build=true" >> $GITHUB_OUTPUT
              echo "Cache is $CACHE_AGE_DAYS days old, rebuilding"
            else
              echo "needs_build=false" >> $GITHUB_OUTPUT
              echo "Cache is only $CACHE_AGE_DAYS days old, skipping"
            fi
          fi
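      # All build steps below are gated on needs_build, so a scheduled run
      # against a fresh cache exits without doing any work.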
      - name: Run initial setup
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          python -c "from code.file_setup.setup import initial_setup; initial_setup()"
      - name: Run tagging (serial for CI reliability)
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          python -c "from code.tagging.tagger import run_tagging; run_tagging(parallel=False)"
          # Verify tagging completed
          if [ ! -f "card_files/processed/.tagging_complete.json" ]; then
            echo "ERROR: Tagging completion flag not found"
            exit 1
          fi
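      # Note: pandas returns numpy.ndarray (not list) for Parquet array columns,
      # which is why the inline checks below use hasattr/len rather than
      # isinstance(..., list).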
      - name: Debug - Inspect Parquet file after tagging
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          python -c "
          import pandas as pd
          from pathlib import Path
          from code.path_util import get_processed_cards_path
          parquet_path = Path(get_processed_cards_path())
          print(f'Reading Parquet file: {parquet_path}')
          print(f'File exists: {parquet_path.exists()}')
          if not parquet_path.exists():
              raise FileNotFoundError(f'Parquet file not found: {parquet_path}')
          df = pd.read_parquet(parquet_path)
          print(f'Loaded {len(df)} rows from Parquet file')
          print(f'Columns: {list(df.columns)}')
          print('')
          # Show the first 5 rows in full
          print('First 5 complete rows:')
          print('=' * 100)
          for idx, row in df.head(5).iterrows():
              print(f'Row {idx}:')
              for col in df.columns:
                  value = row[col]
                  if isinstance(value, (list, tuple)) or hasattr(value, '__array__'):
                      # For array-like values, show type and length
                      try:
                          length = len(value)
                          print(f'  {col}: {type(value).__name__}[{length}] = {value}')
                      except Exception:
                          print(f'  {col}: {type(value).__name__} = {value}')
                  else:
                      print(f'  {col}: {value}')
              print('-' * 100)
          "
      - name: Generate theme catalog
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          if [ ! -f "config/themes/theme_catalog.csv" ]; then
            echo "Theme catalog not found, generating..."
            python -m code.scripts.generate_theme_catalog
          else
            echo "Theme catalog already exists, skipping generation"
          fi
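      # Sanity threshold: a healthy tagging run yields well over 10,000 theme
      # tags, so a lower count indicates tags were dropped (e.g. numpy arrays
      # read as empty).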
      - name: Verify theme catalog and tag statistics
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          # Detailed check of what tags were actually written
          python -c "
          import pandas as pd
          from code.path_util import get_processed_cards_path
          df = pd.read_parquet(get_processed_cards_path())
          # Helper to count tags (handles both list and numpy array)
          def count_tags(x):
              if x is None:
                  return 0
              if hasattr(x, '__len__'):
                  try:
                      return len(x)
                  except Exception:
                      return 0
              return 0
          # Count tags across a small sample first
          total_tags = 0
          cards_with_tags = 0
          sample_cards = []
          for idx, row in df.head(10).iterrows():
              name = row['name']
              tags = row['themeTags']
              tag_count = count_tags(tags)
              total_tags += tag_count
              if tag_count > 0:
                  cards_with_tags += 1
              sample_cards.append(f'{name}: {tag_count} tags')
          print('Sample of first 10 cards:')
          for card in sample_cards:
              print(f'  {card}')
          # Full count
          all_tags = df['themeTags'].apply(count_tags).sum()
          all_with_tags = (df['themeTags'].apply(count_tags) > 0).sum()
          print('')
          print(f'Total cards: {len(df):,}')
          print(f'Cards with tags: {all_with_tags:,}')
          print(f'Total theme tags: {all_tags:,}')
          if all_tags < 10000:
              raise ValueError(f'Only {all_tags} tags found, expected >10k')
          "
      - name: Build similarity cache (Parquet) from card_files/processed/all_cards.parquet
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          python -m code.scripts.build_similarity_cache_parquet --parallel --checkpoint-interval 1000 --force
      - name: Verify cache was created
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          if [ ! -f "card_files/similarity_cache.parquet" ]; then
            echo "ERROR: Cache Parquet file was not created"
            exit 1
          fi
          if [ ! -f "card_files/similarity_cache_metadata.json" ]; then
            echo "ERROR: Cache metadata file was not created"
            exit 1
          fi
          # Check cache validity
          python -c "
          import json
          from pathlib import Path
          from code.web.services.similarity_cache import get_cache
          cache = get_cache()
          stats = cache.get_stats()
          if stats['total_cards'] < 20000:
              raise ValueError(f\"Cache only has {stats['total_cards']} cards, expected ~30k\")
          print(f\"✓ Cache is valid with {stats['total_cards']:,} cards, {stats['total_entries']:,} entries\")
          print(f\"  File size: {stats['file_size_mb']:.2f} MB\")
          "
      - name: Get cache metadata for commit message
        if: steps.check_cache.outputs.needs_build == 'true'
        id: cache_meta
        run: |
          METADATA=$(python -c "
          import json
          from pathlib import Path
          from code.web.services.similarity_cache import get_cache
          cache = get_cache()
          stats = cache.get_stats()
          metadata = cache._metadata or {}
          build_date = metadata.get('build_date', 'unknown')
          print(f\"{stats['total_cards']} cards, {stats['total_entries']} entries, {stats['file_size_mb']:.1f}MB, built {build_date}\")
          ")
          echo "metadata=$METADATA" >> $GITHUB_OUTPUT
      - name: Commit and push cache
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          git config --local user.email "github-actions[bot]@users.noreply.github.com"
          git config --local user.name "github-actions[bot]"
          # Fetch all branches
          git fetch origin
          # Try to check out the existing branch, or create a new orphan branch
          if git ls-remote --heads origin similarity-cache-data | grep similarity-cache-data; then
            echo "Checking out existing similarity-cache-data branch..."
            git checkout similarity-cache-data
          else
            echo "Creating new orphan branch similarity-cache-data..."
            git checkout --orphan similarity-cache-data
            git rm -rf . || true
            # Create a minimal README for the branch
            echo "# Similarity Cache Data" > README.md
            echo "This branch contains pre-built similarity cache files for the MTG Deckbuilder." >> README.md
            echo "Updated automatically by GitHub Actions." >> README.md
            echo "" >> README.md
            echo "## Files" >> README.md
            echo "- \`card_files/similarity_cache.parquet\` - Pre-computed card similarity cache" >> README.md
            echo "- \`card_files/similarity_cache_metadata.json\` - Cache metadata" >> README.md
            echo "- \`card_files/processed/all_cards.parquet\` - Tagged card database" >> README.md
            echo "- \`card_files/processed/.tagging_complete.json\` - Tagging status" >> README.md
          fi
          # Ensure directories exist
          mkdir -p card_files/processed
          # Add similarity cache files (use -f to override .gitignore)
          git add -f card_files/similarity_cache.parquet
          git add -f card_files/similarity_cache_metadata.json
          # Add processed Parquet and status file
          git add -f card_files/processed/all_cards.parquet
          git add -f card_files/processed/.tagging_complete.json
          git add README.md 2>/dev/null || true
          # Check if there are changes to commit
          if git diff --staged --quiet; then
            echo "No changes to commit"
          else
            git commit -m "chore: update similarity cache [${{ steps.cache_meta.outputs.metadata }}]"
            git push origin similarity-cache-data --force
          fi
      - name: Summary
        if: always()
        run: |
          if [ "${{ steps.check_cache.outputs.needs_build }}" = "true" ]; then
            echo "✓ Similarity cache built and committed"
            echo "  Metadata: ${{ steps.cache_meta.outputs.metadata }}"
          else
            echo "⊘ Cache is recent, no rebuild needed"
          fi