---
name: Build Similarity Cache

# Manual trigger + weekly schedule + callable from other workflows
on:
  workflow_dispatch:
    inputs:
      force_rebuild:
        description: 'Force rebuild even if cache exists'
        required: false
        type: boolean
        # NOTE(review): default true means every manual dispatch skips the
        # cache-age check entirely — confirm this is intended; false would
        # make manual runs honor the 7-day freshness logic below.
        default: true
  workflow_call: {}  # Allow this workflow to be called by other workflows
  schedule:
    # Run every Sunday at 2 AM UTC
    - cron: '0 2 * * 0'

# The final step pushes a commit with the default GITHUB_TOKEN; under the
# restricted default token (contents: read) that push fails, so request
# write access explicitly.
permissions:
  contents: write

jobs:
  build-cache:
    runs-on: ubuntu-latest
    timeout-minutes: 45

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Decide whether a rebuild is needed: forced, cache missing, or
      # cache older than 7 days (per its metadata JSON).
      - name: Check if cache needs rebuild
        id: check_cache
        run: |
          FORCE="${{ github.event.inputs.force_rebuild }}"
          if [ "$FORCE" = "true" ] || [ ! -f "card_files/similarity_cache.parquet" ]; then
            echo "needs_build=true" >> "$GITHUB_OUTPUT"
            echo "Cache doesn't exist or force rebuild requested"
          else
            # Check cache age via metadata JSON (999 = unknown/unreadable,
            # which always triggers a rebuild).
            CACHE_AGE_DAYS=$(python -c "
          import json
          from datetime import datetime
          from pathlib import Path
          metadata_path = Path('card_files/similarity_cache_metadata.json')
          if metadata_path.exists():
              with open(metadata_path) as f:
                  data = json.load(f)
              build_date = data.get('build_date')
              if build_date:
                  age = (datetime.now() - datetime.fromisoformat(build_date)).days
                  print(age)
              else:
                  print(999)
          else:
              print(999)
          " || echo "999")
            if [ "$CACHE_AGE_DAYS" -gt 7 ]; then
              echo "needs_build=true" >> "$GITHUB_OUTPUT"
              echo "Cache is $CACHE_AGE_DAYS days old, rebuilding"
            else
              echo "needs_build=false" >> "$GITHUB_OUTPUT"
              echo "Cache is only $CACHE_AGE_DAYS days old, skipping"
            fi
          fi

      - name: Run initial setup
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          python -c "from code.file_setup.setup import initial_setup; initial_setup()"

      - name: Run tagging (serial for CI reliability)
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          python -c "from code.tagging.tagger import run_tagging; run_tagging(parallel=False)"

          # Verify tagging completed
          if [ ! -f "card_files/processed/.tagging_complete.json" ]; then
            echo "ERROR: Tagging completion flag not found"
            exit 1
          fi

      # Debug step - uncomment if needed to inspect Parquet file contents
      # - name: Debug - Inspect Parquet file after tagging
      #   if: steps.check_cache.outputs.needs_build == 'true'
      #   run: |
      #     python -c "
      #     import pandas as pd
      #     from pathlib import Path
      #     from code.path_util import get_processed_cards_path
      #
      #     parquet_path = Path(get_processed_cards_path())
      #     print(f'Reading Parquet file: {parquet_path}')
      #     print(f'File exists: {parquet_path.exists()}')
      #
      #     if not parquet_path.exists():
      #         raise FileNotFoundError(f'Parquet file not found: {parquet_path}')
      #
      #     df = pd.read_parquet(parquet_path)
      #     print(f'Loaded {len(df)} rows from Parquet file')
      #     print(f'Columns: {list(df.columns)}')
      #     print('')
      #
      #     # Show first 5 rows completely
      #     print('First 5 complete rows:')
      #     print('=' * 100)
      #     for idx, row in df.head(5).iterrows():
      #         print(f'Row {idx}:')
      #         for col in df.columns:
      #             value = row[col]
      #             if isinstance(value, (list, tuple)) or hasattr(value, '__array__'):
      #                 # For array-like, show type and length
      #                 try:
      #                     length = len(value)
      #                     print(f'  {col}: {type(value).__name__}[{length}] = {value}')
      #                 except:
      #                     print(f'  {col}: {type(value).__name__} = {value}')
      #             else:
      #                 print(f'  {col}: {value}')
      #         print('-' * 100)
      #     "

      - name: Generate theme catalog
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          if [ ! -f "config/themes/theme_catalog.csv" ]; then
            echo "Theme catalog not found, generating..."
            python -m code.scripts.generate_theme_catalog
          else
            echo "Theme catalog already exists, skipping generation"
          fi

      # Sanity-check the tagging output before the expensive cache build:
      # fails the job if fewer than 10k theme tags were written.
      - name: Verify theme catalog and tag statistics
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          # Detailed check of what tags were actually written
          python -c "
          import pandas as pd
          from code.path_util import get_processed_cards_path

          df = pd.read_parquet(get_processed_cards_path())

          # Helper to count tags (handles both list and numpy array)
          def count_tags(x):
              if x is None:
                  return 0
              if hasattr(x, '__len__'):
                  try:
                      return len(x)
                  except:
                      return 0
              return 0

          # Count total tags
          total_tags = 0
          cards_with_tags = 0
          sample_cards = []
          for idx, row in df.head(10).iterrows():
              name = row['name']
              tags = row['themeTags']
              tag_count = count_tags(tags)
              total_tags += tag_count
              if tag_count > 0:
                  cards_with_tags += 1
              sample_cards.append(f'{name}: {tag_count} tags')

          print(f'Sample of first 10 cards:')
          for card in sample_cards:
              print(f'  {card}')

          # Full count
          all_tags = df['themeTags'].apply(count_tags).sum()
          all_with_tags = (df['themeTags'].apply(count_tags) > 0).sum()
          print(f'')
          print(f'Total cards: {len(df):,}')
          print(f'Cards with tags: {all_with_tags:,}')
          print(f'Total theme tags: {all_tags:,}')

          if all_tags < 10000:
              raise ValueError(f'Only {all_tags} tags found, expected >10k')
          "

      - name: Build similarity cache (Parquet) from card_files/processed/all_cards.parquet
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          python -m code.scripts.build_similarity_cache_parquet --parallel --checkpoint-interval 1000 --force

      - name: Verify cache was created
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          if [ ! -f "card_files/similarity_cache.parquet" ]; then
            echo "ERROR: Similarity cache not created"
            exit 1
          fi
          if [ ! -f "card_files/similarity_cache_metadata.json" ]; then
            echo "ERROR: Similarity cache metadata not created"
            exit 1
          fi
          if [ ! -f "card_files/processed/commander_cards.parquet" ]; then
            echo "ERROR: Commander cache not created"
            exit 1
          fi
          echo "✓ All cache files created successfully"

      - name: Get cache metadata for commit message
        if: steps.check_cache.outputs.needs_build == 'true'
        id: cache_meta
        run: |
          METADATA=$(python -c "
          import json
          from pathlib import Path
          from code.web.services.similarity_cache import get_cache

          cache = get_cache()
          stats = cache.get_stats()
          metadata = cache._metadata or {}
          build_date = metadata.get('build_date', 'unknown')
          print(f\"{stats['total_cards']} cards, {stats['total_entries']} entries, {stats['file_size_mb']:.1f}MB, built {build_date}\")
          ")
          echo "metadata=$METADATA" >> "$GITHUB_OUTPUT"

      # Publishes the cache artifacts as a single parentless commit,
      # force-pushed to the similarity-cache-data branch so that branch
      # only ever holds the latest cache snapshot.
      - name: Commit and push cache
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          git config --local user.email "github-actions[bot]@users.noreply.github.com"
          git config --local user.name "github-actions[bot]"

          # Create a detached commit with just the cache files (no checkout needed)
          echo "Creating cache-only commit..."

          # Create README for the branch
          cat > README-cache.md << 'EOF'
          # Similarity Cache Data

          This branch contains pre-built similarity cache files for the MTG Deckbuilder.
          Updated automatically by GitHub Actions.

          ## Files
          - `card_files/similarity_cache.parquet` - Pre-computed card similarity cache
          - `card_files/similarity_cache_metadata.json` - Cache metadata
          - `card_files/processed/all_cards.parquet` - Tagged card database
          - `card_files/processed/commander_cards.parquet` - Commander-only cache (fast lookups)
          - `card_files/processed/.tagging_complete.json` - Tagging status
          EOF

          # Start with clean index
          git rm -rf --cached . > /dev/null 2>&1 || true

          # Add cache files to index (use -f to override .gitignore)
          git add -f card_files/similarity_cache.parquet
          git add -f card_files/similarity_cache_metadata.json
          git add -f card_files/processed/all_cards.parquet
          git add -f card_files/processed/commander_cards.parquet
          git add -f card_files/processed/.tagging_complete.json
          git add -f README-cache.md

          # Create a new commit (parentless: commit-tree with no -p flags)
          TREE=$(git write-tree)
          COMMIT=$(git commit-tree $TREE -m "chore: update similarity cache [${{ steps.cache_meta.outputs.metadata }}]")
          echo "Created commit: $COMMIT"

          # Push directly to similarity-cache-data branch (force push)
          git push origin $COMMIT:refs/heads/similarity-cache-data --force

          echo "Successfully pushed cache to similarity-cache-data branch"

      - name: Summary
        if: always()
        run: |
          if [ "${{ steps.check_cache.outputs.needs_build }}" = "true" ]; then
            echo "✓ Similarity cache built and committed"
            echo "  Metadata: ${{ steps.cache_meta.outputs.metadata }}"
          else
            echo "⊘ Cache is recent, no rebuild needed"
          fi