mtg_python_deckbuilder/.github/workflows/build-similarity-cache.yml
matt bff64de370 fix: systematically handle numpy arrays from Parquet files across codebase
- Add ensure_theme_tags_list() utility to builder_utils for simpler numpy array handling
- Update phase3_creatures.py: 6 locations now use bu.ensure_theme_tags_list()
- Update phase4_spells.py: 9 locations now use bu.ensure_theme_tags_list()
- Update tagger.py: 2 locations use hasattr/list() for numpy compatibility
- Update extract_themes.py: 2 locations use hasattr/list() for numpy compatibility
- Fix build-similarity-cache.yml verification script to handle numpy arrays
- Enhance workflow debug output to show complete row data

Parquet files return numpy.ndarray objects for array columns, not Python lists.
The M4 migration added numpy support to the canonical parse_theme_tags() in builder_utils,
but many parts of the codebase still used isinstance(..., list) checks that fail on numpy arrays.
This commit systematically replaces all 19 instances with proper numpy array handling.

Fixes the GitHub Actions workflow error 'RuntimeError: No theme tags found' and the accompanying verification failures.
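A minimal sketch of what such a helper might look like (hypothetical; the real ensure_theme_tags_list() in builder_utils may differ in signature and edge-case handling):

    # Hypothetical sketch -- the actual builder_utils.ensure_theme_tags_list() may differ.
    import numpy as np

    def ensure_theme_tags_list(value):
        """Coerce a themeTags cell to a plain Python list."""
        if value is None:
            return []
        if isinstance(value, np.ndarray):
            return value.tolist()  # Parquet array columns arrive as numpy arrays
        if isinstance(value, (list, tuple, set)):
            return list(value)
        return [value]  # single scalar tag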
2025-10-18 22:47:09 -07:00

name: Build Similarity Cache
# Manual trigger + weekly schedule + callable from other workflows
on:
  workflow_dispatch:
    inputs:
      force_rebuild:
        description: 'Force rebuild even if cache exists'
        required: false
        type: boolean
        default: true
  workflow_call:  # Allow this workflow to be called by other workflows
  schedule:
    # Run every Sunday at 2 AM UTC
    - cron: '0 2 * * 0'
jobs:
  build-cache:
    runs-on: ubuntu-latest
    timeout-minutes: 45
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 1
      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
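      # The Parquet-reading steps below assume requirements.txt pins pandas and
      # a Parquet engine (e.g. pyarrow).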
      - name: Check if cache needs rebuild
        id: check_cache
        run: |
          FORCE="${{ github.event.inputs.force_rebuild }}"
          if [ "$FORCE" = "true" ] || [ ! -f "card_files/similarity_cache.parquet" ]; then
            echo "needs_build=true" >> $GITHUB_OUTPUT
            echo "Cache doesn't exist or force rebuild requested"
          else
            # Check cache age via metadata JSON
            CACHE_AGE_DAYS=$(python -c "
          import json
          from datetime import datetime
          from pathlib import Path
          metadata_path = Path('card_files/similarity_cache_metadata.json')
          if metadata_path.exists():
              with open(metadata_path) as f:
                  data = json.load(f)
              build_date = data.get('build_date')
              if build_date:
                  age = (datetime.now() - datetime.fromisoformat(build_date)).days
                  print(age)
              else:
                  print(999)
          else:
              print(999)
          " || echo "999")
            if [ "$CACHE_AGE_DAYS" -gt 7 ]; then
              echo "needs_build=true" >> $GITHUB_OUTPUT
              echo "Cache is $CACHE_AGE_DAYS days old, rebuilding"
            else
              echo "needs_build=false" >> $GITHUB_OUTPUT
              echo "Cache is only $CACHE_AGE_DAYS days old, skipping"
            fi
          fi
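      # All build steps below are gated on needs_build, so a scheduled run
      # against a fresh cache exits without doing any work.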
      - name: Run initial setup
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          python -c "from code.file_setup.setup import initial_setup; initial_setup()"
      - name: Run tagging (serial for CI reliability)
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          python -c "from code.tagging.tagger import run_tagging; run_tagging(parallel=False)"
          # Verify tagging completed
          if [ ! -f "card_files/processed/.tagging_complete.json" ]; then
            echo "ERROR: Tagging completion flag not found"
            exit 1
          fi
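      # Note: pandas returns numpy.ndarray (not list) for Parquet array columns,
      # which is why the inline checks below use hasattr/len rather than
      # isinstance(..., list).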
      - name: Debug - Inspect Parquet file after tagging
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          python -c "
          import pandas as pd
          from pathlib import Path
          from code.path_util import get_processed_cards_path
          parquet_path = Path(get_processed_cards_path())
          print(f'Reading Parquet file: {parquet_path}')
          print(f'File exists: {parquet_path.exists()}')
          if not parquet_path.exists():
              raise FileNotFoundError(f'Parquet file not found: {parquet_path}')
          df = pd.read_parquet(parquet_path)
          print(f'Loaded {len(df)} rows from Parquet file')
          print(f'Columns: {list(df.columns)}')
          print('')
          # Show the first 5 rows in full
          print('First 5 complete rows:')
          print('=' * 100)
          for idx, row in df.head(5).iterrows():
              print(f'Row {idx}:')
              for col in df.columns:
                  value = row[col]
                  if isinstance(value, (list, tuple)) or hasattr(value, '__array__'):
                      # For array-like values, show type and length
                      try:
                          length = len(value)
                          print(f'  {col}: {type(value).__name__}[{length}] = {value}')
                      except Exception:
                          print(f'  {col}: {type(value).__name__} = {value}')
                  else:
                      print(f'  {col}: {value}')
              print('-' * 100)
          "
      - name: Generate theme catalog
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          if [ ! -f "config/themes/theme_catalog.csv" ]; then
            echo "Theme catalog not found, generating..."
            python -m code.scripts.generate_theme_catalog
          else
            echo "Theme catalog already exists, skipping generation"
          fi
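      # Sanity threshold: a healthy tagging run yields well over 10,000 theme
      # tags, so a lower count indicates tags were dropped (e.g. numpy arrays
      # read as empty).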
      - name: Verify theme catalog and tag statistics
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          # Detailed check of what tags were actually written
          python -c "
          import pandas as pd
          from code.path_util import get_processed_cards_path
          df = pd.read_parquet(get_processed_cards_path())
          # Helper to count tags (handles both list and numpy array)
          def count_tags(x):
              if x is None:
                  return 0
              if hasattr(x, '__len__'):
                  try:
                      return len(x)
                  except Exception:
                      return 0
              return 0
          # Count tags across a small sample first
          total_tags = 0
          cards_with_tags = 0
          sample_cards = []
          for idx, row in df.head(10).iterrows():
              name = row['name']
              tags = row['themeTags']
              tag_count = count_tags(tags)
              total_tags += tag_count
              if tag_count > 0:
                  cards_with_tags += 1
              sample_cards.append(f'{name}: {tag_count} tags')
          print('Sample of first 10 cards:')
          for card in sample_cards:
              print(f'  {card}')
          # Full count
          all_tags = df['themeTags'].apply(count_tags).sum()
          all_with_tags = (df['themeTags'].apply(count_tags) > 0).sum()
          print('')
          print(f'Total cards: {len(df):,}')
          print(f'Cards with tags: {all_with_tags:,}')
          print(f'Total theme tags: {all_tags:,}')
          if all_tags < 10000:
              raise ValueError(f'Only {all_tags} tags found, expected >10k')
          "
      - name: Build similarity cache (Parquet) from card_files/processed/all_cards.parquet
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          python -m code.scripts.build_similarity_cache_parquet --parallel --checkpoint-interval 1000 --force
      - name: Verify cache was created
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          if [ ! -f "card_files/similarity_cache.parquet" ]; then
            echo "ERROR: Cache Parquet file was not created"
            exit 1
          fi
          if [ ! -f "card_files/similarity_cache_metadata.json" ]; then
            echo "ERROR: Cache metadata file was not created"
            exit 1
          fi
          # Check cache validity
          python -c "
          import json
          from pathlib import Path
          from code.web.services.similarity_cache import get_cache
          cache = get_cache()
          stats = cache.get_stats()
          if stats['total_cards'] < 20000:
              raise ValueError(f\"Cache only has {stats['total_cards']} cards, expected ~30k\")
          print(f\"✓ Cache is valid with {stats['total_cards']:,} cards, {stats['total_entries']:,} entries\")
          print(f\"  File size: {stats['file_size_mb']:.2f} MB\")
          "
      - name: Get cache metadata for commit message
        if: steps.check_cache.outputs.needs_build == 'true'
        id: cache_meta
        run: |
          METADATA=$(python -c "
          import json
          from pathlib import Path
          from code.web.services.similarity_cache import get_cache
          cache = get_cache()
          stats = cache.get_stats()
          metadata = cache._metadata or {}
          build_date = metadata.get('build_date', 'unknown')
          print(f\"{stats['total_cards']} cards, {stats['total_entries']} entries, {stats['file_size_mb']:.1f}MB, built {build_date}\")
          ")
          echo "metadata=$METADATA" >> $GITHUB_OUTPUT
      - name: Commit and push cache
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          git config --local user.email "github-actions[bot]@users.noreply.github.com"
          git config --local user.name "github-actions[bot]"
          # Fetch all branches
          git fetch origin
          # Try to check out the existing branch, or create a new orphan branch
          if git ls-remote --heads origin similarity-cache-data | grep similarity-cache-data; then
            echo "Checking out existing similarity-cache-data branch..."
            git checkout similarity-cache-data
          else
            echo "Creating new orphan branch similarity-cache-data..."
            git checkout --orphan similarity-cache-data
            git rm -rf . || true
            # Create a minimal README for the branch
            echo "# Similarity Cache Data" > README.md
            echo "This branch contains pre-built similarity cache files for the MTG Deckbuilder." >> README.md
            echo "Updated automatically by GitHub Actions." >> README.md
            echo "" >> README.md
            echo "## Files" >> README.md
            echo "- \`card_files/similarity_cache.parquet\` - Pre-computed card similarity cache" >> README.md
            echo "- \`card_files/similarity_cache_metadata.json\` - Cache metadata" >> README.md
            echo "- \`card_files/processed/all_cards.parquet\` - Tagged card database" >> README.md
            echo "- \`card_files/processed/.tagging_complete.json\` - Tagging status" >> README.md
          fi
          # Ensure directories exist
          mkdir -p card_files/processed
          # Add similarity cache files (use -f to override .gitignore)
          git add -f card_files/similarity_cache.parquet
          git add -f card_files/similarity_cache_metadata.json
          # Add processed Parquet and status file
          git add -f card_files/processed/all_cards.parquet
          git add -f card_files/processed/.tagging_complete.json
          git add README.md 2>/dev/null || true
          # Check if there are changes to commit
          if git diff --staged --quiet; then
            echo "No changes to commit"
          else
            git commit -m "chore: update similarity cache [${{ steps.cache_meta.outputs.metadata }}]"
            git push origin similarity-cache-data --force
          fi
      - name: Summary
        if: always()
        run: |
          if [ "${{ steps.check_cache.outputs.needs_build }}" = "true" ]; then
            echo "✓ Similarity cache built and committed"
            echo "  Metadata: ${{ steps.cache_meta.outputs.metadata }}"
          else
            echo "⊘ Cache is recent, no rebuild needed"
          fi