mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2025-12-16 15:40:12 +01:00
Fix numpy array handling across the deckbuilder:
- Add ensure_theme_tags_list() utility to builder_utils for simpler numpy array handling
- Update phase3_creatures.py: 6 locations now use bu.ensure_theme_tags_list()
- Update phase4_spells.py: 9 locations now use bu.ensure_theme_tags_list()
- Update tagger.py: 2 locations use hasattr/list() for numpy compatibility
- Update extract_themes.py: 2 locations use hasattr/list() for numpy compatibility
- Fix build-similarity-cache.yml verification script to handle numpy arrays
- Enhance workflow debug output to show complete row data

Parquet files return numpy.ndarray objects for array columns, not Python lists. The M4 migration added numpy support to the canonical parse_theme_tags() in builder_utils, but many parts of the codebase still used isinstance(list) checks that fail with arrays. This commit systematically replaces all 19 instances with proper numpy array handling.

Fixes the GitHub Actions workflow error "RuntimeError: No theme tags found" and the verification failures.
301 lines
11 KiB
YAML
---
name: Build Similarity Cache

# Manual trigger + weekly schedule + callable from other workflows
on:
  workflow_dispatch:
    inputs:
      force_rebuild:
        description: 'Force rebuild even if cache exists'
        required: false
        type: boolean
        default: true
  workflow_call: # Allow this workflow to be called by other workflows
  schedule:
    # Run every Sunday at 2 AM UTC
    - cron: '0 2 * * 0'
jobs:
  build-cache:
    runs-on: ubuntu-latest
    # Generous ceiling: full tagging + similarity-cache build can be slow in CI
    timeout-minutes: 45
steps:
|
|
- name: Checkout repository
|
|
uses: actions/checkout@v4
|
|
with:
|
|
fetch-depth: 1
|
|
|
|
- name: Set up Python 3.11
|
|
uses: actions/setup-python@v5
|
|
with:
|
|
python-version: '3.11'
|
|
cache: 'pip'
|
|
|
|
- name: Install dependencies
|
|
run: |
|
|
python -m pip install --upgrade pip
|
|
pip install -r requirements.txt
|
|
|
|
- name: Check if cache needs rebuild
|
|
id: check_cache
|
|
run: |
|
|
FORCE="${{ github.event.inputs.force_rebuild }}"
|
|
if [ "$FORCE" = "true" ] || [ ! -f "card_files/similarity_cache.parquet" ]; then
|
|
echo "needs_build=true" >> $GITHUB_OUTPUT
|
|
echo "Cache doesn't exist or force rebuild requested"
|
|
else
|
|
# Check cache age via metadata JSON
|
|
CACHE_AGE_DAYS=$(python -c "
|
|
import json
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
metadata_path = Path('card_files/similarity_cache_metadata.json')
|
|
if metadata_path.exists():
|
|
with open(metadata_path) as f:
|
|
data = json.load(f)
|
|
build_date = data.get('build_date')
|
|
if build_date:
|
|
age = (datetime.now() - datetime.fromisoformat(build_date)).days
|
|
print(age)
|
|
else:
|
|
print(999)
|
|
else:
|
|
print(999)
|
|
" || echo "999")
|
|
|
|
if [ "$CACHE_AGE_DAYS" -gt 7 ]; then
|
|
echo "needs_build=true" >> $GITHUB_OUTPUT
|
|
echo "Cache is $CACHE_AGE_DAYS days old, rebuilding"
|
|
else
|
|
echo "needs_build=false" >> $GITHUB_OUTPUT
|
|
echo "Cache is only $CACHE_AGE_DAYS days old, skipping"
|
|
fi
|
|
fi
|
|
|
|
- name: Run initial setup
|
|
if: steps.check_cache.outputs.needs_build == 'true'
|
|
run: |
|
|
python -c "from code.file_setup.setup import initial_setup; initial_setup()"
|
|
|
|
- name: Run tagging (serial for CI reliability)
|
|
if: steps.check_cache.outputs.needs_build == 'true'
|
|
run: |
|
|
python -c "from code.tagging.tagger import run_tagging; run_tagging(parallel=False)"
|
|
|
|
# Verify tagging completed
|
|
if [ ! -f "card_files/processed/.tagging_complete.json" ]; then
|
|
echo "ERROR: Tagging completion flag not found"
|
|
exit 1
|
|
fi
|
|
|
|
- name: Debug - Inspect Parquet file after tagging
|
|
if: steps.check_cache.outputs.needs_build == 'true'
|
|
run: |
|
|
python -c "
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
from code.path_util import get_processed_cards_path
|
|
|
|
parquet_path = Path(get_processed_cards_path())
|
|
print(f'Reading Parquet file: {parquet_path}')
|
|
print(f'File exists: {parquet_path.exists()}')
|
|
|
|
if not parquet_path.exists():
|
|
raise FileNotFoundError(f'Parquet file not found: {parquet_path}')
|
|
|
|
df = pd.read_parquet(parquet_path)
|
|
print(f'Loaded {len(df)} rows from Parquet file')
|
|
print(f'Columns: {list(df.columns)}')
|
|
print('')
|
|
|
|
# Show first 5 rows completely
|
|
print('First 5 complete rows:')
|
|
print('=' * 100)
|
|
for idx, row in df.head(5).iterrows():
|
|
print(f'Row {idx}:')
|
|
for col in df.columns:
|
|
value = row[col]
|
|
if isinstance(value, (list, tuple)) or hasattr(value, '__array__'):
|
|
# For array-like, show type and length
|
|
try:
|
|
length = len(value)
|
|
print(f' {col}: {type(value).__name__}[{length}] = {value}')
|
|
except:
|
|
print(f' {col}: {type(value).__name__} = {value}')
|
|
else:
|
|
print(f' {col}: {value}')
|
|
print('-' * 100)
|
|
"
|
|
|
|
- name: Generate theme catalog
|
|
if: steps.check_cache.outputs.needs_build == 'true'
|
|
run: |
|
|
if [ ! -f "config/themes/theme_catalog.csv" ]; then
|
|
echo "Theme catalog not found, generating..."
|
|
python -m code.scripts.generate_theme_catalog
|
|
else
|
|
echo "Theme catalog already exists, skipping generation"
|
|
fi
|
|
|
|
- name: Verify theme catalog and tag statistics
|
|
if: steps.check_cache.outputs.needs_build == 'true'
|
|
run: |
|
|
# Detailed check of what tags were actually written
|
|
python -c "
|
|
import pandas as pd
|
|
from code.path_util import get_processed_cards_path
|
|
df = pd.read_parquet(get_processed_cards_path())
|
|
|
|
# Helper to count tags (handles both list and numpy array)
|
|
def count_tags(x):
|
|
if x is None:
|
|
return 0
|
|
if hasattr(x, '__len__'):
|
|
try:
|
|
return len(x)
|
|
except:
|
|
return 0
|
|
return 0
|
|
|
|
# Count total tags
|
|
total_tags = 0
|
|
cards_with_tags = 0
|
|
sample_cards = []
|
|
|
|
for idx, row in df.head(10).iterrows():
|
|
name = row['name']
|
|
tags = row['themeTags']
|
|
tag_count = count_tags(tags)
|
|
total_tags += tag_count
|
|
if tag_count > 0:
|
|
cards_with_tags += 1
|
|
sample_cards.append(f'{name}: {tag_count} tags')
|
|
|
|
print(f'Sample of first 10 cards:')
|
|
for card in sample_cards:
|
|
print(f' {card}')
|
|
|
|
# Full count
|
|
all_tags = df['themeTags'].apply(count_tags).sum()
|
|
all_with_tags = (df['themeTags'].apply(count_tags) > 0).sum()
|
|
|
|
print(f'')
|
|
print(f'Total cards: {len(df):,}')
|
|
print(f'Cards with tags: {all_with_tags:,}')
|
|
print(f'Total theme tags: {all_tags:,}')
|
|
|
|
if all_tags < 10000:
|
|
raise ValueError(f'Only {all_tags} tags found, expected >10k')
|
|
"
|
|
|
|
- name: Build similarity cache (Parquet) from card_files/processed/all_cards.parquet
|
|
if: steps.check_cache.outputs.needs_build == 'true'
|
|
run: |
|
|
python -m code.scripts.build_similarity_cache_parquet --parallel --checkpoint-interval 1000 --force
|
|
|
|
- name: Verify cache was created
|
|
if: steps.check_cache.outputs.needs_build == 'true'
|
|
run: |
|
|
if [ ! -f "card_files/similarity_cache.parquet" ]; then
|
|
echo "ERROR: Cache Parquet file was not created"
|
|
exit 1
|
|
fi
|
|
if [ ! -f "card_files/similarity_cache_metadata.json" ]; then
|
|
echo "ERROR: Cache metadata file was not created"
|
|
exit 1
|
|
fi
|
|
|
|
# Check cache validity
|
|
python -c "
|
|
import json
|
|
from pathlib import Path
|
|
from code.web.services.similarity_cache import get_cache
|
|
|
|
cache = get_cache()
|
|
stats = cache.get_stats()
|
|
|
|
if stats['total_cards'] < 20000:
|
|
raise ValueError(f\"Cache only has {stats['total_cards']} cards, expected ~30k\")
|
|
|
|
print(f\"✓ Cache is valid with {stats['total_cards']:,} cards, {stats['total_entries']:,} entries\")
|
|
print(f\" File size: {stats['file_size_mb']:.2f} MB\")
|
|
"
|
|
|
|
- name: Get cache metadata for commit message
|
|
if: steps.check_cache.outputs.needs_build == 'true'
|
|
id: cache_meta
|
|
run: |
|
|
METADATA=$(python -c "
|
|
import json
|
|
from pathlib import Path
|
|
from code.web.services.similarity_cache import get_cache
|
|
|
|
cache = get_cache()
|
|
stats = cache.get_stats()
|
|
metadata = cache._metadata or {}
|
|
|
|
build_date = metadata.get('build_date', 'unknown')
|
|
print(f\"{stats['total_cards']} cards, {stats['total_entries']} entries, {stats['file_size_mb']:.1f}MB, built {build_date}\")
|
|
")
|
|
echo "metadata=$METADATA" >> $GITHUB_OUTPUT
|
|
|
|
- name: Commit and push cache
|
|
if: steps.check_cache.outputs.needs_build == 'true'
|
|
run: |
|
|
git config --local user.email "github-actions[bot]@users.noreply.github.com"
|
|
git config --local user.name "github-actions[bot]"
|
|
|
|
# Fetch all branches
|
|
git fetch origin
|
|
|
|
# Try to checkout existing branch, or create new orphan branch
|
|
if git ls-remote --heads origin similarity-cache-data | grep similarity-cache-data; then
|
|
echo "Checking out existing similarity-cache-data branch..."
|
|
git checkout similarity-cache-data
|
|
else
|
|
echo "Creating new orphan branch similarity-cache-data..."
|
|
git checkout --orphan similarity-cache-data
|
|
git rm -rf . || true
|
|
# Create minimal README for the branch
|
|
echo "# Similarity Cache Data" > README.md
|
|
echo "This branch contains pre-built similarity cache files for the MTG Deckbuilder." >> README.md
|
|
echo "Updated automatically by GitHub Actions." >> README.md
|
|
echo "" >> README.md
|
|
echo "## Files" >> README.md
|
|
echo "- \`card_files/similarity_cache.parquet\` - Pre-computed card similarity cache" >> README.md
|
|
echo "- \`card_files/similarity_cache_metadata.json\` - Cache metadata" >> README.md
|
|
echo "- \`card_files/processed/all_cards.parquet\` - Tagged card database" >> README.md
|
|
echo "- \`card_files/processed/.tagging_complete.json\` - Tagging status" >> README.md
|
|
fi
|
|
|
|
# Ensure directories exist
|
|
mkdir -p card_files/processed
|
|
|
|
# Add similarity cache files (use -f to override .gitignore)
|
|
git add -f card_files/similarity_cache.parquet
|
|
git add -f card_files/similarity_cache_metadata.json
|
|
|
|
# Add processed Parquet and status file
|
|
git add -f card_files/processed/all_cards.parquet
|
|
git add -f card_files/processed/.tagging_complete.json
|
|
|
|
git add README.md 2>/dev/null || true
|
|
|
|
# Check if there are changes to commit
|
|
if git diff --staged --quiet; then
|
|
echo "No changes to commit"
|
|
else
|
|
git commit -m "chore: update similarity cache [${{ steps.cache_meta.outputs.metadata }}]"
|
|
git push origin similarity-cache-data --force
|
|
fi
|
|
|
|
- name: Summary
|
|
if: always()
|
|
run: |
|
|
if [ "${{ steps.check_cache.outputs.needs_build }}" = "true" ]; then
|
|
echo "✓ Similarity cache built and committed"
|
|
echo " Metadata: ${{ steps.cache_meta.outputs.metadata }}"
|
|
else
|
|
echo "⊘ Cache is recent, no rebuild needed"
|
|
fi
|