# Mirror of https://github.com/mwisnowski/mtg_python_deckbuilder.git
# Synced 2025-12-16 15:40:12 +01:00 (244 lines, 9.4 KiB, YAML)
---
name: Build Similarity Cache

# Manual trigger + weekly schedule + callable from other workflows
on:
  workflow_dispatch:
    inputs:
      force_rebuild:
        description: 'Force rebuild even if cache exists'
        required: false
        type: boolean
        default: true
  # Allow this workflow to be called by other workflows
  workflow_call:
  schedule:
    # Run every Sunday at 2 AM UTC
    - cron: '0 2 * * 0'
jobs:
  build-cache:
    runs-on: ubuntu-latest
    # Cache build is long-running; cap it so a hung run doesn't burn CI minutes
    timeout-minutes: 45

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Shallow clone is enough; the push step fetches branches explicitly later
          fetch-depth: 1

      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'
- name: Install dependencies
|
|
run: |
|
|
python -m pip install --upgrade pip
|
|
pip install -r requirements.txt
|
|
|
|
- name: Check if cache needs rebuild
|
|
id: check_cache
|
|
run: |
|
|
FORCE="${{ github.event.inputs.force_rebuild }}"
|
|
if [ "$FORCE" = "true" ] || [ ! -f "card_files/similarity_cache.parquet" ]; then
|
|
echo "needs_build=true" >> $GITHUB_OUTPUT
|
|
echo "Cache doesn't exist or force rebuild requested"
|
|
else
|
|
# Check cache age via metadata JSON
|
|
CACHE_AGE_DAYS=$(python -c "
|
|
import json
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
metadata_path = Path('card_files/similarity_cache_metadata.json')
|
|
if metadata_path.exists():
|
|
with open(metadata_path) as f:
|
|
data = json.load(f)
|
|
build_date = data.get('build_date')
|
|
if build_date:
|
|
age = (datetime.now() - datetime.fromisoformat(build_date)).days
|
|
print(age)
|
|
else:
|
|
print(999)
|
|
else:
|
|
print(999)
|
|
" || echo "999")
|
|
|
|
if [ "$CACHE_AGE_DAYS" -gt 7 ]; then
|
|
echo "needs_build=true" >> $GITHUB_OUTPUT
|
|
echo "Cache is $CACHE_AGE_DAYS days old, rebuilding"
|
|
else
|
|
echo "needs_build=false" >> $GITHUB_OUTPUT
|
|
echo "Cache is only $CACHE_AGE_DAYS days old, skipping"
|
|
fi
|
|
fi
|
|
|
|
- name: Run initial setup
|
|
if: steps.check_cache.outputs.needs_build == 'true'
|
|
run: |
|
|
python -c "from code.file_setup.setup import initial_setup; initial_setup()"
|
|
|
|
- name: Run tagging (serial for CI reliability)
|
|
if: steps.check_cache.outputs.needs_build == 'true'
|
|
run: |
|
|
python -c "from code.tagging.tagger import run_tagging; run_tagging(parallel=False)"
|
|
|
|
# Verify tagging completed
|
|
if [ ! -f "card_files/processed/.tagging_complete.json" ]; then
|
|
echo "ERROR: Tagging completion flag not found"
|
|
exit 1
|
|
fi
|
|
|
|
# Verify theme catalog was generated
|
|
if [ ! -f "config/themes/theme_catalog.csv" ]; then
|
|
echo "WARNING: Theme catalog not found, generating..."
|
|
python -c "from code.deck_builder.theme_catalog_loader import generate_theme_catalog; generate_theme_catalog()"
|
|
fi
|
|
|
|
# Detailed check of what tags were actually written
|
|
python -c "
|
|
import pandas as pd
|
|
from code.path_util import get_processed_cards_path
|
|
df = pd.read_parquet(get_processed_cards_path())
|
|
|
|
# Count total tags
|
|
total_tags = 0
|
|
cards_with_tags = 0
|
|
sample_cards = []
|
|
|
|
for idx, row in df.head(10).iterrows():
|
|
name = row['name']
|
|
tags = row['themeTags']
|
|
tag_count = len(tags) if isinstance(tags, list) else 0
|
|
total_tags += tag_count
|
|
if tag_count > 0:
|
|
cards_with_tags += 1
|
|
sample_cards.append(f'{name}: {tag_count} tags')
|
|
|
|
print(f'Sample of first 10 cards:')
|
|
for card in sample_cards:
|
|
print(f' {card}')
|
|
|
|
# Full count
|
|
all_tags = df['themeTags'].apply(lambda x: len(x) if isinstance(x, list) else 0).sum()
|
|
all_with_tags = (df['themeTags'].apply(lambda x: len(x) if isinstance(x, list) else 0) > 0).sum()
|
|
|
|
print(f'')
|
|
print(f'Total cards: {len(df):,}')
|
|
print(f'Cards with tags: {all_with_tags:,}')
|
|
print(f'Total theme tags: {all_tags:,}')
|
|
|
|
if all_tags < 10000:
|
|
raise ValueError(f'Only {all_tags} tags found, expected >10k')
|
|
"
|
|
|
|
- name: Build similarity cache (Parquet) from card_files/processed/all_cards.parquet
|
|
if: steps.check_cache.outputs.needs_build == 'true'
|
|
run: |
|
|
python -m code.scripts.build_similarity_cache_parquet --parallel --checkpoint-interval 1000 --force
|
|
|
|
- name: Verify cache was created
|
|
if: steps.check_cache.outputs.needs_build == 'true'
|
|
run: |
|
|
if [ ! -f "card_files/similarity_cache.parquet" ]; then
|
|
echo "ERROR: Cache Parquet file was not created"
|
|
exit 1
|
|
fi
|
|
if [ ! -f "card_files/similarity_cache_metadata.json" ]; then
|
|
echo "ERROR: Cache metadata file was not created"
|
|
exit 1
|
|
fi
|
|
|
|
# Check cache validity
|
|
python -c "
|
|
import json
|
|
from pathlib import Path
|
|
from code.web.services.similarity_cache import get_cache
|
|
|
|
cache = get_cache()
|
|
stats = cache.get_stats()
|
|
|
|
if stats['total_cards'] < 20000:
|
|
raise ValueError(f\"Cache only has {stats['total_cards']} cards, expected ~30k\")
|
|
|
|
print(f\"✓ Cache is valid with {stats['total_cards']:,} cards, {stats['total_entries']:,} entries\")
|
|
print(f\" File size: {stats['file_size_mb']:.2f} MB\")
|
|
"
|
|
|
|
- name: Get cache metadata for commit message
|
|
if: steps.check_cache.outputs.needs_build == 'true'
|
|
id: cache_meta
|
|
run: |
|
|
METADATA=$(python -c "
|
|
import json
|
|
from pathlib import Path
|
|
from code.web.services.similarity_cache import get_cache
|
|
|
|
cache = get_cache()
|
|
stats = cache.get_stats()
|
|
metadata = cache._metadata or {}
|
|
|
|
build_date = metadata.get('build_date', 'unknown')
|
|
print(f\"{stats['total_cards']} cards, {stats['total_entries']} entries, {stats['file_size_mb']:.1f}MB, built {build_date}\")
|
|
")
|
|
echo "metadata=$METADATA" >> $GITHUB_OUTPUT
|
|
|
|
- name: Commit and push cache
|
|
if: steps.check_cache.outputs.needs_build == 'true'
|
|
run: |
|
|
git config --local user.email "github-actions[bot]@users.noreply.github.com"
|
|
git config --local user.name "github-actions[bot]"
|
|
|
|
# Fetch all branches
|
|
git fetch origin
|
|
|
|
# Try to checkout existing branch, or create new orphan branch
|
|
if git ls-remote --heads origin similarity-cache-data | grep similarity-cache-data; then
|
|
echo "Checking out existing similarity-cache-data branch..."
|
|
git checkout similarity-cache-data
|
|
else
|
|
echo "Creating new orphan branch similarity-cache-data..."
|
|
git checkout --orphan similarity-cache-data
|
|
git rm -rf . || true
|
|
# Create minimal README for the branch
|
|
echo "# Similarity Cache Data" > README.md
|
|
echo "This branch contains pre-built similarity cache files for the MTG Deckbuilder." >> README.md
|
|
echo "Updated automatically by GitHub Actions." >> README.md
|
|
echo "" >> README.md
|
|
echo "## Files" >> README.md
|
|
echo "- \`card_files/similarity_cache.parquet\` - Pre-computed card similarity cache" >> README.md
|
|
echo "- \`card_files/similarity_cache_metadata.json\` - Cache metadata" >> README.md
|
|
echo "- \`card_files/processed/all_cards.parquet\` - Tagged card database" >> README.md
|
|
echo "- \`card_files/processed/.tagging_complete.json\` - Tagging status" >> README.md
|
|
fi
|
|
|
|
# Ensure directories exist
|
|
mkdir -p card_files/processed
|
|
|
|
# Add similarity cache files (use -f to override .gitignore)
|
|
git add -f card_files/similarity_cache.parquet
|
|
git add -f card_files/similarity_cache_metadata.json
|
|
|
|
# Add processed Parquet and status file
|
|
git add -f card_files/processed/all_cards.parquet
|
|
git add -f card_files/processed/.tagging_complete.json
|
|
|
|
git add README.md 2>/dev/null || true
|
|
|
|
# Check if there are changes to commit
|
|
if git diff --staged --quiet; then
|
|
echo "No changes to commit"
|
|
else
|
|
git commit -m "chore: update similarity cache [${{ steps.cache_meta.outputs.metadata }}]"
|
|
git push origin similarity-cache-data --force
|
|
fi
|
|
|
|
- name: Summary
|
|
if: always()
|
|
run: |
|
|
if [ "${{ steps.check_cache.outputs.needs_build }}" = "true" ]; then
|
|
echo "✓ Similarity cache built and committed"
|
|
echo " Metadata: ${{ steps.cache_meta.outputs.metadata }}"
|
|
else
|
|
echo "⊘ Cache is recent, no rebuild needed"
|
|
fi
|