# Mirror of https://github.com/mwisnowski/mtg_python_deckbuilder.git
# Synced 2026-03-07 22:22:34 +01:00
#
# NOTE: The debug step below was helpful for diagnosing numpy array issues but
# is no longer needed for normal operation. Commented out rather than removed
# so it's available if needed for future troubleshooting.
---
name: Build Similarity Cache

# Manual trigger + weekly schedule + callable from other workflows
on:
  workflow_dispatch:
    inputs:
      force_rebuild:
        description: 'Force rebuild even if cache exists'
        required: false
        type: boolean
        default: true
  workflow_call:  # Allow this workflow to be called by other workflows
  schedule:
    # Run every Sunday at 2 AM UTC
    - cron: '0 2 * * 0'

jobs:
  build-cache:
    runs-on: ubuntu-latest
    timeout-minutes: 45

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Rebuild when forced, when the cache file is missing, or when the
      # metadata JSON says the cache is more than 7 days old.
      - name: Check if cache needs rebuild
        id: check_cache
        run: |
          FORCE="${{ github.event.inputs.force_rebuild }}"
          if [ "$FORCE" = "true" ] || [ ! -f "card_files/similarity_cache.parquet" ]; then
            echo "needs_build=true" >> "$GITHUB_OUTPUT"
            echo "Cache doesn't exist or force rebuild requested"
          else
            # Check cache age via metadata JSON (999 = unknown/unreadable -> rebuild)
            CACHE_AGE_DAYS=$(python -c "
          import json
          from datetime import datetime
          from pathlib import Path

          metadata_path = Path('card_files/similarity_cache_metadata.json')
          if metadata_path.exists():
              with open(metadata_path) as f:
                  data = json.load(f)
              build_date = data.get('build_date')
              if build_date:
                  age = (datetime.now() - datetime.fromisoformat(build_date)).days
                  print(age)
              else:
                  print(999)
          else:
              print(999)
          " || echo "999")

            if [ "$CACHE_AGE_DAYS" -gt 7 ]; then
              echo "needs_build=true" >> "$GITHUB_OUTPUT"
              echo "Cache is $CACHE_AGE_DAYS days old, rebuilding"
            else
              echo "needs_build=false" >> "$GITHUB_OUTPUT"
              echo "Cache is only $CACHE_AGE_DAYS days old, skipping"
            fi
          fi

      - name: Run initial setup
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          python -c "from code.file_setup.setup import initial_setup; initial_setup()"

      - name: Run tagging (serial for CI reliability)
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          python -c "from code.tagging.tagger import run_tagging; run_tagging(parallel=False)"

          # Verify tagging completed
          if [ ! -f "card_files/processed/.tagging_complete.json" ]; then
            echo "ERROR: Tagging completion flag not found"
            exit 1
          fi

      # Debug step - uncomment if needed to inspect Parquet file contents
      # - name: Debug - Inspect Parquet file after tagging
      #   if: steps.check_cache.outputs.needs_build == 'true'
      #   run: |
      #     python -c "
      #   import pandas as pd
      #   from pathlib import Path
      #   from code.path_util import get_processed_cards_path
      #
      #   parquet_path = Path(get_processed_cards_path())
      #   print(f'Reading Parquet file: {parquet_path}')
      #   print(f'File exists: {parquet_path.exists()}')
      #
      #   if not parquet_path.exists():
      #       raise FileNotFoundError(f'Parquet file not found: {parquet_path}')
      #
      #   df = pd.read_parquet(parquet_path)
      #   print(f'Loaded {len(df)} rows from Parquet file')
      #   print(f'Columns: {list(df.columns)}')
      #   print('')
      #
      #   # Show first 5 rows completely
      #   print('First 5 complete rows:')
      #   print('=' * 100)
      #   for idx, row in df.head(5).iterrows():
      #       print(f'Row {idx}:')
      #       for col in df.columns:
      #           value = row[col]
      #           if isinstance(value, (list, tuple)) or hasattr(value, '__array__'):
      #               # For array-like, show type and length
      #               try:
      #                   length = len(value)
      #                   print(f' {col}: {type(value).__name__}[{length}] = {value}')
      #               except:
      #                   print(f' {col}: {type(value).__name__} = {value}')
      #           else:
      #               print(f' {col}: {value}')
      #       print('-' * 100)
      #     "

      - name: Generate theme catalog
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          if [ ! -f "config/themes/theme_catalog.csv" ]; then
            echo "Theme catalog not found, generating..."
            python -m code.scripts.generate_theme_catalog
          else
            echo "Theme catalog already exists, skipping generation"
          fi

      - name: Verify theme catalog and tag statistics
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          # Detailed check of what tags were actually written
          python -c "
          import pandas as pd
          from code.path_util import get_processed_cards_path
          df = pd.read_parquet(get_processed_cards_path())

          # Helper to count tags (handles both list and numpy array)
          def count_tags(x):
              if x is None:
                  return 0
              if hasattr(x, '__len__'):
                  try:
                      return len(x)
                  except:
                      return 0
              return 0

          # Count total tags
          total_tags = 0
          cards_with_tags = 0
          sample_cards = []

          for idx, row in df.head(10).iterrows():
              name = row['name']
              tags = row['themeTags']
              tag_count = count_tags(tags)
              total_tags += tag_count
              if tag_count > 0:
                  cards_with_tags += 1
              sample_cards.append(f'{name}: {tag_count} tags')

          print(f'Sample of first 10 cards:')
          for card in sample_cards:
              print(f' {card}')

          # Full count
          all_tags = df['themeTags'].apply(count_tags).sum()
          all_with_tags = (df['themeTags'].apply(count_tags) > 0).sum()

          print(f'')
          print(f'Total cards: {len(df):,}')
          print(f'Cards with tags: {all_with_tags:,}')
          print(f'Total theme tags: {all_tags:,}')

          if all_tags < 10000:
              raise ValueError(f'Only {all_tags} tags found, expected >10k')
          "

      - name: Build similarity cache (Parquet) from card_files/processed/all_cards.parquet
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          python -m code.scripts.build_similarity_cache_parquet --parallel --checkpoint-interval 1000 --force

      - name: Verify cache was created
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          if [ ! -f "card_files/similarity_cache.parquet" ]; then
            echo "ERROR: Similarity cache not created"
            exit 1
          fi
          if [ ! -f "card_files/similarity_cache_metadata.json" ]; then
            echo "ERROR: Similarity cache metadata not created"
            exit 1
          fi
          if [ ! -f "card_files/processed/commander_cards.parquet" ]; then
            echo "ERROR: Commander cache not created"
            exit 1
          fi

          echo "✓ All cache files created successfully"

      - name: Get cache metadata for commit message
        if: steps.check_cache.outputs.needs_build == 'true'
        id: cache_meta
        run: |
          METADATA=$(python -c "
          import json
          from pathlib import Path
          from code.web.services.similarity_cache import get_cache

          cache = get_cache()
          stats = cache.get_stats()
          metadata = cache._metadata or {}

          build_date = metadata.get('build_date', 'unknown')
          print(f\"{stats['total_cards']} cards, {stats['total_entries']} entries, {stats['file_size_mb']:.1f}MB, built {build_date}\")
          ")
          echo "metadata=$METADATA" >> "$GITHUB_OUTPUT"

      # Cache artifacts live on a dedicated orphan branch (similarity-cache-data)
      # so the large binary files never pollute main's history.
      - name: Commit and push cache
        if: steps.check_cache.outputs.needs_build == 'true'
        run: |
          git config --local user.email "github-actions[bot]@users.noreply.github.com"
          git config --local user.name "github-actions[bot]"

          # Fetch all branches
          git fetch origin

          # Try to checkout existing branch, or create new orphan branch
          if git ls-remote --heads origin similarity-cache-data | grep similarity-cache-data; then
            echo "Checking out existing similarity-cache-data branch..."
            git checkout similarity-cache-data
          else
            echo "Creating new orphan branch similarity-cache-data..."
            git checkout --orphan similarity-cache-data
            git rm -rf . || true
            # Create minimal README for the branch
            echo "# Similarity Cache Data" > README.md
            echo "This branch contains pre-built similarity cache files for the MTG Deckbuilder." >> README.md
            echo "Updated automatically by GitHub Actions." >> README.md
            echo "" >> README.md
            echo "## Files" >> README.md
            echo "- \`card_files/similarity_cache.parquet\` - Pre-computed card similarity cache" >> README.md
            echo "- \`card_files/similarity_cache_metadata.json\` - Cache metadata" >> README.md
            echo "- \`card_files/processed/all_cards.parquet\` - Tagged card database" >> README.md
            echo "- \`card_files/processed/commander_cards.parquet\` - Commander-only cache (fast lookups)" >> README.md
            echo "- \`card_files/processed/.tagging_complete.json\` - Tagging status" >> README.md
          fi

          # Ensure directories exist
          mkdir -p card_files/processed

          # Add similarity cache files (use -f to override .gitignore)
          git add -f card_files/similarity_cache.parquet
          git add -f card_files/similarity_cache_metadata.json

          # Add processed Parquet and status file
          git add -f card_files/processed/all_cards.parquet
          git add -f card_files/processed/commander_cards.parquet
          git add -f card_files/processed/.tagging_complete.json

          git add README.md 2>/dev/null || true

          # Check if there are changes to commit
          if git diff --staged --quiet; then
            echo "No changes to commit"
          else
            git commit -m "chore: update similarity cache [${{ steps.cache_meta.outputs.metadata }}]"
            git push origin similarity-cache-data --force
          fi

      - name: Summary
        if: always()
        run: |
          if [ "${{ steps.check_cache.outputs.needs_build }}" = "true" ]; then
            echo "✓ Similarity cache built and committed"
            echo " Metadata: ${{ steps.cache_meta.outputs.metadata }}"
          else
            echo "⊘ Cache is recent, no rebuild needed"
          fi