fix: remove CSV fallback from theme catalog generation, add Parquet debug step

- Remove CSV fallback logic (Parquet-only in M4 migration)
- Add better error messages when Parquet file missing or empty
- Add workflow debug step to inspect Parquet file after tagging
- Simplify build_theme_catalog function signature
This commit is contained in:
matt 2025-10-18 22:22:35 -07:00
parent 9e6c3e66e9
commit 30dfca0b67
2 changed files with 134 additions and 110 deletions

View file

@ -88,13 +88,60 @@ jobs:
echo "ERROR: Tagging completion flag not found"
exit 1
fi
# Debug aid: print basic facts about the processed-cards Parquet file and the
# themeTags of its first rows so tagging problems are visible in the job log.
# The Python source below lives inside a double-quoted shell string, so it must
# use single quotes only and must not contain dollar signs or backticks.
- name: Debug - Inspect Parquet file after tagging
  if: steps.check_cache.outputs.needs_build == 'true'
  run: |
    python -c "
    import pandas as pd
    from code.path_util import get_processed_cards_path

    # Locate the processed-cards Parquet file and fail loudly if it is missing
    parquet_path = get_processed_cards_path()
    print(f'Reading Parquet file: {parquet_path}')
    print(f'File exists: {parquet_path.exists()}')
    if not parquet_path.exists():
        raise FileNotFoundError(f'Parquet file not found: {parquet_path}')

    df = pd.read_parquet(parquet_path)
    print(f'Loaded {len(df)} rows from Parquet file')
    print(f'Columns: {list(df.columns)}')
    print('')

    # Show first 10 rows with their themeTags
    print('First 10 cards with themeTags:')
    print('=' * 80)
    for idx, row in df.head(10).iterrows():
        name = row.get('name', 'UNKNOWN')
        tags = row.get('themeTags', [])
        # List columns read back from Parquet may arrive as numpy arrays (or
        # tuples) rather than Python lists. Accept any sized, non-string value;
        # an isinstance(tags, list) check would report zero tags for those rows.
        is_sequence = hasattr(tags, '__len__') and not isinstance(tags, (str, bytes))
        tag_list = list(tags) if is_sequence else []
        tag_count = len(tag_list)
        print(f'{idx}: {name}')
        print(f' Type: {type(tags).__name__}')
        print(f' Count: {tag_count}')
        if tag_count > 0:
            # Show at most the first 5 tags
            print(f' Tags: {tag_list[:5]}')
            if tag_count > 5:
                print(f' ... and {tag_count - 5} more')
        else:
            print(' Tags: (empty)')
        print('')
    "
# Build config/themes/theme_catalog.csv only when it is not already present.
- name: Generate theme catalog
  if: steps.check_cache.outputs.needs_build == 'true'
  run: |
    if [ ! -f "config/themes/theme_catalog.csv" ]; then
      # A missing catalog is the normal case on a fresh build, so this is an
      # informational message (logged once), not a warning.
      echo "Theme catalog not found, generating..."
      python -m code.scripts.generate_theme_catalog
    else
      echo "Theme catalog already exists, skipping generation"
    fi
- name: Verify theme catalog and tag statistics
if: steps.check_cache.outputs.needs_build == 'true'
run: |
# Detailed check of what tags were actually written
python -c "
import pandas as pd