From bff64de3703168a9a81adf43e7028484ce6503d2 Mon Sep 17 00:00:00 2001
From: matt
Date: Sat, 18 Oct 2025 22:47:09 -0700
Subject: [PATCH] fix: systematically handle numpy arrays from Parquet files across codebase

- Add ensure_theme_tags_list() utility to builder_utils for simpler numpy array handling
- Update phase3_creatures.py: 6 locations now use bu.ensure_theme_tags_list()
- Update phase4_spells.py: 9 locations now use bu.ensure_theme_tags_list()
- Update tagger.py: 1 location uses hasattr/list() for numpy compatibility
- Update extract_themes.py: 2 locations use hasattr/list() for numpy compatibility
- Fix build-similarity-cache.yml verification script to handle numpy arrays
- Enhance workflow debug output to show complete row data

pandas returns numpy.ndarray objects, not Python lists, for array columns
read from Parquet files. The M4 migration added numpy support to the
canonical parse_theme_tags() in builder_utils, but many parts of the
codebase still used isinstance(..., list) checks, which fail for arrays.
This commit systematically replaces all 18 such checks in the Python
sources (plus the three in the workflow's verification script) with proper
numpy array handling.

Fixes GitHub Actions workflow 'RuntimeError: No theme tags found' and
verification failures.
--- .github/workflows/build-similarity-cache.yml | 17 ++++++++++++++--- code/deck_builder/builder_utils.py | 12 ++++++++++++ code/deck_builder/phases/phase3_creatures.py | 12 ++++++------ code/deck_builder/phases/phase4_spells.py | 20 ++++++++++---------- code/scripts/extract_themes.py | 5 +++-- code/tagging/tagger.py | 3 ++- 6 files changed, 47 insertions(+), 22 deletions(-) diff --git a/.github/workflows/build-similarity-cache.yml b/.github/workflows/build-similarity-cache.yml index a4a4bbc..b393bfe 100644 --- a/.github/workflows/build-similarity-cache.yml +++ b/.github/workflows/build-similarity-cache.yml @@ -147,6 +147,17 @@ jobs: from code.path_util import get_processed_cards_path df = pd.read_parquet(get_processed_cards_path()) + # Helper to count tags (handles both list and numpy array) + def count_tags(x): + if x is None: + return 0 + if hasattr(x, '__len__'): + try: + return len(x) + except: + return 0 + return 0 + # Count total tags total_tags = 0 cards_with_tags = 0 @@ -155,7 +166,7 @@ jobs: for idx, row in df.head(10).iterrows(): name = row['name'] tags = row['themeTags'] - tag_count = len(tags) if isinstance(tags, list) else 0 + tag_count = count_tags(tags) total_tags += tag_count if tag_count > 0: cards_with_tags += 1 @@ -166,8 +177,8 @@ jobs: print(f' {card}') # Full count - all_tags = df['themeTags'].apply(lambda x: len(x) if isinstance(x, list) else 0).sum() - all_with_tags = (df['themeTags'].apply(lambda x: len(x) if isinstance(x, list) else 0) > 0).sum() + all_tags = df['themeTags'].apply(count_tags).sum() + all_with_tags = (df['themeTags'].apply(count_tags) > 0).sum() print(f'') print(f'Total cards: {len(df):,}') diff --git a/code/deck_builder/builder_utils.py b/code/deck_builder/builder_utils.py index 5fc98d4..6847ecf 100644 --- a/code/deck_builder/builder_utils.py +++ b/code/deck_builder/builder_utils.py @@ -249,6 +249,18 @@ def parse_theme_tags(val) -> list[str]: return [] +def ensure_theme_tags_list(val) -> list[str]: + """Safely convert 
themeTags value to list, handling None, lists, and numpy arrays. + + This is a simpler wrapper around parse_theme_tags for the common case where + you just need to ensure you have a list to work with. + """ + if val is None: + return [] + return parse_theme_tags(val) + + + def normalize_theme_list(raw) -> list[str]: """Parse then lowercase + strip each tag.""" tags = parse_theme_tags(raw) diff --git a/code/deck_builder/phases/phase3_creatures.py b/code/deck_builder/phases/phase3_creatures.py index bbf5f60..fe380af 100644 --- a/code/deck_builder/phases/phase3_creatures.py +++ b/code/deck_builder/phases/phase3_creatures.py @@ -120,7 +120,7 @@ class CreatureAdditionMixin: mana_cost=row.get('manaCost',''), mana_value=row.get('manaValue', row.get('cmc','')), creature_types=row.get('creatureTypes', []) if isinstance(row.get('creatureTypes', []), list) else [], - tags=row.get('themeTags', []) if isinstance(row.get('themeTags', []), list) else [], + tags=bu.ensure_theme_tags_list(row.get('themeTags')), role='creature', sub_role='all_theme', added_by='creature_all_theme', @@ -231,7 +231,7 @@ class CreatureAdditionMixin: mana_cost=row.get('manaCost',''), mana_value=row.get('manaValue', row.get('cmc','')), creature_types=row.get('creatureTypes', []) if isinstance(row.get('creatureTypes', []), list) else [], - tags=row.get('themeTags', []) if isinstance(row.get('themeTags', []), list) else [], + tags=bu.ensure_theme_tags_list(row.get('themeTags')), role='creature', sub_role=role, added_by='creature_add', @@ -288,7 +288,7 @@ class CreatureAdditionMixin: mana_cost=row.get('manaCost',''), mana_value=row.get('manaValue', row.get('cmc','')), creature_types=row.get('creatureTypes', []) if isinstance(row.get('creatureTypes', []), list) else [], - tags=row.get('themeTags', []) if isinstance(row.get('themeTags', []), list) else [], + tags=bu.ensure_theme_tags_list(row.get('themeTags')), role='creature', sub_role='fill', added_by='creature_fill', @@ -551,7 +551,7 @@ class 
CreatureAdditionMixin: mana_cost=row.get('manaCost',''), mana_value=row.get('manaValue', row.get('cmc','')), creature_types=row.get('creatureTypes', []) if isinstance(row.get('creatureTypes', []), list) else [], - tags=row.get('themeTags', []) if isinstance(row.get('themeTags', []), list) else [], + tags=bu.ensure_theme_tags_list(row.get('themeTags')), role='creature', sub_role=role, added_by='creature_add', @@ -590,7 +590,7 @@ class CreatureAdditionMixin: mana_cost=row.get('manaCost',''), mana_value=row.get('manaValue', row.get('cmc','')), creature_types=row.get('creatureTypes', []) if isinstance(row.get('creatureTypes', []), list) else [], - tags=row.get('themeTags', []) if isinstance(row.get('themeTags', []), list) else [], + tags=bu.ensure_theme_tags_list(row.get('themeTags')), role='creature', sub_role='fill', added_by='creature_fill', @@ -672,7 +672,7 @@ class CreatureAdditionMixin: mana_cost=row.get('manaCost',''), mana_value=row.get('manaValue', row.get('cmc','')), creature_types=row.get('creatureTypes', []) if isinstance(row.get('creatureTypes', []), list) else [], - tags=row.get('themeTags', []) if isinstance(row.get('themeTags', []), list) else [], + tags=bu.ensure_theme_tags_list(row.get('themeTags')), role='creature', sub_role='all_theme', added_by='creature_all_theme', diff --git a/code/deck_builder/phases/phase4_spells.py b/code/deck_builder/phases/phase4_spells.py index 3ec39fb..632806d 100644 --- a/code/deck_builder/phases/phase4_spells.py +++ b/code/deck_builder/phases/phase4_spells.py @@ -193,7 +193,7 @@ class SpellAdditionMixin: card_type=r.get('type',''), mana_cost=r.get('manaCost',''), mana_value=r.get('manaValue', r.get('cmc','')), - tags=r.get('themeTags', []) if isinstance(r.get('themeTags', []), list) else [], + tags=bu.ensure_theme_tags_list(r.get('themeTags')), role='ramp', sub_role=phase_name.lower(), added_by='spell_ramp' @@ -322,7 +322,7 @@ class SpellAdditionMixin: card_type=r.get('type',''), mana_cost=r.get('manaCost',''), 
mana_value=r.get('manaValue', r.get('cmc','')), - tags=r.get('themeTags', []) if isinstance(r.get('themeTags', []), list) else [], + tags=bu.ensure_theme_tags_list(r.get('themeTags')), role='removal', sub_role='spot', added_by='spell_removal' @@ -399,7 +399,7 @@ class SpellAdditionMixin: card_type=r.get('type',''), mana_cost=r.get('manaCost',''), mana_value=r.get('manaValue', r.get('cmc','')), - tags=r.get('themeTags', []) if isinstance(r.get('themeTags', []), list) else [], + tags=bu.ensure_theme_tags_list(r.get('themeTags')), role='wipe', sub_role='board', added_by='spell_wipe' @@ -493,7 +493,7 @@ class SpellAdditionMixin: card_type=r.get('type',''), mana_cost=r.get('manaCost',''), mana_value=r.get('manaValue', r.get('cmc','')), - tags=r.get('themeTags', []) if isinstance(r.get('themeTags', []), list) else [], + tags=bu.ensure_theme_tags_list(r.get('themeTags')), role='card_advantage', sub_role='conditional', added_by='spell_draw' @@ -516,7 +516,7 @@ class SpellAdditionMixin: card_type=r.get('type',''), mana_cost=r.get('manaCost',''), mana_value=r.get('manaValue', r.get('cmc','')), - tags=r.get('themeTags', []) if isinstance(r.get('themeTags', []), list) else [], + tags=bu.ensure_theme_tags_list(r.get('themeTags')), role='card_advantage', sub_role='unconditional', added_by='spell_draw' @@ -713,7 +713,7 @@ class SpellAdditionMixin: card_type=r.get('type',''), mana_cost=r.get('manaCost',''), mana_value=r.get('manaValue', r.get('cmc','')), - tags=r.get('themeTags', []) if isinstance(r.get('themeTags', []), list) else [], + tags=bu.ensure_theme_tags_list(r.get('themeTags')), role='protection', added_by='spell_protection' ) @@ -879,7 +879,7 @@ class SpellAdditionMixin: card_type=row.get('type', ''), mana_cost=row.get('manaCost', ''), mana_value=row.get('manaValue', row.get('cmc', '')), - tags=row.get('themeTags', []) if isinstance(row.get('themeTags', []), list) else [], + tags=bu.ensure_theme_tags_list(row.get('themeTags')), role='theme_spell', sub_role=role, 
added_by='spell_theme_fill', @@ -942,7 +942,7 @@ class SpellAdditionMixin: card_type=row.get('type', ''), mana_cost=row.get('manaCost', ''), mana_value=row.get('manaValue', row.get('cmc', '')), - tags=row.get('themeTags', []) if isinstance(row.get('themeTags', []), list) else [], + tags=bu.ensure_theme_tags_list(row.get('themeTags')), role='theme_spell', sub_role='fill_multi', added_by='spell_theme_fill', @@ -1006,7 +1006,7 @@ class SpellAdditionMixin: card_type=r0.get('type',''), mana_cost=r0.get('manaCost',''), mana_value=r0.get('manaValue', r0.get('cmc','')), - tags=r0.get('themeTags', []) if isinstance(r0.get('themeTags', []), list) else [], + tags=bu.ensure_theme_tags_list(r0.get('themeTags')), role='filler', sub_role=r0.get('_fillerCat',''), added_by='spell_general_filler' @@ -1058,4 +1058,4 @@ class SpellAdditionMixin: """ """Public method for orchestration: delegates to add_non_creature_spells.""" return self.add_non_creature_spells() - \ No newline at end of file + diff --git a/code/scripts/extract_themes.py b/code/scripts/extract_themes.py index d3b4fdc..c45e7c5 100644 --- a/code/scripts/extract_themes.py +++ b/code/scripts/extract_themes.py @@ -126,7 +126,7 @@ def tally_tag_frequencies_by_base_color() -> Dict[str, Dict[str, int]]: return derived # Iterate rows for _, row in df.iterrows(): - tags = row['themeTags'] if isinstance(row['themeTags'], list) else [] + tags = list(row['themeTags']) if hasattr(row.get('themeTags'), '__len__') and not isinstance(row.get('themeTags'), str) else [] # Compute base colors contribution ci = row['colorIdentity'] if 'colorIdentity' in row else None letters = set(ci) if isinstance(ci, list) else set() @@ -162,7 +162,7 @@ def gather_theme_tag_rows() -> List[List[str]]: if 'themeTags' not in df.columns: continue for _, row in df.iterrows(): - tags = row['themeTags'] if isinstance(row['themeTags'], list) else [] + tags = list(row['themeTags']) if hasattr(row.get('themeTags'), '__len__') and not 
isinstance(row.get('themeTags'), str) else [] if tags: rows.append(tags) return rows @@ -523,3 +523,4 @@ def main() -> None: if __name__ == "__main__": main() + diff --git a/code/tagging/tagger.py b/code/tagging/tagger.py index 096938d..526aa5f 100644 --- a/code/tagging/tagger.py +++ b/code/tagging/tagger.py @@ -1054,7 +1054,7 @@ def tag_for_keywords(df: pd.DataFrame, color: str) -> None: exclusion_keywords = {'partner'} def _merge_keywords(row: pd.Series) -> list[str]: - base_tags = row['themeTags'] if isinstance(row['themeTags'], list) else [] + base_tags = list(row['themeTags']) if hasattr(row.get('themeTags'), '__len__') and not isinstance(row.get('themeTags'), str) else [] keywords_raw = row['keywords'] if isinstance(keywords_raw, str): @@ -6892,3 +6892,4 @@ def run_tagging(parallel: bool = False, max_workers: int | None = None): +