Refactored multiple tagger functions, up through the lifegain-related functions, so that they use vectorized masks in place of iterrows

2025-12-18 16:40:12 +01:00 · 2025-01-08 08:29:00 -08:00 · 2025-01-08 08:29:00 -08:00 · aac2b26be8
commit aac2b26be8
parent 083ef937af
3 changed files with 1818 additions and 1227 deletions
--- a/settings.py
+++ b/settings.py
@ -207,3 +207,51 @@ COLUMN_ORDER = [
 # Constants used for type detection and processing
 OUTLAW_TYPES = ['Assassin', 'Mercenary', 'Pirate', 'Rogue', 'Warlock']
 TYPE_DETECTION_BATCH_SIZE = 1000
 # Constants for Aura handling
 AURA_SPECIFIC_CARDS = [
    'Ardenn, Intrepid Archaeologist',  # Moves Auras
    'Calix, Guided By Fate',  # Creates duplicate Auras
    'Gilwain, Casting Director',  # Creates role tokens
    'Ivy, Gleeful Spellthief',  # Copies spells that have a single target
    'Killian, Ink Duelist',  # Cost reduction for targeted spells
 ]
 # Constants for Equipment handling
 EQUIPMENT_EXCLUSIONS = [
    'Bruenor Battlehammer',  # Equipment cost reduction
    'Nazahn, Revered Bladesmith',  # Equipment tutor
    'Stonehewer Giant',  # Equipment tutor
 ]
 EQUIPMENT_SPECIFIC_CARDS = [
    'Ardenn, Intrepid Archaeologist',  # Moves Equipment
    'Armory Automaton',  # Mass equip ability
    'Brass Squire',  # Free equip ability
    'Danitha Capashen, Paragon',  # Equipment cost reduction
    'Halvar, God of Battle',  # Moves Equipment
    'Kemba, Kha Regent',  # Equipment payoff
    'Kosei, Penitent Warlord',  # Wants to be equipped
    'Puresteel Paladin',  # Equipment draw engine
    'Reyav, Master Smith',  # Equipment combat boost
    'Sram, Senior Edificer',  # Equipment card draw
    'Valduk, Keeper of the Flame',  # Equipment token creation
 ]
 EQUIPMENT_RELATED_TAGS = [
    'Equipment',  # Base equipment tag
    'Equipment Matters',  # Cards that care about equipment
    'Voltron',  # Commander-focused equipment strategy
    'Artifacts Matter',  # Equipment are artifacts
    'Warriors Matter',  # Common equipment tribal synergy
    'Knights Matter',  # Common equipment tribal synergy
 ]
 EQUIPMENT_TEXT_PATTERNS = [
    'attach',  # Attaching equipment
    'equip',  # The equip keyword
    'equipped',  # Equipped state
    'equipment',  # The Equipment type
    'unattach',  # Removing equipment
    'unequip',  # Removing equipment
 ]
 TYPE_DETECTION_BATCH_SIZE = 1000
--- a/tagger.py
+++ b/tagger.py
--- a/utility.py
+++ b/utility.py
@ -1,9 +1,12 @@
 from typing import Union, List
 import pandas as pd
 import re
 import logging
-from typing import Dict, Optional, Set
+
 from typing import Dict, List, Optional, Set, Union
 from time import perf_counter
 import settings
 def pluralize(word: str) -> str:
    """Convert a word to its plural form using basic English pluralization rules.
@ -65,6 +68,96 @@ def create_type_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex:
        masks = [df['type'].str.contains(p, case=False, na=False, regex=False) for p in type_text]
        return pd.concat(masks, axis=1).any(axis=1)
 def create_text_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex: bool = True) -> pd.Series:
    """Build a boolean mask flagging rows whose 'text' column matches any given pattern.
    Matching is case-insensitive and missing values never match.
    Args:
        df: DataFrame with a 'text' column to search
        type_text: A single pattern or a list of patterns; a row matches if any pattern does
        regex: If True (default), the patterns are joined with '|' and searched as one
            regular expression; if False, each pattern is searched as a literal substring
    Returns:
        Boolean Series with one entry per row of df
    Raises:
        ValueError: If type_text is falsy (empty or None)
        TypeError: If type_text is neither a string nor a list
    """
    if not type_text:
        raise ValueError("type_text cannot be empty or None")
    if isinstance(type_text, str):
        patterns = [type_text]
    elif isinstance(type_text, list):
        patterns = type_text
    else:
        raise TypeError("type_text must be a string or list of strings")
    if not regex:
        # Literal search: one mask per pattern, then OR them together row-wise.
        per_pattern = [
            df['text'].str.contains(literal, case=False, na=False, regex=False)
            for literal in patterns
        ]
        return pd.concat(per_pattern, axis=1).any(axis=1)
    # Regex search: a single alternation covers every pattern in one pass.
    combined = '|'.join(f'{piece}' for piece in patterns)
    return df['text'].str.contains(combined, case=False, na=False, regex=True)
 def create_keyword_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex: bool = True) -> pd.Series:
    """Build a boolean mask flagging rows whose 'keywords' column matches any given pattern.
    Matching is case-insensitive and missing values never match.
    Args:
        df: DataFrame with a 'keywords' column to search
        type_text: A single pattern or a list of patterns; a row matches if any pattern does
        regex: If True (default), the patterns are joined with '|' and searched as one
            regular expression; if False, each pattern is searched as a literal substring
    Returns:
        Boolean Series with one entry per row of df
    Raises:
        ValueError: If type_text is falsy (empty or None)
        TypeError: If type_text is neither a string nor a list
    """
    if not type_text:
        raise ValueError("type_text cannot be empty or None")
    if not isinstance(type_text, (str, list)):
        raise TypeError("type_text must be a string or list of strings")
    # Normalise to a list so both search modes can iterate uniformly.
    wanted = [type_text] if isinstance(type_text, str) else type_text
    if regex:
        alternation = '|'.join(f'{entry}' for entry in wanted)
        return df['keywords'].str.contains(alternation, case=False, na=False, regex=True)
    literal_hits = [
        df['keywords'].str.contains(entry, case=False, na=False, regex=False)
        for entry in wanted
    ]
    # A row matches when any of the per-pattern masks is True for it.
    return pd.concat(literal_hits, axis=1).any(axis=1)
 def create_name_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex: bool = True) -> pd.Series:
    """Build a boolean mask flagging rows whose 'name' column matches any given pattern.
    Matching is case-insensitive and missing values never match.
    Args:
        df: DataFrame with a 'name' column to search
        type_text: A single pattern or a list of patterns; a row matches if any pattern does
        regex: If True (default), the patterns are joined with '|' and searched as one
            regular expression; if False, each pattern is searched as a literal substring
    Returns:
        Boolean Series with one entry per row of df
    Raises:
        ValueError: If type_text is falsy (empty or None)
        TypeError: If type_text is neither a string nor a list
    """
    if not type_text:
        raise ValueError("type_text cannot be empty or None")
    if isinstance(type_text, str):
        type_text = [type_text]
    if not isinstance(type_text, list):
        raise TypeError("type_text must be a string or list of strings")
    name_contains = df['name'].str.contains
    if regex:
        # One alternation pattern lets a single pass cover every requested term.
        joined = '|'.join(f'{term}' for term in type_text)
        return name_contains(joined, case=False, na=False, regex=True)
    term_masks = [name_contains(term, case=False, na=False, regex=False) for term in type_text]
    return pd.concat(term_masks, axis=1).any(axis=1)
 def extract_creature_types(type_text: str, creature_types: List[str], non_creature_types: List[str]) -> List[str]:
    """Extract creature types from a type text string.