Refactored multiple tagger functions, up through the lifegain-related functions, so that they use vectorized boolean masks in place of iterrows

2025-09-22 04:50:46 +02:00 · 2025-01-08 08:29:00 -08:00 · 2025-01-08 08:29:00 -08:00 · aac2b26be8
commit aac2b26be8
parent 083ef937af
3 changed files with 1818 additions and 1227 deletions
--- a/settings.py
+++ b/settings.py
@ -206,4 +206,52 @@ COLUMN_ORDER = [

 # Constants for type detection and processing
 OUTLAW_TYPES = ['Assassin', 'Mercenary', 'Pirate', 'Rogue', 'Warlock']
+TYPE_DETECTION_BATCH_SIZE = 1000  # NOTE(review): the same assignment already follows EQUIPMENT_TEXT_PATTERNS below; one of the two is redundant
+
+# Aura-related constants
+AURA_SPECIFIC_CARDS = [
+    'Ardenn, Intrepid Archaeologist',   # Aura movement
+    'Calix, Guided By Fate',            # Create duplicate Auras
+    'Gilwain, Casting Director',        # Creates role tokens
+    'Ivy, Gleeful Spellthief',          # Copies spells that have a single target
+    'Killian, Ink Duelist',             # Targeted spell cost reduction
+]
+# Equipment-related constants
+EQUIPMENT_EXCLUSIONS = [
+    'Bruenor Battlehammer',         # Equipment cost reduction
+    'Nazahn, Revered Bladesmith',   # Equipment tutor
+    'Stonehewer Giant',             # Equipment tutor
+]
+
+EQUIPMENT_SPECIFIC_CARDS = [
+    'Ardenn, Intrepid Archaeologist',   # Equipment movement
+    'Armory Automaton',                 # Mass equip ability
+    'Brass Squire',                     # Free equip ability
+    'Danitha Capashen, Paragon',        # Equipment cost reduction
+    'Halvar, God of Battle',            # Equipment movement
+    'Kemba, Kha Regent',                # Equipment payoff
+    'Kosei, Penitent Warlord',          # Wants to be equipped
+    'Puresteel Paladin',                # Equipment draw engine
+    'Reyav, Master Smith',              # Equipment combat boost
+    'Sram, Senior Edificer',            # Equipment card draw
+    'Valduk, Keeper of the Flame'       # Equipment token creation
+]
+
+EQUIPMENT_RELATED_TAGS = [
+    'Equipment',           # Base equipment tag
+    'Equipment Matters',   # Cards that care about equipment
+    'Voltron',             # Commander-focused equipment strategy
+    'Artifacts Matter',    # Equipment are artifacts
+    'Warriors Matter',     # Common equipment tribal synergy
+    'Knights Matter'       # Common equipment tribal synergy
+]
+
+EQUIPMENT_TEXT_PATTERNS = [
+    'attach',           # Equipment attachment
+    'equip',            # Equipment keyword
+    'equipped',         # Equipment state
+    'equipment',        # Equipment type
+    'unattach',         # Equipment removal
+    'unequip',          # Equipment removal
+]
 TYPE_DETECTION_BATCH_SIZE = 1000
--- a/tagger.py
+++ b/tagger.py
--- a/utility.py
+++ b/utility.py
@ -1,9 +1,12 @@
-from typing import Union, List
 import pandas as pd
 import re
 import logging
-from typing import Dict, Optional, Set
+
+from typing import Dict, List, Optional, Set, Union
 from time import perf_counter
+
+import settings
+
 def pluralize(word: str) -> str:
    """Convert a word to its plural form using basic English pluralization rules.

@ -65,6 +68,96 @@ def create_type_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex:
        masks = [df['type'].str.contains(p, case=False, na=False, regex=False) for p in type_text]
        return pd.concat(masks, axis=1).any(axis=1)

+def create_text_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex: bool = True) -> pd.Series:
+    """Create a boolean mask for rows whose 'text' column matches one or more patterns.
+
+    Args:
+        df: DataFrame to search; its 'text' column is matched case-insensitively and NaN never matches
+        type_text: Text pattern(s) to match. Can be a single string or list of strings.
+        regex: If True, patterns are joined with '|' into one regex; if False, each is matched literally (default: True)
+
+    Returns:
+        Boolean Series that is True for rows where at least one pattern matches
+
+    Raises:
+        ValueError: If type_text is falsy (e.g. None, '' or [])
+        TypeError: If type_text is neither a string nor a list
+    """
+    if not type_text:  # rejects None, '' and [] before any type check
+        raise ValueError("type_text cannot be empty or None")
+
+    if isinstance(type_text, str):
+        type_text = [type_text]  # normalize a lone pattern so both branches below iterate a list
+    elif not isinstance(type_text, list):
+        raise TypeError("type_text must be a string or list of strings")
+
+    if regex:
+        pattern = '|'.join(f'{p}' for p in type_text)  # single alternation; patterns are NOT escaped
+        return df['text'].str.contains(pattern, case=False, na=False, regex=True)
+    else:
+        masks = [df['text'].str.contains(p, case=False, na=False, regex=False) for p in type_text]
+        return pd.concat(masks, axis=1).any(axis=1)  # row-wise OR across the per-pattern masks
+
+def create_keyword_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex: bool = True) -> pd.Series:
+    """Create a boolean mask for rows whose 'keywords' column matches one or more patterns.
+
+    Args:
+        df: DataFrame to search; its 'keywords' column is matched case-insensitively and NaN never matches
+        type_text: Keyword pattern(s) to match. Can be a single string or list of strings.
+        regex: If True, patterns are joined with '|' into one regex; if False, each is matched literally (default: True)
+
+    Returns:
+        Boolean Series that is True for rows where at least one pattern matches
+
+    Raises:
+        ValueError: If type_text is falsy (e.g. None, '' or [])
+        TypeError: If type_text is neither a string nor a list
+    """
+    if not type_text:  # rejects None, '' and [] before any type check
+        raise ValueError("type_text cannot be empty or None")
+
+    if isinstance(type_text, str):
+        type_text = [type_text]  # normalize a lone pattern so both branches below iterate a list
+    elif not isinstance(type_text, list):
+        raise TypeError("type_text must be a string or list of strings")
+
+    if regex:
+        pattern = '|'.join(f'{p}' for p in type_text)  # single alternation; patterns are NOT escaped
+        return df['keywords'].str.contains(pattern, case=False, na=False, regex=True)
+    else:
+        masks = [df['keywords'].str.contains(p, case=False, na=False, regex=False) for p in type_text]
+        return pd.concat(masks, axis=1).any(axis=1)  # row-wise OR across the per-pattern masks
+
+def create_name_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex: bool = True) -> pd.Series:
+    """Create a boolean mask for rows whose 'name' column matches one or more patterns.
+
+    Args:
+        df: DataFrame to search; its 'name' column is matched case-insensitively and NaN never matches
+        type_text: Name pattern(s) to match. Can be a single string or list of strings.
+        regex: If True, patterns are joined with '|' into one regex; if False, each is matched literally (default: True)
+
+    Returns:
+        Boolean Series that is True for rows where at least one pattern matches
+
+    Raises:
+        ValueError: If type_text is falsy (e.g. None, '' or [])
+        TypeError: If type_text is neither a string nor a list
+    """
+    if not type_text:  # rejects None, '' and [] before any type check
+        raise ValueError("type_text cannot be empty or None")
+
+    if isinstance(type_text, str):
+        type_text = [type_text]  # normalize a lone pattern so both branches below iterate a list
+    elif not isinstance(type_text, list):
+        raise TypeError("type_text must be a string or list of strings")
+
+    if regex:
+        pattern = '|'.join(f'{p}' for p in type_text)  # single alternation; patterns are NOT escaped
+        return df['name'].str.contains(pattern, case=False, na=False, regex=True)
+    else:
+        masks = [df['name'].str.contains(p, case=False, na=False, regex=False) for p in type_text]
+        return pd.concat(masks, axis=1).any(axis=1)  # row-wise OR across the per-pattern masks
+
 def extract_creature_types(type_text: str, creature_types: List[str], non_creature_types: List[str]) -> List[str]:
    """Extract creature types from a type text string.