Refactored multiple tagger functions, up through the lifegain-related functions, so that they use vectorized boolean masks instead of iterrows

This commit is contained in:
mwisnowski 2025-01-08 08:29:00 -08:00
parent 083ef937af
commit aac2b26be8
3 changed files with 1818 additions and 1227 deletions

View file

@ -206,4 +206,52 @@ COLUMN_ORDER = [
# Constants for type detection and processing
OUTLAW_TYPES = ['Assassin', 'Mercenary', 'Pirate', 'Rogue', 'Warlock']
TYPE_DETECTION_BATCH_SIZE = 1000

# Aura-related constants
AURA_SPECIFIC_CARDS = [
    'Ardenn, Intrepid Archaeologist',  # Aura movement
    'Calix, Guided By Fate',  # Create duplicate Auras
    'Gilwain, Casting Director',  # Creates role tokens
    'Ivy, Gleeful Spellthief',  # Copies spells that have single target
    'Killian, Ink Duelist',  # Targeted spell cost reduction
]

# Equipment-related constants
EQUIPMENT_EXCLUSIONS = [
    'Bruenor Battlehammer',  # Equipment cost reduction
    'Nazahn, Revered Bladesmith',  # Equipment tutor
    'Stonehewer Giant',  # Equipment tutor
]

EQUIPMENT_SPECIFIC_CARDS = [
    'Ardenn, Intrepid Archaeologist',  # Equipment movement
    'Armory Automaton',  # Mass equip ability
    'Brass Squire',  # Free equip ability
    'Danitha Capashen, Paragon',  # Equipment cost reduction
    'Halvar, God of Battle',  # Equipment movement
    'Kemba, Kha Regent',  # Equipment payoff
    'Kosei, Penitent Warlord',  # Wants to be equipped
    'Puresteel Paladin',  # Equipment draw engine
    'Reyav, Master Smith',  # Equipment combat boost
    'Sram, Senior Edificer',  # Equipment card draw
    'Valduk, Keeper of the Flame',  # Equipment token creation
]

EQUIPMENT_RELATED_TAGS = [
    'Equipment',  # Base equipment tag
    'Equipment Matters',  # Cards that care about equipment
    'Voltron',  # Commander-focused equipment strategy
    'Artifacts Matter',  # Equipment are artifacts
    'Warriors Matter',  # Common equipment tribal synergy
    'Knights Matter',  # Common equipment tribal synergy
]

EQUIPMENT_TEXT_PATTERNS = [
    'attach',  # Equipment attachment
    'equip',  # Equipment keyword
    'equipped',  # Equipment state
    'equipment',  # Equipment type
    'unattach',  # Equipment removal
    'unequip',  # Equipment removal
]

2900
tagger.py

File diff suppressed because it is too large Load diff

View file

@ -1,9 +1,12 @@
from typing import Union, List
import pandas as pd
import re
import logging
from typing import Dict, Optional, Set
from typing import Dict, List, Optional, Set, Union
from time import perf_counter
import settings
def pluralize(word: str) -> str:
"""Convert a word to its plural form using basic English pluralization rules.
@ -65,6 +68,96 @@ def create_type_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex:
masks = [df['type'].str.contains(p, case=False, na=False, regex=False) for p in type_text]
return pd.concat(masks, axis=1).any(axis=1)
def create_text_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex: bool = True) -> pd.Series:
    """Create a boolean mask for rows whose 'text' column matches any pattern.

    Matching is case-insensitive, and rows with a missing 'text' value never
    match.

    Args:
        df: DataFrame to search; must contain a 'text' column.
        type_text: Pattern(s) to match against the 'text' column. Can be a
            single string or a list of strings.
        regex: Whether to treat patterns as regex expressions (default: True).
            When False, each pattern is matched as a literal substring.

    Returns:
        Boolean Series that is True for rows where at least one pattern matches

    Raises:
        ValueError: If type_text is empty or None
        TypeError: If type_text is not a string or list of strings
    """
    if not type_text:
        raise ValueError("type_text cannot be empty or None")

    if isinstance(type_text, str):
        patterns = [type_text]
    elif isinstance(type_text, list):
        patterns = type_text
    else:
        raise TypeError("type_text must be a string or list of strings")

    # Enforce the documented contract for list items as well: without this
    # check a non-string entry (e.g. None or an int) would be silently
    # stringified into a regex alternative and match unintended rows.
    if not all(isinstance(pattern, str) for pattern in patterns):
        raise TypeError("type_text must be a string or list of strings")

    column = df['text']
    if regex:
        # One combined alternation means a single str.contains call.
        return column.str.contains('|'.join(patterns), case=False, na=False, regex=True)

    # Literal mode: test each pattern separately, then OR the results row-wise.
    masks = [column.str.contains(pattern, case=False, na=False, regex=False) for pattern in patterns]
    return pd.concat(masks, axis=1).any(axis=1)
def create_keyword_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex: bool = True) -> pd.Series:
    """Create a boolean mask for rows whose 'keywords' column matches any pattern.

    Matching is case-insensitive, and rows with a missing 'keywords' value
    never match.

    Args:
        df: DataFrame to search; must contain a 'keywords' column.
        type_text: Pattern(s) to match against the 'keywords' column. Can be a
            single string or a list of strings.
        regex: Whether to treat patterns as regex expressions (default: True).
            When False, each pattern is matched as a literal substring.

    Returns:
        Boolean Series that is True for rows where at least one pattern matches

    Raises:
        ValueError: If type_text is empty or None
        TypeError: If type_text is not a string or list of strings
    """
    if not type_text:
        raise ValueError("type_text cannot be empty or None")

    if isinstance(type_text, str):
        patterns = [type_text]
    elif isinstance(type_text, list):
        patterns = type_text
    else:
        raise TypeError("type_text must be a string or list of strings")

    # Enforce the documented contract for list items as well: without this
    # check a non-string entry (e.g. None or an int) would be silently
    # stringified into a regex alternative and match unintended rows.
    if not all(isinstance(pattern, str) for pattern in patterns):
        raise TypeError("type_text must be a string or list of strings")

    column = df['keywords']
    if regex:
        # One combined alternation means a single str.contains call.
        return column.str.contains('|'.join(patterns), case=False, na=False, regex=True)

    # Literal mode: test each pattern separately, then OR the results row-wise.
    masks = [column.str.contains(pattern, case=False, na=False, regex=False) for pattern in patterns]
    return pd.concat(masks, axis=1).any(axis=1)
def create_name_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex: bool = True) -> pd.Series:
    """Create a boolean mask for rows whose 'name' column matches any pattern.

    Matching is case-insensitive, and rows with a missing 'name' value never
    match.

    Args:
        df: DataFrame to search; must contain a 'name' column.
        type_text: Pattern(s) to match against the 'name' column. Can be a
            single string or a list of strings.
        regex: Whether to treat patterns as regex expressions (default: True).
            When False, each pattern is matched as a literal substring.

    Returns:
        Boolean Series that is True for rows where at least one pattern matches

    Raises:
        ValueError: If type_text is empty or None
        TypeError: If type_text is not a string or list of strings
    """
    if not type_text:
        raise ValueError("type_text cannot be empty or None")

    if isinstance(type_text, str):
        patterns = [type_text]
    elif isinstance(type_text, list):
        patterns = type_text
    else:
        raise TypeError("type_text must be a string or list of strings")

    # Enforce the documented contract for list items as well: without this
    # check a non-string entry (e.g. None or an int) would be silently
    # stringified into a regex alternative and match unintended rows.
    if not all(isinstance(pattern, str) for pattern in patterns):
        raise TypeError("type_text must be a string or list of strings")

    column = df['name']
    if regex:
        # One combined alternation means a single str.contains call.
        return column.str.contains('|'.join(patterns), case=False, na=False, regex=True)

    # Literal mode: test each pattern separately, then OR the results row-wise.
    masks = [column.str.contains(pattern, case=False, na=False, regex=False) for pattern in patterns]
    return pd.concat(masks, axis=1).any(axis=1)
def extract_creature_types(type_text: str, creature_types: List[str], non_creature_types: List[str]) -> List[str]:
"""Extract creature types from a type text string.