Started refactoring the tagging functions, using Traycer

This commit is contained in:
mwisnowski 2024-12-31 12:04:10 -08:00
parent e404515d91
commit a4abea2c3c
4 changed files with 736 additions and 516 deletions

View file

@ -32,6 +32,18 @@ board_wipe_tags = ['destroy all', 'destroy each', 'return all', 'return each', '
# All top-level card types the tagger recognizes on a type line.
card_types = ['Artifact','Creature', 'Enchantment', 'Instant', 'Land', 'Planeswalker', 'Sorcery',
'Kindred', 'Dungeon', 'Battle']
# Mapping of card types to their corresponding theme tags
# NOTE(review): 'Superfriends' here vs 'Super Friends' in DEFAULT_THEME_TAGS —
# confirm which spelling is canonical; as written the two will never match.
TYPE_TAG_MAPPING = {
'Artifact': ['Artifacts Matter'],
'Battle': ['Battles Matter'],
# 'Creature' is deliberately commented out — creatures get no blanket theme tag here.
#'Creature': [],
'Enchantment': ['Enchantments Matter'],
'Instant': ['Spells Matter', 'Spellslinger'],
'Land': ['Lands Matter'],
'Planeswalker': ['Superfriends'],
'Sorcery': ['Spells Matter', 'Spellslinger']
}
# Directory that holds the per-color card CSV files read and written by the tagger.
csv_directory = 'csv_files'
colors = ['colorless', 'white', 'blue', 'black', 'red', 'green',
@ -149,3 +161,31 @@ targetted_removal_tags = ['exile target', 'destroy target', 'return target', 'sh
'deals damage to target', 'loses all abilities']
# Words that introduce triggered abilities in rules text.
triggers = ['when', 'whenever', 'at']
# Constants for DataFrame validation and processing
# Columns every card DataFrame must contain before tagging runs
# (enforced by validate_dataframe_columns).
REQUIRED_COLUMNS = [
'name', 'faceName', 'edhrecRank', 'colorIdentity', 'colors',
'manaCost', 'manaValue', 'type', 'creatureTypes', 'text',
'power', 'toughness', 'keywords', 'themeTags', 'layout', 'side'
]
# Default pool of theme tags.
# NOTE(review): spells 'Super Friends' while TYPE_TAG_MAPPING uses 'Superfriends' —
# confirm which spelling is canonical.
DEFAULT_THEME_TAGS = [
'Aggro', 'Aristocrats', 'Artifacts Matter', 'Big Mana', 'Blink',
'Board Wipes', 'Burn', 'Cantrips', 'Card Draw', 'Clones',
'Combat Matters', 'Control', 'Counters Matter', 'Energy',
'Enter the Battlefield', 'Equipment', 'Exile Matters', 'Infect',
'Interaction', 'Lands Matter', 'Leave the Battlefield', 'Legends Matter',
'Life Matters', 'Mill', 'Monarch', 'Protection', 'Ramp', 'Reanimate',
'Removal', 'Sacrifice Matters', 'Spellslinger', 'Stax', 'Super Friends',
'Theft', 'Token Creation', 'Tokens Matter', 'Voltron', 'X Spells'
]
# Column order for output.
# NOTE(review): identical to REQUIRED_COLUMNS — consider defining one in terms
# of the other so they cannot drift apart.
COLUMN_ORDER = [
'name', 'faceName', 'edhrecRank', 'colorIdentity', 'colors',
'manaCost', 'manaValue', 'type', 'creatureTypes', 'text',
'power', 'toughness', 'keywords', 'themeTags', 'layout', 'side'
]
# Constants for type detection and processing
# Creature types that additionally qualify a card as an 'Outlaw'.
OUTLAW_TYPES = ['Assassin', 'Mercenary', 'Pirate', 'Rogue', 'Warlock']
# Batch size for type-detection passes — presumably rows per chunk; verify at call sites.
TYPE_DETECTION_BATCH_SIZE = 1000

872
tagger.py

File diff suppressed because it is too large Load diff

View file

@ -1,61 +0,0 @@
def tag_for_cantrips(df, color):
    """Flag cantrips — spells with mana value <= 2 that draw a card — in place.

    Rows whose type line, keywords, existing 'Loot' theme tag, or exact name
    match the exclusion lists below are never tagged. Tags are applied via
    apply_tag_vectorized using TAG_GROUPS['Cantrips'].

    Args:
        df: Card DataFrame to tag; modified in place.
        color: Color identifier, used only in log messages.

    Returns:
        None.
    """
    logging.info('Tagging cantrips in %s_cards.csv', color)

    # Coerce mana values so the numeric comparison below never sees a string.
    df['manaValue'] = pd.to_numeric(df['manaValue'], errors='coerce')

    # Cards excluded from cantrip tagging by exact name.
    # NOTE(review): several entries look misspelled and will never match via
    # isin() — e.g. 'Cuse of Vengeance', 'Shielded Aether Theif',
    # "Witch's Caultron", 'silverquill Silencer', 'Thalakos seer',
    # 'Sage of Lat-Name'. Verify against the card database before changing.
    skip_names = {
        'Archivist of Oghma', 'Argothian Enchantress', 'Audacity', 'Betrayal', 'Bequeathal', 'Blood Scrivener', 'Brigon, Soldier of Meletis',
        'Compost', 'Concealing curtains // Revealing Eye', 'Cryptbreaker', 'Curiosity', 'Cuse of Vengeance', 'Cryptek', 'Dakra Mystic',
        'Dawn of a New Age', 'Dockside Chef', 'Dreamcatcher', 'Edgewall Innkeeper', 'Eidolon of Philosophy', 'Evolved Sleeper',
        'Femeref Enchantress', 'Finneas, Ace Archer', 'Flumph', 'Folk Hero', 'Frodo, Adventurous Hobbit', 'Goblin Artisans',
        'Goldberry, River-Daughter', 'Gollum, Scheming Guide', 'Hatching Plans', 'Ideas Unbound', 'Ingenius Prodigy', 'Ior Ruin Expedition',
        "Jace's Erasure", 'Keeper of the Mind', 'Kor Spiritdancer', 'Lodestone Bauble', 'Puresteel Paladin', 'Jeweled Bird', 'Mindblade Render',
        "Multani's Presence", "Nahiri's Lithoforming", 'Ordeal of Thassa', 'Pollywog Prodigy', 'Priest of Forgotten Gods', 'Ravenous Squirrel',
        'Read the Runes', 'Red Death, Shipwrecker', 'Roil Cartographer', 'Sage of Lat-Name', 'Saprazzan Heir', 'Scion of Halaster', 'See Beyond',
        'Selhoff Entomber', 'Shielded Aether Theif', 'Shore Keeper', 'silverquill Silencer', 'Soldevi Sage', 'Soldevi Sentry', 'Spiritual Focus',
        'Sram, Senior Edificer', 'Staff of the Storyteller', 'Stirge', 'Sylvan Echoes', "Sythis Harvest's Hand", 'Sygg, River Cutthroat',
        'Tenuous Truce', 'Test of Talents', 'Thalakos seer', "Tribute to Horobi // Echo of Deaths Wail", 'Vampire Gourmand', 'Vampiric Rites',
        'Vampirism', 'Vessel of Paramnesia', "Witch's Caultron", 'Wall of Mulch', 'Waste Not', 'Well Rested'
    }

    # Any single hit here disqualifies the row outright.
    disqualified = (
        df['type'].str.contains('Land|Equipment', na=False)
        | df['keywords'].str.contains('Channel|Cycling|Connive|Learn|Ravenous', na=False)
        | df['themeTags'].apply(lambda tags: 'Loot' in tags)
        | df['name'].isin(skip_names)
    )

    # A cantrip must be cheap and must actually draw a card.
    draws_card = df['text'].str.contains('draw a card', case=False, na=False)
    is_cheap = df['manaValue'] <= 2

    apply_tag_vectorized(df, ~disqualified & draws_card & is_cheap, TAG_GROUPS['Cantrips'])
    logging.info('Finished tagging cantrips in %s_cards.csv', color)

View file

@ -1,4 +1,18 @@
def pluralize(word):
from typing import Union, List
import pandas as pd
import re
import logging
from typing import Dict, Optional, Set
from time import perf_counter
def pluralize(word: str) -> str:
"""Convert a word to its plural form using basic English pluralization rules.
Args:
word: The singular word to pluralize
Returns:
The pluralized word
"""
if word.endswith('y'):
return word[:-1] + 'ies'
elif word.endswith(('s', 'sh', 'ch', 'x', 'z')):
@ -8,10 +22,261 @@ def pluralize(word):
else:
return word + 's'
def sort_list(list_to_sort):
    """Return a sorted copy when given a list; pass any other value through unchanged."""
    if not isinstance(list_to_sort, list):
        return list_to_sort
    return sorted(list_to_sort)
def sort_list(items: Union[List, pd.Series]) -> Union[List, pd.Series]:
    """Sort a list or pandas Series in ascending order.

    Args:
        items: List or Series to sort.

    Returns:
        A sorted copy for lists, a value-sorted Series for Series,
        and the input unchanged for any other type.
    """
    if isinstance(items, list):
        return sorted(items)
    if isinstance(items, pd.Series):
        return items.sort_values()
    return items
def create_regex_mask(df: pd.DataFrame, column: str, pattern: str) -> pd.Series:
    """Build a boolean mask marking rows whose column matches a regex.

    Args:
        df: DataFrame to search.
        column: Name of the column to search in.
        pattern: Regex pattern; matching is case-insensitive and NA-safe
            (missing values count as non-matches).

    Returns:
        Boolean Series aligned with df, True where the pattern matched.
    """
    target = df[column]
    return target.str.contains(pattern, case=False, na=False, regex=True)
def combine_masks(masks: List[pd.Series], logical_operator: str = 'and') -> pd.Series:
    """Fold a list of boolean masks into one with AND or OR.

    Args:
        masks: Boolean Series masks to combine.
        logical_operator: 'and' for conjunction; anything else means OR.

    Returns:
        The combined boolean mask, or an empty boolean Series when no
        masks are given.
    """
    if not masks:
        return pd.Series([], dtype=bool)
    combined = masks[0]
    # Pick the operator once instead of re-testing it inside the loop.
    if logical_operator == 'and':
        for extra in masks[1:]:
            combined = combined & extra
    else:
        for extra in masks[1:]:
            combined = combined | extra
    return combined
def safe_str_contains(series: pd.Series, patterns: Union[str, List[str]], regex: bool = False) -> pd.Series:
    """Safely check whether strings in a Series contain any of the given patterns.

    NA values are treated as empty strings, so they never match and never
    propagate NaN into the result.

    Args:
        series: String Series to check.
        patterns: A single pattern or a list of patterns to look for.
        regex: When True, patterns are regex expressions joined with '|';
            when False, each pattern is a plain case-insensitive substring.

    Returns:
        Boolean Series, True where any pattern matched.
    """
    if isinstance(patterns, str):
        patterns = [patterns]
    if regex:
        # One combined alternation keeps this to a single vectorized pass.
        pattern = '|'.join(f'({p})' for p in patterns)
        return series.fillna('').str.contains(pattern, case=False, na=False, regex=True)
    # Bug fix: the previous version had a stray `return list_to_sort` (a leftover
    # from an unrelated function) at the top of this branch, which referenced an
    # undefined name and made the substring path unreachable.
    masks = [series.fillna('').str.contains(p, case=False, na=False, regex=False) for p in patterns]
    return pd.concat(masks, axis=1).any(axis=1)
def create_type_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex: bool = True) -> pd.Series:
    """Build a boolean mask for rows whose 'type' column matches the given pattern(s).

    Args:
        df: DataFrame with a 'type' column.
        type_text: One pattern or a list of patterns to match.
        regex: Treat patterns as regex (joined with '|') when True; plain
            substrings otherwise. Matching is case-insensitive and NA-safe.

    Returns:
        Boolean Series, True where the type line matched.

    Raises:
        ValueError: If type_text is empty or None.
        TypeError: If type_text is neither a string nor a list of strings.
    """
    if not type_text:
        raise ValueError("type_text cannot be empty or None")
    if isinstance(type_text, str):
        patterns = [type_text]
    elif isinstance(type_text, list):
        patterns = type_text
    else:
        raise TypeError("type_text must be a string or list of strings")
    type_col = df['type']
    if regex:
        return type_col.str.contains('|'.join(patterns), case=False, na=False, regex=True)
    per_pattern = [type_col.str.contains(p, case=False, na=False, regex=False) for p in patterns]
    return pd.concat(per_pattern, axis=1).any(axis=1)
def create_combined_type_mask(df: pd.DataFrame, type_patterns: Dict[str, List[str]], logical_operator: str = 'and') -> pd.Series:
    """Combine per-category type masks into one boolean mask.

    Args:
        df: DataFrame to search.
        type_patterns: Maps category names to lists of type patterns; the
            category keys themselves are only labels and are not matched.
        logical_operator: How to join the category masks ('and' or 'or').

    Returns:
        Combined boolean mask; an all-True mask when no patterns are given.

    Example:
        patterns = {
            'creature': ['Creature', 'Artifact Creature'],
            'enchantment': ['Enchantment', 'Enchantment Creature']
        }
        mask = create_combined_type_mask(df, patterns, 'or')
    """
    if not type_patterns:
        # No constraints means every row qualifies.
        return pd.Series(True, index=df.index)
    per_category = [create_type_mask(df, patterns) for patterns in type_patterns.values()]
    return combine_masks(per_category, logical_operator)
def extract_creature_types(type_text: str, creature_types: List[str], non_creature_types: List[str]) -> List[str]:
    """Pull known creature types out of a type-line string.

    Args:
        type_text: The type line to parse, split on whitespace.
        non_creature_types: Types to exclude even if they also appear in
            creature_types.
        creature_types: Valid creature types to keep.

    Returns:
        The matching creature types, in order of appearance (duplicates kept).
    """
    kept = []
    for token in type_text.split():
        candidate = token.strip()
        if candidate in creature_types and candidate not in non_creature_types:
            kept.append(candidate)
    return kept
def find_types_in_text(text: str, name: str, creature_types: List[str]) -> List[str]:
    """Find creature types mentioned in card text.

    Words are stripped of everything but letters and hyphens before being
    matched against the known creature types. Types that appear as a
    substring of the card's own name are skipped.

    Args:
        text: Card text to search; NA/None yields an empty result.
        name: Card name whose (sub)words are excluded from matches.
        creature_types: List of valid creature types.

    Returns:
        Sorted list of unique creature types found. (The previous version
        returned list(set(...)), whose order varies run-to-run under hash
        randomization; sorting makes the output deterministic.)
    """
    if pd.isna(text):
        return []
    found: Set[str] = set()
    for word in text.split():
        # Keep only letters and hyphens so punctuation doesn't break matches.
        clean_word = re.sub(r'[^a-zA-Z-]', '', word)
        # Substring check against the card name mirrors the original behavior —
        # NOTE(review): this also skips types that merely appear inside the
        # name (e.g. 'Elf' in 'Elfhame'); confirm that is intended.
        if clean_word in creature_types and clean_word not in name:
            found.add(clean_word)
    return sorted(found)
def add_outlaw_type(types: List[str], outlaw_types: List[str]) -> List[str]:
    """Append 'Outlaw' when any current type qualifies for it.

    Args:
        types: The card's current types.
        outlaw_types: Types that qualify a card as an Outlaw.

    Returns:
        A new list with 'Outlaw' appended when it qualifies and is not
        already present; otherwise the original list.
    """
    already_tagged = 'Outlaw' in types
    qualifies = any(current in outlaw_types for current in types)
    if qualifies and not already_tagged:
        return types + ['Outlaw']
    return types
def batch_update_types(df: pd.DataFrame, mask: pd.Series, new_types: List[str]) -> None:
    """Merge new creature types into every masked row's 'creatureTypes' list.

    Args:
        df: DataFrame to update in place.
        mask: Boolean mask selecting the rows to update.
        new_types: Types to union into each selected row.
    """
    def merged(existing: List[str]) -> List[str]:
        # Union with the new types, deduplicated and kept in sorted order.
        return sorted(set(existing + new_types))

    df.loc[mask, 'creatureTypes'] = df.loc[mask, 'creatureTypes'].apply(merged)
def create_tag_mask(df: pd.DataFrame, tag_patterns: Union[str, List[str]], column: str = 'themeTags') -> pd.Series:
    """Build a boolean mask for rows whose tag list matches any pattern.

    A row matches a pattern when the pattern is a substring of any tag in
    that row's tag list.

    Args:
        df: DataFrame to search.
        tag_patterns: One pattern or a list of patterns.
        column: Column holding the per-row tag lists (default 'themeTags').

    Returns:
        Boolean Series, True where any pattern matched; empty for an
        empty DataFrame.
    """
    patterns = [tag_patterns] if isinstance(tag_patterns, str) else tag_patterns
    if len(df) == 0:
        # Nothing to scan — keep the dtype explicit for downstream combinators.
        return pd.Series([], dtype=bool)
    per_pattern = [
        df[column].apply(lambda tags, p=p: any(p in tag for tag in tags))
        for p in patterns
    ]
    return pd.concat(per_pattern, axis=1).any(axis=1)
def validate_dataframe_columns(df: pd.DataFrame, required_columns: Set[str]) -> None:
    """Ensure the DataFrame has every required column.

    Args:
        df: DataFrame to validate.
        required_columns: Column names that must all be present.

    Raises:
        ValueError: Listing the missing columns, if any are absent.
    """
    missing = required_columns - set(df.columns)
    if not missing:
        return
    raise ValueError(f"Missing required columns: {missing}")
def apply_tag_vectorized(df: pd.DataFrame, mask: pd.Series, tags: List[str]) -> None:
    """Add tags to the 'themeTags' lists of every masked row, in place.

    Args:
        df: DataFrame to modify.
        mask: Boolean Series selecting the rows to tag.
        tags: Tag or list of tags to merge into each selected row.
    """
    # Accept a single tag for convenience.
    tag_list = tags if isinstance(tags, list) else [tags]
    selected = df.loc[mask, 'themeTags']
    # Union, dedupe, and keep each row's tag list sorted.
    df.loc[mask, 'themeTags'] = selected.apply(
        lambda existing: sorted(set(existing + tag_list))
    )
def log_performance_metrics(start_time: float, operation: str, df_size: int) -> None:
    """Log how long an operation took, overall and per row.

    Args:
        start_time: Start timestamp from perf_counter().
        operation: Description of the operation performed.
        df_size: Number of DataFrame rows processed.
    """
    duration = perf_counter() - start_time
    # Bug fix: guard against df_size == 0, which previously raised
    # ZeroDivisionError; report 0.00ms per row for an empty frame.
    per_row_ms = (duration / df_size * 1000) if df_size else 0.0
    # Lazy %-style args defer string formatting until the record is emitted.
    logging.info(
        "%s completed in %.2fs for %d rows (%.2fms per row)",
        operation, duration, df_size, per_row_ms,
    )