Started refactoring the tagging functions, using Traycer

This commit is contained in:
mwisnowski 2024-12-31 12:04:10 -08:00
parent e404515d91
commit a4abea2c3c
4 changed files with 736 additions and 516 deletions

View file

@ -32,6 +32,18 @@ board_wipe_tags = ['destroy all', 'destroy each', 'return all', 'return each', '
# All top-level card types the tagger recognizes on a type line.
card_types = ['Artifact','Creature', 'Enchantment', 'Instant', 'Land', 'Planeswalker', 'Sorcery',
'Kindred', 'Dungeon', 'Battle']
# Mapping of card types to their corresponding theme tags
# NOTE(review): 'Superfriends' here vs 'Super Friends' in DEFAULT_THEME_TAGS —
# confirm which spelling is canonical; as written the two will never match.
TYPE_TAG_MAPPING = {
'Artifact': ['Artifacts Matter'],
'Battle': ['Battles Matter'],
# 'Creature' is deliberately commented out — creatures get no blanket theme tag here.
#'Creature': [],
'Enchantment': ['Enchantments Matter'],
'Instant': ['Spells Matter', 'Spellslinger'],
'Land': ['Lands Matter'],
'Planeswalker': ['Superfriends'],
'Sorcery': ['Spells Matter', 'Spellslinger']
}
# Directory that holds the per-color card CSV files read and written by the tagger.
csv_directory = 'csv_files'
colors = ['colorless', 'white', 'blue', 'black', 'red', 'green',
@ -149,3 +161,31 @@ targetted_removal_tags = ['exile target', 'destroy target', 'return target', 'sh
'deals damage to target', 'loses all abilities']
# Words that introduce triggered abilities in rules text.
triggers = ['when', 'whenever', 'at']
# Constants for DataFrame validation and processing
# Columns every card DataFrame must contain before tagging runs
# (enforced by validate_dataframe_columns).
REQUIRED_COLUMNS = [
'name', 'faceName', 'edhrecRank', 'colorIdentity', 'colors',
'manaCost', 'manaValue', 'type', 'creatureTypes', 'text',
'power', 'toughness', 'keywords', 'themeTags', 'layout', 'side'
]
# Default pool of theme tags.
# NOTE(review): spells 'Super Friends' while TYPE_TAG_MAPPING uses 'Superfriends' —
# confirm which spelling is canonical.
DEFAULT_THEME_TAGS = [
'Aggro', 'Aristocrats', 'Artifacts Matter', 'Big Mana', 'Blink',
'Board Wipes', 'Burn', 'Cantrips', 'Card Draw', 'Clones',
'Combat Matters', 'Control', 'Counters Matter', 'Energy',
'Enter the Battlefield', 'Equipment', 'Exile Matters', 'Infect',
'Interaction', 'Lands Matter', 'Leave the Battlefield', 'Legends Matter',
'Life Matters', 'Mill', 'Monarch', 'Protection', 'Ramp', 'Reanimate',
'Removal', 'Sacrifice Matters', 'Spellslinger', 'Stax', 'Super Friends',
'Theft', 'Token Creation', 'Tokens Matter', 'Voltron', 'X Spells'
]
# Column order for output.
# NOTE(review): identical to REQUIRED_COLUMNS — consider defining one in terms
# of the other so they cannot drift apart.
COLUMN_ORDER = [
'name', 'faceName', 'edhrecRank', 'colorIdentity', 'colors',
'manaCost', 'manaValue', 'type', 'creatureTypes', 'text',
'power', 'toughness', 'keywords', 'themeTags', 'layout', 'side'
]
# Constants for type detection and processing
# Creature types that additionally qualify a card as an 'Outlaw'.
OUTLAW_TYPES = ['Assassin', 'Mercenary', 'Pirate', 'Rogue', 'Warlock']
# Batch size for type-detection passes — presumably rows per chunk; verify at call sites.
TYPE_DETECTION_BATCH_SIZE = 1000

872
tagger.py

File diff suppressed because it is too large Load diff

View file

@ -1,61 +0,0 @@
def tag_for_cantrips(df, color):
    """Flag cantrips — spells with mana value <= 2 that draw a card — in place.

    Rows whose type line, keywords, existing 'Loot' theme tag, or exact name
    match the exclusion lists below are never tagged. Tags are applied via
    apply_tag_vectorized using TAG_GROUPS['Cantrips'].

    Args:
        df: Card DataFrame to tag; modified in place.
        color: Color identifier, used only in log messages.

    Returns:
        None.
    """
    logging.info('Tagging cantrips in %s_cards.csv', color)

    # Coerce mana values so the numeric comparison below never sees a string.
    df['manaValue'] = pd.to_numeric(df['manaValue'], errors='coerce')

    # Cards excluded from cantrip tagging by exact name.
    # NOTE(review): several entries look misspelled and will never match via
    # isin() — e.g. 'Cuse of Vengeance', 'Shielded Aether Theif',
    # "Witch's Caultron", 'silverquill Silencer', 'Thalakos seer',
    # 'Sage of Lat-Name'. Verify against the card database before changing.
    skip_names = {
        'Archivist of Oghma', 'Argothian Enchantress', 'Audacity', 'Betrayal', 'Bequeathal', 'Blood Scrivener', 'Brigon, Soldier of Meletis',
        'Compost', 'Concealing curtains // Revealing Eye', 'Cryptbreaker', 'Curiosity', 'Cuse of Vengeance', 'Cryptek', 'Dakra Mystic',
        'Dawn of a New Age', 'Dockside Chef', 'Dreamcatcher', 'Edgewall Innkeeper', 'Eidolon of Philosophy', 'Evolved Sleeper',
        'Femeref Enchantress', 'Finneas, Ace Archer', 'Flumph', 'Folk Hero', 'Frodo, Adventurous Hobbit', 'Goblin Artisans',
        'Goldberry, River-Daughter', 'Gollum, Scheming Guide', 'Hatching Plans', 'Ideas Unbound', 'Ingenius Prodigy', 'Ior Ruin Expedition',
        "Jace's Erasure", 'Keeper of the Mind', 'Kor Spiritdancer', 'Lodestone Bauble', 'Puresteel Paladin', 'Jeweled Bird', 'Mindblade Render',
        "Multani's Presence", "Nahiri's Lithoforming", 'Ordeal of Thassa', 'Pollywog Prodigy', 'Priest of Forgotten Gods', 'Ravenous Squirrel',
        'Read the Runes', 'Red Death, Shipwrecker', 'Roil Cartographer', 'Sage of Lat-Name', 'Saprazzan Heir', 'Scion of Halaster', 'See Beyond',
        'Selhoff Entomber', 'Shielded Aether Theif', 'Shore Keeper', 'silverquill Silencer', 'Soldevi Sage', 'Soldevi Sentry', 'Spiritual Focus',
        'Sram, Senior Edificer', 'Staff of the Storyteller', 'Stirge', 'Sylvan Echoes', "Sythis Harvest's Hand", 'Sygg, River Cutthroat',
        'Tenuous Truce', 'Test of Talents', 'Thalakos seer', "Tribute to Horobi // Echo of Deaths Wail", 'Vampire Gourmand', 'Vampiric Rites',
        'Vampirism', 'Vessel of Paramnesia', "Witch's Caultron", 'Wall of Mulch', 'Waste Not', 'Well Rested'
    }

    # Any single hit here disqualifies the row outright.
    disqualified = (
        df['type'].str.contains('Land|Equipment', na=False)
        | df['keywords'].str.contains('Channel|Cycling|Connive|Learn|Ravenous', na=False)
        | df['themeTags'].apply(lambda tags: 'Loot' in tags)
        | df['name'].isin(skip_names)
    )

    # A cantrip must be cheap and must actually draw a card.
    draws_card = df['text'].str.contains('draw a card', case=False, na=False)
    is_cheap = df['manaValue'] <= 2

    apply_tag_vectorized(df, ~disqualified & draws_card & is_cheap, TAG_GROUPS['Cantrips'])
    logging.info('Finished tagging cantrips in %s_cards.csv', color)

View file

@ -1,4 +1,18 @@
def pluralize(word):
from typing import Union, List
import pandas as pd
import re
import logging
from typing import Dict, Optional, Set
from time import perf_counter
def pluralize(word: str) -> str:
"""Convert a word to its plural form using basic English pluralization rules.
Args:
word: The singular word to pluralize
Returns:
The pluralized word
"""
if word.endswith('y'):
return word[:-1] + 'ies'
elif word.endswith(('s', 'sh', 'ch', 'x', 'z')):
@ -8,10 +22,261 @@ def pluralize(word):
else:
return word + 's'
def sort_list(list_to_sort):
    """Return a sorted copy when given a list; pass any other value through unchanged."""
    if not isinstance(list_to_sort, list):
        return list_to_sort
    return sorted(list_to_sort)
def sort_list(items: Union[List, pd.Series]) -> Union[List, pd.Series]:
    """Sort a list or pandas Series in ascending order.

    Args:
        items: List or Series to sort.

    Returns:
        A sorted copy for lists, a value-sorted Series for Series,
        and the input unchanged for any other type.
    """
    if isinstance(items, list):
        return sorted(items)
    if isinstance(items, pd.Series):
        return items.sort_values()
    return items
def create_regex_mask(df: pd.DataFrame, column: str, pattern: str) -> pd.Series:
    """Build a boolean mask marking rows whose column matches a regex.

    Args:
        df: DataFrame to search.
        column: Name of the column to search in.
        pattern: Regex pattern; matching is case-insensitive and NA-safe
            (missing values count as non-matches).

    Returns:
        Boolean Series aligned with df, True where the pattern matched.
    """
    target = df[column]
    return target.str.contains(pattern, case=False, na=False, regex=True)
def combine_masks(masks: List[pd.Series], logical_operator: str = 'and') -> pd.Series:
    """Fold a list of boolean masks into one with AND or OR.

    Args:
        masks: Boolean Series masks to combine.
        logical_operator: 'and' for conjunction; anything else means OR.

    Returns:
        The combined boolean mask, or an empty boolean Series when no
        masks are given.
    """
    if not masks:
        return pd.Series([], dtype=bool)
    combined = masks[0]
    # Pick the operator once instead of re-testing it inside the loop.
    if logical_operator == 'and':
        for extra in masks[1:]:
            combined = combined & extra
    else:
        for extra in masks[1:]:
            combined = combined | extra
    return combined
def safe_str_contains(series: pd.Series, patterns: Union[str, List[str]], regex: bool = False) -> pd.Series:
    """Safely check whether strings in a Series contain any of the given patterns.

    NA values are treated as empty strings, so they never match and never
    propagate NaN into the result.

    Args:
        series: String Series to check.
        patterns: A single pattern or a list of patterns to look for.
        regex: When True, patterns are regex expressions joined with '|';
            when False, each pattern is a plain case-insensitive substring.

    Returns:
        Boolean Series, True where any pattern matched.
    """
    if isinstance(patterns, str):
        patterns = [patterns]
    if regex:
        # One combined alternation keeps this to a single vectorized pass.
        pattern = '|'.join(f'({p})' for p in patterns)
        return series.fillna('').str.contains(pattern, case=False, na=False, regex=True)
    # Bug fix: the previous version had a stray `return list_to_sort` (a leftover
    # from an unrelated function) at the top of this branch, which referenced an
    # undefined name and made the substring path unreachable.
    masks = [series.fillna('').str.contains(p, case=False, na=False, regex=False) for p in patterns]
    return pd.concat(masks, axis=1).any(axis=1)
def create_type_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex: bool = True) -> pd.Series:
    """Build a boolean mask for rows whose 'type' column matches the given pattern(s).

    Args:
        df: DataFrame with a 'type' column.
        type_text: One pattern or a list of patterns to match.
        regex: Treat patterns as regex (joined with '|') when True; plain
            substrings otherwise. Matching is case-insensitive and NA-safe.

    Returns:
        Boolean Series, True where the type line matched.

    Raises:
        ValueError: If type_text is empty or None.
        TypeError: If type_text is neither a string nor a list of strings.
    """
    if not type_text:
        raise ValueError("type_text cannot be empty or None")
    if isinstance(type_text, str):
        patterns = [type_text]
    elif isinstance(type_text, list):
        patterns = type_text
    else:
        raise TypeError("type_text must be a string or list of strings")
    type_col = df['type']
    if regex:
        return type_col.str.contains('|'.join(patterns), case=False, na=False, regex=True)
    per_pattern = [type_col.str.contains(p, case=False, na=False, regex=False) for p in patterns]
    return pd.concat(per_pattern, axis=1).any(axis=1)
def create_combined_type_mask(df: pd.DataFrame, type_patterns: Dict[str, List[str]], logical_operator: str = 'and') -> pd.Series:
    """Combine per-category type masks into one boolean mask.

    Args:
        df: DataFrame to search.
        type_patterns: Maps category names to lists of type patterns; the
            category keys themselves are only labels and are not matched.
        logical_operator: How to join the category masks ('and' or 'or').

    Returns:
        Combined boolean mask; an all-True mask when no patterns are given.

    Example:
        patterns = {
            'creature': ['Creature', 'Artifact Creature'],
            'enchantment': ['Enchantment', 'Enchantment Creature']
        }
        mask = create_combined_type_mask(df, patterns, 'or')
    """
    if not type_patterns:
        # No constraints means every row qualifies.
        return pd.Series(True, index=df.index)
    per_category = [create_type_mask(df, patterns) for patterns in type_patterns.values()]
    return combine_masks(per_category, logical_operator)
def extract_creature_types(type_text: str, creature_types: List[str], non_creature_types: List[str]) -> List[str]:
    """Pull known creature types out of a type-line string.

    Args:
        type_text: The type line to parse, split on whitespace.
        non_creature_types: Types to exclude even if they also appear in
            creature_types.
        creature_types: Valid creature types to keep.

    Returns:
        The matching creature types, in order of appearance (duplicates kept).
    """
    kept = []
    for token in type_text.split():
        candidate = token.strip()
        if candidate in creature_types and candidate not in non_creature_types:
            kept.append(candidate)
    return kept
def find_types_in_text(text: str, name: str, creature_types: List[str]) -> List[str]:
    """Find creature types mentioned in card text.

    Words are stripped of everything but letters and hyphens before being
    matched against the known creature types. Types that appear as a
    substring of the card's own name are skipped.

    Args:
        text: Card text to search; NA/None yields an empty result.
        name: Card name whose (sub)words are excluded from matches.
        creature_types: List of valid creature types.

    Returns:
        Sorted list of unique creature types found. (The previous version
        returned list(set(...)), whose order varies run-to-run under hash
        randomization; sorting makes the output deterministic.)
    """
    if pd.isna(text):
        return []
    found: Set[str] = set()
    for word in text.split():
        # Keep only letters and hyphens so punctuation doesn't break matches.
        clean_word = re.sub(r'[^a-zA-Z-]', '', word)
        # Substring check against the card name mirrors the original behavior —
        # NOTE(review): this also skips types that merely appear inside the
        # name (e.g. 'Elf' in 'Elfhame'); confirm that is intended.
        if clean_word in creature_types and clean_word not in name:
            found.add(clean_word)
    return sorted(found)
def add_outlaw_type(types: List[str], outlaw_types: List[str]) -> List[str]:
    """Append 'Outlaw' when any current type qualifies for it.

    Args:
        types: The card's current types.
        outlaw_types: Types that qualify a card as an Outlaw.

    Returns:
        A new list with 'Outlaw' appended when it qualifies and is not
        already present; otherwise the original list.
    """
    already_tagged = 'Outlaw' in types
    qualifies = any(current in outlaw_types for current in types)
    if qualifies and not already_tagged:
        return types + ['Outlaw']
    return types
def batch_update_types(df: pd.DataFrame, mask: pd.Series, new_types: List[str]) -> None:
    """Merge new creature types into every masked row's 'creatureTypes' list.

    Args:
        df: DataFrame to update in place.
        mask: Boolean mask selecting the rows to update.
        new_types: Types to union into each selected row.
    """
    def merged(existing: List[str]) -> List[str]:
        # Union with the new types, deduplicated and kept in sorted order.
        return sorted(set(existing + new_types))

    df.loc[mask, 'creatureTypes'] = df.loc[mask, 'creatureTypes'].apply(merged)
def create_tag_mask(df: pd.DataFrame, tag_patterns: Union[str, List[str]], column: str = 'themeTags') -> pd.Series:
    """Build a boolean mask for rows whose tag list matches any pattern.

    A row matches a pattern when the pattern is a substring of any tag in
    that row's tag list.

    Args:
        df: DataFrame to search.
        tag_patterns: One pattern or a list of patterns.
        column: Column holding the per-row tag lists (default 'themeTags').

    Returns:
        Boolean Series, True where any pattern matched; empty for an
        empty DataFrame.
    """
    patterns = [tag_patterns] if isinstance(tag_patterns, str) else tag_patterns
    if len(df) == 0:
        # Nothing to scan — keep the dtype explicit for downstream combinators.
        return pd.Series([], dtype=bool)
    per_pattern = [
        df[column].apply(lambda tags, p=p: any(p in tag for tag in tags))
        for p in patterns
    ]
    return pd.concat(per_pattern, axis=1).any(axis=1)
def validate_dataframe_columns(df: pd.DataFrame, required_columns: Set[str]) -> None:
    """Ensure the DataFrame has every required column.

    Args:
        df: DataFrame to validate.
        required_columns: Column names that must all be present.

    Raises:
        ValueError: Listing the missing columns, if any are absent.
    """
    missing = required_columns - set(df.columns)
    if not missing:
        return
    raise ValueError(f"Missing required columns: {missing}")
def apply_tag_vectorized(df: pd.DataFrame, mask: pd.Series, tags: List[str]) -> None:
    """Add tags to the 'themeTags' lists of every masked row, in place.

    Args:
        df: DataFrame to modify.
        mask: Boolean Series selecting the rows to tag.
        tags: Tag or list of tags to merge into each selected row.
    """
    # Accept a single tag for convenience.
    tag_list = tags if isinstance(tags, list) else [tags]
    selected = df.loc[mask, 'themeTags']
    # Union, dedupe, and keep each row's tag list sorted.
    df.loc[mask, 'themeTags'] = selected.apply(
        lambda existing: sorted(set(existing + tag_list))
    )
def log_performance_metrics(start_time: float, operation: str, df_size: int) -> None:
    """Log how long an operation took, overall and per row.

    Args:
        start_time: Start timestamp from perf_counter().
        operation: Description of the operation performed.
        df_size: Number of DataFrame rows processed.
    """
    duration = perf_counter() - start_time
    # Bug fix: guard against df_size == 0, which previously raised
    # ZeroDivisionError; report 0.00ms per row for an empty frame.
    per_row_ms = (duration / df_size * 1000) if df_size else 0.0
    # Lazy %-style args defer string formatting until the record is emitted.
    logging.info(
        "%s completed in %.2fs for %d rows (%.2fms per row)",
        operation, duration, df_size, per_row_ms,
    )