Massively overhauled the tagging process, refactored code to improve general efficiency, and implemented parallel processing to reduce total runtime

2025-09-22 04:50:46 +02:00 · 2025-08-14 11:21:09 -07:00 · 2025-08-14 11:21:09 -07:00 · 27ee13fb54
commit 27ee13fb54
parent 02e2c09874
7 changed files with 742 additions and 519 deletions
--- a/code/deck_builder/builder.py
+++ b/code/deck_builder/builder.py
@ -1,33 +1,29 @@
 from __future__ import annotations

 import math
-import numpy as np
-import os
-import random
+import pprint
 import time
-from functools import lru_cache
+# from functools import lru_cache
 from typing import Dict, List, Optional, Union

-import inquirer.prompt
-import keyboard
+# import keyboard
 import pandas as pd
-import pprint
-from fuzzywuzzy import process
+# from fuzzywuzzy import process
 from tqdm import tqdm

 from settings import CSV_DIRECTORY, MULTIPLE_COPY_CARDS
 from .builder_constants import (
    BASIC_LANDS, CARD_TYPES, DEFAULT_NON_BASIC_LAND_SLOTS,
-    COMMANDER_CSV_PATH, FUZZY_MATCH_THRESHOLD, MAX_FUZZY_CHOICES, FETCH_LAND_DEFAULT_COUNT,
+    FETCH_LAND_DEFAULT_COUNT,
    COMMANDER_POWER_DEFAULT, COMMANDER_TOUGHNESS_DEFAULT, COMMANDER_MANA_COST_DEFAULT,
    COMMANDER_MANA_VALUE_DEFAULT, COMMANDER_TYPE_DEFAULT, COMMANDER_TEXT_DEFAULT, 
    THEME_PRIORITY_BONUS, THEME_POOL_SIZE_MULTIPLIER, DECK_DIRECTORY,
    COMMANDER_COLOR_IDENTITY_DEFAULT, COMMANDER_COLORS_DEFAULT, COMMANDER_TAGS_DEFAULT, 
    COMMANDER_THEMES_DEFAULT, COMMANDER_CREATURE_TYPES_DEFAULT, DUAL_LAND_TYPE_MAP,
-    CSV_READ_TIMEOUT, CSV_PROCESSING_BATCH_SIZE, CSV_VALIDATION_RULES, CSV_REQUIRED_COLUMNS,
+    CSV_READ_TIMEOUT, CSV_VALIDATION_RULES, CSV_REQUIRED_COLUMNS,
    STAPLE_LAND_CONDITIONS, TRIPLE_LAND_TYPE_MAP, MISC_LAND_MAX_COUNT, MISC_LAND_MIN_COUNT,
    MISC_LAND_POOL_SIZE, LAND_REMOVAL_MAX_ATTEMPTS, PROTECTED_LANDS,
-    MANA_COLORS, MANA_PIP_PATTERNS, THEME_WEIGHT_MULTIPLIER
+    MANA_COLORS, THEME_WEIGHT_MULTIPLIER
 )
 from . import builder_utils
 from file_setup import setup_utils
@ -75,7 +71,7 @@ from type_definitions import (
    ArtifactDF,
    CreatureDF,
    NonCreatureDF,
-    PlaneswalkerDF,
+    
    NonPlaneswalkerDF)

 import logging_util
--- a/code/input_handler.py
+++ b/code/input_handler.py
@ -2,16 +2,14 @@

 from __future__ import annotations

-import logging
-import os
-from typing import Any, List, Optional, Tuple, Union
+from typing import Any, List, Tuple, Union

-import inquirer.prompt 
+import inquirer 
 from settings import (
-    COLORS, COLOR_ABRV
+    COLOR_ABRV
 )
 from deck_builder.builder_constants import (DEFAULT_MAX_CARD_PRICE,
-    DEFAULT_MAX_DECK_PRICE, DEFAULT_THEME_TAGS, MONO_COLOR_MAP,
+    DEFAULT_THEME_TAGS, MONO_COLOR_MAP,
    DUAL_COLOR_MAP, TRI_COLOR_MAP, OTHER_COLOR_MAP
 )

@ -26,7 +24,7 @@ from exceptions import (
    InvalidNumberError,
    InvalidQuestionTypeError,
    MaxAttemptsError,
-    PriceError,
+    
    PriceLimitError,
    PriceValidationError
 )
--- a/code/main.py
+++ b/code/main.py
@ -13,7 +13,7 @@ from pathlib import Path
 from typing import NoReturn, Optional

 # Third-party imports
-import inquirer.prompt
+import inquirer

 # Local imports
 from deck_builder import DeckBuilder
@ -104,7 +104,7 @@ def run_menu() -> NoReturn:
                case 'Setup':
                    setup()
                case 'Tag CSV Files':
-                    tagger.run_tagging()
+                    tagger.run_tagging(parallel=True)
                case 'Build a Deck':
                    builder.determine_commander()
                case 'Quit':
--- a/code/tagging/tag_constants.py
+++ b/code/tagging/tag_constants.py
@ -1,4 +1,4 @@
-from typing import Dict, List, Optional, Final, Tuple, Pattern, Union, Callable
+from typing import Dict, List, Final

 TRIGGERS: List[str] = ['when', 'whenever', 'at']

@ -20,7 +20,7 @@ TAG_GROUPS: Dict[str, List[str]] = {
 }

 # Common regex patterns
-PATTERN_GROUPS: Dict[str, Optional[str]]  = {
+PATTERN_GROUPS: Dict[str, str]  = {
    "draw": r"draw[s]? a card|draw[s]? one card",
    "combat": r"attack[s]?|block[s]?|combat damage",
    "tokens": r"create[s]? .* token|put[s]? .* token",
@ -30,6 +30,31 @@ PATTERN_GROUPS: Dict[str, Optional[str]]  = {
    "cost_reduction": r"cost[s]? \{[\d\w]\} less|affinity for|cost[s]? less to cast|chosen type cost|copy cost|from exile cost|from exile this turn cost|from your graveyard cost|has undaunted|have affinity for artifacts|other than your hand cost|spells cost|spells you cast cost|that target .* cost|those spells cost|you cast cost|you pay cost"
 }

+# Common phrase groups (lists) used across taggers
+PHRASE_GROUPS: Dict[str, List[str]] = {
+    # Variants for monarch wording
+    "monarch": [
+        r"becomes? the monarch",
+        r"can\'t become the monarch",
+        r"is the monarch",
+        r"was the monarch",
+        r"you are the monarch",
+        r"you become the monarch",
+        r"you can\'t become the monarch",
+        r"you\'re the monarch"
+    ],
+    # Variants for blink-style return to battlefield wording
+    "blink_return": [
+        r"return it to the battlefield",
+        r"return that card to the battlefield",
+        r"return them to the battlefield",
+        r"return those cards to the battlefield",
+        r"return .* to the battlefield"
+    ]
+}
+# Common action patterns
+CREATE_ACTION_PATTERN: Final[str] = r"create|put"
+
 # Creature/Counter types
 COUNTER_TYPES: List[str] = [r'\+0/\+1', r'\+0/\+2', r'\+1/\+0', r'\+1/\+2', r'\+2/\+0', r'\+2/\+2',
                '-0/-1', '-0/-2', '-1/-0', '-1/-2', '-2/-0', '-2/-2',
@ -128,7 +153,7 @@ REQUIRED_COLUMNS: List[str] = [
 ]

 # Mapping of card types to their corresponding theme tags
-TYPE_TAG_MAPPING: List[str] = {
+TYPE_TAG_MAPPING: Dict[str, List[str]] = {
    'Artifact': ['Artifacts Matter'],
    'Battle': ['Battles Matter'],
    #'Creature': [],
@ -268,12 +293,12 @@ LANDS_MATTER_PATTERNS: Dict[str, List[str]] = {
    ]
 }

-DOMAIN_PATTERNS: List[str] = {
+DOMAIN_PATTERNS: Dict[str, List[str]] = {
    'keyword': ['domain'],
    'text': ['basic land types among lands you control']
 }

-LANDFALL_PATTERNS: List[str] = {
+LANDFALL_PATTERNS: Dict[str, List[str]] = {
    'keyword': ['landfall'],
    'triggers': [
        'whenever a land enters the battlefield under your control',
@ -281,7 +306,7 @@ LANDFALL_PATTERNS: List[str] = {
    ]
 }

-LANDWALK_PATTERNS: List[str] = {
+LANDWALK_PATTERNS: Dict[str, List[str]] = {
    'basic': [
        'plainswalker',
        'islandwalk',
@ -404,7 +429,7 @@ ARISTOCRAT_EXCLUSION_PATTERNS: List[str] = [

 # Constants for stax functionality
 STAX_TEXT_PATTERNS: List[str] = [
-    'an opponent controls'
+    'an opponent controls',
    'can\'t attack',
    'can\'t be cast', 
    'can\'t be activated',
@ -422,11 +447,7 @@ STAX_TEXT_PATTERNS: List[str] = [
    'opponents control',
    'opponents control can\'t',
    'opponents control enter tapped',
-    'spells cost {1} more',
-    'spells cost {2} more',
-    'spells cost {3} more',
-    'spells cost {4} more',
-    'spells cost {5} more',
+    r'spells cost \{\d\} more',
    'that player doesn\'t',
    'unless that player pays',
    'you control your opponent',
--- a/code/tagging/tag_utils.py
+++ b/code/tagging/tag_utils.py
@ -16,7 +16,10 @@ from __future__ import annotations

 # Standard library imports
 import re
-from typing import List, Set, Union, Any
+from typing import List, Set, Union, Any, Tuple
+from functools import lru_cache
+
+import numpy as np

 # Third-party imports
 import pandas as pd
@ -24,6 +27,43 @@ import pandas as pd
 # Local application imports
 from . import tag_constants

+
+# --- Internal helpers for performance -----------------------------------------------------------
+@lru_cache(maxsize=2048)
+def _build_joined_pattern(parts: Tuple[str, ...]) -> str:
+    """Join multiple regex parts with '|'. Cached for reuse across calls."""
+    return '|'.join(parts)
+
+
+@lru_cache(maxsize=2048)
+def _compile_pattern(pattern: str, ignore_case: bool = True):
+    """Compile a regex pattern with optional IGNORECASE. Cached for reuse."""
+    flags = re.IGNORECASE if ignore_case else 0
+    return re.compile(pattern, flags)
+
+def _ensure_norm_series(df: pd.DataFrame, source_col: str, norm_col: str) -> pd.Series:
+    """Ensure a cached normalized string series exists on df for source_col.
+
+    Normalization here means: fillna('') and cast to str once. This avoids
+    repeating fill/astype work on every mask creation. Extra columns are
+    later dropped by final reindex in output.
+
+    Args:
+        df: DataFrame containing the column
+        source_col: Name of the source column (e.g., 'text')
+        norm_col: Name of the cache column to create/use (e.g., '__text_s')
+
+    Returns:
+        The normalized pandas Series.
+    """
+    if norm_col in df.columns:
+        return df[norm_col]
+    # Create normalized string series
+    series = df[source_col].fillna('') if source_col in df.columns else pd.Series([''] * len(df), index=df.index)
+    series = series.astype(str)
+    df[norm_col] = series
+    return df[norm_col]
+
 def pluralize(word: str) -> str:
    """Convert a word to its plural form using basic English pluralization rules.

@ -78,12 +118,21 @@ def create_type_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex:
    elif not isinstance(type_text, list):
        raise TypeError("type_text must be a string or list of strings")

+    if len(df) == 0:
+        return pd.Series([], dtype=bool)
+
+    # Use normalized cached series
+    type_series = _ensure_norm_series(df, 'type', '__type_s')
+
    if regex:
-        pattern = '|'.join(f'{p}' for p in type_text)
-        return df['type'].str.contains(pattern, case=False, na=False, regex=True)
+        pattern = _build_joined_pattern(tuple(type_text)) if len(type_text) > 1 else type_text[0]
+        compiled = _compile_pattern(pattern, ignore_case=True)
+        return type_series.str.contains(compiled, na=False, regex=True)
    else:
-        masks = [df['type'].str.contains(p, case=False, na=False, regex=False) for p in type_text]
-        return pd.concat(masks, axis=1).any(axis=1)
+        masks = [type_series.str.contains(p, case=False, na=False, regex=False) for p in type_text]
+        if not masks:
+            return pd.Series(False, index=df.index)
+        return pd.Series(np.logical_or.reduce(masks), index=df.index)

 def create_text_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex: bool = True, combine_with_or: bool = True) -> pd.Series[bool]:
    """Create a boolean mask for rows where text matches one or more patterns.
@ -109,15 +158,22 @@ def create_text_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex:
    elif not isinstance(type_text, list):
        raise TypeError("type_text must be a string or list of strings")

+    if len(df) == 0:
+        return pd.Series([], dtype=bool)
+
+    # Use normalized cached series
+    text_series = _ensure_norm_series(df, 'text', '__text_s')
+
    if regex:
-        pattern = '|'.join(f'{p}' for p in type_text)
-        return df['text'].str.contains(pattern, case=False, na=False, regex=True)
+        pattern = _build_joined_pattern(tuple(type_text)) if len(type_text) > 1 else type_text[0]
+        compiled = _compile_pattern(pattern, ignore_case=True)
+        return text_series.str.contains(compiled, na=False, regex=True)
    else:
-        masks = [df['text'].str.contains(p, case=False, na=False, regex=False) for p in type_text]
-        if combine_with_or:
-            return pd.concat(masks, axis=1).any(axis=1)
-        else:
-            return pd.concat(masks, axis=1).all(axis=1)
+        masks = [text_series.str.contains(p, case=False, na=False, regex=False) for p in type_text]
+        if not masks:
+            return pd.Series(False, index=df.index)
+        reduced = np.logical_or.reduce(masks) if combine_with_or else np.logical_and.reduce(masks)
+        return pd.Series(reduced, index=df.index)

 def create_keyword_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex: bool = True) -> pd.Series[bool]:
    """Create a boolean mask for rows where keyword text matches one or more patterns.
@ -151,18 +207,18 @@ def create_keyword_mask(df: pd.DataFrame, type_text: Union[str, List[str]], rege
    elif not isinstance(type_text, list):
        raise TypeError("type_text must be a string or list of strings")

-    # Create default mask for null values
-    # Handle null values and convert to string
-    keywords = df['keywords'].fillna('')
-    # Convert non-string values to strings
-    keywords = keywords.astype(str)
+    # Use normalized cached series for keywords
+    keywords = _ensure_norm_series(df, 'keywords', '__keywords_s')

    if regex:
-        pattern = '|'.join(f'{p}' for p in type_text)
-        return keywords.str.contains(pattern, case=False, na=False, regex=True)
+        pattern = _build_joined_pattern(tuple(type_text)) if len(type_text) > 1 else type_text[0]
+        compiled = _compile_pattern(pattern, ignore_case=True)
+        return keywords.str.contains(compiled, na=False, regex=True)
    else:
        masks = [keywords.str.contains(p, case=False, na=False, regex=False) for p in type_text]
-        return pd.concat(masks, axis=1).any(axis=1)
+        if not masks:
+            return pd.Series(False, index=df.index)
+        return pd.Series(np.logical_or.reduce(masks), index=df.index)

 def create_name_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex: bool = True) -> pd.Series[bool]:
    """Create a boolean mask for rows where name matches one or more patterns.
@ -187,12 +243,21 @@ def create_name_mask(df: pd.DataFrame, type_text: Union[str, List[str]], regex:
    elif not isinstance(type_text, list):
        raise TypeError("type_text must be a string or list of strings")

+    if len(df) == 0:
+        return pd.Series([], dtype=bool)
+
+    # Use normalized cached series
+    name_series = _ensure_norm_series(df, 'name', '__name_s')
+
    if regex:
-        pattern = '|'.join(f'{p}' for p in type_text)
-        return df['name'].str.contains(pattern, case=False, na=False, regex=True)
+        pattern = _build_joined_pattern(tuple(type_text)) if len(type_text) > 1 else type_text[0]
+        compiled = _compile_pattern(pattern, ignore_case=True)
+        return name_series.str.contains(compiled, na=False, regex=True)
    else:
-        masks = [df['name'].str.contains(p, case=False, na=False, regex=False) for p in type_text]
-        return pd.concat(masks, axis=1).any(axis=1)
+        masks = [name_series.str.contains(p, case=False, na=False, regex=False) for p in type_text]
+        if not masks:
+            return pd.Series(False, index=df.index)
+        return pd.Series(np.logical_or.reduce(masks), index=df.index)

 def extract_creature_types(type_text: str, creature_types: List[str], non_creature_types: List[str]) -> List[str]:
    """Extract creature types from a type text string.
@ -307,6 +372,31 @@ def apply_tag_vectorized(df: pd.DataFrame, mask: pd.Series[bool], tags: Union[st
    # Add new tags
    df.loc[mask, 'themeTags'] = current_tags.apply(lambda x: sorted(list(set(x + tags))))

+def apply_rules(df: pd.DataFrame, rules: List[dict]) -> None:
+        """Apply a list of rules to a DataFrame.
+
+        Each rule dict supports:
+            - mask: pd.Series of booleans or a callable df->mask
+            - tags: str|List[str]
+
+        Example:
+            rules = [
+                { 'mask': lambda d: create_text_mask(d, 'lifelink'), 'tags': ['Lifelink'] },
+            ]
+
+        Args:
+                df: DataFrame to update
+                rules: list of rule dicts
+        """
+        for rule in rules:
+                mask = rule.get('mask')
+                if callable(mask):
+                        mask = mask(df)
+                if mask is None:
+                        continue
+                tags = rule.get('tags', [])
+                apply_tag_vectorized(df, mask, tags)
+
 def create_mass_effect_mask(df: pd.DataFrame, effect_type: str) -> pd.Series[bool]:
    """Create a boolean mask for cards with mass removal effects of a specific type.

@ -326,6 +416,60 @@ def create_mass_effect_mask(df: pd.DataFrame, effect_type: str) -> pd.Series[boo
    patterns = tag_constants.BOARD_WIPE_TEXT_PATTERNS[effect_type]
    return create_text_mask(df, patterns)

+def create_trigger_mask(
+    df: pd.DataFrame,
+    subjects: Union[str, List[str]],
+    include_attacks: bool = False,
+) -> pd.Series:
+    """Create a mask for text that contains trigger phrases followed by subjects.
+
+    Example: with subjects=['a creature','you'] builds patterns:
+      'when a creature', 'whenever you', 'at you', etc.
+
+    Args:
+        df: DataFrame
+        subjects: A subject string or list (will be normalized to list)
+        include_attacks: If True, also include '{trigger} .* attacks'
+
+    Returns:
+        Boolean Series mask
+    """
+    subs = [subjects] if isinstance(subjects, str) else subjects
+    patterns: List[str] = []
+    for trig in tag_constants.TRIGGERS:
+        patterns.extend([f"{trig} {s}" for s in subs])
+        if include_attacks:
+            patterns.append(f"{trig} .* attacks")
+    return create_text_mask(df, patterns)
+
+def create_numbered_phrase_mask(
+    df: pd.DataFrame,
+    verb: Union[str, List[str]],
+    noun: str = '',
+    numbers: List[str] | None = None,
+) -> pd.Series:
+    """Create a boolean mask for phrases like 'draw {num} card'.
+
+    Args:
+        df: DataFrame to search
+    verb: Action verb or list of verbs (e.g., 'draw' or ['gain', 'gains'])
+    noun: Optional object noun in singular form (e.g., 'card'); if empty, omitted
+        numbers: Optional list of number words/digits (defaults to tag_constants.NUM_TO_SEARCH)
+
+    Returns:
+        Boolean Series mask
+    """
+    if numbers is None:
+        numbers = tag_constants.NUM_TO_SEARCH
+    # Normalize verbs to list
+    verbs = [verb] if isinstance(verb, str) else verb
+    # Build patterns
+    if noun:
+        patterns = [fr"{v}\s+{num}\s+{noun}" for v in verbs for num in numbers]
+    else:
+        patterns = [fr"{v}\s+{num}" for v in verbs for num in numbers]
+    return create_text_mask(df, patterns)
+
 def create_damage_pattern(number: Union[int, str]) -> str:
    """Create a pattern for matching X damage effects.

--- a/code/tagging/tagger.py
+++ b/code/tagging/tagger.py
--- a/requirements.txt
+++ b/requirements.txt
@ -3,6 +3,10 @@ inquirer>=3.1.3
 typing_extensions>=4.5.0
 fuzzywuzzy>=0.18.0
 python-Levenshtein>=0.12.0
+tqdm>=4.66.0
+scrython>=1.10.0
+numpy>=1.24.0
+requests>=2.31.0

 # Development dependencies
 mypy>=1.3.0