""" Utilities for include/exclude card functionality. Provides fuzzy matching, card name normalization, and validation for must-include and must-exclude card lists. """ from __future__ import annotations import difflib import re from typing import List, Dict, Set, Tuple, Optional from dataclasses import dataclass from .builder_constants import POPULAR_CARDS, ICONIC_CARDS # Fuzzy matching configuration FUZZY_CONFIDENCE_THRESHOLD = 0.95 # 95% confidence for auto-acceptance (more conservative) MAX_SUGGESTIONS = 3 # Maximum suggestions to show for fuzzy matches MAX_INCLUDES = 10 # Maximum include cards allowed MAX_EXCLUDES = 15 # Maximum exclude cards allowed @dataclass @dataclass class FuzzyMatchResult: """Result of a fuzzy card name match.""" input_name: str matched_name: Optional[str] confidence: float suggestions: List[str] auto_accepted: bool @dataclass class IncludeExcludeDiagnostics: """Diagnostics for include/exclude processing.""" missing_includes: List[str] ignored_color_identity: List[str] illegal_dropped: List[str] illegal_allowed: List[str] excluded_removed: List[str] duplicates_collapsed: Dict[str, int] include_added: List[str] include_over_ideal: Dict[str, List[str]] # e.g., {"creatures": ["Card A"]} when includes exceed ideal category counts fuzzy_corrections: Dict[str, str] confirmation_needed: List[Dict[str, any]] list_size_warnings: Dict[str, int] def normalize_card_name(name: str) -> str: """ Normalize card names for robust matching. Handles: - Case normalization (casefold) - Punctuation normalization (commas, apostrophes) - Whitespace cleanup - Unicode apostrophe normalization - Arena/Alchemy prefix removal Args: name: Raw card name input Returns: Normalized card name for matching """ if not name: return "" # Basic cleanup s = str(name).strip() # Normalize unicode characters s = s.replace('\u2019', "'") # Curly apostrophe to straight s = s.replace('\u2018', "'") # Opening single quote s = s.replace('\u201C', '"') # Opening double quote s = s.replace('\u201D', '"') # Closing double quote s = s.replace('\u2013', "-") # En dash s = s.replace('\u2014', "-") # Em dash # Remove Arena/Alchemy prefix if s.startswith('A-') and len(s) > 2: s = s[2:] # Normalize whitespace s = " ".join(s.split()) # Case normalization return s.casefold() def normalize_punctuation(name: str) -> str: """ Normalize punctuation for fuzzy matching. Specifically handles the case where users might omit commas: "Krenko, Mob Boss" vs "Krenko Mob Boss" Args: name: Card name to normalize Returns: Name with punctuation variations normalized """ if not name: return "" # Remove common punctuation for comparison s = normalize_card_name(name) # Remove commas, colons, and extra spaces for fuzzy matching s = re.sub(r'[,:]', ' ', s) s = re.sub(r'\s+', ' ', s) return s.strip() def fuzzy_match_card_name( input_name: str, card_names: Set[str], confidence_threshold: float = FUZZY_CONFIDENCE_THRESHOLD ) -> FuzzyMatchResult: """ Perform fuzzy matching on a card name against a set of valid names. 
def fuzzy_match_card_name(
    input_name: str,
    card_names: Set[str],
    confidence_threshold: float = FUZZY_CONFIDENCE_THRESHOLD,
) -> FuzzyMatchResult:
    """Perform fuzzy matching on a card name against a set of valid names.

    Args:
        input_name: User input card name
        card_names: Set of valid card names to match against
        confidence_threshold: Minimum confidence for auto-acceptance

    Returns:
        FuzzyMatchResult with match information
    """
    if not input_name or not card_names:
        return FuzzyMatchResult(
            input_name=input_name,
            matched_name=None,
            confidence=0.0,
            suggestions=[],
            auto_accepted=False,
        )

    # Normalize input for matching
    normalized_input = normalize_punctuation(input_name)

    # Create normalized lookup for card names
    normalized_to_original = {}
    for name in card_names:
        normalized = normalize_punctuation(name)
        if normalized not in normalized_to_original:
            normalized_to_original[normalized] = name
    normalized_names = set(normalized_to_original.keys())

    # Exact match check (after normalization)
    if normalized_input in normalized_names:
        return FuzzyMatchResult(
            input_name=input_name,
            matched_name=normalized_to_original[normalized_input],
            confidence=1.0,
            suggestions=[],
            auto_accepted=True,
        )

    # Enhanced fuzzy matching with intelligent prefix prioritization
    input_lower = normalized_input.lower()

    # Convert constants to lowercase for matching
    popular_cards_lower = {card.lower() for card in POPULAR_CARDS}
    iconic_cards_lower = {card.lower() for card in ICONIC_CARDS}

    # Collect candidates with different scoring strategies
    candidates = []
    best_raw_similarity = 0.0

    for name in normalized_names:
        name_lower = name.lower()
        base_score = difflib.SequenceMatcher(None, input_lower, name_lower).ratio()

        # Skip very low similarity matches early
        if base_score < 0.3:
            continue

        final_score = base_score

        # Track best raw similarity to decide on true no-match vs. weak suggestions
        if base_score > best_raw_similarity:
            best_raw_similarity = base_score

        # Strong boost for exact prefix matches (input is start of card name)
        if name_lower.startswith(input_lower):
            final_score = min(1.0, base_score + 0.5)
        # Moderate boost for word-level prefix matches
        elif any(word.startswith(input_lower) for word in name_lower.split()):
            final_score = min(1.0, base_score + 0.3)
        # Special case: if input could be an abbreviation of the first word, boost heavily
        elif len(input_lower) <= 6:
            first_word = name_lower.split()[0] if name_lower.split() else ""
            if first_word and first_word.startswith(input_lower):
                final_score = min(1.0, base_score + 0.4)
        # Boost for cards where input is contained as a substring
        elif input_lower in name_lower:
            final_score = min(1.0, base_score + 0.2)

        # Special boost for very short inputs that are obvious abbreviations
        if len(input_lower) <= 4:
            # For short inputs, heavily favor cards that start with the input
            if name_lower.startswith(input_lower):
                final_score = min(1.0, final_score + 0.3)

        # Popularity boost for well-known cards
        if name_lower in popular_cards_lower:
            final_score = min(1.0, final_score + 0.25)

        # Extra boost for super iconic cards like Lightning Bolt (only when relevant)
        if name_lower in iconic_cards_lower:
            # Only boost if there's some relevance to the input
            if any(word[:3] in input_lower or input_lower[:3] in word
                   for word in name_lower.split()):
                final_score = min(1.0, final_score + 0.3)
            # Extra boost for Lightning Bolt when input is 'lightning' or similar
            if name_lower == 'lightning bolt' and input_lower in ['lightning', 'lightn', 'light']:
                final_score = min(1.0, final_score + 0.2)

        # Special handling for Lightning Bolt variants
        if 'lightning' in name_lower and 'bolt' in name_lower:
            if input_lower in ['bolt', 'lightn', 'lightning']:
                final_score = min(1.0, final_score + 0.4)

        # Simplicity boost: prefer shorter, simpler card names for short inputs
        if len(input_lower) <= 6:
            # Boost shorter card names slightly
            if len(name_lower) <= len(input_lower) * 2:
                final_score = min(1.0, final_score + 0.05)

        # Cap total boost to avoid over-accepting near-misses; allow only a small boost
        if final_score > base_score:
            max_total_boost = 0.06
            final_score = min(1.0, base_score + min(final_score - base_score, max_total_boost))

        candidates.append((final_score, name))

    if not candidates:
        return FuzzyMatchResult(
            input_name=input_name,
            matched_name=None,
            confidence=0.0,
            suggestions=[],
            auto_accepted=False,
        )

    # Sort candidates by score (highest first)
    candidates.sort(key=lambda x: x[0], reverse=True)

    # Get best match and confidence
    best_score, best_match = candidates[0]
    confidence = best_score

    # If raw similarity never cleared a minimal bar, treat as no reasonable match
    # even if boosted scores exist; return confidence 0.0 and no suggestions.
    if best_raw_similarity < 0.35:
        return FuzzyMatchResult(
            input_name=input_name,
            matched_name=None,
            confidence=0.0,
            suggestions=[],
            auto_accepted=False,
        )

    # Convert back to original names, preserving score-based order
    suggestions = [normalized_to_original[match] for _, match in candidates[:MAX_SUGGESTIONS]]
    best_original = normalized_to_original[best_match]

    # Auto-accept if confidence is high enough
    auto_accepted = confidence >= confidence_threshold
    matched_name = best_original if auto_accepted else None

    return FuzzyMatchResult(
        input_name=input_name,
        matched_name=matched_name,
        confidence=confidence,
        suggestions=suggestions,
        auto_accepted=auto_accepted,
    )
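
# Illustrative usage (a sketch; the card pool here is hypothetical, and the
# exact confidence depends on difflib's similarity ratio plus the capped
# boosts above):
#
#   pool = {"Lightning Bolt", "Lightning Strike", "Bolt Bend"}
#   result = fuzzy_match_card_name("lightnig bolt", pool)
#   # A close typo like this clears FUZZY_CONFIDENCE_THRESHOLD (0.95), so
#   # result.auto_accepted is True and result.matched_name is "Lightning Bolt";
#   # weaker inputs instead land in result.suggestions (up to MAX_SUGGESTIONS)
#   # for user confirmation.
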
def validate_list_sizes(includes: List[str], excludes: List[str]) -> Dict[str, Any]:
    """Validate that include/exclude lists are within acceptable size limits.

    Args:
        includes: List of include card names
        excludes: List of exclude card names

    Returns:
        Dictionary with validation results and warnings
    """
    include_count = len(includes)
    exclude_count = len(excludes)

    warnings = {}
    errors = []

    # Size limit checks
    if include_count > MAX_INCLUDES:
        errors.append(f"Too many include cards: {include_count} (max {MAX_INCLUDES})")
    elif include_count >= int(MAX_INCLUDES * 0.8):  # 80% warning threshold
        warnings['includes_approaching_limit'] = (
            f"Approaching include limit: {include_count}/{MAX_INCLUDES}"
        )

    if exclude_count > MAX_EXCLUDES:
        errors.append(f"Too many exclude cards: {exclude_count} (max {MAX_EXCLUDES})")
    elif exclude_count >= int(MAX_EXCLUDES * 0.8):  # 80% warning threshold
        warnings['excludes_approaching_limit'] = (
            f"Approaching exclude limit: {exclude_count}/{MAX_EXCLUDES}"
        )

    return {
        'valid': len(errors) == 0,
        'errors': errors,
        'warnings': warnings,
        'counts': {
            'includes': include_count,
            'excludes': exclude_count,
            'includes_limit': MAX_INCLUDES,
            'excludes_limit': MAX_EXCLUDES,
        },
    }


def collapse_duplicates(card_names: List[str]) -> Tuple[List[str], Dict[str, int]]:
    """Remove duplicates from a card list and track collapsed counts.

    Commander allows only one copy of each card (apart from printed
    exceptions), so duplicate entries in user input are collapsed to
    single copies.

    Args:
        card_names: List of card names (may contain duplicates)

    Returns:
        Tuple of (unique_names, duplicate_counts)
    """
    if not card_names:
        return [], {}

    seen = {}
    unique_names = []

    for name in card_names:
        if not name or not name.strip():
            continue

        name = name.strip()
        normalized = normalize_card_name(name)

        if normalized not in seen:
            seen[normalized] = {'original': name, 'count': 1}
            unique_names.append(name)
        else:
            seen[normalized]['count'] += 1

    # Extract duplicate counts (only for names that appeared more than once)
    duplicates = {
        data['original']: data['count']
        for data in seen.values()
        if data['count'] > 1
    }

    return unique_names, duplicates
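
# Illustrative usage (a sketch with made-up inputs; duplicates are keyed on
# the normalized name, so casing variants collapse together):
#
#   names, dupes = collapse_duplicates(["Sol Ring", "sol ring", "Arcane Signet"])
#   # names -> ["Sol Ring", "Arcane Signet"], dupes -> {"Sol Ring": 2}
#
#   report = validate_list_sizes(includes=names, excludes=[])
#   # report["valid"] -> True; warnings start at 80% of MAX_INCLUDES /
#   # MAX_EXCLUDES, and errors are reported past the hard caps.
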
def parse_card_list_input(input_text: str) -> List[str]:
    """Parse user input text into a list of card names.

    Supports:
    - Newline-separated input (preferred for cards with commas in their names)
    - Comma-separated input, but only for single-line lists
    - Whitespace cleanup

    Note: Newlines always take priority over commas to avoid splitting card
    names that contain commas, like "Byrke, Long Ear of the Law".

    Args:
        input_text: Raw user input text

    Returns:
        List of parsed card names
    """
    if not input_text:
        return []

    # Always split on newlines first - this is the preferred format
    # and prevents breaking card names with commas
    lines = input_text.split('\n')

    # If we only have one line and it contains commas, it might be
    # comma-separated input rather than a single card name with commas
    if len(lines) == 1 and ',' in lines[0]:
        text = lines[0].strip()

        # Heuristic: a single comma followed by whitespace and a capitalized
        # word suggests a "Name, Title" pattern, i.e. a single card name
        title_pattern = re.search(r'^[^,]{2,30},\s+[A-Z][^,]{2,30}$', text)

        if title_pattern:
            # Looks like "Byrke, Long Ear of the Law" - a single card
            names = [text]
        else:
            # Looks like "Card1,Card2" or "Card1, Card2" - multiple cards
            names = text.split(',')
    else:
        names = lines  # Use newline split

    # Clean up each name, skipping empty entries
    cleaned = []
    for name in names:
        name = name.strip()
        if name:
            cleaned.append(name)

    return cleaned


def get_baseline_performance_metrics() -> Dict[str, Any]:
    """Get baseline performance metrics for regression testing.

    Returns:
        Dictionary with timing baselines for the normalization helpers
    """
    start_time = time.time()

    # Exercise the normalization helpers to establish a timing baseline
    test_names = ['Lightning Bolt', 'Krenko, Mob Boss', 'Sol Ring'] * 100
    for name in test_names:
        normalize_card_name(name)
        normalize_punctuation(name)

    end_time = time.time()

    return {
        'normalization_time_ms': (end_time - start_time) * 1000,
        'operations_count': len(test_names) * 2,  # 2 operations per name
        'timestamp': time.time(),
    }
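
# Illustrative usage (a sketch; shows how the comma heuristic keeps titled
# single-card names whole while still splitting simple comma lists):
#
#   parse_card_list_input("Krenko, Mob Boss")         # -> ["Krenko, Mob Boss"]
#   parse_card_list_input("Sol Ring,Arcane Signet")   # -> ["Sol Ring", "Arcane Signet"]
#   parse_card_list_input("Sol Ring\nKrenko, Mob Boss")
#   # -> ["Sol Ring", "Krenko, Mob Boss"] (newlines always win)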