mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2025-09-22 04:50:46 +02:00
454 lines
15 KiB
Python
454 lines
15 KiB
Python
"""
|
|
Utilities for include/exclude card functionality.
|
|
|
|
Provides fuzzy matching, card name normalization, and validation
|
|
for must-include and must-exclude card lists.
|
|
"""
|
|
|
|
from __future__ import annotations

import difflib
import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Set, Tuple

from .builder_constants import POPULAR_CARDS, ICONIC_CARDS
|
|
|
|
|
|
# Fuzzy matching configuration
# Default for fuzzy_match_card_name's ``confidence_threshold``: a best score at
# or above this value is auto-accepted as the match.
FUZZY_CONFIDENCE_THRESHOLD = 0.95  # 95% confidence for auto-acceptance (more conservative)
# Upper bound on how many candidate names fuzzy_match_card_name returns in
# ``FuzzyMatchResult.suggestions``.
MAX_SUGGESTIONS = 3  # Maximum suggestions to show for fuzzy matches
# Size limits enforced by validate_list_sizes; exceeding one is reported as an
# error, reaching 80% of one is reported as a warning.
MAX_INCLUDES = 10  # Maximum include cards allowed
MAX_EXCLUDES = 15  # Maximum exclude cards allowed
|
|
|
|
|
|
@dataclass
class FuzzyMatchResult:
    """Result of a fuzzy card name match.

    Attributes:
        input_name: The name exactly as supplied by the caller.
        matched_name: The accepted card name (original spelling from the
            candidate set), or ``None`` when nothing was auto-accepted.
        confidence: Score of the best candidate in ``[0.0, 1.0]``; ``1.0`` for
            an exact match after normalization, ``0.0`` when there is no
            reasonable match.
        suggestions: Best candidates in descending score order (original
            spellings); empty for exact matches and for no-match results.
        auto_accepted: ``True`` when ``confidence`` met the acceptance
            threshold and ``matched_name`` is therefore set.
    """

    input_name: str
    matched_name: Optional[str]
    confidence: float
    suggestions: List[str]
    auto_accepted: bool
|
|
|
|
|
|
@dataclass
class IncludeExcludeDiagnostics:
    """Diagnostics for include/exclude processing.

    A plain record of lists and mappings describing what happened to the
    user's include/exclude lists.

    NOTE(review): nothing in this module populates these fields; their exact
    meaning is defined by the code that builds this object. The names suggest
    the obvious reading (e.g. ``duplicates_collapsed`` has the same shape as
    the mapping returned by ``collapse_duplicates``) -- confirm against the
    producer before relying on it.
    """

    missing_includes: List[str]
    ignored_color_identity: List[str]
    illegal_dropped: List[str]
    illegal_allowed: List[str]
    excluded_removed: List[str]
    duplicates_collapsed: Dict[str, int]
    include_added: List[str]
    include_over_ideal: Dict[str, List[str]]  # e.g., {"creatures": ["Card A"]} when includes exceed ideal category counts
    fuzzy_corrections: Dict[str, str]
    # ``Any`` (the typing construct), not the builtin function ``any``.
    confirmation_needed: List[Dict[str, Any]]
    list_size_warnings: Dict[str, int]
|
|
|
|
|
|
def normalize_card_name(name: str) -> str:
    """
    Produce the canonical form of a card name used for comparisons.

    Steps, in order:
    - Trim surrounding whitespace
    - Replace curly quotes and en/em dashes with their ASCII equivalents
    - Drop a leading Arena/Alchemy ``A-`` prefix
    - Collapse internal whitespace runs to single spaces
    - Casefold

    Args:
        name: Raw card name input

    Returns:
        Normalized card name for matching ("" for empty/None input)
    """
    if not name:
        return ""

    # Typographic characters and the plain ASCII character each one maps to.
    ascii_equivalents = (
        ('\u2019', "'"),  # Curly apostrophe
        ('\u2018', "'"),  # Opening single quote
        ('\u201C', '"'),  # Opening double quote
        ('\u201D', '"'),  # Closing double quote
        ('\u2013', "-"),  # En dash
        ('\u2014', "-"),  # Em dash
    )

    text = str(name).strip()
    for fancy, plain in ascii_equivalents:
        text = text.replace(fancy, plain)

    # A bare "A-" is left alone: only strip the prefix when a name follows it.
    if len(text) > 2 and text.startswith('A-'):
        text = text[2:]

    return " ".join(text.split()).casefold()
|
|
|
|
|
|
def normalize_punctuation(name: str) -> str:
    """
    Build a punctuation-insensitive key for fuzzy matching.

    Users often leave out separators, so "Krenko, Mob Boss" and
    "Krenko Mob Boss" must compare equal. The name is first run through
    normalize_card_name, then commas and colons are turned into spaces and
    whitespace runs are collapsed.

    Args:
        name: Card name to normalize

    Returns:
        Name with punctuation variations normalized ("" for empty input)
    """
    if not name:
        return ""

    base = normalize_card_name(name)

    # Separators become spaces first so "a,b" does not fuse into "ab";
    # the second pass then squeezes any doubled-up whitespace.
    without_separators = re.sub(r'[,:]', ' ', base)
    collapsed = re.sub(r'\s+', ' ', without_separators)
    return collapsed.strip()
|
|
|
|
|
|
def _no_match_result(input_name: str) -> FuzzyMatchResult:
    """Build the result returned when no reasonable candidate exists."""
    return FuzzyMatchResult(
        input_name=input_name,
        matched_name=None,
        confidence=0.0,
        suggestions=[],
        auto_accepted=False
    )


def _boosted_similarity(
    input_lower: str,
    name_lower: str,
    base_score: float,
    popular_cards: Set[str],
    iconic_cards: Set[str],
) -> float:
    """Apply the heuristic boosts to one candidate's raw similarity score.

    Every individual boost is clamped to 1.0 and the total boost over
    ``base_score`` is finally capped, so boosts act mostly as tie-breakers.
    """
    words = name_lower.split()
    starts_with_input = name_lower.startswith(input_lower)
    final_score = base_score

    # Positional boosts, strongest first: whole-name prefix, then a prefix of
    # any word, then a plain substring. (A former "short input is a prefix of
    # the first word" branch sat between the last two; it could never fire,
    # because that condition implies the whole-name prefix case, and it
    # blocked the substring boost for inputs of six characters or fewer.)
    if starts_with_input:
        final_score = min(1.0, base_score + 0.5)
    elif any(word.startswith(input_lower) for word in words):
        final_score = min(1.0, base_score + 0.3)
    elif input_lower in name_lower:
        final_score = min(1.0, base_score + 0.2)

    # Very short inputs are treated as abbreviations: favor prefix matches.
    if len(input_lower) <= 4 and starts_with_input:
        final_score = min(1.0, final_score + 0.3)

    # Popularity boost for well-known cards
    if name_lower in popular_cards:
        final_score = min(1.0, final_score + 0.25)

    # Extra boost for iconic cards, but only when the input shares a
    # three-character fragment with one of the card's words.
    if name_lower in iconic_cards:
        if any(word[:3] in input_lower or input_lower[:3] in word for word in words):
            final_score = min(1.0, final_score + 0.3)
        # Extra boost for Lightning Bolt when input is 'lightning' or similar
        if name_lower == 'lightning bolt' and input_lower in ['lightning', 'lightn', 'light']:
            final_score = min(1.0, final_score + 0.2)

    # Special handling for Lightning Bolt variants
    if 'lightning' in name_lower and 'bolt' in name_lower:
        if input_lower in ['bolt', 'lightn', 'lightning']:
            final_score = min(1.0, final_score + 0.4)

    # Simplicity boost: for short inputs prefer shorter card names slightly
    if len(input_lower) <= 6 and len(name_lower) <= len(input_lower) * 2:
        final_score = min(1.0, final_score + 0.05)

    # Cap total boost to avoid over-accepting near-misses; allow only small boost
    if final_score > base_score:
        max_total_boost = 0.06
        final_score = min(1.0, base_score + min(final_score - base_score, max_total_boost))

    return final_score


def fuzzy_match_card_name(
    input_name: str,
    card_names: Set[str],
    confidence_threshold: float = FUZZY_CONFIDENCE_THRESHOLD
) -> FuzzyMatchResult:
    """
    Perform fuzzy matching on a card name against a set of valid names.

    The input and every candidate are compared in punctuation-normalized
    form. An exact normalized match is accepted with confidence 1.0.
    Otherwise each candidate is scored with ``difflib.SequenceMatcher`` and a
    small, capped heuristic boost (prefix/substring hits, popular and iconic
    cards). Candidates whose raw similarity is below 0.3 are ignored, and if
    no candidate reaches a raw similarity of 0.35 the result is a no-match.

    Args:
        input_name: User input card name
        card_names: Set of valid card names to match against
        confidence_threshold: Minimum confidence for auto-acceptance

    Returns:
        FuzzyMatchResult with match information. ``matched_name`` is only set
        when the best score reaches ``confidence_threshold``; ``suggestions``
        holds the top ``MAX_SUGGESTIONS`` candidates, best first, with ties
        broken alphabetically by normalized name.
    """
    if not input_name or not card_names:
        return _no_match_result(input_name)

    # Normalize input for matching
    normalized_input = normalize_punctuation(input_name)

    # Map each normalized form back to an original spelling. When several
    # originals normalize to the same key, the first one iterated is kept.
    normalized_to_original: Dict[str, str] = {}
    for name in card_names:
        normalized_to_original.setdefault(normalize_punctuation(name), name)

    # Exact match check (after normalization)
    if normalized_input in normalized_to_original:
        return FuzzyMatchResult(
            input_name=input_name,
            matched_name=normalized_to_original[normalized_input],
            confidence=1.0,
            suggestions=[],
            auto_accepted=True
        )

    input_lower = normalized_input.lower()

    # The curated lists must go through the same normalization as the
    # candidates; merely lower-casing them would leave punctuation in place
    # and names such as "Krenko, Mob Boss" could never be recognised.
    popular_cards_lower = {normalize_punctuation(card).lower() for card in POPULAR_CARDS}
    iconic_cards_lower = {normalize_punctuation(card).lower() for card in ICONIC_CARDS}

    candidates: List[Tuple[float, str]] = []
    # Best unboosted similarity seen; decides true no-match vs. weak suggestions.
    best_raw_similarity = 0.0

    for name in normalized_to_original:
        name_lower = name.lower()
        # Argument order is kept as (input, candidate): ratio() is documented
        # as possibly order-dependent.
        base_score = difflib.SequenceMatcher(None, input_lower, name_lower).ratio()

        # Skip very low similarity matches early
        if base_score < 0.3:
            continue

        best_raw_similarity = max(best_raw_similarity, base_score)
        final_score = _boosted_similarity(
            input_lower, name_lower, base_score, popular_cards_lower, iconic_cards_lower
        )
        candidates.append((final_score, name))

    # If raw similarity never cleared a minimal bar, treat as no reasonable
    # match even if boosted scores exist.
    if not candidates or best_raw_similarity < 0.35:
        return _no_match_result(input_name)

    # Highest score first; the name is a tie-breaker so that equal scores do
    # not surface in hash-dependent (run-to-run varying) order.
    candidates.sort(key=lambda candidate: (-candidate[0], candidate[1]))

    best_score, best_match = candidates[0]
    confidence = best_score

    # Convert back to original names, preserving score-based order
    suggestions = [normalized_to_original[match] for _, match in candidates[:MAX_SUGGESTIONS]]
    best_original = normalized_to_original[best_match]

    # Auto-accept if confidence is high enough
    auto_accepted = confidence >= confidence_threshold
    matched_name = best_original if auto_accepted else None

    return FuzzyMatchResult(
        input_name=input_name,
        matched_name=matched_name,
        confidence=confidence,
        suggestions=suggestions,
        auto_accepted=auto_accepted
    )
|
|
|
|
|
|
def validate_list_sizes(includes: List[str], excludes: List[str]) -> Dict[str, Any]:
    """
    Validate that include/exclude lists are within acceptable size limits.

    A list longer than its limit (``MAX_INCLUDES`` / ``MAX_EXCLUDES``) yields
    an error; a list that has reached 80% of its limit yields a warning.

    Args:
        includes: List of include card names (``None`` is treated as empty)
        excludes: List of exclude card names (``None`` is treated as empty)

    Returns:
        Dictionary with keys:
        - ``valid``: True when there are no errors
        - ``errors``: list of human-readable error messages
        - ``warnings``: mapping of warning key to message
        - ``counts``: the two list sizes and their limits
    """
    include_count = len(includes) if includes is not None else 0
    exclude_count = len(excludes) if excludes is not None else 0

    warnings: Dict[str, str] = {}
    errors: List[str] = []

    # Size limit checks
    if include_count > MAX_INCLUDES:
        errors.append(f"Too many include cards: {include_count} (max {MAX_INCLUDES})")
    elif include_count >= int(MAX_INCLUDES * 0.8):  # 80% warning threshold
        warnings['includes_approaching_limit'] = f"Approaching include limit: {include_count}/{MAX_INCLUDES}"

    if exclude_count > MAX_EXCLUDES:
        errors.append(f"Too many exclude cards: {exclude_count} (max {MAX_EXCLUDES})")
    elif exclude_count >= int(MAX_EXCLUDES * 0.8):  # 80% warning threshold
        warnings['excludes_approaching_limit'] = f"Approaching exclude limit: {exclude_count}/{MAX_EXCLUDES}"

    return {
        'valid': not errors,
        'errors': errors,
        'warnings': warnings,
        'counts': {
            'includes': include_count,
            'excludes': exclude_count,
            'includes_limit': MAX_INCLUDES,
            'excludes_limit': MAX_EXCLUDES
        }
    }
|
|
|
|
|
|
def collapse_duplicates(card_names: List[str]) -> Tuple[List[str], Dict[str, int]]:
    """
    Drop repeated card names and report how often each repeated name occurred.

    Commander format allows only one copy of each card (except for
    exceptions), so repeated entries in user input are reduced to one.
    Entries are compared via normalize_card_name; the first spelling seen
    is the one that is kept. Empty and whitespace-only entries are ignored.

    Args:
        card_names: List of card names (may contain duplicates)

    Returns:
        Tuple of (unique_names, duplicate_counts), where unique_names keeps
        first-seen order and duplicate_counts maps the kept spelling to its
        total number of occurrences, only for names seen more than once
    """
    if not card_names:
        return [], {}

    occurrences: Dict[str, int] = {}
    first_spelling: Dict[str, str] = {}

    for raw in card_names:
        if not raw:
            continue
        trimmed = raw.strip()
        if not trimmed:
            continue

        key = normalize_card_name(trimmed)
        if key in occurrences:
            occurrences[key] += 1
        else:
            occurrences[key] = 1
            first_spelling[key] = trimmed

    # Dicts preserve insertion order, so this is first-seen order.
    unique_names = list(first_spelling.values())
    duplicates = {
        first_spelling[key]: count
        for key, count in occurrences.items()
        if count > 1
    }

    return unique_names, duplicates
|
|
|
|
|
|
def parse_card_list_input(input_text: str) -> List[str]:
    """
    Split raw user text into individual card names.

    Rules:
    - Text containing a newline is always treated as one name per line, so
      names with commas (e.g. "Byrke, Long Ear of the Law") stay intact.
    - A single line without commas is one name.
    - A single line with commas is kept as one name when it looks like
      "Name, Title" (exactly one comma, whitespace after it, then a capital
      letter); otherwise it is split on commas.
    - Every resulting name is trimmed and empty entries are dropped.

    Args:
        input_text: Raw user input text

    Returns:
        List of parsed card names
    """
    if not input_text:
        return []

    rows = input_text.split('\n')

    if len(rows) != 1 or ',' not in rows[0]:
        # Newline-separated input (or a lone comma-free name): use as is.
        pieces = rows
    else:
        candidate = rows[0].strip()
        # "Name, Title" shape: one comma, 2-30 non-comma characters on each
        # side, and the part after the comma starts with a capital letter.
        looks_like_single_card = re.search(
            r'^[^,]{2,30},\s+[A-Z][^,]{2,30}$', candidate
        )
        if looks_like_single_card:
            pieces = [candidate]
        else:
            # "Card1,Card2" style list
            pieces = candidate.split(',')

    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]
|
|
|
|
|
|
def get_baseline_performance_metrics() -> Dict[str, Any]:
    """
    Get baseline performance metrics for regression testing.

    Times 300 calls each of normalize_card_name and normalize_punctuation
    over a fixed set of sample names.

    Returns:
        Dictionary with:
        - ``normalization_time_ms``: elapsed time of the run in milliseconds
        - ``operations_count``: number of normalization calls performed
        - ``timestamp``: wall-clock time (seconds since the epoch) at which
          the measurement finished
    """
    import time

    test_names = ['Lightning Bolt', 'Krenko, Mob Boss', 'Sol Ring'] * 100

    # perf_counter is monotonic and high-resolution; time.time() can jump
    # (clock adjustments) and is unsuitable for measuring short intervals.
    start = time.perf_counter()
    for name in test_names:
        normalize_card_name(name)
        normalize_punctuation(name)
    elapsed_seconds = time.perf_counter() - start

    return {
        'normalization_time_ms': elapsed_seconds * 1000,
        'operations_count': len(test_names) * 2,  # 2 operations per name
        'timestamp': time.time()
    }
|