# mtg_python_deckbuilder/code/deck_builder/include_exclude_utils.py
"""
Utilities for include/exclude card functionality.
Provides fuzzy matching, card name normalization, and validation
for must-include and must-exclude card lists.
"""
from __future__ import annotations

import difflib
import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Set, Tuple

from .builder_constants import POPULAR_CARDS, ICONIC_CARDS
# Fuzzy matching configuration
FUZZY_CONFIDENCE_THRESHOLD = 0.95  # 95% confidence for auto-acceptance (more conservative)
MAX_SUGGESTIONS = 3  # Maximum suggestions to show for fuzzy matches
MAX_INCLUDES = 10  # Maximum include cards allowed per deck build
MAX_EXCLUDES = 15  # Maximum exclude cards allowed per deck build
@dataclass
class FuzzyMatchResult:
    """Result of a fuzzy card name match.

    Note: the original code applied ``@dataclass`` twice; the redundant
    decorator (which reprocessed the already-generated class) was removed.
    """
    input_name: str  # the raw user input that was matched
    matched_name: Optional[str]  # accepted match, or None when not auto-accepted
    confidence: float  # match confidence in [0.0, 1.0]
    suggestions: List[str]  # candidate names to offer the user (best first)
    auto_accepted: bool  # True when confidence cleared the auto-accept threshold
@dataclass
class IncludeExcludeDiagnostics:
    """Diagnostics for include/exclude processing.

    Collects everything the builder needs to report back to the user about
    how their include/exclude lists were interpreted.
    """
    missing_includes: List[str]  # includes that could not be found/added
    ignored_color_identity: List[str]  # includes skipped for color-identity reasons
    illegal_dropped: List[str]  # illegal cards removed from the lists
    illegal_allowed: List[str]  # illegal cards deliberately kept
    excluded_removed: List[str]  # cards removed from the deck due to excludes
    duplicates_collapsed: Dict[str, int]  # original name -> number of times it appeared
    include_added: List[str]  # includes successfully added to the deck
    include_over_ideal: Dict[str, List[str]]  # e.g., {"creatures": ["Card A"]} when includes exceed ideal category counts
    fuzzy_corrections: Dict[str, str]  # user input -> auto-corrected card name
    # Was Dict[str, any] — `any` is the builtin function, not a type; fixed to typing.Any.
    confirmation_needed: List[Dict[str, Any]]  # fuzzy matches awaiting user confirmation
    list_size_warnings: Dict[str, int]  # warning key -> offending count
def normalize_card_name(name: str) -> str:
    """
    Normalize a card name for robust matching.

    Handles:
    - Case normalization (casefold)
    - Unicode quote/dash normalization to ASCII equivalents
    - Arena/Alchemy "A-" prefix removal
    - Whitespace cleanup

    Args:
        name: Raw card name input

    Returns:
        Normalized card name for matching
    """
    if not name:
        return ""
    cleaned = str(name).strip()
    # Map typographic quotes and dashes to their ASCII counterparts.
    unicode_replacements = (
        ('\u2019', "'"),  # curly apostrophe
        ('\u2018', "'"),  # opening single quote
        ('\u201C', '"'),  # opening double quote
        ('\u201D', '"'),  # closing double quote
        ('\u2013', "-"),  # en dash
        ('\u2014', "-"),  # em dash
    )
    for fancy, plain in unicode_replacements:
        cleaned = cleaned.replace(fancy, plain)
    # Strip the Arena/Alchemy "A-" prefix, but never reduce the name to nothing.
    if cleaned.startswith('A-') and len(cleaned) > 2:
        cleaned = cleaned[2:]
    # Collapse any run of whitespace to a single space.
    cleaned = " ".join(cleaned.split())
    return cleaned.casefold()
def normalize_punctuation(name: str) -> str:
    """
    Normalize punctuation for fuzzy matching.

    Collapses commas and colons to spaces so that user input that omits
    punctuation still matches, e.g. "Krenko, Mob Boss" vs "Krenko Mob Boss".

    Args:
        name: Card name to normalize

    Returns:
        Name with punctuation variations normalized
    """
    if not name:
        return ""
    # Start from the fully normalized (casefolded, cleaned) form.
    folded = normalize_card_name(name)
    # Treat commas/colons as word separators, then squeeze repeated spaces.
    separated = re.sub(r'[,:]', ' ', folded)
    return re.sub(r'\s+', ' ', separated).strip()
def fuzzy_match_card_name(
    input_name: str,
    card_names: Set[str],
    confidence_threshold: float = FUZZY_CONFIDENCE_THRESHOLD
) -> FuzzyMatchResult:
    """
    Perform fuzzy matching on a card name against a set of valid names.

    Strategy:
    1. Punctuation-insensitive exact match -> confidence 1.0, auto-accepted.
    2. Otherwise every candidate gets a difflib.SequenceMatcher ratio, plus
       heuristic boosts (prefix, abbreviation, popularity, iconic cards);
       the total boost over the raw ratio is then capped at 0.06.
    3. If no candidate's RAW ratio reaches 0.35, report no match at all.

    Args:
        input_name: User input card name
        card_names: Set of valid card names to match against
        confidence_threshold: Minimum confidence for auto-acceptance

    Returns:
        FuzzyMatchResult with match information
    """
    # Degenerate inputs: nothing to match against.
    if not input_name or not card_names:
        return FuzzyMatchResult(
            input_name=input_name,
            matched_name=None,
            confidence=0.0,
            suggestions=[],
            auto_accepted=False
        )
    # Normalize input for matching
    normalized_input = normalize_punctuation(input_name)
    # Create normalized lookup for card names; the first original spelling
    # wins when two distinct names normalize to the same key.
    normalized_to_original = {}
    for name in card_names:
        normalized = normalize_punctuation(name)
        if normalized not in normalized_to_original:
            normalized_to_original[normalized] = name
    normalized_names = set(normalized_to_original.keys())
    # Exact match check (after normalization)
    if normalized_input in normalized_names:
        return FuzzyMatchResult(
            input_name=input_name,
            matched_name=normalized_to_original[normalized_input],
            confidence=1.0,
            suggestions=[],
            auto_accepted=True
        )
    # Enhanced fuzzy matching with intelligent prefix prioritization.
    # NOTE(review): normalized_input is already casefolded, so .lower() is
    # effectively a no-op kept for symmetry with the candidate names.
    input_lower = normalized_input.lower()
    # Convert constants to lowercase for matching
    popular_cards_lower = {card.lower() for card in POPULAR_CARDS}
    iconic_cards_lower = {card.lower() for card in ICONIC_CARDS}
    # Collect candidates with different scoring strategies
    candidates = []
    best_raw_similarity = 0.0
    for name in normalized_names:
        name_lower = name.lower()
        base_score = difflib.SequenceMatcher(None, input_lower, name_lower).ratio()
        # Skip very low similarity matches early
        if base_score < 0.3:
            continue
        final_score = base_score
        # Track best raw similarity to decide on true no-match vs. weak suggestions
        if base_score > best_raw_similarity:
            best_raw_similarity = base_score
        # Mutually exclusive prefix/abbreviation boosts (single elif chain):
        # strong boost for exact prefix matches (input is start of card name)
        if name_lower.startswith(input_lower):
            final_score = min(1.0, base_score + 0.5)
        # Moderate boost for word-level prefix matches
        elif any(word.startswith(input_lower) for word in name_lower.split()):
            final_score = min(1.0, base_score + 0.3)
        # Special case: if input could be abbreviation of first word, boost heavily
        elif len(input_lower) <= 6:
            first_word = name_lower.split()[0] if name_lower.split() else ""
            if first_word and first_word.startswith(input_lower):
                final_score = min(1.0, base_score + 0.4)
        # Boost for cards where input is contained as substring
        # NOTE(review): indentation reconstructed — this elif continues the
        # outer chain, so the substring boost only fires when no earlier
        # branch (including the len<=6 branch) was taken. Confirm intent.
        elif input_lower in name_lower:
            final_score = min(1.0, base_score + 0.2)
        # Special boost for very short inputs that are obvious abbreviations
        if len(input_lower) <= 4:
            # For short inputs, heavily favor cards that start with the input
            if name_lower.startswith(input_lower):
                final_score = min(1.0, final_score + 0.3)
        # Popularity boost for well-known cards
        if name_lower in popular_cards_lower:
            final_score = min(1.0, final_score + 0.25)
        # Extra boost for super iconic cards like Lightning Bolt (only when relevant)
        if name_lower in iconic_cards_lower:
            # Only boost if there's some relevance to the input
            if any(word[:3] in input_lower or input_lower[:3] in word for word in name_lower.split()):
                final_score = min(1.0, final_score + 0.3)
        # Extra boost for Lightning Bolt when input is 'lightning' or similar
        if name_lower == 'lightning bolt' and input_lower in ['lightning', 'lightn', 'light']:
            final_score = min(1.0, final_score + 0.2)
        # Special handling for Lightning Bolt variants
        if 'lightning' in name_lower and 'bolt' in name_lower:
            if input_lower in ['bolt', 'lightn', 'lightning']:
                final_score = min(1.0, final_score + 0.4)
        # Simplicity boost: prefer shorter, simpler card names for short inputs
        if len(input_lower) <= 6:
            # Boost shorter card names slightly
            if len(name_lower) <= len(input_lower) * 2:
                final_score = min(1.0, final_score + 0.05)
        # Cap total boost to avoid over-accepting near-misses; allow only small
        # boost. NOTE(review): this cap makes the large boosts above act mainly
        # as tie-breakers, since at most 0.06 survives over base_score.
        if final_score > base_score:
            max_total_boost = 0.06
            final_score = min(1.0, base_score + min(final_score - base_score, max_total_boost))
        candidates.append((final_score, name))
    # No candidate even cleared the 0.3 pre-filter.
    if not candidates:
        return FuzzyMatchResult(
            input_name=input_name,
            matched_name=None,
            confidence=0.0,
            suggestions=[],
            auto_accepted=False
        )
    # Sort candidates by score (highest first)
    candidates.sort(key=lambda x: x[0], reverse=True)
    # Get best match and confidence
    best_score, best_match = candidates[0]
    confidence = best_score
    # If raw similarity never cleared a minimal bar, treat as no reasonable match
    # even if boosted scores exist; return confidence 0.0 and no suggestions.
    if best_raw_similarity < 0.35:
        return FuzzyMatchResult(
            input_name=input_name,
            matched_name=None,
            confidence=0.0,
            suggestions=[],
            auto_accepted=False
        )
    # Convert back to original names, preserving score-based order
    suggestions = [normalized_to_original[match] for _, match in candidates[:MAX_SUGGESTIONS]]
    best_original = normalized_to_original[best_match]
    # Auto-accept if confidence is high enough
    auto_accepted = confidence >= confidence_threshold
    matched_name = best_original if auto_accepted else None
    return FuzzyMatchResult(
        input_name=input_name,
        matched_name=matched_name,
        confidence=confidence,
        suggestions=suggestions,
        auto_accepted=auto_accepted
    )
def validate_list_sizes(includes: List[str], excludes: List[str]) -> Dict[str, Any]:
    """
    Validate that include/exclude lists are within acceptable size limits.

    Fix: return annotation used the builtin ``any`` instead of ``typing.Any``.

    Args:
        includes: List of include card names
        excludes: List of exclude card names

    Returns:
        Dictionary with keys:
        - 'valid': True when no hard limit is exceeded
        - 'errors': list of hard-limit violation messages
        - 'warnings': dict of soft warnings (triggered at 80% of a limit)
        - 'counts': current counts plus the configured limits
    """
    include_count = len(includes)
    exclude_count = len(excludes)
    warnings: Dict[str, str] = {}
    errors: List[str] = []
    # Hard limit first; otherwise warn once 80% of the limit is reached.
    if include_count > MAX_INCLUDES:
        errors.append(f"Too many include cards: {include_count} (max {MAX_INCLUDES})")
    elif include_count >= int(MAX_INCLUDES * 0.8):  # 80% warning threshold
        warnings['includes_approaching_limit'] = f"Approaching include limit: {include_count}/{MAX_INCLUDES}"
    if exclude_count > MAX_EXCLUDES:
        errors.append(f"Too many exclude cards: {exclude_count} (max {MAX_EXCLUDES})")
    elif exclude_count >= int(MAX_EXCLUDES * 0.8):  # 80% warning threshold
        warnings['excludes_approaching_limit'] = f"Approaching exclude limit: {exclude_count}/{MAX_EXCLUDES}"
    return {
        'valid': len(errors) == 0,
        'errors': errors,
        'warnings': warnings,
        'counts': {
            'includes': include_count,
            'excludes': exclude_count,
            'includes_limit': MAX_INCLUDES,
            'excludes_limit': MAX_EXCLUDES
        }
    }
def collapse_duplicates(card_names: List[str]) -> Tuple[List[str], Dict[str, int]]:
    """
    Remove duplicates from a card list and track collapsed counts.

    Commander decks are singleton (with rare exceptions), so repeated
    entries in user input are collapsed to a single copy. Order of first
    appearance is preserved.

    Args:
        card_names: List of card names (may contain duplicates)

    Returns:
        Tuple of (unique_names, duplicate_counts) where duplicate_counts
        maps the first-seen original spelling to its occurrence count,
        only for names that appeared more than once.
    """
    if not card_names:
        return [], {}
    first_spelling: Dict[str, str] = {}  # normalized key -> first original spelling
    occurrences: Dict[str, int] = {}     # normalized key -> times seen
    ordered_unique: List[str] = []
    for raw in card_names:
        # Skip blank/whitespace-only entries entirely.
        if not raw or not raw.strip():
            continue
        trimmed = raw.strip()
        key = normalize_card_name(trimmed)
        if key in occurrences:
            occurrences[key] += 1
        else:
            occurrences[key] = 1
            first_spelling[key] = trimmed
            ordered_unique.append(trimmed)
    duplicates = {
        first_spelling[key]: count
        for key, count in occurrences.items()
        if count > 1
    }
    return ordered_unique, duplicates
def parse_card_list_input(input_text: str) -> List[str]:
    """
    Parse user input text into a list of card names.

    Supports:
    - Newline separated (preferred for cards with commas in names)
    - Comma separated only for simple single-line lists
    - Whitespace cleanup

    Note: Always prioritizes newlines over commas to avoid splitting card
    names that contain commas like "Byrke, Long Ear of the Law".

    Fixes: removed a redundant function-local ``import re`` (the module
    already imports ``re``) and a redundant second ``.strip()`` of the
    already-stripped text.

    Args:
        input_text: Raw user input text

    Returns:
        List of parsed card names (empty entries removed)
    """
    if not input_text:
        return []
    # Always split on newlines first - this is the preferred format
    # and prevents breaking card names with commas.
    lines = input_text.split('\n')
    # A single line containing commas is ambiguous: it could be one card
    # name like "Krenko, Mob Boss" or a comma-separated list.
    if len(lines) == 1 and ',' in lines[0]:
        text = lines[0].strip()
        # Heuristic: exactly one comma followed by whitespace and a
        # capitalized word ("Name, Title" pattern) is a single card name.
        if re.search(r'^[^,]{2,30},\s+[A-Z][^,]{2,30}$', text):
            names = [text]
        else:
            # Looks like "Card1,Card2" or "Card1, card2" - multiple cards.
            names = text.split(',')
    else:
        names = lines  # Use newline split
    # Trim each entry and drop blanks.
    return [name.strip() for name in names if name.strip()]
def get_baseline_performance_metrics() -> Dict[str, Any]:
    """
    Get baseline performance metrics for regression testing.

    Fixes: return annotation used builtin ``any`` instead of ``typing.Any``;
    the elapsed time is now measured with the monotonic, high-resolution
    ``time.perf_counter()`` instead of wall-clock ``time.time()``.

    Returns:
        Dictionary with:
        - 'normalization_time_ms': elapsed time for the sample workload
        - 'operations_count': number of normalization calls performed
        - 'timestamp': wall-clock completion time (seconds since epoch)
    """
    import time
    start = time.perf_counter()
    # Exercise both normalizers over a small repeated sample workload.
    test_names = ['Lightning Bolt', 'Krenko, Mob Boss', 'Sol Ring'] * 100
    for name in test_names:
        normalize_card_name(name)
        normalize_punctuation(name)
    elapsed = time.perf_counter() - start
    return {
        'normalization_time_ms': elapsed * 1000,
        'operations_count': len(test_names) * 2,  # 2 operations per name
        'timestamp': time.time()
    }