""" Full audit of Protection-tagged cards with kindred metadata support (M2 Phase 2). Created: October 8, 2025 Purpose: Audit and validate Protection tag precision after implementing grant detection. Can be re-run periodically to check tagging quality. This script audits ALL Protection-tagged cards and categorizes them: - Grant: Gives broad protection to other permanents YOU control - Kindred: Gives protection to specific creature types (metadata tags) - Mixed: Both broad and kindred/inherent - Inherent: Only has protection itself - ConditionalSelf: Only conditionally grants to itself - Opponent: Grants to opponent's permanents - Neither: False positive Outputs: - m2_audit_v2.json: Full analysis with summary - m2_audit_v2_grant.csv: Cards for main Protection tag - m2_audit_v2_kindred.csv: Cards for kindred metadata tags - m2_audit_v2_mixed.csv: Cards with both broad and kindred grants - m2_audit_v2_conditional.csv: Conditional self-grants (exclude) - m2_audit_v2_inherent.csv: Inherent protection only (exclude) - m2_audit_v2_opponent.csv: Opponent grants (exclude) - m2_audit_v2_neither.csv: False positives (exclude) - m2_audit_v2_all.csv: All cards combined """ import sys from pathlib import Path import pandas as pd import json # Add project root to path project_root = Path(__file__).parent.parent.parent sys.path.insert(0, str(project_root)) from code.tagging.protection_grant_detection import ( categorize_protection_card, get_kindred_protection_tags, is_granting_protection, ) def load_all_cards(): """Load all cards from color/identity CSV files.""" csv_dir = project_root / 'csv_files' # Get all color/identity CSVs (not the raw cards.csv) csv_files = list(csv_dir.glob('*_cards.csv')) csv_files = [f for f in csv_files if f.stem not in ['cards', 'testdata']] all_cards = [] for csv_file in csv_files: try: df = pd.read_csv(csv_file) all_cards.append(df) except Exception as e: print(f"Warning: Could not load {csv_file.name}: {e}") # Combine all DataFrames combined = pd.concat(all_cards, ignore_index=True) # Drop duplicates (cards appear in multiple color files) combined = combined.drop_duplicates(subset=['name'], keep='first') return combined def audit_all_protection_cards(): """Audit all Protection-tagged cards.""" print("Loading all cards...") df = load_all_cards() print(f"Total cards loaded: {len(df)}") # Filter to Protection-tagged cards (column is 'themeTags' in color CSVs) df_prot = df[df['themeTags'].str.contains('Protection', case=False, na=False)].copy() print(f"Protection-tagged cards: {len(df_prot)}") # Categorize each card categories = [] grants_list = [] kindred_tags_list = [] for idx, row in df_prot.iterrows(): name = row['name'] text = str(row.get('text', '')).replace('\\n', '\n') # Convert escaped newlines to real newlines keywords = str(row.get('keywords', '')) card_type = str(row.get('type', '')) # Categorize with kindred exclusion enabled category = categorize_protection_card(name, text, keywords, card_type, exclude_kindred=True) # Check if it grants broadly grants_broad = is_granting_protection(text, keywords, exclude_kindred=True) # Get kindred tags kindred_tags = get_kindred_protection_tags(text) categories.append(category) grants_list.append(grants_broad) kindred_tags_list.append(', '.join(sorted(kindred_tags)) if kindred_tags else '') df_prot['category'] = categories df_prot['grants_broad'] = grants_list df_prot['kindred_tags'] = kindred_tags_list # Generate summary (convert numpy types to native Python for JSON serialization) summary = { 'total': int(len(df_prot)), 'categories': {k: int(v) for k, v in df_prot['category'].value_counts().to_dict().items()}, 'grants_broad_count': int(df_prot['grants_broad'].sum()), 'kindred_cards_count': int((df_prot['kindred_tags'] != '').sum()), } # Calculate keep vs remove keep_categories = {'Grant', 'Mixed'} kindred_only = df_prot[df_prot['category'] == 'Kindred'] keep_count = len(df_prot[df_prot['category'].isin(keep_categories)]) remove_count = len(df_prot[~df_prot['category'].isin(keep_categories | {'Kindred'})]) summary['keep_main_tag'] = keep_count summary['kindred_metadata'] = len(kindred_only) summary['remove'] = remove_count summary['precision_estimate'] = round((keep_count / len(df_prot)) * 100, 1) if len(df_prot) > 0 else 0 # Print summary print(f"\n{'='*60}") print("AUDIT SUMMARY") print(f"{'='*60}") print(f"Total Protection-tagged cards: {summary['total']}") print(f"\nCategories:") for cat, count in sorted(summary['categories'].items()): pct = (count / summary['total']) * 100 print(f" {cat:20s} {count:4d} ({pct:5.1f}%)") print(f"\n{'='*60}") print(f"Main Protection tag: {keep_count:4d} ({keep_count/len(df_prot)*100:5.1f}%)") print(f"Kindred metadata only: {len(kindred_only):4d} ({len(kindred_only)/len(df_prot)*100:5.1f}%)") print(f"Remove: {remove_count:4d} ({remove_count/len(df_prot)*100:5.1f}%)") print(f"{'='*60}") print(f"Precision estimate: {summary['precision_estimate']}%") print(f"{'='*60}\n") # Export results output_dir = project_root / 'logs' / 'roadmaps' / 'source' / 'tagging_refinement' output_dir.mkdir(parents=True, exist_ok=True) # Export JSON summary with open(output_dir / 'm2_audit_v2.json', 'w') as f: json.dump({ 'summary': summary, 'cards': df_prot[['name', 'type', 'category', 'grants_broad', 'kindred_tags', 'keywords', 'text']].to_dict(orient='records') }, f, indent=2) # Export CSVs by category export_cols = ['name', 'type', 'category', 'grants_broad', 'kindred_tags', 'keywords', 'text'] # Grant category df_grant = df_prot[df_prot['category'] == 'Grant'] df_grant[export_cols].to_csv(output_dir / 'm2_audit_v2_grant.csv', index=False) print(f"Exported {len(df_grant)} Grant cards to m2_audit_v2_grant.csv") # Kindred category df_kindred = df_prot[df_prot['category'] == 'Kindred'] df_kindred[export_cols].to_csv(output_dir / 'm2_audit_v2_kindred.csv', index=False) print(f"Exported {len(df_kindred)} Kindred cards to m2_audit_v2_kindred.csv") # Mixed category df_mixed = df_prot[df_prot['category'] == 'Mixed'] df_mixed[export_cols].to_csv(output_dir / 'm2_audit_v2_mixed.csv', index=False) print(f"Exported {len(df_mixed)} Mixed cards to m2_audit_v2_mixed.csv") # ConditionalSelf category df_conditional = df_prot[df_prot['category'] == 'ConditionalSelf'] df_conditional[export_cols].to_csv(output_dir / 'm2_audit_v2_conditional.csv', index=False) print(f"Exported {len(df_conditional)} ConditionalSelf cards to m2_audit_v2_conditional.csv") # Inherent category df_inherent = df_prot[df_prot['category'] == 'Inherent'] df_inherent[export_cols].to_csv(output_dir / 'm2_audit_v2_inherent.csv', index=False) print(f"Exported {len(df_inherent)} Inherent cards to m2_audit_v2_inherent.csv") # Opponent category df_opponent = df_prot[df_prot['category'] == 'Opponent'] df_opponent[export_cols].to_csv(output_dir / 'm2_audit_v2_opponent.csv', index=False) print(f"Exported {len(df_opponent)} Opponent cards to m2_audit_v2_opponent.csv") # Neither category df_neither = df_prot[df_prot['category'] == 'Neither'] df_neither[export_cols].to_csv(output_dir / 'm2_audit_v2_neither.csv', index=False) print(f"Exported {len(df_neither)} Neither cards to m2_audit_v2_neither.csv") # All cards df_prot[export_cols].to_csv(output_dir / 'm2_audit_v2_all.csv', index=False) print(f"Exported {len(df_prot)} total cards to m2_audit_v2_all.csv") print(f"\nAll files saved to: {output_dir}") return df_prot, summary if __name__ == '__main__': df_results, summary = audit_all_protection_cards()