mtg_python_deckbuilder/code/scripts/audit_protection_full_v2.py

203 lines
8.1 KiB
Python

"""
Full audit of Protection-tagged cards with kindred metadata support (M2 Phase 2).
Created: October 8, 2025
Purpose: Audit and validate Protection tag precision after implementing grant detection.
Can be re-run periodically to check tagging quality.
This script audits ALL Protection-tagged cards and categorizes them:
- Grant: Gives broad protection to other permanents YOU control
- Kindred: Gives protection to specific creature types (metadata tags)
- Mixed: Both broad and kindred/inherent
- Inherent: Only has protection itself
- ConditionalSelf: Only conditionally grants to itself
- Opponent: Grants to opponent's permanents
- Neither: False positive
Outputs:
- m2_audit_v2.json: Full analysis with summary
- m2_audit_v2_grant.csv: Cards for main Protection tag
- m2_audit_v2_kindred.csv: Cards for kindred metadata tags
- m2_audit_v2_mixed.csv: Cards with both broad and kindred grants
- m2_audit_v2_conditional.csv: Conditional self-grants (exclude)
- m2_audit_v2_inherent.csv: Inherent protection only (exclude)
- m2_audit_v2_opponent.csv: Opponent grants (exclude)
- m2_audit_v2_neither.csv: False positives (exclude)
- m2_audit_v2_all.csv: All cards combined
"""
import sys
from pathlib import Path
import pandas as pd
import json
# Add project root to path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
from code.tagging.protection_grant_detection import (
categorize_protection_card,
get_kindred_protection_tags,
is_granting_protection,
)
def load_all_cards() -> pd.DataFrame:
    """Load every per-color/identity card CSV and merge them into one frame.

    Reads each ``*_cards.csv`` file found in ``<project_root>/csv_files``,
    concatenates the results and keeps only the first row seen for each
    card ``name``.

    Files are processed in sorted path order so that, when a card appears in
    several files, the surviving row is the same on every run and platform.

    Returns:
        A DataFrame with one row per unique card name.

    Raises:
        ValueError: If no CSV file could be found or loaded.
    """
    csv_dir = project_root / 'csv_files'

    # NOTE(review): every path returned by this glob has a stem ending in
    # "_cards", so the stem filter below never excludes anything (the raw
    # cards.csv is already skipped by the glob pattern itself). Kept as-is;
    # confirm whether e.g. "testdata_cards" was the intended exclusion.
    #
    # sorted(): glob order is filesystem-dependent, and the de-duplication
    # below keeps the *first* occurrence, so an unsorted list would make the
    # result non-deterministic.
    csv_files = sorted(
        path
        for path in csv_dir.glob('*_cards.csv')
        if path.stem not in ('cards', 'testdata')
    )

    all_cards = []
    for csv_file in csv_files:
        try:
            all_cards.append(pd.read_csv(csv_file))
        except Exception as e:
            # Best-effort: an unreadable file is reported and skipped so the
            # remaining files can still be audited.
            print(f"Warning: Could not load {csv_file.name}: {e}")

    if not all_cards:
        # pd.concat([]) would raise a ValueError with an opaque message;
        # raise the same exception type with an actionable one instead.
        raise ValueError(f"No card CSV files could be loaded from {csv_dir}")

    combined = pd.concat(all_cards, ignore_index=True)
    # Cards appear in multiple color files; keep the first occurrence only.
    return combined.drop_duplicates(subset=['name'], keep='first')
def _cell_as_text(row, column):
    """Return ``row[column]`` as a string, mapping missing/NaN cells to ''."""
    value = row.get(column, '')
    # pandas loads empty CSV cells as NaN; str(NaN) would yield the literal
    # text 'nan', which must not be fed to the detectors as card text.
    if pd.isna(value):
        return ''
    return str(value)


def _percent(part, total):
    """Return ``part`` as a percentage of ``total`` (0.0 when total is 0)."""
    return (part / total) * 100 if total else 0.0


def audit_all_protection_cards():
    """Audit all Protection-tagged cards and export the categorized results.

    Loads every card via ``load_all_cards()``, keeps the rows whose
    ``themeTags`` value contains "Protection" (case-insensitive), and for each
    of them records:

    - ``category``: result of ``categorize_protection_card(...)``
    - ``grants_broad``: result of ``is_granting_protection(...)``
    - ``kindred_tags``: sorted, comma-joined ``get_kindred_protection_tags``

    A summary is printed to stdout, and a JSON report plus one CSV per
    category (and one combined CSV) are written to
    ``<project_root>/logs/roadmaps/source/tagging_refinement``.

    Returns:
        A ``(df_prot, summary)`` tuple: the annotated DataFrame of
        Protection-tagged cards and the summary dict that is also stored in
        the JSON report.
    """
    print("Loading all cards...")
    df = load_all_cards()
    print(f"Total cards loaded: {len(df)}")

    # Filter to Protection-tagged cards (column is 'themeTags' in color CSVs)
    df_prot = df[df['themeTags'].str.contains('Protection', case=False, na=False)].copy()
    total = len(df_prot)
    print(f"Protection-tagged cards: {total}")

    # Categorize each card
    categories = []
    grants_list = []
    kindred_tags_list = []
    for _, row in df_prot.iterrows():
        name = row['name']
        # Convert escaped newlines stored in the CSV to real newlines
        text = _cell_as_text(row, 'text').replace('\\n', '\n')
        keywords = _cell_as_text(row, 'keywords')
        card_type = _cell_as_text(row, 'type')

        # Categorize with kindred exclusion enabled
        categories.append(
            categorize_protection_card(name, text, keywords, card_type, exclude_kindred=True)
        )
        # Check if it grants broadly
        grants_list.append(is_granting_protection(text, keywords, exclude_kindred=True))
        # Get kindred tags
        kindred_tags = get_kindred_protection_tags(text)
        kindred_tags_list.append(', '.join(sorted(kindred_tags)) if kindred_tags else '')

    df_prot['category'] = categories
    df_prot['grants_broad'] = grants_list
    df_prot['kindred_tags'] = kindred_tags_list

    # Generate summary (convert numpy types to native Python for JSON serialization)
    summary = {
        'total': int(total),
        'categories': {k: int(v) for k, v in df_prot['category'].value_counts().to_dict().items()},
        'grants_broad_count': int(df_prot['grants_broad'].sum()),
        'kindred_cards_count': int((df_prot['kindred_tags'] != '').sum()),
    }

    # Calculate keep vs remove: Grant/Mixed keep the main tag, Kindred moves
    # to metadata tags, everything else is removed.
    keep_categories = {'Grant', 'Mixed'}
    kindred_only = df_prot[df_prot['category'] == 'Kindred']
    keep_count = len(df_prot[df_prot['category'].isin(keep_categories)])
    remove_count = len(df_prot[~df_prot['category'].isin(keep_categories | {'Kindred'})])
    summary['keep_main_tag'] = keep_count
    summary['kindred_metadata'] = len(kindred_only)
    summary['remove'] = remove_count
    summary['precision_estimate'] = round((keep_count / total) * 100, 1) if total > 0 else 0

    # Print summary. All percentages go through _percent() so an audit that
    # finds no Protection-tagged cards reports 0.0% instead of crashing.
    print(f"\n{'='*60}")
    print("AUDIT SUMMARY")
    print(f"{'='*60}")
    print(f"Total Protection-tagged cards: {summary['total']}")
    print("\nCategories:")
    for cat, count in sorted(summary['categories'].items()):
        pct = _percent(count, summary['total'])
        print(f" {cat:20s} {count:4d} ({pct:5.1f}%)")
    print(f"\n{'='*60}")
    print(f"Main Protection tag: {keep_count:4d} ({_percent(keep_count, total):5.1f}%)")
    print(f"Kindred metadata only: {len(kindred_only):4d} ({_percent(len(kindred_only), total):5.1f}%)")
    print(f"Remove: {remove_count:4d} ({_percent(remove_count, total):5.1f}%)")
    print(f"{'='*60}")
    print(f"Precision estimate: {summary['precision_estimate']}%")
    print(f"{'='*60}\n")

    # Export results
    output_dir = project_root / 'logs' / 'roadmaps' / 'source' / 'tagging_refinement'
    output_dir.mkdir(parents=True, exist_ok=True)
    export_cols = ['name', 'type', 'category', 'grants_broad', 'kindred_tags', 'keywords', 'text']

    # Export JSON summary. Missing cells are written as null: json.dump would
    # otherwise emit bare NaN tokens, which strict JSON parsers reject.
    card_records = [
        {key: (None if pd.isna(value) else value) for key, value in record.items()}
        for record in df_prot[export_cols].to_dict(orient='records')
    ]
    with open(output_dir / 'm2_audit_v2.json', 'w', encoding='utf-8') as f:
        json.dump({'summary': summary, 'cards': card_records}, f, indent=2)

    # Export CSVs by category (category value -> output file name)
    category_exports = (
        ('Grant', 'm2_audit_v2_grant.csv'),
        ('Kindred', 'm2_audit_v2_kindred.csv'),
        ('Mixed', 'm2_audit_v2_mixed.csv'),
        ('ConditionalSelf', 'm2_audit_v2_conditional.csv'),
        ('Inherent', 'm2_audit_v2_inherent.csv'),
        ('Opponent', 'm2_audit_v2_opponent.csv'),
        ('Neither', 'm2_audit_v2_neither.csv'),
    )
    for category_name, filename in category_exports:
        subset = df_prot[df_prot['category'] == category_name]
        subset[export_cols].to_csv(output_dir / filename, index=False)
        print(f"Exported {len(subset)} {category_name} cards to {filename}")

    # All cards
    df_prot[export_cols].to_csv(output_dir / 'm2_audit_v2_all.csv', index=False)
    print(f"Exported {total} total cards to m2_audit_v2_all.csv")
    print(f"\nAll files saved to: {output_dir}")
    return df_prot, summary
if __name__ == "__main__":
    # Script entry point: run the full audit and keep the annotated frame
    # and summary available as module-level names (handy with `python -i`).
    df_results, summary = audit_all_protection_cards()