mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2026-03-25 06:26:31 +01:00
feat: add theme editorial quality system with scoring, linting, and comprehensive documentation (#54)
This commit is contained in:
parent
de8087d940
commit
1ebc2fcb3c
12 changed files with 3169 additions and 157 deletions
201
code/scripts/backfill_editorial_fields.py
Normal file
201
code/scripts/backfill_editorial_fields.py
Normal file
|
|
@ -0,0 +1,201 @@
|
|||
"""Backfill M1 editorial tracking fields (description_source, popularity_pinned) to theme YAML files.
|
||||
|
||||
This script adds tracking metadata to existing theme YAMLs to support editorial workflows:
|
||||
- description_source: Classifies descriptions as 'rule', 'generic', or 'manual'
|
||||
- popularity_pinned: Boolean flag to prevent auto-population_bucket updates
|
||||
|
||||
Usage:
|
||||
python code/scripts/backfill_editorial_fields.py [--dry-run] [--verbose]
|
||||
|
||||
Options:
|
||||
--dry-run: Show changes without writing files
|
||||
--verbose: Print detailed progress
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import yaml
|
||||
|
||||
# Add project root to path
|
||||
ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
sys.path.insert(0, str(ROOT / 'code'))
|
||||
|
||||
from type_definitions_theme_catalog import ThemeYAMLFile
|
||||
from web.services.theme_editorial_service import ThemeEditorialService
|
||||
|
||||
|
||||
def load_yaml_raw(file_path: Path) -> Dict:
|
||||
"""Load YAML file preserving order and comments."""
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return yaml.safe_load(f)
|
||||
|
||||
|
||||
def write_yaml_preserving_order(file_path: Path, data: Dict) -> None:
|
||||
"""Write YAML file with consistent formatting."""
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
yaml.safe_dump(
|
||||
data,
|
||||
f,
|
||||
default_flow_style=False,
|
||||
allow_unicode=True,
|
||||
sort_keys=False, # Preserve insertion order
|
||||
indent=2,
|
||||
)
|
||||
|
||||
|
||||
def backfill_theme_yaml(
|
||||
file_path: Path,
|
||||
service: ThemeEditorialService,
|
||||
dry_run: bool = False,
|
||||
verbose: bool = False
|
||||
) -> Tuple[bool, List[str]]:
|
||||
"""Backfill M1 editorial fields to a single theme YAML.
|
||||
|
||||
Args:
|
||||
file_path: Path to theme YAML file
|
||||
service: ThemeEditorialService instance for inference
|
||||
dry_run: If True, don't write changes
|
||||
verbose: If True, print detailed messages
|
||||
|
||||
Returns:
|
||||
Tuple of (modified, changes) where:
|
||||
- modified: True if file was changed
|
||||
- changes: List of change descriptions
|
||||
"""
|
||||
try:
|
||||
# Load raw YAML
|
||||
raw_data = load_yaml_raw(file_path)
|
||||
|
||||
# Validate against ThemeYAMLFile model
|
||||
theme = ThemeYAMLFile(**raw_data)
|
||||
|
||||
changes = []
|
||||
modified = False
|
||||
|
||||
# Check description_source
|
||||
if not raw_data.get('description_source'):
|
||||
if theme.description:
|
||||
inferred = service.infer_description_source(theme.description)
|
||||
raw_data['description_source'] = inferred
|
||||
changes.append(f"Added description_source='{inferred}'")
|
||||
modified = True
|
||||
else:
|
||||
changes.append("Skipped description_source (no description)")
|
||||
|
||||
# Check popularity_pinned
|
||||
if 'popularity_pinned' not in raw_data:
|
||||
raw_data['popularity_pinned'] = False
|
||||
changes.append("Added popularity_pinned=False")
|
||||
modified = True
|
||||
|
||||
# Write back if modified and not dry-run
|
||||
if modified and not dry_run:
|
||||
write_yaml_preserving_order(file_path, raw_data)
|
||||
|
||||
if verbose and modified:
|
||||
print(f"{'[DRY-RUN] ' if dry_run else ''}Modified: {file_path.name}")
|
||||
for change in changes:
|
||||
print(f" - {change}")
|
||||
|
||||
return modified, changes
|
||||
|
||||
except Exception as e:
|
||||
if verbose:
|
||||
print(f"ERROR processing {file_path.name}: {e}", file=sys.stderr)
|
||||
return False, [f"Error: {e}"]
|
||||
|
||||
|
||||
def backfill_catalog(
|
||||
catalog_dir: Path,
|
||||
dry_run: bool = False,
|
||||
verbose: bool = False
|
||||
) -> Dict[str, int]:
|
||||
"""Backfill all theme YAML files in catalog directory.
|
||||
|
||||
Args:
|
||||
catalog_dir: Path to themes/catalog/ directory
|
||||
dry_run: If True, don't write changes
|
||||
verbose: If True, print detailed progress
|
||||
|
||||
Returns:
|
||||
Statistics dict with counts
|
||||
"""
|
||||
service = ThemeEditorialService()
|
||||
|
||||
yaml_files = sorted(catalog_dir.glob('*.yml'))
|
||||
|
||||
stats = {
|
||||
'total': len(yaml_files),
|
||||
'modified': 0,
|
||||
'unchanged': 0,
|
||||
'errors': 0,
|
||||
}
|
||||
|
||||
print(f"Processing {stats['total']} theme YAML files...")
|
||||
if dry_run:
|
||||
print("[DRY-RUN MODE] No files will be modified\n")
|
||||
|
||||
for yaml_path in yaml_files:
|
||||
modified, changes = backfill_theme_yaml(yaml_path, service, dry_run, verbose)
|
||||
|
||||
if changes and changes[0].startswith('Error:'):
|
||||
stats['errors'] += 1
|
||||
elif modified:
|
||||
stats['modified'] += 1
|
||||
else:
|
||||
stats['unchanged'] += 1
|
||||
|
||||
return stats
|
||||
|
||||
|
||||
def main() -> int:
|
||||
"""Main entry point."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Backfill M1 editorial tracking fields to theme YAML files"
|
||||
)
|
||||
parser.add_argument(
|
||||
'--dry-run',
|
||||
action='store_true',
|
||||
help="Show changes without writing files"
|
||||
)
|
||||
parser.add_argument(
|
||||
'--verbose', '-v',
|
||||
action='store_true',
|
||||
help="Print detailed progress"
|
||||
)
|
||||
parser.add_argument(
|
||||
'--catalog-dir',
|
||||
type=Path,
|
||||
default=ROOT / 'config' / 'themes' / 'catalog',
|
||||
help="Path to theme catalog directory (default: config/themes/catalog)"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.catalog_dir.exists():
|
||||
print(f"ERROR: Catalog directory not found: {args.catalog_dir}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Run backfill
|
||||
stats = backfill_catalog(args.catalog_dir, args.dry_run, args.verbose)
|
||||
|
||||
# Print summary
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Backfill {'Summary (DRY-RUN)' if args.dry_run else 'Complete'}:")
|
||||
print(f" Total files: {stats['total']}")
|
||||
print(f" Modified: {stats['modified']}")
|
||||
print(f" Unchanged: {stats['unchanged']}")
|
||||
print(f" Errors: {stats['errors']}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
if args.dry_run:
|
||||
print("\nRe-run without --dry-run to apply changes.")
|
||||
|
||||
return 0 if stats['errors'] == 0 else 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
|
|
@ -34,6 +34,7 @@ if str(CODE_ROOT) not in sys.path:
|
|||
from type_definitions_theme_catalog import ThemeCatalog, ThemeYAMLFile
|
||||
from scripts.extract_themes import load_whitelist_config
|
||||
from scripts.build_theme_catalog import build_catalog
|
||||
from web.services.theme_editorial_service import ThemeEditorialService
|
||||
|
||||
CATALOG_JSON = ROOT / 'config' / 'themes' / 'theme_list.json'
|
||||
|
||||
|
|
@ -110,13 +111,35 @@ def validate_catalog(data: Dict, *, whitelist: Dict, allow_soft_exceed: bool = T
|
|||
return errors
|
||||
|
||||
|
||||
def validate_yaml_files(*, whitelist: Dict, strict_alias: bool = False) -> List[str]:
|
||||
def validate_yaml_files(
|
||||
*,
|
||||
whitelist: Dict,
|
||||
strict_alias: bool = False,
|
||||
check_editorial_quality: bool = False,
|
||||
lint_enabled: bool = False,
|
||||
lint_duplication_threshold: float = 0.5,
|
||||
lint_quality_threshold: float = 0.3
|
||||
) -> List[str]:
|
||||
"""Validate individual YAML catalog files.
|
||||
|
||||
strict_alias: if True, treat presence of a deprecated alias (normalization key)
|
||||
as a hard error instead of a soft ignored transitional state.
|
||||
check_editorial_quality: if True, check M1 editorial quality fields (description_source, etc.).
|
||||
lint_enabled: if True, run M4 linter checks (duplication, quality scoring).
|
||||
lint_duplication_threshold: flag themes with duplication ratio above this (default 0.5).
|
||||
lint_quality_threshold: flag themes with quality score below this (default 0.3).
|
||||
"""
|
||||
errors: List[str] = []
|
||||
|
||||
# M4: Initialize editorial service for lint checks
|
||||
editorial_service = None
|
||||
global_card_freq = None
|
||||
if lint_enabled:
|
||||
try:
|
||||
editorial_service = ThemeEditorialService()
|
||||
global_card_freq = editorial_service.calculate_global_card_frequency()
|
||||
except Exception as e:
|
||||
errors.append(f"[LINT] Failed to initialize editorial service: {e}")
|
||||
catalog_dir = ROOT / 'config' / 'themes' / 'catalog'
|
||||
if not catalog_dir.exists():
|
||||
return errors
|
||||
|
|
@ -142,6 +165,72 @@ def validate_yaml_files(*, whitelist: Dict, strict_alias: bool = False) -> List[
|
|||
if obj.id in seen_ids:
|
||||
errors.append(f"Duplicate YAML id: {obj.id}")
|
||||
seen_ids.add(obj.id)
|
||||
|
||||
# M1 Editorial Field Validation (opt-in)
|
||||
if check_editorial_quality:
|
||||
if obj.description and not obj.description_source:
|
||||
errors.append(f"Missing description_source in {path.name} (has description but no source metadata)")
|
||||
if obj.description_source == 'generic':
|
||||
# Soft warning: generic descriptions should be upgraded
|
||||
errors.append(f"[QUALITY] {path.name} has generic description_source - consider upgrading to rule-based or manual")
|
||||
if obj.popularity_pinned and not obj.popularity_bucket:
|
||||
errors.append(f"Invalid configuration in {path.name}: popularity_pinned=True but popularity_bucket is missing")
|
||||
|
||||
# M4 Linter Checks (opt-in)
|
||||
if lint_enabled and editorial_service and global_card_freq is not None:
|
||||
# Only lint themes with example cards
|
||||
if obj.example_cards and len(obj.example_cards) > 0:
|
||||
# Check 1: High Duplication Ratio
|
||||
try:
|
||||
dup_ratio = editorial_service.calculate_duplication_ratio(
|
||||
example_cards=obj.example_cards,
|
||||
global_card_freq=global_card_freq,
|
||||
duplication_threshold=0.4 # Cards in >40% of themes
|
||||
)
|
||||
if dup_ratio > lint_duplication_threshold:
|
||||
# Calculate total themes for identifying generic cards
|
||||
index = editorial_service.load_index()
|
||||
total_themes = len(index.slug_to_entry)
|
||||
generic_cards = [
|
||||
card for card in obj.example_cards
|
||||
if global_card_freq.get(card, 0) / max(1, total_themes) > 0.4
|
||||
]
|
||||
errors.append(
|
||||
f"[LINT-WARNING] {path.name} has high duplication ratio ({dup_ratio:.2f} > {lint_duplication_threshold}). "
|
||||
f"Generic cards: {', '.join(generic_cards[:5])}{' ...' if len(generic_cards) > 5 else ''}"
|
||||
)
|
||||
except Exception as e:
|
||||
errors.append(f"[LINT] Failed to check duplication for {path.name}: {e}")
|
||||
|
||||
# Check 2: Low Quality Score
|
||||
try:
|
||||
# Create a minimal ThemeEntry for quality scoring
|
||||
from type_definitions_theme_catalog import ThemeEntry
|
||||
theme_entry = ThemeEntry(
|
||||
theme=obj.display_name,
|
||||
example_cards=obj.example_cards,
|
||||
description_source=obj.description_source
|
||||
)
|
||||
tier, score = editorial_service.calculate_enhanced_quality_score(
|
||||
theme_entry=theme_entry,
|
||||
global_card_freq=global_card_freq
|
||||
)
|
||||
if score < lint_quality_threshold:
|
||||
suggestions = []
|
||||
if len(obj.example_cards) < 5:
|
||||
suggestions.append("Add more example cards (target: 8+)")
|
||||
if obj.description_source == 'generic':
|
||||
suggestions.append("Upgrade to manual or rule-based description")
|
||||
if dup_ratio > 0.4:
|
||||
suggestions.append("Replace generic staples with unique cards")
|
||||
|
||||
errors.append(
|
||||
f"[LINT-WARNING] {path.name} has low quality score ({score:.2f} < {lint_quality_threshold}, tier={tier}). "
|
||||
f"Suggestions: {'; '.join(suggestions) if suggestions else 'Review theme curation'}"
|
||||
)
|
||||
except Exception as e:
|
||||
errors.append(f"[LINT] Failed to check quality for {path.name}: {e}")
|
||||
|
||||
# Normalization alias check: display_name should already be normalized if in map
|
||||
if normalization_map and obj.display_name in normalization_map.keys():
|
||||
if strict_alias:
|
||||
|
|
@ -164,6 +253,10 @@ def main(): # pragma: no cover
|
|||
parser.add_argument('--fail-soft-exceed', action='store_true', help='Treat synergy list length > cap as error even for soft exceed')
|
||||
parser.add_argument('--yaml-schema', action='store_true', help='Print JSON Schema for per-file ThemeYAML and exit')
|
||||
parser.add_argument('--strict-alias', action='store_true', help='Fail if any YAML uses an alias name slated for normalization')
|
||||
parser.add_argument('--check-quality', action='store_true', help='Enable M1 editorial quality checks (description_source, popularity_pinned)')
|
||||
parser.add_argument('--lint', action='store_true', help='Enable M4 linter checks (duplication, quality scoring)')
|
||||
parser.add_argument('--lint-duplication-threshold', type=float, default=0.5, help='Duplication ratio threshold for linter warnings (default: 0.5)')
|
||||
parser.add_argument('--lint-quality-threshold', type=float, default=0.3, help='Quality score threshold for linter warnings (default: 0.3)')
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.schema:
|
||||
|
|
@ -184,7 +277,14 @@ def main(): # pragma: no cover
|
|||
whitelist = load_whitelist_config()
|
||||
data = load_catalog_file()
|
||||
errors = validate_catalog(data, whitelist=whitelist, allow_soft_exceed=not args.fail_soft_exceed)
|
||||
errors.extend(validate_yaml_files(whitelist=whitelist, strict_alias=args.strict_alias))
|
||||
errors.extend(validate_yaml_files(
|
||||
whitelist=whitelist,
|
||||
strict_alias=args.strict_alias,
|
||||
check_editorial_quality=args.check_quality,
|
||||
lint_enabled=args.lint,
|
||||
lint_duplication_threshold=args.lint_duplication_threshold,
|
||||
lint_quality_threshold=args.lint_quality_threshold
|
||||
))
|
||||
|
||||
if args.rebuild_pass:
|
||||
rebuilt = build_catalog(limit=0, verbose=False)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue