""" Card image caching system. Downloads and manages local cache of Magic: The Gathering card images from Scryfall, with graceful fallback to API when images are missing. Features: - Optional caching (disabled by default for open source users) - Uses Scryfall bulk data API (respects rate limits and guidelines) - Downloads from Scryfall CDN (no rate limits on image files) - Progress tracking for long downloads - Resume capability if interrupted - Graceful fallback to API if images missing Environment Variables: CACHE_CARD_IMAGES: 1=enable caching, 0=disable (default: 0) Image Sizes: - small: 160px width (for list views) - normal: 488px width (for prominent displays, hover previews) Directory Structure: card_files/images/small/ - Small thumbnails (~900 MB - 1.5 GB) card_files/images/normal/ - Normal images (~2.4 GB - 4.5 GB) See: https://scryfall.com/docs/api """ import json import logging import os import re import time from pathlib import Path from typing import Any, Optional from urllib.request import Request, urlopen from code.file_setup.scryfall_bulk_data import ScryfallBulkDataClient logger = logging.getLogger(__name__) # Scryfall CDN has no rate limits, but we'll be conservative DOWNLOAD_DELAY = 0.05 # 50ms between image downloads (20 req/sec) # Image sizes to cache IMAGE_SIZES = ["small", "normal"] # Card name sanitization (filesystem-safe) INVALID_CHARS = r'[<>:"/\\|?*]' def sanitize_filename(card_name: str) -> str: """ Sanitize card name for use as filename. Args: card_name: Original card name Returns: Filesystem-safe filename """ # Replace invalid characters with underscore safe_name = re.sub(INVALID_CHARS, "_", card_name) # Remove multiple consecutive underscores safe_name = re.sub(r"_+", "_", safe_name) # Trim leading/trailing underscores safe_name = safe_name.strip("_") return safe_name class ImageCache: """Manages local card image cache.""" def __init__( self, base_dir: str = "card_files/images", bulk_data_path: str = "card_files/raw/scryfall_bulk_data.json", ): """ Initialize image cache. Args: base_dir: Base directory for cached images bulk_data_path: Path to Scryfall bulk data JSON """ self.base_dir = Path(base_dir) self.bulk_data_path = Path(bulk_data_path) self.client = ScryfallBulkDataClient() self._last_download_time: float = 0.0 def is_enabled(self) -> bool: """Check if image caching is enabled via environment variable.""" return os.getenv("CACHE_CARD_IMAGES", "0") == "1" def get_image_path(self, card_name: str, size: str = "normal") -> Optional[Path]: """ Get local path to cached image if it exists. Args: card_name: Card name size: Image size ('small' or 'normal') Returns: Path to cached image, or None if not cached """ if not self.is_enabled(): return None safe_name = sanitize_filename(card_name) image_path = self.base_dir / size / f"{safe_name}.jpg" if image_path.exists(): return image_path return None def get_image_url(self, card_name: str, size: str = "normal") -> str: """ Get image URL (local path if cached, Scryfall API otherwise). Args: card_name: Card name size: Image size ('small' or 'normal') Returns: URL or local path to image """ # Check local cache first local_path = self.get_image_path(card_name, size) if local_path: # Return as static file path for web serving return f"/static/card_images/{size}/{sanitize_filename(card_name)}.jpg" # Fallback to Scryfall API from urllib.parse import quote card_query = quote(card_name) return f"https://api.scryfall.com/cards/named?fuzzy={card_query}&format=image&version={size}" def _rate_limit_wait(self) -> None: """Wait to respect rate limits between downloads.""" elapsed = time.time() - self._last_download_time if elapsed < DOWNLOAD_DELAY: time.sleep(DOWNLOAD_DELAY - elapsed) self._last_download_time = time.time() def _download_image(self, image_url: str, output_path: Path) -> bool: """ Download single image from Scryfall CDN. Args: image_url: Image URL from bulk data output_path: Local path to save image Returns: True if successful, False otherwise """ self._rate_limit_wait() try: # Ensure output directory exists output_path.parent.mkdir(parents=True, exist_ok=True) req = Request(image_url) req.add_header("User-Agent", "MTG-Deckbuilder/3.0 (Image Cache)") with urlopen(req, timeout=30) as response: image_data = response.read() with open(output_path, "wb") as f: f.write(image_data) return True except Exception as e: logger.debug(f"Failed to download {image_url}: {e}") # Clean up partial download if output_path.exists(): output_path.unlink() return False def _load_bulk_data(self) -> list[dict[str, Any]]: """ Load card data from bulk data JSON. Returns: List of card objects with image URLs Raises: FileNotFoundError: If bulk data file doesn't exist json.JSONDecodeError: If file is invalid JSON """ if not self.bulk_data_path.exists(): raise FileNotFoundError( f"Bulk data file not found: {self.bulk_data_path}. " "Run download_bulk_data() first." ) logger.info(f"Loading bulk data from {self.bulk_data_path}") with open(self.bulk_data_path, "r", encoding="utf-8") as f: return json.load(f) def _filter_to_our_cards(self, bulk_cards: list[dict[str, Any]]) -> list[dict[str, Any]]: """ Filter bulk data to only cards in our all_cards.parquet file. Deduplicates by card name (takes first printing only). Args: bulk_cards: Full Scryfall bulk data Returns: Filtered list of cards matching our dataset (one per unique name) """ try: import pandas as pd from code.path_util import get_processed_cards_path # Load our card names parquet_path = get_processed_cards_path() df = pd.read_parquet(parquet_path, columns=["name"]) our_card_names = set(df["name"].str.lower()) logger.info(f"Filtering {len(bulk_cards)} Scryfall cards to {len(our_card_names)} cards in our dataset") # Filter and deduplicate - keep only first printing of each card seen_names = set() filtered = [] for card in bulk_cards: card_name_lower = card.get("name", "").lower() if card_name_lower in our_card_names and card_name_lower not in seen_names: filtered.append(card) seen_names.add(card_name_lower) logger.info(f"Filtered to {len(filtered)} unique cards with image data") return filtered except Exception as e: logger.warning(f"Could not filter to our cards: {e}. Using all Scryfall cards.") return bulk_cards def download_bulk_data(self, progress_callback=None) -> None: """ Download latest Scryfall bulk data JSON. Args: progress_callback: Optional callback(bytes_downloaded, total_bytes) Raises: Exception: If download fails """ logger.info("Downloading Scryfall bulk data...") self.bulk_data_path.parent.mkdir(parents=True, exist_ok=True) self.client.get_bulk_data( output_path=str(self.bulk_data_path), progress_callback=progress_callback, ) logger.info("Bulk data download complete") def download_images( self, sizes: Optional[list[str]] = None, progress_callback=None, max_cards: Optional[int] = None, ) -> dict[str, int]: """ Download card images from Scryfall CDN. Args: sizes: Image sizes to download (default: ['small', 'normal']) progress_callback: Optional callback(current, total, card_name) max_cards: Maximum cards to download (for testing) Returns: Dictionary with download statistics Raises: FileNotFoundError: If bulk data not available """ if not self.is_enabled(): logger.info("Image caching disabled (CACHE_CARD_IMAGES=0)") return {"skipped": 0} if sizes is None: sizes = IMAGE_SIZES logger.info(f"Starting image download for sizes: {sizes}") # Load bulk data and filter to our cards bulk_cards = self._load_bulk_data() cards = self._filter_to_our_cards(bulk_cards) total_cards = len(cards) if max_cards is None else min(max_cards, len(cards)) stats = { "total": total_cards, "downloaded": 0, "skipped": 0, "failed": 0, } for i, card in enumerate(cards[:total_cards]): card_name = card.get("name") if not card_name: stats["skipped"] += 1 continue # Collect all faces to download (single-faced or multi-faced) faces_to_download = [] # Check if card has direct image_uris (single-faced card) if card.get("image_uris"): faces_to_download.append({ "name": card_name, "image_uris": card["image_uris"], }) # Handle double-faced cards (get all faces) elif card.get("card_faces"): for face_idx, face in enumerate(card["card_faces"]): if face.get("image_uris"): # For multi-faced cards, append face name or index face_name = face.get("name", f"{card_name}_face{face_idx}") faces_to_download.append({ "name": face_name, "image_uris": face["image_uris"], }) # Skip if no faces found if not faces_to_download: logger.debug(f"No image URIs for {card_name}") stats["skipped"] += 1 continue # Download each face in each requested size for face in faces_to_download: face_name = face["name"] image_uris = face["image_uris"] for size in sizes: image_url = image_uris.get(size) if not image_url: continue # Check if already cached safe_name = sanitize_filename(face_name) output_path = self.base_dir / size / f"{safe_name}.jpg" if output_path.exists(): stats["skipped"] += 1 continue # Download image if self._download_image(image_url, output_path): stats["downloaded"] += 1 else: stats["failed"] += 1 # Progress callback if progress_callback: progress_callback(i + 1, total_cards, card_name) # Invalidate cached summary since we just downloaded new images self.invalidate_summary_cache() logger.info(f"Image download complete: {stats}") return stats def cache_statistics(self) -> dict[str, Any]: """ Get statistics about cached images. Uses a cached summary.json file to avoid scanning thousands of files. Regenerates summary if it doesn't exist or is stale (based on WEB_AUTO_REFRESH_DAYS, default 7 days, matching the main card data staleness check). Returns: Dictionary with cache stats (count, size, etc.) """ stats = {"enabled": self.is_enabled()} if not self.is_enabled(): return stats summary_file = self.base_dir / "summary.json" # Get staleness threshold from environment (same as card data check) try: refresh_days = int(os.getenv('WEB_AUTO_REFRESH_DAYS', '7')) except Exception: refresh_days = 7 if refresh_days <= 0: # Never consider stale refresh_seconds = float('inf') else: refresh_seconds = refresh_days * 24 * 60 * 60 # Convert days to seconds # Check if summary exists and is recent (less than refresh_seconds old) use_cached = False if summary_file.exists(): try: import time file_age = time.time() - summary_file.stat().st_mtime if file_age < refresh_seconds: use_cached = True except Exception: pass # Try to use cached summary if use_cached: try: import json with summary_file.open('r', encoding='utf-8') as f: cached_stats = json.load(f) stats.update(cached_stats) return stats except Exception as e: logger.warning(f"Could not read cache summary: {e}") # Regenerate summary (fast - just count files and estimate size) for size in IMAGE_SIZES: size_dir = self.base_dir / size if size_dir.exists(): # Fast count: count .jpg files without statting each one count = sum(1 for _ in size_dir.glob("*.jpg")) # Estimate total size based on typical averages to avoid stat() calls # Small images: ~40 KB avg, Normal images: ~100 KB avg avg_size_kb = 40 if size == "small" else 100 estimated_size_mb = (count * avg_size_kb) / 1024 stats[size] = { "count": count, "size_mb": round(estimated_size_mb, 1), } else: stats[size] = {"count": 0, "size_mb": 0.0} # Save summary for next time try: import json with summary_file.open('w', encoding='utf-8') as f: json.dump({k: v for k, v in stats.items() if k != "enabled"}, f) except Exception as e: logger.warning(f"Could not write cache summary: {e}") return stats def invalidate_summary_cache(self) -> None: """Delete the cached summary file to force regeneration on next call.""" if not self.is_enabled(): return summary_file = self.base_dir / "summary.json" if summary_file.exists(): try: summary_file.unlink() logger.debug("Invalidated cache summary file") except Exception as e: logger.warning(f"Could not delete cache summary: {e}") def main(): """CLI entry point for image caching.""" import argparse parser = argparse.ArgumentParser(description="Card image cache management") parser.add_argument( "--download", action="store_true", help="Download images from Scryfall", ) parser.add_argument( "--stats", action="store_true", help="Show cache statistics", ) parser.add_argument( "--max-cards", type=int, help="Maximum cards to download (for testing)", ) parser.add_argument( "--sizes", nargs="+", default=IMAGE_SIZES, choices=IMAGE_SIZES, help="Image sizes to download", ) parser.add_argument( "--force", action="store_true", help="Force re-download of bulk data even if recent", ) args = parser.parse_args() # Setup logging logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) cache = ImageCache() if args.stats: stats = cache.cache_statistics() print("\nCache Statistics:") print(f" Enabled: {stats['enabled']}") if stats["enabled"]: for size in IMAGE_SIZES: if size in stats: print( f" {size.capitalize()}: {stats[size]['count']} images " f"({stats[size]['size_mb']:.1f} MB)" ) elif args.download: if not cache.is_enabled(): print("Image caching is disabled. Set CACHE_CARD_IMAGES=1 to enable.") return # Check if bulk data already exists and is recent (within 24 hours) bulk_data_exists = cache.bulk_data_path.exists() bulk_data_age_hours = None if bulk_data_exists: import time age_seconds = time.time() - cache.bulk_data_path.stat().st_mtime bulk_data_age_hours = age_seconds / 3600 print(f"Bulk data file exists (age: {bulk_data_age_hours:.1f} hours)") # Download bulk data if missing, old, or forced if not bulk_data_exists or bulk_data_age_hours > 24 or args.force: print("Downloading Scryfall bulk data...") def bulk_progress(downloaded, total): if total > 0: pct = (downloaded / total) * 100 print(f" Progress: {downloaded / 1024 / 1024:.1f} MB / " f"{total / 1024 / 1024:.1f} MB ({pct:.1f}%)", end="\r") cache.download_bulk_data(progress_callback=bulk_progress) print("\nBulk data downloaded successfully") else: print("Bulk data is recent, skipping download (use --force to re-download)") # Download images print(f"\nDownloading card images (sizes: {', '.join(args.sizes)})...") def image_progress(current, total, card_name): pct = (current / total) * 100 print(f" Progress: {current}/{total} ({pct:.1f}%) - {card_name}", end="\r") stats = cache.download_images( sizes=args.sizes, progress_callback=image_progress, max_cards=args.max_cards, ) print("\n\nDownload complete:") print(f" Total: {stats['total']}") print(f" Downloaded: {stats['downloaded']}") print(f" Skipped: {stats['skipped']}") print(f" Failed: {stats['failed']}") else: parser.print_help() if __name__ == "__main__": main()