siyuan/scripts/check-lang-keys.py
2025-12-14 10:35:22 +08:00

377 lines
13 KiB
Python

"""
Check if language file keys are complete.
This script checks all language files in app/appearance/langs/ directory
and finds:
- Missing keys: keys that exist in most files but not in current file
- Extra keys: keys that don't exist in most files but exist in current file
- Duplicate keys: keys that appear multiple times in the same file
using statistical methods.
Usage:
python scripts/check-lang-keys.py
python scripts/check-lang-keys.py -d app/appearance/langs
python scripts/check-lang-keys.py --dir app/appearance/langs
Options:
-d, --dir DIR Language files directory path (default: app/appearance/langs)
-h, --help Show help message and exit
Exit codes:
0 All language files have complete keys
1 Some language files have missing or extra keys
"""
import json
import os
import re
from pathlib import Path
from argparse import ArgumentParser
from collections import defaultdict, Counter
class _ObjectPairs(tuple):
    """A JSON object kept as a tuple of ``(key, value)`` pairs.

    Passed to ``json.load`` as ``object_pairs_hook`` so that repeated keys
    survive parsing; a plain ``dict`` keeps only the last occurrence, which
    makes duplicates impossible to detect afterwards.
    """

    __slots__ = ()


def find_duplicate_keys_recursive(data, prefix="", duplicates=None):
    """Recursively find duplicate keys in a nested JSON structure.

    Args:
        data: JSON data. Objects may be ``_ObjectPairs`` (pair-preserving,
            as produced by ``find_duplicate_keys``) or plain dicts; anything
            else is ignored. A plain dict can never contain a repeated key,
            so duplicates are only detectable in ``_ObjectPairs`` input.
        prefix: Current key prefix (for nested keys)
        duplicates: Set to store duplicate keys found

    Returns:
        set: Set of duplicate keys (using dot notation for nested keys)
    """
    if duplicates is None:
        duplicates = set()

    if isinstance(data, _ObjectPairs):
        pairs = data
    elif isinstance(data, dict):
        pairs = data.items()
    else:
        return duplicates

    seen_keys = set()
    for key, value in pairs:
        full_key = f"{prefix}.{key}" if prefix else key
        # A key already seen at this nesting level is a duplicate.
        if key in seen_keys:
            duplicates.add(full_key)
        seen_keys.add(key)

        # Descend into nested objects, and into objects held directly in arrays.
        if isinstance(value, (dict, _ObjectPairs)):
            find_duplicate_keys_recursive(value, full_key, duplicates)
        elif isinstance(value, list):
            for i, item in enumerate(value):
                if isinstance(item, (dict, _ObjectPairs)):
                    find_duplicate_keys_recursive(item, f"{full_key}[{i}]", duplicates)
    return duplicates


def find_duplicate_keys(file_path):
    """Find duplicate keys in a JSON file, including nested structures.

    The file is parsed with a pair-preserving hook so that keys repeated
    inside one object are still visible when the structure is walked.

    Args:
        file_path (Path): Language file path

    Returns:
        list: Sorted list of duplicate keys found in the file (using dot
            notation for nested keys). Empty if the file cannot be read or
            parsed (an error is printed) or if its top level is not an object.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f, object_pairs_hook=_ObjectPairs)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error: Failed to check duplicate keys in {file_path}: {e}")
        return []

    if not isinstance(data, _ObjectPairs):
        return []
    return sorted(find_duplicate_keys_recursive(data))
def collect_all_keys(data, prefix=""):
    """Gather every key path found in a nested JSON structure.

    Nested object keys are joined with dots; objects that sit directly
    inside a list are addressed as ``parent[index].key``. Lists nested
    inside lists are not descended into.

    Args:
        data: JSON data (dict, list, or primitive)
        prefix: Key path to prepend to keys found in ``data``

    Returns:
        set: Set of all keys (using dot notation for nested keys); empty
            when ``data`` is not a dict
    """
    collected = set()
    # Explicit work stack of (node, path-so-far) pairs instead of recursion.
    pending = [(data, prefix)]
    while pending:
        node, path = pending.pop()
        if not isinstance(node, dict):
            continue
        for name, child in node.items():
            child_path = f"{path}.{name}" if path else name
            collected.add(child_path)
            if isinstance(child, dict):
                pending.append((child, child_path))
            elif isinstance(child, list):
                # Only objects held directly in the list contribute keys.
                pending.extend(
                    (element, f"{child_path}[{position}]")
                    for position, element in enumerate(child)
                    if isinstance(element, dict)
                )
    return collected
def get_key_order(file_path):
    """Get the order of keys as they appear in the JSON file.

    The order is taken from the parsed document rather than from pattern
    matching on the raw text: ``json.load`` builds dicts in document order
    and dicts preserve insertion order, so an ordered walk yields the same
    sequence without being confused by braces, colons, or quotes that occur
    inside string values.

    Key paths follow the same notation as the rest of this script: nested
    keys are joined with dots and objects held directly in a list are
    addressed as ``parent[index].key``.

    Args:
        file_path (Path): Language file path

    Returns:
        dict: Dictionary mapping key name (including nested paths) to its
            position index. Empty if the file cannot be read or parsed, or
            if its top level is not an object.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # Best effort: ordering is only used to sort report output.
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return {}

    key_order = {}

    def record(node, prefix):
        """Assign the next index to each key of ``node`` in document order."""
        if not isinstance(node, dict):
            return
        for key, value in node.items():
            full_key = f"{prefix}.{key}" if prefix else key
            # First occurrence wins if two paths render to the same string.
            key_order.setdefault(full_key, len(key_order))
            if isinstance(value, dict):
                record(value, full_key)
            elif isinstance(value, list):
                for i, item in enumerate(value):
                    record(item, f"{full_key}[{i}]")

    record(data, "")
    return key_order
def load_lang_file(file_path):
    """Read one language file and gather everything the checker needs from it.

    Args:
        file_path (Path): Language file path

    Returns:
        tuple: ``(keys, content, duplicates, key_order)`` where ``keys`` is
            the set of all key paths (nested ones included), ``content`` is
            the parsed JSON, ``duplicates`` is the list of duplicate keys
            and ``key_order`` maps key paths to their position. On any
            failure an error is printed and ``(None, None, [], {})`` is
            returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            content = json.load(handle)
        # Evaluated left to right: key set, then duplicates, then ordering.
        return (
            collect_all_keys(content),
            content,
            find_duplicate_keys(file_path),
            get_key_order(file_path),
        )
    except json.JSONDecodeError as parse_error:
        print(f"Error: Failed to parse file {file_path}: {parse_error}")
        return None, None, [], {}
    except Exception as read_error:
        print(f"Error: Failed to read file {file_path}: {read_error}")
        return None, None, [], {}
def _position_sort_key(order):
    """Build a sort key that orders keys by ``order`` position, then by name.

    Keys absent from ``order`` sort after all known keys.
    """
    return lambda key: (order.get(key, float('inf')), key)


def _print_key_section(title, entries, limit=None):
    """Print one report section: a counted heading followed by its entries.

    Args:
        title: Section name used in the heading ("Extra", "Missing", ...)
        entries: Already-sorted list of display strings, one per key
        limit: Maximum number of entries to print; None prints them all.
            When entries are cut off, a summary line states how many
            were hidden.
    """
    key_word = "key" if len(entries) == 1 else "keys"
    print(f" {len(entries)} {title} {key_word}:")
    shown = entries if limit is None else entries[:limit]
    for entry in shown:
        print(f" - {entry}")
    hidden = len(entries) - len(shown)
    if hidden:
        print(f" ... {hidden} more keys not shown")


def check_lang_keys(langs_dir):
    """Check if language file keys are complete.

    Uses statistical method: count how many files contain each key, then determine:
    - Missing keys: keys that exist in most files but not in current file
    - Extra keys: keys that don't exist in most files but exist in current file
    - Duplicate keys: keys that appear multiple times in the same file

    Files that cannot be loaded are excluded from the statistics, listed in
    the report, and make the check fail.

    Args:
        langs_dir (str): Language files directory path

    Returns:
        bool: True if every ``*.json`` file loaded and none has missing,
            extra, or duplicate keys; False otherwise (including when the
            directory does not exist or holds no usable language file)
    """
    langs_path = Path(langs_dir)
    if not langs_path.exists():
        print(f"Error: Directory does not exist: {langs_dir}")
        return False

    # Load all language files
    lang_keys = {}
    duplicate_keys_by_file = {}
    key_order_by_file = {}
    unreadable_files = []
    for lang_file in sorted(langs_path.glob("*.json")):
        keys, _data, duplicates, key_order = load_lang_file(lang_file)
        if keys is None:
            # load_lang_file already printed the reason.
            unreadable_files.append(lang_file.name)
            continue
        lang_keys[lang_file.name] = keys
        if duplicates:
            duplicate_keys_by_file[lang_file.name] = duplicates
        key_order_by_file[lang_file.name] = key_order

    if not lang_keys:
        if unreadable_files:
            print("Error: No valid language files found")
        else:
            print("Error: No language files found")
        return False

    total_files = len(lang_keys)

    # Count how many files contain each key
    key_count = Counter()
    for keys in lang_keys.values():
        key_count.update(keys)

    # Half of the files, rounded up (5 files -> 3, 4 files -> 2)
    threshold = (total_files + 1) // 2
    expected_keys = {key for key, count in key_count.items() if count >= threshold}
    unexpected_keys = set(key_count) - expected_keys

    # Reference file (file with most keys) provides the order for missing keys
    reference_file = max(lang_keys, key=lambda name: len(lang_keys[name]))
    reference_key_order = key_order_by_file.get(reference_file, {})

    print(f"Checked {total_files} language files")
    print(f"Threshold: keys need to exist in at least {threshold} files to be considered expected")
    print(f"Expected keys: {len(expected_keys)}")
    print(f"Unexpected keys: {len(unexpected_keys)}\n")

    # Collect issues per file: {lang_name: {'missing': set, 'extra': set, 'duplicates': list}}
    file_issues = {}
    for lang_name, keys in lang_keys.items():
        missing = expected_keys - keys
        extra = keys & unexpected_keys
        duplicates = duplicate_keys_by_file.get(lang_name, [])
        if missing or extra or duplicates:
            file_issues[lang_name] = {'missing': missing, 'extra': extra, 'duplicates': duplicates}

    if not file_issues and not unreadable_files:
        print("All language files have complete keys!")
        return True

    if file_issues:
        print("Issues found:")
        print(" Missing keys: exist in most files but not in current file")
        print(" Extra keys: don't exist in most files but exist in current file")
        print(" Duplicate keys: keys that appear multiple times in the same file\n")

        # Missing keys follow the reference file's order; extra and
        # duplicate keys follow the order of the file they were found in.
        by_reference_order = _position_sort_key(reference_key_order)
        for lang_name in sorted(file_issues):
            issues = file_issues[lang_name]
            by_file_order = _position_sort_key(key_order_by_file.get(lang_name, {}))

            print(f" {lang_name}:")
            if issues['extra']:
                _print_key_section(
                    "Extra",
                    [
                        f"{key} (exists in only {key_count[key]}/{total_files} files)"
                        for key in sorted(issues['extra'], key=by_file_order)
                    ],
                    limit=10,
                )
            if issues['missing']:
                _print_key_section(
                    "Missing",
                    [
                        f"{key} (exists in {key_count[key]}/{total_files} files)"
                        for key in sorted(issues['missing'], key=by_reference_order)
                    ],
                    limit=10,
                )
            if issues['duplicates']:
                _print_key_section(
                    "Duplicate",
                    sorted(issues['duplicates'], key=by_file_order),
                )
            print()

    if unreadable_files:
        print("Files that could not be loaded:")
        for name in unreadable_files:
            print(f" - {name}")
        print()

    return False
def main():
    """Parse command-line options, run the key check, and exit with its status.

    The ``--dir`` value is joined onto the project root, taken to be the
    parent of the directory containing this script. An absolute ``--dir``
    therefore replaces the project root entirely.

    Raises:
        SystemExit: Always; code 0 when the check succeeds, 1 otherwise.
    """
    parser = ArgumentParser(
        description="Check if language file keys are complete"
    )
    parser.add_argument(
        "-d", "--dir",
        default="app/appearance/langs",
        help="Language files directory path (default: app/appearance/langs)"
    )
    args = parser.parse_args()

    # Resolve first: a relative __file__ such as "check-lang-keys.py" has "."
    # as its parent, and the parent of "." is "." again, which would make
    # the script directory masquerade as the project root.
    script_dir = Path(__file__).resolve().parent
    project_root = script_dir.parent
    langs_dir = project_root / args.dir

    success = check_lang_keys(langs_dir)
    # Raise SystemExit directly; the exit() builtin is injected by the site
    # module and is not guaranteed to exist (e.g. under "python -S").
    raise SystemExit(0 if success else 1)
# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()