# siyuan/scripts/check-lang-keys.py

"""
Check if language file keys are complete.

This script checks all language files in app/appearance/langs/ directory
and finds:
- Missing keys: keys that exist in most files but not in current file
- Extra keys: keys that don't exist in most files but exist in current file
- Duplicate keys: keys that appear multiple times in the same file
using statistical methods.

Usage:
    python scripts/check-lang-keys.py
    python scripts/check-lang-keys.py -d app/appearance/langs
    python scripts/check-lang-keys.py --dir app/appearance/langs

Options:
    -d, --dir DIR    Language files directory path (default: app/appearance/langs)
    -h, --help       Show help message and exit

Exit codes:
    0    All language files have complete keys
    1    Some language files have missing, extra, or duplicate keys,
         or the directory is missing / contains no loadable language files
"""

import json
import os
import re
from pathlib import Path
from argparse import ArgumentParser
from collections import defaultdict, Counter


def find_duplicate_keys_recursive(data, prefix="", duplicates=None):
    """Walk a nested structure and report keys seen twice within one mapping.

    Nested keys are reported in dot notation (``parent.child``); mappings
    that sit inside a list are addressed as ``parent[index].child``.

    NOTE: a plain ``dict`` never yields the same key twice from ``items()``,
    so for ordinary parsed JSON this walk finds nothing. The check can only
    fire for dict-like inputs whose ``items()`` repeats a key.

    Args:
        data: Value to inspect; anything that is not a dict is ignored.
        prefix: Path of the enclosing key, or "" at the top level.
        duplicates: Set that collects results. A fresh set is created when
            None; otherwise the given set is updated in place.

    Returns:
        set: The (possibly shared) set of duplicate key paths.
    """
    found = set() if duplicates is None else duplicates

    # Guard clause: only mappings carry keys.
    if not isinstance(data, dict):
        return found

    visited = set()
    for name, child in data.items():
        path = f"{prefix}.{name}" if prefix else name

        # A name already met at this level is a duplicate.
        if name in visited:
            found.add(path)
        else:
            visited.add(name)

        # Descend into nested mappings, including those held in lists.
        if isinstance(child, dict):
            find_duplicate_keys_recursive(child, path, found)
        elif isinstance(child, list):
            for position, element in enumerate(child):
                if isinstance(element, dict):
                    find_duplicate_keys_recursive(
                        element, f"{path}[{position}]", found
                    )

    return found


def find_duplicate_keys(file_path):
    """Find keys that are repeated inside the same JSON object of a file.

    A normal ``json.load`` builds dicts, which silently keep only the last
    value of a repeated key, so duplicates are invisible afterwards. This
    function therefore parses with ``object_pairs_hook`` to keep every
    ``(key, value)`` pair exactly as written, then walks those pairs.

    Args:
        file_path (Path): Language file path

    Returns:
        list: Sorted list of duplicate keys (dot notation for nested keys,
            ``parent[index].child`` for objects inside arrays). Empty when
            the top-level value is not an object, or when the file cannot
            be read or parsed (an error line is printed in that case).
    """

    class _Pairs(list):
        """List of (key, value) pairs marking a JSON object (not an array)."""

    def walk(node, prefix, found):
        # `node` is always a _Pairs instance, i.e. one JSON object.
        seen = set()
        for key, value in node:
            full_key = f"{prefix}.{key}" if prefix else key
            if key in seen:
                found.add(full_key)
            seen.add(key)

            # _Pairs must be tested before list: it is a list subclass.
            if isinstance(value, _Pairs):
                walk(value, full_key, found)
            elif isinstance(value, list):
                for i, item in enumerate(value):
                    if isinstance(item, _Pairs):
                        walk(item, f"{full_key}[{i}]", found)

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f, object_pairs_hook=_Pairs)

        if not isinstance(data, _Pairs):
            return []

        duplicates = set()
        walk(data, "", duplicates)
        return sorted(duplicates)
    except Exception as e:
        # Best effort: a file that cannot be checked reports no duplicates.
        print(f"Error: Failed to check duplicate keys in {file_path}: {e}")
        return []


def collect_all_keys(data, prefix=""):
    """Gather every key path found in a nested JSON-like structure.

    Keys of nested dicts are joined with dots (``parent.child``). Dicts that
    are direct elements of a list value are addressed by position
    (``parent[index].child``). Non-dict list elements and non-dict inputs
    contribute nothing.

    Args:
        data: JSON data (dict, list, or primitive)
        prefix: Key path of the enclosing element, "" for the top level

    Returns:
        set: All key paths found beneath ``data``
    """
    found = set()

    # Explicit worklist of (mapping, path-of-mapping) pairs still to scan.
    pending = [(data, prefix)]
    while pending:
        node, path = pending.pop()
        if not isinstance(node, dict):
            continue

        for name, child in node.items():
            qualified = f"{path}.{name}" if path else name
            found.add(qualified)

            if isinstance(child, dict):
                pending.append((child, qualified))
            elif isinstance(child, list):
                # Only dict elements of a list can carry further keys.
                pending.extend(
                    (element, f"{qualified}[{position}]")
                    for position, element in enumerate(child)
                    if isinstance(element, dict)
                )

    return found


def get_key_order(file_path):
    """Get the order of keys as they appear in the JSON file.

    The order is taken from the parsed document rather than from the raw
    text: ``json.load`` builds dicts in file order and dicts keep insertion
    order, so a depth-first walk visits keys in the order they are written.
    This avoids miscounting nesting when a string value contains ``{`` or
    ``}`` and avoids matching ``"...":`` sequences inside values.

    Key paths use dot notation for nested objects and
    ``parent[index].child`` for objects inside arrays.

    Args:
        file_path (Path): Language file path

    Returns:
        dict: Dictionary mapping key name (including nested paths) to its
            position index (0-based, in order of first appearance). Empty
            when the file cannot be read or parsed, or when the top-level
            value is not an object.
    """
    key_order = {}

    def visit(mapping, prefix):
        for key, value in mapping.items():
            full_key = f"{prefix}.{key}" if prefix else key
            # Keep the first position if the same path shows up again.
            key_order.setdefault(full_key, len(key_order))

            if isinstance(value, dict):
                visit(value, full_key)
            elif isinstance(value, list):
                for i, item in enumerate(value):
                    if isinstance(item, dict):
                        visit(item, f"{full_key}[{i}]")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if isinstance(data, dict):
            visit(data, "")
    except (OSError, ValueError, RecursionError):
        # Ordering is only used to sort report output, so an unreadable or
        # unparsable file (JSONDecodeError and UnicodeDecodeError are both
        # ValueError subclasses) degrades to "no known order".
        return {}

    return key_order


def load_lang_file(file_path):
    """Parse one language file and gather everything the checker needs.

    Args:
        file_path (Path): Language file path

    Returns:
        tuple: ``(keys, data, duplicates, key_order)`` where ``keys`` is the
            set of all key paths (nested ones included), ``data`` is the
            parsed JSON, ``duplicates`` is the list of duplicate keys and
            ``key_order`` maps key paths to their position. When the file
            cannot be read or parsed an error line is printed and
            ``(None, None, [], {})`` is returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            content = json.load(handle)
            # The helpers re-open the file themselves; they only need the path.
            key_set = collect_all_keys(content)
            repeated = find_duplicate_keys(file_path)
            positions = get_key_order(file_path)
    except json.JSONDecodeError as e:
        print(f"Error: Failed to parse file {file_path}: {e}")
    except Exception as e:
        print(f"Error: Failed to read file {file_path}: {e}")
    else:
        return key_set, content, repeated, positions

    # Reached only after one of the handlers above reported a failure.
    return None, None, [], {}


def check_lang_keys(langs_dir):
    """Compare the key sets of all ``*.json`` language files in a directory.

    Every key is counted across the files that loaded successfully. A key
    found in at least ``ceil(total_files / 2)`` files is "expected"; every
    other key is "unexpected". Per file this yields:
    - Missing keys: expected keys the file does not contain
    - Extra keys: unexpected keys the file does contain
    - Duplicate keys: as reported by ``load_lang_file`` for that file

    A summary and a per-file report are printed to stdout.

    Args:
        langs_dir (str): Language files directory path

    Returns:
        bool: True only when no file has missing, extra or duplicate keys.
            False when any issue is found, when the directory does not
            exist, or when no language file could be loaded.
    """
    root = Path(langs_dir)
    if not root.exists():
        print(f"Error: Directory does not exist: {langs_dir}")
        return False

    # Load every language file; files that fail to load are skipped.
    lang_keys = {}
    duplicate_keys_by_file = {}
    key_order_by_file = {}
    for lang_file in sorted(root.glob("*.json")):
        keys, _data, duplicates, key_order = load_lang_file(lang_file)
        if keys is None:
            continue
        name = lang_file.name
        lang_keys[name] = keys
        key_order_by_file[name] = key_order
        if duplicates:
            duplicate_keys_by_file[name] = duplicates

    if not lang_keys:
        print("Error: No language files found")
        return False

    total_files = len(lang_keys)

    # Number of files containing each key (each file counts a key once).
    key_count = Counter()
    for keys in lang_keys.values():
        key_count.update(keys)

    # Half of the files, rounded up (e.g. 5 files -> 3, 4 files -> 2).
    threshold = (total_files + 1) // 2
    expected_keys = {key for key, count in key_count.items() if count >= threshold}
    unexpected_keys = set(key_count) - expected_keys

    # Missing keys are listed in the order of the file with the most keys
    # (the first such file wins a tie).
    reference_file = max(lang_keys, key=lambda name: len(lang_keys[name]))
    reference_key_order = key_order_by_file.get(reference_file, {})

    print(f"Checked {total_files} language files")
    print(f"Threshold: keys need to exist in at least {threshold} files to be considered expected")
    print(f"Expected keys: {len(expected_keys)}")
    print(f"Unexpected keys: {len(unexpected_keys)}\n")

    # Collect the files that have at least one kind of issue.
    file_issues = {}
    for lang_name, keys in lang_keys.items():
        issues = {
            'missing': expected_keys - keys,
            'extra': keys & unexpected_keys,
            'duplicates': duplicate_keys_by_file.get(lang_name, []),
        }
        if any(issues.values()):
            file_issues[lang_name] = issues

    if not file_issues:
        print("All language files have complete keys!")
        return True

    # Report issues grouped by file
    print("Issues found:")
    print("  Missing keys: exist in most files but not in current file")
    print("  Extra keys: don't exist in most files but exist in current file")
    print("  Duplicate keys: keys that appear multiple times in the same file\n")

    unplaced = float('inf')  # keys without a known position sort last
    for lang_name in sorted(file_issues):
        issues = file_issues[lang_name]
        missing = issues['missing']
        extra = issues['extra']
        duplicates = issues['duplicates']
        own_order = key_order_by_file.get(lang_name, {})

        print(f"  {lang_name}:")

        # Extra keys, in the order of the current file (first 10 only).
        if extra:
            key_word = "key" if len(extra) == 1 else "keys"
            print(f"    {len(extra)} Extra {key_word}:")
            extra_sorted = sorted(
                extra, key=lambda k: (own_order.get(k, unplaced), k)
            )
            for key in extra_sorted[:10]:
                count = key_count[key]
                print(f"      - {key} (exists in only {count}/{total_files} files)")
            if len(extra_sorted) > 10:
                print(f"      ... {len(extra_sorted) - 10} more keys not shown")

        # Missing keys, in the order of the reference file (first 10 only).
        if missing:
            key_word = "key" if len(missing) == 1 else "keys"
            print(f"    {len(missing)} Missing {key_word}:")
            missing_sorted = sorted(
                missing, key=lambda k: (reference_key_order.get(k, unplaced), k)
            )
            for key in missing_sorted[:10]:
                count = key_count[key]
                print(f"      - {key} (exists in {count}/{total_files} files)")
            if len(missing_sorted) > 10:
                print(f"      ... {len(missing_sorted) - 10} more keys not shown")

        # Duplicate keys, in the order of the current file (all of them).
        if duplicates:
            key_word = "key" if len(duplicates) == 1 else "keys"
            print(f"    {len(duplicates)} Duplicate {key_word}:")
            for key in sorted(duplicates, key=lambda k: (own_order.get(k, unplaced), k)):
                print(f"      - {key}")

        print()

    return False


def main():
    """Parse command-line arguments, run the check and exit with its status.

    The ``--dir`` value is joined onto the project root, taken to be the
    parent of the directory containing this script. An absolute ``--dir``
    path is therefore used unchanged.

    Raises:
        SystemExit: Always; code 0 when ``check_lang_keys`` returns True,
            1 otherwise.
    """
    parser = ArgumentParser(
        description="Check if language file keys are complete"
    )
    parser.add_argument(
        "-d", "--dir",
        default="app/appearance/langs",
        help="Language files directory path (default: app/appearance/langs)"
    )
    args = parser.parse_args()

    # Get project root directory (parent of script directory)
    script_dir = Path(__file__).parent
    project_root = script_dir.parent
    langs_dir = project_root / args.dir

    success = check_lang_keys(langs_dir)
    # Raise SystemExit directly: the `exit` helper is injected by the `site`
    # module for interactive use and is absent under `python -S`.
    raise SystemExit(0 if success else 1)


if __name__ == "__main__":
    main()
:bug: Fix i18n key names and add checking script (#16578) 2025-12-14 10:35:22 +08:00			`"""`
			`Check if language file keys are complete.`

			`This script checks all language files in app/appearance/langs/ directory`
			`and finds:`
			`- Missing keys: keys that exist in most files but not in current file`
			`- Extra keys: keys that don't exist in most files but exist in current file`
			`- Duplicate keys: keys that appear multiple times in the same file`
			`using statistical methods.`

			`Usage:`
			`python scripts/check-lang-keys.py`
			`python scripts/check-lang-keys.py -d app/appearance/langs`
			`python scripts/check-lang-keys.py --dir app/appearance/langs`

			`Options:`
			`-d, --dir DIR Language files directory path (default: app/appearance/langs)`
			`-h, --help Show help message and exit`

			`Exit codes:`
			`0 All language files have complete keys`
			`1 Some language files have missing or extra keys`
			`"""`

			`import json`
			`import os`
			`import re`
			`from pathlib import Path`
			`from argparse import ArgumentParser`
			`from collections import defaultdict, Counter`


			`def find_duplicate_keys_recursive(data, prefix="", duplicates=None):`
			`"""Recursively find duplicate keys in nested JSON structure.`

			`Args:`
			`data: JSON data (dict, list, or primitive)`
			`prefix: Current key prefix (for nested keys)`
			`duplicates: Set to store duplicate keys found`

			`Returns:`
			`set: Set of duplicate keys (using dot notation for nested keys)`
			`"""`
			`if duplicates is None:`
			`duplicates = set()`

			`if isinstance(data, dict):`
			`seen_keys = {}`
			`for key, value in data.items():`
			`full_key = f"{prefix}.{key}" if prefix else key`

			`# Check for duplicate at current level`
			`if key in seen_keys:`
			`duplicates.add(full_key)`
			`seen_keys[key] = True`

			`# Recursively check nested structures`
			`if isinstance(value, dict):`
			`find_duplicate_keys_recursive(value, full_key, duplicates)`
			`elif isinstance(value, list):`
			`for i, item in enumerate(value):`
			`if isinstance(item, dict):`
			`find_duplicate_keys_recursive(item, f"{full_key}[{i}]", duplicates)`

			`return duplicates`


			`def find_duplicate_keys(file_path):`
			`"""Find duplicate keys in JSON file including nested structures.`

			`Args:`
			`file_path (Path): Language file path`

			`Returns:`
			`list: List of duplicate keys found in the file (using dot notation for nested keys)`
			`"""`
			`try:`
			`with open(file_path, 'r', encoding='utf-8') as f:`
			`data = json.load(f)`

			`if not isinstance(data, dict):`
			`return []`

			`duplicates = find_duplicate_keys_recursive(data)`
			`return sorted(duplicates)`
			`except Exception as e:`
			`print(f"Error: Failed to check duplicate keys in {file_path}: {e}")`
			`return []`


			`def collect_all_keys(data, prefix=""):`
			`"""Recursively collect all keys from nested JSON structure.`

			`Args:`
			`data: JSON data (dict, list, or primitive)`
			`prefix: Current key prefix (for nested keys)`

			`Returns:`
			`set: Set of all keys (using dot notation for nested keys)`
			`"""`
			`keys = set()`

			`if isinstance(data, dict):`
			`for key, value in data.items():`
			`full_key = f"{prefix}.{key}" if prefix else key`
			`keys.add(full_key)`

			`# Recursively collect nested keys`
			`if isinstance(value, dict):`
			`keys.update(collect_all_keys(value, full_key))`
			`elif isinstance(value, list):`
			`# Handle list of objects`
			`for i, item in enumerate(value):`
			`if isinstance(item, dict):`
			`keys.update(collect_all_keys(item, f"{full_key}[{i}]"))`

			`return keys`


			`def get_key_order(file_path):`
			`"""Get the order of keys as they appear in the JSON file.`

			`Args:`
			`file_path (Path): Language file path`

			`Returns:`
			`dict: Dictionary mapping key name (including nested paths) to its position index`
			`"""`
			`try:`
			`with open(file_path, 'r', encoding='utf-8') as f:`
			`content = f.read()`
			`data = json.loads(content)`

			`# Collect all keys recursively`
			`all_keys = collect_all_keys(data)`

			`# Get order by parsing the raw text`
			`key_order = {}`
			`pattern = r'["\']([^"\']+)["\']\s*:'`
			`index = 0`

			`# Track nesting path`
			`nesting_stack = []`

			`for match in re.finditer(pattern, content):`
			`key = match.group(1)`
			`pos = match.start()`

			`# Check nesting level`
			`text_before = content[:pos]`
			`open_braces = text_before.count('{')`
			`close_braces = text_before.count('}')`
			`nesting_level = open_braces - close_braces`

			`# Build full key path based on nesting`
			`if nesting_level == 1: # Top level`
			`nesting_stack = [key]`
			`full_key = key`
			`elif nesting_level > 1: # Nested level`
			`# Keep only the relevant nesting levels`
			`nesting_stack = nesting_stack[:nesting_level - 1] + [key]`
			`full_key = ".".join(nesting_stack)`
			`else:`
			`continue`

			`# Only record if this is a valid key we're tracking`
			`if full_key in all_keys and full_key not in key_order:`
			`key_order[full_key] = index`
			`index += 1`

			`return key_order`
			`except Exception as e:`
			`return {}`


			`def load_lang_file(file_path):`
			`"""Load language file and return key set and file content.`

			`Args:`
			`file_path (Path): Language file path`

			`Returns:`
			`tuple: (key set (including nested keys), file content dict, duplicate keys list, key order dict)`
			`"""`
			`try:`
			`with open(file_path, 'r', encoding='utf-8') as f:`
			`data = json.load(f)`
			`# Collect all keys including nested ones`
			`all_keys = collect_all_keys(data)`
			`duplicates = find_duplicate_keys(file_path)`
			`key_order = get_key_order(file_path)`
			`return all_keys, data, duplicates, key_order`
			`except json.JSONDecodeError as e:`
			`print(f"Error: Failed to parse file {file_path}: {e}")`
			`return None, None, [], {}`
			`except Exception as e:`
			`print(f"Error: Failed to read file {file_path}: {e}")`
			`return None, None, [], {}`


			`def check_lang_keys(langs_dir):`
			`"""Check if language file keys are complete.`

			`Uses statistical method: count how many files contain each key, then determine:`
			`- Missing keys: keys that exist in most files but not in current file`
			`- Extra keys: keys that don't exist in most files but exist in current file`
			`- Duplicate keys: keys that appear multiple times in the same file`

			`Args:`
			`langs_dir (str): Language files directory path`

			`Returns:`
			`bool: True if all files have complete keys, False otherwise`
			`"""`
			`langs_path = Path(langs_dir)`
			`if not langs_path.exists():`
			`print(f"Error: Directory does not exist: {langs_dir}")`
			`return False`

			`# Load all language files`
			`lang_keys = {}`
			`duplicate_keys_by_file = {}`
			`key_order_by_file = {}`

			`for lang_file in sorted(langs_path.glob("*.json")):`
			`keys, data, duplicates, key_order = load_lang_file(lang_file)`
			`if keys is None:`
			`continue`
			`lang_keys[lang_file.name] = keys`
			`if duplicates:`
			`duplicate_keys_by_file[lang_file.name] = duplicates`
			`key_order_by_file[lang_file.name] = key_order`

			`if not lang_keys:`
			`print("Error: No language files found")`
			`return False`

			`total_files = len(lang_keys)`
			`if total_files == 0:`
			`print("Error: No valid language files found")`
			`return False`

			`# Count how many files contain each key`
			`key_count = defaultdict(int)`
			`all_keys = set()`

			`for keys in lang_keys.values():`
			`all_keys.update(keys)`
			`for key in keys:`
			`key_count[key] += 1`

			`# Calculate threshold: if a key exists in more than half of the files, it should exist`
			`threshold = (total_files + 1) // 2 # Round up, e.g., 5 files need 3`

			`# Classify keys: expected keys and unexpected keys`
			`expected_keys = {key for key, count in key_count.items() if count >= threshold}`
			`unexpected_keys = all_keys - expected_keys`

			`# Find reference file (file with most keys) for ordering missing keys`
			`reference_file = max(lang_keys.items(), key=lambda x: len(x[1]))[0]`
			`reference_key_order = key_order_by_file.get(reference_file, {})`

			`print(f"Checked {total_files} language files")`
			`print(f"Threshold: keys need to exist in at least {threshold} files to be considered expected")`
			`print(f"Expected keys: {len(expected_keys)}")`
			`print(f"Unexpected keys: {len(unexpected_keys)}\n")`

			`# Check keys for each file`
			`all_complete = True`
			`file_issues = {} # {lang_name: {'missing': set, 'extra': set, 'duplicates': list}}`

			`for lang_name, keys in lang_keys.items():`
			`# Find missing keys (should exist but don't exist in current file)`
			`missing = expected_keys - keys`
			`# Find extra keys (shouldn't exist but exist in current file)`
			`extra = keys & unexpected_keys`
			`# Get duplicate keys`
			`duplicates = duplicate_keys_by_file.get(lang_name, [])`

			`if missing or extra or duplicates:`
			`file_issues[lang_name] = {'missing': missing, 'extra': extra, 'duplicates': duplicates}`
			`if missing or duplicates:`
			`all_complete = False`

			`# Output results`
			`if all_complete and not file_issues:`
			`print("All language files have complete keys!")`
			`return True`

			`# Report issues grouped by file`
			`print("Issues found:")`
			`print(" Missing keys: exist in most files but not in current file")`
			`print(" Extra keys: don't exist in most files but exist in current file")`
			`print(" Duplicate keys: keys that appear multiple times in the same file\n")`

			`for lang_name in sorted(file_issues.keys()):`
			`issues = file_issues[lang_name]`
			`missing = issues['missing']`
			`extra = issues['extra']`
			`duplicates = issues['duplicates']`
			`key_order = key_order_by_file.get(lang_name, {})`

			`# Sort function for missing keys: use reference file order`
			`def sort_missing_by_order(key):`
			`return (reference_key_order.get(key, float('inf')), key)`

			`# Sort function for extra/duplicate keys: use current file order`
			`def sort_by_order(key):`
			`return (key_order.get(key, float('inf')), key)`

			`has_issues = False`
			`if missing or extra or duplicates:`
			`has_issues = True`
			`print(f" {lang_name}:")`

			`# Show extra keys`
			`if extra:`
			`key_word = "key" if len(extra) == 1 else "keys"`
			`print(f" {len(extra)} Extra {key_word}:")`
			`extra_with_count = [(key, key_count[key]) for key in sorted(extra, key=sort_by_order)]`
			`if len(extra_with_count) <= 10:`
			`for key, count in extra_with_count:`
			`print(f" - {key} (exists in only {count}/{total_files} files)")`
			`else:`
			`for key, count in extra_with_count[:10]:`
			`print(f" - {key} (exists in only {count}/{total_files} files)")`
			`print(f" ... {len(extra_with_count) - 10} more keys not shown")`

			`# Show missing keys`
			`if missing:`
			`key_word = "key" if len(missing) == 1 else "keys"`
			`print(f" {len(missing)} Missing {key_word}:")`
			`missing_with_count = [(key, key_count[key]) for key in sorted(missing, key=sort_missing_by_order)]`
			`if len(missing_with_count) <= 10:`
			`for key, count in missing_with_count:`
			`print(f" - {key} (exists in {count}/{total_files} files)")`
			`else:`
			`for key, count in missing_with_count[:10]:`
			`print(f" - {key} (exists in {count}/{total_files} files)")`
			`print(f" ... {len(missing_with_count) - 10} more keys not shown")`

			`# Show duplicate keys`
			`if duplicates:`
			`key_word = "key" if len(duplicates) == 1 else "keys"`
			`print(f" {len(duplicates)} Duplicate {key_word}:")`
			`for key in sorted(duplicates, key=sort_by_order):`
			`print(f" - {key}")`

			`if has_issues:`
			`print()`

			`return False`


			`def main():`
			`parser = ArgumentParser(`
			`description="Check if language file keys are complete"`
			`)`
			`parser.add_argument(`
			`"-d", "--dir",`
			`default="app/appearance/langs",`
			`help="Language files directory path (default: app/appearance/langs)"`
			`)`
			`args = parser.parse_args()`

			`# Get project root directory (parent of script directory)`
			`script_dir = Path(__file__).parent`
			`project_root = script_dir.parent`
			`langs_dir = project_root / args.dir`

			`success = check_lang_keys(langs_dir)`
			`exit(0 if success else 1)`


			`if __name__ == "__main__":`
			`main()`