Add Fluent-based localization manager and script to export source strings for translations

Changelog-Added: Added Fluent-based localization manager and script to export source strings for translations
Signed-off-by: Terry Yiu <git@tyiu.xyz>
Terry Yiu
2025-06-26 20:56:32 -04:00
committed by William Casarin
parent 80820a52d2
commit d07c3e9135
9 changed files with 1894 additions and 105 deletions

scripts/export_source_strings.py (new executable file)

@@ -0,0 +1,595 @@
#!/usr/bin/env python3
"""
Export US English (en-US) strings defined in tr! and tr_plural! macros in Rust code
by generating a main.ftl file that can be used for translating into other languages.
This script also creates a pseudolocalized English (en-XA) main.ftl file with accented characters,
so that developers can easily detect which strings have and have not been internationalized
without needing actual translations for a non-English language.
"""
import os
import re
import argparse
from pathlib import Path
from typing import Dict, List, Tuple
import collections
import hashlib
def find_rust_files(project_root: Path) -> List[Path]:
"""Find all Rust files in the project."""
rust_files = []
for root, dirs, files in os.walk(project_root):
# Skip irrelevant directories
dirs[:] = [d for d in dirs if d not in ['target', '.git', '.cargo']]
for file in files:
# Find only Rust source files
if file.endswith('.rs'):
rust_files.append(Path(root) / file)
return rust_files
def strip_rust_comments(code: str) -> str:
"""Remove // line comments, /* ... */ block comments, and doc comments (///, //!, //! ...) from Rust code."""
# Remove block comments first
code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
# Remove line comments
code = re.sub(r'//.*', '', code)
# Remove doc comments (/// and //! at start of line); already covered by the // pass above, kept as a safeguard
code = re.sub(r'^\s*///.*$', '', code, flags=re.MULTILINE)
code = re.sub(r'^\s*//!.*$', '', code, flags=re.MULTILINE)
return code
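# Illustrative examples (not executed here):
#   strip_rust_comments('let x = 1; // trailing note')  ->  'let x = 1; '
#   strip_rust_comments('/* multi\n   line */ fn f() {}')  ->  ' fn f() {}'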
def extract_tr_macros_with_lines(content: str, file_path: str) -> List[Tuple[str, str, int, str]]:
"""Extract tr! macro calls from Rust code with comments and line numbers. Handles multi-line macros."""
matches = []
# Strip comments before processing
content = strip_rust_comments(content)
# Search the entire content for tr! macro calls (multi-line aware)
for macro_content in extract_macro_calls(content, 'tr!'):
args = parse_macro_arguments(macro_content)
if len(args) >= 2: # Must have at least message and comment
message = args[0].strip()
comment = args[1].strip() # Second argument is always the comment
# Validate placeholders
if not validate_placeholders(message, file_path):
continue
if not any(skip in message.lower() for skip in [
'/', '\\', '.ftl', '.rs', 'http', 'https', 'www', '@',
'crates/', 'src/', 'target/', 'build.rs']):
# Find the approximate line number where this macro starts (find() locates the first occurrence, so identical macros report the same line)
macro_start = f'tr!({macro_content}'
idx = content.find(macro_start)
line_num = content[:idx].count('\n') + 1 if idx != -1 else 1
matches.append((message, comment, line_num, file_path))
return matches
def extract_tr_plural_macros_with_lines(content: str, file_path: str) -> List[Tuple[str, str, int, str]]:
"""Extract tr_plural! macro calls from Rust code with new signature and correct keying, skipping macro definitions and doc comments."""
matches = []
# Skip macro definitions
if 'macro_rules! tr_plural' in content or file_path.endswith('i18n/mod.rs'):
return matches
# Note: idx is the occurrence index within the file, used as a stand-in for a true line number
for idx, macro_content in enumerate(extract_macro_calls(content, 'tr_plural!')):
args = parse_macro_arguments(macro_content)
if len(args) >= 4:
one = args[0].strip()
other = args[1].strip()
comment = args[2].strip()
key = other
if key and not key.startswith('//') and not key.startswith('$'):
matches.append((key, comment, idx + 1, file_path))
return matches
def parse_macro_arguments(content: str) -> List[str]:
"""Parse macro arguments, handling quoted strings, param = value pairs, commas, and inline comments."""
# Remove all // comments
content = re.sub(r'//.*', '', content)
# Collapse all whitespace/newlines to a single space
content = re.sub(r'\s+', ' ', content.strip())
args = []
i = 0
n = len(content)
while i < n:
# Skip whitespace
while i < n and content[i].isspace():
i += 1
if i >= n:
break
# Handle quoted strings
if content[i] in ['"', "'"]:
quote_char = content[i]
i += 1
arg_start = i
while i < n:
if content[i] == '\\' and i + 1 < n:
i += 2
elif content[i] == quote_char:
break
else:
i += 1
arg = content[arg_start:i]
args.append(arg)
i += 1 # Skip closing quote
else:
arg_start = i
paren_count = 0
brace_count = 0
while i < n:
char = content[i]
if char == '(':
paren_count += 1
elif char == ')':
paren_count -= 1
elif char == '{':
brace_count += 1
elif char == '}':
brace_count -= 1
elif char == ',' and paren_count == 0 and brace_count == 0:
break
i += 1
arg = content[arg_start:i].strip()
if arg:
args.append(arg)
# Skip the comma if we found one
if i < n and content[i] == ',':
i += 1
return args
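# Illustrative example of the parsing behavior:
#   parse_macro_arguments('"Hello, {name}!", "greeting", name = user')
#   ->  ['Hello, {name}!', 'greeting', 'name = user']
# Quoted arguments are returned without their quotes, and a trailing
# `param = value` pair is kept as a single unquoted argument.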
def extract_macro_calls(content: str, macro_name: str):
"""Extract all macro calls of the given macro_name from the entire content, handling parentheses inside quoted strings and multi-line macros."""
calls = []
idx = 0
macro_start = f'{macro_name}('
content_len = len(content)
while idx < content_len:
start = content.find(macro_start, idx)
if start == -1:
break
i = start + len(macro_start)
paren_count = 1 # Start after the initial '('
in_quote = False
quote_char = ''
macro_content = ''
while i < content_len:
c = content[i]
if in_quote:
macro_content += c
if c == quote_char and (i == 0 or content[i-1] != '\\'):
in_quote = False
else:
if c in ('"', "'"):
in_quote = True
quote_char = c
macro_content += c
elif c == '(':
paren_count += 1
macro_content += c
elif c == ')':
paren_count -= 1
if paren_count == 0:
break
else:
macro_content += c
else:
macro_content += c
i += 1
# Only add if we found a closing parenthesis
if i < content_len and content[i] == ')':
calls.append(macro_content)
idx = i + 1
else:
# Malformed macro, skip past this occurrence
idx = start + len(macro_start)
return calls
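# Illustrative example: parentheses inside quoted strings do not terminate the call.
#   extract_macro_calls('let s = tr!("Hi (there)", "greeting");', 'tr!')
#   ->  ['"Hi (there)", "greeting"']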
def validate_placeholders(message: str, file_path: str = "") -> bool:
"""Validate that all placeholders in a message are named and start with a letter."""
import re
# Find all placeholders in the message
placeholder_pattern = r'\{([^}]*)\}'
placeholders = re.findall(placeholder_pattern, message)
valid = True
for placeholder in placeholders:
if not placeholder.strip():
print(f"[VALIDATE] Warning: Empty placeholder {{}} found in message: '{message[:100]}...' {file_path}")
valid = False
elif not placeholder[0].isalpha():
print(f"[VALIDATE] Warning: Placeholder '{{{placeholder}}}' does not start with a letter in message: '{message[:100]}...' {file_path}")
valid = False
if not valid:
print(f"[VALIDATE] Message rejected: '{message}'")
return valid
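# Illustrative examples:
#   validate_placeholders('You have {count} items')  ->  True
#   validate_placeholders('Item {0} of {total}')     ->  False (the positional
#   placeholder '{0}' does not start with a letter, so the message is rejected)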
def extract_tr_macros(content: str) -> List[Tuple[str, str]]:
"""Extract tr! macro calls from Rust code with comments."""
filtered_matches = []
# Strip comments before processing
content = strip_rust_comments(content)
# Process the entire content instead of line by line to handle multi-line macros
for macro_content in extract_macro_calls(content, 'tr!'):
args = parse_macro_arguments(macro_content)
if len(args) >= 2: # Must have at least message and comment
message = args[0].strip()
comment = args[1].strip() # Second argument is always the comment
# Debug output for identification strings
if "identification" in comment.lower():
print(f"[DEBUG] Found identification tr! macro: message='{message}', comment='{comment}', args={args}")
norm_key = normalize_key(message, comment)
print(f"[DEBUG] Normalized key: '{norm_key}'")
# Validate placeholders
if not validate_placeholders(message):
continue
# More specific filtering logic
should_skip = False
for skip in ['/', '.ftl', '.rs', 'http', 'https', 'www', 'crates/', 'src/', 'target/', 'build.rs']:
if skip in message.lower():
should_skip = True
break
# Special handling for @ - skip short strings containing @ (likely emails or handles) and known email domains
if '@' in message and (
# Skip if it's a short string that looks like an email
len(message) < 50 or
# Skip if it contains common email patterns
any(pattern in message.lower() for pattern in ['@gmail.com', '@yahoo.com', '@hotmail.com', '@outlook.com'])
):
should_skip = True
if not should_skip:
# Store as (message, comment) tuple to preserve all combinations
filtered_matches.append((message, comment))
return filtered_matches
def extract_tr_plural_macros(content: str, file_path: str = "") -> Dict[str, dict]:
"""Extract tr_plural! macro calls from Rust code with new signature, skipping macro definitions and doc comments."""
filtered_matches = {}
# Skip macro definitions
if 'macro_rules! tr_plural' in content or file_path.endswith('i18n/mod.rs'):
print(f"[DEBUG] Skipping macro definitions in {file_path}")
return filtered_matches
for macro_content in extract_macro_calls(content, 'tr_plural!'):
print(f"[DEBUG] Found tr_plural! macro in {file_path}: {macro_content}")
args = parse_macro_arguments(macro_content)
print(f"[DEBUG] Parsed args: {args}")
if len(args) >= 4:
one = args[0].strip()
other = args[1].strip()
comment = args[2].strip()
key = other
if key and not key.startswith('//') and not key.startswith('$'):
print(f"[DEBUG] Adding plural key '{key}' from {file_path}")
filtered_matches[key] = {
'one': one,
'other': other,
'comment': comment
}
return filtered_matches
def escape_rust_placeholders(text: str) -> str:
"""Convert Rust-style placeholders to Fluent-style placeholders"""
# Unescape double quotes first
text = text.replace('\\"', '"')
# Convert Rust placeholders to Fluent placeholders
return re.sub(r'\{([a-zA-Z][a-zA-Z0-9_]*)\}', r'{$\1}', text)
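# Illustrative example:
#   escape_rust_placeholders('Reply to {username}')  ->  'Reply to {$username}'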
def simple_hash(s: str) -> str:
"""Simple hash function using MD5 - matches Rust implementation, 4 hex chars"""
return hashlib.md5(s.encode('utf-8')).hexdigest()[:4]
def normalize_key(message, comment=None):
"""Normalize a message to create a consistent key - matches Rust normalize_ftl_key function"""
# Remove quotes and normalize
key = message.strip('"\'')
# Unescape double quotes
key = key.replace('\\"', '"')
# Replace each invalid character with exactly one underscore (allow hyphens and underscores)
key = re.sub(r'[^a-zA-Z0-9_-]', '_', key)
# Remove leading/trailing underscores
key = key.strip('_')
# Add 'k_' prefix if the result doesn't start with a letter (Fluent requirement)
if not (key and key[0].isalpha()):
key = "k_" + key
# If we have a comment, append a hash of it to reduce collisions
if comment:
# Create a hash of the comment and append it to the key
hash_str = f"_{simple_hash(comment)}"
key += hash_str
return key
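# Illustrative examples:
#   normalize_key('Hello, world!')  ->  'Hello__world'
# With a comment, a 4-hex-character MD5 hash of the comment is appended, e.g.
#   normalize_key('Hello, world!', 'greeting')  ->  'Hello__world_<hash>'
# where <hash> stands for the actual 4 hex characters.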
def pseudolocalize(text: str) -> str:
"""Convert English text to pseudolocalized text for testing."""
# Common pseudolocalization patterns
replacements = {
'a': 'à', 'e': 'é', 'i': 'í', 'o': 'ó', 'u': 'ú',
'A': 'À', 'E': 'É', 'I': 'Í', 'O': 'Ó', 'U': 'Ú',
'n': 'ñ', 'N': 'Ñ', 'c': 'ç', 'C': 'Ç'
}
# First, protect Fluent placeables from pseudolocalization
placeable_pattern = r'\{ *\$[a-zA-Z][a-zA-Z0-9_]* *\}'
placeables = re.findall(placeable_pattern, text)
# Replace placeables with unique placeholders that won't be modified
protected_text = text
for i, placeable in enumerate(placeables):
placeholder = f"<<PLACEABLE_{i}>>"
protected_text = protected_text.replace(placeable, placeholder, 1)
# Apply character replacements, skipping <<PLACEABLE_n>>
result = ''
i = 0
while i < len(protected_text):
if protected_text.startswith('<<PLACEABLE_', i):
end = protected_text.find('>>', i)
if end != -1:
result += protected_text[i:end+2]
i = end + 2
continue
char = protected_text[i]
result += replacements.get(char, char)
i += 1
# Restore placeables
for i, placeable in enumerate(placeables):
placeholder = f"<<PLACEABLE_{i}>>"
result = result.replace(placeholder, placeable)
# Wrap pseudolocalized string with square brackets so that it can be distinguished from other strings
return f'{{"["}}{result}{{"]"}}'
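# Illustrative example: placeables survive untouched, other characters are
# accented, and the result is wrapped in Fluent bracket literals:
#   pseudolocalize('Reply to {$user}')  ->  '{"["}Réply tó {$user}{"]"}'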
def generate_ftl_content(tr_strings: Dict[str, str],
plural_strings: Dict[str, dict],
tr_occurrences: Dict[Tuple[str, str], list],
plural_occurrences: Dict[Tuple[str, str], list],
pseudolocalize_content: bool = False) -> str:
"""Generate FTL file content from extracted strings with comments."""
lines = [
"# Main translation file for Notedeck",
"# This file contains common UI strings used throughout the application",
"# Auto-generated by extract_i18n.py - DO NOT EDIT MANUALLY",
"",
]
# Sort strings for consistent output
sorted_tr = sorted(tr_strings.items(), key=lambda item: item[0].lower())
sorted_plural = sorted(plural_strings.items(), key=lambda item: item[0].lower())
# Add regular tr! strings
if sorted_tr:
lines.append("# Regular strings")
for norm_key, (original_message, comment) in sorted_tr:
lines.append("")
# Write the comment
if comment:
lines.append(f"# {comment}")
# Apply pseudolocalization if requested
value = escape_rust_placeholders(original_message)
value = pseudolocalize(value) if pseudolocalize_content else value
lines.append(f"{norm_key} = {value}")
lines.append("")
# Add pluralized strings
if sorted_plural:
lines.append("# Pluralized strings")
for key, data in sorted_plural:
lines.append("")
one = data['one']
other = data['other']
comment = data['comment']
# Write comment
if comment:
lines.append(f"# {comment}")
norm_key = normalize_key(key, comment)
one_val = escape_rust_placeholders(one)
other_val = escape_rust_placeholders(other)
if pseudolocalize_content:
one_val = pseudolocalize(one_val)
other_val = pseudolocalize(other_val)
lines.append(f'{norm_key} =')
lines.append(f' {{ $count ->')
lines.append(f' [one] {one_val}')
lines.append(f' *[other] {other_val}')
lines.append(f' }}')
lines.append("")
return "\n".join(lines)
def read_existing_ftl(ftl_path: Path) -> Dict[str, str]:
"""Read existing FTL file to preserve comments and custom translations."""
if not ftl_path.exists():
return {}
existing_translations = {}
with open(ftl_path, 'r', encoding='utf-8') as f:
content = f.read()
# Extract key-value pairs
pattern = r'^([^#\s][^=]*?)\s*=\s*(.+)$'
for line in content.split('\n'):
match = re.match(pattern, line.strip())
if match:
key = match.group(1).strip()
value = match.group(2).strip()
# For existing FTL files, we need to handle keys that may have hash suffixes
# Strip the comment-hash suffix if present (4 hex characters after underscore, matching simple_hash above)
original_key = re.sub(r'_[0-9a-f]{4}$', '', key)
norm_key = normalize_key(original_key)
existing_translations[norm_key] = value
return existing_translations
def main():
parser = argparse.ArgumentParser(description='Extract i18n macros and generate FTL file')
parser.add_argument('--project-root', type=str, default='.',
help='Project root directory (default: current directory)')
parser.add_argument('--dry-run', action='store_true',
help='Show what would be generated without writing to file')
parser.add_argument('--fail-on-collisions', action='store_true',
help='Exit with error if key collisions are detected')
args = parser.parse_args()
project_root = Path(args.project_root)
print(f"Scanning Rust files in {project_root}...")
# Find all Rust files
rust_files = find_rust_files(project_root)
print(f"Found {len(rust_files)} Rust files")
# Extract strings from all files
all_tr_strings = {}
all_plural_strings = {}
# Track normalized keys to detect actual key collisions
all_tr_normalized_keys = {}
all_plural_normalized_keys = {}
# Track collisions
tr_collisions = {}
plural_collisions = {}
# Track all occurrences for intra-file collision detection
tr_occurrences = collections.defaultdict(list)
plural_occurrences = collections.defaultdict(list)
for rust_file in rust_files:
try:
with open(rust_file, 'r', encoding='utf-8') as f:
content = f.read()
# For intra-file collision detection
tr_lines = extract_tr_macros_with_lines(content, str(rust_file))
for key, comment, line, file_path in tr_lines:
tr_occurrences[(file_path, key)].append((comment, line))
plural_lines = extract_tr_plural_macros_with_lines(content, str(rust_file))
for key, comment, line, file_path in plural_lines:
plural_occurrences[(file_path, key)].append((comment, line))
tr_strings = extract_tr_macros(content)
plural_strings = extract_tr_plural_macros(content, str(rust_file))
if tr_strings or plural_strings:
print(f" {rust_file}: {len(tr_strings)} tr!, {len(plural_strings)} tr_plural!")
# Check for collisions in tr! strings using normalized keys
for message, comment in tr_strings:
norm_key = normalize_key(message, comment)
if norm_key in all_tr_normalized_keys:
# This is a real key collision (same normalized key)
if norm_key not in tr_collisions:
tr_collisions[norm_key] = []
tr_collisions[norm_key].append((rust_file, all_tr_normalized_keys[norm_key]))
tr_collisions[norm_key].append((rust_file, comment))
# Store by normalized key to preserve all unique combinations
all_tr_strings[norm_key] = (message, comment)
all_tr_normalized_keys[norm_key] = comment
# Check for collisions in plural strings using normalized keys
for key, data in plural_strings.items():
comment = data['comment']
norm_key = normalize_key(key, comment)
if norm_key in all_plural_normalized_keys:
# This is a real key collision (same normalized key)
if norm_key not in plural_collisions:
plural_collisions[norm_key] = []
plural_collisions[norm_key].append((rust_file, all_plural_normalized_keys[norm_key]))
plural_collisions[norm_key].append((rust_file, data))
all_plural_strings[key] = data
all_plural_normalized_keys[norm_key] = data
except Exception as e:
print(f"Error reading {rust_file}: {e}")
# Intra-file collision detection
has_intra_file_collisions = False
for (file_path, key), occurrences in tr_occurrences.items():
comments = set(c for c, _ in occurrences)
if len(occurrences) > 1 and len(comments) > 1:
has_intra_file_collisions = True
print(f"\n⚠️ Intra-file key collision in {file_path} for '{key}':")
for comment, line in occurrences:
comment_text = f" (comment: '{comment}')" if comment else " (no comment)"
print(f" Line {line}{comment_text}")
for (file_path, key), occurrences in plural_occurrences.items():
comments = set(c for c, _ in occurrences)
if len(occurrences) > 1 and len(comments) > 1:
has_intra_file_collisions = True
print(f"\n⚠️ Intra-file key collision in {file_path} for '{key}':")
for comment, line in occurrences:
comment_text = f" (comment: '{comment}')" if comment else " (no comment)"
print(f" Line {line}{comment_text}")
if has_intra_file_collisions and args.fail_on_collisions:
print(f"❌ Exiting due to intra-file key collisions (--fail-on-collisions flag)")
exit(1)
# Report collisions
has_collisions = False
if tr_collisions:
has_collisions = True
print(f"\n⚠️ Key collisions detected in tr! strings:")
for key, collisions in tr_collisions.items():
print(f" '{key}':")
for file_path, comment in collisions:
comment_text = f" (comment: '{comment}')" if comment else " (no comment)"
print(f" {file_path}{comment_text}")
if plural_collisions:
has_collisions = True
print(f"\n⚠️ Key collisions detected in tr_plural! strings:")
for key, collisions in plural_collisions.items():
print(f" '{key}':")
for file_path, comment in collisions:
comment_text = f" (comment: '{comment}')" if comment else " (no comment)"
print(f" {file_path}{comment_text}")
if has_collisions:
print(f"\n💡 Collision resolution: The last occurrence of each key will be used.")
if args.fail_on_collisions:
print(f"❌ Exiting due to key collisions (--fail-on-collisions flag)")
exit(1)
print(f"\nExtracted strings:")
print(f" Regular strings: {len(all_tr_strings)}")
print(f" Plural strings: {len(all_plural_strings)}")
# Debug: print all keys in all_tr_strings
print("[DEBUG] All tr! keys:")
for k in all_tr_strings.keys():
print(f" {k}")
# Generate FTL content for both locales
locales = ['en-US', 'en-XA']
for locale in locales:
pseudolocalize_content = (locale == 'en-XA')
ftl_content = generate_ftl_content(all_tr_strings, all_plural_strings, tr_occurrences, plural_occurrences, pseudolocalize_content)
# Note: output paths are relative to the current working directory, not --project-root
output_path = Path(f'assets/translations/{locale}/main.ftl')
if args.dry_run:
print(f"\n--- Generated FTL content for {locale} ---")
print(ftl_content)
print(f"--- End of content for {locale} ---")
else:
# Ensure output directory exists
output_path.parent.mkdir(parents=True, exist_ok=True)
# Write to file
with open(output_path, 'w', encoding='utf-8') as f:
f.write(ftl_content)
print(f"\nGenerated FTL file: {output_path}")
if not args.dry_run:
print(f"\nTotal strings: {len(all_tr_strings) + len(all_plural_strings)}")
if __name__ == '__main__':
main()