#!/usr/bin/env python3 """ Export US English (en-US) strings defined in tr! and tr_plural! macros in Rust code by generating a main.ftl file that can be used for translating into other languages. This script also creates a Psuedolocalized English (en-XA) main.ftl file with a given number of characters accented, so that developers can easily detect which strings have been internationalized or not without needing to have actual translations for a non-English language instead. """ import os import re import argparse from pathlib import Path from typing import Set, Dict, List, Tuple import json import collections import hashlib def find_rust_files(project_root: Path) -> List[Path]: """Find all Rust files in the project.""" rust_files = [] for root, dirs, files in os.walk(project_root): # Skip irrelevant directories dirs[:] = [d for d in dirs if d not in ['target', '.git', '.cargo']] for file in files: # Find only Rust source files if file.endswith('.rs'): rust_files.append(Path(root) / file) return rust_files def strip_rust_comments(code: str) -> str: """Remove // line comments, /* ... */ block comments, and doc comments (///, //!, //! ...) from Rust code.""" # Remove block comments first code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL) # Remove line comments code = re.sub(r'//.*', '', code) # Remove doc comments (/// and //! at start of line) code = re.sub(r'^\s*///.*$', '', code, flags=re.MULTILINE) code = re.sub(r'^\s*//!.*$', '', code, flags=re.MULTILINE) return code def extract_tr_macros_with_lines(content: str, file_path: str) -> dict: """Extract tr! macro calls from Rust code with comments and line numbers. Handles multi-line macros.""" matches = [] # Strip comments before processing content = strip_rust_comments(content) # Search the entire content for tr! macro calls (multi-line aware) for macro_content in extract_macro_calls(content, 'tr!'): args = parse_macro_arguments(macro_content) if len(args) >= 2: # Must have at least message and comment message = args[0].strip() comment = args[1].strip() # Second argument is always the comment # Validate placeholders if not validate_placeholders(message, file_path): continue if not any(skip in message.lower() for skip in [ '/', '\\', '.ftl', '.rs', 'http', 'https', 'www', '@', 'crates/', 'src/', 'target/', 'build.rs']): # Find the line number where this macro starts macro_start = f'tr!({macro_content}' idx = content.find(macro_start) line_num = content[:idx].count('\n') + 1 if idx != -1 else 1 matches.append((message, comment, line_num, file_path)) return matches def extract_tr_plural_macros_with_lines(content: str, file_path: str) -> dict: """Extract tr_plural! macro calls from Rust code with new signature and correct keying, skipping macro definitions and doc comments.""" matches = [] # Skip macro definitions if 'macro_rules! tr_plural' in content or file_path.endswith('i18n/mod.rs'): return matches for idx, macro_content in enumerate(extract_macro_calls(content, 'tr_plural!')): args = parse_macro_arguments(macro_content) if len(args) >= 4: one = args[0].strip() other = args[1].strip() comment = args[2].strip() key = other if key and not key.startswith('//') and not key.startswith('$'): matches.append((key, comment, idx + 1, file_path)) return matches def parse_macro_arguments(content: str) -> List[str]: """Parse macro arguments, handling quoted strings, param = value pairs, commas, and inline comments.""" # Remove all // comments content = re.sub(r'//.*', '', content) # Collapse all whitespace/newlines to a single space content = re.sub(r'\s+', ' ', content.strip()) args = [] i = 0 n = len(content) while i < n: # Skip whitespace while i < n and content[i].isspace(): i += 1 if i >= n: break # Handle quoted strings if content[i] in ['"', "'"]: quote_char = content[i] i += 1 arg_start = i while i < n: if content[i] == '\\' and i + 1 < n: i += 2 elif content[i] == quote_char: break else: i += 1 arg = content[arg_start:i] args.append(arg) i += 1 # Skip closing quote else: arg_start = i paren_count = 0 brace_count = 0 while i < n: char = content[i] if char == '(': paren_count += 1 elif char == ')': paren_count -= 1 elif char == '{': brace_count += 1 elif char == '}': brace_count -= 1 elif char == ',' and paren_count == 0 and brace_count == 0: break i += 1 arg = content[arg_start:i].strip() if arg: args.append(arg) # Skip the comma if we found one if i < n and content[i] == ',': i += 1 return args def extract_macro_calls(content: str, macro_name: str): """Extract all macro calls of the given macro_name from the entire content, handling parentheses inside quoted strings and multi-line macros.""" calls = [] idx = 0 macro_start = f'{macro_name}(' content_len = len(content) while idx < content_len: start = content.find(macro_start, idx) if start == -1: break i = start + len(macro_start) paren_count = 1 # Start after the initial '(' in_quote = False quote_char = '' macro_content = '' while i < content_len: c = content[i] if in_quote: macro_content += c if c == quote_char and (i == 0 or content[i-1] != '\\'): in_quote = False else: if c in ('"', "'"): in_quote = True quote_char = c macro_content += c elif c == '(': paren_count += 1 macro_content += c elif c == ')': paren_count -= 1 if paren_count == 0: break else: macro_content += c else: macro_content += c i += 1 # Only add if we found a closing parenthesis if i < content_len and content[i] == ')': calls.append(macro_content) idx = i + 1 else: # Malformed macro, skip past this occurrence idx = start + len(macro_start) return calls def validate_placeholders(message: str, file_path: str = "") -> bool: """Validate that all placeholders in a message are named and start with a letter.""" import re # Find all placeholders in the message placeholder_pattern = r'\{([^}]*)\}' placeholders = re.findall(placeholder_pattern, message) valid = True for placeholder in placeholders: if not placeholder.strip(): print(f"[VALIDATE] Warning: Empty placeholder {{}} found in message: '{message[:100]}...' {file_path}") valid = False elif not placeholder[0].isalpha(): print(f"[VALIDATE] Warning: Placeholder '{{{placeholder}}}' does not start with a letter in message: '{message[:100]}...' {file_path}") valid = False if not valid: print(f"[VALIDATE] Message rejected: '{message}'") return valid def extract_tr_macros(content: str) -> List[Tuple[str, str]]: """Extract tr! macro calls from Rust code with comments.""" filtered_matches = [] # Strip comments before processing content = strip_rust_comments(content) # Process the entire content instead of line by line to handle multi-line macros for macro_content in extract_macro_calls(content, 'tr!'): args = parse_macro_arguments(macro_content) if len(args) >= 2: # Must have at least message and comment message = args[0].strip() comment = args[1].strip() # Second argument is always the comment # Debug output for identification strings if "identification" in comment.lower(): print(f"[DEBUG] Found identification tr! macro: message='{message}', comment='{comment}', args={args}") norm_key = normalize_key(message, comment) print(f"[DEBUG] Normalized key: '{norm_key}'") # Validate placeholders if not validate_placeholders(message): continue # More specific filtering logic should_skip = False for skip in ['/', '.ftl', '.rs', 'http', 'https', 'www', 'crates/', 'src/', 'target/', 'build.rs']: if skip in message.lower(): should_skip = True break # Special handling for @ - only skip if it looks like an actual email address if '@' in message and ( # Skip if it's a short string that looks like an email len(message) < 50 or # Skip if it contains common email patterns any(pattern in message.lower() for pattern in ['@gmail.com', '@yahoo.com', '@hotmail.com', '@outlook.com']) ): should_skip = True if not should_skip: # Store as (message, comment) tuple to preserve all combinations filtered_matches.append((message, comment)) return filtered_matches def extract_tr_plural_macros(content: str, file_path: str = "") -> Dict[str, dict]: """Extract tr_plural! macro calls from Rust code with new signature, skipping macro definitions and doc comments.""" filtered_matches = {} # Skip macro definitions if 'macro_rules! tr_plural' in content or file_path.endswith('i18n/mod.rs'): print(f"[DEBUG] Skipping macro definitions in {file_path}") return filtered_matches for macro_content in extract_macro_calls(content, 'tr_plural!'): print(f"[DEBUG] Found tr_plural! macro in {file_path}: {macro_content}") args = parse_macro_arguments(macro_content) print(f"[DEBUG] Parsed args: {args}") if len(args) >= 4: one = args[0].strip() other = args[1].strip() comment = args[2].strip() key = other if key and not key.startswith('//') and not key.startswith('$'): print(f"[DEBUG] Adding plural key '{key}' from {file_path}") filtered_matches[key] = { 'one': one, 'other': other, 'comment': comment } return filtered_matches def escape_rust_placeholders(text: str) -> str: """Convert Rust-style placeholders to Fluent-style placeholders""" # Unescape double quotes first text = text.replace('\\"', '"') # Convert Rust placeholders to Fluent placeholders return re.sub(r'\{([a-zA-Z][a-zA-Z0-9_]*)\}', r'{$\1}', text) def simple_hash(s: str) -> str: """Simple hash function using MD5 - matches Rust implementation, 4 hex chars""" return hashlib.md5(s.encode('utf-8')).hexdigest()[:4] def normalize_key(message, comment=None): """Normalize a message to create a consistent key - matches Rust normalize_ftl_key function""" # Remove quotes and normalize key = message.strip('"\'') # Unescape double quotes key = key.replace('\\"', '"') # Replace each invalid character with exactly one underscore (allow hyphens and underscores) key = re.sub(r'[^a-zA-Z0-9_-]', '_', key) # Remove leading/trailing underscores key = key.strip('_') # Add 'k_' prefix if the result doesn't start with a letter (Fluent requirement) if not (key and key[0].isalpha()): key = "k_" + key # If we have a comment, append a hash of it to reduce collisions if comment: # Create a hash of the comment and append it to the key hash_str = f"_{simple_hash(comment)}" key += hash_str return key def pseudolocalize(text: str) -> str: """Convert English text to pseudolocalized text for testing.""" # Common pseudolocalization patterns replacements = { 'a': 'à', 'e': 'é', 'i': 'í', 'o': 'ó', 'u': 'ú', 'A': 'À', 'E': 'É', 'I': 'Í', 'O': 'Ó', 'U': 'Ú', 'n': 'ñ', 'N': 'Ñ', 'c': 'ç', 'C': 'Ç' } # First, protect Fluent placeables from pseudolocalization placeable_pattern = r'\{ *\$[a-zA-Z][a-zA-Z0-9_]* *\}' placeables = re.findall(placeable_pattern, text) # Replace placeables with unique placeholders that won't be modified protected_text = text for i, placeable in enumerate(placeables): placeholder = f"<>" protected_text = protected_text.replace(placeable, placeholder, 1) # Apply character replacements, skipping <> result = '' i = 0 while i < len(protected_text): if protected_text.startswith('<>', i) if end != -1: result += protected_text[i:end+2] i = end + 2 continue char = protected_text[i] result += replacements.get(char, char) i += 1 # Restore placeables for i, placeable in enumerate(placeables): placeholder = f"<>" result = result.replace(placeholder, placeable) # Wrap pseudolocalized string with square brackets so that it can be distinguished from other strings return f'{{"["}}{result}{{"]"}}' def generate_ftl_content(tr_strings: Dict[str, str], plural_strings: Dict[str, dict], tr_occurrences: Dict[Tuple[str, str], list], plural_occurrences: Dict[Tuple[str, str], list], pseudolocalize_content: bool = False) -> str: """Generate FTL file content from extracted strings with comments.""" lines = [ "# Main translation file for Notedeck", "# This file contains common UI strings used throughout the application", "# Auto-generated by extract_i18n.py - DO NOT EDIT MANUALLY", "", ] # Sort strings for consistent output sorted_tr = sorted(tr_strings.items(), key=lambda item: item[0].lower()) sorted_plural = sorted(plural_strings.items(), key=lambda item: item[0].lower()) # Add regular tr! strings if sorted_tr: lines.append("# Regular strings") for norm_key, (original_message, comment) in sorted_tr: lines.append("") # Write the comment if comment: lines.append(f"# {comment}") # Apply pseudolocalization if requested value = escape_rust_placeholders(original_message) value = pseudolocalize(value) if pseudolocalize_content else value lines.append(f"{norm_key} = {value}") lines.append("") # Add pluralized strings if sorted_plural: lines.append("# Pluralized strings") for key, data in sorted_plural: lines.append("") one = data['one'] other = data['other'] comment = data['comment'] # Write comment if comment: lines.append(f"# {comment}") norm_key = normalize_key(key, comment) one_val = escape_rust_placeholders(one) other_val = escape_rust_placeholders(other) if pseudolocalize_content: one_val = pseudolocalize(one_val) other_val = pseudolocalize(other_val) lines.append(f'{norm_key} =') lines.append(f' {{ $count ->') lines.append(f' [one] {one_val}') lines.append(f' *[other] {other_val}') lines.append(f' }}') lines.append("") return "\n".join(lines) def read_existing_ftl(ftl_path: Path) -> Dict[str, str]: """Read existing FTL file to preserve comments and custom translations.""" if not ftl_path.exists(): return {} existing_translations = {} with open(ftl_path, 'r', encoding='utf-8') as f: content = f.read() # Extract key-value pairs pattern = r'^([^#\s][^=]*?)\s*=\s*(.+)$' for line in content.split('\n'): match = re.match(pattern, line.strip()) if match: key = match.group(1).strip() value = match.group(2).strip() # For existing FTL files, we need to handle keys that may have hash suffixes # Strip the hash suffix if present (8 hex characters after underscore) original_key = re.sub(r'_[0-9a-f]{8}$', '', key) norm_key = normalize_key(original_key) existing_translations[norm_key] = value return existing_translations def main(): parser = argparse.ArgumentParser(description='Extract i18n macros and generate FTL file') parser.add_argument('--project-root', type=str, default='.', help='Project root directory (default: current directory)') parser.add_argument('--dry-run', action='store_true', help='Show what would be generated without writing to file') parser.add_argument('--fail-on-collisions', action='store_true', help='Exit with error if key collisions are detected') args = parser.parse_args() project_root = Path(args.project_root) print(f"Scanning Rust files in {project_root}...") # Find all Rust files rust_files = find_rust_files(project_root) print(f"Found {len(rust_files)} Rust files") # Extract strings from all files all_tr_strings = {} all_plural_strings = {} # Track normalized keys to detect actual key collisions all_tr_normalized_keys = {} all_plural_normalized_keys = {} # Track collisions tr_collisions = {} plural_collisions = {} # Track all occurrences for intra-file collision detection tr_occurrences = collections.defaultdict(list) plural_occurrences = collections.defaultdict(list) for rust_file in rust_files: try: with open(rust_file, 'r', encoding='utf-8') as f: content = f.read() # For intra-file collision detection tr_lines = extract_tr_macros_with_lines(content, str(rust_file)) for key, comment, line, file_path in tr_lines: tr_occurrences[(file_path, key)].append((comment, line)) plural_lines = extract_tr_plural_macros_with_lines(content, str(rust_file)) for key, comment, line, file_path in plural_lines: plural_occurrences[(file_path, key)].append((comment, line)) tr_strings = extract_tr_macros(content) plural_strings = extract_tr_plural_macros(content, str(rust_file)) if tr_strings or plural_strings: print(f" {rust_file}: {len(tr_strings)} tr!, {len(plural_strings)} tr_plural!") # Check for collisions in tr! strings using normalized keys for message, comment in tr_strings: norm_key = normalize_key(message, comment) if norm_key in all_tr_normalized_keys: # This is a real key collision (same normalized key) if norm_key not in tr_collisions: tr_collisions[norm_key] = [] tr_collisions[norm_key].append((rust_file, all_tr_normalized_keys[norm_key])) tr_collisions[norm_key].append((rust_file, comment)) # Store by normalized key to preserve all unique combinations all_tr_strings[norm_key] = (message, comment) all_tr_normalized_keys[norm_key] = comment # Check for collisions in plural strings using normalized keys for key, data in plural_strings.items(): comment = data['comment'] norm_key = normalize_key(key, comment) if norm_key in all_plural_normalized_keys: # This is a real key collision (same normalized key) if norm_key not in plural_collisions: plural_collisions[norm_key] = [] plural_collisions[norm_key].append((rust_file, all_plural_normalized_keys[norm_key])) plural_collisions[norm_key].append((rust_file, data)) all_plural_strings[key] = data all_plural_normalized_keys[norm_key] = data except Exception as e: print(f"Error reading {rust_file}: {e}") # Intra-file collision detection has_intra_file_collisions = False for (file_path, key), occurrences in tr_occurrences.items(): comments = set(c for c, _ in occurrences) if len(occurrences) > 1 and len(comments) > 1: has_intra_file_collisions = True print(f"\n⚠️ Intra-file key collision in {file_path} for '{key}':") for comment, line in occurrences: comment_text = f" (comment: '{comment}')" if comment else " (no comment)" print(f" Line {line}{comment_text}") for (file_path, key), occurrences in plural_occurrences.items(): comments = set(c for c, _ in occurrences) if len(occurrences) > 1 and len(comments) > 1: has_intra_file_collisions = True print(f"\n⚠️ Intra-file key collision in {file_path} for '{key}':") for comment, line in occurrences: comment_text = f" (comment: '{comment}')" if comment else " (no comment)" print(f" Line {line}{comment_text}") if has_intra_file_collisions and args.fail_on_collisions: print(f"❌ Exiting due to intra-file key collisions (--fail-on-collisions flag)") exit(1) # Report collisions has_collisions = False if tr_collisions: has_collisions = True print(f"\n⚠️ Key collisions detected in tr! strings:") for key, collisions in tr_collisions.items(): print(f" '{key}':") for file_path, comment in collisions: comment_text = f" (comment: '{comment}')" if comment else " (no comment)" print(f" {file_path}{comment_text}") if plural_collisions: has_collisions = True print(f"\n⚠️ Key collisions detected in tr_plural! strings:") for key, collisions in plural_collisions.items(): print(f" '{key}':") for file_path, comment in collisions: comment_text = f" (comment: '{comment}')" if comment else " (no comment)" print(f" {file_path}{comment_text}") if has_collisions: print(f"\n💡 Collision resolution: The last occurrence of each key will be used.") if args.fail_on_collisions: print(f"❌ Exiting due to key collisions (--fail-on-collisions flag)") exit(1) print(f"\nExtracted strings:") print(f" Regular strings: {len(all_tr_strings)}") print(f" Plural strings: {len(all_plural_strings)}") # Debug: print all keys in all_tr_strings print("[DEBUG] All tr! keys:") for k in all_tr_strings.keys(): print(f" {k}") # Generate FTL content for both locales locales = ['en-US', 'en-XA'] for locale in locales: pseudolocalize_content = (locale == 'en-XA') ftl_content = generate_ftl_content(all_tr_strings, all_plural_strings, tr_occurrences, plural_occurrences, pseudolocalize_content) output_path = Path(f'assets/translations/{locale}/main.ftl') if args.dry_run: print(f"\n--- Generated FTL content for {locale} ---") print(ftl_content) print(f"--- End of content for {locale} ---") else: # Ensure output directory exists output_path.parent.mkdir(parents=True, exist_ok=True) # Write to file with open(output_path, 'w', encoding='utf-8') as f: f.write(ftl_content) print(f"\nGenerated FTL file: {output_path}") if not args.dry_run: print(f"\nTotal strings: {len(all_tr_strings) + len(all_plural_strings)}") if __name__ == '__main__': main()