mirror of
https://github.com/aljazceru/Auditor.git
synced 2025-12-17 11:24:19 +01:00
324 lines
13 KiB
Python
324 lines
13 KiB
Python
"""AST parser using Tree-sitter for multi-language support.
|
|
|
|
This module provides true structural code analysis using Tree-sitter,
|
|
enabling high-fidelity pattern detection that understands code semantics
|
|
rather than just text matching.
|
|
"""
|
|
|
|
import ast
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import re
|
|
from dataclasses import dataclass
|
|
from functools import lru_cache
|
|
from pathlib import Path
|
|
from typing import Any, Optional, List, Dict, Union
|
|
|
|
from theauditor.js_semantic_parser import get_semantic_ast, get_semantic_ast_batch
|
|
from theauditor.ast_patterns import ASTPatternMixin
|
|
from theauditor.ast_extractors import ASTExtractorMixin
|
|
|
|
|
|
@dataclass
class ASTMatch:
    """Represents an AST pattern match.

    Attributes:
        node_type: Type name of the matched AST node.
        start_line: Line on which the match starts.
        end_line: Line on which the match ends.
        start_col: Column at which the match starts.
        snippet: Source text associated with the match.
        metadata: Optional extra information about the match; ``None`` when
            the producer attached nothing.
    """

    node_type: str
    start_line: int
    end_line: int
    start_col: int
    snippet: str
    # The default stays ``None`` (not an empty dict) so existing
    # ``metadata is None`` checks keep working; only the annotation is fixed
    # to state that ``None`` is a legal value.
    metadata: Optional[Dict[str, Any]] = None
|
|
|
|
|
|
class ASTParser(ASTPatternMixin, ASTExtractorMixin):
    """Multi-language AST parser using Tree-sitter for structural analysis.

    Parsing strategy, as implemented by :meth:`parse_file`:

    * JavaScript/TypeScript: the semantic parser (``get_semantic_ast``) is
      tried first, then Tree-sitter, and finally a ``regex_fallback`` marker
      is returned so callers can fall back to text matching.
    * Python: Tree-sitter is tried first, then the built-in :mod:`ast` module.
    * Any other language: Tree-sitter if a parser is registered for it,
      otherwise ``None``.
    """

    # Maximum number of parse results memoised per parser instance.
    _CACHE_SIZE = 500

    # Languages that are routed through the semantic parser / regex fallback.
    _JS_LANGUAGES = ("javascript", "typescript")

    # Project types for which the semantic parser is attempted on JS/TS files.
    # ``None`` is kept for safety in case project detection yields no value.
    _SEMANTIC_PROJECT_TYPES = ("javascript", "polyglot", "unknown", None)

    # Manifest files whose presence marks a Python project.
    _PYTHON_MANIFESTS = ("requirements.txt", "pyproject.toml", "setup.py")

    # File extension (lower-case, with dot) -> language name.
    _EXTENSION_LANGUAGES = {
        ".py": "python",
        ".js": "javascript",
        ".jsx": "javascript",
        ".ts": "typescript",
        ".tsx": "typescript",
        ".mjs": "javascript",
        ".cjs": "javascript",
        ".vue": "javascript",  # Vue SFCs contain JavaScript/TypeScript
    }

    def __init__(self):
        """Initialize parser state and load Tree-sitter grammars if possible.

        When the ``tree_sitter`` package is missing a warning is printed and
        the instance works with the built-in/regex fallbacks only.

        Raises:
            RuntimeError: If Tree-sitter and its language pack are installed
                but the JavaScript or TypeScript grammar cannot be loaded.
        """
        self.has_tree_sitter = False
        self.parsers = {}
        self.languages = {}
        self.project_type = None  # Cache project type detection

        # Memoise parse results per instance.  Decorating the methods with
        # ``lru_cache`` directly would store ``self`` in a class-level cache
        # and keep every parser instance (and its cached trees) alive for the
        # lifetime of the process.
        self._python_cache = lru_cache(maxsize=self._CACHE_SIZE)(
            lambda content_hash, content: self._parse_python_builtin(content)
        )
        self._treesitter_cache = lru_cache(maxsize=self._CACHE_SIZE)(
            lambda content_hash, content, language: self.parsers[language].parse(content)
        )

        # Try to import tree-sitter and language bindings
        try:
            import tree_sitter
        except ImportError:
            print("Warning: Tree-sitter not available. Install with: pip install tree-sitter tree-sitter-python tree-sitter-javascript tree-sitter-typescript")
            return

        self.tree_sitter = tree_sitter
        self.has_tree_sitter = True
        self._init_tree_sitter_parsers()

    def _register_grammar(self, name, get_language, get_parser):
        """Load the grammar and parser for ``name`` and register both.

        Nothing is registered if either lookup raises, so ``self.parsers``
        and ``self.languages`` never hold a half-initialised entry.
        """
        language = get_language(name)
        parser = get_parser(name)
        self.parsers[name] = parser
        self.languages[name] = language

    def _init_tree_sitter_parsers(self):
        """Initialize Tree-sitter language parsers with proper bindings.

        A Python grammar failure only prints a warning because the built-in
        ``ast`` module is available as a fallback.  JavaScript and TypeScript
        grammar failures are fatal.  A missing ``tree_sitter_language_pack``
        prints an error and disables Tree-sitter for this instance.

        Raises:
            RuntimeError: If the JavaScript or TypeScript grammar fails to load.
        """
        if not self.has_tree_sitter:
            return

        # Use tree-sitter-language-pack for all languages
        try:
            from tree_sitter_language_pack import get_language, get_parser
        except ImportError as e:
            # If tree-sitter is installed but language pack is not, this is a critical error
            # The user clearly intends to use tree-sitter, so we should fail loudly
            print(f"ERROR: tree-sitter is installed but tree-sitter-language-pack is not: {e}")
            print("This means tree-sitter AST analysis cannot work properly.")
            print("Please install with: pip install tree-sitter-language-pack")
            print("Or install TheAuditor with full AST support: pip install -e '.[ast]'")
            # Set flags to indicate no language support
            self.has_tree_sitter = False
            # Don't raise - allow fallback to regex-based parsing
            return

        # Python parser: has a built-in fallback, so continue with a warning
        try:
            self._register_grammar("python", get_language, get_parser)
        except Exception as e:
            print(f"Warning: Failed to initialize Python parser: {e}")
            print(" AST analysis for Python will use built-in parser as fallback.")

        # JavaScript / TypeScript parsers (CRITICAL - must fail fast)
        for name, label in (("javascript", "JavaScript"), ("typescript", "TypeScript")):
            try:
                self._register_grammar(name, get_language, get_parser)
            except Exception as e:
                raise RuntimeError(
                    f"Failed to load tree-sitter grammar for {label}: {e}\n"
                    "This is often due to missing build tools or corrupted installation.\n"
                    "Please try: pip install --force-reinstall tree-sitter-language-pack\n"
                    "Or install with AST support: pip install -e '.[ast]'"
                ) from e

    def _detect_project_type(self) -> str:
        """Detect the primary project type based on manifest files.

        Manifests are looked up relative to the current working directory and
        the result is cached on ``self.project_type`` after the first call.

        Returns:
            'polyglot' if multiple language manifest files exist
            'javascript' if only package.json exists
            'python' if only Python manifest files exist
            'go' if only go.mod exists
            'unknown' otherwise
        """
        if self.project_type is not None:
            return self.project_type

        # Check all manifest files first
        presence = (
            ("javascript", Path("package.json").exists()),
            ("python", any(Path(name).exists() for name in self._PYTHON_MANIFESTS)),
            ("go", Path("go.mod").exists()),
        )
        detected = [kind for kind, present in presence if present]

        # More than one ecosystem present means a mixed (polyglot) project
        if len(detected) > 1:
            self.project_type = "polyglot"
        elif detected:
            self.project_type = detected[0]
        else:
            self.project_type = "unknown"

        return self.project_type

    def _parse_semantic(self, file_path, language, content, root_path):
        """Try the semantic (TypeScript Compiler API) parser for a JS/TS file.

        Args:
            file_path: Path to the source file.
            language: "javascript" or "typescript".
            content: Raw bytes of the file, echoed back decoded in the result.
            root_path: Project root forwarded to ``get_semantic_ast``.

        Returns:
            A ``semantic_ast`` result dict on success, or None after printing
            a warning when the semantic parser reports failure or raises.
        """
        try:
            # Normalize path for cross-platform compatibility
            normalized_path = str(file_path).replace("\\", "/")
            semantic_result = get_semantic_ast(normalized_path, project_root=root_path)

            if semantic_result.get("success"):
                # Return the semantic AST with full type information
                return {
                    "type": "semantic_ast",
                    "tree": semantic_result,
                    "language": language,
                    "content": content.decode("utf-8", errors="ignore"),
                    "has_types": semantic_result.get("hasTypes", False),
                    "diagnostics": semantic_result.get("diagnostics", []),
                    "symbols": semantic_result.get("symbols", []),
                }

            # Log but let the caller continue to Tree-sitter/regex fallback
            error_msg = semantic_result.get('error', 'Unknown error')
            print(f"Warning: Semantic parser failed for {file_path}: {error_msg}")
            print(" Falling back to Tree-sitter/regex parser.")
        except Exception as e:
            # Log but let the caller continue to Tree-sitter/regex fallback
            print(f"Warning: Exception in semantic parser for {file_path}: {e}")
            print(" Falling back to Tree-sitter/regex parser.")
        return None

    def parse_file(
        self,
        file_path: Union[str, Path],
        language: Optional[str] = None,
        root_path: Optional[str] = None,
    ) -> Optional[Dict[str, Any]]:
        """Parse a file into an AST.

        Args:
            file_path: Path to the source file.
            language: Programming language (auto-detected if None).
            root_path: Absolute path to project root (for sandbox resolution).

        Returns:
            A dict with keys ``type``, ``tree``, ``language`` and ``content``
            (plus ``has_types``, ``diagnostics`` and ``symbols`` for
            ``semantic_ast``), where ``type`` is one of ``semantic_ast``,
            ``tree_sitter``, ``python_ast`` or ``regex_fallback``.  Note that
            ``content`` is raw ``bytes`` for ``tree_sitter`` results and a
            decoded ``str`` for all others.  Returns None if the file cannot
            be read, parsing fails, or the language is unsupported.
        """
        if language is None:
            language = self._detect_language(file_path)

        try:
            with open(file_path, "rb") as f:
                content = f.read()

            # Content hash is used purely as a cache key, not for security
            content_hash = hashlib.md5(content).hexdigest()

            # For JavaScript/TypeScript, try semantic parser first.  Project
            # detection always runs so ``self.project_type`` gets cached.
            project_type = self._detect_project_type()
            if language in self._JS_LANGUAGES and project_type in self._SEMANTIC_PROJECT_TYPES:
                semantic = self._parse_semantic(file_path, language, content, root_path)
                if semantic is not None:
                    return semantic

            # Use Tree-sitter if available
            if self.has_tree_sitter and language in self.parsers:
                try:
                    tree = self._parse_treesitter_cached(content_hash, content, language)
                    return {"type": "tree_sitter", "tree": tree, "language": language, "content": content}
                except Exception as e:
                    print(f"Warning: Tree-sitter parsing failed for {file_path}: {e}")
                    print(" Falling back to alternative parser if available.")

            # Fallback to built-in parsers for Python
            if language == "python":
                decoded = content.decode("utf-8", errors="ignore")
                python_ast = self._parse_python_cached(content_hash, decoded)
                if python_ast:
                    return {"type": "python_ast", "tree": python_ast, "language": language, "content": decoded}

            # Return minimal structure to signal regex fallback for JS/TS
            if language in self._JS_LANGUAGES:
                print(f"Warning: AST parsing unavailable for {file_path}. Using regex fallback.")
                decoded = content.decode("utf-8", errors="ignore")
                return {"type": "regex_fallback", "tree": None, "language": language, "content": decoded}

            # Return None for unsupported languages
            return None

        except Exception as e:
            print(f"Warning: Failed to parse {file_path}: {e}")
            return None

    def _detect_language(self, file_path: Union[str, Path]) -> str:
        """Detect language from file extension.

        Args:
            file_path: Path (or path string) of the file to classify.

        Returns:
            The language name, or an empty string when the extension is not
            recognised.
        """
        # Path() makes plain string paths work as well as Path objects
        suffix = Path(file_path).suffix.lower()
        return self._EXTENSION_LANGUAGES.get(suffix, "")  # Empty not unknown

    def _parse_python_builtin(self, content: str) -> Optional[ast.AST]:
        """Parse Python code using built-in ast module.

        Returns:
            The parsed module, or None if the code has a syntax error.
        """
        try:
            return ast.parse(content)
        except SyntaxError:
            return None

    def _parse_python_cached(self, content_hash: str, content: str) -> Optional[ast.AST]:
        """Parse Python code with per-instance caching.

        Args:
            content_hash: MD5 hash of the file content
            content: The actual file content

        Returns:
            Parsed AST or None if parsing fails.  The same tree object is
            returned for repeated calls, so callers must not mutate it.
        """
        return self._python_cache(content_hash, content)

    def _parse_treesitter_cached(self, content_hash: str, content: bytes, language: str) -> Any:
        """Parse code using Tree-sitter with per-instance caching.

        Args:
            content_hash: MD5 hash of the file content
            content: The actual file content as bytes
            language: The programming language

        Returns:
            Parsed Tree-sitter tree

        Raises:
            KeyError: If no parser is registered for ``language``.
        """
        return self._treesitter_cache(content_hash, content, language)

    def supports_language(self, language: str) -> bool:
        """Check if a language is supported for AST parsing.

        Args:
            language: Programming language name.

        Returns:
            True if AST parsing is supported.
        """
        # Python is always supported via the built-in ast module, and
        # JavaScript/TypeScript are always supported via fallback
        if language == "python" or language in self._JS_LANGUAGES:
            return True

        # Check Tree-sitter support for other languages
        return self.has_tree_sitter and language in self.parsers

    def get_supported_languages(self) -> List[str]:
        """Get list of supported languages.

        Returns:
            Sorted list of unique language names.
        """
        # Always supported via built-in or fallback
        languages = {"python", *self._JS_LANGUAGES}

        if self.has_tree_sitter:
            languages.update(self.parsers)

        return sorted(languages)
|