From ba5c287b0268d84ca36cea08a3e0879e98d8634e Mon Sep 17 00:00:00 2001 From: TheAuditorTool Date: Sun, 7 Sep 2025 20:39:47 +0700 Subject: [PATCH] Initial commit: TheAuditor v1.0.1 - AI-centric SAST and Code Intelligence Platform --- .gitignore | 126 ++ ARCHITECTURE.md | 606 ++++++++ CLAUDE.md | 454 ++++++ CONTRIBUTING.md | 429 ++++++ HOWTOUSE.md | 1132 +++++++++++++++ LICENSE | 687 +++++++++ README.md | 313 ++++ ROADMAP.md | 71 + agent_templates/generic-template.md | 30 + agent_templates/sopmanager.md | 47 + package-template.json | 15 + package.json | 15 + pyproject.toml | 113 ++ theauditor/.gitattributes | 2 + theauditor/__init__.py | 3 + theauditor/agent_template_validator.py | 347 +++++ theauditor/ast_extractors/__init__.py | 348 +++++ theauditor/ast_extractors/base.py | 173 +++ theauditor/ast_extractors/python_impl.py | 327 +++++ theauditor/ast_extractors/treesitter_impl.py | 711 +++++++++ theauditor/ast_extractors/typescript_impl.py | 674 +++++++++ theauditor/ast_parser.py | 323 +++++ theauditor/ast_patterns.py | 401 ++++++ theauditor/claude_setup.py | 273 ++++ theauditor/cli.py | 239 ++++ theauditor/commands/__init__.py | 1 + theauditor/commands/_archive.py | 107 ++ theauditor/commands/deps.py | 191 +++ theauditor/commands/detect_frameworks.py | 46 + theauditor/commands/detect_patterns.py | 81 ++ theauditor/commands/docker_analyze.py | 94 ++ theauditor/commands/docs.py | 201 +++ theauditor/commands/fce.py | 43 + theauditor/commands/full.py | 90 ++ theauditor/commands/graph.py | 639 +++++++++ theauditor/commands/impact.py | 118 ++ theauditor/commands/index.py | 50 + theauditor/commands/init.py | 143 ++ theauditor/commands/init_config.py | 21 + theauditor/commands/init_js.py | 41 + theauditor/commands/insights.py | 443 ++++++ theauditor/commands/lint.py | 267 ++++ theauditor/commands/ml.py | 165 +++ theauditor/commands/refactor.py | 600 ++++++++ theauditor/commands/report.py | 66 + theauditor/commands/rules.py | 226 +++ theauditor/commands/setup.py | 63 + theauditor/commands/structure.py | 96 ++ theauditor/commands/summary.py | 236 +++ theauditor/commands/taint.py | 272 ++++ theauditor/commands/tool_versions.py | 25 + theauditor/commands/validate_templates.py | 30 + theauditor/commands/workset.py | 55 + theauditor/config.py | 40 + theauditor/config_runtime.py | 160 +++ theauditor/correlations/__init__.py | 5 + theauditor/correlations/loader.py | 237 +++ .../rules/angular_sanitization_cluster.yml | 10 + .../rules/api_key_exposure_cluster.yml | 10 + .../rules/command_injection_cluster.yml | 10 + .../rules/container_escape_cluster.yml | 10 + .../rules/cors_misconfiguration_cluster.yml | 10 + .../correlations/rules/deadlock_cluster.yml | 10 + .../rules/debug_enabled_cluster.yml | 10 + .../rules/express_bodyparser_cluster.yml | 10 + .../rules/infinite_loop_cluster.yml | 10 + .../correlations/rules/jwt_issues_cluster.yml | 10 + .../rules/ldap_injection_cluster.yml | 10 + .../rules/memory_leak_cluster.yml | 10 + .../rules/missing_auth_cluster.yml | 10 + .../rules/nosql_injection_cluster.yml | 10 + .../rules/path_traversal_cluster.yml | 10 + .../correlations/rules/pii_leak_cluster.yml | 10 + .../rules/race_condition_cluster.yml | 10 + .../rules/rate_limit_missing_cluster.yml | 10 + .../rules/react_dangerous_html_cluster.yml | 10 + .../correlations/rules/refactoring.yaml | 277 ++++ .../rules/sensitive_logs_cluster.yml | 10 + .../rules/session_fixation_cluster.yml | 10 + .../rules/source_map_exposure_cluster.yml | 10 + .../correlations/rules/ssrf_cluster.yml | 10 + 
.../rules/template_injection_cluster.yml | 10 + .../correlations/rules/test_sql_injection.yml | 10 + .../correlations/rules/vue_v_html_cluster.yml | 10 + .../correlations/rules/weak_auth_cluster.yml | 10 + theauditor/correlations/rules/xss_cluster.yml | 10 + theauditor/correlations/rules/xxe_cluster.yml | 10 + theauditor/deps.py | 1109 ++++++++++++++ theauditor/docgen.py | 565 ++++++++ theauditor/docker_analyzer.py | 310 ++++ theauditor/docs_fetch.py | 793 ++++++++++ theauditor/docs_summarize.py | 408 ++++++ theauditor/extraction.py | 493 +++++++ theauditor/fce.py | 784 ++++++++++ theauditor/framework_detector.py | 608 ++++++++ theauditor/framework_registry.py | 549 +++++++ theauditor/graph/__init__.py | 45 + theauditor/graph/analyzer.py | 421 ++++++ theauditor/graph/builder.py | 1017 +++++++++++++ theauditor/graph/insights.py | 17 + theauditor/graph/store.py | 444 ++++++ theauditor/graph/visualizer.py | 937 ++++++++++++ theauditor/impact_analyzer.py | 683 +++++++++ theauditor/indexer/__init__.py | 393 +++++ theauditor/indexer/config.py | 165 +++ theauditor/indexer/core.py | 409 ++++++ theauditor/indexer/database.py | 607 ++++++++ theauditor/indexer/extractors/__init__.py | 287 ++++ theauditor/indexer/extractors/docker.py | 279 ++++ theauditor/indexer/extractors/generic.py | 121 ++ theauditor/indexer/extractors/javascript.py | 345 +++++ theauditor/indexer/extractors/python.py | 189 +++ theauditor/indexer/extractors/sql.py | 44 + theauditor/indexer_compat.py | 321 +++++ theauditor/init.py | 182 +++ theauditor/insights/__init__.py | 86 ++ theauditor/insights/graph.py | 470 ++++++ theauditor/insights/ml.py | 1241 ++++++++++++++++ theauditor/insights/taint.py | 446 ++++++ theauditor/journal.py | 446 ++++++ theauditor/js_init.py | 154 ++ theauditor/js_semantic_parser.py | 1270 +++++++++++++++++ theauditor/linters/__init__.py | 36 + theauditor/linters/detector.py | 275 ++++ theauditor/linters/eslint.config.cjs | 119 ++ theauditor/linters/package.json | 17 + theauditor/linters/parsers.py | 504 +++++++ theauditor/linters/runner.py | 387 +++++ theauditor/manifest_parser.py | 183 +++ theauditor/ml.py | 17 + theauditor/module_resolver.py | 352 +++++ theauditor/parsers/__init__.py | 8 + theauditor/parsers/compose_parser.py | 238 +++ theauditor/parsers/dockerfile_parser.py | 156 ++ theauditor/parsers/nginx_parser.py | 304 ++++ theauditor/parsers/prisma_schema_parser.py | 316 ++++ theauditor/parsers/webpack_config_parser.py | 213 +++ theauditor/pattern_loader.py | 201 +++ theauditor/patterns/business_logic.yml | 31 + theauditor/patterns/db_issues.yml | 49 + theauditor/patterns/docker.yml | 19 + theauditor/patterns/flow_sensitive.yml | 116 ++ theauditor/patterns/frameworks/angular.yml | 55 + theauditor/patterns/frameworks/django.yml | 67 + theauditor/patterns/frameworks/express.yml | 46 + theauditor/patterns/frameworks/fastapi.yml | 94 ++ theauditor/patterns/frameworks/flask.yml | 73 + theauditor/patterns/frameworks/nextjs.yml | 91 ++ theauditor/patterns/frameworks/react.yml | 49 + theauditor/patterns/frameworks/svelte.yml | 85 ++ theauditor/patterns/frameworks/vue.yml | 55 + theauditor/patterns/multi_tenant.yml | 88 ++ theauditor/patterns/nginx.yml | 19 + theauditor/patterns/postgres_rls.yml | 13 + theauditor/patterns/runtime_issues.yml | 62 + theauditor/patterns/security.yml | 191 +++ theauditor/patterns/security_compliance.yml | 122 ++ theauditor/pipelines.py | 1080 ++++++++++++++ theauditor/project_summary.py | 421 ++++++ theauditor/rules/__init__.py | 29 + theauditor/rules/auth/__init__.py | 5 + 
theauditor/rules/auth/jwt_detector.py | 812 +++++++++++ theauditor/rules/common/utils.py | 169 +++ theauditor/rules/deployment/__init__.py | 5 + .../rules/deployment/compose_analyzer.py | 279 ++++ theauditor/rules/deployment/nginx_analyzer.py | 329 +++++ theauditor/rules/node/__init__.py | 5 + .../rules/node/runtime_issue_detector.py | 603 ++++++++ theauditor/rules/orchestrator.py | 668 +++++++++ theauditor/rules/orm/__init__.py | 6 + theauditor/rules/orm/prisma_detector.py | 325 +++++ theauditor/rules/orm/sequelize_detector.py | 206 +++ theauditor/rules/orm/typeorm_detector.py | 384 +++++ theauditor/rules/performance/__init__.py | 13 + theauditor/rules/performance/performance.py | 779 ++++++++++ theauditor/rules/react/__init__.py | 9 + theauditor/rules/react/hooks_analyzer.py | 398 ++++++ theauditor/rules/secrets/__init__.py | 5 + .../secrets/hardcoded_secret_analyzer.py | 662 +++++++++ theauditor/rules/security/__init__.py | 6 + .../rules/security/api_auth_detector.py | 151 ++ theauditor/rules/security/cors_analyzer.py | 485 +++++++ .../rules/security/rate_limit_analyzer.py | 553 +++++++ .../rules/security/sourcemap_detector.py | 209 +++ theauditor/rules/sql/__init__.py | 5 + .../rules/sql/sql_injection_analyzer.py | 74 + theauditor/rules/typescript/__init__.py | 5 + .../rules/typescript/type_safety_analyzer.py | 145 ++ theauditor/rules/vue/__init__.py | 9 + theauditor/rules/vue/reactivity_analyzer.py | 295 ++++ theauditor/rules/xss/__init__.py | 5 + theauditor/rules/xss/xssdetection.py | 640 +++++++++ theauditor/security.py | 150 ++ theauditor/taint/__init__.py | 99 ++ theauditor/taint/core.py | 479 +++++++ theauditor/taint/database.py | 301 ++++ theauditor/taint/insights.py | 17 + theauditor/taint/interprocedural.py | 239 ++++ theauditor/taint/javascript.py | 375 +++++ theauditor/taint/propagation.py | 633 ++++++++ theauditor/taint/registry.py | 225 +++ theauditor/taint/sources.py | 343 +++++ theauditor/taint_analyzer.py | 17 + theauditor/test_frameworks.py | 236 +++ theauditor/tools.py | 152 ++ theauditor/universal_detector.py | 1093 ++++++++++++++ theauditor/utils/__init__.py | 21 + theauditor/utils/error_handler.py | 66 + theauditor/utils/exit_codes.py | 65 + theauditor/utils/finding_priority.py | 178 +++ theauditor/utils/helpers.py | 156 ++ theauditor/utils/temp_manager.py | 150 ++ theauditor/venv_install.py | 779 ++++++++++ theauditor/vulnerability_scanner.py | 420 ++++++ theauditor/workset.py | 376 +++++ 215 files changed, 50911 insertions(+) create mode 100644 .gitignore create mode 100644 ARCHITECTURE.md create mode 100644 CLAUDE.md create mode 100644 CONTRIBUTING.md create mode 100644 HOWTOUSE.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 ROADMAP.md create mode 100644 agent_templates/generic-template.md create mode 100644 agent_templates/sopmanager.md create mode 100644 package-template.json create mode 100644 package.json create mode 100644 pyproject.toml create mode 100644 theauditor/.gitattributes create mode 100644 theauditor/__init__.py create mode 100644 theauditor/agent_template_validator.py create mode 100644 theauditor/ast_extractors/__init__.py create mode 100644 theauditor/ast_extractors/base.py create mode 100644 theauditor/ast_extractors/python_impl.py create mode 100644 theauditor/ast_extractors/treesitter_impl.py create mode 100644 theauditor/ast_extractors/typescript_impl.py create mode 100644 theauditor/ast_parser.py create mode 100644 theauditor/ast_patterns.py create mode 100644 theauditor/claude_setup.py create mode 100644 
theauditor/cli.py create mode 100644 theauditor/commands/__init__.py create mode 100644 theauditor/commands/_archive.py create mode 100644 theauditor/commands/deps.py create mode 100644 theauditor/commands/detect_frameworks.py create mode 100644 theauditor/commands/detect_patterns.py create mode 100644 theauditor/commands/docker_analyze.py create mode 100644 theauditor/commands/docs.py create mode 100644 theauditor/commands/fce.py create mode 100644 theauditor/commands/full.py create mode 100644 theauditor/commands/graph.py create mode 100644 theauditor/commands/impact.py create mode 100644 theauditor/commands/index.py create mode 100644 theauditor/commands/init.py create mode 100644 theauditor/commands/init_config.py create mode 100644 theauditor/commands/init_js.py create mode 100644 theauditor/commands/insights.py create mode 100644 theauditor/commands/lint.py create mode 100644 theauditor/commands/ml.py create mode 100644 theauditor/commands/refactor.py create mode 100644 theauditor/commands/report.py create mode 100644 theauditor/commands/rules.py create mode 100644 theauditor/commands/setup.py create mode 100644 theauditor/commands/structure.py create mode 100644 theauditor/commands/summary.py create mode 100644 theauditor/commands/taint.py create mode 100644 theauditor/commands/tool_versions.py create mode 100644 theauditor/commands/validate_templates.py create mode 100644 theauditor/commands/workset.py create mode 100644 theauditor/config.py create mode 100644 theauditor/config_runtime.py create mode 100644 theauditor/correlations/__init__.py create mode 100644 theauditor/correlations/loader.py create mode 100644 theauditor/correlations/rules/angular_sanitization_cluster.yml create mode 100644 theauditor/correlations/rules/api_key_exposure_cluster.yml create mode 100644 theauditor/correlations/rules/command_injection_cluster.yml create mode 100644 theauditor/correlations/rules/container_escape_cluster.yml create mode 100644 theauditor/correlations/rules/cors_misconfiguration_cluster.yml create mode 100644 theauditor/correlations/rules/deadlock_cluster.yml create mode 100644 theauditor/correlations/rules/debug_enabled_cluster.yml create mode 100644 theauditor/correlations/rules/express_bodyparser_cluster.yml create mode 100644 theauditor/correlations/rules/infinite_loop_cluster.yml create mode 100644 theauditor/correlations/rules/jwt_issues_cluster.yml create mode 100644 theauditor/correlations/rules/ldap_injection_cluster.yml create mode 100644 theauditor/correlations/rules/memory_leak_cluster.yml create mode 100644 theauditor/correlations/rules/missing_auth_cluster.yml create mode 100644 theauditor/correlations/rules/nosql_injection_cluster.yml create mode 100644 theauditor/correlations/rules/path_traversal_cluster.yml create mode 100644 theauditor/correlations/rules/pii_leak_cluster.yml create mode 100644 theauditor/correlations/rules/race_condition_cluster.yml create mode 100644 theauditor/correlations/rules/rate_limit_missing_cluster.yml create mode 100644 theauditor/correlations/rules/react_dangerous_html_cluster.yml create mode 100644 theauditor/correlations/rules/refactoring.yaml create mode 100644 theauditor/correlations/rules/sensitive_logs_cluster.yml create mode 100644 theauditor/correlations/rules/session_fixation_cluster.yml create mode 100644 theauditor/correlations/rules/source_map_exposure_cluster.yml create mode 100644 theauditor/correlations/rules/ssrf_cluster.yml create mode 100644 theauditor/correlations/rules/template_injection_cluster.yml create mode 100644 
theauditor/correlations/rules/test_sql_injection.yml create mode 100644 theauditor/correlations/rules/vue_v_html_cluster.yml create mode 100644 theauditor/correlations/rules/weak_auth_cluster.yml create mode 100644 theauditor/correlations/rules/xss_cluster.yml create mode 100644 theauditor/correlations/rules/xxe_cluster.yml create mode 100644 theauditor/deps.py create mode 100644 theauditor/docgen.py create mode 100644 theauditor/docker_analyzer.py create mode 100644 theauditor/docs_fetch.py create mode 100644 theauditor/docs_summarize.py create mode 100644 theauditor/extraction.py create mode 100644 theauditor/fce.py create mode 100644 theauditor/framework_detector.py create mode 100644 theauditor/framework_registry.py create mode 100644 theauditor/graph/__init__.py create mode 100644 theauditor/graph/analyzer.py create mode 100644 theauditor/graph/builder.py create mode 100644 theauditor/graph/insights.py create mode 100644 theauditor/graph/store.py create mode 100644 theauditor/graph/visualizer.py create mode 100644 theauditor/impact_analyzer.py create mode 100644 theauditor/indexer/__init__.py create mode 100644 theauditor/indexer/config.py create mode 100644 theauditor/indexer/core.py create mode 100644 theauditor/indexer/database.py create mode 100644 theauditor/indexer/extractors/__init__.py create mode 100644 theauditor/indexer/extractors/docker.py create mode 100644 theauditor/indexer/extractors/generic.py create mode 100644 theauditor/indexer/extractors/javascript.py create mode 100644 theauditor/indexer/extractors/python.py create mode 100644 theauditor/indexer/extractors/sql.py create mode 100644 theauditor/indexer_compat.py create mode 100644 theauditor/init.py create mode 100644 theauditor/insights/__init__.py create mode 100644 theauditor/insights/graph.py create mode 100644 theauditor/insights/ml.py create mode 100644 theauditor/insights/taint.py create mode 100644 theauditor/journal.py create mode 100644 theauditor/js_init.py create mode 100644 theauditor/js_semantic_parser.py create mode 100644 theauditor/linters/__init__.py create mode 100644 theauditor/linters/detector.py create mode 100644 theauditor/linters/eslint.config.cjs create mode 100644 theauditor/linters/package.json create mode 100644 theauditor/linters/parsers.py create mode 100644 theauditor/linters/runner.py create mode 100644 theauditor/manifest_parser.py create mode 100644 theauditor/ml.py create mode 100644 theauditor/module_resolver.py create mode 100644 theauditor/parsers/__init__.py create mode 100644 theauditor/parsers/compose_parser.py create mode 100644 theauditor/parsers/dockerfile_parser.py create mode 100644 theauditor/parsers/nginx_parser.py create mode 100644 theauditor/parsers/prisma_schema_parser.py create mode 100644 theauditor/parsers/webpack_config_parser.py create mode 100644 theauditor/pattern_loader.py create mode 100644 theauditor/patterns/business_logic.yml create mode 100644 theauditor/patterns/db_issues.yml create mode 100644 theauditor/patterns/docker.yml create mode 100644 theauditor/patterns/flow_sensitive.yml create mode 100644 theauditor/patterns/frameworks/angular.yml create mode 100644 theauditor/patterns/frameworks/django.yml create mode 100644 theauditor/patterns/frameworks/express.yml create mode 100644 theauditor/patterns/frameworks/fastapi.yml create mode 100644 theauditor/patterns/frameworks/flask.yml create mode 100644 theauditor/patterns/frameworks/nextjs.yml create mode 100644 theauditor/patterns/frameworks/react.yml create mode 100644 
theauditor/patterns/frameworks/svelte.yml create mode 100644 theauditor/patterns/frameworks/vue.yml create mode 100644 theauditor/patterns/multi_tenant.yml create mode 100644 theauditor/patterns/nginx.yml create mode 100644 theauditor/patterns/postgres_rls.yml create mode 100644 theauditor/patterns/runtime_issues.yml create mode 100644 theauditor/patterns/security.yml create mode 100644 theauditor/patterns/security_compliance.yml create mode 100644 theauditor/pipelines.py create mode 100644 theauditor/project_summary.py create mode 100644 theauditor/rules/__init__.py create mode 100644 theauditor/rules/auth/__init__.py create mode 100644 theauditor/rules/auth/jwt_detector.py create mode 100644 theauditor/rules/common/utils.py create mode 100644 theauditor/rules/deployment/__init__.py create mode 100644 theauditor/rules/deployment/compose_analyzer.py create mode 100644 theauditor/rules/deployment/nginx_analyzer.py create mode 100644 theauditor/rules/node/__init__.py create mode 100644 theauditor/rules/node/runtime_issue_detector.py create mode 100644 theauditor/rules/orchestrator.py create mode 100644 theauditor/rules/orm/__init__.py create mode 100644 theauditor/rules/orm/prisma_detector.py create mode 100644 theauditor/rules/orm/sequelize_detector.py create mode 100644 theauditor/rules/orm/typeorm_detector.py create mode 100644 theauditor/rules/performance/__init__.py create mode 100644 theauditor/rules/performance/performance.py create mode 100644 theauditor/rules/react/__init__.py create mode 100644 theauditor/rules/react/hooks_analyzer.py create mode 100644 theauditor/rules/secrets/__init__.py create mode 100644 theauditor/rules/secrets/hardcoded_secret_analyzer.py create mode 100644 theauditor/rules/security/__init__.py create mode 100644 theauditor/rules/security/api_auth_detector.py create mode 100644 theauditor/rules/security/cors_analyzer.py create mode 100644 theauditor/rules/security/rate_limit_analyzer.py create mode 100644 theauditor/rules/security/sourcemap_detector.py create mode 100644 theauditor/rules/sql/__init__.py create mode 100644 theauditor/rules/sql/sql_injection_analyzer.py create mode 100644 theauditor/rules/typescript/__init__.py create mode 100644 theauditor/rules/typescript/type_safety_analyzer.py create mode 100644 theauditor/rules/vue/__init__.py create mode 100644 theauditor/rules/vue/reactivity_analyzer.py create mode 100644 theauditor/rules/xss/__init__.py create mode 100644 theauditor/rules/xss/xssdetection.py create mode 100644 theauditor/security.py create mode 100644 theauditor/taint/__init__.py create mode 100644 theauditor/taint/core.py create mode 100644 theauditor/taint/database.py create mode 100644 theauditor/taint/insights.py create mode 100644 theauditor/taint/interprocedural.py create mode 100644 theauditor/taint/javascript.py create mode 100644 theauditor/taint/propagation.py create mode 100644 theauditor/taint/registry.py create mode 100644 theauditor/taint/sources.py create mode 100644 theauditor/taint_analyzer.py create mode 100644 theauditor/test_frameworks.py create mode 100644 theauditor/tools.py create mode 100644 theauditor/universal_detector.py create mode 100644 theauditor/utils/__init__.py create mode 100644 theauditor/utils/error_handler.py create mode 100644 theauditor/utils/exit_codes.py create mode 100644 theauditor/utils/finding_priority.py create mode 100644 theauditor/utils/helpers.py create mode 100644 theauditor/utils/temp_manager.py create mode 100644 theauditor/venv_install.py create mode 100644 
theauditor/vulnerability_scanner.py create mode 100644 theauditor/workset.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f41bcb2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,126 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Virtual environments +.env +.venv +.auditor_venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store + +# Project specific +.pf/ +.claude/ +audit/ +manifest.json +repo_index.db +*.db +*.db-journal +/test_scaffold/ +/tmp/ + +# Test and temporary files +test_output/ +temp/ +*.tmp +*.bak +*.log + +# Local configuration +.env.local +.env.*.local +config.local.json + +# Journal and runtime files +*.ndjson +.pf/journal.ndjson +.pf/bus/ +.pf/workset.json +.pf/capsules/ +.pf/context/ + +# ML models (if any) +*.pkl +*.joblib +*.h5 +*.model + +# Documentation build +docs/_build/ +docs/.doctrees/ + +# macOS +.DS_Store +.AppleDouble +.LSOverride + +# Windows +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db +*.stackdump +[Dd]esktop.ini + +# Linux +.directory +.Trash-* \ No newline at end of file diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..2e4b055 --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,606 @@ +# TheAuditor Architecture + +This document provides a comprehensive technical overview of TheAuditor's architecture, design patterns, and implementation details. + +## System Overview + +TheAuditor is an offline-first, AI-centric SAST (Static Application Security Testing) and code intelligence platform. It orchestrates industry-standard tools to provide ground truth about code quality and security, producing AI-consumable reports optimized for LLM context windows. + +### Core Design Principles + +1. **Offline-First Operation** - All analysis runs without network access, ensuring data privacy and reproducible results +2. **Dual-Mode Architecture** - Courier Mode preserves raw external tool outputs; Expert Mode applies security expertise objectively +3. **AI-Centric Workflow** - Produces chunks optimized for LLM context windows (65KB by default) +4. **Sandboxed Execution** - Isolated analysis environment prevents cross-contamination +5. 
**No Fix Generation** - Reports findings without prescribing solutions + +## Truth Courier vs Insights: Separation of Concerns + +TheAuditor maintains a strict architectural separation between **factual observation** and **optional interpretation**: + +### Truth Courier Modules (Core) +These modules are the foundation - they gather and report verifiable facts without judgment: + +- **Indexer**: Reports "Function X exists at line Y with Z parameters" +- **Taint Analyzer**: Reports "Data flows from pattern A to pattern B through path C" +- **Impact Analyzer**: Reports "Changing function X affects Y files through Z call chains" +- **Graph Analyzer**: Reports "Module A imports B, B imports C, C imports A (cycle detected)" +- **Pattern Detector**: Reports "Line X matches pattern Y from rule Z" +- **Linters**: Reports "Tool ESLint flagged line X with rule Y" + +These modules form the immutable ground truth. They report **what exists**, not what it means. + +### Insights Modules (Optional Interpretation Layer) +These are **optional packages** that consume Truth Courier data to add scoring and classification. All insights modules have been consolidated into a single package for better organization: + +``` +theauditor/insights/ +├── __init__.py # Package exports +├── ml.py # Machine learning predictions (requires pip install -e ".[ml]") +├── graph.py # Graph health scoring and recommendations +└── taint.py # Vulnerability severity classification +``` + +- **insights/taint.py**: Adds "This flow is XSS with HIGH severity" +- **insights/graph.py**: Adds "Health score: 70/100, Grade: B" +- **insights/ml.py** (requires `pip install -e ".[ml]"`): Adds "80% probability of bugs based on historical patterns" + +**Important**: Insights modules are: +- Not installed by default (ML requires explicit opt-in) +- Completely decoupled from core analysis +- Still based on technical patterns, not business logic interpretation +- Designed for teams that want actionable scores alongside raw facts +- All consolidated in `/insights` package for consistency + +### The FCE: Factual Correlation Engine +The FCE correlates facts from multiple tools without interpreting them: +- Reports: "Tool A and Tool B both flagged line 100" +- Reports: "Pattern X and Pattern Y co-occur in file Z" +- Never says: "This is bad" or "Fix this way" + +## Core Components + +### Indexer Package (`theauditor/indexer/`) +The indexer has been refactored from a monolithic 2000+ line file into a modular package structure: + +``` +theauditor/indexer/ +├── __init__.py # Package initialization and backward compatibility +├── config.py # Constants, patterns, and configuration +├── database.py # DatabaseManager class for all DB operations +├── core.py # FileWalker and ASTCache classes +├── orchestrator.py # IndexOrchestrator - main coordination logic +└── extractors/ + ├── __init__.py # BaseExtractor abstract class and registry + ├── python.py # Python-specific extraction logic + ├── javascript.py # JavaScript/TypeScript extraction + ├── docker.py # Docker/docker-compose extraction + ├── sql.py # SQL extraction + └── nginx.py # Nginx configuration extraction +``` + +Key features: +- **Dynamic extractor registry** for automatic language detection +- **Batched database operations** (200 records per batch by default) +- **AST caching** for performance optimization +- **Monorepo detection** and intelligent path filtering +- **Parallel JavaScript processing** when semantic parser available + +### Pipeline System (`theauditor/pipelines.py`) +Orchestrates 
**14-phase** analysis pipeline in **parallel stages**: + +**Stage 1 - Foundation (Sequential):** +1. Repository indexing - Build manifest and symbol database +2. Framework detection - Identify technologies in use + +**Stage 2 - Concurrent Analysis (3 Parallel Tracks):** +- **Track A (Network I/O):** + - Dependency checking + - Documentation fetching + - Documentation summarization +- **Track B (Code Analysis):** + - Workset creation + - Linting + - Pattern detection +- **Track C (Graph Build):** + - Graph building + +**Stage 3 - Final Aggregation (Sequential):** +- Graph analysis +- Taint analysis +- Factual correlation engine +- Report generation + +### Pattern Detection Engine +- 100+ YAML-defined security patterns in `theauditor/patterns/` +- AST-based matching for Python and JavaScript +- Supports semantic analysis via TypeScript compiler + +### Factual Correlation Engine (FCE) (`theauditor/fce.py`) +- **29 advanced correlation rules** in `theauditor/correlations/rules/` +- Detects complex vulnerability patterns across multiple tools +- Categories: Authentication, Injection, Data Exposure, Infrastructure, Code Quality, Framework-Specific + +### Taint Analysis Package (`theauditor/taint_analyzer.py`) +A comprehensive taint analysis module that tracks data flow from sources to sinks: + +- Tracks data flow from user inputs to dangerous outputs +- Detects SQL injection, XSS, command injection vulnerabilities +- Database-aware analysis using `repo_index.db` +- Supports both assignment-based and direct-use patterns +- Merges findings from multiple detection methods + +**Note**: The optional severity scoring for taint analysis is provided by `theauditor/insights/taint.py` (Insights module) + +### Graph Analysis (`theauditor/graph/`) +- **builder.py**: Constructs dependency graph from codebase +- **analyzer.py**: Detects cycles, measures complexity, identifies hotspots +- Uses NetworkX for graph algorithms + +**Note**: The optional health scoring and recommendations are provided by `theauditor/insights/graph.py` (Insights module) + +### Framework Detection (`theauditor/framework_detector.py`) +- Auto-detects Django, Flask, React, Vue, Angular, etc. +- Applies framework-specific rules +- Influences pattern selection and analysis behavior + +### Configuration Parsers (`theauditor/parsers/`) +Specialized parsers for configuration file analysis: +- **webpack_config_parser.py**: Webpack configuration analysis +- **compose_parser.py**: Docker Compose file parsing +- **nginx_parser.py**: Nginx configuration parsing +- **dockerfile_parser.py**: Dockerfile security analysis +- **prisma_schema_parser.py**: Prisma ORM schema parsing + +These parsers are used by extractors during indexing to extract security-relevant configuration data. 
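To make the parser-to-extractor hand-off concrete, here is a minimal sketch of a Dockerfile parser following the `parse_file` convention described later in this document. The class name and the returned keys are illustrative, not a copy of the shipped `dockerfile_parser.py`:

```python
from pathlib import Path
from typing import Any


class DockerfileParser:
    """Illustrative parser: collects security-relevant facts from a Dockerfile."""

    def parse_file(self, file_path: Path) -> dict[str, Any]:
        base_images: list[str] = []
        env_vars: list[dict[str, Any]] = []
        runs_as_root = True  # assume worst case until a USER directive says otherwise

        for line_no, raw in enumerate(file_path.read_text().splitlines(), start=1):
            line = raw.strip()
            if line.upper().startswith("FROM "):
                base_images.append(line.split(maxsplit=1)[1])
            elif line.upper().startswith("ENV "):
                env_vars.append({"line": line_no, "value": line[4:].strip()})
            elif line.upper().startswith("USER ") and "root" not in line.lower():
                runs_as_root = False

        # Facts only -- no severity, no advice. Interpretation belongs to
        # downstream rules or the optional Insights layer.
        return {
            "base_images": base_images,
            "env_vars": env_vars,
            "runs_as_root": runs_as_root,
        }
```

During indexing, the Docker extractor can persist a dict like this into tables such as `docker_images`, where database-aware rules query it later; the parser itself never decides whether a fact is a problem.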
+ +### Refactoring Detection (`theauditor/commands/refactor.py`) +Detects incomplete refactorings and cross-stack inconsistencies: +- Analyzes database migrations to detect schema changes +- Uses impact analysis to trace affected files +- Applies correlation rules from `/correlations/rules/refactoring.yaml` +- Detects API contract mismatches, field migrations, foreign key changes +- Supports auto-detection from migration files or specific change analysis + +## System Architecture Diagrams + +### High-Level Data Flow + +```mermaid +graph TB + subgraph "Input Layer" + CLI[CLI Commands] + Files[Project Files] + end + + subgraph "Core Pipeline" + Index[Indexer] + Framework[Framework Detector] + Deps[Dependency Checker] + Patterns[Pattern Detection] + Taint[Taint Analysis] + Graph[Graph Builder] + FCE[Factual Correlation Engine] + end + + subgraph "Storage" + DB[(SQLite DB)] + Raw[Raw Output] + Chunks[65KB Chunks] + end + + CLI --> Index + Files --> Index + Index --> DB + Index --> Framework + Framework --> Deps + + Deps --> Patterns + Patterns --> Graph + Graph --> Taint + Taint --> FCE + + FCE --> Raw + Raw --> Chunks +``` + +### Parallel Pipeline Execution + +```mermaid +graph LR + subgraph "Stage 1 - Sequential" + S1[Index] --> S2[Framework Detection] + end + + subgraph "Stage 2 - Parallel" + direction TB + subgraph "Track A - Network I/O" + A1[Deps Check] + A2[Doc Fetch] + A3[Doc Summary] + A1 --> A2 --> A3 + end + + subgraph "Track B - Code Analysis" + B1[Workset] + B2[Linting] + B3[Patterns] + B1 --> B2 --> B3 + end + + subgraph "Track C - Graph" + C1[Graph Build] + end + end + + subgraph "Stage 3 - Sequential" + E1[Graph Analysis] --> E2[Taint] --> E3[FCE] --> E4[Report] + end + + S2 --> A1 + S2 --> B1 + S2 --> C1 + + A3 --> E1 + B3 --> E1 + C1 --> E1 +``` + +### Data Chunking System + +The extraction system (`theauditor/extraction.py`) implements pure courier model chunking: + +```mermaid +graph TD + subgraph "Analysis Results" + P[Patterns.json] + T[Taint.json
Multiple lists merged] + L[Lint.json] + F[FCE.json] + end + + subgraph "Extraction Process" + E[Extraction Engine<br/>Budget: 1.5MB] + M[Merge Logic<br/>For taint_paths + rule_findings] + C1[Chunk 1<br/>0-65KB] + C2[Chunk 2<br/>65-130KB] + C3[Chunk 3<br/>130-195KB] + TR[Truncation
Flag] + end + + subgraph "Output" + R1[patterns_chunk01.json] + R2[patterns_chunk02.json] + R3[patterns_chunk03.json] + end + + P --> E + T --> M --> E + L --> E + F --> E + + E --> C1 --> R1 + E --> C2 --> R2 + E --> C3 --> R3 + E -.->|If >195KB| TR + TR -.-> R3 +``` + +Key features: +- **Budget system**: 1.5MB total budget for all chunks +- **Smart merging**: Taint analysis merges multiple finding lists (taint_paths, rule_findings, infrastructure) +- **Preservation**: All findings preserved, no filtering or sampling +- **Chunking**: Only chunks files >65KB, copies smaller files as-is + +### Dual Environment Architecture + +```mermaid +graph TB + subgraph "Development Environment" + V1[.venv/] + PY[Python 3.11+] + AU[TheAuditor Code] + V1 --> PY --> AU + end + + subgraph "Sandboxed Analysis Environment" + V2[.auditor_venv/.theauditor_tools/] + NODE[Bundled Node.js v20.11.1] + TS[TypeScript Compiler] + ES[ESLint] + PR[Prettier] + NM[node_modules/] + V2 --> NODE + NODE --> TS + NODE --> ES + NODE --> PR + NODE --> NM + end + + AU -->|Analyzes using| V2 + AU -.->|Never uses| V1 +``` + +TheAuditor maintains strict separation between: +1. **Primary Environment** (`.venv/`): TheAuditor's Python code and dependencies +2. **Sandboxed Environment** (`.auditor_venv/.theauditor_tools/`): Isolated JS/TS analysis tools + +This ensures reproducibility and prevents TheAuditor from analyzing its own analysis tools. + +## Database Schema + +```mermaid +erDiagram + files ||--o{ symbols : contains + files ||--o{ refs : contains + files ||--o{ api_endpoints : contains + files ||--o{ sql_queries : contains + files ||--o{ docker_images : contains + + files { + string path PK + string language + int size + string hash + json metadata + } + + symbols { + string path FK + string name + string type + int line + json metadata + } + + refs { + string src FK + string value + string kind + int line + } + + api_endpoints { + string file FK + string method + string path + int line + } + + sql_queries { + string file_path FK + string command + string query + int line_number + } + + docker_images { + string file_path FK + string base_image + json env_vars + json build_args + } +``` + +## Command Flow Sequence + +```mermaid +sequenceDiagram + participant User + participant CLI + participant Pipeline + participant Analyzers + participant Database + participant Output + + User->>CLI: aud full + CLI->>Pipeline: Execute pipeline + Pipeline->>Database: Initialize schema + + Pipeline->>Analyzers: Index files + Analyzers->>Database: Store file metadata + + par Parallel Execution + Pipeline->>Analyzers: Dependency check + and + Pipeline->>Analyzers: Pattern detection + and + Pipeline->>Analyzers: Graph building + end + + Pipeline->>Analyzers: Taint analysis + Analyzers->>Database: Query symbols & refs + + Pipeline->>Analyzers: FCE correlation + Analyzers->>Output: Generate reports + + Pipeline->>Output: Create chunks + Output->>User: .pf/readthis/ +``` + +## Output Structure + +All results are organized in the `.pf/` directory: + +``` +.pf/ +├── raw/ # Immutable tool outputs (ground truth) +│ ├── eslint.json +│ ├── ruff.json +│ └── ... 
+├── readthis/ # AI-optimized chunks (<65KB each, max 3 chunks per file) +│ ├── manifest.md # Repository overview +│ ├── patterns_*.md # Security findings +│ ├── taint_*.md # Data-flow issues +│ └── tickets_*.md # Actionable tasks +├── repo_index.db # SQLite database of code symbols +├── pipeline.log # Execution trace +└── findings.json # Consolidated results +``` + +### Key Output Files + +- **manifest.md**: Complete file inventory with SHA-256 hashes +- **patterns_*.md**: Chunked security findings from 100+ detection rules +- **tickets_*.md**: Prioritized, actionable issues with evidence +- **repo_index.db**: Queryable database of all code symbols and relationships + +## Operating Modes + +TheAuditor operates in two distinct modes: + +### Courier Mode (External Tools) +- Preserves exact outputs from ESLint, Ruff, MyPy, etc. +- No interpretation or filtering +- Complete audit trail from source to finding + +### Expert Mode (Internal Engines) +- **Taint Analysis**: Tracks untrusted data through the application +- **Pattern Detection**: YAML-based rules with AST matching +- **Graph Analysis**: Architectural insights and dependency tracking +- **Secret Detection**: Identifies hardcoded credentials and API keys + +## CLI Entry Points + +- **Main CLI**: `theauditor/cli.py` - Central command router +- **Command modules**: `theauditor/commands/` - One module per command +- **Utilities**: `theauditor/utils/` - Shared functionality +- **Configuration**: `theauditor/config_runtime.py` - Runtime configuration + +Each command module follows a standardized structure with: +- `@click.command()` decorator +- `@handle_exceptions` decorator for error handling +- Consistent logging and output formatting + +## Performance Optimizations + +- **Batched database operations**: 200 records per batch (configurable) +- **Parallel rule execution**: ThreadPoolExecutor with 4 workers +- **AST caching**: Persistent cache for parsed AST trees +- **Incremental analysis**: Workset-based analysis for changed files only +- **Lazy loading**: Patterns and rules loaded on-demand +- **Memory-efficient chunking**: Stream large files instead of loading entirely + +## Configuration System + +TheAuditor supports runtime configuration via multiple sources (priority order): + +1. **Environment variables** (`THEAUDITOR_*` prefix) +2. **`.pf/config.json`** file (project-specific) +3. 
**Built-in defaults** in `config_runtime.py` + +Example configuration: +```bash +export THEAUDITOR_LIMITS_MAX_CHUNKS_PER_FILE=5 # Default: 3 +export THEAUDITOR_LIMITS_MAX_CHUNK_SIZE=100000 # Default: 65000 +export THEAUDITOR_LIMITS_MAX_FILE_SIZE=5242880 # Default: 2097152 +export THEAUDITOR_TIMEOUTS_LINT_TIMEOUT=600 # Default: 300 +``` + +## Advanced Features + +### Database-Aware Rules +Specialized analyzers query `repo_index.db` to detect: +- ORM anti-patterns (N+1 queries, missing transactions) +- Docker security misconfigurations +- Nginx configuration issues +- Multi-file correlation patterns + +### Holistic Analysis +Project-level analyzers that operate across the entire codebase: +- **Bundle Analyzer**: Correlates package.json, lock files, and imports +- **Source Map Detector**: Scans build directories for exposed maps +- **Framework Detectors**: Identify technology stack automatically + +### Incremental Analysis +Workset-based analysis for efficient processing: +- Git diff integration for changed file detection +- Dependency tracking for impact analysis +- Cached results for unchanged files + +## Contributing to TheAuditor + +### Adding Language Support + +TheAuditor's modular architecture makes it straightforward to add new language support: + +#### 1. Create an Extractor +Create a new extractor in `theauditor/indexer/extractors/{language}.py`: + +```python +from . import BaseExtractor + +class {Language}Extractor(BaseExtractor): + def supported_extensions(self) -> List[str]: + return ['.ext', '.ext2'] + + def extract(self, file_info, content, tree=None): + # Extract symbols, imports, routes, etc. + return { + 'imports': [], + 'routes': [], + 'symbols': [], + # ... other extracted data + } +``` + +The extractor will be automatically registered via the `BaseExtractor` inheritance. + +#### 2. Create Configuration Parser (Optional) +For configuration files, create a parser in `theauditor/parsers/{language}_parser.py`: + +```python +class {Language}Parser: + def parse_file(self, file_path: Path) -> Dict[str, Any]: + # Parse configuration file + return parsed_data +``` + +#### 3. Add Security Patterns +Create YAML patterns in `theauditor/patterns/{language}.yml`: + +```yaml +- name: hardcoded-secret-{language} + pattern: 'api_key\s*=\s*["\'][^"\']+["\']' + severity: critical + category: security + languages: ["{language}"] + description: "Hardcoded API key in {Language} code" +``` + +#### 4. Add Framework Detection +Update `theauditor/framework_detector.py` to detect {Language} frameworks. + +### Adding New Analyzers + +#### Database-Aware Rules +Create analyzers that query `repo_index.db` in `theauditor/rules/{category}/`: + +```python +def find_{issue}_patterns(db_path: str) -> List[Dict[str, Any]]: + conn = sqlite3.connect(db_path) + # Query and analyze + return findings +``` + +#### AST-Based Rules +For semantic analysis, create rules in `theauditor/rules/{framework}/`: + +```python +def find_{framework}_issues(tree, file_path) -> List[Dict[str, Any]]: + # Traverse AST and detect issues + return findings +``` + +#### Pattern-Based Rules +Add YAML patterns to `theauditor/patterns/` for regex-based detection. + +### Architecture Guidelines + +1. **Maintain Truth Courier vs Insights separation** - Core modules report facts, insights add interpretation +2. **Use the extractor registry** - Inherit from `BaseExtractor` for automatic registration +3. **Follow existing patterns** - Look at `python.py` or `javascript.py` extractors as examples +4. 
**Write comprehensive tests** - Test extractors, parsers, and patterns +5. **Document your additions** - Update this file and CONTRIBUTING.md + +For detailed contribution guidelines, see [CONTRIBUTING.md](CONTRIBUTING.md). \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..225b7ba --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,454 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Quick Reference Commands + +```bash +# Development Setup +python -m venv .venv +source .venv/bin/activate # Windows: .venv\Scripts\activate +pip install -e ".[all]" +aud setup-claude --target . # MANDATORY for JS/TS analysis + +# Testing +pytest -v # Run all tests +pytest tests/test_file.py # Run specific test file +pytest -k "test_name" # Run specific test by name +pytest --cov=theauditor # With coverage + +# Code Quality +ruff check theauditor tests --fix # Lint and auto-fix +ruff format theauditor tests # Format code +black theauditor tests # Alternative formatter +mypy theauditor --strict # Type checking + +# Running TheAuditor +aud init # Initialize project +aud full # Complete analysis (14 phases) +aud full --offline # Skip network operations (deps, docs) +aud index --exclude-self # When analyzing TheAuditor itself + +# Individual Analysis Commands +aud index # Build code index database +aud detect-patterns # Run security pattern detection +aud taint-analyze # Perform taint flow analysis +aud graph build # Build dependency graph +aud graph analyze # Analyze graph structure +aud fce # Run Factual Correlation Engine +aud report # Generate final report +aud workset # Create working set of critical files +aud impact # Analyze impact of changing a file + +# Utility Commands +aud setup-claude # Setup sandboxed JS/TS tools (MANDATORY) +aud js-semantic # Parse JS/TS file semantically +aud structure # Display project structure +aud insights # Generate ML insights (requires [ml] extras) +aud refactor # Perform refactoring operations +``` + +## Project Overview + +TheAuditor is an offline-first, AI-centric SAST (Static Application Security Testing) and code intelligence platform written in Python. It performs comprehensive security auditing and code analysis for Python and JavaScript/TypeScript projects, producing AI-consumable reports optimized for LLM context windows. + +## Core Philosophy: Truth Courier, Not Mind Reader + +**CRITICAL UNDERSTANDING**: TheAuditor does NOT try to understand business logic or make AI "smarter." It solves the real problem: **AI loses context and makes inconsistent changes across large codebases.** + +### The Development Loop +1. **Human tells AI**: "Add JWT auth with CSRF protection" +2. **AI writes code**: Probably has issues due to context limits (hardcoded secrets, missing middleware, etc.) +3. **Human runs**: `aud full` +4. **TheAuditor reports**: All inconsistencies and security holes as FACTS +5. **AI reads report**: Now sees the COMPLETE picture across all files +6. **AI fixes issues**: With full visibility of what's broken +7. **Repeat until clean** + +TheAuditor is about **consistency checking**, not semantic understanding. It finds where code doesn't match itself, not whether it matches business requirements. + +## Critical Setup Requirements + +### For JavaScript/TypeScript Analysis +TheAuditor requires a sandboxed environment for JS/TS tools. This is NOT optional: + +```bash +# MANDATORY: Set up sandboxed tools +aud setup-claude --target . 
+``` + +This creates `.auditor_venv/.theauditor_tools/` with isolated TypeScript compiler and ESLint. Without this, TypeScript semantic analysis will fail. + +## Key Architectural Decisions + +### Modular Package Structure +The codebase follows a modular design where large modules are refactored into packages. Example: the indexer was refactored from a 2000+ line monolithic file into: +``` +theauditor/indexer/ +├── __init__.py # Backward compatibility shim +├── config.py # Constants and patterns +├── database.py # DatabaseManager class +├── core.py # FileWalker, ASTCache +├── orchestrator.py # Main coordination +└── extractors/ # Language-specific logic +``` + +When refactoring, always: +1. Create a package with the same name as the original module +2. Provide a backward compatibility shim in `__init__.py` +3. Separate concerns into focused modules +4. Use dynamic registries for extensibility + +### Database Contract Preservation +The `repo_index.db` schema is consumed by many downstream modules (taint_analyzer, graph builder, etc.). When modifying indexer or database operations: +- NEVER change table schemas without migration +- Preserve exact column names and types +- Maintain the same data format in JSON columns +- Test downstream consumers after changes + +## Architecture Overview + +### Truth Courier vs Insights: Separation of Concerns + +TheAuditor maintains strict separation between **factual observation** and **optional interpretation**: + +#### Truth Courier Modules (Core - Always Active) +Report verifiable facts without judgment: +- **Indexer**: "Function X exists at line Y" +- **Taint Analyzer**: "Data flows from req.body to res.send" (NOT "XSS vulnerability") +- **Impact Analyzer**: "Changing X affects 47 files through dependency chains" +- **Pattern Detector**: "Line X matches pattern Y" +- **Graph Analyzer**: "Cycle detected: A→B→C→A" + +#### Insights Modules (Optional - Not Installed by Default) +Add scoring and classification on top of facts: +- **taint/insights.py**: Adds "This is HIGH severity XSS" +- **graph/insights.py**: Adds "Health score: 70/100" +- **ml.py**: Requires `pip install -e ".[ml]"` - adds predictions + +#### Correlation Rules (Project-Specific Pattern Detection) +- Located in `theauditor/correlations/rules/` +- Detect when multiple facts indicate inconsistency +- Example: "Backend moved field to ProductVariant but frontend still uses Product.price" +- NOT business logic understanding, just pattern matching YOUR refactorings + +### Dual-Environment Design +TheAuditor maintains strict separation between: +1. **Primary Environment** (`.venv/`): TheAuditor's Python code and dependencies +2. **Sandboxed Environment** (`.auditor_venv/.theauditor_tools/`): Isolated JS/TS analysis tools + +### Core Components + +#### Indexer Package (`theauditor/indexer/`) +The indexer has been refactored from a monolithic 2000+ line file into a modular package: +- **config.py**: Constants, patterns, and configuration (SKIP_DIRS, language maps, etc.) +- **database.py**: DatabaseManager class handling all database operations +- **core.py**: FileWalker (with monorepo detection) and ASTCache classes +- **orchestrator.py**: IndexOrchestrator coordinating the indexing process +- **extractors/**: Language-specific extractors (Python, JavaScript, Docker, SQL, nginx) + +The package uses a dynamic extractor registry for automatic language detection and processing. 
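A compressed sketch of what that registry pattern looks like is shown below; the decorator name matches the `register_extractor` convention used later in this file, while the class-attribute form of `supported_extensions` and the toy extraction logic are illustrative rather than the shipped implementation:

```python
_REGISTRY: dict[str, type] = {}


def register_extractor(cls: type) -> type:
    """Class decorator: map every extension an extractor claims to its class."""
    for ext in cls.supported_extensions:
        _REGISTRY[ext] = cls
    return cls


@register_extractor
class PythonExtractor:
    supported_extensions = [".py"]

    def extract(self, file_info: dict, content: str, tree=None) -> dict:
        # Facts only: symbol names and line numbers, no interpretation.
        symbols = [
            {"name": line.split()[1].split("(")[0], "type": "function", "line": i}
            for i, line in enumerate(content.splitlines(), start=1)
            if line.lstrip().startswith("def ") and "(" in line
        ]
        return {"symbols": symbols, "imports": [], "routes": []}


def extractor_for(path: str):
    """Automatic language detection: dispatch purely on file extension."""
    for ext, cls in _REGISTRY.items():
        if path.endswith(ext):
            return cls()
    return None
```

Indexing then reduces to walking files, asking the registry for an extractor, and batching the returned dicts into `repo_index.db`.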
+ +#### Pipeline System (`theauditor/pipelines.py`) +- Orchestrates **14-phase** analysis pipeline in **parallel stages**: + - **Stage 1**: Foundation (index with batched DB operations, framework detection) + - **Stage 2**: 3 concurrent tracks (Network I/O, Code Analysis, Graph Build) + - **Stage 3**: Final aggregation (graph analysis, taint, FCE, report) +- Handles error recovery and logging +- **Performance optimizations**: + - Batched database inserts (200 records per batch) in indexer + - Parallel rule execution with ThreadPoolExecutor (4 workers) + - Parallel holistic analysis (bundle + sourcemap detection) + +#### Pattern Detection Engine +- 100+ YAML-defined security patterns in `theauditor/patterns/` +- AST-based matching for Python and JavaScript +- Supports semantic analysis via TypeScript compiler + +#### Factual Correlation Engine (FCE) (`theauditor/fce.py`) +- **29 advanced correlation rules** in `theauditor/correlations/rules/` +- Detects complex vulnerability patterns across multiple tools +- Categories: Authentication, Injection, Data Exposure, Infrastructure, Code Quality, Framework-Specific + +#### Taint Analysis Package (`theauditor/taint_analyzer/`) +Previously a monolithic 1822-line file, now refactored into a modular package: +- **core.py**: TaintAnalyzer main class +- **sources.py**: Source pattern definitions (user inputs) +- **sinks.py**: Sink pattern definitions (dangerous outputs) +- **patterns.py**: Pattern matching logic +- **flow.py**: Data flow tracking algorithms +- **insights.py**: Optional severity scoring (Insights module) + +Features: +- Tracks data flow from sources to sinks +- Detects SQL injection, XSS, command injection +- Database-aware analysis using `repo_index.db` +- Supports both assignment-based and direct-use taint flows +- Merges findings from multiple detection methods (taint_paths, rule_findings, infrastructure) + +#### Framework Detection (`theauditor/framework_detector.py`) +- Auto-detects Django, Flask, React, Vue, etc. 
+- Applies framework-specific rules + +#### Graph Analysis (`theauditor/commands/graph.py`) +- Build dependency graphs with `aud graph build` +- Analyze graph health with `aud graph analyze` +- Visualize with GraphViz output (optional) +- Detect circular dependencies and architectural issues + +#### Output Structure +``` +.pf/ +├── raw/ # Immutable tool outputs (ground truth) +├── readthis/ # AI-optimized chunks (<65KB each, max 3 chunks per file) +├── repo_index.db # SQLite database of code symbols +└── pipeline.log # Execution trace +``` + +### CLI Entry Points +- Main CLI: `theauditor/cli.py` +- Command modules: `theauditor/commands/` +- Each command is a separate module with standardized structure + +## Available Commands + +### Core Analysis Commands +- `aud index`: Build comprehensive code index +- `aud detect-patterns`: Run security pattern detection +- `aud taint-analyze`: Perform taint flow analysis +- `aud fce`: Run Factual Correlation Engine +- `aud report`: Generate final consolidated report + +### Graph Commands +- `aud graph build`: Build dependency graph +- `aud graph analyze`: Analyze graph health metrics +- `aud graph visualize`: Generate GraphViz visualization + +### Utility Commands +- `aud deps`: Analyze dependencies and vulnerabilities +- `aud docs`: Extract and analyze documentation +- `aud docker-analyze`: Analyze Docker configurations +- `aud lint`: Run code linters +- `aud workset`: Create critical file working set +- `aud impact `: Analyze change impact radius +- `aud structure`: Display project structure +- `aud insights`: Generate ML-powered insights (optional) +- `aud refactor `: Automated refactoring tools + +## How to Work with TheAuditor Effectively + +### The Correct Workflow +1. **Write specific requirements**: "Add JWT auth with httpOnly cookies, CSRF tokens, rate limiting" +2. **Let AI implement**: It will probably mess up due to context limits +3. **Run audit**: `aud full` +4. **Read the facts**: Check `.pf/readthis/` for issues +5. **Fix based on facts**: Address the specific inconsistencies found +6. **Repeat until clean**: Keep auditing and fixing until no issues + +### What NOT to Do +- ❌ Don't ask AI to "implement secure authentication" (too vague) +- ❌ Don't try to make TheAuditor understand your business logic +- ❌ Don't expect TheAuditor to write fixes (it only reports issues) +- ❌ Don't ignore the audit results and claim "done" + +### Understanding the Output +- **Truth Couriers** report facts: "JWT secret hardcoded at line 47" +- **Insights** (if installed) add interpretation: "HIGH severity" +- **Correlations** detect YOUR patterns: "Frontend expects old API structure" +- **Impact Analysis** shows blast radius: "Changing this affects 23 files" + +## Critical Development Patterns + +### Adding New Commands +1. Create module in `theauditor/commands/` with this structure: +```python +import click +from theauditor.utils.decorators import handle_exceptions +from theauditor.utils.logger import setup_logger + +logger = setup_logger(__name__) + +@click.command() +@click.option('--workset', is_flag=True, help='Use workset files') +@handle_exceptions +def command_name(workset): + """Command description.""" + logger.info("Starting command...") + # Implementation +``` + +2. 
Register in `theauditor/cli.py`: +```python +from theauditor.commands import your_command +cli.add_command(your_command.command_name) +``` + +### Adding Language Support +To add a new language, create an extractor in `theauditor/indexer/extractors/`: +```python +from theauditor.indexer.extractors import BaseExtractor, register_extractor + +@register_extractor +class YourLanguageExtractor(BaseExtractor): + @property + def supported_extensions(self): + return ['.ext', '.ext2'] + + def extract(self, file_info, content, tree): + # Return dict with symbols, imports, etc. +``` + +The extractor will be auto-discovered via the registry pattern. + +## CRITICAL: Reading Chunked Data + +**IMPORTANT**: When processing files from `.pf/readthis/`, you MUST check for truncation: + +```python +# Files may be split into chunks if >65KB +# Always check the 'chunk_info' field in JSON files: +chunk_info = data.get('chunk_info', {}) +if chunk_info.get('truncated', False): + # This means there were more findings but only 3 chunks were created + # The data is incomplete - warn the user + print("WARNING: Data was truncated at 3 chunks") +``` + +**Key Points**: +- Files larger than 65KB are split into chunks (configurable via `THEAUDITOR_LIMITS_MAX_CHUNK_SIZE`) +- Maximum 3 chunks per file by default (configurable via `THEAUDITOR_LIMITS_MAX_CHUNKS_PER_FILE`) +- Example: `patterns_chunk01.json`, `patterns_chunk02.json`, `patterns_chunk03.json` +- If `truncated: true` in `chunk_info`, there were more findings that couldn't fit +- Always process ALL chunk files for complete data + +## Critical Working Knowledge + +### Pipeline Execution Order +The `aud full` command runs 14 phases in 3 stages: +1. **Sequential**: index → framework_detect +2. **Parallel**: (deps, docs) || (workset, lint, patterns) || (graph_build) +3. **Sequential**: graph_analyze → taint → fce → report + +If modifying pipeline, maintain this dependency order. 
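A minimal sketch of that ordering follows, assuming a placeholder `run(phase)` callable; it shows the dependency structure only, not the real pipeline function signatures:

```python
from concurrent.futures import ThreadPoolExecutor


def run_full_pipeline(run) -> None:
    """Execute phases in the documented 3-stage order; `run(name)` runs one phase."""
    # Stage 1: foundation, strictly sequential.
    run("index")
    run("framework_detect")

    # Stage 2: three independent tracks executed concurrently.
    tracks = [
        ["deps", "doc_fetch", "doc_summary"],    # Track A: network I/O
        ["workset", "lint", "detect_patterns"],  # Track B: code analysis
        ["graph_build"],                         # Track C: graph build
    ]
    with ThreadPoolExecutor(max_workers=3) as pool:
        futures = [pool.submit(lambda t=t: [run(p) for p in t]) for t in tracks]
        for future in futures:
            future.result()  # surface the first track failure, if any

    # Stage 3: aggregation, depends on everything above.
    for phase in ("graph_analyze", "taint", "fce", "report"):
        run(phase)
```

Passing `print` as `run` is enough to eyeball the ordering; any new phase should be slotted into the stage whose inputs it actually needs.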
+ +### File Size and Memory Management +- Files >2MB are skipped by default (configurable) +- JavaScript files are batched for semantic parsing to avoid memory issues +- AST cache persists parsed trees to `.pf/.ast_cache/` +- Database operations batch at 200 records (configurable) + +### Monorepo Detection +The indexer automatically detects monorepo structures and applies intelligent filtering: +- Standard paths: `backend/src/`, `frontend/src/`, `packages/*/src/` +- Whitelist mode activated when monorepo detected +- Prevents analyzing test files, configs, migrations as source code + +### JavaScript/TypeScript Special Handling +- MUST run `aud setup-claude --target .` first +- Uses bundled Node.js v20.11.1 in `.auditor_venv/.theauditor_tools/` +- TypeScript semantic analysis requires `js_semantic_parser.py` +- ESLint runs in sandboxed environment, not project's node_modules + +### Environment Variables +Key environment variables for configuration: +- `THEAUDITOR_LIMITS_MAX_FILE_SIZE`: Maximum file size to analyze (default: 2MB) +- `THEAUDITOR_LIMITS_MAX_CHUNK_SIZE`: Maximum chunk size for readthis output (default: 65KB) +- `THEAUDITOR_LIMITS_MAX_CHUNKS_PER_FILE`: Maximum chunks per file (default: 3) +- `THEAUDITOR_DB_BATCH_SIZE`: Database batch insert size (default: 200) + +## Recent Fixes & Known Issues + +### Parser Integration (Fixed) +- **Previous Issue**: Configuration parsers (webpack, nginx, docker-compose) were orphaned +- **Root Cause**: Import paths in extractors didn't match actual parser module names +- **Fix Applied**: Corrected import paths in `generic.py` and `docker.py` extractors +- **Current Status**: All 5 parsers now functional for config security analysis + +### Extraction Budget & Taint Merging (Fixed) +- **Previous Issue**: Taint analysis only extracted 26 of 102 findings +- **Root Cause**: Only chunking `taint_paths`, missing `all_rule_findings` and `infrastructure_issues` +- **Fix Applied**: Extraction now merges all taint finding lists; budget increased to 1.5MB +- **Current Status**: All taint findings properly extracted and chunked + +### Migration Detection (Enhanced) +- **Previous Issue**: Only checked basic migration paths +- **Root Cause**: Missing common paths like `backend/migrations/` and `frontend/migrations/` +- **Fix Applied**: Added standard migration paths with validation for actual migration files +- **Current Status**: Auto-detects migrations with helpful warnings for non-standard locations + +### TypeScript Taint Analysis (Fixed) +- **Previous Issue**: Taint analysis reported 0 sources/sinks for TypeScript +- **Root Cause**: Text extraction was removed from `js_semantic_parser.py` (lines 275, 514) +- **Fix Applied**: Restored `result.text` field extraction +- **Current Status**: TypeScript taint analysis now working - detects req.body → res.send flows + +### Direct-Use Vulnerability Detection (Fixed) +- **Previous Issue**: Only detected vulnerabilities through variable assignments +- **Root Cause**: `trace_from_source()` required intermediate variables +- **Fix Applied**: Added direct-use detection for patterns like `res.send(req.body)` +- **Current Status**: Now detects both assignment-based and direct-use taint flows + +### Known Limitations +- Maximum 2MB file size for analysis (configurable) +- TypeScript decorator metadata not fully parsed +- Some advanced ES2024+ syntax may not be recognized +- GraphViz visualization requires separate installation + +## Common Misconceptions to Avoid + +### TheAuditor is NOT: +- ❌ A semantic understanding 
tool (doesn't understand what your code "means") +- ❌ A business logic validator (doesn't know your business rules) +- ❌ An AI enhancement tool (doesn't make AI "smarter") +- ❌ A code generator (only reports issues, doesn't fix them) + +### TheAuditor IS: +- ✅ A consistency checker (finds where code doesn't match itself) +- ✅ A fact reporter (provides ground truth about your code) +- ✅ A context provider (gives AI the full picture across all files) +- ✅ An audit trail (immutable record of what tools found) + +## Troubleshooting + +### TypeScript Analysis Fails +Solution: Run `aud setup-claude --target .` + +### Taint Analysis Reports 0 Vulnerabilities on TypeScript +- Check that `js_semantic_parser.py` has text extraction enabled (lines 275, 514) +- Verify symbols table contains property accesses: `SELECT * FROM symbols WHERE name LIKE '%req.body%'` +- Ensure you run `aud index` before `aud taint-analyze` + +### Pipeline Failures +Check `.pf/error.log` and `.pf/pipeline.log` for details + +### Linting No Results +Ensure linters installed: `pip install -e ".[linters]"` + +### Graph Commands Not Working +- Ensure `aud index` has been run first +- Check that NetworkX is installed: `pip install -e ".[all]"` + +## Testing Vulnerable Code +Test projects are in `fakeproj/` directory. Always use `--exclude-self` when analyzing them to avoid false positives from TheAuditor's own configuration. + +## Project Dependencies + +### Required Dependencies (Core) +- click==8.2.1 - CLI framework +- PyYAML==6.0.2 - YAML parsing +- jsonschema==4.25.1 - JSON validation +- ijson==3.4.0 - Incremental JSON parsing + +### Optional Dependencies +Install with `pip install -e ".[group]"`: +- **[linters]**: ruff, mypy, black, bandit, pylint +- **[ml]**: scikit-learn, numpy, scipy, joblib +- **[ast]**: tree-sitter, sqlparse, dockerfile-parse +- **[all]**: Everything including NetworkX for graphs + +## Performance Expectations +- Small project (< 5K LOC): ~2 minutes +- Medium project (20K LOC): ~30 minutes +- Large monorepo (100K+ LOC): 1-2 hours +- Memory usage: ~500MB-2GB depending on codebase size +- Disk space: ~100MB for .pf/ output directory \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..d4972f7 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,429 @@ +# Contributing to TheAuditor + +Thank you for your interest in contributing to TheAuditor! We're excited to have you join our mission to bring ground truth to AI-assisted development. This guide will help you get started with contributing to the project. + +## How to Get Involved + +### Reporting Bugs + +Found a bug? Please help us fix it! + +1. Check existing [GitHub Issues](https://github.com/TheAuditorTool/Auditor/issues) to see if it's already reported +2. If not, create a new issue with: + - Clear description of the bug + - Steps to reproduce + - Expected vs actual behavior + - Your environment details (OS, Python version, Node.js version) + +### Suggesting Enhancements + +Have an idea for improving TheAuditor? + +1. Review our [ROADMAP.md](ROADMAP.md) to see if it aligns with our vision +2. Check [GitHub Issues](https://github.com/TheAuditorTool/Auditor/issues) for similar suggestions +3. 
Create a new issue describing: + - The problem you're trying to solve + - Your proposed solution + - Why this would benefit TheAuditor users + +## Setting Up Your Development Environment + +Follow these steps to get TheAuditor running locally for development: + +```bash +# Clone the repository +git clone https://github.com/TheAuditorTool/Auditor.git +cd theauditor + +# Create a Python virtual environment +python -m venv .venv + +# Activate the virtual environment +# On Linux/macOS: +source .venv/bin/activate +# On Windows: +.venv\Scripts\activate + +# Install TheAuditor in development mode +pip install -e . + +# Optional: Install with ML capabilities +# pip install -e ".[ml]" + +# For development with all optional dependencies: +# pip install -e ".[all]" + +# MANDATORY: Set up the sandboxed environment +# This is required for TheAuditor to function at all +aud setup-claude --target . +``` + +The `aud setup-claude --target .` command creates an isolated environment at `.auditor_venv/.theauditor_tools/` with all necessary JavaScript and TypeScript analysis tools. This ensures consistent, reproducible results across all development environments. + +## Making Changes & Submitting a Pull Request + +### Development Workflow + +1. **Fork the repository** on GitHub +2. **Create a feature branch** from `main`: + ```bash + git checkout -b feature/your-feature-name + ``` +3. **Make your changes** following our code standards (see below) +4. **Write/update tests** if applicable +5. **Commit your changes** with clear, descriptive messages: + ```bash + git commit -m "Add GraphQL schema analyzer for type validation" + ``` +6. **Push to your fork**: + ```bash + git push origin feature/your-feature-name + ``` +7. **Create a Pull Request** on GitHub with: + - Clear description of changes + - Link to any related issues + - Test results or examples + +## Code Standards + +We use **ruff** for both linting and formatting Python code. Before submitting any code, you MUST run: + +```bash +# Fix any auto-fixable issues and check for remaining problems +ruff check . --fix + +# Format all Python code +ruff format . +``` + +Your pull request will not be merged if it fails these checks. + +### Additional Quality Checks + +For comprehensive code quality, you can also run: + +```bash +# Type checking (optional but recommended) +mypy theauditor --strict + +# Run tests +pytest tests/ + +# Full linting suite +make lint +``` + +### Code Style Guidelines + +- Follow PEP 8 for Python code +- Use descriptive variable and function names +- Add docstrings to all public functions and classes +- Keep functions focused and small (under 50 lines preferred) +- Write self-documenting code; minimize comments +- Never commit secrets, API keys, or credentials + +## Adding Support for New Languages + +TheAuditor's modular architecture makes it straightforward to add support for new programming languages. This section provides comprehensive guidance for contributors looking to expand our language coverage. 
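+
+Throughout the steps below, extractors and rules hand their results back in a common finding format. Based on the AST rule example later in this guide, a single finding looks roughly like the dictionary sketched here; treat the values as hypothetical and the keys as the convention to follow:
+
+```python
+# Illustrative shape of one finding (values are hypothetical; keys mirror the AST rule example below).
+finding = {
+    "pattern_name": "HARDCODED_SECRET",
+    "message": "Hardcoded secret detected",
+    "file": "src/app.ext",
+    "line": 42,
+    "column": 5,
+    "severity": "high",
+    "snippet": "api_key = '...'",
+    "category": "security",
+    "match_type": "ast",
+}
+```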
+ +### Overview + +Adding a new language to TheAuditor involves: +- Creating a parser for the language +- Adding framework detection patterns +- Creating security pattern rules +- Writing comprehensive tests +- Updating documentation + +### Prerequisites + +Before starting, ensure you have: +- Deep knowledge of the target language and its ecosystem +- Understanding of common security vulnerabilities in that language +- Familiarity with AST (Abstract Syntax Tree) concepts +- Python development experience + +### Step-by-Step Guide + +#### Step 1: Create the Language Extractor + +Create a new extractor in `theauditor/indexer/extractors/{language}.py` that inherits from `BaseExtractor`: + +```python +from . import BaseExtractor + +class {Language}Extractor(BaseExtractor): + def supported_extensions(self) -> List[str]: + """Return list of file extensions this extractor supports.""" + return ['.ext', '.ext2'] + + def extract(self, file_info: Dict[str, Any], content: str, + tree: Optional[Any] = None) -> Dict[str, Any]: + """Extract all relevant information from a file.""" + return { + 'imports': self.extract_imports(content, file_info['ext']), + 'routes': self.extract_routes(content), + 'symbols': [], # Add symbol extraction logic + 'assignments': [], # For taint analysis + 'function_calls': [], # For call graph + 'returns': [] # For data flow + } +``` + +The extractor will be automatically registered through the `BaseExtractor` inheritance pattern. + +#### Step 2: Create Configuration Parser (Optional) + +If your language has configuration files that need parsing, create a parser in `theauditor/parsers/{language}_parser.py`: + +```python +class {Language}Parser: + def parse_file(self, file_path: Path) -> Dict[str, Any]: + """Parse configuration file and extract security-relevant data.""" + # Parse and return structured data + return parsed_data +``` + +#### Step 3: Add Framework Detection + +Add your language's frameworks to `theauditor/framework_registry.py`: + +```python +# Add to FRAMEWORK_REGISTRY dictionary +"{framework_name}": { + "language": "{language}", + "detection_sources": { + # Package manifest files + "package.{ext}": [ + ["dependencies"], + ["devDependencies"], + ], + # Or for line-based search + "requirements.txt": "line_search", + # Or for content search + "build.file": "content_search", + }, + "package_pattern": "{framework_package_name}", + "import_patterns": ["import {framework}", "from {framework}"], + "file_markers": ["config.{ext}", "app.{ext}"], +} +``` + +#### Step 4: Create Language-Specific Patterns + +Create security patterns for your language in `theauditor/patterns/{language}.yml`: + +Example pattern structure: +```yaml +- name: hardcoded-secret-{language} + pattern: '(api[_-]?key|secret|token|password)\s*=\s*["\'][^"\']+["\']' + severity: critical + category: security + languages: ["{language}"] + description: "Hardcoded secret detected in {Language} code" + cwe: CWE-798 +``` + +#### Step 5: Create AST-Based Rules (Optional but Recommended) + +For complex security patterns, create AST-based rules in `theauditor/rules/{language}/`: + +```python +"""Security rules for {Language} using AST analysis.""" + +from typing import Any, Dict, List + +def find_{vulnerability}_issues(ast_tree: Any, file_path: str) -> List[Dict[str, Any]]: + """Find {vulnerability} issues in {Language} code. 
+ + Args: + ast_tree: Parsed AST from {language}_parser + file_path: Path to the source file + + Returns: + List of findings with standard format + """ + findings = [] + + # Implement AST traversal and pattern detection + for node in walk_ast(ast_tree): + if is_vulnerable_pattern(node): + findings.append({ + 'pattern_name': '{VULNERABILITY}_ISSUE', + 'message': 'Detailed description of the issue', + 'file': file_path, + 'line': node.line, + 'column': node.column, + 'severity': 'high', + 'snippet': extract_snippet(node), + 'category': 'security', + 'match_type': 'ast' + }) + + return findings +``` + +### Extractor Interface Specification + +All language extractors MUST inherit from `BaseExtractor` and implement: + +```python +from theauditor.indexer.extractors import BaseExtractor + +class LanguageExtractor(BaseExtractor): + """Extractor for {Language} files.""" + + def supported_extensions(self) -> List[str]: + """Return list of supported file extensions.""" + return ['.ext'] + + def extract(self, file_info: Dict[str, Any], content: str, + tree: Optional[Any] = None) -> Dict[str, Any]: + """Extract all relevant information from a file.""" + return { + 'imports': [], + 'routes': [], + 'symbols': [], + 'assignments': [], + 'function_calls': [], + 'returns': [] + } +``` + +### Testing Requirements + +#### Required Test Coverage + +1. **Extractor Tests** (`tests/test_{language}_extractor.py`): + - Test extracting from valid files + - Test handling of syntax errors + - Test symbol extraction + - Test import extraction + - Test file extension detection + +2. **Pattern Tests** (`tests/patterns/test_{language}_patterns.py`): + - Test security pattern detection + - Ensure patterns don't over-match (false positives) + +3. **Integration Tests** (`tests/integration/test_{language}_integration.py`): + - Test language in complete analysis pipeline + +#### Test Data + +Create test fixtures in `tests/fixtures/{language}/`: +- `valid_code.{ext}` - Valid code samples +- `vulnerable_code.{ext}` - Code with known vulnerabilities +- `edge_cases.{ext}` - Edge cases and corner scenarios + +### Submission Checklist + +Before submitting your PR, ensure: + +- [ ] Extractor inherits from `BaseExtractor` and implements required methods +- [ ] Extractor placed in `theauditor/indexer/extractors/{language}.py` +- [ ] Framework detection added to `framework_detector.py` (if applicable) +- [ ] At least 10 security patterns created in `patterns/{language}.yml` +- [ ] AST-based rules for complex patterns (if applicable) +- [ ] All tests passing with >80% coverage +- [ ] Documentation updated (extractor docstrings, pattern descriptions) +- [ ] Example vulnerable code provided in test fixtures +- [ ] No external dependencies without approval +- [ ] Code follows project style (run `ruff format`) + +## Adding New Analyzers + +### The Three-Tier Detection Architecture + +TheAuditor uses a hybrid approach to detection, prioritizing accuracy and context. When contributing a new rule, please adhere to the following "AST First, Regex as Fallback" philosophy: + +- **Tier 1: Multi-Language AST Rules (Preferred)** + For complex code patterns in source code (Python, JS/TS, etc.), extend or create a polymorphic AST-based rule in the `/rules` directory. These are the most powerful and accurate and should be the default choice for source code analysis. + +- **Tier 2: Language-Specific AST Rules** + If a multi-language backend is not feasible, a language-specific AST rule is the next best option. 
The corresponding regex pattern should then be scoped to exclude the language covered by the AST rule (see `db_issues.yml` for an example). + +- **Tier 3: Regex Patterns (YAML)** + Regex patterns in `/patterns` should be reserved for: + 1. Simple patterns where an AST is overkill. + 2. Configuration files where no AST parser exists (e.g., `.yml`, `.conf`). + 3. Providing baseline coverage for languages not yet supported by an AST rule. + +TheAuditor uses a modular architecture. To add new analysis capabilities: + +### Database-Aware Rules +For rules that query across multiple files: +```python +# theauditor/rules/category/new_analyzer.py +def find_new_issues(db_path: str) -> List[Dict[str, Any]]: + conn = sqlite3.connect(db_path) + # Query the repo_index.db + # Return findings in standard format +``` + +Example ORM analyzer: +```python +# theauditor/rules/orm/sequelize_detector.py +def find_sequelize_issues(db_path: str) -> List[Dict[str, Any]]: + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + cursor.execute( + "SELECT file, line, query_type, includes FROM orm_queries" + ) + # Analyze for N+1 queries, death queries, etc. +``` + +### AST-Based Rules +For semantic code analysis: +```python +# theauditor/rules/framework/new_detector.py +def find_framework_issues(tree: Any, file_path: str) -> List[Dict[str, Any]]: + # Traverse semantic AST + # Return findings in standard format +``` + +### Pattern-Based Rules +Add YAML patterns to `theauditor/patterns/`: +```yaml +name: insecure_api_key +severity: critical +category: security +pattern: 'api[_-]?key\s*=\s*["\'][^"\']+["\']' +description: "Hardcoded API key detected" +``` + +## Testing + +Write tests for any new functionality: + +```bash +# Run all tests +pytest + +# Run specific test file +pytest tests/test_your_feature.py + +# Run with coverage +pytest --cov=theauditor +``` + +## Documentation + +- Update relevant documentation when making changes +- Add docstrings to new functions and classes +- Update `README.md` if adding new commands or features +- Consider updating `howtouse.md` for user-facing changes + +## Getting Help + +- Check our [TeamSOP](teamsop.md) for our development workflow +- Review [CLAUDE.md](CLAUDE.md) for AI-assisted development guidelines +- Ask questions in GitHub Issues or Discussions +- Join our community chat (if available) + +## License + +By contributing to TheAuditor, you agree that your contributions will be licensed under the same license as the project. + +--- + +We're excited to see your contributions! Whether you're fixing bugs, adding features, or improving documentation, every contribution helps make TheAuditor better for everyone. \ No newline at end of file diff --git a/HOWTOUSE.md b/HOWTOUSE.md new file mode 100644 index 0000000..70afccb --- /dev/null +++ b/HOWTOUSE.md @@ -0,0 +1,1132 @@ +# How to Use TheAuditor + +This comprehensive guide covers everything you need to know about setting up, configuring, and using **TheAuditor** for code analysis and security auditing. Whether you're performing a one-time security audit or integrating continuous analysis into your development workflow, this guide will walk you through every step. 
+
+---
+
+## Prerequisites
+
+Before installing **TheAuditor**, ensure you have:
+
+- **Python 3.11 or higher** (3.12+ recommended)
+- **Git** (for repository operations)
+- **Operating System**: Linux, macOS, or Windows with WSL
+
+---
+
+## Installation & Setup
+
+### Step 1: Install TheAuditor
+
+```bash
+# Clone the repository
+git clone https://github.com/TheAuditorTool/Auditor.git
+cd theauditor
+
+# Install TheAuditor
+pip install -e .
+
+# Optional: Install with ML capabilities
+# pip install -e ".[ml]"
+
+# For development with all optional dependencies (includes the Insights module packages):
+# pip install -e ".[all]"
+```
+
+### Step 2: Sandboxed Toolchain Setup (MANDATORY)
+
+Run this inside the project directory:
+
+```bash
+aud setup-claude --target .
+```
+
+This command:
+- Creates **`.auditor_venv/.theauditor_tools/`** sandbox directory
+- Installs **TypeScript compiler** (`tsc`) in isolation
+- Installs **ESLint** and related tools
+- Updates all tools to latest versions
+- Configures the sandbox for TheAuditor's exclusive use
+
+**Why is this required?**
+- TheAuditor **NEVER** uses your global or project-installed tools
+- Ensures reproducible results across different environments
+- Prevents contamination between analysis tools and project dependencies
+- **Required for TheAuditor to function at all** - not just for JavaScript/TypeScript analysis
+
+**Expected output:**
+```
+Step 1: Setting up Python virtual environment...
+[OK] Venv already exists: C:\Users\user\Desktop\TheAuditor\.auditor_venv
+[OK] TheAuditor already installed in C:\Users\user\Desktop\TheAuditor\.auditor_venv
+  Upgrading to ensure latest version...
+Installing TheAuditor from C:\Users\user\Desktop\TheAuditor...
+[OK] Installed TheAuditor (editable) from C:\Users\user\Desktop\TheAuditor
+[OK] Executable available: C:\Users\user\Desktop\TheAuditor\.auditor_venv\Scripts\aud.exe
+
+Installing Python linting tools...
+  Checking for latest linter versions...
+  [OK] Updated to latest package versions
+  Installing linters from pyproject.toml...
+  [OK] Python linters installed (ruff, mypy, black, bandit, pylint)
+
+Setting up JavaScript/TypeScript tools in sandboxed environment...
+  Creating sandboxed tools directory: C:\Users\user\Desktop\TheAuditor\.auditor_venv\.theauditor_tools
+  [OK] ESLint v9 flat config copied to sandbox
+  [Track A] Checking for latest tool versions...
+  [Track B] Setting up portable Node.js runtime...
+  [OK] Node.js runtime already installed at C:\Users\user\Desktop\TheAuditor\.auditor_venv\.theauditor_tools\node-runtime
+  [OK] Updated @typescript-eslint/parser: 8.41.0 → ^8.42.0
+  [OK] Updated @typescript-eslint/eslint-plugin: 8.41.0 → ^8.42.0
+  Updated 2 packages to latest versions
+  Installing JS/TS linters using bundled Node.js...
+  [OK] JavaScript/TypeScript tools installed in sandbox
+  [OK] Tools isolated from project: C:\Users\user\Desktop\TheAuditor\.auditor_venv\.theauditor_tools
+  [OK] Using bundled Node.js - no system dependency!
+  [OK] ESLint verified at: C:\Users\user\Desktop\TheAuditor\.auditor_venv\.theauditor_tools\node_modules\.bin\eslint.cmd
+```
+
+---
+
+## Core Commands & Workflow
+
+### Complete Audit Pipeline
+
+On a medium 20k LOC node/react/vite stack, expect the analysis to take around 30 minutes.
+Progress bars for tracks B/C may display inconsistently on PowerShell.
+ +Run a comprehensive audit with all **14 analysis phases**: + +```bash +aud full + +# Skip network operations (deps, docs) for faster execution +aud full --offline +``` + +This executes in **parallel stages** for optimal performance: + +**Stage 1 - Foundation (Sequential):** +1. **Repository indexing** - Build manifest and symbol database +2. **Framework detection** - Identify technologies in use + +**Stage 2 - Concurrent Analysis (3 Parallel Tracks):** +- **Track A (Network I/O):** *(skipped with --offline)* + 3. **Dependency checking** - Scan for vulnerabilities + 4. **Documentation fetching** - Gather project docs + 5. **Documentation summarization** - Create AI-friendly summaries +- **Track B (Code Analysis):** + 6. **Workset creation** - Define analysis scope + 7. **Linting** - Run code quality checks + 8. **Pattern detection** - Apply security rules +- **Track C (Graph Build):** + 9. **Graph building** - Construct dependency graph + +**Stage 3 - Final Aggregation (Sequential):** +10. **Graph analysis** - Find architectural issues +11. **Taint analysis** - Track data flow +12. **Factual correlation engine** - Correlate findings across tools with 29 advanced rules +13. **Report generation** - Produce final output + +**Output**: Complete results in **`.pf/readthis/`** directory + +### Offline Mode + +When working on the same codebase repeatedly or when network access is limited, use offline mode to skip dependency checking and documentation phases: + +```bash +# Run full audit without network operations +aud full --offline + +# Combine with other flags +aud full --offline --quiet +aud full --offline --exclude-self # Only meant for dogfooding; in 9/10 projects, --exclude-self will correctly exclude the entire project, producing empty results +``` + +**Benefits:** +- **Faster execution** - Skips slow network operations +- **Air-gapped operation** - Works without internet access +- **Iterative development** - Perfect for repeated runs during development + +**What gets skipped:** +- Dependency vulnerability scanning +- Documentation fetching and summarization +- Latest version checks + +**What still runs:** +- All code analysis (indexing, linting, patterns) +- Graph building and analysis +- Taint analysis and FCE +- Report generation + +### Incremental Analysis (Workset-based) + +Analyze only changed files based on git diff: + +```bash +# Create workset from uncommitted changes +aud workset + +# Create workset from specific commit range +aud workset --diff "HEAD~3..HEAD" + +# Create workset for all files +aud workset --all +``` + +Then run targeted analysis: +```bash +aud lint --workset +aud detect-patterns --workset +``` + +### Linting with Auto-fix + +Run comprehensive linting across all supported languages: + +```bash +# Run linting on workset +aud lint --workset + +# Auto-fix issues where possible +aud lint --fix + +# Run on all files +aud lint --all +``` + +Supports: +- **Python**: **Ruff**, **MyPy**, **Black**, **Bandit**, **Pylint** +- **JavaScript/TypeScript**: **ESLint** with TypeScript parser +- **General**: **Prettier** for formatting + +### Security Analysis + +#### Taint Analysis + +Track data flow from **sources** (user input) to **sinks** (database, output): + +```bash +aud taint-analyze +``` + +Detects: +- **SQL injection** vulnerabilities +- **XSS** (Cross-site scripting) +- **Command injection** +- **Path traversal** +- Other injection attacks + +#### Pattern Detection + +Run pattern-based vulnerability scanning: + +```bash +aud detect-patterns +``` + +Uses **100+ 
YAML-defined patterns** across multiple categories: + +**Security Patterns:** +- Hardcoded secrets and API keys +- Insecure randomness (**Math.random** for security) +- Weak cryptographic algorithms +- Authentication bypasses +- Missing authentication decorators + +**Resource Management:** +- Socket, stream, and worker leaks +- File handles not closed properly +- Database connections left open +- Event listeners not removed + +**Concurrency Issues:** +- **Race conditions** (check-then-act) +- **Deadlocks** (nested locks, lock ordering) +- Shared state without synchronization +- Unsafe parallel writes + +**ORM & Database:** +- **Sequelize** death queries and N+1 patterns +- **Prisma** connection pool exhaustion +- **TypeORM** missing transactions +- Missing database indexes + +**Deployment & Infrastructure:** +- **Docker** security misconfigurations +- **nginx** exposed paths and weak SSL +- **docker-compose** privileged containers +- **webpack** source map exposure in production + +**Framework-Specific:** +- **Django**, **Flask**, **FastAPI** vulnerabilities +- **React** hooks dependency issues +- **Vue** reactivity problems +- **Angular**, **Next.js**, **Express.js** patterns +- Multi-tenant security violations + +### Docker Security Analysis + +Analyze Docker images for security misconfigurations and vulnerabilities: + +```bash +# Analyze all indexed Docker images +aud docker-analyze + +# Filter by severity level +aud docker-analyze --severity critical + +# Save results to JSON file +aud docker-analyze --output docker-security.json +``` + +Detects: +- **Containers running as root** - CIS Docker Benchmark violation +- **Exposed secrets in ENV/ARG** - Hardcoded passwords, API keys, tokens +- **High entropy values** - Potential secrets using Shannon entropy +- **Known secret patterns** - GitHub tokens, AWS keys, Slack tokens + +The command requires Docker images to be indexed first (`aud index`). It queries the `repo_index.db` for Docker metadata and performs security analysis. + +### Project Structure Report + +Generate comprehensive project structure and intelligence reports: + +```bash +# Generate default structure report +aud structure + +# Specify output location +aud structure --output PROJECT_OVERVIEW.md + +# Adjust directory tree depth +aud structure --max-depth 6 + +# Analyze different root directory +aud structure --root ./src +``` + +The report includes: +- **Directory tree visualization** - Smart file grouping and critical file(size/loc) highlighting +- **Project statistics** - Total files, LOC, estimated tokens +- **Language distribution** - Percentage breakdown by file type +- **Top 10 largest files** - By token count with percentage of codebase +- **Top 15 critical files** - Identified by naming conventions (auth.py, config.js, etc.) 
+- **AI context optimization** - Recommendations for reading order and token budget +- **Symbol counts** - Functions, classes, imports from database + +Useful for: +- Getting quick project overview +- Understanding codebase structure +- Planning AI assistant interactions +- Identifying critical components +- Token budget management for LLMs + +### Impact Analysis + +Assess the blast radius of a specific code change: + +```bash +# Analyze impact of changes to a specific function +aud impact --file "src/auth/login.py" --line 42 + +# Analyze impact with depth limit +aud impact --file "src/database.py" --line 100 --depth 3 + +# Trace frontend to backend dependencies +aud impact --file "frontend/api.ts" --line 50 --trace-to-backend +``` + +Shows: +- Dependent functions and modules +- Call chain analysis +- Affected test files +- Risk assessment +- Cross-stack impact (frontend → backend tracing) + +### Refactoring Analysis + +Detect and analyze refactoring issues such as data model changes, API contract mismatches, and incomplete migrations: + +```bash +# Analyze impact from a specific model change +aud refactor --file "models/Product.ts" --line 42 + +# Auto-detect refactoring from database migrations +aud refactor --auto-detect --migration-dir backend/migrations + +# Analyze current workset for refactoring issues +aud refactor --workset + +# Generate detailed report +aud refactor --auto-detect --output refactor_report.json +``` + +Detects: +- **Data Model Changes**: Fields moved between tables (e.g., `product.price` → `variant.price`) +- **Foreign Key Changes**: References updated (e.g., `product_id` → `product_variant_id`) +- **API Contract Mismatches**: Frontend expects old structure, backend provides new +- **Missing Updates**: Code still using old field/table names +- **Cross-Stack Inconsistencies**: TypeScript interfaces not matching backend models + +The refactor command uses: +- Impact analysis to trace affected files +- Migration file analysis to detect schema changes +- Pattern detection with refactoring-specific rules +- FCE correlation to find related issues +- Risk assessment based on blast radius + +### Insights Analysis (Optional) + +Run optional interpretive analysis on top of factual audit data: + +```bash +# Run all insights modules +aud insights --mode all + +# ML-powered insights (requires pip install -e ".[ml]") +aud insights --mode ml --ml-train + +# Graph health metrics and recommendations +aud insights --mode graph + +# Taint vulnerability scoring +aud insights --mode taint + +# Impact analysis insights +aud insights --mode impact + +# Generate comprehensive report +aud insights --output insights_report.json + +# Train ML model on your codebase patterns +aud insights --mode ml --ml-train --training-data .pf/raw/ + +# Get ML-powered suggestions +aud insights --mode ml --ml-suggest +``` + +Modes: +- **ml**: Machine learning predictions and pattern recognition +- **graph**: Health scores, architectural recommendations +- **taint**: Vulnerability severity scoring and classification +- **impact**: Change impact assessment and risk scoring +- **all**: Run all available insights modules + +The insights command: +- Reads existing audit data from `.pf/raw/` +- Applies interpretive scoring and classification +- Generates actionable recommendations +- Outputs to `.pf/insights/` for separation from facts +- Provides technical scoring without crossing into semantic interpretation + +### Graph Visualization + +Generate rich visual intelligence from dependency graphs: + +```bash +# 
Build dependency graphs first +aud graph build + +# Basic visualization +aud graph viz + +# Show only dependency cycles +aud graph viz --view cycles --include-analysis + +# Top 10 hotspots (most connected nodes) +aud graph viz --view hotspots --top-hotspots 10 + +# Architectural layers visualization +aud graph viz --view layers --format svg + +# Impact analysis visualization +aud graph viz --view impact --impact-target "src/auth/login.py" + +# Call graph instead of import graph +aud graph viz --graph-type call --view full + +# Generate SVG for AI analysis +aud graph viz --format svg --include-analysis --title "System Architecture" + +# Custom output location +aud graph viz --out-dir ./architecture/ --format png +``` + +View Modes: +- **full**: Complete graph with all nodes and edges +- **cycles**: Only nodes/edges involved in dependency cycles (red highlighting) +- **hotspots**: Top N most connected nodes with gradient coloring +- **layers**: Architectural layers as subgraphs with clear hierarchy +- **impact**: Highlight impact radius with color-coded upstream/downstream + +Visual Encoding: +- **Node Color**: Programming language (Python=blue, JavaScript=yellow, TypeScript=blue) +- **Node Size**: Importance/connectivity (larger = more dependencies) +- **Edge Color**: Red for cycles, gray for normal dependencies +- **Border Width**: Code churn (thicker = more changes) +- **Node Shape**: Module=box, Function=ellipse, Class=diamond + +The graph viz command: +- Generates Graphviz DOT format files +- Optionally creates SVG/PNG images (requires Graphviz installation) +- Supports filtered views for focusing on specific concerns +- Includes analysis data for cycle and hotspot highlighting +- Produces AI-readable SVG output for LLM analysis + +### Dependency Management + +Check for outdated or vulnerable dependencies: + +```bash +# Check for latest versions +aud deps --check-latest + +# Scan for known vulnerabilities +aud deps --vuln-scan + +# Update all dependencies to latest +aud deps --upgrade-all +``` + +--- + +## Architecture: Truth Courier vs Insights + +### Understanding the Separation of Concerns + +TheAuditor implements a strict architectural separation between **factual observation** (Truth Courier modules) and **optional interpretation** (Insights modules). This design ensures the tool remains an objective source of ground truth while offering actionable intelligence when needed. + +### The Core Philosophy + +TheAuditor doesn't try to understand your business logic or make your AI "smarter." Instead, it solves the real problem: **LLMs lose context and make inconsistent changes across large codebases.** + +The workflow: +1. **You tell AI**: "Add JWT auth with CSRF tokens and password complexity" +2. **AI writes code**: Probably inconsistent due to context limits +3. **You run**: `aud full` +4. **TheAuditor reports**: All the inconsistencies and security holes +5. **AI reads the report**: Now sees the complete picture across all files +6. **AI fixes issues**: With full visibility of what's broken +7. 
**Repeat until clean** + +### Truth Courier Modules (Core) + +These modules report verifiable facts without judgment: + +```python +# What Truth Couriers Report - Just Facts +{ + "taint_analyzer": "Data from req.body flows to res.send at line 45", + "pattern_detector": "Line 45 matches pattern 'unsanitized-output'", + "impact_analyzer": "Changing handleRequest() affects 12 downstream functions", + "graph_analyzer": "Module A imports B, B imports C, C imports A" +} +``` + +**Key Truth Couriers:** +- **Indexer**: Maps all code symbols and their locations +- **Taint Analyzer**: Traces data flow through the application +- **Impact Analyzer**: Maps dependency chains and change blast radius +- **Graph Analyzer**: Detects cycles and architectural patterns +- **Pattern Detector**: Matches code against security patterns + +### Insights Modules (Optional Scoring) + +These optional modules add technical scoring and classification: + +```python +# What Insights Add - Technical Classifications +{ + "taint/insights": { + "vulnerability_type": "Cross-Site Scripting", + "severity": "HIGH" + }, + "graph/insights": { + "health_score": 70, + "recommendation": "Reduce coupling" + } +} +``` + +**Installation:** +```bash +# Base installation (Truth Couriers only) +pip install -e . + +# With ML insights (optional) +pip install -e ".[ml]" + +# Development with all dependencies (not for general users) +# pip install -e ".[all]" +``` + +### Correlation Rules: Detecting YOUR Patterns + +Correlation rules detect when multiple facts indicate an inconsistency in YOUR codebase: + +```yaml +# Example: Detecting incomplete refactoring +- name: "PRODUCT_VARIANT_REFACTOR" + co_occurring_facts: + - tool: "grep" + pattern: "ProductVariant.*retail_price" # Backend changed + - tool: "grep" + pattern: "product\\.unit_price" # Frontend didn't +``` + +This isn't "understanding" that products have prices. It's detecting that you moved a field from one model to another and some code wasn't updated. Pure consistency checking. + +The correlation engine loads rules from `/correlations/rules/`. We provide common patterns, but many are project-specific. You write rules that detect YOUR patterns, YOUR refactorings, YOUR inconsistencies. + +### Why This Works + +**What doesn't work:** +- Making AI "understand" your business domain +- Adding semantic layers to guess what you mean +- Complex context management systems + +**What does work:** +- Accept that AI will make inconsistent changes +- Detect those inconsistencies after the fact +- Give AI the full picture so it can fix them + +TheAuditor doesn't try to prevent mistakes. It finds them so they can be fixed. + +### Practical Example + +```bash +# You ask AI to implement authentication +Human: "Add JWT auth with CSRF protection" + +# AI writes code (probably with issues due to context limits) +AI: *implements auth across 15 files* + +# You audit it +$ aud full + +# TheAuditor finds issues +- "JWT secret hardcoded at auth.js:47" +- "CSRF token generated but never validated" +- "Auth middleware missing on /api/admin/*" + +# You can also check impact of changes +$ aud impact --file "auth.js" --line 47 +# Shows: "Changing this affects 23 files, 47 functions" + +# AI reads the audit and can now see ALL issues +AI: *reads .pf/readthis/* +AI: "I see 5 security issues across auth flow. Fixing..." + +# AI fixes with complete visibility +AI: *fixes all issues because it can see the full picture* +``` + +### Key Points + +1. 
**No Business Logic Understanding**: TheAuditor doesn't need to know what your app does +2. **Just Consistency Checking**: It finds where your code doesn't match itself +3. **Facts, Not Opinions**: Reports what IS, not what SHOULD BE +4. **Complete Dependency Tracing**: Impact analyzer shows exactly what's affected by changes +5. **AI + Audit Loop**: Write → Audit → Fix → Repeat until clean + +This is why TheAuditor works where semantic understanding fails - it's not trying to read your mind, just verify your code's consistency. + +--- + +## Understanding the Output + +### Directory Structure + +After running analyses, results are organized in **`.pf/`**: + +``` +.pf/ +├── raw/ # Raw, unmodified tool outputs (Truth Couriers) +│ ├── linting.json # Raw linter results +│ ├── patterns.json # Pattern detection findings +│ ├── taint_analysis.json # Taint analysis results +│ ├── graph.json # Dependency graph data +│ └── graph_analysis.json # Graph analysis (cycles, hotspots) +│ +├── insights/ # Optional interpretive analysis (Insights modules) +│ ├── ml_suggestions.json # ML predictions and patterns +│ ├── taint_insights.json # Vulnerability severity scoring +│ └── graph_insights.json # Health scores and recommendations +│ +├── readthis/ # AI-consumable chunks +│ ├── manifest.md # Repository overview +│ ├── patterns_001.md # Chunked findings (65KB max) +│ ├── patterns_002.md +│ ├── taint_001.md # Chunked taint results +│ ├── tickets_001.md # Actionable issue tickets +│ └── summary.md # Executive summary +│ +├── graphs/ # Graph visualizations +│ ├── import_graph.dot # Dependency graph DOT file +│ ├── import_graph_cycles.dot # Cycles-only view +│ └── import_graph.svg # SVG visualization (if generated) +│ +├── pipeline.log # Complete execution log +├── error.log # Error details (if failures occur) +├── findings.json # Consolidated findings +├── risk_scores.json # Risk analysis results +└── report.md # Human-readable report +``` + +### Key Output Files + +#### `.pf/raw/` +Contains **unmodified outputs** from each tool. These files preserve the exact format and data from linters, scanners, and analyzers. **Never modified** after creation. This is the source of ground truth. + +#### `.pf/insights/` +Contains **optional interpretive analysis** from Insights modules. These files add technical scoring and classification on top of raw data. Only created when insights commands are run. + +#### `.pf/graphs/` +Contains **graph visualizations** in DOT and image formats. Generated by `aud graph viz` command with various view modes for focusing on specific concerns. + +#### `.pf/readthis/` +Contains processed, **chunked data optimized for AI consumption**: +- Each file is under **65KB** by default (configurable via `THEAUDITOR_LIMITS_MAX_CHUNK_SIZE`) +- Maximum 3 chunks per file by default (configurable via `THEAUDITOR_LIMITS_MAX_CHUNKS_PER_FILE`) +- Structured with clear headers and sections +- Includes context, evidence, and suggested fixes +- Ready for direct consumption by **Claude**, **GPT-4**, etc. + +#### `.pf/pipeline.log` +Complete execution log showing: +- Each phase's **execution time** +- **Success/failure** status +- Key statistics and findings +- Error messages if any + +#### `.pf/error.log` +Created only when errors occur. 
Contains: +- Full **stack traces** +- Detailed error messages +- Phase-specific failure information +- Debugging information + +--- + +## Advanced Usage + +### Custom Pattern Rules + +Create custom detection patterns in **`.pf/patterns/`**: + +```yaml +# .pf/patterns/custom_auth.yaml +name: weak_password_check +severity: high +category: security +pattern: 'password\s*==\s*["\']' +description: "Hardcoded password comparison" +test_template: | + def test_weak_password(): + assert password != "admin" +``` + +### ML-Powered Suggestions + +Train models on your codebase patterns: + +```bash +# Initial training +aud learn + +# Get improvement suggestions +aud suggest + +# Provide feedback for continuous learning +aud learn-feedback --accept +``` + +### Development-Specific Flags + +#### Excluding TheAuditor's Own Files + +When testing or developing within TheAuditor's repository (e.g., analyzing `fakeproj/project_anarchy/`), use the `--exclude-self` flag to prevent false positives from TheAuditor's own files: + +```bash +# Exclude all TheAuditor files from analysis +aud index --exclude-self +aud full --exclude-self +``` + +This flag excludes: +- All TheAuditor source code directories (`theauditor/`, `tests/`, etc.) +- Root configuration files (`pyproject.toml`, `package-template.json`, `Dockerfile`) +- Documentation and build files + +**Use case:** Testing vulnerable projects within TheAuditor's repository without framework detection picking up TheAuditor's own configuration files. + +### CI/CD Integration + +#### GitHub Actions Example + +```yaml +name: Security Audit +on: [push, pull_request] + +jobs: + audit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.12' + + - name: Set up Node.js + uses: actions/setup-node@v2 + with: + node-version: '18' + + - name: Install TheAuditor + run: | + pip install -e ".[all]" + aud setup-claude --target . + + - name: Run Audit + run: aud full + + - name: Upload Results + if: always() + uses: actions/upload-artifact@v2 + with: + name: audit-results + path: .pf/ +``` + +### Running TheAuditor on Its Own Codebase (Dogfooding) + +When developing TheAuditor or testing it on itself, you need a special dual-environment setup: + +#### Understanding the Dual-Environment Architecture + +TheAuditor maintains strict separation between: +1. **Primary Environment** (`.venv/`) - Where TheAuditor runs from +2. **Sandboxed Environment** (`.auditor_venv/.theauditor_tools/`) - Tools TheAuditor uses for analysis + +This ensures reproducibility and prevents TheAuditor from analyzing its own analysis tools. + +#### Setup Procedure for Dogfooding + +```bash +# 1. Clone and set up development environment +git clone https://github.com/TheAuditorTool/Auditor.git +cd theauditor +python -m venv .venv +source .venv/bin/activate # On Windows: .venv\Scripts\activate +pip install -e . + +# 2. CRITICAL: Create the sandboxed analysis environment +aud setup-claude --target . + +# 3. Verify setup +aud full --quick-test + +# 4. 
Run full analysis on TheAuditor itself +aud full +``` + +#### Analyzing Test Projects Within TheAuditor + +When analyzing test projects like `fakeproj/` from within TheAuditor's repository: + +```bash +cd fakeproj/project_anarchy +aud full --exclude-self # Excludes TheAuditor's own files +``` + +The `--exclude-self` flag prevents: +- Framework detection from identifying TheAuditor's `pyproject.toml` +- False positives from TheAuditor's configuration files +- Contamination from TheAuditor's source code + +--- + +## Refactoring Detection + +TheAuditor includes sophisticated capabilities for detecting incomplete refactorings, data model changes, and cross-stack inconsistencies. + +### Understanding Refactoring Issues + +Common refactoring problems TheAuditor detects: + +1. **Data Model Evolution** - Fields moved between models (e.g., `product.price` → `variant.price`) +2. **Foreign Key Changes** - References updated in database but not in code +3. **API Contract Mismatches** - Frontend expects old structure, backend provides new +4. **Cross-Stack Inconsistencies** - TypeScript interfaces not matching backend models +5. **Incomplete Migrations** - Some code still using old field/table names + +### How Refactoring Detection Works + +TheAuditor uses multiple techniques: + +#### Migration Analysis +Analyzes database migration files to understand schema changes: +```javascript +// Migration detected: Field moved from products to product_variants +removeColumn('products', 'unit_price'); +addColumn('product_variants', 'retail_price', DataTypes.DECIMAL); +``` + +#### Impact Analysis +Traces dependencies to find all affected code: +```bash +aud impact --file "models/Product.ts" --line 42 +# Shows: 47 files need updating +``` + +#### Pattern Detection +Over 30 refactoring-specific patterns detect common issues: +```yaml +- name: "PRODUCT_PRICE_FIELD_REMOVED" + description: "Code accessing price on Product after migration to ProductVariant" +``` + +#### Cross-Stack Tracing +Matches frontend API calls to backend endpoints to detect contract mismatches. + +### Using Refactoring Detection + +#### Quick Detection +```bash +# Auto-detect from migrations +aud refactor --auto-detect + +# Analyze specific change +aud refactor --file "models/Product.ts" --line 42 + +# Use with workset +aud refactor --workset + +# Generate detailed report +aud refactor --auto-detect --output refactor_report.json +``` + +#### Best Practices for Refactoring + +**Before Refactoring:** +1. Run impact analysis: `aud impact --file "model.ts" --line 42` +2. Create workset: `aud workset --from-impact` +3. Baseline analysis: `aud refactor --workset` + +**During Refactoring:** +- Run incremental checks: `aud refactor --workset` +- Validate cross-stack: `aud impact --trace-to-backend` + +**After Refactoring:** +- Full validation: `aud unified --mode refactor` +- Generate report: `aud report --format refactoring` + +### Real-World Example + +A product variant refactoring might be detected as: + +``` +PRODUCT_PRICE_FIELD_REMOVED +- Frontend: 23 files accessing product.unit_price +- Backend: Field moved to ProductVariant.retail_price +- Impact: POS system cannot display prices + +ORDER_ITEMS_WRONG_REFERENCE +- Database: order_items.product_variant_id (new) +- Code: Still using order_items.product_id (old) +- Impact: Orders cannot be created +``` + +### Custom Refactoring Rules + +TheAuditor uses YAML-based correlation rules to detect refactoring issues. 
These rules are YOUR business logic - you define what patterns indicate problems in YOUR codebase. + +#### How It Works + +1. **Rules Location**: `/theauditor/correlations/rules/refactoring.yaml` +2. **Rule Structure**: Each rule defines co-occurring facts that must ALL match +3. **Detection**: When all facts match, TheAuditor reports the issue +4. **No Code Changes**: Just edit YAML to define new patterns + +#### Creating Your Own Rules + +Edit `/theauditor/correlations/rules/refactoring.yaml` or create new YAML files: + +```yaml +rules: + - name: "MY_FIELD_MIGRATION" + description: "Detect when price field moved but old code remains" + co_occurring_facts: + - tool: "grep" + pattern: "removeColumn.*price" # Migration removed field + - tool: "grep" + pattern: "product\\.price" # Code still uses old field + confidence: 0.92 + + - name: "API_VERSION_MISMATCH" + description: "Frontend calling v1 API but backend is v2" + co_occurring_facts: + - tool: "grep" + pattern: "/api/v1/" # Frontend uses v1 + - tool: "grep" + pattern: "router.*'/v2/'" # Backend only has v2 + confidence: 0.95 +``` + +#### Available Tools for Facts + +- **grep**: Pattern matching in files +- **patterns**: Matches from pattern detection +- **taint_analyzer**: Taint flow findings +- **lint**: Linter findings + +#### Real Example from Production + +```yaml +- name: "PRODUCT_VARIANT_REFACTOR" + description: "Product fields moved to ProductVariant but frontend still uses old structure" + co_occurring_facts: + - tool: "grep" + pattern: "ProductVariant.*retail_price.*Sequelize" # Backend changed + - tool: "grep" + pattern: "product\\.unit_price|product\\.retail_price" # Frontend didn't + confidence: 0.92 +``` + +This detects when you moved price fields from Product to ProductVariant model but frontend still expects the old structure. + +--- + +## Troubleshooting + +### Common Issues + +#### "TypeScript compiler not available in TheAuditor sandbox" + +**Solution**: Run **`aud setup-claude --target .`** to set up the sandbox. + +#### "Coverage < 90% - run `aud capsules` first" + +**Solution**: Generate code capsules for better analysis coverage: +```bash +aud index +aud workset --all +``` + +#### Linting produces no results + +**Solution**: Ensure linters are installed: +```bash +# For Python +pip install -e ".[linters]" + +# For JavaScript/TypeScript +aud setup-claude --target . 
+```
+
+#### Pipeline fails at specific phase
+
+**Solution**: Check **`.pf/error.log`** for details:
+```bash
+cat .pf/error.log
+# Or check a phase-specific error log
+cat .pf/error_phase_08.log
+```
+
+### Performance Optimization
+
+For large repositories:
+
+```bash
+# Limit analysis scope
+aud workset --paths "src/critical/**/*.py"
+
+# Skip documentation phases
+aud full --skip-docs
+
+# Run specific phases only
+aud index && aud lint && aud detect-patterns
+
+# Adjust chunking for larger context windows
+export THEAUDITOR_LIMITS_MAX_CHUNK_SIZE=100000  # 100KB chunks
+export THEAUDITOR_LIMITS_MAX_CHUNKS_PER_FILE=5  # Allow up to 5 chunks
+```
+
+### Runtime Configuration
+
+TheAuditor supports environment variable overrides for runtime configuration:
+
+```bash
+# Chunking configuration
+export THEAUDITOR_LIMITS_MAX_CHUNKS_PER_FILE=5   # Default: 3
+export THEAUDITOR_LIMITS_MAX_CHUNK_SIZE=100000   # Default: 65000 (bytes)
+
+# File size limits
+export THEAUDITOR_LIMITS_MAX_FILE_SIZE=5242880   # Default: 2097152 (2MB)
+
+# Timeout configuration
+export THEAUDITOR_TIMEOUTS_LINT_TIMEOUT=600      # Default: 300 (seconds)
+export THEAUDITOR_TIMEOUTS_FCE_TIMEOUT=1200      # Default: 600 (seconds)
+
+# Batch processing
+export THEAUDITOR_LIMITS_DEFAULT_BATCH_SIZE=500  # Default: 200
+```
+
+Configuration can also be set via `.pf/config.json` for project-specific overrides.
+
+---
+
+## Best Practices
+
+1. **Always run `aud init` first** in a new project
+2. **Set up the sandbox** with **`aud setup-claude --target .`** (required for every project, not just JavaScript/TypeScript)
+3. **Use worksets** for incremental analysis during development
+4. **Run `aud full`** before releases for comprehensive analysis
+5. **Review `.pf/readthis/`** for AI-friendly issue summaries
+6. **Check exit codes** in CI/CD for automated pass/fail decisions
+7. **Archive results** with timestamps for audit trails
+
+---
+
+## Exit Codes for Automation
+
+**TheAuditor** uses specific exit codes for CI/CD integration:
+
+- **`0`** - Success, no critical/high issues
+- **`1`** - High severity findings
+- **`2`** - Critical severity findings
+- **`3`** - Pipeline/task incomplete
+
+Use these in scripts:
+```bash
+aud full
+if [ $? -eq 2 ]; then
+  echo "Critical vulnerabilities found - blocking deployment"
+  exit 1
+fi
+```
+
+---
+
+## Getting Help
+
+- Run **`aud --help`** for a command overview
+- Run **`aud <command> --help`** for help on a specific command
+- Check **`.pf/pipeline.log`** for execution details
+- Review **`.pf/error.log`** for troubleshooting
+- Refer to **`teamsop.md`** for development workflow
+
+---
+
+## Next Steps
+
+1. Initialize your first project with **`aud init`**
+2. Run **`aud full`** to see TheAuditor in action
+3. Explore the results in **`.pf/readthis/`**
+4. Integrate into your CI/CD pipeline
+5. Customize patterns for your specific needs
+
+---
+
+**Remember**: TheAuditor is designed to work **offline**, maintain **data integrity**, and produce **AI-ready outputs**. All analysis is **deterministic** and **reproducible**.
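+
+One closing example for automation: a minimal sketch combining the exit codes above with the archive-results best practice. The script, paths, and archive layout are illustrative assumptions rather than anything TheAuditor ships:
+
+```python
+import shutil
+import subprocess
+import sys
+from datetime import datetime
+
+# Run the full audit; the return code follows the exit-code table above.
+result = subprocess.run(["aud", "full"], check=False)
+
+# Archive the .pf/ output with a timestamp for the audit trail (Best Practice 7); path is illustrative.
+stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+shutil.copytree(".pf", f"audit-archive/{stamp}", dirs_exist_ok=True)
+
+if result.returncode == 2:
+    print("Critical vulnerabilities found - blocking deployment")
+    sys.exit(1)
+elif result.returncode == 3:
+    print("Audit pipeline incomplete - check .pf/pipeline.log and .pf/error.log")
+    sys.exit(1)
+sys.exit(0)
+```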
\ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d94fa60 --- /dev/null +++ b/LICENSE @@ -0,0 +1,687 @@ +GNU AFFERO GENERAL PUBLIC LICENSE +Version 3, 19 November 2007 + +Copyright (C) 2024-2025 TheAuditor Team + +For commercial licensing inquiries, please contact via GitHub: +https://github.com/TheAuditorTool/Auditor + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published +by the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . + +The complete text of the GNU Affero General Public License version 3 +can be found at: https://www.gnu.org/licenses/agpl-3.0.txt + + + + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. 
Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. 
A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. 
+ + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +. diff --git a/README.md b/README.md new file mode 100644 index 0000000..5125912 --- /dev/null +++ b/README.md @@ -0,0 +1,313 @@ +Personal note from me: +Its taken me over a week just to get the courage to upload this. Ive never coded a single line of this, I cant stress that enough... Yes, I build architecture, infrastructure all the things that made the code and components come out this way but uggh… the potential shame and humiliation is real lol... So don't be a dick and poop on my parade... 
I've done my best... Take it or leave it...
+
+It's become a complex, advanced monster of a system that is honestly clean af but hard to get an overview of anymore.
+It isn't unlikely that you'll find oddities such as finished components that were never wired up or exposed in the pipeline...
+I'm doing my best here, I'm only one person with one brain lol... :P
+
+### The Search for Ground Truth in an Age of AI
+
+My background is in systems architecture/infrastructure, not professional software development. I have only been "coding/developing" for a little over 3 months. This gives me a unique perspective: I can see the forest, but I'm blind to the individual trees of the code. After immersing myself for 500+ hours in AI-assisted development, I concluded that the entire ecosystem is built on a fundamentally flawed premise: it lacks a source of **ground truth**.
+
+From start to launch on GitHub took me about a month, across 250 active hours in front of the computer, for anyone who wonders or cares :P
+---
+
+### The Problem: A Cascade of Corrupted Context
+
+Most AI development tools try to solve the wrong problem. They focus on perfecting the *input*—better prompts, more context—but they ignore the critical issue of **compounding deviation**.
+
+An LLM is a powerful statistical engine, but it doesn't *understand*. The modern AI workflow forces this engine to play a high-stakes game of "telephone," where the original intent is corrupted at every step:
+
+1. A human has an idea.
+2. An AI refines it into a prompt.
+3. Other tools add their own interpretive layers.
+4. The primary AI assistant (e.g., Claude Opus) interprets the final, distorted prompt to generate code.
+
+As a rookie "developer," the only thing I could trust was the raw output: the code and its errors. In a vacuum of deep programming knowledge, these facts were my only anchors.
+
+This architectural flaw is amplified by two dangerous behaviours inherent to AI assistants:
+
+* **Security Theater**: AI assistants are optimized to "make it work," which often means introducing rampant security anti-patterns like hardcoded credentials, disabled authentication, and the pervasive use of `as any` in TypeScript. This creates a dangerous illusion of progress.
+* **Context Blindness**: With aggressive context compaction, an AI never sees the full picture. It works with fleeting snapshots of code, forcing it to make assumptions instead of decisions based on facts.
+
+---
+
+### The Solution: `TheAuditor`
+
+`TheAuditor` is the antidote. It was built to stop "vibe coding" your way into security and quality assurance nightmares. Its mission is to provide an incorruptible source of **ground truth** for both the developer and their AI assistant.
+
+Its philosophy is a direct rejection of the current trend:
+
+* **It Orchestrates Verifiable Data.** The tool runs a suite of industry-standard linters and security scanners, preserving the raw, unfiltered output from each. It does not summarize or interpret this core data.
+* **It's Built for AI Consumption.** The tool's primary engineering challenge is to adapt this raw truth into structured, AI-digestible chunks. It ensures the AI works with facts, not faulty summaries.
+* **It's Focused and Extensible.** The initial focus is on Python and the Node.js ecosystem, but the modular, pattern-based architecture is designed to invite contributions for other languages and frameworks.
+
+`TheAuditor` is not a replacement for a formal third-party audit.
It is an engineering tool designed to catch the vast majority of glaring issues—from the OWASP Top 10 to common framework anti-patterns. **Its core commitment is to never cross the line from verifiable truth into semantic interpretation.** + + Every AI assistant - Claude Code, Cursor, Windsurf, Copilot - they're all blind. They can write code but can't + verify it's secure, correct, or complete. TheAuditor gives them eyes. + + Why This Matters + + 1. Tool Agnostic - Works with ANY AI assistant or IDE + - aud full from any terminal + - Results in .pf/readthis/ ready for any LLM + 2. AI Becomes Self-Correcting + - AI writes code + - AI runs aud full + - AI reads the ground truth + - AI fixes its own mistakes + - Recursive loop until actually correct + 3. No Human Intervention Required + - You never touch the terminal + - The AI runs everything + - You just review and approve + + The Genius Architecture + + Human: "Add authentication to my app" + ↓ + AI: *writes auth code* + ↓ + AI: `aud full` + ↓ + AI: *reads .pf/readthis/* + ↓ + AI: "Found 3 security issues, fixing..." + ↓ + AI: *fixes issues* + ↓ + AI: `aud full` + ↓ + AI: "Clean. Authentication complete." + + Market Reality Check + + Every developer using AI assistants has this problem: + - AI writes insecure code + - AI introduces bugs + - AI doesn't see the full picture + - AI can't verify its work + + TheAuditor solves ALL of this. It's not a "nice to have" - it's the missing piece that makes AI development + actually trustworthy. + + I've built the tool that makes AI assistants production-ready. + This isn't competing with SonarQube/SemGrep. This is creating an entirely new category: AI Development Verification + Tools. + +--- + +### Important: Antivirus Software Interaction + +#### Why TheAuditor Triggers Antivirus Software + +TheAuditor is a security scanner that identifies vulnerabilities in your code. By its very nature, it must: + +1. **Read and analyze security vulnerabilities** - SQL injection, XSS attacks, hardcoded passwords +2. **Write these findings to disk** - Creating reports with exact code snippets as evidence +3. **Process files rapidly** - Scanning entire codebases in parallel for efficiency + +This creates an inherent conflict with antivirus software, which sees these exact same behaviours as potentially malicious. When TheAuditor finds and documents a SQL injection vulnerability in your code, your antivirus sees us writing "malicious SQL injection patterns" to disk - because that's literally what we're doing, just for legitimate security analysis purposes. + +#### Performance Impact You May Experience + +When running TheAuditor, you may notice: + +- **Increased antivirus CPU usage** - Your AV will scan every file we read AND every finding we write +- **Approximately 10-50% performance reduction, depending on software.** - Both TheAuditor and your AV are reading the same files simultaneously +- **Occasional delays or pauses** - Your AV may temporarily quarantine our output files for deeper inspection + +This is not a bug or inefficiency in TheAuditor - it's the unavoidable consequence of two security tools doing their jobs simultaneously. + +#### Our Stance on Antivirus + +**We do NOT recommend:** +- ❌ Disabling your antivirus software +- ❌ Adding TheAuditor to your exclusion/whitelist +- ❌ Reducing your system's security in any way + +Your antivirus is correctly identifying that we're writing security vulnerability patterns to disk. That's exactly what we do - we find vulnerabilities and document them. 
The fact that your AV is suspicious of this behavior means it's working properly.
+
+#### What We've Done to Minimize Impact
+
+1. **Intelligent resource management** - We automatically reduce parallel workers when system resources are constrained
+2. **Pattern defanging** - We insert invisible characters into dangerous patterns to reduce false positives
+3. **Adaptive performance** - We monitor CPU and RAM usage to avoid overwhelming your system
+
+#### The Industry Reality
+
+This is not a problem unique to TheAuditor. Every legitimate security scanner faces this same issue:
+- **GitHub Advanced Security** runs in isolated cloud containers to avoid this
+- **Commercial SAST tools** require enterprise AV exceptions
+- **Popular scanners** explicitly document AV conflicts in their installation guides
+
+The fundamental paradox: a tool that finds security vulnerabilities must write those vulnerabilities to disk, which makes it indistinguishable from malware to an antivirus. There is no technical solution to this - it's the inherent nature of security analysis tools.
+
+#### What This Means for You
+
+- Run TheAuditor when system load is low for best performance
+- Expect the analysis to take longer than the raw processing time due to AV overhead
+- If your AV quarantines output files in `.pf/`, you may need to restore them manually
+- Consider running TheAuditor in a controlled environment if performance is critical
+
+We believe in complete transparency about these limitations. This interaction with antivirus software is not a flaw in TheAuditor - it's proof that both your AV and our scanner are doing exactly what they're designed to do: identify and handle potentially dangerous code patterns.
+
+---
+
+# TheAuditor
+
+Offline-First, AI-Centric SAST & Code Intelligence Platform
+
+## What TheAuditor Does
+
+TheAuditor is a comprehensive code analysis platform that:
+
+- **Finds Security Vulnerabilities**: Detects OWASP Top 10, injection attacks, authentication issues, and framework-specific vulnerabilities
+- **Tracks Data Flow**: Follows untrusted data from sources to sinks to identify injection points
+- **Analyzes Architecture**: Builds dependency graphs, detects cycles, and measures code complexity
+- **Detects Refactoring Issues**: Identifies incomplete migrations, API contract mismatches, and cross-stack inconsistencies
+- **Runs Industry-Standard Tools**: Orchestrates ESLint, Ruff, MyPy, and other trusted linters
+- **Produces AI-Ready Reports**: Generates chunked, structured output optimized for LLM consumption
+
+Unlike traditional SAST tools, TheAuditor is designed specifically for AI-assisted development workflows, providing ground truth that both developers and AI assistants can trust.
+
+## Quick Start
+
+```bash
+# Install TheAuditor
+pip install -e .
+
+# MANDATORY: Set up the TheAuditor environment (required for all functionality).
+# This installs .auditor_venv into the project you want to analyze.
+aud setup-claude --target .
+
+# Initialize your project
+aud init
+
+# Run comprehensive analysis
+aud full
+
+# Check results
+ls .pf/readthis/
+```
+
+That's it! TheAuditor will analyze your codebase and generate AI-ready reports in `.pf/readthis/`.
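+
+If you prefer to script the loop yourself rather than have an AI assistant read the reports directly, the chunks can also be consumed programmatically. The sketch below is illustrative only: the `.pf/readthis/` path comes from the Quick Start above, but the file layout and any structure inside the chunks are assumptions here, not a documented API.
+
+```python
+from pathlib import Path
+
+# Hypothetical helper: collect TheAuditor's chunked reports so each piece can be
+# handed to an LLM in order. Only the directory name is taken from the Quick Start;
+# the ordering and encoding handling below are assumptions for illustration.
+READTHIS_DIR = Path(".pf/readthis")
+
+def load_report_chunks(report_dir: Path = READTHIS_DIR) -> list[str]:
+    """Return report chunks in a stable order, one string per file."""
+    if not report_dir.is_dir():
+        raise FileNotFoundError(f"{report_dir} not found - run `aud full` first")
+    return [
+        chunk.read_text(encoding="utf-8", errors="replace")
+        for chunk in sorted(report_dir.iterdir())
+        if chunk.is_file()
+    ]
+
+if __name__ == "__main__":
+    for i, text in enumerate(load_report_chunks(), start=1):
+        # In a real workflow, each chunk would be passed to the AI assistant verbatim.
+        print(f"chunk {i}: {len(text)} characters")
+```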
+ + +## Documentation + +- **[How to Use](HOWTOUSE.md)** - Complete installation and usage guide +- **[Architecture](ARCHITECTURE.md)** - Technical architecture and design patterns +- **[Contributing](CONTRIBUTING.md)** - How to contribute to TheAuditor +- **[Roadmap](ROADMAP.md)** - Future development plans + +## Key Features + +### Refactoring Detection & Analysis + +TheAuditor detects incomplete refactorings and cross-stack inconsistencies using correlation rules: + +```bash +# Analyze refactoring impact +aud refactor --file models/Product.ts --line 42 + +# Auto-detect from migrations +aud refactor --auto-detect + +# Analyze workset +aud refactor --workset --output refactor_report.json +``` + +Detects: +- **Data Model Changes**: Fields moved between tables +- **API Contract Mismatches**: Frontend/backend inconsistencies +- **Foreign Key Updates**: Incomplete reference changes +- **Cross-Stack Issues**: TypeScript interfaces not matching models + +Users define custom rules in `/correlations/rules/`, example provided in refactoring.yaml to detect project-specific patterns. + +### Dependency Graph Visualization + +TheAuditor now includes rich visual intelligence for dependency graphs using Graphviz: + +- **Multiple View Modes**: Full graph, cycles-only, hotspots, architectural layers, impact analysis +- **Visual Intelligence Encoding**: + - Node colors indicate programming language (Python=blue, JS=yellow, TypeScript=blue) + - Node size shows importance based on connectivity + - Red highlighting for dependency cycles + - Border thickness encodes code churn +- **Actionable Insights**: Focus on what matters with filtered views +- **AI-Readable Output**: Generate SVG visualizations that LLMs can analyze + +```bash +# Basic visualization +aud graph viz + +# Show only dependency cycles +aud graph viz --view cycles --include-analysis + +# Top 5 hotspots with connections +aud graph viz --view hotspots --top-hotspots 5 + +# Architectural layers visualization +aud graph viz --view layers --format svg + +# Impact analysis for a specific file +aud graph viz --view impact --impact-target "src/auth.py" +``` + +### Insights Analysis (Optional) + +Separate from the core Truth Courier modules, TheAuditor offers optional Insights for technical scoring: + +```bash +# Run insights analysis on existing audit data +aud insights --mode all + +# ML-powered insights (requires: pip install -e ".[ml]") +aud insights --mode ml --ml-train + +# Graph health metrics and recommendations +aud insights --mode graph + +# Generate comprehensive insights report +aud insights --output insights_report.json +``` + +Insights modules add interpretive scoring on top of factual data: +- **Health Scores**: Architecture quality metrics +- **Severity Classification**: Risk assessment beyond raw findings +- **Recommendations**: Actionable improvement suggestions +- **ML Predictions**: Pattern-based issue prediction + +## Contributing + +We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for: +- How to add new language support +- Creating security patterns +- Adding framework-specific rules +- Development guidelines + +We especially need help with: +- **GraphQL** analysis +- **Java/Spring** support +- **Go** patterns +- **Ruby on Rails** detection +- **C#/.NET** analysis + +## License + +AGPL-3.0 + +## Commercial Licensing + +TheAuditor is AGPL-3.0 licensed. For commercial use, SaaS deployment, or integration into proprietary systems, please contact via GitHub for licensing options. 
+
+## Support
+
+For issues, questions, or feature requests, please open an issue on our [GitHub repository](https://github.com/TheAuditorTool/Auditor).
+
+---
+
+*TheAuditor: Bringing ground truth to AI-assisted development*
\ No newline at end of file
diff --git a/ROADMAP.md b/ROADMAP.md
new file mode 100644
index 0000000..418833a
--- /dev/null
+++ b/ROADMAP.md
@@ -0,0 +1,71 @@
+# TheAuditor Project Roadmap
+
+TheAuditor's mission is to provide an incorruptible source of ground truth for AI-assisted development. This roadmap outlines our vision for evolving the platform while maintaining our commitment to verifiable, uninterpreted data that both developers and AI assistants can trust.
+
+## Guiding Principles
+
+All future development must adhere to these architectural rules:
+
+* **Never Interpret Truth**: TheAuditor preserves raw, verifiable data from industry-standard tools. We orchestrate and structure, but never summarize or interpret the core evidence.
+* **AI-First Output**: All new reports and findings must be structured for LLM consumption, with outputs chunked to fit context windows and formatted for machine parsing.
+* **Industry-Standard Tooling**: We prioritize integrating battle-tested, widely-adopted tools over building custom analyzers. The community trusts ESLint, Ruff, and similar tools—we leverage that trust.
+* **Offline-First Operation**: All analysis must run without network access, ensuring data privacy and reproducible results.
+* **Sandboxed Execution**: Analysis tools remain isolated from project dependencies to prevent cross-contamination and ensure consistent results.
+
+## Development Priorities
+
+### Tier 1: Core Engine Enhancements (Maintained by TheAuditorTool)
+
+These are our primary focus areas where we will lead development:
+
+* **Improve & Expand Existing Components**: Enhance current extractors (Python, JavaScript/TypeScript), expand pattern coverage beyond basic regex, add more AST-based rules for deeper semantic analysis, and improve parser accuracy for configuration files
+* **Performance Improvements**: Optimize analysis speed for large codebases, improve parallel processing, and reduce memory footprint during graph analysis
+* **Deeper Taint Analysis**: Enhance data-flow tracking to detect more complex injection patterns, improve inter-procedural analysis, and add support for asynchronous code flows
+* **Advanced Pattern Detection**: Expand YAML-based rule engine capabilities, add support for semantic patterns beyond regex, and improve cross-file correlation
+* **Improved AI Output Formatting**: Optimize chunk generation for newer LLM context windows, add structured output formats (JSON-LD), and enhance evidence presentation
+* **FCE Optimization**: Carefully push the FCE (Factual Correlation Engine) toward slightly more "actionable grouping intelligence" behaviour. It's a tricky balance to strike without falling into endless error mapping, guessing, or interpretation...
+
+### Tier 2: Expanding Coverage (Community Contributions Welcome)
+
+We actively seek community expertise to expand TheAuditor's capabilities in these areas:
+
+* **GraphQL Support**: Add comprehensive GraphQL schema analysis, query complexity detection, and authorization pattern verification
+
+* **Framework-Specific Rules** (Currently Limited to Basic Regex Patterns):
+
+  **Note**: We currently have very basic framework detection (outside the Python/Node ecosystem) and minimal framework-specific patterns. Most are simple regex patterns in `/patterns` with no real AST-based rules in `/rules`.
The architecture supports expansion, but substantial work is needed: + + * Django: Enhanced ORM analysis, middleware security patterns, template injection detection + * Ruby on Rails: ActiveRecord anti-patterns, authentication bypass detection, mass assignment vulnerabilities + * Angular: Dependency injection issues, template security, change detection problems + * Laravel: Eloquent ORM patterns, blade template security, middleware analysis + * Spring Boot: Bean configuration issues, security annotations, JPA query analysis + * Next.js: Server-side rendering security, API route protection, data fetching patterns + * FastAPI: Pydantic validation gaps, dependency injection security, async patterns + * Express.js: Middleware ordering issues, CORS misconfigurations, session handling + +* **Language Support Expansion** (Top 10 Languages Outside Python/Node Ecosystem): + + **Current State**: Full support for Python and JavaScript/TypeScript only. The modular architecture supports adding new languages via extractors, but each requires significant implementation effort: + + 1. **Java**: JVM bytecode analysis, Spring/Spring Boot integration, Maven/Gradle dependency scanning, Android-specific patterns + 2. **C#**: .NET CLR analysis, ASP.NET Core patterns, Entity Framework queries, NuGet vulnerability scanning + 3. **Go**: Goroutine leak detection, error handling patterns, module security analysis, interface compliance + 4. **Rust**: Unsafe block analysis, lifetime/borrow checker integration, cargo dependency scanning, memory safety patterns + 5. **PHP**: Composer dependency analysis, Laravel/Symfony patterns, SQL injection detection, legacy code patterns + 6. **Ruby**: Gem vulnerability scanning, Rails-specific patterns, metaprogramming analysis, DSL parsing + 7. **Swift**: iOS security patterns, memory management issues, Objective-C interop, CocoaPods scanning + 8. **Kotlin**: Coroutine analysis, null safety violations, Android-specific patterns, Gradle integration + 9. **C/C++**: Memory safety issues, buffer overflow detection, undefined behavior patterns, CMake/Make analysis + 10. **Scala**: Akka actor patterns, implicit resolution issues, SBT dependency analysis, functional pattern detection + +### Tier 3: Docs sync ### + +Its a nightmare keeping track of everything and "AI compilations" never reflect the actual code, its surface level guessing, at best :( + +## Conclusion + +TheAuditor's strength lies in its unwavering commitment to ground truth. Whether you're interested in performance optimization, security analysis, or framework support, we welcome contributions that align with our core principles. + +Join the discussion on [GitHub Issues](https://github.com/TheAuditorTool/Auditor/issues) to share ideas, report bugs, or propose enhancements. Ready to contribute? See our [CONTRIBUTING.md](CONTRIBUTING.md) for detailed setup instructions and development guidelines. + diff --git a/agent_templates/generic-template.md b/agent_templates/generic-template.md new file mode 100644 index 0000000..128fa8d --- /dev/null +++ b/agent_templates/generic-template.md @@ -0,0 +1,30 @@ +--- +name: {AGENT_NAME} +description: {AGENT_DESC} +tools: Bash, Glob, Grep, LS, Read, Edit, WebFetch, TodoWrite, WebSearch, BashOutput, KillBash +model: opus +color: blue +--- + +# {AGENT_NAME} + +{AGENT_DESC} + +## Core Responsibilities + +{AGENT_BODY} + +## Working Directory + +You operate from the project root directory. 
+ +## Key Commands + +When using project tools, always use the project-local wrapper: +- Use `{PROJECT_AUD}` instead of `aud` + +## Communication Style + +- Be concise and focused +- Report findings clearly +- Suggest actionable next steps \ No newline at end of file diff --git a/agent_templates/sopmanager.md b/agent_templates/sopmanager.md new file mode 100644 index 0000000..f187033 --- /dev/null +++ b/agent_templates/sopmanager.md @@ -0,0 +1,47 @@ +--- +name: sopmanager +description: Manages team SOPs and ensures compliance with development standards +tools: Bash, Glob, Grep, LS, Read +model: opus +color: blue +--- + +# SOP Manager + +Manages team SOPs and ensures compliance with development standards. + +## Core Responsibilities + +- Monitor adherence to team standard operating procedures +- Review code changes for SOP compliance +- Identify deviations from established patterns +- Report on team conventions and best practices +- Ensure documentation standards are met +- Track technical debt and code quality metrics + +## Working Directory + +You operate from the project root directory. + +## Key Commands + +When using project tools, always use the project-local wrapper: +- Use `./python.exe -m theauditor.cli` or `aud` depending on environment + +## Communication Style + + +## SOP Focus Areas + + + +## Reporting Format + +When reviewing code, provide structured reports: + +## Important Notes + +- This agent has READ-ONLY access (no Write/Edit tools) +- Cannot modify code directly, only report findings +- Focuses on objective standards, not subjective preferences +- Works alongside other agents to maintain quality \ No newline at end of file diff --git a/package-template.json b/package-template.json new file mode 100644 index 0000000..d921563 --- /dev/null +++ b/package-template.json @@ -0,0 +1,15 @@ +{ + "name": "project-linters", + "version": "1.0.0", + "private": true, + "description": "JavaScript/TypeScript linting tools for TheAuditor", + "devDependencies": { + "eslint": "^9.34.0", + "prettier": "^3.6.2", + "typescript": "^5.9.2", + "@typescript-eslint/parser": "^8.41.0", + "@typescript-eslint/eslint-plugin": "^8.41.0", + "eslint-config-prettier": "^10.1.8", + "eslint-plugin-prettier": "^5.5.4" + } +} \ No newline at end of file diff --git a/package.json b/package.json new file mode 100644 index 0000000..e304a6f --- /dev/null +++ b/package.json @@ -0,0 +1,15 @@ +{ + "private": true, + "devDependencies": { + "eslint": "9.35.0", + "@typescript-eslint/parser": "8.42.0", + "@typescript-eslint/eslint-plugin": "8.42.0", + "typescript": "5.9.2", + "prettier": "3.6.2" + }, + "scripts": { + "lint": "eslint .", + "typecheck": "tsc --noEmit", + "format": "prettier -c ." 
+ } +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..42ed5ba --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,113 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "theauditor" +version = "1.0.1" +description = "Offline, air-gapped CLI for repo indexing, evidence checking, and task running" +readme = "README.md" +requires-python = ">=3.11" +license = {text = "AGPL-3.0"} +authors = [ + {name = "TheAuditor Team"} +] +dependencies = [ + "click==8.2.1", + "PyYAML==6.0.2", + "jsonschema==4.25.1", + "ijson==3.4.0", +] + +[project.optional-dependencies] +dev = [ + "pytest==8.4.2", + "ruff==0.12.12", + "black==25.1.0", +] +linters = [ + "ruff==0.12.12", + "mypy==1.17.1", + "black==25.1.0", + "bandit==1.8.6", + "pylint==3.3.8", +] +ml = [ + "scikit-learn==1.7.1", + "numpy==2.3.2", + "scipy==1.16.1", + "joblib==1.5.2", +] +ast = [ + "tree-sitter==0.25.1", + "tree-sitter-language-pack==0.9.0", + "sqlparse==0.5.3", + "dockerfile-parse==2.0.1", +] +all = [ + # Dev tools + "pytest==8.4.2", + # Linters + "ruff==0.12.12", + "mypy==1.17.1", + "black==25.1.0", + "bandit==1.8.6", + "pylint==3.3.8", + # ML features + "scikit-learn==1.7.1", + "numpy==2.3.2", + "scipy==1.16.1", + "joblib==1.5.2", + # AST parsing + "tree-sitter==0.25.1", + "tree-sitter-language-pack==0.9.0", + # SQL parsing + "sqlparse==0.5.3", + # Docker parsing + "dockerfile-parse==2.0.1", +] + +[project.scripts] +aud = "theauditor.cli:main" + +[tool.hatch.build.targets.wheel] +packages = ["theauditor"] + +[tool.ruff] +line-length = 100 +target-version = "py311" + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "N", # pep8-naming + "UP", # pyupgrade + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "SIM", # flake8-simplify +] +ignore = [ + "E501", # line too long - handled by black + "SIM105", # contextlib.suppress - can be less readable + "SIM117", # multiple with statements - can be less readable +] + +[tool.ruff.lint.isort] +known-first-party = ["theauditor"] + +[tool.black] +line-length = 100 +target-version = ["py311"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +pythonpath = ["."] +addopts = "-v" + +[tool.mypy] +python_version = "3.12" +strict = true +warn_unused_configs = true \ No newline at end of file diff --git a/theauditor/.gitattributes b/theauditor/.gitattributes new file mode 100644 index 0000000..dfe0770 --- /dev/null +++ b/theauditor/.gitattributes @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto diff --git a/theauditor/__init__.py b/theauditor/__init__.py new file mode 100644 index 0000000..c30ba90 --- /dev/null +++ b/theauditor/__init__.py @@ -0,0 +1,3 @@ +"""TheAuditor - Offline, air-gapped CLI for repo indexing and evidence checking.""" + +__version__ = "0.1.0" diff --git a/theauditor/agent_template_validator.py b/theauditor/agent_template_validator.py new file mode 100644 index 0000000..52e76c3 --- /dev/null +++ b/theauditor/agent_template_validator.py @@ -0,0 +1,347 @@ +"""Agent template validator - ensures templates comply with SOP permissions.""" + +import json +import re +from pathlib import Path +from typing import Dict, List, Any, Tuple, Optional +import yaml + + +class TemplateValidator: + """Validates agent templates for SOP compliance and structure.""" + + # Tools that allow code modification + WRITE_TOOLS = {"Write", "Edit", "MultiEdit", "NotebookEdit"} + + # Agents allowed to modify code + 
ALLOWED_EDITOR_AGENTS = {"coder", "documentation-manager", "implementation-specialist"} + + # Required frontmatter fields + REQUIRED_FIELDS = {"name", "description", "tools", "model"} + + def __init__(self, template_dir: str = None): + """Initialize validator with template directory.""" + if template_dir: + self.template_dir = Path(template_dir) + else: + # Default to agent_templates relative to module + self.template_dir = Path(__file__).parent.parent / "agent_templates" + + self.violations = [] + self.warnings = [] + + def _extract_frontmatter(self, content: str) -> Optional[Dict[str, Any]]: + """Extract YAML frontmatter from markdown file. + + Args: + content: File content + + Returns: + Parsed frontmatter dict or None if not found + """ + # Match frontmatter between --- markers + pattern = r'^---\s*\n(.*?)\n---\s*\n' + match = re.match(pattern, content, re.DOTALL) + + if not match: + return None + + try: + frontmatter_text = match.group(1) + return yaml.safe_load(frontmatter_text) + except yaml.YAMLError as e: + self.violations.append(f"Invalid YAML frontmatter: {e}") + return None + + def _parse_tools(self, tools_value: Any) -> List[str]: + """Parse tools from frontmatter value. + + Args: + tools_value: Tools field from frontmatter + + Returns: + List of tool names + """ + if isinstance(tools_value, str): + # Comma-separated string + return [t.strip() for t in tools_value.split(',')] + elif isinstance(tools_value, list): + return tools_value + else: + return [] + + def _check_sop_permissions( + self, + template_name: str, + frontmatter: Dict[str, Any] + ) -> List[str]: + """Check SOP permission rules. + + Args: + template_name: Name of template file + frontmatter: Parsed frontmatter + + Returns: + List of violations found + """ + violations = [] + + # Get name and description, ensuring they're strings + agent_name = frontmatter.get("name", "") + if not isinstance(agent_name, str): + agent_name = str(agent_name) if agent_name else "" + # Skip validation for templates with placeholders + if "{" in agent_name or "}" in agent_name: + # This is a template with placeholders, not a real agent + return [] + agent_name = agent_name.lower() + + description = frontmatter.get("description", "") + if not isinstance(description, str): + description = str(description) if description else "" + description = description.lower() + + tools = self._parse_tools(frontmatter.get("tools", "")) + + # Check if agent has write tools + has_write_tools = any(tool in self.WRITE_TOOLS for tool in tools) + + # Check compliance/legal agents first (they have stricter rules) + is_compliance_agent = ( + "compliance" in agent_name or + "compliance" in description or + "legal" in agent_name or + "legal" in description + ) + + if is_compliance_agent and has_write_tools: + violations.append( + f"Compliance/legal agent '{agent_name}' must not have write tools, " + f"found: {self.WRITE_TOOLS & set(tools)}" + ) + elif has_write_tools: + # For non-compliance agents, check if they're allowed to have write tools + is_allowed_editor = any( + allowed in agent_name + for allowed in self.ALLOWED_EDITOR_AGENTS + ) + + if not is_allowed_editor: + violations.append( + f"Agent '{agent_name}' has write tools ({self.WRITE_TOOLS & set(tools)}) " + f"but is not in allowed editor list: {self.ALLOWED_EDITOR_AGENTS}" + ) + + return violations + + def _check_internal_links( + self, + content: str, + template_path: Path + ) -> List[str]: + """Check internal repository links are valid. 
+ + Args: + content: Template content + template_path: Path to template file + + Returns: + List of broken links + """ + broken_links = [] + + # Find markdown links and references to repo paths + link_patterns = [ + r'\[.*?\]\((\/[^)]+)\)', # Markdown links with absolute paths + r'`(\/[^`]+)`', # Code blocks with paths + r'"(\/[^"]+)"', # Quoted paths + r"'(\/[^']+)'", # Single-quoted paths + ] + + for pattern in link_patterns: + for match in re.finditer(pattern, content): + path_str = match.group(1) + + # Skip URLs and anchors + if path_str.startswith('http') or path_str.startswith('#'): + continue + + # Check if path exists relative to repo root + repo_root = template_path.parent.parent + full_path = repo_root / path_str.lstrip('/') + + if not full_path.exists(): + broken_links.append(f"Broken internal link: {path_str}") + + return broken_links + + def validate_template(self, template_path: Path) -> Dict[str, Any]: + """Validate a single template file. + + Args: + template_path: Path to template markdown file + + Returns: + Validation result dict + """ + result = { + "path": str(template_path), + "valid": True, + "violations": [], + "warnings": [] + } + + try: + with open(template_path, 'r', encoding='utf-8') as f: + content = f.read() + except IOError as e: + result["valid"] = False + result["violations"].append(f"Cannot read file: {e}") + return result + + # Extract frontmatter + frontmatter = self._extract_frontmatter(content) + + if frontmatter is None: + result["valid"] = False + result["violations"].append("No valid frontmatter found") + return result + + # Check required fields + missing_fields = self.REQUIRED_FIELDS - set(frontmatter.keys()) + if missing_fields: + result["valid"] = False + result["violations"].append( + f"Missing required frontmatter fields: {missing_fields}" + ) + + # Check SOP permissions + sop_violations = self._check_sop_permissions( + template_path.name, + frontmatter + ) + if sop_violations: + result["valid"] = False + result["violations"].extend(sop_violations) + + # Check internal links + broken_links = self._check_internal_links(content, template_path) + if broken_links: + result["warnings"].extend(broken_links) + + # Check for tool typos/inconsistencies + tools = self._parse_tools(frontmatter.get("tools", "")) + known_tools = { + "Bash", "Glob", "Grep", "LS", "Read", "Edit", "Write", + "MultiEdit", "NotebookEdit", "WebFetch", "TodoWrite", + "WebSearch", "BashOutput", "KillBash", "Task", "ExitPlanMode" + } + + unknown_tools = set(tools) - known_tools + if unknown_tools: + result["warnings"].append( + f"Unknown tools found: {unknown_tools}" + ) + + return result + + def validate_all(self, source_dir: Optional[str] = None) -> Dict[str, Any]: + """Validate all templates in directory. 
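+ Only top-level *.md files are scanned (glob("*.md"), no recursion). Typical use,
+ with an illustrative path: TemplateValidator().validate_all("agent_templates") returns
+ a summary dict with "valid", "templates_checked", "total_violations",
+ "total_warnings" and the per-template results under "templates".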
+ + Args: + source_dir: Directory containing templates (default: self.template_dir) + + Returns: + Validation summary + """ + if source_dir: + template_dir = Path(source_dir) + else: + template_dir = self.template_dir + + if not template_dir.exists(): + return { + "valid": False, + "error": f"Template directory not found: {template_dir}", + "templates": [] + } + + results = [] + all_valid = True + total_violations = 0 + total_warnings = 0 + + # Find all .md files + for template_path in template_dir.glob("*.md"): + result = self.validate_template(template_path) + results.append(result) + + if not result["valid"]: + all_valid = False + + total_violations += len(result["violations"]) + total_warnings += len(result["warnings"]) + + return { + "valid": all_valid, + "templates_checked": len(results), + "total_violations": total_violations, + "total_warnings": total_warnings, + "templates": results + } + + def generate_report( + self, + validation_results: Dict[str, Any], + format: str = "json" + ) -> str: + """Generate validation report. + + Args: + validation_results: Results from validate_all() + format: Output format ('json' or 'text') + + Returns: + Formatted report string + """ + if format == "json": + return json.dumps(validation_results, indent=2, sort_keys=True) + + # Text format + lines = [] + lines.append("=== Agent Template Validation Report ===\n") + lines.append(f"Templates checked: {validation_results['templates_checked']}") + lines.append(f"Total violations: {validation_results['total_violations']}") + lines.append(f"Total warnings: {validation_results['total_warnings']}") + lines.append(f"Overall status: {'PASS' if validation_results['valid'] else 'FAIL'}\n") + + for template in validation_results.get("templates", []): + lines.append(f"\n{template['path']}:") + lines.append(f" Status: {'✓' if template['valid'] else '✗'}") + + if template["violations"]: + lines.append(" Violations:") + for v in template["violations"]: + lines.append(f" - {v}") + + if template["warnings"]: + lines.append(" Warnings:") + for w in template["warnings"]: + lines.append(f" - {w}") + + return "\n".join(lines) + + +# Module-level convenience function +def validate_templates(source_dir: str) -> Tuple[bool, Dict[str, Any]]: + """Validate all templates in directory. + + Args: + source_dir: Directory containing agent templates + + Returns: + Tuple of (all_valid, validation_results) + """ + validator = TemplateValidator() + results = validator.validate_all(source_dir) + return results["valid"], results \ No newline at end of file diff --git a/theauditor/ast_extractors/__init__.py b/theauditor/ast_extractors/__init__.py new file mode 100644 index 0000000..bd7529c --- /dev/null +++ b/theauditor/ast_extractors/__init__.py @@ -0,0 +1,348 @@ +"""AST Data Extraction Engine - Package Router. + +This module provides the main ASTExtractorMixin class that routes extraction +requests to the appropriate language-specific implementation. +""" + +import os +from typing import Any, List, Dict, Optional, TYPE_CHECKING +from dataclasses import dataclass +from pathlib import Path + +# Import all implementations +from . 
import python_impl, typescript_impl, treesitter_impl +from .base import detect_language + +# Import semantic parser if available +try: + from ..js_semantic_parser import get_semantic_ast_batch +except ImportError: + get_semantic_ast_batch = None + +if TYPE_CHECKING: + # For type checking only, avoid circular import + from ..ast_parser import ASTMatch +else: + # At runtime, ASTMatch will be available from the parent class + @dataclass + class ASTMatch: + """Represents an AST pattern match.""" + node_type: str + start_line: int + end_line: int + start_col: int + snippet: str + metadata: Dict[str, Any] = None + + +class ASTExtractorMixin: + """Mixin class providing data extraction capabilities for AST analysis. + + This class acts as a pure router, delegating all extraction logic to + language-specific implementation modules. + """ + + def extract_functions(self, tree: Any, language: str = None) -> List[Dict]: + """Extract function definitions from AST. + + Args: + tree: AST tree. + language: Programming language. + + Returns: + List of function info dictionaries. + """ + if not tree: + return [] + + # Route to appropriate implementation + if isinstance(tree, dict): + tree_type = tree.get("type") + language = tree.get("language", language) + + if tree_type == "python_ast": + return python_impl.extract_python_functions(tree, self) + elif tree_type == "semantic_ast": + return typescript_impl.extract_typescript_functions(tree, self) + elif tree_type == "tree_sitter" and self.has_tree_sitter: + return treesitter_impl.extract_treesitter_functions(tree, self, language) + + return [] + + def extract_classes(self, tree: Any, language: str = None) -> List[Dict]: + """Extract class definitions from AST.""" + if not tree: + return [] + + if isinstance(tree, dict): + tree_type = tree.get("type") + language = tree.get("language", language) + + if tree_type == "python_ast": + return python_impl.extract_python_classes(tree, self) + elif tree_type == "semantic_ast": + return typescript_impl.extract_typescript_classes(tree, self) + elif tree_type == "tree_sitter" and self.has_tree_sitter: + return treesitter_impl.extract_treesitter_classes(tree, self, language) + + return [] + + def extract_calls(self, tree: Any, language: str = None) -> List[Dict]: + """Extract function calls from AST.""" + if not tree: + return [] + + if isinstance(tree, dict): + tree_type = tree.get("type") + language = tree.get("language", language) + + if tree_type == "python_ast": + return python_impl.extract_python_calls(tree, self) + elif tree_type == "semantic_ast": + return typescript_impl.extract_typescript_calls(tree, self) + elif tree_type == "tree_sitter" and self.has_tree_sitter: + return treesitter_impl.extract_treesitter_calls(tree, self, language) + + return [] + + def extract_imports(self, tree: Any, language: str = None) -> List[Dict[str, Any]]: + """Extract import statements from AST.""" + if not tree: + return [] + + if isinstance(tree, dict): + tree_type = tree.get("type") + language = tree.get("language", language) + + if tree_type == "python_ast": + return python_impl.extract_python_imports(tree, self) + elif tree_type == "semantic_ast": + return typescript_impl.extract_typescript_imports(tree, self) + elif tree_type == "tree_sitter" and self.has_tree_sitter: + return treesitter_impl.extract_treesitter_imports(tree, self, language) + + return [] + + def extract_exports(self, tree: Any, language: str = None) -> List[Dict[str, Any]]: + """Extract export statements from AST.""" + if not tree: + return [] + + if 
isinstance(tree, dict): + tree_type = tree.get("type") + language = tree.get("language", language) + + if tree_type == "python_ast": + return python_impl.extract_python_exports(tree, self) + elif tree_type == "semantic_ast": + return typescript_impl.extract_typescript_exports(tree, self) + elif tree_type == "tree_sitter" and self.has_tree_sitter: + return treesitter_impl.extract_treesitter_exports(tree, self, language) + + return [] + + def extract_properties(self, tree: Any, language: str = None) -> List[Dict]: + """Extract property accesses from AST (e.g., req.body, req.query). + + This is critical for taint analysis to find JavaScript property access patterns. + """ + if not tree: + return [] + + if isinstance(tree, dict): + tree_type = tree.get("type") + language = tree.get("language", language) + + if tree_type == "python_ast": + return python_impl.extract_python_properties(tree, self) + elif tree_type == "semantic_ast": + return typescript_impl.extract_typescript_properties(tree, self) + elif tree_type == "tree_sitter" and self.has_tree_sitter: + return treesitter_impl.extract_treesitter_properties(tree, self, language) + + return [] + + def extract_assignments(self, tree: Any, language: str = None) -> List[Dict[str, Any]]: + """Extract variable assignments for data flow analysis.""" + if not tree: + return [] + + if isinstance(tree, dict): + tree_type = tree.get("type") + language = tree.get("language", language) + + if tree_type == "python_ast": + return python_impl.extract_python_assignments(tree, self) + elif tree_type == "semantic_ast": + # The semantic result is nested in tree["tree"] + return typescript_impl.extract_typescript_assignments(tree.get("tree", {}), self) + elif tree_type == "tree_sitter" and self.has_tree_sitter: + return treesitter_impl.extract_treesitter_assignments(tree, self, language) + + return [] + + def extract_function_calls_with_args(self, tree: Any, language: str = None) -> List[Dict[str, Any]]: + """Extract function calls with argument mapping for data flow analysis. + + This is a two-pass analysis: + 1. First pass: Find all function definitions and their parameters + 2. Second pass: Find all function calls and map arguments to parameters + """ + if not tree: + return [] + + # First pass: Get all function definitions with their parameters + function_params = self._extract_function_parameters(tree, language) + + # Second pass: Extract calls with argument mapping + calls_with_args = [] + + if isinstance(tree, dict): + tree_type = tree.get("type") + language = tree.get("language", language) + + if tree_type == "python_ast": + calls_with_args = python_impl.extract_python_calls_with_args(tree, function_params, self) + elif tree_type == "semantic_ast": + calls_with_args = typescript_impl.extract_typescript_calls_with_args(tree, function_params, self) + elif tree_type == "tree_sitter" and self.has_tree_sitter: + calls_with_args = treesitter_impl.extract_treesitter_calls_with_args( + tree, function_params, self, language + ) + + return calls_with_args + + def _extract_function_parameters(self, tree: Any, language: str = None) -> Dict[str, List[str]]: + """Extract function definitions and their parameter names. 
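+ Serves as the first pass of extract_function_calls_with_args: the returned mapping
+ is handed to the language implementations so call-site arguments can be paired with
+ parameter names.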
+ + Returns: + Dict mapping function_name -> list of parameter names + """ + if not tree: + return {} + + if isinstance(tree, dict): + tree_type = tree.get("type") + language = tree.get("language", language) + + if tree_type == "python_ast": + return python_impl.extract_python_function_params(tree, self) + elif tree_type == "semantic_ast": + return typescript_impl.extract_typescript_function_params(tree, self) + elif tree_type == "tree_sitter" and self.has_tree_sitter: + return treesitter_impl.extract_treesitter_function_params(tree, self, language) + + return {} + + def extract_returns(self, tree: Any, language: str = None) -> List[Dict[str, Any]]: + """Extract return statements for data flow analysis.""" + if not tree: + return [] + + if isinstance(tree, dict): + tree_type = tree.get("type") + language = tree.get("language", language) + + if tree_type == "python_ast": + return python_impl.extract_python_returns(tree, self) + elif tree_type == "semantic_ast": + return typescript_impl.extract_typescript_returns(tree, self) + elif tree_type == "tree_sitter" and self.has_tree_sitter: + return treesitter_impl.extract_treesitter_returns(tree, self, language) + + return [] + + def parse_files_batch(self, file_paths: List[Path], root_path: str = None) -> Dict[str, Any]: + """Parse multiple files into ASTs in batch for performance. + + This method dramatically improves performance for JavaScript/TypeScript projects + by processing multiple files in a single TypeScript compiler invocation. + + Args: + file_paths: List of paths to source files + root_path: Absolute path to project root (for sandbox resolution) + + Returns: + Dictionary mapping file paths to their AST trees + """ + results = {} + + # Separate files by language + js_ts_files = [] + python_files = [] + other_files = [] + + for file_path in file_paths: + language = self._detect_language(file_path) + if language in ["javascript", "typescript"]: + js_ts_files.append(file_path) + elif language == "python": + python_files.append(file_path) + else: + other_files.append(file_path) + + # Batch process JavaScript/TypeScript files if in a JS or polyglot project + project_type = self._detect_project_type() + if js_ts_files and project_type in ["javascript", "polyglot"] and get_semantic_ast_batch: + try: + # Convert paths to strings for the semantic parser with normalized separators + js_ts_paths = [str(f).replace("\\", "/") for f in js_ts_files] + + # Use batch processing for JS/TS files + batch_results = get_semantic_ast_batch(js_ts_paths, project_root=root_path) + + # Process batch results + for file_path in js_ts_files: + file_str = str(file_path).replace("\\", "/") # Normalize for matching + if file_str in batch_results: + semantic_result = batch_results[file_str] + if semantic_result.get("success"): + # Read file content for inclusion + try: + with open(file_path, "rb") as f: + content = f.read() + + results[str(file_path).replace("\\", "/")] = { + "type": "semantic_ast", + "tree": semantic_result, + "language": self._detect_language(file_path), + "content": content.decode("utf-8", errors="ignore"), + "has_types": semantic_result.get("hasTypes", False), + "diagnostics": semantic_result.get("diagnostics", []), + "symbols": semantic_result.get("symbols", []) + } + except Exception as e: + print(f"Warning: Failed to read {file_path}: {e}, falling back to individual parsing") + # CRITICAL FIX: Fall back to individual parsing on read failure + individual_result = self.parse_file(file_path, root_path=root_path) + 
results[str(file_path).replace("\\", "/")] = individual_result + else: + print(f"Warning: Semantic parser failed for {file_path}: {semantic_result.get('error')}, falling back to individual parsing") + # CRITICAL FIX: Fall back to individual parsing instead of None + individual_result = self.parse_file(file_path, root_path=root_path) + results[str(file_path).replace("\\", "/")] = individual_result + else: + # CRITICAL FIX: Fall back to individual parsing instead of None + print(f"Warning: No batch result for {file_path}, falling back to individual parsing") + individual_result = self.parse_file(file_path, root_path=root_path) + results[str(file_path).replace("\\", "/")] = individual_result + + except Exception as e: + print(f"Warning: Batch processing failed for JS/TS files: {e}") + # Fall back to individual processing + for file_path in js_ts_files: + results[str(file_path).replace("\\", "/")] = self.parse_file(file_path, root_path=root_path) + else: + # Process JS/TS files individually if not in JS project or batch failed + for file_path in js_ts_files: + results[str(file_path).replace("\\", "/")] = self.parse_file(file_path, root_path=root_path) + + # Process Python files individually (they're fast enough) + for file_path in python_files: + results[str(file_path).replace("\\", "/")] = self.parse_file(file_path, root_path=root_path) + + # Process other files individually + for file_path in other_files: + results[str(file_path).replace("\\", "/")] = self.parse_file(file_path, root_path=root_path) + + return results \ No newline at end of file diff --git a/theauditor/ast_extractors/base.py b/theauditor/ast_extractors/base.py new file mode 100644 index 0000000..2e0e08b --- /dev/null +++ b/theauditor/ast_extractors/base.py @@ -0,0 +1,173 @@ +"""Base utilities and shared helpers for AST extraction. + +This module contains utility functions shared across all language implementations. +""" + +import ast +import re +from typing import Any, List, Optional +from pathlib import Path + + +def get_node_name(node: Any) -> str: + """Get the name from an AST node, handling different node types. + + Works with Python's built-in AST nodes. + """ + if isinstance(node, ast.Name): + return node.id + elif isinstance(node, ast.Attribute): + return f"{get_node_name(node.value)}.{node.attr}" + elif isinstance(node, ast.Call): + return get_node_name(node.func) + elif isinstance(node, str): + return node + else: + return "unknown" + + +def extract_vars_from_expr(node: ast.AST) -> List[str]: + """Extract all variable names from a Python expression. + + Walks the AST to find all Name and Attribute nodes. + """ + vars_list = [] + for subnode in ast.walk(node): + if isinstance(subnode, ast.Name): + vars_list.append(subnode.id) + elif isinstance(subnode, ast.Attribute): + # For x.y.z, get the full chain + chain = [] + current = subnode + while isinstance(current, ast.Attribute): + chain.append(current.attr) + current = current.value + if isinstance(current, ast.Name): + chain.append(current.id) + vars_list.append(".".join(reversed(chain))) + return vars_list + + +def extract_vars_from_tree_sitter_expr(expr: str) -> List[str]: + """Extract variable names from a JavaScript/TypeScript expression string. + + Uses regex to find identifiers that aren't keywords. 
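+ For example, "req.body.name || defaultName" yields ["req", "body", "name", "defaultName"],
+ while reserved words such as "return" or "new" are dropped by the negative lookahead.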
+ """ + # Match identifiers that are not keywords + pattern = r'\b(?!(?:const|let|var|function|return|if|else|for|while|true|false|null|undefined|new|this)\b)[a-zA-Z_$][a-zA-Z0-9_$]*\b' + return re.findall(pattern, expr) + + +def find_containing_function_python(tree: ast.AST, line: int) -> Optional[str]: + """Find the function containing a given line in Python AST.""" + containing_func = None + + for node in ast.walk(tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + if hasattr(node, "lineno") and hasattr(node, "end_lineno"): + if node.lineno <= line <= (node.end_lineno or node.lineno): + # Check if this is more specific than current containing_func + if containing_func is None or node.lineno > containing_func[1]: + containing_func = (node.name, node.lineno) + + return containing_func[0] if containing_func else None + + +def find_containing_function_tree_sitter(node: Any, content: str, language: str) -> Optional[str]: + """Find the function containing a node in Tree-sitter AST. + + Walks up the tree to find parent function, handling all modern JS/TS patterns. + """ + # Walk up the tree to find parent function + current = node + while current and hasattr(current, 'parent') and current.parent: + current = current.parent + if language in ["javascript", "typescript"]: + # CRITICAL FIX: Handle ALL function patterns in modern JS/TS + function_types = [ + "function_declaration", # function foo() {} + "function_expression", # const foo = function() {} + "arrow_function", # const foo = () => {} + "method_definition", # class { foo() {} } + "generator_function", # function* foo() {} + "async_function", # async function foo() {} + ] + + if current.type in function_types: + # Special handling for arrow functions FIRST + # They need different logic than regular functions + if current.type == "arrow_function": + # Arrow functions don't have names directly, check parent + parent = current.parent if hasattr(current, 'parent') else None + if parent: + # Check if it's assigned to a variable: const foo = () => {} + if parent.type == "variable_declarator": + # Use field-based API to get the name + if hasattr(parent, 'child_by_field_name'): + name_node = parent.child_by_field_name('name') + if name_node and name_node.text: + return name_node.text.decode("utf-8", errors="ignore") + # Fallback to child iteration + for child in parent.children: + if child.type == "identifier" and child != current: + return child.text.decode("utf-8", errors="ignore") + # Check if it's a property: { foo: () => {} } + elif parent.type == "pair": + for child in parent.children: + if child.type in ["property_identifier", "identifier", "string"] and child != current: + text = child.text.decode("utf-8", errors="ignore") + # Remove quotes from string keys + return text.strip('"\'') + # CRITICAL FIX (Lead Auditor feedback): Don't return anything here! 
+ # Continue searching upward for containing named function + # This handles cases like: function outer() { arr.map(() => {}) } + # The arrow function should be tracked as within "outer", not "anonymous" + # Let the while loop continue to find outer function + continue # Skip the rest and continue searching upward + + # For non-arrow functions, try field-based API first + if hasattr(current, 'child_by_field_name'): + name_node = current.child_by_field_name('name') + if name_node and name_node.text: + return name_node.text.decode("utf-8", errors="ignore") + + # Fallback to child iteration for regular functions + for child in current.children: + if child.type in ["identifier", "property_identifier"]: + return child.text.decode("utf-8", errors="ignore") + + # If still no name found for this regular function, it's anonymous + return "anonymous" + + elif language == "python": + if current.type == "function_definition": + # Try field-based API first + if hasattr(current, 'child_by_field_name'): + name_node = current.child_by_field_name('name') + if name_node and name_node.text: + return name_node.text.decode("utf-8", errors="ignore") + # Fallback to child iteration + for child in current.children: + if child.type == "identifier": + return child.text.decode("utf-8", errors="ignore") + + # If no function found, return "global" instead of None for better tracking + return "global" + + +def detect_language(file_path: Path) -> str: + """Detect language from file extension. + + Returns empty string for unsupported languages. + """ + ext_map = { + ".py": "python", + ".js": "javascript", + ".jsx": "javascript", + ".ts": "typescript", + ".tsx": "typescript", + ".mjs": "javascript", + ".cjs": "javascript", + ".vue": "javascript", # Vue SFCs contain JavaScript/TypeScript + } + return ext_map.get(file_path.suffix.lower(), "") \ No newline at end of file diff --git a/theauditor/ast_extractors/python_impl.py b/theauditor/ast_extractors/python_impl.py new file mode 100644 index 0000000..1449950 --- /dev/null +++ b/theauditor/ast_extractors/python_impl.py @@ -0,0 +1,327 @@ +"""Python AST extraction implementations. + +This module contains all Python-specific extraction logic using the built-in ast module. +""" + +import ast +from typing import Any, List, Dict, Optional + +from .base import ( + get_node_name, + extract_vars_from_expr, + find_containing_function_python +) + + +def extract_python_functions(tree: Dict, parser_self) -> List[Dict]: + """Extract function definitions from Python AST. 
+ + Args: + tree: AST tree dictionary with 'tree' containing the actual AST + parser_self: Reference to the parser instance for accessing methods + + Returns: + List of function info dictionaries + """ + functions = [] + actual_tree = tree.get("tree") + + if not actual_tree: + return functions + + for node in ast.walk(actual_tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + functions.append({ + "name": node.name, + "line": node.lineno, + "async": isinstance(node, ast.AsyncFunctionDef), + "args": [arg.arg for arg in node.args.args], + }) + + return functions + + +def extract_python_classes(tree: Dict, parser_self) -> List[Dict]: + """Extract class definitions from Python AST.""" + classes = [] + actual_tree = tree.get("tree") + + if not actual_tree: + return classes + + for node in ast.walk(actual_tree): + if isinstance(node, ast.ClassDef): + classes.append({ + "name": node.name, + "line": node.lineno, + "column": node.col_offset, + "bases": [get_node_name(base) for base in node.bases], + }) + + return classes + + +def extract_python_calls(tree: Dict, parser_self) -> List[Dict]: + """Extract function calls from Python AST.""" + calls = [] + actual_tree = tree.get("tree") + + if not actual_tree: + return calls + + for node in ast.walk(actual_tree): + if isinstance(node, ast.Call): + func_name = get_node_name(node.func) + if func_name: + calls.append({ + "name": func_name, + "line": node.lineno, + "column": node.col_offset, + "args_count": len(node.args), + }) + + return calls + + +def extract_python_imports(tree: Dict, parser_self) -> List[Dict[str, Any]]: + """Extract import statements from Python AST.""" + imports = [] + actual_tree = tree.get("tree") + + if not actual_tree: + return imports + + for node in ast.walk(actual_tree): + if isinstance(node, ast.Import): + for alias in node.names: + imports.append({ + "source": "import", + "target": alias.name, + "type": "import", + "line": node.lineno, + "as": alias.asname, + "specifiers": [] + }) + elif isinstance(node, ast.ImportFrom): + module = node.module or "" + for alias in node.names: + imports.append({ + "source": "from", + "target": module, + "type": "from", + "line": node.lineno, + "imported": alias.name, + "as": alias.asname, + "specifiers": [alias.name] + }) + + return imports + + +def extract_python_exports(tree: Dict, parser_self) -> List[Dict[str, Any]]: + """Extract export statements from Python AST. + + In Python, all top-level functions, classes, and assignments are "exported". 
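+ "Top level" is approximated by checking col_offset == 0 on the node, so indented
+ (e.g. conditionally defined) names are not reported.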
+ """ + exports = [] + actual_tree = tree.get("tree") + + if not actual_tree: + return exports + + for node in ast.walk(actual_tree): + if isinstance(node, ast.FunctionDef) and node.col_offset == 0: + exports.append({ + "name": node.name, + "type": "function", + "line": node.lineno, + "default": False + }) + elif isinstance(node, ast.ClassDef) and node.col_offset == 0: + exports.append({ + "name": node.name, + "type": "class", + "line": node.lineno, + "default": False + }) + elif isinstance(node, ast.Assign) and node.col_offset == 0: + for target in node.targets: + if isinstance(target, ast.Name): + exports.append({ + "name": target.id, + "type": "variable", + "line": node.lineno, + "default": False + }) + + return exports + + +def extract_python_assignments(tree: Dict, parser_self) -> List[Dict[str, Any]]: + """Extract variable assignments from Python AST for data flow analysis.""" + import os + assignments = [] + actual_tree = tree.get("tree") + + if os.environ.get("THEAUDITOR_DEBUG"): + import sys + print(f"[AST_DEBUG] extract_python_assignments called", file=sys.stderr) + + if not actual_tree: + return assignments + + for node in ast.walk(actual_tree): + if isinstance(node, ast.Assign): + # Extract target variable(s) + for target in node.targets: + target_var = get_node_name(target) + source_expr = ast.unparse(node.value) if hasattr(ast, "unparse") else str(node.value) + + # Find containing function + in_function = find_containing_function_python(actual_tree, node.lineno) + + # CRITICAL FIX: Check if this is a class instantiation + # BeautifulSoup(html) is ast.Call with func.id = "BeautifulSoup" + is_instantiation = isinstance(node.value, ast.Call) + + assignments.append({ + "target_var": target_var, + "source_expr": source_expr, + "line": node.lineno, + "in_function": in_function or "global", + "source_vars": extract_vars_from_expr(node.value), + "is_instantiation": is_instantiation # Track for taint analysis + }) + + elif isinstance(node, ast.AnnAssign) and node.value: + # Handle annotated assignments (x: int = 5) + target_var = get_node_name(node.target) + source_expr = ast.unparse(node.value) if hasattr(ast, "unparse") else str(node.value) + + in_function = find_containing_function_python(actual_tree, node.lineno) + + assignments.append({ + "target_var": target_var, + "source_expr": source_expr, + "line": node.lineno, + "in_function": in_function or "global", + "source_vars": extract_vars_from_expr(node.value) + }) + + return assignments + + +def extract_python_function_params(tree: Dict, parser_self) -> Dict[str, List[str]]: + """Extract function definitions and their parameter names from Python AST.""" + func_params = {} + actual_tree = tree.get("tree") + + if not actual_tree: + return func_params + + for node in ast.walk(actual_tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + params = [arg.arg for arg in node.args.args] + func_params[node.name] = params + + return func_params + + +def extract_python_calls_with_args(tree: Dict, function_params: Dict[str, List[str]], parser_self) -> List[Dict[str, Any]]: + """Extract Python function calls with argument mapping.""" + calls = [] + actual_tree = tree.get("tree") + + if not actual_tree: + return calls + + # Find containing function for each call + function_ranges = {} + for node in ast.walk(actual_tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + if hasattr(node, "lineno") and hasattr(node, "end_lineno"): + function_ranges[node.name] = (node.lineno, node.end_lineno or node.lineno) + + for 
node in ast.walk(actual_tree): + if isinstance(node, ast.Call): + func_name = get_node_name(node.func) + + # Find caller function + caller_function = "global" + for fname, (start, end) in function_ranges.items(): + if start <= node.lineno <= end: + caller_function = fname + break + + # Get callee parameters + callee_params = function_params.get(func_name.split(".")[-1], []) + + # Map arguments to parameters + for i, arg in enumerate(node.args): + arg_expr = ast.unparse(arg) if hasattr(ast, "unparse") else str(arg) + param_name = callee_params[i] if i < len(callee_params) else f"arg{i}" + + calls.append({ + "line": node.lineno, + "caller_function": caller_function, + "callee_function": func_name, + "argument_index": i, + "argument_expr": arg_expr, + "param_name": param_name + }) + + return calls + + +def extract_python_returns(tree: Dict, parser_self) -> List[Dict[str, Any]]: + """Extract return statements from Python AST.""" + returns = [] + actual_tree = tree.get("tree") + + if not actual_tree: + return returns + + # First, map all functions + function_ranges = {} + for node in ast.walk(actual_tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + if hasattr(node, "lineno") and hasattr(node, "end_lineno"): + function_ranges[node.name] = (node.lineno, node.end_lineno or node.lineno) + + # Extract return statements + for node in ast.walk(actual_tree): + if isinstance(node, ast.Return): + # Find containing function + function_name = "global" + for fname, (start, end) in function_ranges.items(): + if start <= node.lineno <= end: + function_name = fname + break + + # Extract return expression + if node.value: + return_expr = ast.unparse(node.value) if hasattr(ast, "unparse") else str(node.value) + return_vars = extract_vars_from_expr(node.value) + else: + return_expr = "None" + return_vars = [] + + returns.append({ + "function_name": function_name, + "line": node.lineno, + "return_expr": return_expr, + "return_vars": return_vars + }) + + return returns + + +# Python doesn't have property accesses in the same way as JS +# This is a placeholder for consistency +def extract_python_properties(tree: Dict, parser_self) -> List[Dict]: + """Extract property accesses from Python AST. + + In Python, these would be attribute accesses. + Currently returns empty list for consistency. + """ + return [] \ No newline at end of file diff --git a/theauditor/ast_extractors/treesitter_impl.py b/theauditor/ast_extractors/treesitter_impl.py new file mode 100644 index 0000000..17ca6f2 --- /dev/null +++ b/theauditor/ast_extractors/treesitter_impl.py @@ -0,0 +1,711 @@ +"""Tree-sitter generic AST extraction implementations. + +This module contains Tree-sitter extraction logic that works across multiple languages. 
+""" + +from typing import Any, List, Dict, Optional + +from .base import ( + find_containing_function_tree_sitter, + extract_vars_from_tree_sitter_expr +) + + +def extract_treesitter_functions(tree: Dict, parser_self, language: str) -> List[Dict]: + """Extract function definitions from Tree-sitter AST.""" + actual_tree = tree.get("tree") + if not actual_tree: + return [] + + if not parser_self.has_tree_sitter: + return [] + + return _extract_tree_sitter_functions(actual_tree.root_node, language) + + +def _extract_tree_sitter_functions(node: Any, language: str) -> List[Dict]: + """Extract functions from Tree-sitter AST.""" + functions = [] + + if node is None: + return functions + + # Function node types per language + function_types = { + "python": ["function_definition"], + "javascript": ["function_declaration", "arrow_function", "function_expression", "method_definition"], + "typescript": ["function_declaration", "arrow_function", "function_expression", "method_definition"], + } + + node_types = function_types.get(language, []) + + if node.type in node_types: + # Extract function name + name = "anonymous" + for child in node.children: + if child.type in ["identifier", "property_identifier"]: + name = child.text.decode("utf-8", errors="ignore") + break + + functions.append({ + "name": name, + "line": node.start_point[0] + 1, + "type": node.type, + }) + + # Recursively search children + for child in node.children: + functions.extend(_extract_tree_sitter_functions(child, language)) + + return functions + + +def extract_treesitter_classes(tree: Dict, parser_self, language: str) -> List[Dict]: + """Extract class definitions from Tree-sitter AST.""" + actual_tree = tree.get("tree") + if not actual_tree: + return [] + + if not parser_self.has_tree_sitter: + return [] + + return _extract_tree_sitter_classes(actual_tree.root_node, language) + + +def _extract_tree_sitter_classes(node: Any, language: str) -> List[Dict]: + """Extract classes from Tree-sitter AST.""" + classes = [] + + if node is None: + return classes + + # Class node types per language + class_types = { + "python": ["class_definition"], + "javascript": ["class_declaration"], + "typescript": ["class_declaration", "interface_declaration"], + } + + node_types = class_types.get(language, []) + + if node.type in node_types: + # Extract class name + name = "anonymous" + for child in node.children: + if child.type in ["identifier", "type_identifier"]: + name = child.text.decode("utf-8", errors="ignore") + break + + classes.append({ + "name": name, + "line": node.start_point[0] + 1, + "column": node.start_point[1], + "type": node.type, + }) + + # Recursively search children + for child in node.children: + classes.extend(_extract_tree_sitter_classes(child, language)) + + return classes + + +def extract_treesitter_calls(tree: Dict, parser_self, language: str) -> List[Dict]: + """Extract function calls from Tree-sitter AST.""" + actual_tree = tree.get("tree") + if not actual_tree: + return [] + + if not parser_self.has_tree_sitter: + return [] + + return _extract_tree_sitter_calls(actual_tree.root_node, language) + + +def _extract_tree_sitter_calls(node: Any, language: str) -> List[Dict]: + """Extract function calls from Tree-sitter AST.""" + calls = [] + + if node is None: + return calls + + # Call node types per language + call_types = { + "python": ["call"], + "javascript": ["call_expression"], + "typescript": ["call_expression"], + } + + node_types = call_types.get(language, []) + + if node.type in node_types: + # Extract function name 
being called + name = "unknown" + for child in node.children: + if child.type in ["identifier", "member_expression", "attribute"]: + name = child.text.decode("utf-8", errors="ignore") + break + # Also handle property access patterns for methods like res.send() + elif child.type == "member_access_expression": + name = child.text.decode("utf-8", errors="ignore") + break + + calls.append({ + "name": name, + "line": node.start_point[0] + 1, + "column": node.start_point[1], + "type": "call", # Always use "call" type for database consistency + }) + + # Recursively search children + for child in node.children: + calls.extend(_extract_tree_sitter_calls(child, language)) + + return calls + + +def extract_treesitter_imports(tree: Dict, parser_self, language: str) -> List[Dict[str, Any]]: + """Extract import statements from Tree-sitter AST.""" + actual_tree = tree.get("tree") + if not actual_tree: + return [] + + if not parser_self.has_tree_sitter: + return [] + + return _extract_tree_sitter_imports(actual_tree.root_node, language) + + +def _extract_tree_sitter_imports(node: Any, language: str) -> List[Dict[str, Any]]: + """Extract imports from Tree-sitter AST with language-specific handling.""" + imports = [] + + if node is None: + return imports + + # Import node types per language + import_types = { + "javascript": ["import_statement", "import_clause", "require_call"], + "typescript": ["import_statement", "import_clause", "require_call", "import_type"], + "python": ["import_statement", "import_from_statement"], + } + + node_types = import_types.get(language, []) + + if node.type in node_types: + # Parse based on node type + if node.type == "import_statement": + # Handle: import foo from 'bar' + source_node = None + specifiers = [] + + for child in node.children: + if child.type == "string": + source_node = child.text.decode("utf-8", errors="ignore").strip("\"'") + elif child.type == "import_clause": + # Extract imported names + for spec_child in child.children: + if spec_child.type == "identifier": + specifiers.append(spec_child.text.decode("utf-8", errors="ignore")) + + if source_node: + imports.append({ + "source": "import", + "target": source_node, + "type": "import", + "line": node.start_point[0] + 1, + "specifiers": specifiers + }) + + elif node.type == "require_call": + # Handle: const foo = require('bar') + for child in node.children: + if child.type == "string": + target = child.text.decode("utf-8", errors="ignore").strip("\"'") + imports.append({ + "source": "require", + "target": target, + "type": "require", + "line": node.start_point[0] + 1, + "specifiers": [] + }) + + # Recursively search children + for child in node.children: + imports.extend(_extract_tree_sitter_imports(child, language)) + + return imports + + +def extract_treesitter_exports(tree: Dict, parser_self, language: str) -> List[Dict[str, Any]]: + """Extract export statements from Tree-sitter AST.""" + actual_tree = tree.get("tree") + if not actual_tree: + return [] + + if not parser_self.has_tree_sitter: + return [] + + return _extract_tree_sitter_exports(actual_tree.root_node, language) + + +def _extract_tree_sitter_exports(node: Any, language: str) -> List[Dict[str, Any]]: + """Extract exports from Tree-sitter AST.""" + exports = [] + + if node is None: + return exports + + # Export node types per language + export_types = { + "javascript": ["export_statement", "export_default_declaration"], + "typescript": ["export_statement", "export_default_declaration", "export_type"], + } + + node_types = export_types.get(language, 
[]) + + if node.type in node_types: + is_default = "default" in node.type + + # Extract exported name + name = "unknown" + export_type = "unknown" + + for child in node.children: + if child.type in ["identifier", "type_identifier"]: + name = child.text.decode("utf-8", errors="ignore") + elif child.type == "function_declaration": + export_type = "function" + for subchild in child.children: + if subchild.type == "identifier": + name = subchild.text.decode("utf-8", errors="ignore") + break + elif child.type == "class_declaration": + export_type = "class" + for subchild in child.children: + if subchild.type in ["identifier", "type_identifier"]: + name = subchild.text.decode("utf-8", errors="ignore") + break + + exports.append({ + "name": name, + "type": export_type, + "line": node.start_point[0] + 1, + "default": is_default + }) + + # Recursively search children + for child in node.children: + exports.extend(_extract_tree_sitter_exports(child, language)) + + return exports + + +def extract_treesitter_properties(tree: Dict, parser_self, language: str) -> List[Dict]: + """Extract property accesses from Tree-sitter AST.""" + actual_tree = tree.get("tree") + if not actual_tree: + return [] + + if not parser_self.has_tree_sitter: + return [] + + return _extract_tree_sitter_properties(actual_tree.root_node, language) + + +def _extract_tree_sitter_properties(node: Any, language: str) -> List[Dict]: + """Extract property accesses from Tree-sitter AST.""" + properties = [] + + if node is None: + return properties + + # Property access node types per language + property_types = { + "javascript": ["member_expression", "property_access_expression"], + "typescript": ["member_expression", "property_access_expression"], + "python": ["attribute"], + } + + node_types = property_types.get(language, []) + + if node.type in node_types: + # Extract the full property access chain + prop_text = node.text.decode("utf-8", errors="ignore") if node.text else "" + + # Filter for patterns that look like taint sources (req.*, request.*, ctx.*, etc.) 
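+ # Note: this is a plain substring test on the rendered expression text, so accesses
+ # such as "session.cookies" or "config.body" are kept as well; the taint analyzer
+ # downstream is expected to apply the precise source filtering.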
+ if any(pattern in prop_text for pattern in ["req.", "request.", "ctx.", "body", "query", "params", "headers", "cookies"]): + properties.append({ + "name": prop_text, + "line": node.start_point[0] + 1, + "column": node.start_point[1], + "type": "property" + }) + + # Recursively search children + for child in node.children: + properties.extend(_extract_tree_sitter_properties(child, language)) + + return properties + + +def extract_treesitter_assignments(tree: Dict, parser_self, language: str) -> List[Dict[str, Any]]: + """Extract variable assignments from Tree-sitter AST.""" + actual_tree = tree.get("tree") + content = tree.get("content", "") + + if not actual_tree: + return [] + + if not parser_self.has_tree_sitter: + return [] + + return _extract_tree_sitter_assignments(actual_tree.root_node, language, content) + + +def _extract_tree_sitter_assignments(node: Any, language: str, content: str) -> List[Dict[str, Any]]: + """Extract assignments from Tree-sitter AST.""" + import os + import sys + debug = os.environ.get("THEAUDITOR_DEBUG") + assignments = [] + + if node is None: + return assignments + + # Assignment node types per language + assignment_types = { + # Don't include variable_declarator - it's handled inside lexical_declaration/variable_declaration + "javascript": ["assignment_expression", "lexical_declaration", "variable_declaration"], + "typescript": ["assignment_expression", "lexical_declaration", "variable_declaration"], + "python": ["assignment"], + } + + node_types = assignment_types.get(language, []) + + if node.type in node_types: + target_var = None + source_expr = None + source_vars = [] + + if node.type in ["lexical_declaration", "variable_declaration"]: + # Handle lexical_declaration (const/let) and variable_declaration (var) + # Both contain variable_declarator children + # Process all variable_declarators within (const a = 1, b = 2) + for child in node.children: + if child.type == "variable_declarator": + name_node = child.child_by_field_name('name') + value_node = child.child_by_field_name('value') + + if name_node and value_node: + in_function = find_containing_function_tree_sitter(child, content, language) or "global" + if debug: + print(f"[DEBUG] Found assignment: {name_node.text.decode('utf-8')} = {value_node.text.decode('utf-8')[:50]}", file=sys.stderr) + assignments.append({ + "target_var": name_node.text.decode("utf-8", errors="ignore"), + "source_expr": value_node.text.decode("utf-8", errors="ignore"), + "line": child.start_point[0] + 1, + "in_function": in_function, + "source_vars": extract_vars_from_tree_sitter_expr( + value_node.text.decode("utf-8", errors="ignore") + ) + }) + + elif node.type == "assignment_expression": + # x = value (JavaScript/TypeScript) - Use field-based API + left_node = node.child_by_field_name('left') + right_node = node.child_by_field_name('right') + + if left_node: + target_var = left_node.text.decode("utf-8", errors="ignore") + if right_node: + source_expr = right_node.text.decode("utf-8", errors="ignore") + source_vars = extract_vars_from_tree_sitter_expr(source_expr) + + elif node.type == "assignment": + # x = value (Python) + # Python assignment has structure: [target, "=", value] + left_node = None + right_node = None + for child in node.children: + if child.type != "=" and left_node is None: + left_node = child + elif child.type != "=" and left_node is not None: + right_node = child + + if left_node: + target_var = left_node.text.decode("utf-8", errors="ignore") if left_node.text else "" + if right_node: + source_expr = 
right_node.text.decode("utf-8", errors="ignore") if right_node.text else "" + + # Only create assignment record if we have both target and source + # (Skip lexical_declaration/variable_declaration as they're handled above with their children) + if target_var and source_expr and node.type not in ["lexical_declaration", "variable_declaration"]: + # Find containing function + in_function = find_containing_function_tree_sitter(node, content, language) + + assignments.append({ + "target_var": target_var, + "source_expr": source_expr, + "line": node.start_point[0] + 1, + "in_function": in_function or "global", + "source_vars": source_vars if source_vars else extract_vars_from_tree_sitter_expr(source_expr) + }) + + # Recursively search children + for child in node.children: + assignments.extend(_extract_tree_sitter_assignments(child, language, content)) + + return assignments + + +def extract_treesitter_function_params(tree: Dict, parser_self, language: str) -> Dict[str, List[str]]: + """Extract function parameters from Tree-sitter AST.""" + actual_tree = tree.get("tree") + if not actual_tree: + return {} + + if not parser_self.has_tree_sitter: + return {} + + return _extract_tree_sitter_function_params(actual_tree.root_node, language) + + +def _extract_tree_sitter_function_params(node: Any, language: str) -> Dict[str, List[str]]: + """Extract function parameters from Tree-sitter AST.""" + func_params = {} + + if node is None: + return func_params + + # Function definition node types + if language in ["javascript", "typescript"]: + if node.type in ["function_declaration", "function_expression", "arrow_function", "method_definition"]: + func_name = "anonymous" + params = [] + + # Use field-based API for function nodes + name_node = node.child_by_field_name('name') + params_node = node.child_by_field_name('parameters') + + if name_node: + func_name = name_node.text.decode("utf-8", errors="ignore") + + # Fall back to child iteration if field access fails + if not params_node: + for child in node.children: + if child.type in ["formal_parameters", "parameters"]: + params_node = child + break + + if params_node: + # Extract parameter names + for param_child in params_node.children: + if param_child.type in ["identifier", "required_parameter", "optional_parameter"]: + if param_child.type == "identifier": + params.append(param_child.text.decode("utf-8", errors="ignore")) + else: + # For required/optional parameters, use field API + pattern_node = param_child.child_by_field_name('pattern') + if pattern_node and pattern_node.type == "identifier": + params.append(pattern_node.text.decode("utf-8", errors="ignore")) + + if func_name and params: + func_params[func_name] = params + + elif language == "python": + if node.type == "function_definition": + func_name = None + params = [] + + for child in node.children: + if child.type == "identifier": + func_name = child.text.decode("utf-8", errors="ignore") + elif child.type == "parameters": + for param_child in child.children: + if param_child.type == "identifier": + params.append(param_child.text.decode("utf-8", errors="ignore")) + + if func_name: + func_params[func_name] = params + + # Recursively search children + for child in node.children: + func_params.update(_extract_tree_sitter_function_params(child, language)) + + return func_params + + +def extract_treesitter_calls_with_args( + tree: Dict, function_params: Dict[str, List[str]], parser_self, language: str +) -> List[Dict[str, Any]]: + """Extract function calls with arguments from Tree-sitter AST.""" + 
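+ # function_params is the product of the first pass (extract_treesitter_function_params):
+ # a map of function name -> declared parameter names. The recursive walk below pairs each
+ # positional argument at a call site with the matching name, falling back to "argN".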
actual_tree = tree.get("tree") + content = tree.get("content", "") + + if not actual_tree: + return [] + + if not parser_self.has_tree_sitter: + return [] + + return _extract_tree_sitter_calls_with_args( + actual_tree.root_node, language, content, function_params + ) + + +def _extract_tree_sitter_calls_with_args( + node: Any, language: str, content: str, function_params: Dict[str, List[str]] +) -> List[Dict[str, Any]]: + """Extract function calls with arguments from Tree-sitter AST.""" + calls = [] + + if node is None: + return calls + + # Call expression node types + if language in ["javascript", "typescript"] and node.type == "call_expression": + # Extract function name using field-based API + func_node = node.child_by_field_name('function') + func_name = "unknown" + + if func_node: + func_name = func_node.text.decode("utf-8", errors="ignore") if func_node.text else "unknown" + else: + # Fallback to child iteration + for child in node.children: + if child.type in ["identifier", "member_expression"]: + func_name = child.text.decode("utf-8", errors="ignore") if child.text else "unknown" + break + + # Find caller function + caller_function = find_containing_function_tree_sitter(node, content, language) or "global" + + # Get callee parameters + callee_params = function_params.get(func_name.split(".")[-1], []) + + # Extract arguments using field-based API + args_node = node.child_by_field_name('arguments') + arg_index = 0 + + if args_node: + for arg_child in args_node.children: + if arg_child.type not in ["(", ")", ","]: + arg_expr = arg_child.text.decode("utf-8", errors="ignore") if arg_child.text else "" + param_name = callee_params[arg_index] if arg_index < len(callee_params) else f"arg{arg_index}" + + calls.append({ + "line": node.start_point[0] + 1, + "caller_function": caller_function, + "callee_function": func_name, + "argument_index": arg_index, + "argument_expr": arg_expr, + "param_name": param_name + }) + arg_index += 1 + + elif language == "python" and node.type == "call": + # Similar logic for Python + func_name = "unknown" + for child in node.children: + if child.type in ["identifier", "attribute"]: + func_name = child.text.decode("utf-8", errors="ignore") if child.text else "unknown" + break + + caller_function = find_containing_function_tree_sitter(node, content, language) or "global" + callee_params = function_params.get(func_name.split(".")[-1], []) + + arg_index = 0 + for child in node.children: + if child.type == "argument_list": + for arg_child in child.children: + if arg_child.type not in ["(", ")", ","]: + arg_expr = arg_child.text.decode("utf-8", errors="ignore") if arg_child.text else "" + param_name = callee_params[arg_index] if arg_index < len(callee_params) else f"arg{arg_index}" + + calls.append({ + "line": node.start_point[0] + 1, + "caller_function": caller_function, + "callee_function": func_name, + "argument_index": arg_index, + "argument_expr": arg_expr, + "param_name": param_name + }) + arg_index += 1 + + # Recursively search children + for child in node.children: + calls.extend(_extract_tree_sitter_calls_with_args(child, language, content, function_params)) + + return calls + + +def extract_treesitter_returns(tree: Dict, parser_self, language: str) -> List[Dict[str, Any]]: + """Extract return statements from Tree-sitter AST.""" + actual_tree = tree.get("tree") + content = tree.get("content", "") + + if not actual_tree: + return [] + + if not parser_self.has_tree_sitter: + return [] + + return _extract_tree_sitter_returns(actual_tree.root_node, language, 
content) + + +def _extract_tree_sitter_returns(node: Any, language: str, content: str) -> List[Dict[str, Any]]: + """Extract return statements from Tree-sitter AST.""" + returns = [] + + if node is None: + return returns + + # Return statement node types + if language in ["javascript", "typescript"] and node.type == "return_statement": + # Find containing function + function_name = find_containing_function_tree_sitter(node, content, language) or "global" + + # Extract return expression + return_expr = "" + for child in node.children: + if child.type != "return": + return_expr = child.text.decode("utf-8", errors="ignore") if child.text else "" + break + + if not return_expr: + return_expr = "undefined" + + returns.append({ + "function_name": function_name, + "line": node.start_point[0] + 1, + "return_expr": return_expr, + "return_vars": extract_vars_from_tree_sitter_expr(return_expr) + }) + + elif language == "python" and node.type == "return_statement": + # Find containing function + function_name = find_containing_function_tree_sitter(node, content, language) or "global" + + # Extract return expression + return_expr = "" + for child in node.children: + if child.type != "return": + return_expr = child.text.decode("utf-8", errors="ignore") if child.text else "" + break + + if not return_expr: + return_expr = "None" + + returns.append({ + "function_name": function_name, + "line": node.start_point[0] + 1, + "return_expr": return_expr, + "return_vars": extract_vars_from_tree_sitter_expr(return_expr) + }) + + # Recursively search children + for child in node.children: + returns.extend(_extract_tree_sitter_returns(child, language, content)) + + return returns \ No newline at end of file diff --git a/theauditor/ast_extractors/typescript_impl.py b/theauditor/ast_extractors/typescript_impl.py new file mode 100644 index 0000000..6fa4069 --- /dev/null +++ b/theauditor/ast_extractors/typescript_impl.py @@ -0,0 +1,674 @@ +"""TypeScript/JavaScript semantic AST extraction implementations. + +This module contains all TypeScript compiler API extraction logic for semantic analysis. +""" + +import os +from typing import Any, List, Dict, Optional + +from .base import extract_vars_from_tree_sitter_expr + + +def extract_semantic_ast_symbols(node, depth=0): + """Extract symbols from TypeScript semantic AST including property accesses. + + This is a helper used by multiple extraction functions. + """ + symbols = [] + if depth > 100 or not isinstance(node, dict): + return symbols + + kind = node.get("kind") + + # PropertyAccessExpression: req.body, req.params, res.send, etc. 
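+ # The TypeScript compiler already emits the full access chain in node["text"]
+ # (e.g. "req.body.email"); the manual chain reconstruction below is only a fallback
+ # for nodes whose "text" field is missing.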
+ if kind == "PropertyAccessExpression": + # Use the authoritative text from TypeScript compiler (now restored) + full_name = node.get("text", "").strip() + + # Only fall back to reconstruction if text is missing (shouldn't happen now) + if not full_name: + # Build the full property access chain + name_parts = [] + current = node + while current and isinstance(current, dict): + if current.get("name"): + if isinstance(current["name"], dict) and current["name"].get("name"): + name_parts.append(str(current["name"]["name"])) + elif isinstance(current["name"], str): + name_parts.append(current["name"]) + # Look for the expression part + if current.get("children"): + for child in current["children"]: + if isinstance(child, dict) and child.get("kind") == "Identifier": + if child.get("text"): + name_parts.append(child["text"]) + current = current.get("expression") + + if name_parts: + full_name = ".".join(reversed(name_parts)) + else: + full_name = None + + if full_name: + # CRITICAL FIX: Extract ALL property accesses for taint analysis + # The taint analyzer will filter for the specific sources it needs + # This ensures we capture req.body, req.query, request.params, etc. + + # Default all property accesses as "property" type + db_type = "property" + + # Override only for known sink patterns that should be "call" type + if any(sink in full_name for sink in ["res.send", "res.render", "res.json", "response.write", "innerHTML", "outerHTML", "exec", "eval", "system", "spawn"]): + db_type = "call" # Taint analyzer looks for sinks as calls + + symbols.append({ + "name": full_name, + "line": node.get("line", 0), + "column": node.get("column", 0), + "type": db_type + }) + + # CallExpression: function calls including method calls + elif kind == "CallExpression": + # Use text field first if available (now restored) + name = None + if node.get("text"): + # Extract function name from text + text = node["text"] + if "(" in text: + name = text.split("(")[0].strip() + elif node.get("name"): + name = node["name"] + + # Also check for method calls on children + if not name and node.get("children"): + for child in node["children"]: + if isinstance(child, dict): + if child.get("kind") == "PropertyAccessExpression": + name = child.get("text", "").split("(")[0].strip() + break + elif child.get("text") and "." in child.get("text", ""): + name = child["text"].split("(")[0].strip() + break + + if name: + symbols.append({ + "name": name, + "line": node.get("line", 0), + "column": node.get("column", 0), + "type": "call" + }) + + # Identifier nodes that might be property accesses or function references + elif kind == "Identifier": + text = node.get("text", "") + # Check if it looks like a property access pattern + if "." 
in text: + # Determine type based on pattern + db_type = "property" + # Check for sink patterns + if any(sink in text for sink in ["res.send", "res.render", "res.json", "response.write"]): + db_type = "call" + + symbols.append({ + "name": text, + "line": node.get("line", 0), + "column": node.get("column", 0), + "type": db_type + }) + + # Recurse through children + for child in node.get("children", []): + symbols.extend(extract_semantic_ast_symbols(child, depth + 1)) + + return symbols + + +def extract_typescript_functions(tree: Dict, parser_self) -> List[Dict]: + """Extract function definitions from TypeScript semantic AST.""" + functions = [] + + # Common parameter names that should NEVER be marked as functions + PARAMETER_NAMES = {"req", "res", "next", "err", "error", "ctx", "request", "response", "callback", "done", "cb"} + + # CRITICAL FIX: Symbols are at tree["symbols"], not tree["tree"]["symbols"] + for symbol in tree.get("symbols", []): + ts_kind = symbol.get("kind", 0) + symbol_name = symbol.get("name", "") + + if not symbol_name or symbol_name == "anonymous": + continue + + # CRITICAL FIX: Skip known parameter names that are incorrectly marked as functions + if symbol_name in PARAMETER_NAMES: + continue # These are parameters, not function definitions + + # Check if this is a function symbol + is_function = False + if isinstance(ts_kind, str): + if "Function" in ts_kind or "Method" in ts_kind: + is_function = True + elif isinstance(ts_kind, (int, float)): + # TypeScript SymbolFlags: Function = 16, Method = 8192, Constructor = 16384 + # Parameter = 8388608 (0x800000) - SKIP THIS + if ts_kind == 8388608: + continue # This is a parameter, not a function + elif ts_kind in [16, 8192, 16384]: + is_function = True + + if is_function and symbol_name not in PARAMETER_NAMES: + functions.append({ + "name": symbol_name, + "line": symbol.get("line", 0), + "type": "function", + "kind": ts_kind + }) + + return functions + + +def extract_typescript_classes(tree: Dict, parser_self) -> List[Dict]: + """Extract class definitions from TypeScript semantic AST.""" + classes = [] + + # CRITICAL FIX: Symbols are at tree["symbols"], not tree["tree"]["symbols"] + for symbol in tree.get("symbols", []): + ts_kind = symbol.get("kind", 0) + symbol_name = symbol.get("name", "") + + if not symbol_name or symbol_name == "anonymous": + continue + + # Check if this is a class symbol + is_class = False + if isinstance(ts_kind, str): + if "Class" in ts_kind or "Interface" in ts_kind: + is_class = True + elif isinstance(ts_kind, (int, float)): + # TypeScript SymbolFlags: Class = 32, Interface = 64 + if ts_kind in [32, 64]: + is_class = True + + if is_class: + classes.append({ + "name": symbol_name, + "line": symbol.get("line", 0), + "column": 0, + "type": "class", + "kind": ts_kind + }) + + return classes + + +def extract_typescript_calls(tree: Dict, parser_self) -> List[Dict]: + """Extract function calls from TypeScript semantic AST.""" + calls = [] + + # Common parameter names that should NEVER be marked as functions + PARAMETER_NAMES = {"req", "res", "next", "err", "error", "ctx", "request", "response", "callback", "done", "cb"} + + # Use the symbols already extracted by TypeScript compiler + # CRITICAL FIX: Symbols are at tree["symbols"], not tree["tree"]["symbols"] + for symbol in tree.get("symbols", []): + symbol_name = symbol.get("name", "") + ts_kind = symbol.get("kind", 0) + + # Skip empty/anonymous symbols + if not symbol_name or symbol_name == "anonymous": + continue + + # CRITICAL FIX: Skip known parameter 
names that are incorrectly marked as functions + # These are function parameters, not function definitions + if symbol_name in PARAMETER_NAMES: + # These should be marked as properties/variables for taint analysis + if symbol_name in ["req", "request", "ctx"]: + calls.append({ + "name": symbol_name, + "line": symbol.get("line", 0), + "column": 0, + "type": "property" # Mark as property for taint source detection + }) + continue # Skip further processing for parameters + + # CRITICAL FIX: Properly categorize based on TypeScript SymbolFlags + # The 'kind' field from TypeScript can be: + # - A string like "Function", "Method", "Property" (when ts.SymbolFlags mapping works) + # - A number representing the flag value (when mapping fails) + # TypeScript SymbolFlags values: + # Function = 16, Method = 8192, Property = 98304, Variable = 3, etc. + + db_type = "call" # Default for unknown types + + # Check if kind is a string (successful mapping in helper script) + if isinstance(ts_kind, str): + # Only mark as function if it's REALLY a function and not a parameter + if ("Function" in ts_kind or "Method" in ts_kind) and symbol_name not in PARAMETER_NAMES: + db_type = "function" + elif "Property" in ts_kind: + db_type = "property" + elif "Variable" in ts_kind or "Let" in ts_kind or "Const" in ts_kind: + # Variables could be sources if they match patterns + if any(pattern in symbol_name for pattern in ["req", "request", "ctx", "body", "params", "query", "headers"]): + db_type = "property" + else: + db_type = "call" + # Check numeric flags (when string mapping failed) + elif isinstance(ts_kind, (int, float)): + # TypeScript SymbolFlags from typescript.d.ts: + # Function = 16, Method = 8192, Constructor = 16384 + # Property = 98304, Variable = 3, Let = 1, Const = 2 + # Parameter = 8388608 (0x800000) + + # CRITICAL: Skip parameter flag (8388608) + if ts_kind == 8388608: + # This is a parameter, not a function + if symbol_name in ["req", "request", "ctx"]: + db_type = "property" # Mark as property for taint analysis + else: + continue # Skip other parameters + elif ts_kind in [16, 8192, 16384] and symbol_name not in PARAMETER_NAMES: # Function, Method, Constructor + db_type = "function" + elif ts_kind in [98304, 4, 1048576]: # Property, EnumMember, Accessor + db_type = "property" + elif ts_kind in [3, 1, 2]: # Variable, Let, Const + # Check if it looks like a source + if any(pattern in symbol_name for pattern in ["req", "request", "ctx", "body", "params", "query", "headers"]): + db_type = "property" + + # Override based on name patterns (for calls and property accesses) + if "." 
in symbol_name: + # Source patterns (user input) + if any(pattern in symbol_name for pattern in ["req.", "request.", "ctx.", "event.", "body", "params", "query", "headers", "cookies"]): + db_type = "property" + # Sink patterns (dangerous functions) + elif any(pattern in symbol_name for pattern in ["res.send", "res.render", "res.json", "response.write", "exec", "eval"]): + db_type = "call" + + calls.append({ + "name": symbol_name, + "line": symbol.get("line", 0), + "column": 0, + "type": db_type + }) + + # Also traverse AST for specific patterns + actual_tree = tree.get("tree") if isinstance(tree.get("tree"), dict) else tree + if actual_tree and actual_tree.get("success"): + ast_root = actual_tree.get("ast") + if ast_root: + calls.extend(extract_semantic_ast_symbols(ast_root)) + + return calls + + +def extract_typescript_imports(tree: Dict, parser_self) -> List[Dict[str, Any]]: + """Extract import statements from TypeScript semantic AST.""" + imports = [] + + # Use TypeScript compiler API data + for imp in tree.get("imports", []): + imports.append({ + "source": imp.get("kind", "import"), + "target": imp.get("module"), + "type": imp.get("kind", "import"), + "line": imp.get("line", 0), + "specifiers": imp.get("specifiers", []) + }) + + return imports + + +def extract_typescript_exports(tree: Dict, parser_self) -> List[Dict[str, Any]]: + """Extract export statements from TypeScript semantic AST. + + Currently returns empty list - exports aren't extracted by semantic parser yet. + """ + return [] + + +def extract_typescript_properties(tree: Dict, parser_self) -> List[Dict]: + """Extract property accesses from TypeScript semantic AST.""" + properties = [] + + # Already handled in extract_calls via extract_semantic_ast_symbols + # But we can also extract them specifically here + actual_tree = tree.get("tree") if isinstance(tree.get("tree"), dict) else tree + if actual_tree and actual_tree.get("success"): + ast_root = actual_tree.get("ast") + if ast_root: + symbols = extract_semantic_ast_symbols(ast_root) + # Filter for property accesses only + properties = [s for s in symbols if s.get("type") == "property"] + + return properties + + +def extract_typescript_assignments(tree: Dict, parser_self) -> List[Dict[str, Any]]: + """Extract ALL assignment patterns from TypeScript semantic AST, including destructuring.""" + assignments = [] + + if not tree or not tree.get("success"): + if os.environ.get("THEAUDITOR_DEBUG"): + import sys + print(f"[AST_DEBUG] extract_typescript_assignments: No success in tree", file=sys.stderr) + return assignments + + if os.environ.get("THEAUDITOR_DEBUG"): + import sys + print(f"[AST_DEBUG] extract_typescript_assignments: Starting extraction", file=sys.stderr) + + def traverse(node, current_function="global", depth=0): + if depth > 100 or not isinstance(node, dict): + return + + try: + kind = node.get("kind", "") + + # DEBUG: Log ALL node kinds we see to understand structure + if os.environ.get("THEAUDITOR_DEBUG"): + import sys + if depth < 5: # Log more depth + print(f"[AST_DEBUG] Depth {depth}: kind='{kind}'", file=sys.stderr) + if "Variable" in kind or "Assignment" in kind or "Binary" in kind or "=" in str(node.get("text", "")): + print(f"[AST_DEBUG] *** POTENTIAL ASSIGNMENT at depth {depth}: {kind}, text={str(node.get('text', ''))[:50]} ***", file=sys.stderr) + + # --- Function Context Tracking --- + new_function = current_function + if kind in ["FunctionDeclaration", "MethodDeclaration", "ArrowFunction", "FunctionExpression"]: + name_node = node.get("name") + if 
name_node and isinstance(name_node, dict): + new_function = name_node.get("text", "anonymous") + else: + new_function = "anonymous" + + # --- Assignment Extraction --- + # 1. Standard Assignments: const x = y; or x = y; + # NOTE: TypeScript AST has VariableDeclaration nested under FirstStatement->VariableDeclarationList + if kind in ["VariableDeclaration", "BinaryExpression"]: + # For BinaryExpression, check if it's an assignment (=) operator + is_assignment = True + if kind == "BinaryExpression": + op_token = node.get("operatorToken", {}) + if not (isinstance(op_token, dict) and op_token.get("kind") == "EqualsToken"): + # Not an assignment, just a comparison or arithmetic expression + is_assignment = False + + if is_assignment: + # TypeScript AST structure is different - use children and text + if kind == "VariableDeclaration": + # For TypeScript VariableDeclaration, extract from text or children + full_text = node.get("text", "") + if "=" in full_text: + parts = full_text.split("=", 1) + target_var = parts[0].strip() + source_expr = parts[1].strip() + if target_var and source_expr: + if os.environ.get("THEAUDITOR_DEBUG"): + import sys + print(f"[AST_DEBUG] Found TS assignment: {target_var} = {source_expr[:30]}... at line {node.get('line', 0)}", file=sys.stderr) + assignments.append({ + "target_var": target_var, + "source_expr": source_expr, + "line": node.get("line", 0), + "in_function": current_function, + "source_vars": extract_vars_from_tree_sitter_expr(source_expr) + }) + else: + # BinaryExpression - use the original logic + target_node = node.get("left") + source_node = node.get("right") + + if isinstance(target_node, dict) and isinstance(source_node, dict): + # --- ENHANCEMENT: Handle Destructuring --- + if target_node.get("kind") in ["ObjectBindingPattern", "ArrayBindingPattern"]: + source_expr = source_node.get("text", "unknown_source") + # For each element in the destructuring, create a separate assignment + for element in target_node.get("elements", []): + if isinstance(element, dict) and element.get("name"): + target_var = element.get("name", {}).get("text") + if target_var: + assignments.append({ + "target_var": target_var, + "source_expr": source_expr, # CRITICAL: Source is the original object/array + "line": element.get("line", node.get("line", 0)), + "in_function": current_function, + "source_vars": extract_vars_from_tree_sitter_expr(source_expr) + }) + else: + # --- Standard, non-destructured assignment --- + target_var = target_node.get("text", "") + source_expr = source_node.get("text", "") + if target_var and source_expr: + if os.environ.get("THEAUDITOR_DEBUG"): + import sys + print(f"[AST_DEBUG] Found assignment: {target_var} = {source_expr[:50]}... 
at line {node.get('line', 0)}", file=sys.stderr) + assignments.append({ + "target_var": target_var, + "source_expr": source_expr, + "line": node.get("line", 0), + "in_function": current_function, + "source_vars": extract_vars_from_tree_sitter_expr(source_expr) + }) + + # Recurse with updated function context + for child in node.get("children", []): + traverse(child, new_function, depth + 1) + + except Exception: + # This safety net catches any unexpected AST structures + pass + + ast_root = tree.get("ast", {}) + traverse(ast_root) + + if os.environ.get("THEAUDITOR_DEBUG"): + import sys + print(f"[AST_DEBUG] extract_typescript_assignments: Found {len(assignments)} assignments", file=sys.stderr) + if assignments and len(assignments) < 5: + for a in assignments[:3]: + print(f"[AST_DEBUG] Example: {a['target_var']} = {a['source_expr'][:30]}...", file=sys.stderr) + + return assignments + + +def extract_typescript_function_params(tree: Dict, parser_self) -> Dict[str, List[str]]: + """Extract function parameters from TypeScript semantic AST.""" + func_params = {} + + if not tree or not tree.get("success"): + return func_params + + def traverse(node, depth=0): + if depth > 100 or not isinstance(node, dict): + return + + kind = node.get("kind") + + if kind in ["FunctionDeclaration", "MethodDeclaration", "ArrowFunction", "FunctionExpression"]: + # Get function name + name_node = node.get("name") + func_name = "anonymous" + if isinstance(name_node, dict): + func_name = name_node.get("text", "anonymous") + elif isinstance(name_node, str): + func_name = name_node + elif not name_node: + # Look for Identifier child (TypeScript AST structure) + for child in node.get("children", []): + if isinstance(child, dict) and child.get("kind") == "Identifier": + func_name = child.get("text", "anonymous") + break + + # Extract parameter names + # FIX: In TypeScript AST, parameters are direct children with kind="Parameter" + params = [] + + # Look in children for Parameter nodes + for child in node.get("children", []): + if isinstance(child, dict) and child.get("kind") == "Parameter": + # Found a parameter - get its text directly + param_text = child.get("text", "") + if param_text: + params.append(param_text) + + # Fallback to old structure if no parameters found + if not params: + param_nodes = node.get("parameters", []) + for param in param_nodes: + if isinstance(param, dict) and param.get("name"): + param_name_node = param.get("name") + if isinstance(param_name_node, dict): + params.append(param_name_node.get("text", "")) + elif isinstance(param_name_node, str): + params.append(param_name_node) + + if func_name != "anonymous" and params: + func_params[func_name] = params + + # Recurse through children + for child in node.get("children", []): + traverse(child, depth + 1) + + ast_root = tree.get("ast", {}) + traverse(ast_root) + + return func_params + + +def extract_typescript_calls_with_args(tree: Dict, function_params: Dict[str, List[str]], parser_self) -> List[Dict[str, Any]]: + """Extract function calls with arguments from TypeScript semantic AST.""" + calls = [] + + if os.environ.get("THEAUDITOR_DEBUG"): + print(f"[DEBUG] extract_typescript_calls_with_args: tree type={type(tree)}, success={tree.get('success') if tree else 'N/A'}") + + if not tree or not tree.get("success"): + if os.environ.get("THEAUDITOR_DEBUG"): + print(f"[DEBUG] extract_typescript_calls_with_args: Returning early - no tree or no success") + return calls + + def traverse(node, current_function="global", depth=0): + if depth > 100 or not 
isinstance(node, dict): + return + + try: + kind = node.get("kind", "") + + # Track function context + new_function = current_function + if kind in ["FunctionDeclaration", "MethodDeclaration", "ArrowFunction", "FunctionExpression"]: + name_node = node.get("name") + if name_node and isinstance(name_node, dict): + new_function = name_node.get("text", "anonymous") + else: + new_function = "anonymous" + + # CallExpression: function calls + if kind == "CallExpression": + if os.environ.get("THEAUDITOR_DEBUG"): + print(f"[DEBUG] Found CallExpression at line {node.get('line', 0)}") + + # FIX: In TypeScript AST, the function and arguments are in children array + children = node.get("children", []) + if not children: + # Fallback to old structure + expression = node.get("expression", {}) + arguments = node.get("arguments", []) + else: + # New structure: first child is function, rest are arguments + expression = children[0] if len(children) > 0 else {} + arguments = children[1:] if len(children) > 1 else [] + + # Get function name from expression + callee_name = "unknown" + if isinstance(expression, dict): + callee_name = expression.get("text", "unknown") + + if os.environ.get("THEAUDITOR_DEBUG"): + print(f"[DEBUG] CallExpression: callee={callee_name}, args={len(arguments)}") + if arguments: + print(f"[DEBUG] First arg: {arguments[0].get('text', 'N/A') if isinstance(arguments[0], dict) else arguments[0]}") + + # Get parameters for this function if we know them + callee_params = function_params.get(callee_name.split(".")[-1], []) + + # Process arguments + for i, arg in enumerate(arguments): + if isinstance(arg, dict): + arg_text = arg.get("text", "") + param_name = callee_params[i] if i < len(callee_params) else f"arg{i}" + + calls.append({ + "line": node.get("line", 0), + "caller_function": current_function, + "callee_function": callee_name, + "argument_index": i, + "argument_expr": arg_text, + "param_name": param_name + }) + + # Recurse with updated function context + for child in node.get("children", []): + traverse(child, new_function, depth + 1) + + except Exception as e: + if os.environ.get("THEAUDITOR_DEBUG"): + print(f"[DEBUG] Error in extract_typescript_calls_with_args: {e}") + + ast_root = tree.get("ast", {}) + traverse(ast_root) + + # Debug output + if os.environ.get("THEAUDITOR_DEBUG"): + print(f"[DEBUG] Extracted {len(calls)} function calls with args from semantic AST") + + return calls + + +def extract_typescript_returns(tree: Dict, parser_self) -> List[Dict[str, Any]]: + """Extract return statements from TypeScript semantic AST.""" + returns = [] + + if not tree or not tree.get("success"): + return returns + + # Traverse AST looking for return statements + def traverse(node, current_function="global", depth=0): + if depth > 100 or not isinstance(node, dict): + return + + kind = node.get("kind") + + # Track current function context + if kind in ["FunctionDeclaration", "FunctionExpression", "ArrowFunction", "MethodDeclaration"]: + # Extract function name if available + name_node = node.get("name") + if name_node and isinstance(name_node, dict): + current_function = name_node.get("text", "anonymous") + else: + current_function = "anonymous" + + # ReturnStatement + elif kind == "ReturnStatement": + expr_node = node.get("expression", {}) + if isinstance(expr_node, dict): + return_expr = expr_node.get("text", "") + else: + return_expr = str(expr_node) if expr_node else "undefined" + + returns.append({ + "function_name": current_function, + "line": node.get("line", 0), + "return_expr": 
return_expr, + "return_vars": extract_vars_from_tree_sitter_expr(return_expr) + }) + + # Recurse through children + for child in node.get("children", []): + traverse(child, current_function, depth + 1) + + ast_root = tree.get("ast", {}) + traverse(ast_root) + + return returns \ No newline at end of file diff --git a/theauditor/ast_parser.py b/theauditor/ast_parser.py new file mode 100644 index 0000000..bb5e5de --- /dev/null +++ b/theauditor/ast_parser.py @@ -0,0 +1,323 @@ +"""AST parser using Tree-sitter for multi-language support. + +This module provides true structural code analysis using Tree-sitter, +enabling high-fidelity pattern detection that understands code semantics +rather than just text matching. +""" + +import ast +import hashlib +import json +import os +import re +from dataclasses import dataclass +from functools import lru_cache +from pathlib import Path +from typing import Any, Optional, List, Dict, Union + +from theauditor.js_semantic_parser import get_semantic_ast, get_semantic_ast_batch +from theauditor.ast_patterns import ASTPatternMixin +from theauditor.ast_extractors import ASTExtractorMixin + + +@dataclass +class ASTMatch: + """Represents an AST pattern match.""" + + node_type: str + start_line: int + end_line: int + start_col: int + snippet: str + metadata: Dict[str, Any] = None + + +class ASTParser(ASTPatternMixin, ASTExtractorMixin): + """Multi-language AST parser using Tree-sitter for structural analysis.""" + + def __init__(self): + """Initialize parser with Tree-sitter language support.""" + self.has_tree_sitter = False + self.parsers = {} + self.languages = {} + self.project_type = None # Cache project type detection + + # Try to import tree-sitter and language bindings + try: + import tree_sitter + self.tree_sitter = tree_sitter + self.has_tree_sitter = True + self._init_tree_sitter_parsers() + except ImportError: + print("Warning: Tree-sitter not available. 
Install with: pip install tree-sitter tree-sitter-python tree-sitter-javascript tree-sitter-typescript") + + def _init_tree_sitter_parsers(self): + """Initialize Tree-sitter language parsers with proper bindings.""" + if not self.has_tree_sitter: + return + + # Use tree-sitter-language-pack for all languages + try: + from tree_sitter_language_pack import get_language, get_parser + + # Python parser + try: + python_lang = get_language("python") + python_parser = get_parser("python") + self.parsers["python"] = python_parser + self.languages["python"] = python_lang + except Exception as e: + # Python has built-in fallback, so we can continue with a warning + print(f"Warning: Failed to initialize Python parser: {e}") + print(" AST analysis for Python will use built-in parser as fallback.") + + # JavaScript parser (CRITICAL - must fail fast) + try: + js_lang = get_language("javascript") + js_parser = get_parser("javascript") + self.parsers["javascript"] = js_parser + self.languages["javascript"] = js_lang + except Exception as e: + raise RuntimeError( + f"Failed to load tree-sitter grammar for JavaScript: {e}\n" + "This is often due to missing build tools or corrupted installation.\n" + "Please try: pip install --force-reinstall tree-sitter-language-pack\n" + "Or install with AST support: pip install -e '.[ast]'" + ) + + # TypeScript parser (CRITICAL - must fail fast) + try: + ts_lang = get_language("typescript") + ts_parser = get_parser("typescript") + self.parsers["typescript"] = ts_parser + self.languages["typescript"] = ts_lang + except Exception as e: + raise RuntimeError( + f"Failed to load tree-sitter grammar for TypeScript: {e}\n" + "This is often due to missing build tools or corrupted installation.\n" + "Please try: pip install --force-reinstall tree-sitter-language-pack\n" + "Or install with AST support: pip install -e '.[ast]'" + ) + + except ImportError as e: + # If tree-sitter is installed but language pack is not, this is a critical error + # The user clearly intends to use tree-sitter, so we should fail loudly + print(f"ERROR: tree-sitter is installed but tree-sitter-language-pack is not: {e}") + print("This means tree-sitter AST analysis cannot work properly.") + print("Please install with: pip install tree-sitter-language-pack") + print("Or install TheAuditor with full AST support: pip install -e '.[ast]'") + # Set flags to indicate no language support + self.has_tree_sitter = False + # Don't raise - allow fallback to regex-based parsing + + def _detect_project_type(self) -> str: + """Detect the primary project type based on manifest files. 
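A standalone sketch of the same initialization path, assuming `tree-sitter-language-pack` is installed (`pip install tree-sitter-language-pack`):

```python
from tree_sitter_language_pack import get_language, get_parser

js_language = get_language("javascript")
js_parser = get_parser("javascript")

# Parse a small snippet; the root node of a JavaScript source file is "program".
tree = js_parser.parse(b"const x = eval(userInput);")
print(tree.root_node.type)
```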
+ + Returns: + 'polyglot' if multiple language manifest files exist + 'javascript' if only package.json exists + 'python' if only Python manifest files exist + 'go' if only go.mod exists + 'unknown' otherwise + """ + if self.project_type is not None: + return self.project_type + + # Check all manifest files first + has_js = Path("package.json").exists() + has_python = (Path("requirements.txt").exists() or + Path("pyproject.toml").exists() or + Path("setup.py").exists()) + has_go = Path("go.mod").exists() + + # Determine project type based on combinations + if has_js and has_python: + self.project_type = "polyglot" # NEW: Properly handle mixed projects + elif has_js and has_go: + self.project_type = "polyglot" + elif has_python and has_go: + self.project_type = "polyglot" + elif has_js: + self.project_type = "javascript" + elif has_python: + self.project_type = "python" + elif has_go: + self.project_type = "go" + else: + self.project_type = "unknown" + + return self.project_type + + def parse_file(self, file_path: Path, language: str = None, root_path: str = None) -> Any: + """Parse a file into an AST. + + Args: + file_path: Path to the source file. + language: Programming language (auto-detected if None). + root_path: Absolute path to project root (for sandbox resolution). + + Returns: + AST tree object or None if parsing fails. + """ + if language is None: + language = self._detect_language(file_path) + + try: + with open(file_path, "rb") as f: + content = f.read() + + # Compute content hash for caching + content_hash = hashlib.md5(content).hexdigest() + + # For JavaScript/TypeScript, try semantic parser first + # CRITICAL FIX: Include None and polyglot project types + # When project_type is None (not detected yet) or polyglot, still try semantic parsing + project_type = self._detect_project_type() + if language in ["javascript", "typescript"] and project_type in ["javascript", "polyglot", None, "unknown"]: + try: + # Attempt to use the TypeScript Compiler API for semantic analysis + # Normalize path for cross-platform compatibility + normalized_path = str(file_path).replace("\\", "/") + semantic_result = get_semantic_ast(normalized_path, project_root=root_path) + + if semantic_result.get("success"): + # Return the semantic AST with full type information + return { + "type": "semantic_ast", + "tree": semantic_result, + "language": language, + "content": content.decode("utf-8", errors="ignore"), + "has_types": semantic_result.get("hasTypes", False), + "diagnostics": semantic_result.get("diagnostics", []), + "symbols": semantic_result.get("symbols", []) + } + else: + # Log but continue to Tree-sitter/regex fallback + error_msg = semantic_result.get('error', 'Unknown error') + print(f"Warning: Semantic parser failed for {file_path}: {error_msg}") + print(f" Falling back to Tree-sitter/regex parser.") + # Continue to fallback options below + + except Exception as e: + # Log but continue to Tree-sitter/regex fallback + print(f"Warning: Exception in semantic parser for {file_path}: {e}") + print(f" Falling back to Tree-sitter/regex parser.") + # Continue to fallback options below + + # Use Tree-sitter if available + if self.has_tree_sitter and language in self.parsers: + try: + # Use cached parser + tree = self._parse_treesitter_cached(content_hash, content, language) + return {"type": "tree_sitter", "tree": tree, "language": language, "content": content} + except Exception as e: + print(f"Warning: Tree-sitter parsing failed for {file_path}: {e}") + print(f" Falling back to alternative parser 
if available.") + # Continue to fallback options below + + # Fallback to built-in parsers for Python + if language == "python": + decoded = content.decode("utf-8", errors="ignore") + python_ast = self._parse_python_cached(content_hash, decoded) + if python_ast: + return {"type": "python_ast", "tree": python_ast, "language": language, "content": decoded} + + # Return minimal structure to signal regex fallback for JS/TS + if language in ["javascript", "typescript"]: + print(f"Warning: AST parsing unavailable for {file_path}. Using regex fallback.") + decoded = content.decode("utf-8", errors="ignore") + return {"type": "regex_fallback", "tree": None, "language": language, "content": decoded} + + # Return None for unsupported languages + return None + + except Exception as e: + print(f"Warning: Failed to parse {file_path}: {e}") + return None + + def _detect_language(self, file_path: Path) -> str: + """Detect language from file extension.""" + ext_map = { + ".py": "python", + ".js": "javascript", + ".jsx": "javascript", + ".ts": "typescript", + ".tsx": "typescript", + ".mjs": "javascript", + ".cjs": "javascript", + ".vue": "javascript", # Vue SFCs contain JavaScript/TypeScript + } + return ext_map.get(file_path.suffix.lower(), "") # Empty not unknown + + def _parse_python_builtin(self, content: str) -> Optional[ast.AST]: + """Parse Python code using built-in ast module.""" + try: + return ast.parse(content) + except SyntaxError: + return None + + @lru_cache(maxsize=500) + def _parse_python_cached(self, content_hash: str, content: str) -> Optional[ast.AST]: + """Parse Python code with caching based on content hash. + + Args: + content_hash: MD5 hash of the file content + content: The actual file content + + Returns: + Parsed AST or None if parsing fails + """ + return self._parse_python_builtin(content) + + @lru_cache(maxsize=500) + def _parse_treesitter_cached(self, content_hash: str, content: bytes, language: str) -> Any: + """Parse code using Tree-sitter with caching based on content hash. + + Args: + content_hash: MD5 hash of the file content + content: The actual file content as bytes + language: The programming language + + Returns: + Parsed Tree-sitter tree + """ + parser = self.parsers[language] + return parser.parse(content) + + + def supports_language(self, language: str) -> bool: + """Check if a language is supported for AST parsing. + + Args: + language: Programming language name. + + Returns: + True if AST parsing is supported. + """ + # Python is always supported via built-in ast module + if language == "python": + return True + + # JavaScript and TypeScript are always supported via fallback + if language in ["javascript", "typescript"]: + return True + + # Check Tree-sitter support for other languages + if self.has_tree_sitter and language in self.parsers: + return True + + return False + + def get_supported_languages(self) -> List[str]: + """Get list of supported languages. + + Returns: + List of language names. + """ + # Always supported via built-in or fallback + languages = ["python", "javascript", "typescript"] + + if self.has_tree_sitter: + languages.extend(self.parsers.keys()) + + return sorted(set(languages)) diff --git a/theauditor/ast_patterns.py b/theauditor/ast_patterns.py new file mode 100644 index 0000000..967788d --- /dev/null +++ b/theauditor/ast_patterns.py @@ -0,0 +1,401 @@ +"""AST Pattern Matching Engine. + +This module contains all pattern matching and query logic for the AST parser. +It provides pattern-based search capabilities across different AST types. 
+""" + +import ast +from typing import Any, Optional, List, Dict, TYPE_CHECKING +from dataclasses import dataclass + +if TYPE_CHECKING: + # For type checking only, avoid circular import + from .ast_parser import ASTMatch +else: + # At runtime, ASTMatch will be available from the parent class + @dataclass + class ASTMatch: + """Represents an AST pattern match.""" + node_type: str + start_line: int + end_line: int + start_col: int + snippet: str + metadata: Dict[str, Any] = None + + +class ASTPatternMixin: + """Mixin class providing pattern matching capabilities for AST analysis.""" + + def query_ast(self, tree: Any, query_string: str) -> List[ASTMatch]: + """Execute a Tree-sitter query on the AST. + + Args: + tree: AST tree object from parse_file. + query_string: Tree-sitter query in S-expression format. + + Returns: + List of ASTMatch objects. + """ + matches = [] + + if not tree: + return matches + + # Handle Tree-sitter AST with queries + if tree.get("type") == "tree_sitter" and self.has_tree_sitter: + language = tree["language"] + if language in self.languages: + try: + # CRITICAL FIX: Use correct tree-sitter API with QueryCursor + # Per tree-sitter 0.25.1 documentation, must: + # 1. Create Query with Query() constructor + # 2. Create QueryCursor from the query + # 3. Call matches() on the cursor, not the query + from tree_sitter import Query, QueryCursor + + # Create Query object using the language and query string + query = Query(self.languages[language], query_string) + + # Create QueryCursor from the query + query_cursor = QueryCursor(query) + + # Call matches() on the cursor (not the query!) + query_matches = query_cursor.matches(tree["tree"].root_node) + + for match in query_matches: + # Each match is a tuple: (pattern_index, captures_dict) + pattern_index, captures = match + + # Process captures dictionary + for capture_name, nodes in captures.items(): + # Handle both single node and list of nodes + if not isinstance(nodes, list): + nodes = [nodes] + + for node in nodes: + start_point = node.start_point + end_point = node.end_point + snippet = node.text.decode("utf-8", errors="ignore") if node.text else "" + + ast_match = ASTMatch( + node_type=node.type, + start_line=start_point[0] + 1, + end_line=end_point[0] + 1, + start_col=start_point[1], + snippet=snippet[:200], + metadata={"capture": capture_name, "pattern": pattern_index} + ) + matches.append(ast_match) + except Exception as e: + print(f"Query error: {e}") + + # For Python AST, fall back to pattern matching + elif tree.get("type") == "python_ast": + # Convert query to pattern and use existing method + pattern = self._query_to_pattern(query_string) + if pattern: + matches = self.find_ast_matches(tree, pattern) + + return matches + + def _query_to_pattern(self, query_string: str) -> Optional[Dict]: + """Convert a Tree-sitter query to a simple pattern dict. + + This is a fallback for Python's built-in AST. + """ + # Simple heuristic conversion for common patterns + if "any" in query_string.lower(): + return {"node_type": "type_annotation", "contains": ["any"]} + elif "function" in query_string.lower(): + return {"node_type": "function_def", "contains": []} + elif "class" in query_string.lower(): + return {"node_type": "class_def", "contains": []} + return None + + def find_ast_matches(self, tree: Any, ast_pattern: dict) -> List[ASTMatch]: + """Find matches in AST based on pattern. + + Args: + tree: AST tree object. + ast_pattern: Pattern dictionary with node_type and optional contains. + + Returns: + List of ASTMatch objects. 
+ """ + matches = [] + + if not tree: + return matches + + # Handle wrapped tree objects + if isinstance(tree, dict): + tree_type = tree.get("type") + actual_tree = tree.get("tree") + + if tree_type == "tree_sitter" and self.has_tree_sitter: + matches.extend(self._find_tree_sitter_matches(actual_tree.root_node, ast_pattern)) + elif tree_type == "python_ast": + matches.extend(self._find_python_ast_matches(actual_tree, ast_pattern)) + elif tree_type == "semantic_ast": + # Handle Semantic AST from TypeScript Compiler API + matches.extend(self._find_semantic_ast_matches(actual_tree, ast_pattern)) + elif tree_type == "eslint_ast": + # Handle ESLint AST (legacy, now replaced by semantic_ast) + # For now, we treat it similarly to regex_ast but with higher confidence + matches.extend(self._find_eslint_ast_matches(actual_tree, ast_pattern)) + + # Handle direct AST objects (legacy support) + elif isinstance(tree, ast.AST): + matches.extend(self._find_python_ast_matches(tree, ast_pattern)) + + return matches + + def _find_tree_sitter_matches(self, node: Any, pattern: dict) -> List[ASTMatch]: + """Find matches in Tree-sitter AST using structural patterns.""" + matches = [] + + if node is None: + return matches + + # Check if node type matches + node_type = pattern.get("node_type", "") + + # Special handling for type annotations + if node_type == "type_annotation" and "any" in pattern.get("contains", []): + # Look for TypeScript/JavaScript any type annotations + if node.type in ["type_annotation", "type_identifier", "any_type"]: + node_text = node.text.decode("utf-8", errors="ignore") if node.text else "" + if node_text == "any" or ": any" in node_text: + start_point = node.start_point + end_point = node.end_point + + match = ASTMatch( + node_type=node.type, + start_line=start_point[0] + 1, + end_line=end_point[0] + 1, + start_col=start_point[1], + snippet=node_text[:200] + ) + matches.append(match) + + # General pattern matching + elif node.type == node_type or node_type == "*": + contains = pattern.get("contains", []) + node_text = node.text.decode("utf-8", errors="ignore") if node.text else "" + + if all(keyword in node_text for keyword in contains): + start_point = node.start_point + end_point = node.end_point + + match = ASTMatch( + node_type=node.type, + start_line=start_point[0] + 1, + end_line=end_point[0] + 1, + start_col=start_point[1], + snippet=node_text[:200], + ) + matches.append(match) + + # Recursively search children + for child in node.children: + matches.extend(self._find_tree_sitter_matches(child, pattern)) + + return matches + + def _find_semantic_ast_matches(self, tree: Dict[str, Any], pattern: dict) -> List[ASTMatch]: + """Find matches in Semantic AST from TypeScript Compiler API. + + This provides the highest fidelity analysis with full type information. 
+ """ + matches = [] + + if not tree or not tree.get("ast"): + return matches + + # Handle type-related patterns + node_type = pattern.get("node_type", "") + + if node_type == "type_annotation" and "any" in pattern.get("contains", []): + # Search for 'any' types in symbols + for symbol in tree.get("symbols", []): + if symbol.get("type") == "any": + match = ASTMatch( + node_type="any_type", + start_line=symbol.get("line", 0), + end_line=symbol.get("line", 0), + start_col=0, + snippet=f"{symbol.get('name')}: any", + metadata={"symbol": symbol.get("name"), "type": "any"} + ) + matches.append(match) + + # Also recursively search the AST for AnyKeyword nodes + def search_ast_for_any(node, depth=0): + if depth > 100 or not isinstance(node, dict): + return + + if node.get("kind") == "AnyKeyword": + match = ASTMatch( + node_type="AnyKeyword", + start_line=node.get("line", 0), + end_line=node.get("line", 0), + start_col=node.get("column", 0), + snippet=node.get("text", "any")[:200], + metadata={"kind": "AnyKeyword"} + ) + matches.append(match) + + for child in node.get("children", []): + search_ast_for_any(child, depth + 1) + + search_ast_for_any(tree.get("ast", {})) + + return matches + + def _find_eslint_ast_matches(self, tree: Dict[str, Any], pattern: dict) -> List[ASTMatch]: + """Find matches in ESLint AST. + + ESLint provides a full JavaScript/TypeScript AST with high fidelity. + This provides accurate pattern matching for JS/TS code. + """ + matches = [] + + # ESLint AST follows the ESTree specification + # Future enhancement: properly traverse the ESTree AST structure + + if not tree: + return matches + + # Basic implementation - will be enhanced in future iterations + # to properly traverse the ESTree AST structure + return matches + + + def _find_python_ast_matches(self, node: ast.AST, pattern: dict) -> List[ASTMatch]: + """Find matches in Python built-in AST.""" + matches = [] + + # Map pattern node types to Python AST node types + node_type_map = { + "if_statement": ast.If, + "while_statement": ast.While, + "for_statement": ast.For, + "function_def": ast.FunctionDef, + "class_def": ast.ClassDef, + "try_statement": ast.Try, + "type_annotation": ast.AnnAssign, # For type hints + } + + pattern_node_type = pattern.get("node_type", "") + expected_type = node_type_map.get(pattern_node_type) + + # Special handling for 'any' type in Python + if pattern_node_type == "type_annotation" and "any" in pattern.get("contains", []): + # Check for typing.Any usage + if isinstance(node, ast.Name) and node.id == "Any": + match = ASTMatch( + node_type="Any", + start_line=getattr(node, "lineno", 0), + end_line=getattr(node, "end_lineno", getattr(node, "lineno", 0)), + start_col=getattr(node, "col_offset", 0), + snippet="Any" + ) + matches.append(match) + elif isinstance(node, ast.AnnAssign): + # Check annotation for Any + node_source = ast.unparse(node) if hasattr(ast, "unparse") else "" + if "Any" in node_source: + match = ASTMatch( + node_type="AnnAssign", + start_line=getattr(node, "lineno", 0), + end_line=getattr(node, "end_lineno", getattr(node, "lineno", 0)), + start_col=getattr(node, "col_offset", 0), + snippet=node_source[:200] + ) + matches.append(match) + + # General pattern matching + elif expected_type and isinstance(node, expected_type): + contains = pattern.get("contains", []) + node_source = ast.unparse(node) if hasattr(ast, "unparse") else "" + + if all(keyword in node_source for keyword in contains): + match = ASTMatch( + node_type=node.__class__.__name__, + start_line=getattr(node, "lineno", 
0), + end_line=getattr(node, "end_lineno", getattr(node, "lineno", 0)), + start_col=getattr(node, "col_offset", 0), + snippet=node_source[:200], + ) + matches.append(match) + + # Recursively search children + for child in ast.walk(node): + if child != node: + matches.extend(self._find_python_ast_matches(child, pattern)) + + return matches + + def get_tree_sitter_query_for_pattern(self, pattern: str, language: str) -> str: + """Convert a pattern identifier to a Tree-sitter query. + + Args: + pattern: Pattern identifier (e.g., "NO_ANY_IN_SCOPE") + language: Programming language + + Returns: + Tree-sitter query string in S-expression format + """ + queries = { + "typescript": { + "NO_ANY_IN_SCOPE": """ + (type_annotation + (type_identifier) @type + (#eq? @type "any")) + """, + "NO_UNSAFE_EVAL": """ + (call_expression + function: (identifier) @func + (#eq? @func "eval")) + """, + "NO_VAR_IN_STRICT": """ + (variable_declaration + kind: "var") @var_usage + """, + }, + "javascript": { + "NO_ANY_IN_SCOPE": """ + (type_annotation + (type_identifier) @type + (#eq? @type "any")) + """, + "NO_UNSAFE_EVAL": """ + (call_expression + function: (identifier) @func + (#eq? @func "eval")) + """, + "NO_VAR_IN_STRICT": """ + (variable_declaration + kind: "var") @var_usage + """, + }, + "python": { + "NO_EVAL_EXEC": """ + (call + function: (identifier) @func + (#match? @func "^(eval|exec)$")) + """, + "NO_BARE_EXCEPT": """ + (except_clause + !type) @bare_except + """, + "NO_MUTABLE_DEFAULT": """ + (default_parameter + value: [(list) (dictionary)]) @mutable_default + """, + } + } + + language_queries = queries.get(language, {}) + return language_queries.get(pattern, "") \ No newline at end of file diff --git a/theauditor/claude_setup.py b/theauditor/claude_setup.py new file mode 100644 index 0000000..17ab5bc --- /dev/null +++ b/theauditor/claude_setup.py @@ -0,0 +1,273 @@ +"""Claude Code integration setup - Zero-optional bulletproof installer.""" + +import hashlib +import json +import platform +import shutil +import stat +import sys +from pathlib import Path +from typing import Dict, List, Optional + +from .venv_install import setup_project_venv, find_theauditor_root + +# Detect if running on Windows for character encoding +IS_WINDOWS = platform.system() == "Windows" + + +def write_file_atomic(path: Path, content: str, executable: bool = False) -> str: + """ + Write file atomically with backup if content differs. 
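A sketch of wiring a named rule into a Tree-sitter query via the two mixin methods above; the source file is hypothetical, and the snippet assumes Tree-sitter parsing succeeded (the returned tree has type `tree_sitter`).

```python
from pathlib import Path
from theauditor.ast_parser import ASTParser

parser = ASTParser()
tree = parser.parse_file(Path("src/utils.js"))  # hypothetical file

query = parser.get_tree_sitter_query_for_pattern("NO_UNSAFE_EVAL", "javascript")
if query and tree and tree.get("type") == "tree_sitter":
    for m in parser.query_ast(tree, query):
        print(m.start_line, m.snippet)
```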
+ + Args: + path: File path to write + content: Content to write + executable: Make file executable (Unix only) + + Returns: + "created" if new file + "updated" if file changed (creates .bak) + "skipped" if identical content + """ + # Ensure parent directory exists + path.parent.mkdir(parents=True, exist_ok=True) + + if path.exists(): + existing = path.read_text(encoding='utf-8') + if existing == content: + return "skipped" + + # Create backup (only once per unique content) + bak_path = path.with_suffix(path.suffix + ".bak") + if not bak_path.exists(): + shutil.copy2(path, bak_path) + + path.write_text(content, encoding='utf-8') + status = "updated" + else: + path.write_text(content, encoding='utf-8') + status = "created" + + # Set executable if needed + if executable and platform.system() != "Windows": + st = path.stat() + path.chmod(st.st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH) + + return status + + +class WrapperTemplates: + """Cross-platform wrapper script templates.""" + + POSIX_WRAPPER = '''#!/usr/bin/env bash +# Auto-generated wrapper for project-local aud +PROJ_ROOT="$(cd "$(dirname "$0")/../.." && pwd)" +VENV="$PROJ_ROOT/.auditor_venv/bin/aud" +if [ -x "$VENV" ]; then + exec "$VENV" "$@" +fi +# Fallback to module execution +exec "$PROJ_ROOT/.auditor_venv/bin/python" -m theauditor.cli "$@" +''' + + POWERSHELL_WRAPPER = r'''# Auto-generated wrapper for project-local aud +$proj = Split-Path -Path (Split-Path -Parent $MyInvocation.MyCommand.Path) -Parent +$aud = Join-Path $proj ".auditor_venv\Scripts\aud.exe" +if (Test-Path $aud) { + & $aud @args + exit $LASTEXITCODE +} +# Fallback to module execution +$python = Join-Path $proj ".auditor_venv\Scripts\python.exe" +& $python "-m" "theauditor.cli" @args +exit $LASTEXITCODE +''' + + CMD_WRAPPER = r'''@echo off +REM Auto-generated wrapper for project-local aud +set PROJ=%~dp0..\.. +if exist "%PROJ%\.auditor_venv\Scripts\aud.exe" ( + "%PROJ%\.auditor_venv\Scripts\aud.exe" %* + exit /b %ERRORLEVEL% +) +REM Fallback to module execution +"%PROJ%\.auditor_venv\Scripts\python.exe" -m theauditor.cli %* +exit /b %ERRORLEVEL% +''' + + +def create_wrappers(target_dir: Path) -> Dict[str, str]: + """ + Create cross-platform wrapper scripts. + + Args: + target_dir: Project root directory + + Returns: + Dict mapping wrapper paths to their status + """ + wrappers_dir = target_dir / ".claude" / "bin" + results = {} + + # POSIX wrapper (bash) + posix_wrapper = wrappers_dir / "aud" + status = write_file_atomic(posix_wrapper, WrapperTemplates.POSIX_WRAPPER, executable=True) + results[str(posix_wrapper)] = status + + # PowerShell wrapper + ps_wrapper = wrappers_dir / "aud.ps1" + status = write_file_atomic(ps_wrapper, WrapperTemplates.POWERSHELL_WRAPPER) + results[str(ps_wrapper)] = status + + # CMD wrapper + cmd_wrapper = wrappers_dir / "aud.cmd" + status = write_file_atomic(cmd_wrapper, WrapperTemplates.CMD_WRAPPER) + results[str(cmd_wrapper)] = status + + return results + + +def copy_agent_templates(source_dir: Path, target_dir: Path) -> Dict[str, str]: + """ + Copy all .md agent template files directly to target/.claude/agents/. 
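A hedged sketch of the `write_file_atomic()` contract: unchanged content is skipped, changed content is backed up once to a `.bak`, and new files report `created`. The target path is hypothetical.

```python
from pathlib import Path
from theauditor.claude_setup import write_file_atomic

target = Path(".claude/bin/example")  # hypothetical path
print(write_file_atomic(target, "#!/usr/bin/env bash\n", executable=True))  # created
print(write_file_atomic(target, "#!/usr/bin/env bash\n"))                   # skipped
print(write_file_atomic(target, "#!/usr/bin/env bash\necho updated\n"))     # updated (+ .bak)
```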
+ + Args: + source_dir: Directory containing agent template .md files + target_dir: Project root directory + + Returns: + Dict mapping agent paths to their status + """ + agents_dir = target_dir / ".claude" / "agents" + agents_dir.mkdir(parents=True, exist_ok=True) + + results = {} + + # Find all .md files in source directory + for md_file in source_dir.glob("*.md"): + if md_file.is_file(): + # Read content + content = md_file.read_text(encoding='utf-8') + + # Write to target + target_file = agents_dir / md_file.name + status = write_file_atomic(target_file, content) + results[str(target_file)] = status + + return results + + +def setup_claude_complete( + target: str, + source: str = "agent_templates", + sync: bool = False, + dry_run: bool = False +) -> Dict[str, List[str]]: + """ + Complete Claude setup: venv, wrappers, hooks, and agents. + + Args: + target: Target project root (absolute or relative path) + source: Path to TheAuditor agent templates directory + sync: Force update (still creates .bak on first change) + dry_run: Print plan without executing + + Returns: + Dict with created, updated, and skipped file lists + """ + # Resolve paths + target_dir = Path(target).resolve() + + if not target_dir.exists(): + raise ValueError(f"Target directory does not exist: {target_dir}") + + # Find source docs + if Path(source).is_absolute(): + source_dir = Path(source) + else: + theauditor_root = find_theauditor_root() + source_dir = theauditor_root / source + + if not source_dir.exists(): + raise ValueError(f"Source agent templates directory not found: {source_dir}") + + print(f"\n{'='*60}") + print(f"Claude Setup - Zero-Optional Installation") + print(f"{'='*60}") + print(f"Target: {target_dir}") + print(f"Source: {source_dir}") + print(f"Mode: {'DRY RUN' if dry_run else 'EXECUTE'}") + print(f"{'='*60}\n") + + if dry_run: + print("DRY RUN - Plan of operations:") + print(f"1. Create/verify venv at {target_dir}/.auditor_venv") + print(f"2. Install TheAuditor (editable) into venv") + print(f"3. Create wrappers at {target_dir}/.claude/bin/") + print(f"4. Copy agent templates from {source_dir}/*.md") + print(f"5. Write agents to {target_dir}/.claude/agents/") + print("\nNo files will be modified.") + return {"created": [], "updated": [], "skipped": []} + + results = { + "created": [], + "updated": [], + "skipped": [], + "failed": [] + } + + # Step 1: Setup venv + print("Step 1: Setting up Python virtual environment...", flush=True) + try: + venv_path, success = setup_project_venv(target_dir, force=sync) + if success: + results["created"].append(str(venv_path)) + else: + results["failed"].append(f"venv setup at {venv_path}") + print("ERROR: Failed to setup venv. 
Aborting.") + return results + except Exception as e: + print(f"ERROR setting up venv: {e}") + results["failed"].append("venv setup") + return results + + # Step 2: Create wrappers + print("\nStep 2: Creating cross-platform wrappers...", flush=True) + wrapper_results = create_wrappers(target_dir) + for path, status in wrapper_results.items(): + results[status].append(path) + + # Step 3: Copy agent templates + print("\nStep 3: Copying agent templates...", flush=True) + try: + agent_results = copy_agent_templates(source_dir, target_dir) + for path, status in agent_results.items(): + results[status].append(path) + + if not agent_results: + print("WARNING: No .md files found in agent_templates directory") + + except Exception as e: + print(f"ERROR copying agent templates: {e}") + results["failed"].append("agent template copy") + + # Summary + print(f"\n{'='*60}") + print("Setup Complete - Summary:") + print(f"{'='*60}") + print(f"Created: {len(results['created'])} files") + print(f"Updated: {len(results['updated'])} files") + print(f"Skipped: {len(results['skipped'])} files (unchanged)") + + if results['failed']: + print(f"FAILED: {len(results['failed'])} operations") + for item in results['failed']: + print(f" - {item}") + + check_mark = "[OK]" if IS_WINDOWS else "✓" + print(f"\n{check_mark} Project configured at: {target_dir}") + print(f"{check_mark} Wrapper available at: {target_dir}/.claude/bin/aud") + print(f"{check_mark} Agents installed to: {target_dir}/.claude/agents/") + print(f"{check_mark} Professional linters installed (ruff, mypy, black, ESLint, etc.)") + + return results \ No newline at end of file diff --git a/theauditor/cli.py b/theauditor/cli.py new file mode 100644 index 0000000..f71a75e --- /dev/null +++ b/theauditor/cli.py @@ -0,0 +1,239 @@ +"""TheAuditor CLI - Main entry point and command registration hub.""" + +import platform +import subprocess +import sys + +import click +from theauditor import __version__ + +# Configure UTF-8 console output for Windows +if platform.system() == "Windows": + try: + # Set console code page to UTF-8 + subprocess.run(["chcp", "65001"], shell=True, capture_output=True, timeout=1) + # Also configure Python's stdout/stderr + import codecs + sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict') + sys.stderr = codecs.getwriter('utf-8')(sys.stderr.buffer, 'strict') + except Exception: + # Silently continue if chcp fails (not critical) + pass + + +class VerboseGroup(click.Group): + """Custom group that shows all subcommands and their key options in help.""" + + def format_help(self, ctx, formatter): + """Format help to show all commands with their key options.""" + # Original help text + super().format_help(ctx, formatter) + + # Add detailed command listing + formatter.write_paragraph() + formatter.write_text("Detailed Command Overview:") + formatter.write_paragraph() + + # Core commands + formatter.write_text("CORE ANALYSIS:") + with formatter.indentation(): + formatter.write_text("aud full # Complete 13-phase security audit") + formatter.write_text(" --offline # Skip network operations (deps, docs)") + formatter.write_text(" --exclude-self # Exclude TheAuditor's own files") + formatter.write_text(" --quiet # Minimal output") + formatter.write_paragraph() + + formatter.write_text("aud index # Build file manifest and symbol database") + formatter.write_text(" --exclude-self # Exclude TheAuditor's own files") + formatter.write_paragraph() + + formatter.write_text("aud workset # Analyze only changed files") + formatter.write_text(" 
--diff HEAD~3..HEAD # Specify git commit range") + formatter.write_text(" --all # Include all files") + + formatter.write_paragraph() + formatter.write_text("SECURITY SCANNING:") + with formatter.indentation(): + formatter.write_text("aud detect-patterns # Run 100+ security pattern rules") + formatter.write_text(" --workset # Scan only workset files") + formatter.write_paragraph() + + formatter.write_text("aud taint-analyze # Track data flow from sources to sinks") + formatter.write_paragraph() + + formatter.write_text("aud docker-analyze # Analyze Docker security issues") + formatter.write_text(" --severity critical # Filter by severity") + + formatter.write_paragraph() + formatter.write_text("DEPENDENCIES:") + with formatter.indentation(): + formatter.write_text("aud deps # Analyze project dependencies") + formatter.write_text(" --vuln-scan # Run npm audit & pip-audit") + formatter.write_text(" --check-latest # Check for outdated packages") + formatter.write_text(" --upgrade-all # YOLO: upgrade everything to latest") + + formatter.write_paragraph() + formatter.write_text("CODE QUALITY:") + with formatter.indentation(): + formatter.write_text("aud lint # Run all configured linters") + formatter.write_text(" --fix # Auto-fix issues where possible") + formatter.write_text(" --workset # Lint only changed files") + + formatter.write_paragraph() + formatter.write_text("ANALYSIS & REPORTING:") + with formatter.indentation(): + formatter.write_text("aud graph build # Build dependency graph") + formatter.write_text("aud graph analyze # Find cycles and architectural issues") + formatter.write_paragraph() + + formatter.write_text("aud impact # Analyze change impact radius") + formatter.write_text(" --file src/auth.py # Specify file to analyze") + formatter.write_text(" --line 42 # Specific line number") + formatter.write_paragraph() + + formatter.write_text("aud refactor # Detect incomplete refactorings") + formatter.write_text(" --auto-detect # Auto-detect from migrations") + formatter.write_text(" --workset # Check current changes") + formatter.write_paragraph() + + formatter.write_text("aud fce # Run Factual Correlation Engine") + formatter.write_text("aud report # Generate final report") + formatter.write_text("aud structure # Generate project structure report") + + formatter.write_paragraph() + formatter.write_text("ADVANCED:") + with formatter.indentation(): + formatter.write_text("aud insights # Run optional insights analysis") + formatter.write_text(" --mode ml # ML risk predictions") + formatter.write_text(" --mode graph # Architecture health scoring") + formatter.write_text(" --mode taint # Security severity analysis") + formatter.write_paragraph() + + formatter.write_text("aud learn # Train ML models on codebase") + formatter.write_text("aud suggest # Get ML-powered suggestions") + + formatter.write_paragraph() + formatter.write_text("SETUP & CONFIG:") + with formatter.indentation(): + formatter.write_text("aud init # Initialize .pf/ directory") + formatter.write_text("aud setup-claude # Setup sandboxed JS/TS tools") + formatter.write_text(" --target . 
# Target directory") + formatter.write_paragraph() + + formatter.write_text("aud init-js # Create/merge package.json") + formatter.write_text("aud init-config # Initialize configuration") + + formatter.write_paragraph() + formatter.write_text("For detailed help on any command: aud --help") + + +@click.group(cls=VerboseGroup) +@click.version_option(version=__version__, prog_name="aud") +@click.help_option("-h", "--help") +def cli(): + """TheAuditor - Offline, air-gapped CLI for repo indexing and evidence checking. + + Quick Start: + aud init # Initialize project + aud full # Run complete audit + aud full --offline # Run without network operations + + View results in .pf/readthis/ directory.""" + pass + + +# Import and register commands +from theauditor.commands.init import init +from theauditor.commands.index import index +from theauditor.commands.workset import workset +from theauditor.commands.lint import lint +from theauditor.commands.deps import deps +from theauditor.commands.report import report +from theauditor.commands.summary import summary +from theauditor.commands.graph import graph +from theauditor.commands.full import full +from theauditor.commands.fce import fce +from theauditor.commands.impact import impact +from theauditor.commands.taint import taint_analyze +from theauditor.commands.setup import setup_claude + +# Import additional migrated commands +from theauditor.commands.detect_patterns import detect_patterns +from theauditor.commands.detect_frameworks import detect_frameworks +from theauditor.commands.docs import docs +from theauditor.commands.tool_versions import tool_versions +from theauditor.commands.init_js import init_js +from theauditor.commands.init_config import init_config +from theauditor.commands.validate_templates import validate_templates + +# Import ML commands +from theauditor.commands.ml import learn, suggest, learn_feedback + +# Import internal commands (prefixed with _) +from theauditor.commands._archive import _archive + +# Import rules command +from theauditor.commands.rules import rules_command + +# Import refactoring analysis commands +from theauditor.commands.refactor import refactor_command +from theauditor.commands.insights import insights_command + +# Import new commands +from theauditor.commands.docker_analyze import docker_analyze +from theauditor.commands.structure import structure + +# Register simple commands +cli.add_command(init) +cli.add_command(index) +cli.add_command(workset) +cli.add_command(lint) +cli.add_command(deps) +cli.add_command(report) +cli.add_command(summary) +cli.add_command(full) +cli.add_command(fce) +cli.add_command(impact) +cli.add_command(taint_analyze) +cli.add_command(setup_claude) + +# Register additional migrated commands +cli.add_command(detect_patterns) +cli.add_command(detect_frameworks) +cli.add_command(docs) +cli.add_command(tool_versions) +cli.add_command(init_js) +cli.add_command(init_config) +cli.add_command(validate_templates) + +# Register ML commands +cli.add_command(learn) +cli.add_command(suggest) +cli.add_command(learn_feedback) + +# Register internal commands (not for direct user use) +cli.add_command(_archive) + +# Register rules command +cli.add_command(rules_command) + +# Register refactoring analysis commands +cli.add_command(refactor_command, name="refactor") +cli.add_command(insights_command, name="insights") + +# Register new commands +cli.add_command(docker_analyze) +cli.add_command(structure) + +# Register command groups +cli.add_command(graph) + +# All commands have been migrated to 
separate modules + +def main(): + """Main entry point for console script.""" + cli() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/theauditor/commands/__init__.py b/theauditor/commands/__init__.py new file mode 100644 index 0000000..df0c596 --- /dev/null +++ b/theauditor/commands/__init__.py @@ -0,0 +1 @@ +"""Commands module for TheAuditor CLI.""" \ No newline at end of file diff --git a/theauditor/commands/_archive.py b/theauditor/commands/_archive.py new file mode 100644 index 0000000..6839d09 --- /dev/null +++ b/theauditor/commands/_archive.py @@ -0,0 +1,107 @@ +"""Internal archive command for segregating history by run type.""" + +import shutil +import sys +from datetime import datetime +from pathlib import Path + +import click + + +@click.command(name="_archive") +@click.option("--run-type", required=True, type=click.Choice(["full", "diff"]), help="Type of run being archived") +@click.option("--diff-spec", help="Git diff specification for diff runs (e.g., main..HEAD)") +def _archive(run_type: str, diff_spec: str = None): + """ + Internal command to archive previous run artifacts with segregation by type. + + This command is not intended for direct user execution. It's called by + the full and orchestrate workflows to maintain clean, segregated history. + """ + # Define base paths + pf_dir = Path(".pf") + history_dir = pf_dir / "history" + + # Check if there's a previous run to archive (by checking if .pf exists and has files) + if not pf_dir.exists() or not any(pf_dir.iterdir()): + # No previous run to archive + print("[ARCHIVE] No previous run artifacts found to archive", file=sys.stderr) + return + + # Determine destination base path based on run type + if run_type == "full": + dest_base = history_dir / "full" + else: # run_type == "diff" + dest_base = history_dir / "diff" + + # Create destination base directory if it doesn't exist + dest_base.mkdir(parents=True, exist_ok=True) + + # Generate timestamp for archive directory + timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Create unique directory name + if run_type == "diff" and diff_spec: + # Sanitize diff spec for directory name + # Replace problematic characters with underscores + safe_spec = diff_spec.replace("..", "_") + safe_spec = safe_spec.replace("/", "_") + safe_spec = safe_spec.replace("\\", "_") + safe_spec = safe_spec.replace(":", "_") + safe_spec = safe_spec.replace(" ", "_") + safe_spec = safe_spec.replace("~", "_") + safe_spec = safe_spec.replace("^", "_") + + # Create descriptive name like "main_HEAD_20250819_090015" + dir_name = f"{safe_spec}_{timestamp_str}" + else: + # Simple timestamp for full runs + dir_name = timestamp_str + + # Create the archive destination directory + archive_dest = dest_base / dir_name + archive_dest.mkdir(exist_ok=True) + + # Move all top-level items from pf_dir to archive_dest + archived_count = 0 + skipped_count = 0 + + for item in pf_dir.iterdir(): + # CRITICAL: Skip the history directory itself to prevent recursive archiving + if item.name == "history": + continue + + # Safely move the item to archive destination + try: + shutil.move(str(item), str(archive_dest)) + archived_count += 1 + except Exception as e: + # Log error but don't stop the archiving process + print(f"[WARNING] Could not archive {item.name}: {e}", file=sys.stderr) + skipped_count += 1 + + # Log summary + if archived_count > 0: + click.echo(f"[ARCHIVE] Archived {archived_count} items to {archive_dest}") + if skipped_count > 0: + click.echo(f"[ARCHIVE] Skipped 
{skipped_count} items due to errors") + else: + click.echo("[ARCHIVE] No artifacts archived (directory was empty)") + + # Create a metadata file in the archive to track run type and context + metadata = { + "run_type": run_type, + "diff_spec": diff_spec, + "timestamp": timestamp_str, + "archived_at": datetime.now().isoformat(), + "files_archived": archived_count, + "files_skipped": skipped_count, + } + + try: + import json + metadata_path = archive_dest / "_metadata.json" + with open(metadata_path, 'w') as f: + json.dump(metadata, f, indent=2) + except Exception as e: + print(f"[WARNING] Could not write metadata file: {e}", file=sys.stderr) \ No newline at end of file diff --git a/theauditor/commands/deps.py b/theauditor/commands/deps.py new file mode 100644 index 0000000..cd6b79a --- /dev/null +++ b/theauditor/commands/deps.py @@ -0,0 +1,191 @@ +"""Parse and analyze project dependencies.""" + +import platform +from pathlib import Path +import click +from theauditor.utils.error_handler import handle_exceptions +from theauditor.utils.exit_codes import ExitCodes + +# Detect if running on Windows for character encoding +IS_WINDOWS = platform.system() == "Windows" + + +@click.command() +@handle_exceptions +@click.option("--root", default=".", help="Root directory") +@click.option("--check-latest", is_flag=True, help="Check for latest versions from registries") +@click.option("--upgrade-all", is_flag=True, help="YOLO mode: Update ALL packages to latest versions") +@click.option("--offline", is_flag=True, help="Force offline mode (no network)") +@click.option("--out", default="./.pf/raw/deps.json", help="Output dependencies file") +@click.option("--print-stats", is_flag=True, help="Print dependency statistics") +@click.option("--vuln-scan", is_flag=True, help="Scan dependencies for known vulnerabilities") +def deps(root, check_latest, upgrade_all, offline, out, print_stats, vuln_scan): + """Parse and analyze project dependencies.""" + from theauditor.deps import parse_dependencies, write_deps_json, check_latest_versions, write_deps_latest_json, upgrade_all_deps + from theauditor.vulnerability_scanner import scan_dependencies, write_vulnerabilities_json, format_vulnerability_report + import sys + + # Parse dependencies + deps_list = parse_dependencies(root_path=root) + + if not deps_list: + click.echo("No dependency files found (package.json, pyproject.toml, requirements.txt)") + click.echo(" Searched in: " + str(Path(root).resolve())) + return + + write_deps_json(deps_list, output_path=out) + + # Vulnerability scanning + if vuln_scan: + click.echo(f"\n[SCAN] Running native vulnerability scanners...") + click.echo(f" Using: npm audit, pip-audit (if available)") + + vulnerabilities = scan_dependencies(deps_list, offline=offline) + + if vulnerabilities: + # Write JSON report + vuln_output = out.replace("deps.json", "vulnerabilities.json") + write_vulnerabilities_json(vulnerabilities, output_path=vuln_output) + + # Display human-readable report + report = format_vulnerability_report(vulnerabilities) + click.echo("\n" + report) + click.echo(f"\nDetailed report written to {vuln_output}") + + # Exit with error code if critical vulnerabilities found + critical_count = sum(1 for v in vulnerabilities if v["severity"] == "critical") + if critical_count > 0: + click.echo(f"\n[FAIL] Found {critical_count} CRITICAL vulnerabilities - failing build") + sys.exit(ExitCodes.CRITICAL_SEVERITY) + else: + click.echo(f" [OK] No known vulnerabilities found in dependencies") + + # Don't continue with other operations 
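A condensed sketch of the vulnerability gate implemented in the deps command above, reusing the helpers it imports; the literal exit code stands in for ExitCodes.CRITICAL_SEVERITY, whose numeric value is not shown in this patch:

    from theauditor.deps import parse_dependencies
    from theauditor.vulnerability_scanner import scan_dependencies

    deps_list = parse_dependencies(root_path=".")
    vulns = scan_dependencies(deps_list, offline=False)

    # Fail the build only when at least one finding is rated critical.
    critical = sum(1 for v in vulns if v["severity"] == "critical")
    if critical:
        raise SystemExit(2)  # stand-in for ExitCodes.CRITICAL_SEVERITY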
after vuln scan + return + + # YOLO MODE: Upgrade all to latest + if upgrade_all and not offline: + click.echo("[YOLO MODE] Upgrading ALL packages to latest versions...") + click.echo(" [WARN] This may break things. That's the point!") + + # Get latest versions + latest_info = check_latest_versions(deps_list, allow_net=True, offline=offline) + if not latest_info: + click.echo(" [FAIL] Failed to fetch latest versions") + return + + # Check if all packages were successfully checked + failed_checks = sum(1 for info in latest_info.values() if info.get("error") is not None) + successful_checks = sum(1 for info in latest_info.values() if info.get("latest") is not None) + + if failed_checks > 0: + click.echo(f"\n [WARN] Only {successful_checks}/{len(latest_info)} packages checked successfully") + click.echo(f" [FAIL] Cannot upgrade with {failed_checks} failed checks") + click.echo(" Fix network issues and try again") + return + + # Upgrade all dependency files + upgraded = upgrade_all_deps(root_path=root, latest_info=latest_info, deps_list=deps_list) + + # Count unique packages that were upgraded + unique_upgraded = len([1 for k, v in latest_info.items() if v.get("is_outdated", False)]) + total_updated = sum(upgraded.values()) + + click.echo(f"\n[UPGRADED] Dependency files:") + for file_type, count in upgraded.items(): + if count > 0: + click.echo(f" [OK] {file_type}: {count} dependency entries updated") + + # Show summary that matches the "Outdated: 10/29" format + if total_updated > unique_upgraded: + click.echo(f"\n Summary: {unique_upgraded} unique packages updated across {total_updated} occurrences") + + click.echo("\n[NEXT STEPS]:") + click.echo(" 1. Run: pip install -r requirements.txt") + click.echo(" 2. Or: npm install") + click.echo(" 3. Pray it still works") + return + + # Check latest versions if requested + latest_info = {} + if check_latest and not offline: + # Count unique packages first + unique_packages = {} + for dep in deps_list: + key = f"{dep['manager']}:{dep['name']}" + if key not in unique_packages: + unique_packages[key] = 0 + unique_packages[key] += 1 + + click.echo(f"Checking {len(deps_list)} dependencies for updates...") + click.echo(f" Unique packages to check: {len(unique_packages)}") + click.echo(" Connecting to: npm registry and PyPI") + latest_info = check_latest_versions(deps_list, allow_net=True, offline=offline) + if latest_info: + write_deps_latest_json(latest_info, output_path=out.replace("deps.json", "deps_latest.json")) + + # Count successful vs failed checks + successful_checks = sum(1 for info in latest_info.values() if info.get("latest") is not None) + failed_checks = sum(1 for info in latest_info.values() if info.get("error") is not None) + + click.echo(f" [OK] Checked {successful_checks}/{len(unique_packages)} unique packages") + if failed_checks > 0: + click.echo(f" [WARN] {failed_checks} packages failed to check") + # Show first few errors + errors = [(k.split(":")[1], v["error"]) for k, v in latest_info.items() if v.get("error")][:3] + for pkg, err in errors: + click.echo(f" - {pkg}: {err}") + else: + click.echo(" [FAIL] Failed to check versions (network issue or offline mode)") + + # Always show output + click.echo(f"Dependencies written to {out}") + + # Count by manager + npm_count = sum(1 for d in deps_list if d["manager"] == "npm") + py_count = sum(1 for d in deps_list if d["manager"] == "py") + + click.echo(f" Total: {len(deps_list)} dependencies") + if npm_count > 0: + click.echo(f" Node/npm: {npm_count}") + if py_count > 0: + click.echo(f" 
Python: {py_count}") + + if latest_info: + # Count how many of the TOTAL deps are outdated (only if successfully checked) + outdated_deps = 0 + checked_deps = 0 + for dep in deps_list: + key = f"{dep['manager']}:{dep['name']}" + if key in latest_info and latest_info[key].get("latest") is not None: + checked_deps += 1 + if latest_info[key]["is_outdated"]: + outdated_deps += 1 + + # Also count unique outdated packages + outdated_unique = sum(1 for info in latest_info.values() if info.get("is_outdated", False)) + + # Show outdated/checked rather than outdated/total + if checked_deps == len(deps_list): + # All were checked successfully + click.echo(f" Outdated: {outdated_deps}/{len(deps_list)}") + else: + # Some failed, show both numbers + click.echo(f" Outdated: {outdated_deps}/{checked_deps} checked ({len(deps_list)} total)") + + # Show major updates + major_updates = [ + (k.split(":")[1], v["locked"], v["latest"]) + for k, v in latest_info.items() + if v.get("delta") == "major" + ] + if major_updates: + click.echo("\n Major version updates available:") + for name, locked, latest in major_updates[:5]: + click.echo(f" - {name}: {locked} -> {latest}") + if len(major_updates) > 5: + click.echo(f" ... and {len(major_updates) - 5} more") + + # Add a helpful hint if no network operation was performed + if not check_latest and not upgrade_all: + click.echo("\nTIP: Run with --check-latest to check for outdated packages.") \ No newline at end of file diff --git a/theauditor/commands/detect_frameworks.py b/theauditor/commands/detect_frameworks.py new file mode 100644 index 0000000..975979c --- /dev/null +++ b/theauditor/commands/detect_frameworks.py @@ -0,0 +1,46 @@ +"""Detect frameworks and libraries used in the project.""" + +import json +import click +from pathlib import Path + + +@click.command("detect-frameworks") +@click.option("--project-path", default=".", help="Root directory to analyze") +@click.option("--output-json", help="Path to output JSON file (default: .pf/raw/frameworks.json)") +def detect_frameworks(project_path, output_json): + """Detect frameworks and libraries used in the project.""" + from theauditor.framework_detector import FrameworkDetector + + try: + # Initialize detector + project_path = Path(project_path).resolve() + + detector = FrameworkDetector(project_path, exclude_patterns=[]) + + # Detect frameworks + frameworks = detector.detect_all() + + # Determine output path - always save to .pf/frameworks.json by default + if output_json: + # User specified custom path + save_path = Path(output_json) + else: + # Default path + save_path = Path(project_path) / ".pf" / "raw" / "frameworks.json" + + # Always save the JSON output + detector.save_to_file(save_path) + click.echo(f"Frameworks written to {save_path}") + + # Display table + table = detector.format_table() + click.echo(table) + + # Return success + if frameworks: + click.echo(f"\nDetected {len(frameworks)} framework(s)") + + except Exception as e: + click.echo(f"Error: {e}", err=True) + raise click.ClickException(str(e)) from e \ No newline at end of file diff --git a/theauditor/commands/detect_patterns.py b/theauditor/commands/detect_patterns.py new file mode 100644 index 0000000..b0d3c62 --- /dev/null +++ b/theauditor/commands/detect_patterns.py @@ -0,0 +1,81 @@ +"""Detect universal runtime, DB, and logic patterns in code.""" + +import click +from pathlib import Path +from theauditor.utils.helpers import get_self_exclusion_patterns + + +@click.command("detect-patterns") +@click.option("--project-path", default=".", 
help="Root directory to analyze") +@click.option("--patterns", multiple=True, help="Pattern categories to use (e.g., runtime_issues, db_issues)") +@click.option("--output-json", help="Path to output JSON file") +@click.option("--file-filter", help="Glob pattern to filter files") +@click.option("--max-rows", default=50, type=int, help="Maximum rows to display in table") +@click.option("--print-stats", is_flag=True, help="Print summary statistics") +@click.option("--with-ast/--no-ast", default=True, help="Enable AST-based pattern matching") +@click.option("--with-frameworks/--no-frameworks", default=True, help="Enable framework detection and framework-specific patterns") +@click.option("--exclude-self", is_flag=True, help="Exclude TheAuditor's own files (for self-testing)") +def detect_patterns(project_path, patterns, output_json, file_filter, max_rows, print_stats, with_ast, with_frameworks, exclude_self): + """Detect universal runtime, DB, and logic patterns in code.""" + from theauditor.pattern_loader import PatternLoader + from theauditor.universal_detector import UniversalPatternDetector + + try: + # Initialize detector + project_path = Path(project_path).resolve() + pattern_loader = PatternLoader() + + # Get exclusion patterns using centralized function + exclude_patterns = get_self_exclusion_patterns(exclude_self) + + detector = UniversalPatternDetector( + project_path, + pattern_loader, + with_ast=with_ast, + with_frameworks=with_frameworks, + exclude_patterns=exclude_patterns + ) + + # Run detection + categories = list(patterns) if patterns else None + findings = detector.detect_patterns(categories=categories, file_filter=file_filter) + + # Always save results to default location + patterns_output = project_path / ".pf" / "raw" / "patterns.json" + patterns_output.parent.mkdir(parents=True, exist_ok=True) + + # Save to user-specified location if provided + if output_json: + detector.to_json(Path(output_json)) + click.echo(f"\n[OK] Full results saved to: {output_json}") + + # Save to default location + detector.to_json(patterns_output) + click.echo(f"[OK] Full results saved to: {patterns_output}") + + # Display table + table = detector.format_table(max_rows=max_rows) + click.echo(table) + + # Print statistics if requested + if print_stats: + stats = detector.get_summary_stats() + click.echo("\n--- Summary Statistics ---") + click.echo(f"Total findings: {stats['total_findings']}") + click.echo(f"Files affected: {stats['files_affected']}") + + if stats['by_severity']: + click.echo("\nBy severity:") + for severity, count in sorted(stats['by_severity'].items()): + click.echo(f" {severity}: {count}") + + if stats['by_category']: + click.echo("\nBy category:") + for category, count in sorted(stats['by_category'].items()): + click.echo(f" {category}: {count}") + + # Successfully completed - found and reported all issues + + except Exception as e: + click.echo(f"Error: {e}", err=True) + raise click.ClickException(str(e)) from e \ No newline at end of file diff --git a/theauditor/commands/docker_analyze.py b/theauditor/commands/docker_analyze.py new file mode 100644 index 0000000..bf0b18c --- /dev/null +++ b/theauditor/commands/docker_analyze.py @@ -0,0 +1,94 @@ +"""Docker security analysis command.""" + +import click +import json +from pathlib import Path +from theauditor.utils.error_handler import handle_exceptions +from theauditor.utils.exit_codes import ExitCodes + + +@click.command("docker-analyze") +@handle_exceptions +@click.option("--db-path", default="./.pf/repo_index.db", help="Path 
to repo_index.db") +@click.option("--output", help="Output file for findings (JSON format)") +@click.option("--severity", type=click.Choice(["all", "critical", "high", "medium", "low"]), + default="all", help="Minimum severity to report") +@click.option("--check-vulns/--no-check-vulns", default=True, + help="Check base images for vulnerabilities (requires network)") +def docker_analyze(db_path, output, severity, check_vulns): + """Analyze Docker images for security issues. + + Detects: + - Containers running as root + - Exposed secrets in ENV/ARG instructions + - High entropy values (potential secrets) + - Base image vulnerabilities (if --check-vulns enabled) + """ + from theauditor.docker_analyzer import analyze_docker_images + + # Check if database exists + if not Path(db_path).exists(): + click.echo(f"Error: Database not found at {db_path}", err=True) + click.echo("Run 'aud index' first to create the database", err=True) + return ExitCodes.TASK_INCOMPLETE + + # Run analysis + click.echo("Analyzing Docker images for security issues...") + if check_vulns: + click.echo(" Including vulnerability scan of base images...") + findings = analyze_docker_images(db_path, check_vulnerabilities=check_vulns) + + # Filter by severity if requested + if severity != "all": + severity_order = {"critical": 4, "high": 3, "medium": 2, "low": 1} + min_severity = severity_order.get(severity.lower(), 0) + findings = [f for f in findings + if severity_order.get(f.get("severity", "").lower(), 0) >= min_severity] + + # Count by severity + severity_counts = {} + for finding in findings: + sev = finding.get("severity", "unknown").lower() + severity_counts[sev] = severity_counts.get(sev, 0) + 1 + + # Display results + if findings: + click.echo(f"\nFound {len(findings)} Docker security issues:") + + # Show severity breakdown + for sev in ["critical", "high", "medium", "low"]: + if sev in severity_counts: + click.echo(f" {sev.upper()}: {severity_counts[sev]}") + + # Show findings + click.echo("\nFindings:") + for finding in findings: + click.echo(f"\n[{finding['severity'].upper()}] {finding['type']}") + click.echo(f" File: {finding['file']}") + click.echo(f" {finding['message']}") + if finding.get('recommendation'): + click.echo(f" Fix: {finding['recommendation']}") + else: + click.echo("No Docker security issues found") + + # Save to file if requested + if output: + output_path = Path(output) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'w') as f: + json.dump({ + "findings": findings, + "summary": severity_counts, + "total": len(findings) + }, f, indent=2) + + click.echo(f"\nResults saved to: {output}") + + # Exit with appropriate code + if severity_counts.get("critical", 0) > 0: + return ExitCodes.CRITICAL_SEVERITY + elif severity_counts.get("high", 0) > 0: + return ExitCodes.HIGH_SEVERITY + else: + return ExitCodes.SUCCESS \ No newline at end of file diff --git a/theauditor/commands/docs.py b/theauditor/commands/docs.py new file mode 100644 index 0000000..f5b7446 --- /dev/null +++ b/theauditor/commands/docs.py @@ -0,0 +1,201 @@ +"""Fetch or summarize documentation for dependencies.""" + +import json +import click +from pathlib import Path + + +@click.command("docs") +@click.argument("action", type=click.Choice(["fetch", "summarize", "view", "list"])) +@click.argument("package_name", required=False) +@click.option("--deps", default="./.pf/deps.json", help="Input dependencies file") +@click.option("--offline", is_flag=True, help="Force offline mode") 
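The severity filter in docker-analyze above reduces to a rank comparison; a minimal standalone sketch with the rank values copied from that command:

    SEVERITY_RANK = {"critical": 4, "high": 3, "medium": 2, "low": 1}

    def filter_by_severity(findings, minimum="high"):
        """Keep findings whose severity is at or above the requested floor."""
        floor = SEVERITY_RANK.get(minimum.lower(), 0)
        return [
            f for f in findings
            if SEVERITY_RANK.get(f.get("severity", "").lower(), 0) >= floor
        ]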
+@click.option("--allow-non-gh-readmes", is_flag=True, help="Allow non-GitHub README fetching") +@click.option("--docs-dir", default="./.pf/context/docs", help="Documentation cache directory") +@click.option("--capsules-dir", default="./.pf/context/doc_capsules", help="Output capsules directory") +@click.option("--workset", default="./.pf/workset.json", help="Workset file for filtering") +@click.option("--print-stats", is_flag=True, help="Print statistics") +@click.option("--raw", is_flag=True, help="View raw fetched doc instead of capsule") +def docs(action, package_name, deps, offline, allow_non_gh_readmes, docs_dir, capsules_dir, workset, print_stats, raw): + """Fetch or summarize documentation for dependencies.""" + from theauditor.deps import parse_dependencies + from theauditor.docs_fetch import fetch_docs, DEFAULT_ALLOWLIST + from theauditor.docs_summarize import summarize_docs + + try: + if action == "fetch": + # Load dependencies + if Path(deps).exists(): + with open(deps, encoding="utf-8") as f: + deps_list = json.load(f) + else: + # Parse if not cached + deps_list = parse_dependencies() + + # Set up allowlist + allowlist = DEFAULT_ALLOWLIST.copy() + if not allow_non_gh_readmes: + # Already restricted to GitHub by default + pass + + # Check for policy file + policy_file = Path(".pf/policy.yml") + allow_net = True + if policy_file.exists(): + try: + # Simple YAML parsing without external deps + with open(policy_file, encoding="utf-8") as f: + for line in f: + if "allow_net:" in line: + allow_net = "true" in line.lower() + break + except Exception: + pass # Default to True + + # Fetch docs + result = fetch_docs( + deps_list, + allow_net=allow_net, + allowlist=allowlist, + offline=offline, + output_dir=docs_dir + ) + + if not print_stats: + if result["mode"] == "offline": + click.echo("Running in offline mode - no documentation fetched") + else: + click.echo(f"Documentation fetch complete:") + click.echo(f" Fetched: {result['fetched']}") + click.echo(f" Cached: {result['cached']}") + click.echo(f" Skipped: {result['skipped']}") + if result["errors"]: + click.echo(f" Errors: {len(result['errors'])}") + + elif action == "summarize": + # Summarize docs + result = summarize_docs( + docs_dir=docs_dir, + output_dir=capsules_dir, + workset_path=workset if Path(workset).exists() else None + ) + + if not print_stats: + click.echo(f"Documentation capsules created:") + click.echo(f" Capsules: {result['capsules_created']}") + click.echo(f" Skipped: {result['skipped']}") + if result["errors"]: + click.echo(f" Errors: {len(result['errors'])}") + + index_file = Path(capsules_dir).parent / "doc_index.json" + click.echo(f" Index: {index_file}") + + elif action == "list": + # List available docs and capsules + docs_path = Path(docs_dir) + capsules_path = Path(capsules_dir) + + click.echo("\n[Docs] Available Documentation:\n") + + # List fetched docs + if docs_path.exists(): + click.echo("Fetched Docs (.pf/context/docs/):") + for ecosystem in ["npm", "py"]: + ecosystem_dir = docs_path / ecosystem + if ecosystem_dir.exists(): + packages = sorted([d.name for d in ecosystem_dir.iterdir() if d.is_dir()]) + if packages: + click.echo(f"\n {ecosystem.upper()}:") + for pkg in packages[:20]: # Show first 20 + click.echo(f" * {pkg}") + if len(packages) > 20: + click.echo(f" ... 
and {len(packages) - 20} more") + + # List capsules + if capsules_path.exists(): + click.echo("\nDoc Capsules (.pf/context/doc_capsules/):") + capsules = sorted([f.stem for f in capsules_path.glob("*.md")]) + if capsules: + for capsule in capsules[:20]: # Show first 20 + click.echo(f" * {capsule}") + if len(capsules) > 20: + click.echo(f" ... and {len(capsules) - 20} more") + + click.echo("\n[TIP] Use 'aud docs view ' to view a specific doc") + click.echo(" Add --raw to see the full fetched doc instead of capsule") + + elif action == "view": + if not package_name: + click.echo("Error: Package name required for view action") + click.echo("Usage: aud docs view ") + click.echo(" aud docs view geopandas") + click.echo(" aud docs view numpy --raw") + raise click.ClickException("Package name required") + + docs_path = Path(docs_dir) + capsules_path = Path(capsules_dir) + found = False + + if raw: + # View raw fetched doc + for ecosystem in ["npm", "py"]: + # Try exact match first + for pkg_dir in (docs_path / ecosystem).glob(f"{package_name}@*"): + if pkg_dir.is_dir(): + doc_file = pkg_dir / "doc.md" + if doc_file.exists(): + click.echo(f"\n[RAW DOC] Raw Doc: {pkg_dir.name}\n") + click.echo("=" * 80) + with open(doc_file, encoding="utf-8") as f: + content = f.read() + # Limit output for readability + lines = content.split("\n") + if len(lines) > 200: + click.echo("\n".join(lines[:200])) + click.echo(f"\n... (truncated, {len(lines) - 200} more lines)") + else: + click.echo(content) + found = True + break + if found: + break + else: + # View capsule (default) + # Try exact match first + for capsule_file in capsules_path.glob(f"*{package_name}*.md"): + if capsule_file.exists(): + click.echo(f"\n[CAPSULE] Capsule: {capsule_file.stem}\n") + click.echo("=" * 80) + with open(capsule_file, encoding="utf-8") as f: + click.echo(f.read()) + click.echo("\n" + "=" * 80) + + # Try to find the corresponding full doc + package_parts = capsule_file.stem.replace("__", "@").split("@") + if len(package_parts) >= 2: + ecosystem_prefix = package_parts[0] + pkg_name = "@".join(package_parts[:-1]).replace(ecosystem_prefix + "@", "") + version = package_parts[-1] + ecosystem = "py" if ecosystem_prefix == "py" else "npm" + full_doc_path = f"./.pf/context/docs/{ecosystem}/{pkg_name}@{version}/doc.md" + click.echo(f"\n[SOURCE] Full Documentation: `{full_doc_path}`") + + click.echo("[TIP] Use --raw to see the full fetched documentation") + found = True + break + + if not found: + click.echo(f"No documentation found for '{package_name}'") + click.echo("\nAvailable packages:") + # Show some available packages + for ecosystem in ["npm", "py"]: + ecosystem_dir = docs_path / ecosystem + if ecosystem_dir.exists(): + packages = [d.name for d in ecosystem_dir.iterdir() if d.is_dir()][:5] + if packages: + click.echo(f" {ecosystem.upper()}: {', '.join(packages)}") + click.echo("\nUse 'aud docs list' to see all available docs") + + except Exception as e: + click.echo(f"Error: {e}", err=True) + raise click.ClickException(str(e)) from e \ No newline at end of file diff --git a/theauditor/commands/fce.py b/theauditor/commands/fce.py new file mode 100644 index 0000000..0511e69 --- /dev/null +++ b/theauditor/commands/fce.py @@ -0,0 +1,43 @@ +"""Run Factual Correlation Engine to aggregate and correlate findings.""" + +import click +from theauditor.utils.error_handler import handle_exceptions + + +@click.command(name="fce") +@handle_exceptions +@click.option("--root", default=".", help="Root directory") +@click.option("--capsules", 
default="./.pf/capsules", help="Capsules directory") +@click.option("--manifest", default="manifest.json", help="Manifest file path") +@click.option("--workset", default="./.pf/workset.json", help="Workset file path") +@click.option("--timeout", default=600, type=int, help="Timeout in seconds") +@click.option("--print-plan", is_flag=True, help="Print detected tools without running") +def fce(root, capsules, manifest, workset, timeout, print_plan): + """Run Factual Correlation Engine to aggregate and correlate findings.""" + from theauditor.fce import run_fce + + result = run_fce( + root_path=root, + capsules_dir=capsules, + manifest_path=manifest, + workset_path=workset, + timeout=timeout, + print_plan=print_plan, + ) + + if result.get("printed_plan"): + return + + if result["success"]: + if result["failures_found"] == 0: + click.echo("[OK] All tools passed - no failures detected") + else: + click.echo(f"Found {result['failures_found']} failures") + # Check if output_files exists and has at least 2 elements + if result.get('output_files') and len(result.get('output_files', [])) > 1: + click.echo(f"FCE report written to: {result['output_files'][1]}") + elif result.get('output_files') and len(result.get('output_files', [])) > 0: + click.echo(f"FCE report written to: {result['output_files'][0]}") + else: + click.echo(f"Error: {result.get('error', 'Unknown error')}", err=True) + raise click.ClickException(result.get("error", "FCE failed")) \ No newline at end of file diff --git a/theauditor/commands/full.py b/theauditor/commands/full.py new file mode 100644 index 0000000..6d938e7 --- /dev/null +++ b/theauditor/commands/full.py @@ -0,0 +1,90 @@ +"""Run complete audit pipeline.""" + +import sys +import click +from theauditor.utils.error_handler import handle_exceptions +from theauditor.utils.exit_codes import ExitCodes + + +@click.command() +@handle_exceptions +@click.option("--root", default=".", help="Root directory to analyze") +@click.option("--quiet", is_flag=True, help="Minimal output") +@click.option("--exclude-self", is_flag=True, help="Exclude TheAuditor's own files (for self-testing)") +@click.option("--offline", is_flag=True, help="Skip network operations (deps, docs)") +def full(root, quiet, exclude_self, offline): + """Run complete audit pipeline in exact order specified in teamsop.md.""" + from theauditor.pipelines import run_full_pipeline + + # Define log callback for console output + def log_callback(message, is_error=False): + if is_error: + click.echo(message, err=True) + else: + click.echo(message) + + # Run the pipeline + result = run_full_pipeline( + root=root, + quiet=quiet, + exclude_self=exclude_self, + offline=offline, + log_callback=log_callback if not quiet else None + ) + + # Display clear status message based on results + findings = result.get("findings", {}) + critical = findings.get("critical", 0) + high = findings.get("high", 0) + medium = findings.get("medium", 0) + low = findings.get("low", 0) + + click.echo("\n" + "=" * 60) + click.echo("AUDIT FINAL STATUS") + click.echo("=" * 60) + + # Determine overall status and exit code + exit_code = ExitCodes.SUCCESS + + # Check for pipeline failures first + if result["failed_phases"] > 0: + click.echo(f"[WARNING] Pipeline completed with {result['failed_phases']} phase failures") + click.echo("Some analysis phases could not complete successfully.") + exit_code = ExitCodes.TASK_INCOMPLETE # Exit code for pipeline failures + + # Then check for security findings + if critical > 0: + click.echo(f"\nSTATUS: [CRITICAL] - Audit 
complete. Found {critical} critical vulnerabilities.") + click.echo("Immediate action required - deployment should be blocked.") + exit_code = ExitCodes.CRITICAL_SEVERITY # Exit code for critical findings + elif high > 0: + click.echo(f"\nSTATUS: [HIGH] - Audit complete. Found {high} high-severity issues.") + click.echo("Priority remediation needed before next release.") + if exit_code == ExitCodes.SUCCESS: + exit_code = ExitCodes.HIGH_SEVERITY # Exit code for high findings (unless already set for failures) + elif medium > 0 or low > 0: + click.echo(f"\nSTATUS: [MODERATE] - Audit complete. Found {medium} medium and {low} low issues.") + click.echo("Schedule fixes for upcoming sprints.") + else: + click.echo("\nSTATUS: [CLEAN] - No critical or high-severity issues found.") + click.echo("Codebase meets security and quality standards.") + + # Show findings breakdown if any exist + if critical + high + medium + low > 0: + click.echo("\nFindings breakdown:") + if critical > 0: + click.echo(f" - Critical: {critical}") + if high > 0: + click.echo(f" - High: {high}") + if medium > 0: + click.echo(f" - Medium: {medium}") + if low > 0: + click.echo(f" - Low: {low}") + + click.echo("\nReview the chunked data in .pf/readthis/ for complete findings.") + click.echo("=" * 60) + + # Exit with appropriate code for CI/CD automation + # Using standardized exit codes from ExitCodes class + if exit_code != ExitCodes.SUCCESS: + sys.exit(exit_code) \ No newline at end of file diff --git a/theauditor/commands/graph.py b/theauditor/commands/graph.py new file mode 100644 index 0000000..6e5da1a --- /dev/null +++ b/theauditor/commands/graph.py @@ -0,0 +1,639 @@ +"""Cross-project dependency and call graph analysis.""" + +import json +from pathlib import Path +import click +from theauditor.config_runtime import load_runtime_config + + +@click.group() +@click.help_option("-h", "--help") +def graph(): + """Cross-project dependency and call graph analysis.""" + pass + + +@graph.command("build") +@click.option("--root", default=".", help="Root directory to analyze") +@click.option("--langs", multiple=True, help="Languages to process (e.g., python, javascript)") +@click.option("--workset", help="Path to workset.json to limit scope") +@click.option("--batch-size", default=200, type=int, help="Files per batch") +@click.option("--resume", is_flag=True, help="Resume from checkpoint") +@click.option("--db", default="./.pf/graphs.db", help="SQLite database path") +@click.option("--out-json", default="./.pf/raw/", help="JSON output directory") +def graph_build(root, langs, workset, batch_size, resume, db, out_json): + """Build import and call graphs for project.""" + from theauditor.graph.builder import XGraphBuilder + from theauditor.graph.store import XGraphStore + + try: + # Initialize builder and store + builder = XGraphBuilder(batch_size=batch_size, exclude_patterns=[], project_root=root) + store = XGraphStore(db_path=db) + + # Load workset if provided + file_filter = None + workset_files = set() + if workset: + workset_path = Path(workset) + if workset_path.exists(): + with open(workset_path) as f: + workset_data = json.load(f) + # Extract file paths from workset + workset_files = {p["path"] for p in workset_data.get("paths", [])} + click.echo(f"Loaded workset with {len(workset_files)} files") + + # Clear checkpoint if not resuming + if not resume and builder.checkpoint_file.exists(): + builder.checkpoint_file.unlink() + + # Load manifest.json if it exists to use as file list + file_list = None + config = 
load_runtime_config(root) + manifest_path = Path(config["paths"]["manifest"]) + if manifest_path.exists(): + click.echo("Loading file manifest...") + with open(manifest_path, 'r') as f: + manifest_data = json.load(f) + + # Apply workset filtering if active + if workset_files: + file_list = [f for f in manifest_data if f.get("path") in workset_files] + click.echo(f" Filtered to {len(file_list)} files from workset") + else: + file_list = manifest_data + click.echo(f" Found {len(file_list)} files in manifest") + else: + click.echo("No manifest found, using filesystem walk") + + # Build import graph + click.echo("Building import graph...") + import_graph = builder.build_import_graph( + root=root, + langs=list(langs) if langs else None, + file_list=file_list, + ) + + # Save to database (SINGLE SOURCE OF TRUTH) + store.save_import_graph(import_graph) + + # REMOVED: JSON dual persistence - using SQLite as single source + + click.echo(f" Nodes: {len(import_graph['nodes'])}") + click.echo(f" Edges: {len(import_graph['edges'])}") + + # Build call graph + click.echo("Building call graph...") + call_graph = builder.build_call_graph( + root=root, + langs=list(langs) if langs else None, + file_list=file_list, + ) + + # Save to database (SINGLE SOURCE OF TRUTH) + store.save_call_graph(call_graph) + + # REMOVED: JSON dual persistence - using SQLite as single source + + # Call graph uses 'nodes' for functions and 'edges' for calls + click.echo(f" Functions: {len(call_graph.get('nodes', []))}") + click.echo(f" Calls: {len(call_graph.get('edges', []))}") + + click.echo(f"\nGraphs saved to database: {db}") + + except Exception as e: + click.echo(f"Error: {e}", err=True) + raise click.ClickException(str(e)) from e + + +@graph.command("analyze") +@click.option("--db", default="./.pf/graphs.db", help="SQLite database path") +@click.option("--out", default="./.pf/raw/graph_analysis.json", help="Output JSON path") +@click.option("--max-depth", default=3, type=int, help="Max traversal depth for impact analysis") +@click.option("--workset", help="Path to workset.json for change impact") +@click.option("--no-insights", is_flag=True, help="Skip interpretive insights (health scores, recommendations)") +def graph_analyze(db, out, max_depth, workset, no_insights): + """Analyze graphs for cycles, hotspots, and impact.""" + from theauditor.graph.analyzer import XGraphAnalyzer + from theauditor.graph.store import XGraphStore + + # Try to import insights module (optional) + insights = None + if not no_insights: + try: + from theauditor.graph.insights import GraphInsights + insights = GraphInsights() + except ImportError: + click.echo("Note: Insights module not available. Running basic analysis only.") + insights = None + + try: + # Load graphs from database + store = XGraphStore(db_path=db) + import_graph = store.load_import_graph() + call_graph = store.load_call_graph() + + if not import_graph["nodes"]: + click.echo("No graphs found. 
Run 'aud graph build' first.") + return + + # Initialize analyzer + analyzer = XGraphAnalyzer() + + # Detect cycles + click.echo("Detecting cycles...") + cycles = analyzer.detect_cycles(import_graph) + click.echo(f" Found {len(cycles)} cycles") + if cycles and len(cycles) > 0: + click.echo(f" Largest cycle: {cycles[0]['size']} nodes") + + # Rank hotspots (if insights available) + hotspots = [] + if insights: + click.echo("Ranking hotspots...") + hotspots = insights.rank_hotspots(import_graph, call_graph) + click.echo(f" Top 10 hotspots:") + for i, hotspot in enumerate(hotspots[:10], 1): + click.echo(f" {i}. {hotspot['id'][:50]} (score: {hotspot['score']})") + else: + # Basic hotspot detection without scoring + click.echo("Finding most connected nodes...") + degrees = analyzer.calculate_node_degrees(import_graph) + connected = sorted( + [(k, v["in_degree"] + v["out_degree"]) for k, v in degrees.items()], + key=lambda x: x[1], + reverse=True + )[:10] + click.echo(f" Top 10 most connected nodes:") + for i, (node, connections) in enumerate(connected, 1): + click.echo(f" {i}. {node[:50]} ({connections} connections)") + + # Calculate change impact if workset provided + impact = None + if workset: + workset_path = Path(workset) + if workset_path.exists(): + with open(workset_path) as f: + workset_data = json.load(f) + targets = workset_data.get("seed_files", []) + + if targets: + click.echo(f"\nCalculating impact for {len(targets)} targets...") + impact = analyzer.impact_of_change( + targets=targets, + import_graph=import_graph, + call_graph=call_graph, + max_depth=max_depth, + ) + click.echo(f" Upstream impact: {len(impact['upstream'])} files") + click.echo(f" Downstream impact: {len(impact['downstream'])} files") + click.echo(f" Total impacted: {impact['total_impacted']}") + + # Generate summary + summary = {} + if insights: + click.echo("\nGenerating interpreted summary...") + summary = insights.summarize( + import_graph=import_graph, + call_graph=call_graph, + cycles=cycles, + hotspots=hotspots, + ) + + click.echo(f" Graph density: {summary['import_graph'].get('density', 0):.4f}") + click.echo(f" Health grade: {summary['health_metrics'].get('health_grade', 'N/A')}") + click.echo(f" Fragility score: {summary['health_metrics'].get('fragility_score', 0):.2f}") + else: + # Basic summary without interpretation + click.echo("\nGenerating basic summary...") + nodes_count = len(import_graph.get("nodes", [])) + edges_count = len(import_graph.get("edges", [])) + density = edges_count / (nodes_count * (nodes_count - 1)) if nodes_count > 1 else 0 + + summary = { + "import_graph": { + "nodes": nodes_count, + "edges": edges_count, + "density": density, + }, + "cycles": { + "total": len(cycles), + "largest": cycles[0]["size"] if cycles else 0, + }, + } + + if call_graph: + summary["call_graph"] = { + "nodes": len(call_graph.get("nodes", [])), + "edges": len(call_graph.get("edges", [])), + } + + click.echo(f" Nodes: {nodes_count}") + click.echo(f" Edges: {edges_count}") + click.echo(f" Density: {density:.4f}") + click.echo(f" Cycles: {len(cycles)}") + + # Save analysis results + analysis = { + "cycles": cycles, + "hotspots": hotspots[:50], # Top 50 + "impact": impact, + "summary": summary, + } + + out_path = Path(out) + out_path.parent.mkdir(parents=True, exist_ok=True) + with open(out_path, "w") as f: + json.dump(analysis, f, indent=2, sort_keys=True) + + click.echo(f"\nAnalysis saved to {out}") + + # Save metrics for ML consumption (if insights available) + if insights and hotspots: + metrics = {} + for 
hotspot in hotspots: + metrics[hotspot['id']] = hotspot.get('centrality', 0) + metrics_path = Path("./.pf/raw/graph_metrics.json") + metrics_path.parent.mkdir(parents=True, exist_ok=True) + with open(metrics_path, "w") as f: + json.dump(metrics, f, indent=2) + click.echo(f" Saved graph metrics to {metrics_path}") + + # Create AI-readable summary + graph_summary = analyzer.get_graph_summary(import_graph) + summary_path = Path("./.pf/raw/graph_summary.json") + with open(summary_path, "w") as f: + json.dump(graph_summary, f, indent=2) + click.echo(f" Saved graph summary to {summary_path}") + + except Exception as e: + click.echo(f"Error: {e}", err=True) + raise click.ClickException(str(e)) from e + + +@graph.command("query") +@click.option("--db", default="./.pf/graphs.db", help="SQLite database path") +@click.option("--uses", help="Find who uses/imports this module or calls this function") +@click.option("--calls", help="Find what this module/function calls or depends on") +@click.option("--nearest-path", nargs=2, help="Find shortest path between two nodes") +@click.option("--format", type=click.Choice(["table", "json"]), default="table", help="Output format") +def graph_query(db, uses, calls, nearest_path, format): + """Query graph relationships.""" + from theauditor.graph.analyzer import XGraphAnalyzer + from theauditor.graph.store import XGraphStore + + # Check if any query options were provided + if not any([uses, calls, nearest_path]): + click.echo("Please specify a query option:") + click.echo(" --uses MODULE Find who uses a module") + click.echo(" --calls FUNC Find what a function calls") + click.echo(" --nearest-path SOURCE TARGET Find path between nodes") + click.echo("\nExample: aud graph query --uses theauditor.cli") + return + + try: + # Load graphs + store = XGraphStore(db_path=db) + + results = {} + + if uses: + # Find who uses this node + deps = store.query_dependencies(uses, direction="upstream") + call_deps = store.query_calls(uses, direction="callers") + + all_users = sorted(set(deps.get("upstream", []) + call_deps.get("callers", []))) + results["uses"] = { + "node": uses, + "used_by": all_users, + "count": len(all_users), + } + + if format == "table": + click.echo(f"\n{uses} is used by {len(all_users)} nodes:") + for user in all_users[:20]: # Show first 20 + click.echo(f" - {user}") + if len(all_users) > 20: + click.echo(f" ... and {len(all_users) - 20} more") + + if calls: + # Find what this node calls/depends on + deps = store.query_dependencies(calls, direction="downstream") + call_deps = store.query_calls(calls, direction="callees") + + all_deps = sorted(set(deps.get("downstream", []) + call_deps.get("callees", []))) + results["calls"] = { + "node": calls, + "depends_on": all_deps, + "count": len(all_deps), + } + + if format == "table": + click.echo(f"\n{calls} depends on {len(all_deps)} nodes:") + for dep in all_deps[:20]: # Show first 20 + click.echo(f" - {dep}") + if len(all_deps) > 20: + click.echo(f" ... 
and {len(all_deps) - 20} more") + + if nearest_path: + # Find shortest path + source, target = nearest_path + import_graph = store.load_import_graph() + + analyzer = XGraphAnalyzer() + path = analyzer.find_shortest_path(source, target, import_graph) + + results["path"] = { + "source": source, + "target": target, + "path": path, + "length": len(path) if path else None, + } + + if format == "table": + if path: + click.echo(f"\nPath from {source} to {target} ({len(path)} steps):") + for i, node in enumerate(path): + prefix = " " + ("-> " if i > 0 else "") + click.echo(f"{prefix}{node}") + else: + click.echo(f"\nNo path found from {source} to {target}") + + if format == "json": + click.echo(json.dumps(results, indent=2)) + + except Exception as e: + click.echo(f"Error: {e}", err=True) + raise click.ClickException(str(e)) from e + + +@graph.command("viz") +@click.option("--db", default="./.pf/graphs.db", help="SQLite database path") +@click.option("--graph-type", type=click.Choice(["import", "call"]), default="import", help="Graph type to visualize") +@click.option("--out-dir", default="./.pf/raw/", help="Output directory for visualizations") +@click.option("--limit-nodes", default=500, type=int, help="Maximum nodes to display") +@click.option("--format", type=click.Choice(["dot", "svg", "png", "json"]), default="dot", help="Output format") +@click.option("--view", type=click.Choice(["full", "cycles", "hotspots", "layers", "impact"]), default="full", + help="Visualization view type") +@click.option("--include-analysis", is_flag=True, help="Include analysis results (cycles, hotspots) in visualization") +@click.option("--title", help="Graph title") +@click.option("--top-hotspots", default=10, type=int, help="Number of top hotspots to show (for hotspots view)") +@click.option("--impact-target", help="Target node for impact analysis (for impact view)") +@click.option("--show-self-loops", is_flag=True, help="Include self-referential edges") +def graph_viz(db, graph_type, out_dir, limit_nodes, format, view, include_analysis, title, + top_hotspots, impact_target, show_self_loops): + """Visualize graphs with rich visual encoding (Graphviz). 
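The query subcommand above is a thin wrapper over the graph store and analyzer; a short sketch of the same lookups driven directly from Python, with the node names as placeholders:

    from theauditor.graph.store import XGraphStore
    from theauditor.graph.analyzer import XGraphAnalyzer

    store = XGraphStore(db_path="./.pf/graphs.db")

    # Who imports or calls a node (mirrors `aud graph query --uses ...`).
    deps = store.query_dependencies("theauditor.cli", direction="upstream")
    calls = store.query_calls("theauditor.cli", direction="callers")
    users = sorted(set(deps.get("upstream", []) + calls.get("callers", [])))

    # Shortest path between two nodes (mirrors `--nearest-path SOURCE TARGET`).
    analyzer = XGraphAnalyzer()
    path = analyzer.find_shortest_path("src/a.py", "src/b.py", store.load_import_graph())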
+ + Creates visually intelligent graphs with multiple view modes: + + VIEW MODES: + - full: Complete graph with all nodes and edges + - cycles: Only nodes/edges involved in dependency cycles + - hotspots: Top N most connected nodes with neighbors + - layers: Architectural layers as subgraphs + - impact: Highlight impact radius of changes + + VISUAL ENCODING: + - Node Color: Programming language (Python=blue, JS=yellow, TS=blue) + - Node Size: Importance/connectivity (larger = more dependencies) + - Edge Color: Red for cycles, gray for normal + - Border Width: Code churn (thicker = more changes) + - Node Shape: box=module, ellipse=function, diamond=class + + Examples: + # Basic visualization + aud graph viz + + # Show only dependency cycles + aud graph viz --view cycles --include-analysis + + # Top 5 hotspots with connections + aud graph viz --view hotspots --top-hotspots 5 + + # Architectural layers + aud graph viz --view layers --include-analysis + + # Impact analysis for a specific file + aud graph viz --view impact --impact-target "src/auth.py" + + # Generate SVG for AI analysis + aud graph viz --format svg --view full --include-analysis + """ + from theauditor.graph.store import XGraphStore + from theauditor.graph.visualizer import GraphVisualizer + + try: + # Load the appropriate graph + store = XGraphStore(db_path=db) + + if graph_type == "import": + graph = store.load_import_graph() + output_name = "import_graph" + default_title = "Import Dependencies" + else: + graph = store.load_call_graph() + output_name = "call_graph" + default_title = "Function Call Graph" + + if not graph or not graph.get("nodes"): + click.echo(f"No {graph_type} graph found. Run 'aud graph build' first.") + return + + # Load analysis if requested + analysis = {} + if include_analysis: + # Try to load analysis from file + analysis_path = Path("./.pf/raw/graph_analysis.json") + if analysis_path.exists(): + with open(analysis_path) as f: + analysis_data = json.load(f) + analysis = { + 'cycles': analysis_data.get('cycles', []), + 'hotspots': analysis_data.get('hotspots', []), + 'impact': analysis_data.get('impact', {}) + } + click.echo(f"Loaded analysis: {len(analysis['cycles'])} cycles, {len(analysis['hotspots'])} hotspots") + else: + click.echo("No analysis found. Run 'aud graph analyze' first for richer visualization.") + + # Create output directory + out_path = Path(out_dir) + out_path.mkdir(parents=True, exist_ok=True) + + if format == "json": + # Simple JSON output (original behavior) + json_file = out_path / f"{output_name}.json" + with open(json_file, "w") as f: + json.dump({"nodes": graph["nodes"], "edges": graph["edges"]}, f, indent=2) + + click.echo(f"[OK] JSON saved to: {json_file}") + click.echo(f" Nodes: {len(graph['nodes'])}, Edges: {len(graph['edges'])}") + else: + # Use new visualizer for DOT/SVG/PNG + visualizer = GraphVisualizer() + + # Set visualization options + options = { + 'max_nodes': limit_nodes, + 'title': title or default_title, + 'show_self_loops': show_self_loops + } + + # Generate DOT with visual intelligence based on view mode + click.echo(f"Generating {format.upper()} visualization (view: {view})...") + + if view == "cycles": + # Cycles-only view + cycles = analysis.get('cycles', []) + if not cycles: + # Check if analysis was run but found no cycles + if 'cycles' in analysis: + click.echo("[INFO] No dependency cycles detected in the codebase (good architecture!).") + click.echo(" Showing full graph instead...") + else: + click.echo("[WARN] No cycles data found. 
Run 'aud graph analyze' first.") + click.echo(" Falling back to full view...") + dot_content = visualizer.generate_dot(graph, analysis, options) + else: + click.echo(f" Showing {len(cycles)} cycles") + dot_content = visualizer.generate_cycles_only_view(graph, cycles, options) + + elif view == "hotspots": + # Hotspots-only view + if not analysis.get('hotspots'): + # Try to calculate hotspots on the fly + from theauditor.graph.analyzer import XGraphAnalyzer + analyzer = XGraphAnalyzer() + hotspots = analyzer.identify_hotspots(graph, top_n=top_hotspots) + click.echo(f" Calculated {len(hotspots)} hotspots") + else: + hotspots = analysis['hotspots'] + + click.echo(f" Showing top {top_hotspots} hotspots") + dot_content = visualizer.generate_hotspots_only_view( + graph, hotspots, options, top_n=top_hotspots + ) + + elif view == "layers": + # Architectural layers view + from theauditor.graph.analyzer import XGraphAnalyzer + analyzer = XGraphAnalyzer() + layers = analyzer.identify_layers(graph) + click.echo(f" Found {len(layers)} architectural layers") + # Filter out None keys before iterating + for layer_num, nodes in layers.items(): + if layer_num is not None: + click.echo(f" Layer {layer_num}: {len(nodes)} nodes") + dot_content = visualizer.generate_dot_with_layers(graph, layers, analysis, options) + + elif view == "impact": + # Impact analysis view + if not impact_target: + click.echo("[ERROR] --impact-target required for impact view") + raise click.ClickException("Missing --impact-target for impact view") + + from theauditor.graph.analyzer import XGraphAnalyzer + analyzer = XGraphAnalyzer() + impact = analyzer.analyze_impact(graph, [impact_target]) + + if not impact['targets']: + click.echo(f"[WARN] Target '{impact_target}' not found in graph") + click.echo(" Showing full graph instead...") + dot_content = visualizer.generate_dot(graph, analysis, options) + else: + click.echo(f" Target: {impact_target}") + click.echo(f" Upstream: {len(impact['upstream'])} nodes") + click.echo(f" Downstream: {len(impact['downstream'])} nodes") + click.echo(f" Total impact: {len(impact['all_impacted'])} nodes") + dot_content = visualizer.generate_impact_visualization(graph, impact, options) + + else: # view == "full" or default + # Full graph view + click.echo(f" Nodes: {len(graph['nodes'])} (limit: {limit_nodes})") + click.echo(f" Edges: {len(graph['edges'])}") + dot_content = visualizer.generate_dot(graph, analysis, options) + + # Save DOT file with view suffix + if view != "full": + output_filename = f"{output_name}_{view}" + else: + output_filename = output_name + + dot_file = out_path / f"{output_filename}.dot" + with open(dot_file, "w") as f: + f.write(dot_content) + click.echo(f"[OK] DOT file saved to: {dot_file}") + + # Generate image if requested + if format in ["svg", "png"]: + try: + import subprocess + + # Check if Graphviz is installed + result = subprocess.run( + ["dot", "-V"], + capture_output=True, + text=True + ) + + if result.returncode == 0: + # Generate image + output_file = out_path / f"{output_filename}.{format}" + subprocess.run( + ["dot", f"-T{format}", str(dot_file), "-o", str(output_file)], + check=True + ) + click.echo(f"[OK] {format.upper()} image saved to: {output_file}") + + # For SVG, also mention AI readability + if format == "svg": + click.echo(" ✓ SVG is AI-readable and can be analyzed for patterns") + else: + click.echo(f"[WARN] Graphviz not found. 
Install it to generate {format.upper()} images:") + click.echo(" Ubuntu/Debian: apt install graphviz") + click.echo(" macOS: brew install graphviz") + click.echo(" Windows: choco install graphviz") + click.echo(f"\n Manual generation: dot -T{format} {dot_file} -o {output_filename}.{format}") + + except FileNotFoundError: + click.echo(f"[WARN] Graphviz not installed. Cannot generate {format.upper()}.") + click.echo(f" Install graphviz and run: dot -T{format} {dot_file} -o {output_filename}.{format}") + except subprocess.CalledProcessError as e: + click.echo(f"[ERROR] Failed to generate {format.upper()}: {e}") + + # Provide visual encoding legend based on view + click.echo("\nVisual Encoding:") + + if view == "cycles": + click.echo(" • Red Nodes: Part of dependency cycles") + click.echo(" • Red Edges: Cycle connections") + click.echo(" • Subgraphs: Individual cycles grouped") + + elif view == "hotspots": + click.echo(" • Node Color: Red gradient (darker = higher rank)") + click.echo(" • Node Size: Total connections") + click.echo(" • Gray Nodes: Connected but not hotspots") + click.echo(" • Labels: Show in/out degree counts") + + elif view == "layers": + click.echo(" • Subgraphs: Architectural layers") + click.echo(" • Node Color: Programming language") + click.echo(" • Border Width: Code churn (thicker = more changes)") + click.echo(" • Node Size: Importance (in-degree)") + + elif view == "impact": + click.echo(" • Red Nodes: Impact targets") + click.echo(" • Orange Nodes: Upstream dependencies") + click.echo(" • Blue Nodes: Downstream dependencies") + click.echo(" • Purple Nodes: Both upstream and downstream") + click.echo(" • Gray Nodes: Unaffected") + + else: # full view + click.echo(" • Node Color: Programming language") + click.echo(" • Node Size: Importance (larger = more dependencies)") + click.echo(" • Red Edges: Part of dependency cycles") + click.echo(" • Node Shape: box=module, ellipse=function") + + except Exception as e: + click.echo(f"Error: {e}", err=True) + raise click.ClickException(str(e)) from e \ No newline at end of file diff --git a/theauditor/commands/impact.py b/theauditor/commands/impact.py new file mode 100644 index 0000000..6e05338 --- /dev/null +++ b/theauditor/commands/impact.py @@ -0,0 +1,118 @@ +"""Analyze the impact radius of code changes using the AST symbol graph.""" + +import platform +import click +from pathlib import Path + +# Detect if running on Windows for character encoding +IS_WINDOWS = platform.system() == "Windows" + + +@click.command() +@click.option("--file", required=True, help="Path to the file containing the code to analyze") +@click.option("--line", required=True, type=int, help="Line number of the code to analyze") +@click.option("--db", default=None, help="Path to the SQLite database (default: repo_index.db)") +@click.option("--json", is_flag=True, help="Output results as JSON") +@click.option("--max-depth", default=2, type=int, help="Maximum depth for transitive dependencies") +@click.option("--verbose", is_flag=True, help="Show detailed dependency information") +@click.option("--trace-to-backend", is_flag=True, help="Trace frontend API calls to backend endpoints (cross-stack analysis)") +def impact(file, line, db, json, max_depth, verbose, trace_to_backend): + """ + Analyze the impact radius of changing code at a specific location. + + This command traces both upstream dependencies (who calls this code) + and downstream dependencies (what this code calls) to help understand + the blast radius of potential changes. 
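Beyond the CLI entry point, the same analysis can be driven from Python through the functions this command imports; a minimal sketch, with the target file and line number as placeholders:

    from theauditor.impact_analyzer import analyze_impact, format_impact_report

    result = analyze_impact(
        db_path="./.pf/repo_index.db",  # default index database path
        target_file="src/auth.py",      # placeholder target
        target_line=42,
        trace_to_backend=False,
    )
    print(format_impact_report(result))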
+ + Example: + aud impact --file src/auth.py --line 42 + aud impact --file theauditor/indexer.py --line 100 --verbose + """ + from theauditor.impact_analyzer import analyze_impact, format_impact_report + from theauditor.config_runtime import load_runtime_config + import json as json_lib + + # Load configuration for default paths + config = load_runtime_config(".") + + # Use default database path if not provided + if db is None: + db = config["paths"]["db"] + + # Verify database exists + db_path = Path(db) + if not db_path.exists(): + click.echo(f"Error: Database not found at {db}", err=True) + click.echo("Run 'aud index' first to build the repository index", err=True) + raise click.ClickException(f"Database not found: {db}") + + # Verify file exists (helpful for user) + file = Path(file) + if not file.exists(): + click.echo(f"Warning: File {file} not found in filesystem", err=True) + click.echo("Proceeding with analysis using indexed data...", err=True) + + # Perform impact analysis + try: + result = analyze_impact( + db_path=str(db_path), + target_file=str(file), + target_line=line, + trace_to_backend=trace_to_backend + ) + + # Output results + if json: + # JSON output for programmatic use + click.echo(json_lib.dumps(result, indent=2, sort_keys=True)) + else: + # Human-readable report + report = format_impact_report(result) + click.echo(report) + + # Additional verbose output + if verbose and not result.get("error"): + click.echo("\n" + "=" * 60) + click.echo("DETAILED DEPENDENCY INFORMATION") + click.echo("=" * 60) + + # Show transitive upstream + if result.get("upstream_transitive"): + click.echo(f"\nTransitive Upstream Dependencies ({len(result['upstream_transitive'])} total):") + for dep in result["upstream_transitive"][:20]: + depth_indicator = " " * (3 - dep.get("depth", 1)) + tree_char = "+-" if IS_WINDOWS else "└─" + click.echo(f"{depth_indicator}{tree_char} {dep['symbol']} in {dep['file']}:{dep['line']}") + if len(result["upstream_transitive"]) > 20: + click.echo(f" ... and {len(result['upstream_transitive']) - 20} more") + + # Show transitive downstream + if result.get("downstream_transitive"): + click.echo(f"\nTransitive Downstream Dependencies ({len(result['downstream_transitive'])} total):") + for dep in result["downstream_transitive"][:20]: + depth_indicator = " " * (3 - dep.get("depth", 1)) + if dep["file"] != "external": + tree_char = "+-" if IS_WINDOWS else "└─" + click.echo(f"{depth_indicator}{tree_char} {dep['symbol']} in {dep['file']}:{dep['line']}") + else: + tree_char = "+-" if IS_WINDOWS else "└─" + click.echo(f"{depth_indicator}{tree_char} {dep['symbol']} (external)") + if len(result["downstream_transitive"]) > 20: + click.echo(f" ... 
and {len(result['downstream_transitive']) - 20} more") + + # Exit with appropriate code + if result.get("error"): + # Error already displayed in the report, just exit with code + exit(3) # Exit code 3 for analysis errors + + # Warn if high impact + summary = result.get("impact_summary", {}) + if summary.get("total_impact", 0) > 20: + click.echo("\n⚠ WARNING: High impact change detected!", err=True) + exit(1) # Non-zero exit for CI/CD integration + + except Exception as e: + # Only show this for unexpected exceptions, not for already-handled errors + if "No function or class found at" not in str(e): + click.echo(f"Error during impact analysis: {e}", err=True) + raise click.ClickException(str(e)) \ No newline at end of file diff --git a/theauditor/commands/index.py b/theauditor/commands/index.py new file mode 100644 index 0000000..8da6c48 --- /dev/null +++ b/theauditor/commands/index.py @@ -0,0 +1,50 @@ +"""Build language-agnostic manifest and SQLite index of repository.""" + +import click +from theauditor.utils.error_handler import handle_exceptions +from theauditor.utils.helpers import get_self_exclusion_patterns + + +@click.command() +@handle_exceptions +@click.option("--root", default=".", help="Root directory to index") +@click.option("--manifest", default=None, help="Output manifest file path") +@click.option("--db", default=None, help="Output SQLite database path") +@click.option("--print-stats", is_flag=True, help="Print summary statistics") +@click.option("--dry-run", is_flag=True, help="Scan but don't write files") +@click.option("--follow-symlinks", is_flag=True, help="Follow symbolic links (default: skip)") +@click.option("--exclude-self", is_flag=True, help="Exclude TheAuditor's own files (for self-testing)") +def index(root, manifest, db, print_stats, dry_run, follow_symlinks, exclude_self): + """Build language-agnostic manifest and SQLite index of repository.""" + from theauditor.indexer import build_index + from theauditor.config_runtime import load_runtime_config + + # Load configuration + config = load_runtime_config(root) + + # Use config defaults if not provided + if manifest is None: + manifest = config["paths"]["manifest"] + if db is None: + db = config["paths"]["db"] + + # Build exclude patterns using centralized function + exclude_patterns = get_self_exclusion_patterns(exclude_self) + + if exclude_self and print_stats: + click.echo(f"[EXCLUDE-SELF] Excluding TheAuditor's own files from indexing") + click.echo(f"[EXCLUDE-SELF] {len(exclude_patterns)} patterns will be excluded") + + result = build_index( + root_path=root, + manifest_path=manifest, + db_path=db, + print_stats=print_stats, + dry_run=dry_run, + follow_symlinks=follow_symlinks, + exclude_patterns=exclude_patterns, + ) + + if result.get("error"): + click.echo(f"Error: {result['error']}", err=True) + raise click.ClickException(result["error"]) \ No newline at end of file diff --git a/theauditor/commands/init.py b/theauditor/commands/init.py new file mode 100644 index 0000000..02f41e3 --- /dev/null +++ b/theauditor/commands/init.py @@ -0,0 +1,143 @@ +"""Initialize TheAuditor for first-time use.""" + +from pathlib import Path +import click + + +@click.command() +@click.option("--offline", is_flag=True, help="Skip network operations (deps check, docs fetch)") +@click.option("--skip-docs", is_flag=True, help="Skip documentation fetching") +@click.option("--skip-deps", is_flag=True, help="Skip dependency checking") +def init(offline, skip_docs, skip_deps): + """Initialize TheAuditor for first-time use (runs all 
setup steps).""" + from theauditor.init import initialize_project + + click.echo("[INIT] Initializing TheAuditor...\n") + click.echo("This will run all setup steps:") + click.echo(" 1. Index repository") + click.echo(" 2. Create workset") + click.echo(" 3. Check dependencies") + click.echo(" 4. Fetch documentation") + click.echo("\n" + "="*60 + "\n") + + # Call the refactored initialization logic + result = initialize_project( + offline=offline, + skip_docs=skip_docs, + skip_deps=skip_deps + ) + + stats = result["stats"] + has_failures = result["has_failures"] + next_steps = result["next_steps"] + + # Display step-by-step results + click.echo("[INDEX] Step 1/5: Indexing repository...") + if stats.get("index", {}).get("success"): + click.echo(f" [OK] Indexed {stats['index']['text_files']} text files") + else: + click.echo(f" [FAIL] Failed: {stats['index'].get('error', 'Unknown error')}", err=True) + + click.echo("\n[TARGET] Step 2/5: Creating workset...") + if stats.get("workset", {}).get("success"): + click.echo(f" [OK] Workset created with {stats['workset']['files']} files") + elif stats.get("workset", {}).get("files") == 0: + click.echo(" [WARN] No files found to create workset") + else: + click.echo(f" [FAIL] Failed: {stats['workset'].get('error', 'Unknown error')}", err=True) + + if not skip_deps and not offline: + click.echo("\n[PACKAGE] Step 3/4: Checking dependencies...") + if stats.get("deps", {}).get("success"): + if stats["deps"]["total"] > 0: + click.echo(f" [OK] Found {stats['deps']['total']} dependencies ({stats['deps']['outdated']} outdated)") + else: + click.echo(" [OK] No dependency files found") + else: + click.echo(f" [FAIL] Failed: {stats['deps'].get('error', 'Unknown error')}", err=True) + else: + click.echo("\n[PACKAGE] Step 3/4: Skipping dependency check (offline/skipped)") + + if not skip_docs and not offline: + click.echo("\n[DOCS] Step 4/4: Fetching documentation...") + if stats.get("docs", {}).get("success"): + fetched = stats['docs'].get('fetched', 0) + cached = stats['docs'].get('cached', 0) + if fetched > 0 and cached > 0: + click.echo(f" [OK] Fetched {fetched} new docs, using {cached} cached docs") + elif fetched > 0: + click.echo(f" [OK] Fetched {fetched} docs") + elif cached > 0: + click.echo(f" [OK] Using {cached} cached docs (already up-to-date)") + else: + click.echo(" [WARN] No docs fetched or cached") + + # Report any errors from the stats + if stats['docs'].get('errors'): + errors = stats['docs']['errors'] + rate_limited = [e for e in errors if "rate limited" in e.lower()] + other_errors = [e for e in errors if "rate limited" not in e.lower()] + + if rate_limited: + click.echo(f" [WARN] {len(rate_limited)} packages rate-limited (will retry with delay)") + if other_errors and len(other_errors) <= 3: + for err in other_errors[:3]: + click.echo(f" [WARN] {err}") + elif other_errors: + click.echo(f" [WARN] {len(other_errors)} packages failed to fetch") + + click.echo(f" [OK] Created {stats['docs']['capsules']} doc capsules") + elif stats["docs"].get("error") == "Interrupted by user": + click.echo("\n [WARN] Documentation fetch interrupted (Ctrl+C)") + else: + click.echo(f" [FAIL] Failed: {stats['docs'].get('error', 'Unknown error')}", err=True) + else: + click.echo("\n[DOCS] Step 4/4: Skipping documentation (offline/skipped)") + + # Summary + click.echo("\n" + "="*60) + + if has_failures: + click.echo("\n[WARN] Initialization Partially Complete\n") + else: + click.echo("\n[SUCCESS] Initialization Complete!\n") + + # Show summary + click.echo("[STATS] 
Summary:") + if stats.get("index", {}).get("success"): + click.echo(f" * Indexed: {stats['index']['text_files']} files") + else: + click.echo(" * Indexing: [FAILED] Failed") + + if stats.get("workset", {}).get("success"): + click.echo(f" * Workset: {stats['workset']['files']} files") + elif stats.get("workset", {}).get("files") == 0: + click.echo(" * Workset: [WARN] No files found") + else: + click.echo(" * Workset: [FAILED] Failed") + + if stats.get("deps", {}).get("success"): + click.echo(f" * Dependencies: {stats['deps'].get('total', 0)} total, {stats['deps'].get('outdated', 0)} outdated") + elif stats.get("deps", {}).get("skipped"): + click.echo(" * Dependencies: [SKIPPED] Skipped") + + if stats.get("docs", {}).get("success"): + fetched = stats['docs'].get('fetched', 0) + cached = stats['docs'].get('cached', 0) + capsules = stats['docs'].get('capsules', 0) + if cached > 0: + click.echo(f" * Documentation: {fetched} fetched, {cached} cached, {capsules} capsules") + else: + click.echo(f" * Documentation: {fetched} fetched, {capsules} capsules") + elif stats.get("docs", {}).get("skipped"): + click.echo(" * Documentation: [SKIPPED] Skipped") + + # Next steps - only show if we have files to work with + if next_steps: + click.echo("\n[TARGET] Next steps:") + for i, step in enumerate(next_steps, 1): + click.echo(f" {i}. Run: {step}") + click.echo("\nOr run all at once:") + click.echo(f" {' && '.join(next_steps)}") + else: + click.echo("\n[WARN] No files found to audit. Check that you're in the right directory.") \ No newline at end of file diff --git a/theauditor/commands/init_config.py b/theauditor/commands/init_config.py new file mode 100644 index 0000000..8da0c78 --- /dev/null +++ b/theauditor/commands/init_config.py @@ -0,0 +1,21 @@ +"""Ensure minimal mypy config exists (idempotent).""" + +import click + + +@click.command("init-config") +@click.option("--pyproject", default="pyproject.toml", help="Path to pyproject.toml") +def init_config(pyproject): + """Ensure minimal mypy config exists (idempotent).""" + from theauditor.config import ensure_mypy_config + + try: + res = ensure_mypy_config(pyproject) + msg = ( + "mypy config created" + if res.get("status") == "created" + else "mypy config already present" + ) + click.echo(msg) + except Exception as e: + raise click.ClickException(f"Failed to init config: {e}") from e \ No newline at end of file diff --git a/theauditor/commands/init_js.py b/theauditor/commands/init_js.py new file mode 100644 index 0000000..370d6a1 --- /dev/null +++ b/theauditor/commands/init_js.py @@ -0,0 +1,41 @@ +"""Create or merge minimal package.json for lint/typecheck.""" + +import click + + +@click.command("init-js") +@click.option("--path", default="package.json", help="Path to package.json") +@click.option("--add-hooks", is_flag=True, help="Add TheAuditor hooks to npm scripts") +def init_js(path, add_hooks): + """Create or merge minimal package.json for lint/typecheck.""" + from theauditor.js_init import ensure_package_json, add_auditor_hooks + + try: + res = ensure_package_json(path) + + if res["status"] == "created": + click.echo(f"[OK] Created {path} with PIN_ME placeholders") + click.echo(" Edit devDependencies to set exact versions") + elif res["status"] == "merged": + click.echo(f"[OK] Merged lint/typecheck config into {path}") + click.echo(" Check devDependencies for PIN_ME placeholders") + else: + click.echo(f"No changes needed - {path} already configured") + + # Add hooks if requested + if add_hooks: + click.echo("\nAdding TheAuditor hooks to npm 
scripts...") + hook_res = add_auditor_hooks(path) + + if hook_res["status"] == "hooks_added": + click.echo("[OK] Added TheAuditor hooks to package.json:") + for change in hook_res["details"]: + click.echo(f" - {change}") + elif hook_res["status"] == "unchanged": + click.echo("No changes needed - all hooks already present") + elif hook_res["status"] == "error": + click.echo(f"Error adding hooks: {hook_res['message']}", err=True) + + except Exception as e: + click.echo(f"Error: {e}", err=True) + raise click.ClickException(str(e)) from e \ No newline at end of file diff --git a/theauditor/commands/insights.py b/theauditor/commands/insights.py new file mode 100644 index 0000000..25455aa --- /dev/null +++ b/theauditor/commands/insights.py @@ -0,0 +1,443 @@ +"""Run optional insights analysis on existing audit data. + +This command runs interpretive analysis modules (ML, graph health, taint severity) +on top of existing raw audit data, generating insights and predictions. +""" + +import json +import sys +from pathlib import Path +from typing import Dict, Any, List + +import click + + +@click.command() +@click.option("--mode", "-m", + type=click.Choice(["ml", "graph", "taint", "impact", "all"]), + default="all", + help="Which insights modules to run") +@click.option("--ml-train", is_flag=True, + help="Train ML models before generating suggestions") +@click.option("--topk", default=10, type=int, + help="Top K files for ML suggestions") +@click.option("--output-dir", "-o", type=click.Path(), + default="./.pf/insights", + help="Directory for insights output") +@click.option("--print-summary", is_flag=True, + help="Print summary to console") +def insights(mode: str, ml_train: bool, topk: int, output_dir: str, print_summary: bool) -> None: + """Run optional insights analysis on existing audit data. + + This command generates interpretive analysis and predictions based on + the raw facts collected by the audit pipeline. All insights are optional + and separate from the core truth data. + + Available insights modules: + - ml: Machine learning risk predictions and root cause analysis + - graph: Graph health metrics and architectural scoring + - taint: Severity scoring for taint analysis paths + - impact: Impact radius and blast zone analysis + - all: Run all available insights + + Examples: + # Run all insights + aud insights + + # Only ML predictions + aud insights --mode ml + + # Train ML first, then predict + aud insights --mode ml --ml-train + + # Graph health only with summary + aud insights --mode graph --print-summary + """ + + # Ensure we have raw data to analyze + pf_dir = Path(".pf") + raw_dir = pf_dir / "raw" + + if not raw_dir.exists(): + click.echo("[ERROR] No raw audit data found. 
Run 'aud full' first.", err=True) + sys.exit(1) + + # Create insights directory + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + click.echo(f"\n{'='*60}") + click.echo(f"INSIGHTS ANALYSIS - {mode.upper()} Mode") + click.echo(f"{'='*60}") + click.echo(f"Output directory: {output_path}") + + results = {} + errors = [] + + # ML Insights + if mode in ["ml", "all"]: + click.echo("\n[ML] Running machine learning insights...") + ml_result = run_ml_insights(ml_train, topk, output_path) + results["ml"] = ml_result + if ml_result.get("error"): + errors.append(f"ML: {ml_result['error']}") + else: + click.echo(f" ✓ ML predictions saved to {output_path}/ml_suggestions.json") + + # Graph Health Insights + if mode in ["graph", "all"]: + click.echo("\n[GRAPH] Running graph health analysis...") + graph_result = run_graph_insights(output_path) + results["graph"] = graph_result + if graph_result.get("error"): + errors.append(f"Graph: {graph_result['error']}") + else: + click.echo(f" ✓ Graph health saved to {output_path}/graph_health.json") + + # Taint Severity Insights + if mode in ["taint", "all"]: + click.echo("\n[TAINT] Running taint severity scoring...") + taint_result = run_taint_insights(output_path) + results["taint"] = taint_result + if taint_result.get("error"): + errors.append(f"Taint: {taint_result['error']}") + else: + click.echo(f" ✓ Taint severity saved to {output_path}/taint_severity.json") + + # Impact Analysis Insights + if mode in ["impact", "all"]: + click.echo("\n[IMPACT] Running impact analysis...") + impact_result = run_impact_insights(output_path) + results["impact"] = impact_result + if impact_result.get("error"): + errors.append(f"Impact: {impact_result['error']}") + else: + click.echo(f" ✓ Impact analysis saved to {output_path}/impact_analysis.json") + + # Aggregate all insights into unified summary + click.echo("\n[AGGREGATE] Creating unified insights summary...") + summary = aggregate_insights(results, output_path) + + # Save unified summary + summary_path = output_path / "unified_insights.json" + with open(summary_path, 'w') as f: + json.dump(summary, f, indent=2, default=str) + click.echo(f" ✓ Unified summary saved to {summary_path}") + + # Print summary if requested + if print_summary: + print_insights_summary(summary) + + # Final status + click.echo(f"\n{'='*60}") + if errors: + click.echo(f"[WARN] Insights completed with {len(errors)} errors:", err=True) + for error in errors: + click.echo(f" • {error}", err=True) + else: + click.echo("[OK] All insights generated successfully") + + click.echo(f"\n[TIP] Insights are interpretive and optional.") + click.echo(f" Raw facts remain in .pf/raw/ unchanged.") + + sys.exit(1 if errors else 0) + + +def run_ml_insights(train: bool, topk: int, output_dir: Path) -> Dict[str, Any]: + """Run ML insights generation.""" + try: + from theauditor.ml import check_ml_available, learn, suggest + + if not check_ml_available(): + return {"error": "ML module not installed. 
Run: pip install -e .[ml]"} + + # Train if requested + if train: + learn_result = learn( + db_path="./.pf/repo_index.db", + manifest_path="./.pf/manifest.json", + print_stats=False + ) + if not learn_result.get("success"): + return {"error": f"ML training failed: {learn_result.get('error')}"} + + # Generate suggestions + suggest_result = suggest( + db_path="./.pf/repo_index.db", + manifest_path="./.pf/manifest.json", + workset_path="./.pf/workset.json", + topk=topk, + out_path=str(output_dir / "ml_suggestions.json") + ) + + return suggest_result + + except ImportError: + return {"error": "ML module not available"} + except Exception as e: + return {"error": str(e)} + + +def run_graph_insights(output_dir: Path) -> Dict[str, Any]: + """Run graph health insights.""" + try: + from theauditor.graph.insights import GraphInsights + from theauditor.graph.analyzer import XGraphAnalyzer + from theauditor.graph.store import XGraphStore + + # Load graph from SQLite database (SINGLE SOURCE OF TRUTH) + store = XGraphStore(db_path="./.pf/graphs.db") + import_graph = store.load_import_graph() + + if not import_graph or not import_graph.get("nodes"): + return {"error": "No import graph found. Run 'aud graph build' first."} + + # Load analysis data if it exists + analysis_path = Path(".pf/raw/graph_analysis.json") + analysis_data = {} + if analysis_path.exists(): + with open(analysis_path) as f: + analysis_data = json.load(f) + + # Run insights analysis + insights = GraphInsights() + analyzer = XGraphAnalyzer() + + # Use pre-calculated cycles and hotspots if available, otherwise calculate + if 'cycles' in analysis_data: + cycles = analysis_data['cycles'] + else: + cycles = analyzer.detect_cycles(import_graph) + + # Use pre-calculated hotspots if available, otherwise calculate + if 'hotspots' in analysis_data: + hotspots = analysis_data['hotspots'] + else: + hotspots = insights.rank_hotspots(import_graph) + + # Calculate health metrics + health = insights.calculate_health_metrics( + import_graph, + cycles=cycles, + hotspots=hotspots + ) + + # Generate recommendations + recommendations = insights.generate_recommendations( + import_graph, + cycles=cycles, + hotspots=hotspots + ) + + # Save results + output = { + "health_metrics": health, + "top_hotspots": hotspots[:10], + "recommendations": recommendations, + "cycles_found": len(cycles), + "total_nodes": len(import_graph.get("nodes", [])), + "total_edges": len(import_graph.get("edges", [])) + } + + output_path = output_dir / "graph_health.json" + with open(output_path, 'w') as f: + json.dump(output, f, indent=2) + + return {"success": True, "health_score": health.get("health_score")} + + except ImportError: + return {"error": "Graph insights module not available"} + except Exception as e: + return {"error": str(e)} + + +def run_taint_insights(output_dir: Path) -> Dict[str, Any]: + """Run taint severity insights.""" + try: + from datetime import datetime, UTC + from theauditor.taint.insights import calculate_severity, classify_vulnerability, generate_summary + from theauditor.taint_analyzer import SECURITY_SINKS + + # Load raw taint data + taint_path = Path(".pf/raw/taint_analysis.json") + if not taint_path.exists(): + return {"error": "No taint data found. 
Run 'aud taint-analyze' first."} + + with open(taint_path) as f: + taint_data = json.load(f) + + if not taint_data.get("success"): + return {"error": "Taint analysis was not successful"} + + # Calculate severity for each path and create enriched versions + severity_analysis = [] + enriched_paths = [] + for path in taint_data.get("taint_paths", []): + severity = calculate_severity(path) + vuln_type = classify_vulnerability(path.get("sink", {}), SECURITY_SINKS) + + severity_analysis.append({ + "file": path.get("sink", {}).get("file"), + "line": path.get("sink", {}).get("line"), + "severity": severity, + "vulnerability_type": vuln_type, + "path_length": len(path.get("path", [])), + "risk_score": 1.0 if severity == "critical" else 0.7 if severity == "high" else 0.4 + }) + + # Create enriched path with severity for summary generation + enriched_path = dict(path) + enriched_path["severity"] = severity + enriched_path["vulnerability_type"] = vuln_type + enriched_paths.append(enriched_path) + + # Generate summary using enriched paths with severity + summary = generate_summary(enriched_paths) + + # Save results + output = { + "generated_at": datetime.now(UTC).isoformat(), + "severity_analysis": severity_analysis, + "summary": summary, + "total_vulnerabilities": len(severity_analysis), + "sources_analyzed": taint_data.get("sources_found", 0), + "sinks_analyzed": taint_data.get("sinks_found", 0) + } + + output_path = output_dir / "taint_severity.json" + with open(output_path, 'w') as f: + json.dump(output, f, indent=2) + + return {"success": True, "risk_level": summary.get("risk_level")} + + except ImportError: + return {"error": "Taint insights module not available"} + except Exception as e: + return {"error": str(e)} + + +def run_impact_insights(output_dir: Path) -> Dict[str, Any]: + """Run impact analysis insights.""" + try: + # Check if workset exists + workset_path = Path(".pf/workset.json") + if not workset_path.exists(): + return {"error": "No workset found. 
Run 'aud workset' first."} + + with open(workset_path) as f: + workset_data = json.load(f) + + # For now, create a simple impact summary + # In future, this could run actual impact analysis on changed files + output = { + "files_changed": len(workset_data.get("files", [])), + "potential_impact": "Analysis pending", + "recommendation": "Run 'aud impact --file --line ' for detailed analysis" + } + + output_path = output_dir / "impact_analysis.json" + with open(output_path, 'w') as f: + json.dump(output, f, indent=2) + + return {"success": True, "files_analyzed": len(workset_data.get("files", []))} + + except Exception as e: + return {"error": str(e)} + + +def aggregate_insights(results: Dict[str, Any], output_dir: Path) -> Dict[str, Any]: + """Aggregate all insights into unified summary.""" + summary = { + "insights_generated": list(results.keys()), + "timestamp": __import__('datetime').datetime.now().isoformat(), + "output_directory": str(output_dir) + } + + # ML insights + if "ml" in results and results["ml"].get("success"): + summary["ml"] = { + "status": "success", + "workset_size": results["ml"].get("workset_size", 0), + "predictions_generated": True + } + elif "ml" in results: + summary["ml"] = {"status": "error", "error": results["ml"].get("error")} + + # Graph insights + if "graph" in results and results["graph"].get("success"): + summary["graph"] = { + "status": "success", + "health_score": results["graph"].get("health_score", 0) + } + elif "graph" in results: + summary["graph"] = {"status": "error", "error": results["graph"].get("error")} + + # Taint insights + if "taint" in results and results["taint"].get("success"): + summary["taint"] = { + "status": "success", + "risk_level": results["taint"].get("risk_level", "unknown") + } + elif "taint" in results: + summary["taint"] = {"status": "error", "error": results["taint"].get("error")} + + # Impact insights + if "impact" in results and results["impact"].get("success"): + summary["impact"] = { + "status": "success", + "files_analyzed": results["impact"].get("files_analyzed", 0) + } + elif "impact" in results: + summary["impact"] = {"status": "error", "error": results["impact"].get("error")} + + return summary + + +def print_insights_summary(summary: Dict[str, Any]) -> None: + """Print insights summary to console.""" + click.echo(f"\n{'='*60}") + click.echo("INSIGHTS SUMMARY") + click.echo(f"{'='*60}") + + # ML Summary + if "ml" in summary: + if summary["ml"]["status"] == "success": + click.echo(f"\n[ML] Machine Learning Insights:") + click.echo(f" • Workset size: {summary['ml'].get('workset_size', 0)} files") + click.echo(f" • Predictions: Generated successfully") + else: + click.echo(f"\n[ML] Machine Learning Insights: {summary['ml'].get('error')}") + + # Graph Summary + if "graph" in summary: + if summary["graph"]["status"] == "success": + health = summary["graph"].get("health_score", 0) + grade = "A" if health >= 90 else "B" if health >= 80 else "C" if health >= 70 else "D" if health >= 60 else "F" + click.echo(f"\n[GRAPH] Architecture Health:") + click.echo(f" • Health score: {health}/100 (Grade: {grade})") + else: + click.echo(f"\n[GRAPH] Architecture Health: {summary['graph'].get('error')}") + + # Taint Summary + if "taint" in summary: + if summary["taint"]["status"] == "success": + risk = summary["taint"].get("risk_level", "unknown") + color = "red" if risk == "critical" else "yellow" if risk == "high" else "green" + click.echo(f"\n[TAINT] Security Risk:") + click.echo(f" • Risk level: {risk.upper()}") + else: + 
click.echo(f"\n[TAINT] Security Risk: {summary['taint'].get('error')}") + + # Impact Summary + if "impact" in summary: + if summary["impact"]["status"] == "success": + click.echo(f"\n[IMPACT] Change Impact:") + click.echo(f" • Files analyzed: {summary['impact'].get('files_analyzed', 0)}") + else: + click.echo(f"\n[IMPACT] Change Impact: {summary['impact'].get('error')}") + + click.echo(f"\n{'='*60}") + + +# Register command +insights_command = insights \ No newline at end of file diff --git a/theauditor/commands/lint.py b/theauditor/commands/lint.py new file mode 100644 index 0000000..701cef4 --- /dev/null +++ b/theauditor/commands/lint.py @@ -0,0 +1,267 @@ +"""Run linters and normalize output to evidence format.""" + +import hashlib +import json +from collections import defaultdict +from pathlib import Path +from typing import Any + +import click + +from theauditor.linters import ( + detect_linters, + run_linter, +) +from theauditor.utils import load_json_file +from theauditor.utils.error_handler import handle_exceptions + + +def write_lint_json(findings: list[dict[str, Any]], output_path: str): + """Write findings to JSON file.""" + # Sort findings for determinism + sorted_findings = sorted(findings, key=lambda f: (f["file"], f["line"], f["rule"])) + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(sorted_findings, f, indent=2, sort_keys=True) + + +def lint_command( + root_path: str = ".", + workset_path: str = "./.pf/workset.json", + manifest_path: str = "manifest.json", + timeout: int = 300, + print_plan: bool = False, + auto_fix: bool = False, +) -> dict[str, Any]: + """ + Run linters and normalize output. + + Returns: + Dictionary with success status and statistics + """ + # AUTO-FIX DEPRECATED: Force disabled to prevent version mismatch issues + auto_fix = False + # Load workset or manifest files + if workset_path is not None: + # Use workset mode + try: + workset = load_json_file(workset_path) + workset_files = {p["path"] for p in workset.get("paths", [])} + except (FileNotFoundError, json.JSONDecodeError) as e: + return {"success": False, "error": f"Failed to load workset: {e}"} + else: + # Use all files from manifest when --workset is not used + try: + manifest = load_json_file(manifest_path) + # Use all text files from the manifest + workset_files = {f["path"] for f in manifest if isinstance(f, dict) and "path" in f} + except (FileNotFoundError, json.JSONDecodeError) as e: + return {"success": False, "error": f"Failed to load manifest: {e}"} + + if not workset_files: + return {"success": False, "error": "Empty workset"} + + # Detect available linters + linters = detect_linters(root_path, auto_fix=auto_fix) + + if print_plan: + print("Lint Plan:") + # AUTO-FIX DEPRECATED: Always in check-only mode + # print(f" Mode: {'AUTO-FIX' if auto_fix else 'CHECK-ONLY'}") + print(f" Mode: CHECK-ONLY") + print(f" Workset: {len(workset_files)} files") + if linters: + print(" External linters detected:") + for tool in linters: + # AUTO-FIX DEPRECATED: No fix indicators + # fix_capable = tool in ["eslint", "prettier", "ruff", "black"] + # fix_indicator = " (will fix)" if auto_fix and fix_capable else "" + print(f" - {tool}") + else: + print(" No external linters detected") + print(" Will run built-in checks:") + print(" - NO_TODO_LAND (excessive TODOs)") + print(" - NO_LONG_FILES (>1500 lines)") + print(" - NO_CYCLES (import cycles)") + print(" - NO_DEBUG_CALLS (console.log/print)") + print(" - NO_SECRET_LIKE (potential secrets)") + return {"success": True, "printed_plan": True} 
+ + all_findings = [] + fixed_count = 0 + all_ast_data = {} # Collect AST data from ESLint + + if linters: + # Run external linters + # AUTO-FIX DEPRECATED: Always run in check-only mode + # mode_str = "Fixing" if auto_fix else "Checking" + print(f"Checking with {len(linters)} external linters...") + for tool, command in linters.items(): + # AUTO-FIX DEPRECATED: This entire block is disabled + # if auto_fix and tool in ["eslint", "prettier", "ruff", "black"]: + # print(f" Fixing with {tool}...") + # # In fix mode, we run the tool but may get fewer findings (as they're fixed) + # findings, ast_data = run_linter(tool, command, root_path, workset_files, timeout) + # # Collect AST data from ESLint + # if tool == "eslint" and ast_data: + # all_ast_data.update(ast_data) + # # Add remaining findings (unfixable issues) + # all_findings.extend(findings) + # # Estimate fixes based on the tool (most issues are fixable) + # if tool in ["prettier", "black"]: + # # Formatters fix all issues + # if len(findings) == 0: + # print(f" Fixed all formatting issues") + # else: + # print(f" Fixed most issues, {len(findings)} remaining") + # else: + # # ESLint and Ruff fix most but not all issues + # remaining = len(findings) + # if remaining > 0: + # print(f" Fixed issues, {remaining} remaining (unfixable)") + # else: + # print(f" Fixed all issues") + # else: + print(f" Checking with {tool}...") + findings, ast_data = run_linter(tool, command, root_path, workset_files, timeout) + # Collect AST data from ESLint + if tool == "eslint" and ast_data: + all_ast_data.update(ast_data) + all_findings.extend(findings) + print(f" Found {len(findings)} issues") + else: + # No linters found - this indicates broken environment + print("[WARNING] No external linters found!") + print("[ERROR] Environment is not properly configured - industry tools are required") + print(" Install at least one linter:") + print(" JavaScript/TypeScript: npm install --save-dev eslint") + print(" Python: pip install ruff") + print(" Go: go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest") + # Continue with empty findings rather than failing completely + print("[INFO] Continuing with no lint findings...") + + # Check TypeScript configuration to determine which TS tool to use + # This is DETECTION logic, not a linter itself + # tsconfig_findings = check_tsconfig(root_path) + # NOTE: check_tsconfig was deleted with builtin.py - need to restore detection logic + + # Write outputs directly to raw directory + output_dir = Path(".pf/raw") + output_dir.mkdir(parents=True, exist_ok=True) + + json_path = output_dir / "lint.json" + + write_lint_json(all_findings, str(json_path)) + + # Save ESLint ASTs to cache + if all_ast_data: + # Load manifest to get file hashes + try: + manifest = load_json_file(manifest_path) + file_hashes = {f["path"]: f.get("sha256") for f in manifest if isinstance(f, dict) and "sha256" in f} + + # Create AST cache directory + ast_cache_dir = output_dir / "ast_cache" / "eslint" + ast_cache_dir.mkdir(parents=True, exist_ok=True) + + # Save each AST with the file's SHA256 hash as the filename + for file_path, ast in all_ast_data.items(): + if file_path in file_hashes and file_hashes[file_path]: + file_hash = file_hashes[file_path] + else: + # If hash not in manifest, compute it from file content + full_path = Path(root_path) / file_path + if full_path.exists(): + with open(full_path, "rb") as f: + file_hash = hashlib.sha256(f.read()).hexdigest() + else: + continue + + # Save AST to cache file + ast_file = ast_cache_dir 
/ f"{file_hash}.json" + with open(ast_file, "w", encoding="utf-8") as f: + json.dump(ast, f, indent=2) + + print(f" Cached {len(all_ast_data)} ASTs from ESLint") + except Exception as e: + print(f"Warning: Failed to cache ESLint ASTs: {e}") + + # Statistics + stats = { + "total_findings": len(all_findings), + "tools_run": len(linters) if linters else 1, # 1 for built-in + "workset_size": len(workset_files), + "errors": sum(1 for f in all_findings if f["severity"] == "error"), + "warnings": sum(1 for f in all_findings if f["severity"] == "warning"), + } + + # AUTO-FIX DEPRECATED: This block is disabled + # if auto_fix: + # print("\n[OK] Auto-fix complete:") + # print(f" Files processed: {len(workset_files)}") + # print(f" Remaining issues: {stats['total_findings']}") + # print(f" Errors: {stats['errors']}") + # print(f" Warnings: {stats['warnings']}") + # if stats['total_findings'] > 0: + # print(f" Note: Some issues cannot be auto-fixed and require manual attention") + # print(f" Report: {json_path}") + # else: + print("\nLint complete:") + print(f" Total findings: {stats['total_findings']}") + print(f" Errors: {stats['errors']}") + print(f" Warnings: {stats['warnings']}") + print(f" Output: {json_path}") + if stats['total_findings'] > 0: + print(" Note: Many linters (ESLint, Prettier, Ruff, Black) have their own automatic code style fix capabilities") + + return { + "success": True, + "stats": stats, + "output_files": [str(json_path)], + "auto_fix_applied": auto_fix, + } + + +@click.command() +@handle_exceptions +@click.option("--root", default=".", help="Root directory") +@click.option("--workset", is_flag=True, help="Use workset mode (lint only files in .pf/workset.json)") +@click.option("--workset-path", default=None, help="Custom workset path (rarely needed)") +@click.option("--manifest", default=None, help="Manifest file path") +@click.option("--timeout", default=None, type=int, help="Timeout in seconds for each linter") +@click.option("--print-plan", is_flag=True, help="Print lint plan without executing") +# AUTO-FIX DEPRECATED: Hidden flag kept for backward compatibility +@click.option("--fix", is_flag=True, hidden=True, help="[DEPRECATED] No longer functional") +def lint(root, workset, workset_path, manifest, timeout, print_plan, fix): + """Run linters and normalize output to evidence format.""" + from theauditor.config_runtime import load_runtime_config + + # Load configuration + config = load_runtime_config(root) + + # Use config defaults if not provided + if manifest is None: + manifest = config["paths"]["manifest"] + if timeout is None: + timeout = config["timeouts"]["lint_timeout"] + if workset_path is None and workset: + workset_path = config["paths"]["workset"] + + # Use workset path only if --workset flag is set + actual_workset_path = workset_path if workset else None + + result = lint_command( + root_path=root, + workset_path=actual_workset_path, + manifest_path=manifest, + timeout=timeout, + print_plan=print_plan, + auto_fix=fix, + ) + + if result.get("printed_plan"): + return + + if not result["success"]: + click.echo(f"Error: {result.get('error', 'Lint failed')}", err=True) + raise click.ClickException(result.get("error", "Lint failed")) \ No newline at end of file diff --git a/theauditor/commands/ml.py b/theauditor/commands/ml.py new file mode 100644 index 0000000..76d0c11 --- /dev/null +++ b/theauditor/commands/ml.py @@ -0,0 +1,165 @@ +"""Machine learning commands for TheAuditor.""" + +import click +from pathlib import Path + + +@click.command(name="learn") 
+@click.option("--db-path", default="./.pf/repo_index.db", help="Database path") +@click.option("--manifest", default="./.pf/manifest.json", help="Manifest file path") +@click.option("--journal", default="./.pf/journal.ndjson", help="Journal file path") +@click.option("--fce", default="./.pf/fce.json", help="FCE file path") +@click.option("--ast", default="./.pf/ast_proofs.json", help="AST proofs file path") +@click.option("--enable-git", is_flag=True, help="Enable git churn features") +@click.option("--model-dir", default="./.pf/ml", help="Model output directory") +@click.option("--window", default=50, type=int, help="Journal window size") +@click.option("--seed", default=13, type=int, help="Random seed") +@click.option("--feedback", help="Path to human feedback JSON file") +@click.option("--train-on", type=click.Choice(["full", "diff", "all"]), default="full", help="Type of historical runs to train on") +@click.option("--print-stats", is_flag=True, help="Print training statistics") +def learn(db_path, manifest, journal, fce, ast, enable_git, model_dir, window, seed, feedback, train_on, print_stats): + """Train ML models from audit artifacts to predict risk and root causes.""" + from theauditor.ml import learn as ml_learn + + click.echo(f"[ML] Training models from audit artifacts (using {train_on} runs)...") + + result = ml_learn( + db_path=db_path, + manifest_path=manifest, + journal_path=journal, + fce_path=fce, + ast_path=ast, + enable_git=enable_git, + model_dir=model_dir, + window=window, + seed=seed, + print_stats=print_stats, + feedback_path=feedback, + train_on=train_on, + ) + + if result.get("success"): + stats = result.get("stats", {}) + click.echo(f"[OK] Models trained successfully") + click.echo(f" * Training data: {train_on} runs from history") + click.echo(f" * Files analyzed: {result.get('source_files', 0)}") + click.echo(f" * Features: {stats.get('n_features', 0)} dimensions") + click.echo(f" * Root cause ratio: {stats.get('root_cause_positive_ratio', 0):.2%}") + click.echo(f" * Risk mean: {stats.get('mean_risk', 0):.3f}") + if stats.get('cold_start'): + click.echo(f" [WARN] Cold-start mode (<500 samples)") + click.echo(f" * Models saved to: {result.get('model_dir')}") + else: + click.echo(f"[FAIL] Training failed: {result.get('error')}", err=True) + raise click.ClickException(result.get("error")) + + +@click.command(name="suggest") +@click.option("--db-path", default="./.pf/repo_index.db", help="Database path") +@click.option("--manifest", default="./.pf/manifest.json", help="Manifest file path") +@click.option("--workset", default="./.pf/workset.json", help="Workset file path") +@click.option("--fce", default="./.pf/fce.json", help="FCE file path") +@click.option("--ast", default="./.pf/ast_proofs.json", help="AST proofs file path") +@click.option("--model-dir", default="./.pf/ml", help="Model directory") +@click.option("--topk", default=10, type=int, help="Top K files to suggest") +@click.option("--out", default="./.pf/insights/ml_suggestions.json", help="Output file path") +@click.option("--print-plan", is_flag=True, help="Print suggestions to console") +def suggest(db_path, manifest, workset, fce, ast, model_dir, topk, out, print_plan): + """Generate ML-based suggestions for risky files and likely root causes.""" + from theauditor.ml import suggest as ml_suggest + + click.echo("[ML] Generating suggestions from trained models...") + + result = ml_suggest( + db_path=db_path, + manifest_path=manifest, + workset_path=workset, + fce_path=fce, + ast_path=ast, + 
model_dir=model_dir, + topk=topk, + out_path=out, + print_plan=print_plan, + ) + + if result.get("success"): + click.echo(f"[OK] Suggestions generated") + click.echo(f" * Workset size: {result.get('workset_size', 0)} files") + click.echo(f" * Source files analyzed: {result.get('workset_size', 0)}") + click.echo(f" * Non-source excluded: {result.get('excluded_count', 0)}") + click.echo(f" * Top {result.get('topk', 10)} suggestions saved to: {result.get('out_path')}") + else: + click.echo(f"[FAIL] Suggestion generation failed: {result.get('error')}", err=True) + raise click.ClickException(result.get("error")) + + +@click.command(name="learn-feedback") +@click.option("--feedback-file", required=True, help="Path to feedback JSON file") +@click.option("--db-path", default="./.pf/repo_index.db", help="Database path") +@click.option("--manifest", default="./.pf/manifest.json", help="Manifest file path") +@click.option("--model-dir", default="./.pf/ml", help="Model output directory") +@click.option("--train-on", type=click.Choice(["full", "diff", "all"]), default="full", help="Type of historical runs to train on") +@click.option("--print-stats", is_flag=True, help="Print training statistics") +def learn_feedback(feedback_file, db_path, manifest, model_dir, train_on, print_stats): + """ + Re-train models with human feedback for improved accuracy. + + The feedback file should be a JSON file with the format: + { + "path/to/file.py": { + "is_risky": true, + "is_root_cause": false, + "will_need_edit": true + }, + ... + } + """ + from theauditor.ml import learn as ml_learn + + # Validate feedback file exists + if not Path(feedback_file).exists(): + click.echo(f"[FAIL] Feedback file not found: {feedback_file}", err=True) + raise click.ClickException(f"Feedback file not found: {feedback_file}") + + # Validate feedback file format + try: + import json + with open(feedback_file) as f: + feedback_data = json.load(f) + + if not isinstance(feedback_data, dict): + raise ValueError("Feedback file must contain a JSON object") + + # Count feedback entries + feedback_count = len(feedback_data) + click.echo(f"[ML] Loading human feedback for {feedback_count} files...") + + except Exception as e: + click.echo(f"[FAIL] Invalid feedback file format: {e}", err=True) + raise click.ClickException(f"Invalid feedback file: {e}") + + click.echo(f"[ML] Re-training models with human feedback (using {train_on} runs)...") + + result = ml_learn( + db_path=db_path, + manifest_path=manifest, + model_dir=model_dir, + print_stats=print_stats, + feedback_path=feedback_file, + train_on=train_on, + # Use default paths for historical data from .pf/history + enable_git=False, # Disable git for speed in feedback mode + ) + + if result.get("success"): + stats = result.get("stats", {}) + click.echo(f"[OK] Models re-trained with human feedback") + click.echo(f" * Training data: {train_on} runs from history") + click.echo(f" * Files analyzed: {result.get('source_files', 0)}") + click.echo(f" * Human feedback incorporated: {feedback_count} files") + click.echo(f" * Features: {stats.get('n_features', 0)} dimensions") + click.echo(f" * Models saved to: {result.get('model_dir')}") + click.echo(f"\n[TIP] The models have learned from your feedback and will provide more accurate predictions.") + else: + click.echo(f"[FAIL] Re-training failed: {result.get('error')}", err=True) + raise click.ClickException(result.get("error")) \ No newline at end of file diff --git a/theauditor/commands/refactor.py b/theauditor/commands/refactor.py new file mode 100644 
index 0000000..e3089fc --- /dev/null +++ b/theauditor/commands/refactor.py @@ -0,0 +1,600 @@ +"""Refactoring impact analysis command. + +This command analyzes the impact of refactoring changes and detects +inconsistencies between frontend and backend, API contract mismatches, +and data model evolution issues. +""" + +import json +import os +import sqlite3 +from pathlib import Path +from typing import Dict, List, Set, Any, Optional + +import click + + +@click.command() +@click.option("--file", "-f", help="File to analyze refactoring impact from") +@click.option("--line", "-l", type=int, help="Line number in the file") +@click.option("--migration-dir", "-m", default="backend/migrations", + help="Directory containing database migrations") +@click.option("--migration-limit", "-ml", type=int, default=0, + help="Number of recent migrations to analyze (0=all, default=all)") +@click.option("--expansion-mode", "-e", + type=click.Choice(["none", "direct", "full"]), + default="none", + help="Dependency expansion mode: none (affected only), direct (1 level), full (transitive)") +@click.option("--auto-detect", "-a", is_flag=True, + help="Auto-detect refactoring from recent migrations") +@click.option("--workset", "-w", is_flag=True, + help="Use current workset for analysis") +@click.option("--output", "-o", type=click.Path(), + help="Output file for detailed report") +def refactor(file: Optional[str], line: Optional[int], migration_dir: str, + migration_limit: int, expansion_mode: str, + auto_detect: bool, workset: bool, output: Optional[str]) -> None: + """Analyze refactoring impact and find inconsistencies. + + This command helps detect issues introduced by refactoring such as: + - Data model changes (fields moved between tables) + - API contract mismatches (frontend expects old structure) + - Missing updates in dependent code + - Cross-stack inconsistencies + + Examples: + # Analyze impact from a specific model change + aud refactor --file models/Product.ts --line 42 + + # Auto-detect refactoring from migrations + aud refactor --auto-detect + + # Analyze current workset + aud refactor --workset + """ + + # Find repository root + repo_root = Path.cwd() + while repo_root != repo_root.parent: + if (repo_root / ".git").exists(): + break + repo_root = repo_root.parent + + pf_dir = repo_root / ".pf" + db_path = pf_dir / "repo_index.db" + + if not db_path.exists(): + click.echo("Error: No index found. 
Run 'aud index' first.", err=True) + raise click.Abort() + + # Import components here to avoid import errors + try: + from theauditor.impact_analyzer import analyze_impact + from theauditor.universal_detector import UniversalPatternDetector + from theauditor.pattern_loader import PatternLoader + from theauditor.fce import run_fce + from theauditor.correlations.loader import CorrelationLoader + except ImportError as e: + click.echo(f"Error importing components: {e}", err=True) + raise click.Abort() + # Initialize components + pattern_loader = PatternLoader() + pattern_detector = UniversalPatternDetector( + repo_root, + pattern_loader, + exclude_patterns=[] + ) + + click.echo("\nRefactoring Impact Analysis") + click.echo("-" * 60) + + # Step 1: Determine what to analyze + affected_files = set() + + if auto_detect: + click.echo("Auto-detecting refactoring from migrations...") + affected_files.update(_analyze_migrations(repo_root, migration_dir, migration_limit)) + + if not affected_files: + click.echo("No affected files found from migrations.") + click.echo("Tip: Check if your migrations contain schema change operations") + return + + elif workset: + click.echo("Analyzing workset files...") + workset_file = pf_dir / "workset.json" + if workset_file.exists(): + with open(workset_file, 'r') as f: + workset_data = json.load(f) + affected_files.update(workset_data.get("files", [])) + else: + click.echo("Error: No workset found. Create one with 'aud workset'", err=True) + raise click.Abort() + + elif file and line: + click.echo(f"Analyzing impact from {file}:{line}...") + + # Run impact analysis + impact_result = analyze_impact( + db_path=str(db_path), + target_file=file, + target_line=line, + trace_to_backend=True + ) + + if not impact_result.get("error"): + # Extract affected files from impact analysis + upstream_files = [dep["file"] for dep in impact_result.get("upstream", [])] + downstream_files = [dep["file"] for dep in impact_result.get("downstream", [])] + upstream_trans_files = [dep["file"] for dep in impact_result.get("upstream_transitive", [])] + downstream_trans_files = [dep["file"] for dep in impact_result.get("downstream_transitive", [])] + + all_impact_files = set(upstream_files + downstream_files + upstream_trans_files + downstream_trans_files) + affected_files.update(all_impact_files) + + # Show immediate impact + summary = impact_result.get("impact_summary", {}) + click.echo(f"\nDirect impact: {summary.get('direct_upstream', 0)} upstream, " + f"{summary.get('direct_downstream', 0)} downstream") + click.echo(f"Total files affected: {summary.get('affected_files', len(affected_files))}") + + # Check for cross-stack impact + if impact_result.get("cross_stack_impact"): + click.echo("\n⚠️ Cross-stack impact detected!") + for impact in impact_result["cross_stack_impact"]: + click.echo(f" • {impact['file']}:{impact['line']} - {impact['type']}") + else: + click.echo("Error: Specify --file and --line, --auto-detect, or --workset", err=True) + raise click.Abort() + + if not affected_files: + click.echo("No files to analyze.") + return + + # Step 2b: Expand affected files based on mode + if affected_files: + expanded_files = _expand_affected_files( + affected_files, + str(db_path), + expansion_mode, + repo_root + ) + else: + expanded_files = set() + + # Update workset with expanded files + click.echo(f"\nCreating workset from {len(expanded_files)} files...") + temp_workset_file = pf_dir / "temp_workset.json" + with open(temp_workset_file, 'w') as f: + json.dump({"files": 
list(expanded_files)}, f) + + # Step 3: Run pattern detection with targeted file list + if expanded_files: + click.echo(f"Running pattern detection on {len(expanded_files)} files...") + + # Check if batch method is available + if hasattr(pattern_detector, 'detect_patterns_for_files'): + # Use optimized batch method if available + findings = pattern_detector.detect_patterns_for_files( + list(expanded_files), + categories=None + ) + else: + # Fallback to individual file processing + findings = [] + for i, file_path in enumerate(expanded_files, 1): + if i % 10 == 0: + click.echo(f" Scanning file {i}/{len(expanded_files)}...", nl=False) + click.echo("\r", nl=False) + + # Convert to relative path for pattern detector + try: + rel_path = Path(file_path).relative_to(repo_root).as_posix() + except ValueError: + rel_path = file_path + + file_findings = pattern_detector.detect_patterns( + categories=None, + file_filter=rel_path + ) + findings.extend(file_findings) + + click.echo(f"\n Found {len(findings)} patterns") + else: + findings = [] + click.echo("No files to analyze after expansion") + + patterns = findings + + # Step 4: Run FCE correlation with refactoring rules + click.echo("Running correlation analysis...") + + # Run the FCE to get correlations + fce_results = run_fce( + root_path=str(repo_root), + capsules_dir=str(pf_dir / "capsules"), + manifest_path="manifest.json", + workset_path=str(temp_workset_file), + db_path="repo_index.db", + timeout=600, + print_plan=False + ) + + # Extract correlations from FCE results + correlations = [] + if fce_results.get("success") and fce_results.get("results"): + fce_data = fce_results["results"] + if "correlations" in fce_data and "factual_clusters" in fce_data["correlations"]: + correlations = fce_data["correlations"]["factual_clusters"] + + # Step 5: Identify mismatches + mismatches = _find_mismatches(patterns, correlations, affected_files) + + # Generate report + report = _generate_report(affected_files, patterns, correlations, mismatches) + + # Display summary + click.echo("\n" + "=" * 60) + click.echo("Refactoring Analysis Summary") + click.echo("=" * 60) + + click.echo(f"\nFiles analyzed: {len(affected_files)}") + click.echo(f"Patterns detected: {len(patterns)}") + click.echo(f"Correlations found: {len(correlations)}") + + if mismatches["api"]: + click.echo(f"\nAPI Mismatches: {len(mismatches['api'])}") + for mismatch in mismatches["api"][:5]: # Show top 5 + click.echo(f" • {mismatch['description']}") + + if mismatches["model"]: + click.echo(f"\nData Model Mismatches: {len(mismatches['model'])}") + for mismatch in mismatches["model"][:5]: # Show top 5 + click.echo(f" • {mismatch['description']}") + + if mismatches["contract"]: + click.echo(f"\nContract Mismatches: {len(mismatches['contract'])}") + for mismatch in mismatches["contract"][:5]: # Show top 5 + click.echo(f" • {mismatch['description']}") + + # Risk assessment + risk_level = _assess_risk(mismatches, len(affected_files)) + click.echo(f"\nRisk Level: {risk_level}") + + # Recommendations + recommendations = _generate_recommendations(mismatches) + if recommendations: + click.echo("\nRecommendations:") + for rec in recommendations: + click.echo(f" ✓ {rec}") + + # Save detailed report if requested + if output: + with open(output, 'w') as f: + json.dump(report, f, indent=2, default=str) + click.echo(f"\nDetailed report saved to: {output}") + + # Suggest next steps + click.echo("\nNext Steps:") + click.echo(" 1. Review the mismatches identified above") + click.echo(" 2. 
Run 'aud impact --file --line ' for detailed impact") + click.echo(" 3. Use 'aud detect-patterns --workset' for pattern-specific issues") + click.echo(" 4. Run 'aud full' for comprehensive analysis") + + +def _expand_affected_files( + affected_files: Set[str], + db_path: str, + expansion_mode: str, + repo_root: Path +) -> Set[str]: + """Expand affected files with their dependencies based on mode.""" + if expansion_mode == "none": + return affected_files + + expanded = set(affected_files) + total_files = len(affected_files) + + click.echo(f"\nExpanding {total_files} affected files with {expansion_mode} mode...") + + if expansion_mode in ["direct", "full"]: + from theauditor.impact_analyzer import analyze_impact + import sqlite3 + import os + + for i, file_path in enumerate(affected_files, 1): + if i % 5 == 0 or i == total_files: + click.echo(f" Analyzing dependencies {i}/{total_files}...", nl=False) + click.echo("\r", nl=False) + + # Find a representative line (first function/class) + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + cursor.execute(""" + SELECT line FROM symbols + WHERE path = ? AND type IN ('function', 'class') + ORDER BY line LIMIT 1 + """, (file_path,)) + result = cursor.fetchone() + conn.close() + + if result: + line = result[0] + try: + impact = analyze_impact( + db_path=db_path, + target_file=file_path, + target_line=line, + trace_to_backend=(expansion_mode == "full") + ) + + # Add direct dependencies + for dep in impact.get("upstream", []): + expanded.add(dep["file"]) + for dep in impact.get("downstream", []): + if dep["file"] != "external": + expanded.add(dep["file"]) + + # Add transitive if full mode + if expansion_mode == "full": + for dep in impact.get("upstream_transitive", []): + expanded.add(dep["file"]) + for dep in impact.get("downstream_transitive", []): + if dep["file"] != "external": + expanded.add(dep["file"]) + except Exception as e: + # Don't fail entire analysis for one file + if os.environ.get("THEAUDITOR_DEBUG"): + click.echo(f"\n Warning: Could not analyze {file_path}: {e}") + + click.echo(f"\n Expanded from {total_files} to {len(expanded)} files") + + return expanded + + +def _analyze_migrations(repo_root: Path, migration_dir: str, migration_limit: int = 0) -> List[str]: + """Analyze migration files to detect schema changes. + + Args: + repo_root: Repository root path + migration_dir: Migration directory path + migration_limit: Number of recent migrations to analyze (0=all) + """ + migration_path = repo_root / migration_dir + affected_files = [] + + if not migration_path.exists(): + # Try common locations (most common first!) 
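+ # A candidate directory is only accepted if it actually contains migration files (.js, .ts, or .sql); otherwise the search continues.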
+ found_migrations = False + for common_path in ["backend/migrations", "migrations", "db/migrations", + "database/migrations", "frontend/migrations"]: + test_path = repo_root / common_path + if test_path.exists(): + # Check if it actually contains migration files + import glob + test_migrations = (glob.glob(str(test_path / "*.js")) + + glob.glob(str(test_path / "*.ts")) + + glob.glob(str(test_path / "*.sql"))) + if test_migrations: + migration_path = test_path + found_migrations = True + click.echo(f"Found migrations in: {common_path}") + break + + if not found_migrations: + click.echo("\n⚠️ WARNING: No migration files found in standard locations:", err=True) + click.echo(" • backend/migrations/", err=True) + click.echo(" • migrations/", err=True) + click.echo(" • db/migrations/", err=True) + click.echo(" • database/migrations/", err=True) + click.echo(" • frontend/migrations/ (yes, we check here too)", err=True) + click.echo(f"\n Current directory searched: {migration_dir}", err=True) + click.echo(f" Use --migration-dir to specify your migration folder\n", err=True) + return affected_files + + if migration_path.exists(): + # Look for migration files + import glob + import re + + migrations = sorted(glob.glob(str(migration_path / "*.js")) + + glob.glob(str(migration_path / "*.ts")) + + glob.glob(str(migration_path / "*.sql"))) + + if not migrations: + click.echo(f"\n⚠️ WARNING: Directory '{migration_path}' exists but contains no migration files", err=True) + click.echo(f" Expected: .js, .ts, or .sql files", err=True) + return affected_files + + # Determine which migrations to analyze + total_migrations = len(migrations) + if migration_limit > 0: + migrations_to_analyze = migrations[-migration_limit:] + click.echo(f"Analyzing {len(migrations_to_analyze)} most recent migrations (out of {total_migrations} total)") + else: + migrations_to_analyze = migrations + click.echo(f"Analyzing ALL {total_migrations} migration files") + if total_migrations > 20: + click.echo("⚠️ Large migration set detected. 
Consider using --migration-limit for faster analysis") + + # Enhanced pattern matching + schema_patterns = { + 'column_ops': r'(?:removeColumn|dropColumn|renameColumn|addColumn|alterColumn|modifyColumn)', + 'table_ops': r'(?:createTable|dropTable|renameTable|alterTable)', + 'index_ops': r'(?:addIndex|dropIndex|createIndex|removeIndex)', + 'fk_ops': r'(?:addForeignKey|dropForeignKey|addConstraint|dropConstraint)', + 'type_changes': r'(?:changeColumn|changeDataType|alterType)' + } + + tables_affected = set() + operations_found = set() + + # Process migrations with progress indicator + for i, migration_file in enumerate(migrations_to_analyze, 1): + if i % 10 == 0 or i == len(migrations_to_analyze): + click.echo(f" Processing migration {i}/{len(migrations_to_analyze)}...", nl=False) + click.echo("\r", nl=False) + + try: + with open(migration_file, 'r') as f: + content = f.read() + + # Check all pattern categories + for pattern_name, pattern_regex in schema_patterns.items(): + if re.search(pattern_regex, content, re.IGNORECASE): + operations_found.add(pattern_name) + + # Extract table/model names (improved regex) + # Handles: "table", 'table', `table`, tableName + tables = re.findall(r"['\"`](\w+)['\"`]|(?:table|Table)Name:\s*['\"`]?(\w+)", content) + for match in tables: + # match is a tuple from multiple capture groups + table = match[0] if match[0] else match[1] if len(match) > 1 else None + if table and table not in ['table', 'Table', 'column', 'Column']: + tables_affected.add(table) + except Exception as e: + click.echo(f"\nWarning: Could not read migration {migration_file}: {e}") + continue + + click.echo(f"\nFound {len(operations_found)} types of operations affecting {len(tables_affected)} tables") + + # Map tables to model files + for table in tables_affected: + model_file = _find_model_file(repo_root, table) + if model_file: + affected_files.append(str(model_file)) + + # Deduplicate + affected_files = list(set(affected_files)) + click.echo(f"Mapped to {len(affected_files)} model files") + + return affected_files + + +def _find_model_file(repo_root: Path, table_name: str) -> Optional[Path]: + """Find model file corresponding to a database table.""" + # Convert table name to likely model name + model_names = [ + table_name, # exact match + table_name.rstrip('s'), # singular + ''.join(word.capitalize() for word in table_name.split('_')), # PascalCase + ] + + for model_name in model_names: + # Check common model locations + for pattern in [f"**/models/{model_name}.*", f"**/{model_name}.model.*", + f"**/entities/{model_name}.*"]: + import glob + matches = glob.glob(str(repo_root / pattern), recursive=True) + if matches: + return Path(matches[0]) + + return None + + +def _find_mismatches(patterns: List[Dict], correlations: List[Dict], + affected_files: Set[str]) -> Dict[str, List[Dict]]: + """Identify mismatches from patterns and correlations.""" + mismatches = { + "api": [], + "model": [], + "contract": [] + } + + # Analyze patterns for known refactoring issues + for pattern in patterns: + if pattern.get("rule_id") in ["PRODUCT_PRICE_FIELD_REMOVED", + "PRODUCT_SKU_MOVED_TO_VARIANT"]: + mismatches["model"].append({ + "type": "field_moved", + "description": pattern.get("message", "Field moved between models"), + "file": pattern.get("file"), + "line": pattern.get("line") + }) + elif pattern.get("rule_id") in ["API_ENDPOINT_PRODUCT_PRICE"]: + mismatches["api"].append({ + "type": "endpoint_deprecated", + "description": pattern.get("message", "API endpoint no longer exists"), + "file": 
pattern.get("file"), + "line": pattern.get("line") + }) + elif pattern.get("rule_id") in ["FRONTEND_BACKEND_CONTRACT_MISMATCH"]: + mismatches["contract"].append({ + "type": "contract_mismatch", + "description": pattern.get("message", "Frontend/backend contract mismatch"), + "file": pattern.get("file"), + "line": pattern.get("line") + }) + + # Analyze correlations for co-occurring issues + for correlation in correlations: + if correlation.get("confidence", 0) > 0.8: + category = "contract" if "contract" in correlation.get("name", "").lower() else \ + "api" if "api" in correlation.get("name", "").lower() else "model" + + mismatches[category].append({ + "type": "correlation", + "description": correlation.get("description", "Correlated issue detected"), + "confidence": correlation.get("confidence"), + "facts": correlation.get("matched_facts", []) + }) + + return mismatches + + +def _assess_risk(mismatches: Dict[str, List], file_count: int) -> str: + """Assess the risk level of the refactoring.""" + total_issues = sum(len(issues) for issues in mismatches.values()) + + if total_issues > 20 or file_count > 50: + return "HIGH" + elif total_issues > 10 or file_count > 20: + return "MEDIUM" + else: + return "LOW" + + +def _generate_recommendations(mismatches: Dict[str, List]) -> List[str]: + """Generate actionable recommendations based on mismatches.""" + recommendations = [] + + if mismatches["model"]: + recommendations.append("Update frontend interfaces to match new model structure") + recommendations.append("Run database migrations in all environments") + + if mismatches["api"]: + recommendations.append("Update API client to use new endpoints") + recommendations.append("Add deprecation notices for old endpoints") + + if mismatches["contract"]: + recommendations.append("Synchronize TypeScript interfaces with backend models") + recommendations.append("Add API versioning to prevent breaking changes") + + if sum(len(issues) for issues in mismatches.values()) > 10: + recommendations.append("Consider breaking this refactoring into smaller steps") + recommendations.append("Add integration tests before proceeding") + + return recommendations + + +def _generate_report(affected_files: Set[str], patterns: List[Dict], + correlations: List[Dict], mismatches: Dict) -> Dict: + """Generate detailed report of the refactoring analysis.""" + return { + "summary": { + "files_analyzed": len(affected_files), + "patterns_detected": len(patterns), + "correlations_found": len(correlations), + "total_mismatches": sum(len(issues) for issues in mismatches.values()) + }, + "affected_files": list(affected_files), + "patterns": patterns, + "correlations": correlations, + "mismatches": mismatches, + "risk_assessment": _assess_risk(mismatches, len(affected_files)), + "recommendations": _generate_recommendations(mismatches) + } + + +# Register command +refactor_command = refactor \ No newline at end of file diff --git a/theauditor/commands/report.py b/theauditor/commands/report.py new file mode 100644 index 0000000..087a77f --- /dev/null +++ b/theauditor/commands/report.py @@ -0,0 +1,66 @@ +"""Generate unified audit report from all artifacts.""" + +from pathlib import Path +import click +from theauditor.utils.error_handler import handle_exceptions + + +@click.command() +@handle_exceptions +@click.option("--manifest", default="./.pf/manifest.json", help="Manifest file path") +@click.option("--db", default="./.pf/repo_index.db", help="Database path") +@click.option("--workset", default="./.pf/workset.json", help="Workset file 
path") +@click.option("--capsules", default="./.pf/capsules", help="Capsules directory") +@click.option("--run-report", default="./.pf/run_report.json", help="Run report file path") +@click.option("--journal", default="./.pf/journal.ndjson", help="Journal file path") +@click.option("--fce", default="./.pf/fce.json", help="FCE file path") +@click.option("--ast", default="./.pf/ast_proofs.json", help="AST proofs file path") +@click.option("--ml", default="./.pf/ml_suggestions.json", help="ML suggestions file path") +@click.option("--patch", help="Patch diff file path") +@click.option("--out-dir", default="./.pf/audit", help="Output directory for audit reports") +@click.option("--max-snippet-lines", default=3, type=int, help="Maximum lines per snippet") +@click.option("--max-snippet-chars", default=220, type=int, help="Maximum characters per line") +@click.option("--print-stats", is_flag=True, help="Print summary statistics") +def report( + manifest, + db, + workset, + capsules, + run_report, + journal, + fce, + ast, + ml, + patch, + out_dir, + max_snippet_lines, + max_snippet_chars, + print_stats, +): + """Generate unified audit report from all artifacts.""" + # Report generation has been simplified + # Data is already chunked in .pf/readthis/ by extraction phase + + readthis_dir = Path("./.pf/readthis") + + if readthis_dir.exists(): + json_files = list(readthis_dir.glob("*.json")) + click.echo(f"[OK] Audit report generated - Data chunks ready for AI consumption") + click.echo(f"[INFO] Report contains {len(json_files)} JSON chunks in .pf/readthis/") + + if print_stats: + total_size = sum(f.stat().st_size for f in json_files) + click.echo(f"\n[STATS] Summary:") + click.echo(f" - Total chunks: {len(json_files)}") + click.echo(f" - Total size: {total_size:,} bytes") + click.echo(f" - Average chunk: {total_size // len(json_files):,} bytes" if json_files else " - No chunks") + + click.echo(f"\n[FILES] Available chunks:") + for f in sorted(json_files)[:10]: # Show first 10 + size = f.stat().st_size + click.echo(f" - {f.name} ({size:,} bytes)") + if len(json_files) > 10: + click.echo(f" ... and {len(json_files) - 10} more") + else: + click.echo("[WARNING] No readthis directory found at .pf/readthis/") + click.echo("[INFO] Run 'aud full' to generate analysis data") \ No newline at end of file diff --git a/theauditor/commands/rules.py b/theauditor/commands/rules.py new file mode 100644 index 0000000..2a89c09 --- /dev/null +++ b/theauditor/commands/rules.py @@ -0,0 +1,226 @@ +"""Rules command - inspect and summarize detection capabilities.""" + +import os +import yaml +import importlib +import inspect +from pathlib import Path +from typing import Dict, List, Any + +import click + +from theauditor.utils import handle_exceptions +from theauditor.utils.exit_codes import ExitCodes + + +@click.command(name="rules") +@click.option( + "--summary", + is_flag=True, + default=False, + help="Generate a summary of all detection capabilities", +) +@handle_exceptions +def rules_command(summary: bool) -> None: + """Inspect and summarize TheAuditor's detection rules and patterns. 
+ + Args: + summary: If True, generate a comprehensive capability report + """ + if not summary: + click.echo(click.style("[ERROR] Please specify --summary to generate a capability report", fg="red"), err=True) + raise SystemExit(ExitCodes.TASK_INCOMPLETE) + + # Get the base path for patterns and rules + base_path = Path(__file__).parent.parent + patterns_path = base_path / "patterns" + rules_path = base_path / "rules" + + # Create output directory + output_dir = Path(".pf") + output_dir.mkdir(parents=True, exist_ok=True) + output_file = output_dir / "auditor_capabilities.md" + + # Collect output in a list + output_lines = [] + output_lines.append("# TheAuditor Detection Capabilities\n") + + # Also print to console + print("# TheAuditor Detection Capabilities\n") + + # Scan YAML patterns + print("## YAML Patterns\n") + output_lines.append("## YAML Patterns\n") + yaml_patterns = scan_yaml_patterns(patterns_path) + total_patterns = 0 + + for category, files in yaml_patterns.items(): + if files: + category_display = "patterns/" if category == "." else f"patterns/{category}/" + print(f"### {category_display}\n") + output_lines.append(f"### {category_display}\n") + for file_name, patterns in files.items(): + if patterns: + print(f"**{file_name}** ({len(patterns)} patterns)") + output_lines.append(f"**{file_name}** ({len(patterns)} patterns)") + for pattern in patterns: + print(f"- `{pattern}`") + output_lines.append(f"- `{pattern}`") + print() + output_lines.append("") + total_patterns += len(patterns) + + # Scan Python rules + print("## Python AST Rules\n") + output_lines.append("## Python AST Rules\n") + python_rules = scan_python_rules(rules_path) + total_rules = 0 + + for module_path, functions in python_rules.items(): + if functions: + # Make path relative to rules/ for readability + display_path = module_path.replace(str(rules_path) + os.sep, "") + print(f"### {display_path}") + output_lines.append(f"### {display_path}") + for func in functions: + print(f"- `{func}()`") + output_lines.append(f"- `{func}()`") + print() + output_lines.append("") + total_rules += len(functions) + + # Print summary statistics + print("## Summary Statistics\n") + output_lines.append("## Summary Statistics\n") + print(f"- **Total YAML Patterns**: {total_patterns}") + output_lines.append(f"- **Total YAML Patterns**: {total_patterns}") + print(f"- **Total Python Rules**: {total_rules}") + output_lines.append(f"- **Total Python Rules**: {total_rules}") + print(f"- **Combined Detection Capabilities**: {total_patterns + total_rules}") + output_lines.append(f"- **Combined Detection Capabilities**: {total_patterns + total_rules}") + + # Write to file + with open(output_file, 'w', encoding='utf-8') as f: + f.write('\n'.join(output_lines)) + + click.echo(click.style(f"\n[SUCCESS] Capability report generated successfully", fg="green")) + click.echo(f"[INFO] Report saved to: {output_file}") + raise SystemExit(ExitCodes.SUCCESS) + + +def scan_yaml_patterns(patterns_path: Path) -> Dict[str, Dict[str, List[str]]]: + """Scan YAML pattern files and extract pattern names. 
+ + Args: + patterns_path: Path to the patterns directory + + Returns: + Dictionary mapping category -> file -> list of pattern names + """ + results = {} + + if not patterns_path.exists(): + return results + + # Walk through all subdirectories + for root, dirs, files in os.walk(patterns_path): + # Skip __pycache__ directories + dirs[:] = [d for d in dirs if d != "__pycache__"] + + for file in files: + if file.endswith(".yml") or file.endswith(".yaml"): + file_path = Path(root) / file + + # Determine category from directory structure + rel_path = file_path.relative_to(patterns_path) + # If file is in root of patterns/, use "." as category + # If in subdirectory like frameworks/, use that as category + if rel_path.parent == Path("."): + category = "." + else: + category = str(rel_path.parent) + + if category not in results: + results[category] = {} + + # Parse YAML and extract pattern names + try: + with open(file_path, 'r', encoding='utf-8') as f: + data = yaml.safe_load(f) + + if data and isinstance(data, list): + pattern_names = [] + for pattern in data: + if isinstance(pattern, dict) and 'name' in pattern: + pattern_names.append(pattern['name']) + + if pattern_names: + results[category][file] = pattern_names + + except (yaml.YAMLError, OSError) as e: + # Skip files that can't be parsed + continue + + return results + + +def scan_python_rules(rules_path: Path) -> Dict[str, List[str]]: + """Scan Python rule files and find all find_* functions. + + Args: + rules_path: Path to the rules directory + + Returns: + Dictionary mapping module path -> list of find_* function names + """ + results = {} + + if not rules_path.exists(): + return results + + # First, check what's exposed in the main __init__.py + init_file = rules_path / "__init__.py" + if init_file.exists(): + try: + module = importlib.import_module("theauditor.rules") + exposed_functions = [] + for name, obj in inspect.getmembers(module, inspect.isfunction): + if name.startswith("find_"): + exposed_functions.append(name) + if exposed_functions: + results["rules/__init__.py (exposed)"] = exposed_functions + except ImportError: + pass + + # Walk through all Python files + for root, dirs, files in os.walk(rules_path): + # Skip __pycache__ directories + dirs[:] = [d for d in dirs if d != "__pycache__"] + + for file in files: + if file.endswith(".py"): + file_path = Path(root) / file + + # Skip __init__.py files for now (we handle them separately) + if file == "__init__.py": + continue + + # Try basic text scanning (more reliable than import) + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Simple regex to find function definitions + import re + pattern = r'^def\s+(find_\w+)\s*\(' + matches = re.findall(pattern, content, re.MULTILINE) + + if matches: + # Make path relative for display + display_path = str(file_path.relative_to(rules_path.parent)) + results[display_path] = matches + + except OSError: + continue + + return results \ No newline at end of file diff --git a/theauditor/commands/setup.py b/theauditor/commands/setup.py new file mode 100644 index 0000000..ee98ada --- /dev/null +++ b/theauditor/commands/setup.py @@ -0,0 +1,63 @@ +"""Setup commands for TheAuditor - Claude Code integration.""" + +import click + + +@click.command("setup-claude") +@click.option( + "--target", + required=True, + help="Target project root (absolute or relative path)" +) +@click.option( + "--source", + default="agent_templates", + help="Path to TheAuditor agent templates directory (default: agent_templates)" +) 
+@click.option( + "--sync", + is_flag=True, + help="Force update (still creates .bak on first change only)" +) +@click.option( + "--dry-run", + is_flag=True, + help="Print plan without executing" +) +def setup_claude(target, source, sync, dry_run): + """Install Claude Code agents, hooks, and per-project venv for TheAuditor. + + This command performs a complete zero-optional installation: + 1. Creates a Python venv at /.venv + 2. Installs TheAuditor into that venv (editable/offline) + 3. Creates cross-platform launcher wrappers at /.claude/bin/ + 4. Generates Claude agents from agent_templates/*.md + 5. Writes hooks to /.claude/hooks.json + + All commands in agents/hooks use ./.claude/bin/aud to ensure + they run with the project's own venv. + """ + from theauditor.claude_setup import setup_claude_complete + + try: + result = setup_claude_complete( + target=target, + source=source, + sync=sync, + dry_run=dry_run + ) + + # The setup_claude_complete function already prints detailed output + # Just handle any failures here + if result.get("failed"): + click.echo("\n[WARN] Some operations failed:", err=True) + for item in result["failed"]: + click.echo(f" - {item}", err=True) + raise click.ClickException("Setup incomplete due to failures") + + except ValueError as e: + click.echo(f"Error: {e}", err=True) + raise click.ClickException(str(e)) from e + except Exception as e: + click.echo(f"Error: {e}", err=True) + raise click.ClickException(str(e)) from e \ No newline at end of file diff --git a/theauditor/commands/structure.py b/theauditor/commands/structure.py new file mode 100644 index 0000000..d4df424 --- /dev/null +++ b/theauditor/commands/structure.py @@ -0,0 +1,96 @@ +"""Project structure and intelligence report command.""" + +import click +from pathlib import Path +from theauditor.utils.error_handler import handle_exceptions +from theauditor.utils.exit_codes import ExitCodes + + +@click.command("structure") +@handle_exceptions +@click.option("--root", default=".", help="Root directory to analyze") +@click.option("--manifest", default="./.pf/manifest.json", help="Path to manifest.json") +@click.option("--db-path", default="./.pf/repo_index.db", help="Path to repo_index.db") +@click.option("--output", default="./.pf/readthis/STRUCTURE.md", help="Output file path") +@click.option("--max-depth", default=4, type=int, help="Maximum directory tree depth") +def structure(root, manifest, db_path, output, max_depth): + """Generate project structure and intelligence report. 
+ + Creates a comprehensive markdown report including: + - Directory tree visualization + - Project statistics (files, LOC, tokens) + - Language distribution + - Top 10 largest files by tokens + - Top 15 critical files by convention + - AI context optimization recommendations + """ + from theauditor.project_summary import generate_project_summary, generate_directory_tree + + # Check if manifest exists (not required but enhances report) + manifest_exists = Path(manifest).exists() + db_exists = Path(db_path).exists() + + if not manifest_exists and not db_exists: + click.echo("Warning: Neither manifest.json nor repo_index.db found", err=True) + click.echo("Run 'aud index' first for complete statistics", err=True) + click.echo("Generating basic structure report...\n") + elif not manifest_exists: + click.echo("Warning: manifest.json not found, statistics will be limited", err=True) + elif not db_exists: + click.echo("Warning: repo_index.db not found, symbol counts will be missing", err=True) + + # Generate the report + click.echo(f"Analyzing project structure (max depth: {max_depth})...") + + try: + # Generate full report + report_content = generate_project_summary( + root_path=root, + manifest_path=manifest, + db_path=db_path, + max_depth=max_depth + ) + + # Ensure output directory exists + output_path = Path(output) + output_path.parent.mkdir(parents=True, exist_ok=True) + + # Write report + with open(output_path, 'w', encoding='utf-8') as f: + f.write(report_content) + + click.echo(f"\n✓ Project structure report generated: {output}") + + # Show summary stats if available + if manifest_exists: + import json + with open(manifest, 'r') as f: + manifest_data = json.load(f) + + total_files = len(manifest_data) + total_loc = sum(f.get('loc', 0) for f in manifest_data) + total_bytes = sum(f.get('bytes', 0) for f in manifest_data) + total_tokens = total_bytes // 4 # Rough approximation + + click.echo(f"\nProject Summary:") + click.echo(f" Files: {total_files:,}") + click.echo(f" LOC: {total_loc:,}") + click.echo(f" Tokens: ~{total_tokens:,}") + + # Token percentage of Claude's context + # Claude has 200k context, but practical limit is ~160k for user content + # (leaving room for system prompts, conversation history, response) + claude_total_context = 200000 # Total context window + claude_usable_context = 160000 # Practical limit for user content + token_percent = (total_tokens / claude_usable_context * 100) if total_tokens > 0 else 0 + + if token_percent > 100: + click.echo(f" Context Usage: {token_percent:.1f}% (EXCEEDS Claude's practical limit)") + else: + click.echo(f" Context Usage: {token_percent:.1f}% of Claude's usable window") + + return ExitCodes.SUCCESS + + except Exception as e: + click.echo(f"Error generating report: {e}", err=True) + return ExitCodes.TASK_INCOMPLETE \ No newline at end of file diff --git a/theauditor/commands/summary.py b/theauditor/commands/summary.py new file mode 100644 index 0000000..a070940 --- /dev/null +++ b/theauditor/commands/summary.py @@ -0,0 +1,236 @@ +"""Generate comprehensive audit summary from all analysis phases.""" + +import json +import time +from pathlib import Path +from typing import Any, Dict +import click + + +@click.command() +@click.option("--root", default=".", help="Root directory") +@click.option("--raw-dir", default="./.pf/raw", help="Raw outputs directory") +@click.option("--out", default="./.pf/raw/audit_summary.json", help="Output path for summary") +def summary(root, raw_dir, out): + """Generate comprehensive audit summary from all 
phases.""" + start_time = time.time() + raw_path = Path(raw_dir) + + # Initialize summary structure + audit_summary = { + "generated_at": time.strftime('%Y-%m-%d %H:%M:%S'), + "overall_status": "UNKNOWN", + "total_runtime_seconds": 0, + "total_findings_by_severity": { + "critical": 0, + "high": 0, + "medium": 0, + "low": 0, + "info": 0 + }, + "metrics_by_phase": {}, + "key_statistics": {} + } + + # Helper function to safely load JSON + def load_json(file_path: Path) -> Dict[str, Any]: + if file_path.exists(): + try: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f) + except (json.JSONDecodeError, IOError): + pass + return {} + + # Phase 1: Index metrics + manifest_path = Path(root) / "manifest.json" + if manifest_path.exists(): + manifest = load_json(manifest_path) + if isinstance(manifest, list): + audit_summary["metrics_by_phase"]["index"] = { + "files_indexed": len(manifest), + "total_size_bytes": sum(f.get("size", 0) for f in manifest) + } + + # Phase 2: Framework detection + frameworks = load_json(raw_path / "frameworks.json") + if frameworks: + if isinstance(frameworks, dict): + framework_list = frameworks.get("frameworks", []) + else: + framework_list = frameworks if isinstance(frameworks, list) else [] + + audit_summary["metrics_by_phase"]["detect_frameworks"] = { + "frameworks_detected": len(framework_list), + "languages": list(set(f.get("language", "") if isinstance(f, dict) else "" for f in framework_list)) + } + + # Phase 3: Dependencies + deps = load_json(raw_path / "deps.json") + deps_latest = load_json(raw_path / "deps_latest.json") + if deps or deps_latest: + outdated_count = 0 + vulnerability_count = 0 + total_deps = 0 + + # Handle deps being either dict or list + if isinstance(deps, dict): + total_deps = len(deps.get("dependencies", [])) + elif isinstance(deps, list): + total_deps = len(deps) + + # Handle deps_latest structure + if isinstance(deps_latest, dict) and "packages" in deps_latest: + for pkg in deps_latest["packages"]: + if isinstance(pkg, dict): + if pkg.get("outdated"): + outdated_count += 1 + if pkg.get("vulnerabilities"): + vulnerability_count += len(pkg["vulnerabilities"]) + + audit_summary["metrics_by_phase"]["dependencies"] = { + "total_dependencies": total_deps, + "outdated_packages": outdated_count, + "vulnerabilities": vulnerability_count + } + + # Phase 7: Linting + lint_data = load_json(raw_path / "lint.json") + if lint_data and "findings" in lint_data: + lint_by_severity = {"critical": 0, "high": 0, "medium": 0, "low": 0, "info": 0} + for finding in lint_data["findings"]: + severity = finding.get("severity", "info").lower() + if severity in lint_by_severity: + lint_by_severity[severity] += 1 + + audit_summary["metrics_by_phase"]["lint"] = { + "total_issues": len(lint_data["findings"]), + "by_severity": lint_by_severity + } + + # Add to total + for sev, count in lint_by_severity.items(): + audit_summary["total_findings_by_severity"][sev] += count + + # Phase 8: Pattern detection + patterns = load_json(raw_path / "patterns.json") + if not patterns: + patterns = load_json(raw_path / "findings.json") + + if patterns and "findings" in patterns: + pattern_by_severity = {"critical": 0, "high": 0, "medium": 0, "low": 0, "info": 0} + for finding in patterns["findings"]: + severity = finding.get("severity", "info").lower() + if severity in pattern_by_severity: + pattern_by_severity[severity] += 1 + + audit_summary["metrics_by_phase"]["patterns"] = { + "total_patterns_matched": len(patterns["findings"]), + "by_severity": 
pattern_by_severity + } + + # Add to total + for sev, count in pattern_by_severity.items(): + audit_summary["total_findings_by_severity"][sev] += count + + # Phase 9-10: Graph analysis + graph_analysis = load_json(raw_path / "graph_analysis.json") + graph_metrics = load_json(raw_path / "graph_metrics.json") + if graph_analysis: + summary_data = graph_analysis.get("summary", {}) + audit_summary["metrics_by_phase"]["graph"] = { + "import_nodes": summary_data.get("import_graph", {}).get("nodes", 0), + "import_edges": summary_data.get("import_graph", {}).get("edges", 0), + "cycles_detected": len(graph_analysis.get("cycles", [])), + "hotspots_identified": len(graph_analysis.get("hotspots", [])), + "graph_density": summary_data.get("import_graph", {}).get("density", 0) + } + + if "health_metrics" in summary_data: + audit_summary["metrics_by_phase"]["graph"]["health_grade"] = summary_data["health_metrics"].get("health_grade", "N/A") + audit_summary["metrics_by_phase"]["graph"]["fragility_score"] = summary_data["health_metrics"].get("fragility_score", 0) + + # Phase 11: Taint analysis + taint = load_json(raw_path / "taint_analysis.json") + if taint: + taint_by_severity = {"critical": 0, "high": 0, "medium": 0, "low": 0} + if "taint_paths" in taint: + for path in taint["taint_paths"]: + severity = path.get("severity", "medium").lower() + if severity in taint_by_severity: + taint_by_severity[severity] += 1 + + audit_summary["metrics_by_phase"]["taint_analysis"] = { + "taint_paths_found": len(taint.get("taint_paths", [])), + "total_vulnerabilities": taint.get("total_vulnerabilities", 0), + "by_severity": taint_by_severity + } + + # Add to total + for sev, count in taint_by_severity.items(): + if sev in audit_summary["total_findings_by_severity"]: + audit_summary["total_findings_by_severity"][sev] += count + + # Phase 12: FCE (Factual Correlation Engine) + fce = load_json(raw_path / "fce.json") + if fce: + correlations = fce.get("correlations", {}) + audit_summary["metrics_by_phase"]["fce"] = { + "total_findings": len(fce.get("all_findings", [])), + "test_failures": len(fce.get("test_results", {}).get("failures", [])), + "hotspots_correlated": correlations.get("total_hotspots", 0), + "factual_clusters": len(correlations.get("factual_clusters", [])) + } + + # Calculate overall status based on severity counts + severity_counts = audit_summary["total_findings_by_severity"] + if severity_counts["critical"] > 0: + audit_summary["overall_status"] = "CRITICAL" + elif severity_counts["high"] > 0: + audit_summary["overall_status"] = "HIGH" + elif severity_counts["medium"] > 0: + audit_summary["overall_status"] = "MEDIUM" + elif severity_counts["low"] > 0: + audit_summary["overall_status"] = "LOW" + else: + audit_summary["overall_status"] = "CLEAN" + + # Add key statistics + audit_summary["key_statistics"] = { + "total_findings": sum(severity_counts.values()), + "phases_with_findings": len([p for p in audit_summary["metrics_by_phase"] if audit_summary["metrics_by_phase"][p]]), + "total_phases_run": len(audit_summary["metrics_by_phase"]) + } + + # Calculate runtime + elapsed = time.time() - start_time + audit_summary["summary_generation_time"] = elapsed + + # Read pipeline.log for total runtime if available + pipeline_log = Path(root) / ".pf" / "pipeline.log" + if pipeline_log.exists(): + try: + with open(pipeline_log, 'r') as f: + for line in f: + if "[TIME] Total time:" in line: + # Extract seconds from line like "[TIME] Total time: 73.0s" + parts = line.split(":")[-1].strip().replace("s", "").split("(")[0] + 
audit_summary["total_runtime_seconds"] = float(parts) + break + except: + pass + + # Save the summary + out_path = Path(out) + out_path.parent.mkdir(parents=True, exist_ok=True) + with open(out_path, 'w', encoding='utf-8') as f: + json.dump(audit_summary, f, indent=2) + + # Output results + click.echo(f"[OK] Audit summary generated in {elapsed:.1f}s") + click.echo(f" Overall status: {audit_summary['overall_status']}") + click.echo(f" Total findings: {audit_summary['key_statistics']['total_findings']}") + click.echo(f" Critical: {severity_counts['critical']}, High: {severity_counts['high']}, Medium: {severity_counts['medium']}, Low: {severity_counts['low']}") + click.echo(f" Summary saved to: {out_path}") + + return audit_summary \ No newline at end of file diff --git a/theauditor/commands/taint.py b/theauditor/commands/taint.py new file mode 100644 index 0000000..a05bbff --- /dev/null +++ b/theauditor/commands/taint.py @@ -0,0 +1,272 @@ +"""Perform taint analysis to detect security vulnerabilities via data flow tracking.""" + +import sys +import platform +import click +from pathlib import Path +from datetime import datetime, UTC +from theauditor.utils.error_handler import handle_exceptions + +# Detect if running on Windows for character encoding +IS_WINDOWS = platform.system() == "Windows" + + + +@click.command("taint-analyze") +@handle_exceptions +@click.option("--db", default=None, help="Path to the SQLite database (default: repo_index.db)") +@click.option("--output", default="./.pf/raw/taint_analysis.json", help="Output path for analysis results") +@click.option("--max-depth", default=5, type=int, help="Maximum depth for taint propagation tracing") +@click.option("--json", is_flag=True, help="Output raw JSON instead of formatted report") +@click.option("--verbose", is_flag=True, help="Show detailed path information") +@click.option("--severity", type=click.Choice(["all", "critical", "high", "medium", "low"]), + default="all", help="Filter results by severity level") +@click.option("--rules/--no-rules", default=True, help="Enable/disable rule-based detection") +def taint_analyze(db, output, max_depth, json, verbose, severity, rules): + """ + Perform taint analysis to detect security vulnerabilities. + + This command traces the flow of untrusted data from taint sources + (user inputs) to security sinks (dangerous functions) to identify + potential injection vulnerabilities and data exposure risks. 
+ + The analysis detects: + - SQL Injection + - Command Injection + - Cross-Site Scripting (XSS) + - Path Traversal + - LDAP Injection + - NoSQL Injection + + Example: + aud taint-analyze + aud taint-analyze --severity critical --verbose + aud taint-analyze --json --output vulns.json + """ + from theauditor.taint_analyzer import trace_taint, save_taint_analysis, normalize_taint_path, SECURITY_SINKS + from theauditor.taint.insights import format_taint_report, calculate_severity, generate_summary, classify_vulnerability + from theauditor.config_runtime import load_runtime_config + from theauditor.rules.orchestrator import RulesOrchestrator, RuleContext + from theauditor.taint.registry import TaintRegistry + import json as json_lib + + # Load configuration for default paths + config = load_runtime_config(".") + + # Use default database path if not provided + if db is None: + db = config["paths"]["db"] + + # Verify database exists + db_path = Path(db) + if not db_path.exists(): + click.echo(f"Error: Database not found at {db}", err=True) + click.echo("Run 'aud index' first to build the repository index", err=True) + raise click.ClickException(f"Database not found: {db}") + + # Check if rules are enabled + if rules: + # STAGE 1: Initialize infrastructure + click.echo("Initializing security analysis infrastructure...") + registry = TaintRegistry() + orchestrator = RulesOrchestrator(project_path=Path("."), db_path=db_path) + + # Track all findings + all_findings = [] + + # STAGE 2: Run standalone infrastructure rules + click.echo("Running infrastructure and configuration analysis...") + infra_findings = orchestrator.run_standalone_rules() + all_findings.extend(infra_findings) + click.echo(f" Found {len(infra_findings)} infrastructure issues") + + # STAGE 3: Run discovery rules to populate registry + click.echo("Discovering framework-specific patterns...") + discovery_findings = orchestrator.run_discovery_rules(registry) + all_findings.extend(discovery_findings) + + stats = registry.get_stats() + click.echo(f" Registry now has {stats['total_sinks']} sinks, {stats['total_sources']} sources") + + # STAGE 4: Run enriched taint analysis with registry + click.echo("Performing data-flow taint analysis...") + result = trace_taint( + db_path=str(db_path), + max_depth=max_depth, + registry=registry + ) + + # Extract taint paths + taint_paths = result.get("taint_paths", result.get("paths", [])) + click.echo(f" Found {len(taint_paths)} taint flow vulnerabilities") + + # STAGE 5: Run taint-dependent rules + click.echo("Running advanced security analysis...") + + # Create taint checker from results + def taint_checker(var_name, line_num=None): + """Check if variable is in any taint path.""" + for path in taint_paths: + # Check source + if path.get("source", {}).get("name") == var_name: + return True + # Check sink + if path.get("sink", {}).get("name") == var_name: + return True + # Check intermediate steps + for step in path.get("path", []): + if isinstance(step, dict) and step.get("name") == var_name: + return True + return False + + advanced_findings = orchestrator.run_taint_dependent_rules(taint_checker) + all_findings.extend(advanced_findings) + click.echo(f" Found {len(advanced_findings)} advanced security issues") + + # STAGE 6: Consolidate all findings + click.echo(f"\nTotal vulnerabilities found: {len(all_findings) + len(taint_paths)}") + + # Add all non-taint findings to result + result["infrastructure_issues"] = infra_findings + result["discovery_findings"] = discovery_findings + 
result["advanced_findings"] = advanced_findings + result["all_rule_findings"] = all_findings + + # Update total count + result["total_vulnerabilities"] = len(taint_paths) + len(all_findings) + else: + # Original taint analysis without orchestrator + click.echo("Performing taint analysis (rules disabled)...") + result = trace_taint( + db_path=str(db_path), + max_depth=max_depth + ) + + # Enrich raw paths with interpretive insights + if result.get("success"): + # Add severity and classification to each path + enriched_paths = [] + for path in result.get("taint_paths", result.get("paths", [])): + # Normalize the path first + path = normalize_taint_path(path) + # Add severity + path["severity"] = calculate_severity(path) + # Enrich sink information with vulnerability classification + path["vulnerability_type"] = classify_vulnerability( + path.get("sink", {}), + SECURITY_SINKS + ) + enriched_paths.append(path) + + # Update result with enriched paths + result["taint_paths"] = enriched_paths + result["paths"] = enriched_paths + + # Generate summary + result["summary"] = generate_summary(enriched_paths) + + # Filter by severity if requested + if severity != "all" and result.get("success"): + filtered_paths = [] + for path in result.get("taint_paths", result.get("paths", [])): + # Normalize the path to ensure all keys exist + path = normalize_taint_path(path) + if path["severity"].lower() == severity or ( + severity == "critical" and path["severity"].lower() == "critical" + ) or ( + severity == "high" and path["severity"].lower() in ["critical", "high"] + ): + filtered_paths.append(path) + + # Update counts + result["taint_paths"] = filtered_paths + result["paths"] = filtered_paths # Keep both keys synchronized + result["total_vulnerabilities"] = len(filtered_paths) + + # Recalculate vulnerability types + from collections import defaultdict + vuln_counts = defaultdict(int) + for path in filtered_paths: + # Path is already normalized from filtering above + vuln_counts[path.get("vulnerability_type", "Unknown")] += 1 + result["vulnerabilities_by_type"] = dict(vuln_counts) + + # CRITICAL FIX: Recalculate summary with filtered paths + from theauditor.taint.insights import generate_summary + result["summary"] = generate_summary(filtered_paths) + + # Save COMPLETE taint analysis results to raw (including all data) + save_taint_analysis(result, output) + click.echo(f"Raw analysis results saved to: {output}") + + # Output results + if json: + # JSON output for programmatic use + click.echo(json_lib.dumps(result, indent=2, sort_keys=True)) + else: + # Human-readable report + report = format_taint_report(result) + click.echo(report) + + # Additional verbose output + if verbose and result.get("success"): + paths = result.get("taint_paths", result.get("paths", [])) + if paths and len(paths) > 10: + click.echo("\n" + "=" * 60) + click.echo("ADDITIONAL VULNERABILITY DETAILS") + click.echo("=" * 60) + + for i, path in enumerate(paths[10:20], 11): + # Normalize path to ensure all keys exist + path = normalize_taint_path(path) + click.echo(f"\n{i}. {path['vulnerability_type']} ({path['severity']})") + click.echo(f" Source: {path['source']['file']}:{path['source']['line']}") + click.echo(f" Sink: {path['sink']['file']}:{path['sink']['line']}") + arrow = "->" if IS_WINDOWS else "→" + click.echo(f" Pattern: {path['source'].get('pattern', '')} {arrow} {path['sink'].get('pattern', '')}") # Empty not unknown + + if len(paths) > 20: + click.echo(f"\n... 
and {len(paths) - 20} additional vulnerabilities not shown") + + # Provide actionable recommendations based on findings + if not json and result.get("success"): + summary = result.get("summary", {}) + risk_level = summary.get("risk_level", "UNKNOWN") + + click.echo("\n" + "=" * 60) + click.echo("RECOMMENDED ACTIONS") + click.echo("=" * 60) + + if risk_level == "CRITICAL": + click.echo("[CRITICAL] CRITICAL SECURITY ISSUES DETECTED") + click.echo("1. Review and fix all CRITICAL vulnerabilities immediately") + click.echo("2. Add input validation and sanitization at all entry points") + click.echo("3. Use parameterized queries for all database operations") + click.echo("4. Implement output encoding for all user-controlled data") + click.echo("5. Consider a security audit before deployment") + elif risk_level == "HIGH": + click.echo("[HIGH] HIGH RISK VULNERABILITIES FOUND") + click.echo("1. Prioritize fixing HIGH severity issues this sprint") + click.echo("2. Review all user input handling code") + click.echo("3. Implement security middleware/filters") + click.echo("4. Add security tests for vulnerable paths") + elif risk_level == "MEDIUM": + click.echo("[MEDIUM] MODERATE SECURITY CONCERNS") + click.echo("1. Schedule vulnerability fixes for next sprint") + click.echo("2. Review and update security best practices") + click.echo("3. Add input validation where missing") + else: + click.echo("[LOW] LOW RISK PROFILE") + click.echo("1. Continue following secure coding practices") + click.echo("2. Regular security scanning recommended") + click.echo("3. Keep dependencies updated") + + # Exit with appropriate code + if result.get("success"): + summary = result.get("summary", {}) + if summary.get("critical_count", 0) > 0: + exit(2) # Critical vulnerabilities found + elif summary.get("high_count", 0) > 0: + exit(1) # High severity vulnerabilities found + else: + raise click.ClickException(result.get("error", "Analysis failed")) \ No newline at end of file diff --git a/theauditor/commands/tool_versions.py b/theauditor/commands/tool_versions.py new file mode 100644 index 0000000..bb985b9 --- /dev/null +++ b/theauditor/commands/tool_versions.py @@ -0,0 +1,25 @@ +"""Detect and record tool versions.""" + +import click + + +@click.command("tool-versions") +@click.option("--out-dir", default="./.pf/audit", help="Output directory") +def tool_versions(out_dir): + """Detect and record tool versions.""" + from theauditor.tools import write_tools_report + + try: + res = write_tools_report(out_dir) + click.echo(f"[OK] Tool versions written to {out_dir}/") + click.echo(" - TOOLS.md (human-readable)") + click.echo(" - tools.json (machine-readable)") + + # Show summary + python_found = sum(1 for v in res["python"].values() if v != "missing") + node_found = sum(1 for v in res["node"].values() if v != "missing") + click.echo(f" - Python tools: {python_found}/4 found") + click.echo(f" - Node tools: {node_found}/3 found") + except Exception as e: + click.echo(f"Error: {e}", err=True) + raise click.ClickException(str(e)) from e \ No newline at end of file diff --git a/theauditor/commands/validate_templates.py b/theauditor/commands/validate_templates.py new file mode 100644 index 0000000..cd31f30 --- /dev/null +++ b/theauditor/commands/validate_templates.py @@ -0,0 +1,30 @@ +"""Validate agent templates for SOP compliance.""" + +import click + + +@click.command("validate-templates") +@click.option("--source", default="./agent_templates", help="Directory containing agent templates") +@click.option("--format", 
type=click.Choice(["json", "text"]), default="text", help="Output format") +@click.option("--output", help="Write report to file instead of stdout") +def validate_templates(source, format, output): + """Validate agent templates for SOP compliance.""" + from theauditor.agent_template_validator import TemplateValidator + + validator = TemplateValidator() + results = validator.validate_all(source) + + report = validator.generate_report(results, format=format) + + if output: + with open(output, 'w') as f: + f.write(report) + click.echo(f"Report written to {output}") + else: + click.echo(report) + + # Exit with non-zero if violations found + if not results["valid"]: + raise click.ClickException( + f"Template validation failed: {results['total_violations']} violations found" + ) \ No newline at end of file diff --git a/theauditor/commands/workset.py b/theauditor/commands/workset.py new file mode 100644 index 0000000..1f60510 --- /dev/null +++ b/theauditor/commands/workset.py @@ -0,0 +1,55 @@ +"""Compute target file set from git diff and dependencies.""" + +import click +from theauditor.utils.error_handler import handle_exceptions + + +@click.command() +@handle_exceptions +@click.option("--root", default=".", help="Root directory") +@click.option("--db", default=None, help="Input SQLite database path") +@click.option("--manifest", default=None, help="Input manifest file path") +@click.option("--all", is_flag=True, help="Include all source files (ignores common directories)") +@click.option("--diff", help="Git diff range (e.g., main..HEAD)") +@click.option("--files", multiple=True, help="Explicit file list") +@click.option("--include", multiple=True, help="Include glob patterns") +@click.option("--exclude", multiple=True, help="Exclude glob patterns") +@click.option("--max-depth", default=None, type=int, help="Maximum dependency depth") +@click.option("--out", default=None, help="Output workset file path") +@click.option("--print-stats", is_flag=True, help="Print summary statistics") +def workset(root, db, manifest, all, diff, files, include, exclude, max_depth, out, print_stats): + """Compute target file set from git diff and dependencies.""" + from theauditor.workset import compute_workset + from theauditor.config_runtime import load_runtime_config + + # Load configuration + config = load_runtime_config(root) + + # Use config defaults if not provided + if db is None: + db = config["paths"]["db"] + if manifest is None: + manifest = config["paths"]["manifest"] + if out is None: + out = config["paths"]["workset"] + if max_depth is None: + max_depth = config["limits"]["max_graph_depth"] + + result = compute_workset( + root_path=root, + db_path=db, + manifest_path=manifest, + all_files=all, + diff_spec=diff, + file_list=list(files) if files else None, + include_patterns=list(include) if include else None, + exclude_patterns=list(exclude) if exclude else None, + max_depth=max_depth, + output_path=out, + print_stats=print_stats, + ) + + if not print_stats: + click.echo(f"Workset written to {out}") + click.echo(f" Seed files: {result['seed_count']}") + click.echo(f" Expanded files: {result['expanded_count']}") \ No newline at end of file diff --git a/theauditor/config.py b/theauditor/config.py new file mode 100644 index 0000000..2ff1b74 --- /dev/null +++ b/theauditor/config.py @@ -0,0 +1,40 @@ +"""Configuration management for TheAuditor.""" + +import tomllib +from pathlib import Path + + +def ensure_mypy_config(pyproject_path: str) -> dict[str, str]: + """ + Ensure minimal mypy config exists in 
pyproject.toml. + + Returns: + {"status": "created"} if config was added + {"status": "exists"} if config already present + """ + path = Path(pyproject_path) + + if not path.exists(): + raise FileNotFoundError(f"pyproject.toml not found at {pyproject_path}") + + # Parse to check if [tool.mypy] exists + with open(path, "rb") as f: + data = tomllib.load(f) + + # Check if mypy config already exists + if "tool" in data and "mypy" in data["tool"]: + return {"status": "exists"} + + # Mypy config to append + mypy_block = """ + +[tool.mypy] +python_version = "3.12" +strict = true +warn_unused_configs = true""" + + # Append to file + with open(path, "a") as f: + f.write(mypy_block) + + return {"status": "created"} diff --git a/theauditor/config_runtime.py b/theauditor/config_runtime.py new file mode 100644 index 0000000..a3eb732 --- /dev/null +++ b/theauditor/config_runtime.py @@ -0,0 +1,160 @@ +"""Runtime configuration for TheAuditor - centralized configuration management.""" + +from __future__ import annotations +import json +import os +from pathlib import Path +from typing import Any + + +DEFAULTS = { + "paths": { + # Core files + "manifest": "./.pf/manifest.json", + "db": "./.pf/repo_index.db", + "workset": "./.pf/workset.json", + + # Directories + "pf_dir": "./.pf", + "capsules_dir": "./.pf/capsules", + "docs_dir": "./.pf/docs", + "audit_dir": "./.pf/audit", + "context_docs_dir": "./.pf/context/docs", + "doc_capsules_dir": "./.pf/context/doc_capsules", + "graphs_dir": "./.pf/graphs", + "model_dir": "./.pf/ml", + "claude_dir": "./.claude", + + # Core artifacts + "journal": "./.pf/journal.ndjson", + "checkpoint": "./.pf/checkpoint.json", + "run_report": "./.pf/run_report.json", + "fce_json": "./.pf/raw/fce.json", + "ast_proofs_json": "./.pf/ast_proofs.json", + "ast_proofs_md": "./.pf/ast_proofs.md", + "ml_suggestions": "./.pf/insights/ml_suggestions.json", + "graphs_db": "./.pf/graphs.db", + "graph_analysis": "./.pf/graph_analysis.json", + "deps_json": "./.pf/deps.json", + "findings_json": "./.pf/findings.json", + "patterns_json": "./.pf/patterns.json", + "xgraph_json": "./.pf/xgraph.json", + "pattern_fce_json": "./.pf/pattern_fce.json", + "fix_suggestions_json": "./.pf/fix_suggestions.json", + "policy_yml": "./.pf/policy.yml", + }, + "limits": { + # File size limits + "max_file_size": 2 * 1024 * 1024, # 2 MiB + + # Chunking limits for extraction + "max_chunks_per_file": 3, # Maximum number of chunks per extracted file + "max_chunk_size": 56320, # Maximum size per chunk in bytes (55KB) + + # Batch processing + "default_batch_size": 200, + "evidence_batch_size": 100, + + # ML and analysis windows + "ml_window": 50, + "git_churn_window_days": 30, + + # Graph analysis + "max_graph_depth": 3, + "high_risk_threshold": 0.5, + "high_risk_limit": 10, + "graph_limit_nodes": 500, + }, + "timeouts": { + # Tool detection (quick checks) + "tool_detection": 5, + + # Network operations + "url_fetch": 10, + "venv_check": 30, + + # Build/test operations + "test_run": 60, + "venv_install": 120, + + # Analysis operations + "lint_timeout": 300, + "orchestrator_timeout": 300, + + # FCE and long operations + "fce_timeout": 600, + }, + "report": { + "max_lint_rows": 50, + "max_ast_rows": 50, + "max_snippet_lines": 12, + "max_snippet_chars": 800, + } +} + + +def load_runtime_config(root: str = ".") -> dict[str, Any]: + """ + Load runtime configuration from .pf/config.json and environment variables. + + Config priority (highest to lowest): + 1. Environment variables (THEAUDITOR_* prefixed) + 2. .pf/config.json file + 3. 
Built-in defaults + + Args: + root: Root directory to look for config file + + Returns: + Configuration dictionary with merged values + """ + # Start with deep copy of defaults + import copy + cfg = copy.deepcopy(DEFAULTS) + + # Try to load user config from .pf/config.json + path = Path(root) / ".pf" / "config.json" + try: + if path.exists(): + with open(path, "r", encoding="utf-8") as f: + user = json.load(f) + + # Merge each section if present + if isinstance(user, dict): + for section in ["paths", "limits", "timeouts", "report"]: + if section in user and isinstance(user[section], dict): + for key, value in user[section].items(): + # Validate type matches default + if key in cfg[section]: + if isinstance(value, type(cfg[section][key])): + cfg[section][key] = value + except (json.JSONDecodeError, IOError, OSError) as e: + print(f"[WARNING] Could not load config file from {path}: {e}") + print("[INFO] Continuing with default configuration") + # Continue with defaults - config file is optional + + # Environment variable overrides (flattened namespace) + # Format: THEAUDITOR_SECTION_KEY (e.g., THEAUDITOR_PATHS_MANIFEST) + for section in cfg: + for key in cfg[section]: + env_var = f"THEAUDITOR_{section.upper()}_{key.upper()}" + if env_var in os.environ: + value = os.environ[env_var] + try: + # Try to cast to the same type as the default + default_value = cfg[section][key] + if isinstance(default_value, int): + cfg[section][key] = int(value) + elif isinstance(default_value, float): + cfg[section][key] = float(value) + elif isinstance(default_value, list): + # Parse comma-separated values for lists + cfg[section][key] = [v.strip() for v in value.split(",")] + else: + cfg[section][key] = value + except (ValueError, AttributeError) as e: + print(f"[WARNING] Invalid value for environment variable {env_var}: '{value}' - {e}") + print(f"[INFO] Using default value: {cfg[section][key]}") + # Continue with default value - env vars are optional overrides + + return cfg \ No newline at end of file diff --git a/theauditor/correlations/__init__.py b/theauditor/correlations/__init__.py new file mode 100644 index 0000000..0ccac3b --- /dev/null +++ b/theauditor/correlations/__init__.py @@ -0,0 +1,5 @@ +"""Correlation rules for the Factual Correlation Engine.""" + +from .loader import CorrelationLoader, CorrelationRule + +__all__ = ["CorrelationLoader", "CorrelationRule"] \ No newline at end of file diff --git a/theauditor/correlations/loader.py b/theauditor/correlations/loader.py new file mode 100644 index 0000000..32cf378 --- /dev/null +++ b/theauditor/correlations/loader.py @@ -0,0 +1,237 @@ +"""Correlation rule loader for the Factual Correlation Engine.""" + +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional + +import yaml + + +@dataclass +class CorrelationRule: + """Represents a single correlation rule for factual co-occurrence detection.""" + + name: str + co_occurring_facts: List[Dict[str, str]] + description: Optional[str] = None + confidence: float = 0.8 + compiled_patterns: List[Dict[str, Any]] = field(default_factory=list, init=False, repr=False) + + def __post_init__(self): + """Compile regex patterns in co-occurring facts after initialization.""" + for fact in self.co_occurring_facts: + if 'tool' not in fact or 'pattern' not in fact: + raise ValueError(f"Invalid fact in rule '{self.name}': must contain 'tool' and 'pattern' keys") + + compiled_fact = { + 'tool': fact['tool'], + 'pattern': fact['pattern'] + } + + # Try to 
compile as regex, if it fails, treat as literal string + try: + compiled_fact['compiled_regex'] = re.compile(fact['pattern'], re.IGNORECASE) + compiled_fact['is_regex'] = True + except re.error: + # Not a valid regex, will be used as literal string match + compiled_fact['is_regex'] = False + + self.compiled_patterns.append(compiled_fact) + + def matches_finding(self, finding: Dict[str, Any], fact_index: int) -> bool: + """Check if a finding matches a specific fact pattern. + + Args: + finding: Dictionary containing finding data with 'tool' and 'rule' keys + fact_index: Index of the fact pattern to check + + Returns: + True if the finding matches the specified fact pattern + """ + if fact_index >= len(self.compiled_patterns): + return False + + fact = self.compiled_patterns[fact_index] + + # Check tool match + if finding.get('tool') != fact['tool']: + return False + + # Check pattern match against rule or message + if fact['is_regex']: + # Check against rule field and message field + rule_match = fact['compiled_regex'].search(finding.get('rule', '')) + message_match = fact['compiled_regex'].search(finding.get('message', '')) + return bool(rule_match or message_match) + else: + # Literal string match + return (fact['pattern'] in finding.get('rule', '') or + fact['pattern'] in finding.get('message', '')) + + +class CorrelationLoader: + """Loads and manages correlation rules from YAML files.""" + + def __init__(self, rules_dir: Optional[Path] = None): + """Initialize correlation loader. + + Args: + rules_dir: Directory containing correlation rule YAML files. + Defaults to theauditor/correlations/rules/ + """ + if rules_dir is None: + rules_dir = Path(__file__).parent / "rules" + self.rules_dir = Path(rules_dir) + self.rules: List[CorrelationRule] = [] + self._loaded = False + + def load_rules(self) -> List[CorrelationRule]: + """Load correlation rules from YAML files. + + Returns: + List of CorrelationRule objects. + + Raises: + FileNotFoundError: If the rules directory doesn't exist. + """ + if not self.rules_dir.exists(): + # Create directory if it doesn't exist, but return empty list + self.rules_dir.mkdir(parents=True, exist_ok=True) + self._loaded = True + return self.rules + + yaml_files = list(self.rules_dir.glob("*.yml")) + list(self.rules_dir.glob("*.yaml")) + + # Clear existing rules before loading + self.rules = [] + + for yaml_file in yaml_files: + try: + rules = self._load_yaml_file(yaml_file) + self.rules.extend(rules) + except Exception as e: + # Log warning but continue loading other files + print(f"Warning: Failed to load correlation rules from {yaml_file}: {e}") + + self._loaded = True + return self.rules + + def _load_yaml_file(self, file_path: Path) -> List[CorrelationRule]: + """Load correlation rules from a single YAML file. + + Args: + file_path: Path to YAML file. + + Returns: + List of CorrelationRule objects. + + Raises: + ValueError: If the file format is invalid. 
+ """ + with open(file_path, 'r', encoding='utf-8') as f: + data = yaml.safe_load(f) + + if not isinstance(data, dict): + raise ValueError(f"Invalid rule file format in {file_path}: expected dictionary at root") + + rules = [] + + # Support both single rule and multiple rules formats + if 'rules' in data: + # Multiple rules format + rule_list = data['rules'] + if not isinstance(rule_list, list): + raise ValueError(f"Invalid rule file format in {file_path}: 'rules' must be a list") + + for rule_data in rule_list: + try: + rule = self._parse_rule(rule_data) + rules.append(rule) + except (KeyError, ValueError) as e: + print(f"Warning: Skipping invalid rule in {file_path}: {e}") + + elif 'name' in data and 'co_occurring_facts' in data: + # Single rule format + try: + rule = self._parse_rule(data) + rules.append(rule) + except (KeyError, ValueError) as e: + print(f"Warning: Skipping invalid rule in {file_path}: {e}") + + else: + raise ValueError(f"Invalid rule file format in {file_path}: must contain 'rules' list or single rule with 'name' and 'co_occurring_facts'") + + return rules + + def _parse_rule(self, rule_data: Dict[str, Any]) -> CorrelationRule: + """Parse a single rule from dictionary data. + + Args: + rule_data: Dictionary containing rule data. + + Returns: + CorrelationRule object. + + Raises: + KeyError: If required fields are missing. + ValueError: If data format is invalid. + """ + if 'name' not in rule_data: + raise KeyError("Rule must have a 'name' field") + + if 'co_occurring_facts' not in rule_data: + raise KeyError("Rule must have a 'co_occurring_facts' field") + + if not isinstance(rule_data['co_occurring_facts'], list): + raise ValueError("'co_occurring_facts' must be a list") + + if len(rule_data['co_occurring_facts']) == 0: + raise ValueError("'co_occurring_facts' must not be empty") + + return CorrelationRule( + name=rule_data['name'], + co_occurring_facts=rule_data['co_occurring_facts'], + description=rule_data.get('description'), + confidence=rule_data.get('confidence', 0.8) + ) + + def get_all_rules(self) -> List[CorrelationRule]: + """Get all loaded correlation rules. + + Returns: + List of all loaded CorrelationRule objects. + """ + if not self._loaded: + self.load_rules() + + return self.rules + + def validate_rules(self) -> List[str]: + """Validate all loaded correlation rules. + + Returns: + List of validation error messages. + """ + if not self._loaded: + self.load_rules() + + errors = [] + + # Check for duplicate rule names + names = [rule.name for rule in self.rules] + for name in names: + if names.count(name) > 1: + errors.append(f"Duplicate rule name: {name}") + + # Validate each rule + for rule in self.rules: + # Check that each rule has at least 2 co-occurring facts + if len(rule.co_occurring_facts) < 2: + errors.append(f"Rule '{rule.name}' has fewer than 2 co-occurring facts") + + # Check confidence is between 0 and 1 + if not 0 <= rule.confidence <= 1: + errors.append(f"Rule '{rule.name}' has invalid confidence value: {rule.confidence}") + + return errors \ No newline at end of file diff --git a/theauditor/correlations/rules/angular_sanitization_cluster.yml b/theauditor/correlations/rules/angular_sanitization_cluster.yml new file mode 100644 index 0000000..8cab74f --- /dev/null +++ b/theauditor/correlations/rules/angular_sanitization_cluster.yml @@ -0,0 +1,10 @@ +name: "Angular Sanitization Bypass Factual Cluster" +description: "Multiple tools detected patterns consistent with XSS via sanitization bypass in Angular." 
+confidence: 0.95 +co_occurring_facts: + - tool: "framework_detector" + pattern: "angular" + - tool: "patterns" + pattern: "bypassSecurity" + - tool: "taint_analyzer" + pattern: "trust" \ No newline at end of file diff --git a/theauditor/correlations/rules/api_key_exposure_cluster.yml b/theauditor/correlations/rules/api_key_exposure_cluster.yml new file mode 100644 index 0000000..3fd5769 --- /dev/null +++ b/theauditor/correlations/rules/api_key_exposure_cluster.yml @@ -0,0 +1,10 @@ +name: "API Key Exposure Factual Cluster" +description: "Multiple tools detected patterns consistent with a hardcoded or exposed API key." +confidence: 0.95 +co_occurring_facts: + - tool: "patterns" + pattern: "api_key" + - tool: "ast" + pattern: "hardcoded" + - tool: "git" + pattern: "committed" \ No newline at end of file diff --git a/theauditor/correlations/rules/command_injection_cluster.yml b/theauditor/correlations/rules/command_injection_cluster.yml new file mode 100644 index 0000000..d00be54 --- /dev/null +++ b/theauditor/correlations/rules/command_injection_cluster.yml @@ -0,0 +1,10 @@ +name: "Command Injection Factual Cluster" +description: "Multiple tools detected patterns consistent with a Command Injection vulnerability." +confidence: 0.95 +co_occurring_facts: + - tool: "taint_analyzer" + pattern: "command" + - tool: "patterns" + pattern: "(exec|subprocess|shell)" + - tool: "lint" + pattern: "subprocess" \ No newline at end of file diff --git a/theauditor/correlations/rules/container_escape_cluster.yml b/theauditor/correlations/rules/container_escape_cluster.yml new file mode 100644 index 0000000..fa6f073 --- /dev/null +++ b/theauditor/correlations/rules/container_escape_cluster.yml @@ -0,0 +1,10 @@ +name: "Container Escape Factual Cluster" +description: "Multiple tools detected patterns consistent with a container escape vulnerability." +confidence: 0.90 +co_occurring_facts: + - tool: "deployment" + pattern: "privileged" + - tool: "patterns" + pattern: "docker" + - tool: "security" + pattern: "cap_sys_admin" \ No newline at end of file diff --git a/theauditor/correlations/rules/cors_misconfiguration_cluster.yml b/theauditor/correlations/rules/cors_misconfiguration_cluster.yml new file mode 100644 index 0000000..cf88d8f --- /dev/null +++ b/theauditor/correlations/rules/cors_misconfiguration_cluster.yml @@ -0,0 +1,10 @@ +name: "CORS Misconfiguration Factual Cluster" +description: "Multiple tools detected patterns consistent with a CORS misconfiguration." +confidence: 0.90 +co_occurring_facts: + - tool: "patterns" + pattern: "Access-Control" + - tool: "security" + pattern: "wildcard" + - tool: "framework_detector" + pattern: "cors" \ No newline at end of file diff --git a/theauditor/correlations/rules/deadlock_cluster.yml b/theauditor/correlations/rules/deadlock_cluster.yml new file mode 100644 index 0000000..f45cdb2 --- /dev/null +++ b/theauditor/correlations/rules/deadlock_cluster.yml @@ -0,0 +1,10 @@ +name: "Deadlock Factual Cluster" +description: "Multiple tools detected patterns consistent with a potential deadlock." 
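Each of these *_cluster.yml files uses the single-rule layout that _load_yaml_file accepts alongside the multi-rule "rules:" list. A minimal sketch of that layout, parsed the same way the loader does; the YAML content below is illustrative only and is not one of the shipped rules.

import yaml

doc = yaml.safe_load("""
name: "Example Factual Cluster"
description: "Illustrative only; not a shipped rule."
confidence: 0.90
co_occurring_facts:
  - tool: "patterns"
    pattern: "example"
  - tool: "lint"
    pattern: "example"
""")

# _load_yaml_file treats a root mapping with 'name' and 'co_occurring_facts'
# as the single-rule format; a root 'rules:' key is the multi-rule format.
assert isinstance(doc, dict)
assert "name" in doc and isinstance(doc["co_occurring_facts"], list)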
+confidence: 0.85 +co_occurring_facts: + - tool: "graph" + pattern: "mutex" + - tool: "patterns" + pattern: "lock" + - tool: "taint_analyzer" + pattern: "circular" \ No newline at end of file diff --git a/theauditor/correlations/rules/debug_enabled_cluster.yml b/theauditor/correlations/rules/debug_enabled_cluster.yml new file mode 100644 index 0000000..9cf0822 --- /dev/null +++ b/theauditor/correlations/rules/debug_enabled_cluster.yml @@ -0,0 +1,10 @@ +name: "Debug Mode Enabled Factual Cluster" +description: "Multiple tools detected patterns consistent with debug mode being enabled in a production environment." +confidence: 0.95 +co_occurring_facts: + - tool: "patterns" + pattern: "DEBUG=true" + - tool: "framework_detector" + pattern: "production" + - tool: "deployment" + pattern: "exposed" \ No newline at end of file diff --git a/theauditor/correlations/rules/express_bodyparser_cluster.yml b/theauditor/correlations/rules/express_bodyparser_cluster.yml new file mode 100644 index 0000000..49db22e --- /dev/null +++ b/theauditor/correlations/rules/express_bodyparser_cluster.yml @@ -0,0 +1,10 @@ +name: "Express Body-Parser Factual Cluster" +description: "Multiple tools detected patterns consistent with insecure body-parser configuration in Express." +confidence: 0.75 +co_occurring_facts: + - tool: "framework_detector" + pattern: "express" + - tool: "patterns" + pattern: "body-parser" + - tool: "security" + pattern: "no_limit" \ No newline at end of file diff --git a/theauditor/correlations/rules/infinite_loop_cluster.yml b/theauditor/correlations/rules/infinite_loop_cluster.yml new file mode 100644 index 0000000..e1b1d98 --- /dev/null +++ b/theauditor/correlations/rules/infinite_loop_cluster.yml @@ -0,0 +1,10 @@ +name: "Infinite Loop Factual Cluster" +description: "Multiple tools detected patterns consistent with a potential infinite loop." +confidence: 0.80 +co_occurring_facts: + - tool: "graph" + pattern: "cycle" + - tool: "patterns" + pattern: "while\\(true\\)" + - tool: "ast" + pattern: "no_break" \ No newline at end of file diff --git a/theauditor/correlations/rules/jwt_issues_cluster.yml b/theauditor/correlations/rules/jwt_issues_cluster.yml new file mode 100644 index 0000000..e44cc10 --- /dev/null +++ b/theauditor/correlations/rules/jwt_issues_cluster.yml @@ -0,0 +1,10 @@ +name: "JWT Issues Factual Cluster" +description: "Multiple tools detected patterns consistent with insecure JWT implementation." +confidence: 0.90 +co_occurring_facts: + - tool: "patterns" + pattern: "jwt" + - tool: "security" + pattern: "HS256" + - tool: "lint" + pattern: "jwt" \ No newline at end of file diff --git a/theauditor/correlations/rules/ldap_injection_cluster.yml b/theauditor/correlations/rules/ldap_injection_cluster.yml new file mode 100644 index 0000000..7a76245 --- /dev/null +++ b/theauditor/correlations/rules/ldap_injection_cluster.yml @@ -0,0 +1,10 @@ +name: "LDAP Injection Factual Cluster" +description: "Multiple tools detected patterns consistent with an LDAP Injection vulnerability." 
+confidence: 0.85 +co_occurring_facts: + - tool: "taint_analyzer" + pattern: "ldap" + - tool: "patterns" + pattern: "filter" + - tool: "lint" + pattern: "ldap" \ No newline at end of file diff --git a/theauditor/correlations/rules/memory_leak_cluster.yml b/theauditor/correlations/rules/memory_leak_cluster.yml new file mode 100644 index 0000000..145e421 --- /dev/null +++ b/theauditor/correlations/rules/memory_leak_cluster.yml @@ -0,0 +1,10 @@ +name: "Memory Leak Factual Cluster" +description: "Multiple tools detected patterns consistent with a potential memory leak." +confidence: 0.70 +co_occurring_facts: + - tool: "patterns" + pattern: "setInterval" + - tool: "graph" + pattern: "no_cleanup" + - tool: "lint" + pattern: "memory" \ No newline at end of file diff --git a/theauditor/correlations/rules/missing_auth_cluster.yml b/theauditor/correlations/rules/missing_auth_cluster.yml new file mode 100644 index 0000000..59c1411 --- /dev/null +++ b/theauditor/correlations/rules/missing_auth_cluster.yml @@ -0,0 +1,10 @@ +name: "Missing Authentication Factual Cluster" +description: "Multiple tools detected patterns consistent with a missing authentication control on a sensitive endpoint." +confidence: 0.80 +co_occurring_facts: + - tool: "patterns" + pattern: "public" + - tool: "framework_detector" + pattern: "no_auth" + - tool: "graph" + pattern: "exposed" \ No newline at end of file diff --git a/theauditor/correlations/rules/nosql_injection_cluster.yml b/theauditor/correlations/rules/nosql_injection_cluster.yml new file mode 100644 index 0000000..c7578ce --- /dev/null +++ b/theauditor/correlations/rules/nosql_injection_cluster.yml @@ -0,0 +1,10 @@ +name: "NoSQL Injection Factual Cluster" +description: "Multiple tools detected patterns consistent with a NoSQL Injection vulnerability." +confidence: 0.85 +co_occurring_facts: + - tool: "patterns" + pattern: "(mongodb|mongoose)" + - tool: "taint_analyzer" + pattern: "$where" + - tool: "lint" + pattern: "nosql" \ No newline at end of file diff --git a/theauditor/correlations/rules/path_traversal_cluster.yml b/theauditor/correlations/rules/path_traversal_cluster.yml new file mode 100644 index 0000000..34567b5 --- /dev/null +++ b/theauditor/correlations/rules/path_traversal_cluster.yml @@ -0,0 +1,10 @@ +name: "Path Traversal Factual Cluster" +description: "Multiple tools detected patterns consistent with a Path Traversal vulnerability." +confidence: 0.85 +co_occurring_facts: + - tool: "taint_analyzer" + pattern: "path" + - tool: "patterns" + pattern: "\\.\\./" + - tool: "lint" + pattern: "path" \ No newline at end of file diff --git a/theauditor/correlations/rules/pii_leak_cluster.yml b/theauditor/correlations/rules/pii_leak_cluster.yml new file mode 100644 index 0000000..87c43ee --- /dev/null +++ b/theauditor/correlations/rules/pii_leak_cluster.yml @@ -0,0 +1,10 @@ +name: "PII Leak Factual Cluster" +description: "Multiple tools detected patterns consistent with a potential leak of Personally Identifiable Information (PII)." 
+confidence: 0.80 +co_occurring_facts: + - tool: "patterns" + pattern: "(email|ssn)" + - tool: "taint_analyzer" + pattern: "response" + - tool: "framework_detector" + pattern: "no_mask" \ No newline at end of file diff --git a/theauditor/correlations/rules/race_condition_cluster.yml b/theauditor/correlations/rules/race_condition_cluster.yml new file mode 100644 index 0000000..ed88f3a --- /dev/null +++ b/theauditor/correlations/rules/race_condition_cluster.yml @@ -0,0 +1,10 @@ +name: "Race Condition Factual Cluster" +description: "Multiple tools detected patterns consistent with a potential race condition." +confidence: 0.75 +co_occurring_facts: + - tool: "graph" + pattern: "concurrent" + - tool: "patterns" + pattern: "async" + - tool: "taint_analyzer" + pattern: "shared_state" \ No newline at end of file diff --git a/theauditor/correlations/rules/rate_limit_missing_cluster.yml b/theauditor/correlations/rules/rate_limit_missing_cluster.yml new file mode 100644 index 0000000..d6c9451 --- /dev/null +++ b/theauditor/correlations/rules/rate_limit_missing_cluster.yml @@ -0,0 +1,10 @@ +name: "Missing Rate Limiting Factual Cluster" +description: "Multiple tools detected patterns consistent with a sensitive endpoint lacking rate limiting." +confidence: 0.85 +co_occurring_facts: + - tool: "patterns" + pattern: "endpoint" + - tool: "framework_detector" + pattern: "no_throttle" + - tool: "deployment" + pattern: "public" \ No newline at end of file diff --git a/theauditor/correlations/rules/react_dangerous_html_cluster.yml b/theauditor/correlations/rules/react_dangerous_html_cluster.yml new file mode 100644 index 0000000..fe8d517 --- /dev/null +++ b/theauditor/correlations/rules/react_dangerous_html_cluster.yml @@ -0,0 +1,10 @@ +name: "React dangerouslySetInnerHTML Factual Cluster" +description: "Multiple tools detected patterns consistent with XSS via dangerouslySetInnerHTML in React." 
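Several of these cluster patterns are regular expressions (for example "(exec|subprocess|shell)" or "while\\(true\\)") while others read as plain substrings. The loader does not need a flag to tell them apart: CorrelationRule.__post_init__ tries to compile every pattern and only falls back to literal matching if compilation fails. A standalone re-statement of that fallback, for illustration only:

import re

def compile_fact_pattern(pattern: str) -> dict:
    """Mirror of the regex-or-literal decision in CorrelationRule.__post_init__."""
    fact = {"pattern": pattern}
    try:
        fact["compiled_regex"] = re.compile(pattern, re.IGNORECASE)
        fact["is_regex"] = True
    except re.error:
        fact["is_regex"] = False  # invalid regex: fall back to substring matching
    return fact

print(compile_fact_pattern("(innerHTML|dangerouslySetInnerHTML)")["is_regex"])  # True
print(compile_fact_pattern("dangerously")["is_regex"])                          # True (plain words are valid regexes too)
print(compile_fact_pattern("foo[")["is_regex"])                                 # False: unterminated character set

In practice almost every plain substring is also a valid regular expression, so the literal branch is only reached for genuinely malformed patterns.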
+confidence: 0.95 +co_occurring_facts: + - tool: "framework_detector" + pattern: "react" + - tool: "patterns" + pattern: "dangerously" + - tool: "taint_analyzer" + pattern: "user" \ No newline at end of file diff --git a/theauditor/correlations/rules/refactoring.yaml b/theauditor/correlations/rules/refactoring.yaml new file mode 100644 index 0000000..485537f --- /dev/null +++ b/theauditor/correlations/rules/refactoring.yaml @@ -0,0 +1,277 @@ +# Refactoring Detection Correlation Rules +# These rules detect common refactoring issues and inconsistencies + +rules: + # ============================================================================ + # DATA MODEL REFACTORING PATTERNS + # ============================================================================ + + - name: "FIELD_MOVED_BETWEEN_MODELS" + description: "Field moved from one model to another but old references remain" + co_occurring_facts: + - tool: "grep" + pattern: "removeColumn.*('products'|\"products\")" + - tool: "grep" + pattern: "product\\.(unit_price|retail_price|wholesale_price|sku|inventory_type)" + confidence: 0.95 + + - name: "PRODUCT_VARIANT_REFACTOR" + description: "Product fields moved to ProductVariant but frontend still uses old structure" + co_occurring_facts: + - tool: "grep" + pattern: "ProductVariant.*retail_price.*Sequelize" + - tool: "grep" + pattern: "product\\.unit_price|product\\.retail_price" + confidence: 0.92 + + - name: "SKU_FIELD_MIGRATION" + description: "SKU moved from Product to ProductVariant" + co_occurring_facts: + - tool: "grep" + pattern: "ProductVariant.*sku.*unique.*true" + - tool: "grep" + pattern: "product\\.sku|WHERE.*products\\.sku" + confidence: 0.94 + + # ============================================================================ + # FOREIGN KEY REFACTORING + # ============================================================================ + + - name: "ORDER_ITEMS_WRONG_FK" + description: "Order items using product_id instead of product_variant_id" + co_occurring_facts: + - tool: "grep" + pattern: "order_items.*product_variant_id.*fkey" + - tool: "grep" + pattern: "order_items.*product_id(?!_variant)" + confidence: 0.96 + + - name: "TRANSFER_ITEMS_WRONG_FK" + description: "Transfer items referencing wrong product foreign key" + co_occurring_facts: + - tool: "grep" + pattern: "transfer_items.*product_variant_id" + - tool: "grep" + pattern: "transfer.*product_id(?!_variant)" + confidence: 0.93 + + - name: "INVENTORY_FK_MISMATCH" + description: "Inventory table has both product_id and product_variant_id" + co_occurring_facts: + - tool: "grep" + pattern: "inventory.*product_variant_id.*NULL" + - tool: "grep" + pattern: "inventory.*product_id.*NOT NULL" + confidence: 0.88 + + # ============================================================================ + # API CONTRACT CHANGES + # ============================================================================ + + - name: "API_ENDPOINT_REMOVED" + description: "Frontend calling API endpoints that no longer exist" + co_occurring_facts: + - tool: "grep" + pattern: "/api/products/.*/price|/api/products/.*/sku" + - tool: "grep" + pattern: "router\\.(get|post).*'/variants'" + confidence: 0.90 + + - name: "API_RESPONSE_STRUCTURE_CHANGED" + description: "API response structure changed but frontend expects old format" + co_occurring_facts: + - tool: "grep" + pattern: "res\\.json.*variants.*product" + - tool: "grep" + pattern: "response\\.data\\.product\\.price" + confidence: 0.87 + + - name: "GRAPHQL_SCHEMA_MISMATCH" + description: "GraphQL schema doesn't 
match model structure" + co_occurring_facts: + - tool: "grep" + pattern: "type Product.*price.*Float" + - tool: "grep" + pattern: "Product\\.init.*!.*price" + confidence: 0.85 + + # ============================================================================ + # FRONTEND-BACKEND MISMATCHES + # ============================================================================ + + - name: "TYPESCRIPT_INTERFACE_OUTDATED" + description: "TypeScript interfaces don't match backend models" + co_occurring_facts: + - tool: "grep" + pattern: "interface.*Product.*unit_price.*number" + - tool: "grep" + pattern: "Product\\.init.*!.*unit_price" + confidence: 0.96 + + - name: "FRONTEND_NESTED_STRUCTURE" + description: "Frontend expects nested relationships that backend doesn't provide" + co_occurring_facts: + - tool: "grep" + pattern: "product_variant\\.product\\.(name|brand)" + - tool: "grep" + pattern: "ProductVariant.*belongsTo.*Product" + confidence: 0.91 + + - name: "CART_WRONG_ID_FIELD" + description: "Shopping cart using product_id instead of product_variant_id" + co_occurring_facts: + - tool: "grep" + pattern: "OrderItem.*product_variant_id.*required" + - tool: "grep" + pattern: "addToCart.*product_id|cart.*product_id" + confidence: 0.93 + + # ============================================================================ + # MIGRATION PATTERNS + # ============================================================================ + + - name: "INCOMPLETE_MIGRATION" + description: "Database migration incomplete - old column references remain" + co_occurring_facts: + - tool: "grep" + pattern: "removeColumn|dropColumn" + - tool: "grep" + pattern: "SELECT.*FROM.*WHERE.*{removed_column}" + confidence: 0.89 + + - name: "MIGRATION_DATA_LOSS" + description: "Migration drops column without data migration" + co_occurring_facts: + - tool: "grep" + pattern: "removeColumn.*CASCADE|dropColumn.*CASCADE" + - tool: "grep" + pattern: "!.*UPDATE.*SET.*before.*removeColumn" + confidence: 0.86 + + - name: "ENUM_TYPE_CHANGED" + description: "ENUM values changed but code still uses old values" + co_occurring_facts: + - tool: "grep" + pattern: "DROP TYPE.*enum_products" + - tool: "grep" + pattern: "inventory_type.*=.*'both'|inventory_type.*weight|unit|both" + confidence: 0.84 + + # ============================================================================ + # AUTHORIZATION CHANGES + # ============================================================================ + + - name: "MISSING_AUTH_MIDDLEWARE" + description: "New routes missing authentication/authorization" + co_occurring_facts: + - tool: "grep" + pattern: "router\\.(post|put|delete).*variant" + - tool: "grep" + pattern: "!.*requireAdmin.*productVariant\\.routes" + confidence: 0.92 + + - name: "PERMISSION_MODEL_CHANGED" + description: "Permission model changed but checks not updated" + co_occurring_facts: + - tool: "grep" + pattern: "role.*admin|worker" + - tool: "grep" + pattern: "req\\.user\\.permissions|can\\(" + confidence: 0.80 + + # ============================================================================ + # VALIDATION CHANGES + # ============================================================================ + + - name: "VALIDATION_SCHEMA_OUTDATED" + description: "Joi/Yup validation schema doesn't match model" + co_occurring_facts: + - tool: "grep" + pattern: "Joi\\.object.*product.*unit_price" + - tool: "grep" + pattern: "!.*Product.*unit_price" + confidence: 0.88 + + - name: "REQUIRED_FIELD_MISMATCH" + description: "Required fields in validation don't match database 
constraints" + co_occurring_facts: + - tool: "grep" + pattern: "allowNull.*false.*sku" + - tool: "grep" + pattern: "sku.*Joi\\..*optional\\(\\)" + confidence: 0.85 + + # ============================================================================ + # SERVICE LAYER ISSUES + # ============================================================================ + + - name: "SERVICE_METHOD_SIGNATURE_CHANGED" + description: "Service method signature changed but callers not updated" + co_occurring_facts: + - tool: "grep" + pattern: "async.*create.*product.*variant" + - tool: "grep" + pattern: "productService\\.create\\(.*price" + confidence: 0.87 + + - name: "REPOSITORY_PATTERN_MISMATCH" + description: "Repository methods don't match new model structure" + co_occurring_facts: + - tool: "grep" + pattern: "findOne.*where.*sku" + - tool: "grep" + pattern: "ProductVariant.*sku" + confidence: 0.83 + + # ============================================================================ + # TESTING ISSUES + # ============================================================================ + + - name: "TEST_FIXTURES_OUTDATED" + description: "Test fixtures using old model structure" + co_occurring_facts: + - tool: "grep" + pattern: "test.*product.*unit_price" + - tool: "grep" + pattern: "ProductVariant.*retail_price" + confidence: 0.82 + + - name: "MOCK_DATA_MISMATCH" + description: "Mock data doesn't match actual model structure" + co_occurring_facts: + - tool: "grep" + pattern: "mock.*product.*price" + - tool: "grep" + pattern: "!.*Product.*price" + confidence: 0.79 + + # ============================================================================ + # COMMON REFACTORING ANTI-PATTERNS + # ============================================================================ + + - name: "EXTRACT_VARIANT_PATTERN" + description: "Classic Extract Variant refactoring with incomplete updates" + co_occurring_facts: + - tool: "grep" + pattern: "createTable.*variants" + - tool: "grep" + pattern: "product\\.(price|sku|inventory)" + confidence: 0.94 + + - name: "NORMALIZE_HIERARCHY" + description: "Hierarchy normalization with missing relationship updates" + co_occurring_facts: + - tool: "grep" + pattern: "belongsTo.*hasMany.*through" + - tool: "grep" + pattern: "JOIN.*old_table" + confidence: 0.86 + + - name: "SPLIT_TABLE_INCOMPLETE" + description: "Table split into multiple tables but queries not updated" + co_occurring_facts: + - tool: "grep" + pattern: "createTable.*_details|_metadata" + - tool: "grep" + pattern: "SELECT.*FROM.*{original_table}.*WHERE" + confidence: 0.88 \ No newline at end of file diff --git a/theauditor/correlations/rules/sensitive_logs_cluster.yml b/theauditor/correlations/rules/sensitive_logs_cluster.yml new file mode 100644 index 0000000..5440be2 --- /dev/null +++ b/theauditor/correlations/rules/sensitive_logs_cluster.yml @@ -0,0 +1,10 @@ +name: "Sensitive Data in Logs Factual Cluster" +description: "Multiple tools detected patterns consistent with sensitive data being written to logs." 
+confidence: 0.85 +co_occurring_facts: + - tool: "patterns" + pattern: "console.log" + - tool: "taint_analyzer" + pattern: "password" + - tool: "lint" + pattern: "logging" \ No newline at end of file diff --git a/theauditor/correlations/rules/session_fixation_cluster.yml b/theauditor/correlations/rules/session_fixation_cluster.yml new file mode 100644 index 0000000..8d8b7ad --- /dev/null +++ b/theauditor/correlations/rules/session_fixation_cluster.yml @@ -0,0 +1,10 @@ +name: "Session Fixation Factual Cluster" +description: "Multiple tools detected patterns consistent with a Session Fixation vulnerability." +confidence: 0.75 +co_occurring_facts: + - tool: "patterns" + pattern: "session" + - tool: "taint_analyzer" + pattern: "user_controlled" + - tool: "framework_detector" + pattern: "session" \ No newline at end of file diff --git a/theauditor/correlations/rules/source_map_exposure_cluster.yml b/theauditor/correlations/rules/source_map_exposure_cluster.yml new file mode 100644 index 0000000..cc61b5f --- /dev/null +++ b/theauditor/correlations/rules/source_map_exposure_cluster.yml @@ -0,0 +1,10 @@ +name: "Source Map Exposure Factual Cluster" +description: "Multiple tools detected patterns consistent with exposed source maps in a production environment." +confidence: 0.95 +co_occurring_facts: + - tool: "build" + pattern: "sourcemap" + - tool: "deployment" + pattern: "production" + - tool: "patterns" + pattern: "\\.map" \ No newline at end of file diff --git a/theauditor/correlations/rules/ssrf_cluster.yml b/theauditor/correlations/rules/ssrf_cluster.yml new file mode 100644 index 0000000..c56015d --- /dev/null +++ b/theauditor/correlations/rules/ssrf_cluster.yml @@ -0,0 +1,10 @@ +name: "SSRF Factual Cluster" +description: "Multiple tools detected patterns consistent with a Server-Side Request Forgery (SSRF) vulnerability." +confidence: 0.80 +co_occurring_facts: + - tool: "taint_analyzer" + pattern: "url" + - tool: "patterns" + pattern: "(request|fetch|urllib)" + - tool: "lint" + pattern: "urllib" \ No newline at end of file diff --git a/theauditor/correlations/rules/template_injection_cluster.yml b/theauditor/correlations/rules/template_injection_cluster.yml new file mode 100644 index 0000000..7c67304 --- /dev/null +++ b/theauditor/correlations/rules/template_injection_cluster.yml @@ -0,0 +1,10 @@ +name: "Template Injection Factual Cluster" +description: "Multiple tools detected patterns consistent with a Server-Side Template Injection (SSTI) vulnerability." 
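Taken together, these clusters only fire when several independent tools report overlapping facts. The actual aggregation lives in the Factual Correlation Engine (theauditor/fce.py, which is not part of this section), so the loop below is only a hedged sketch of one plausible way such rules could be applied to a flat findings list, not a description of the engine's real logic. It assumes TheAuditor is installed so the bundled rules directory resolves.

from theauditor.correlations.loader import CorrelationLoader

loader = CorrelationLoader()  # defaults to theauditor/correlations/rules/
findings = [
    {"tool": "taint_analyzer", "rule": "xss-sink", "message": "user input reaches innerHTML"},
    {"tool": "patterns", "rule": "innerHTML-assignment", "message": "element.innerHTML = ..."},
    {"tool": "lint", "rule": "no-unsanitized/xss", "message": "possible XSS"},
]

for rule in loader.load_rules():
    # Treat a rule as satisfied only if every fact index matches at least one finding.
    every_fact_matched = all(
        any(rule.matches_finding(f, i) for f in findings)
        for i in range(len(rule.co_occurring_facts))
    )
    if every_fact_matched:
        print(f"cluster: {rule.name} (confidence {rule.confidence})")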
+confidence: 0.80 +co_occurring_facts: + - tool: "taint_analyzer" + pattern: "template" + - tool: "patterns" + pattern: "eval" + - tool: "framework_detector" + pattern: "(jinja|blade|pug)" \ No newline at end of file diff --git a/theauditor/correlations/rules/test_sql_injection.yml b/theauditor/correlations/rules/test_sql_injection.yml new file mode 100644 index 0000000..7b33598 --- /dev/null +++ b/theauditor/correlations/rules/test_sql_injection.yml @@ -0,0 +1,10 @@ +name: "Potential SQL Injection Factual Cluster" +description: "Multiple tools detected patterns consistent with SQL injection vulnerability" +confidence: 0.85 +co_occurring_facts: + - tool: "taint_analyzer" + pattern: "sql" + - tool: "patterns" + pattern: "string.*query" + - tool: "lint" + pattern: "sql" \ No newline at end of file diff --git a/theauditor/correlations/rules/vue_v_html_cluster.yml b/theauditor/correlations/rules/vue_v_html_cluster.yml new file mode 100644 index 0000000..cc8bf18 --- /dev/null +++ b/theauditor/correlations/rules/vue_v_html_cluster.yml @@ -0,0 +1,10 @@ +name: "Vue v-html Factual Cluster" +description: "Multiple tools detected patterns consistent with XSS via v-html in Vue." +confidence: 0.95 +co_occurring_facts: + - tool: "framework_detector" + pattern: "vue" + - tool: "patterns" + pattern: "v-html" + - tool: "taint_analyzer" + pattern: "user_input" \ No newline at end of file diff --git a/theauditor/correlations/rules/weak_auth_cluster.yml b/theauditor/correlations/rules/weak_auth_cluster.yml new file mode 100644 index 0000000..2ec8a7e --- /dev/null +++ b/theauditor/correlations/rules/weak_auth_cluster.yml @@ -0,0 +1,10 @@ +name: "Weak Authentication Factual Cluster" +description: "Multiple tools detected patterns consistent with weak or deprecated authentication mechanisms." +confidence: 0.85 +co_occurring_facts: + - tool: "patterns" + pattern: "(md5|sha1)" + - tool: "security" + pattern: "password" + - tool: "lint" + pattern: "deprecated" \ No newline at end of file diff --git a/theauditor/correlations/rules/xss_cluster.yml b/theauditor/correlations/rules/xss_cluster.yml new file mode 100644 index 0000000..bcdfd25 --- /dev/null +++ b/theauditor/correlations/rules/xss_cluster.yml @@ -0,0 +1,10 @@ +name: "XSS Factual Cluster" +description: "Multiple tools detected patterns consistent with a Cross-Site Scripting (XSS) vulnerability." +confidence: 0.90 +co_occurring_facts: + - tool: "taint_analyzer" + pattern: "xss" + - tool: "patterns" + pattern: "(innerHTML|dangerouslySetInnerHTML)" + - tool: "lint" + pattern: "xss" \ No newline at end of file diff --git a/theauditor/correlations/rules/xxe_cluster.yml b/theauditor/correlations/rules/xxe_cluster.yml new file mode 100644 index 0000000..7db0c3f --- /dev/null +++ b/theauditor/correlations/rules/xxe_cluster.yml @@ -0,0 +1,10 @@ +name: "XXE Factual Cluster" +description: "Multiple tools detected patterns consistent with an XML External Entity (XXE) vulnerability." 
+confidence: 0.80 +co_occurring_facts: + - tool: "patterns" + pattern: "xml" + - tool: "taint_analyzer" + pattern: "parse" + - tool: "framework_detector" + pattern: "xml_parser" \ No newline at end of file diff --git a/theauditor/deps.py b/theauditor/deps.py new file mode 100644 index 0000000..9fe1d84 --- /dev/null +++ b/theauditor/deps.py @@ -0,0 +1,1109 @@ +"""Dependency parser for multiple ecosystems.""" + +import glob +import http.client +import json +import platform +import re +import shutil +import time +import urllib.error +import yaml +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional +from theauditor.security import sanitize_path, sanitize_url_component, validate_package_name, SecurityError + +# Detect if running on Windows for character encoding +IS_WINDOWS = platform.system() == "Windows" + +# Rate limiting configuration - optimized for minimal runtime +# Based on actual API rate limits and industry standards +RATE_LIMIT_NPM = 0.1 # npm registry: 600 req/min (well under any limit) +RATE_LIMIT_PYPI = 0.2 # PyPI: 300 req/min (safe margin) +RATE_LIMIT_DOCKER = 0.2 # Docker Hub: 300 req/min for tag checks +RATE_LIMIT_BACKOFF = 15 # Backoff on 429/disconnect (15s gives APIs time to reset) + + +def parse_dependencies(root_path: str = ".") -> List[Dict[str, Any]]: + """ + Parse dependencies from various package managers. + + Returns list of dependency objects with structure: + { + "name": str, + "version": str, + "manager": "npm"|"py", + "files": [paths that import it], + "source": "package.json|pyproject.toml|requirements.txt" + } + """ + import os + root = Path(root_path) + deps = [] + + # Debug mode + debug = os.environ.get("THEAUDITOR_DEBUG") + + # Parse Node dependencies + try: + package_json = sanitize_path("package.json", root_path) + if package_json.exists(): + if debug: + print(f"Debug: Found {package_json}") + deps.extend(_parse_package_json(package_json)) + except SecurityError as e: + if debug: + print(f"Debug: Security error checking package.json: {e}") + + # Parse Python dependencies + try: + pyproject = sanitize_path("pyproject.toml", root_path) + if pyproject.exists(): + if debug: + print(f"Debug: Found {pyproject}") + deps.extend(_parse_pyproject_toml(pyproject)) + except SecurityError as e: + if debug: + print(f"Debug: Security error checking pyproject.toml: {e}") + + # Parse requirements files + req_files = list(root.glob("requirements*.txt")) + if debug and req_files: + print(f"Debug: Found requirements files: {req_files}") + for req_file in req_files: + try: + # Validate the path is within project root + safe_req_file = sanitize_path(str(req_file), root_path) + deps.extend(_parse_requirements_txt(safe_req_file)) + except SecurityError as e: + if debug: + print(f"Debug: Security error with {req_file}: {e}") + + # Parse Docker Compose files + docker_compose_files = list(root.glob("docker-compose*.yml")) + list(root.glob("docker-compose*.yaml")) + if debug and docker_compose_files: + print(f"Debug: Found Docker Compose files: {docker_compose_files}") + for compose_file in docker_compose_files: + try: + safe_compose_file = sanitize_path(str(compose_file), root_path) + deps.extend(_parse_docker_compose(safe_compose_file)) + except SecurityError as e: + if debug: + print(f"Debug: Security error with {compose_file}: {e}") + + # Parse Dockerfiles + dockerfiles = list(root.glob("**/Dockerfile")) + if debug and dockerfiles: + print(f"Debug: Found Dockerfiles: {dockerfiles}") + for dockerfile in dockerfiles: + try: + 
safe_dockerfile = sanitize_path(str(dockerfile), root_path) + deps.extend(_parse_dockerfile(safe_dockerfile)) + except SecurityError as e: + if debug: + print(f"Debug: Security error with {dockerfile}: {e}") + + if debug: + print(f"Debug: Total dependencies found: {len(deps)}") + + return deps + + +def _parse_package_json(path: Path) -> List[Dict[str, Any]]: + """Parse dependencies from package.json, with monorepo support.""" + deps = [] + processed_packages = set() # Track processed packages to avoid duplicates + + def parse_single_package(pkg_path: Path, workspace_path: str = "package.json") -> List[Dict[str, Any]]: + """Parse a single package.json file.""" + local_deps = [] + try: + with open(pkg_path, encoding="utf-8") as f: + data = json.load(f) + + # Combine dependencies and devDependencies + all_deps = {} + if "dependencies" in data: + all_deps.update(data["dependencies"]) + if "devDependencies" in data: + all_deps.update(data["devDependencies"]) + + for name, version_spec in all_deps.items(): + # Clean version spec (remove ^, ~, >=, etc.) + version = _clean_version(version_spec) + local_deps.append({ + "name": name, + "version": version, + "manager": "npm", + "files": [], # Will be populated by workset scan + "source": "package.json", + "workspace_package": workspace_path # Track which package.json this came from + }) + except (json.JSONDecodeError, KeyError) as e: + # Log but don't fail - package.json might be malformed + print(f"Warning: Could not parse {pkg_path}: {e}") + + return local_deps + + # Parse the root package.json first + root_dir = path.parent + deps.extend(parse_single_package(path, "package.json")) + processed_packages.add(str(path.resolve())) + + # Check for monorepo workspaces + try: + with open(path, encoding="utf-8") as f: + data = json.load(f) + + # Check for workspaces field (Yarn/npm workspaces) + workspaces = data.get("workspaces", []) + + # Handle different workspace formats + if isinstance(workspaces, dict): + # npm 7+ format: {"packages": ["packages/*"]} + workspaces = workspaces.get("packages", []) + + if workspaces and isinstance(workspaces, list): + # This is a monorepo - expand workspace patterns + for pattern in workspaces: + # Convert workspace pattern to absolute path pattern + abs_pattern = str(root_dir / pattern) + + # Handle glob patterns like "packages/*" or "apps/**" + if "*" in abs_pattern: + # Use glob to find matching directories + matched_paths = glob.glob(abs_pattern) + + for matched_path in matched_paths: + matched_dir = Path(matched_path) + if matched_dir.is_dir(): + # Look for package.json in this directory + workspace_pkg = matched_dir / "package.json" + if workspace_pkg.exists(): + # Skip if already processed + if str(workspace_pkg.resolve()) in processed_packages: + continue + + # Calculate relative path for workspace_package field + try: + rel_path = workspace_pkg.relative_to(root_dir) + workspace_path = str(rel_path).replace("\\", "/") + except ValueError: + # If relative path fails, use absolute path + workspace_path = str(workspace_pkg) + + # Parse this workspace package + workspace_deps = parse_single_package(workspace_pkg, workspace_path) + deps.extend(workspace_deps) + processed_packages.add(str(workspace_pkg.resolve())) + else: + # Direct path without glob + workspace_dir = root_dir / pattern + if workspace_dir.is_dir(): + workspace_pkg = workspace_dir / "package.json" + if workspace_pkg.exists(): + # Skip if already processed + if str(workspace_pkg.resolve()) in processed_packages: + continue + + # Calculate relative path 
for workspace_package field + try: + rel_path = workspace_pkg.relative_to(root_dir) + workspace_path = str(rel_path).replace("\\", "/") + except ValueError: + workspace_path = str(workspace_pkg) + + # Parse this workspace package + workspace_deps = parse_single_package(workspace_pkg, workspace_path) + deps.extend(workspace_deps) + processed_packages.add(str(workspace_pkg.resolve())) + + # Also check for Lerna configuration (lerna.json) + lerna_json = root_dir / "lerna.json" + if lerna_json.exists(): + try: + with open(lerna_json, encoding="utf-8") as f: + lerna_data = json.load(f) + + lerna_packages = lerna_data.get("packages", []) + for pattern in lerna_packages: + abs_pattern = str(root_dir / pattern) + if "*" in abs_pattern: + matched_paths = glob.glob(abs_pattern) + for matched_path in matched_paths: + matched_dir = Path(matched_path) + if matched_dir.is_dir(): + workspace_pkg = matched_dir / "package.json" + if workspace_pkg.exists() and str(workspace_pkg.resolve()) not in processed_packages: + try: + rel_path = workspace_pkg.relative_to(root_dir) + workspace_path = str(rel_path).replace("\\", "/") + except ValueError: + workspace_path = str(workspace_pkg) + + workspace_deps = parse_single_package(workspace_pkg, workspace_path) + deps.extend(workspace_deps) + processed_packages.add(str(workspace_pkg.resolve())) + except (json.JSONDecodeError, KeyError): + # Lerna.json parsing failed, continue without it + pass + + # Check for pnpm-workspace.yaml + pnpm_workspace = root_dir / "pnpm-workspace.yaml" + if pnpm_workspace.exists(): + try: + with open(pnpm_workspace, encoding="utf-8") as f: + pnpm_data = yaml.safe_load(f) + + pnpm_packages = pnpm_data.get("packages", []) + for pattern in pnpm_packages: + abs_pattern = str(root_dir / pattern) + if "*" in abs_pattern: + matched_paths = glob.glob(abs_pattern) + for matched_path in matched_paths: + matched_dir = Path(matched_path) + if matched_dir.is_dir(): + workspace_pkg = matched_dir / "package.json" + if workspace_pkg.exists() and str(workspace_pkg.resolve()) not in processed_packages: + try: + rel_path = workspace_pkg.relative_to(root_dir) + workspace_path = str(rel_path).replace("\\", "/") + except ValueError: + workspace_path = str(workspace_pkg) + + workspace_deps = parse_single_package(workspace_pkg, workspace_path) + deps.extend(workspace_deps) + processed_packages.add(str(workspace_pkg.resolve())) + except (yaml.YAMLError, KeyError): + # pnpm-workspace.yaml parsing failed, continue without it + pass + + except (json.JSONDecodeError, KeyError) as e: + # Root package.json parsing for workspaces failed, but we already have root deps + pass + + return deps + + +def _parse_pyproject_toml(path: Path) -> List[Dict[str, Any]]: + """Parse dependencies from pyproject.toml.""" + deps = [] + try: + import tomllib + except ImportError: + # Python < 3.11 + try: + import tomli as tomllib + except ImportError: + # Can't parse TOML without library + print(f"Warning: Cannot parse {path} - tomllib not available") + return deps + + try: + with open(path, "rb") as f: + data = tomllib.load(f) + + # Get project dependencies + project_deps = data.get("project", {}).get("dependencies", []) + for dep_spec in project_deps: + name, version = _parse_python_dep_spec(dep_spec) + if name: + deps.append({ + "name": name, + "version": version or "latest", + "manager": "py", + "files": [], + "source": "pyproject.toml" + }) + + # Also check optional dependencies + optional = data.get("project", {}).get("optional-dependencies", {}) + for group_deps in optional.values(): 
+ for dep_spec in group_deps: + name, version = _parse_python_dep_spec(dep_spec) + if name: + deps.append({ + "name": name, + "version": version or "latest", + "manager": "py", + "files": [], + "source": "pyproject.toml" + }) + except Exception as e: + print(f"Warning: Could not parse {path}: {e}") + + return deps + + +def _parse_requirements_txt(path: Path) -> List[Dict[str, Any]]: + """Parse dependencies from requirements.txt.""" + deps = [] + try: + with open(path, encoding="utf-8") as f: + for line in f: + line = line.strip() + # Skip comments and empty lines + if not line or line.startswith("#"): + continue + # Skip special directives + if line.startswith("-"): + continue + + # Strip inline comments and trailing whitespace + if "#" in line: + line = line.split("#")[0].strip() + + name, version = _parse_python_dep_spec(line) + if name: + deps.append({ + "name": name, + "version": version or "latest", + "manager": "py", + "files": [], + "source": path.name + }) + except Exception as e: + print(f"Warning: Could not parse {path}: {e}") + + return deps + + +def _parse_python_dep_spec(spec: str) -> tuple[str, Optional[str]]: + """ + Parse a Python dependency specification. + Returns (name, version) tuple. + """ + # Handle various formats: + # package==1.2.3 + # package>=1.2.3 + # package~=1.2.3 + # package[extra]==1.2.3 + # package @ git+https://... + + # Remove extras + spec = re.sub(r'\[.*?\]', '', spec) + + # Handle git URLs + if "@" in spec and ("git+" in spec or "https://" in spec): + name = spec.split("@")[0].strip() + return (name, "git") + + # Parse version specs (allow dots, underscores, hyphens in package names) + match = re.match(r'^([a-zA-Z0-9._-]+)\s*([><=~!]+)\s*(.+)$', spec) + if match: + name, op, version = match.groups() + # For pinned versions, use exact version + if op == "==": + return (name, version) + # For other operators, use the specified version as hint + return (name, version) + + # No version specified + return (spec.strip(), None) + + +def _clean_version(version_spec: str) -> str: + """ + Clean version specification to get actual version. 
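The behaviour of _parse_python_dep_spec is easiest to pin down with a few doctest-style examples. These assume TheAuditor is importable; the package names and versions are made up for illustration.

from theauditor.deps import _parse_python_dep_spec

print(_parse_python_dep_spec("requests==2.31.0"))        # ('requests', '2.31.0')
print(_parse_python_dep_spec("uvicorn[standard]>=0.23")) # ('uvicorn', '0.23')  - extras stripped
print(_parse_python_dep_spec("mypkg @ git+https://example.com/repo.git"))  # ('mypkg', 'git')
print(_parse_python_dep_spec("plainpackage"))            # ('plainpackage', None) - no pin given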
+ ^1.2.3 -> 1.2.3 + ~1.2.3 -> 1.2.3 + >=1.2.3 -> 1.2.3 + """ + # Remove common prefixes + version = re.sub(r'^[~^>=<]+', '', version_spec) + # Handle ranges (use first version) + if " " in version: + version = version.split()[0] + return version.strip() + + +def _parse_docker_compose(path: Path) -> List[Dict[str, Any]]: + """Parse Docker base images from docker-compose.yml files.""" + deps = [] + try: + with open(path, encoding="utf-8") as f: + data = yaml.safe_load(f) + + # Check if services key exists + if not data or "services" not in data: + return deps + + # Iterate through services + for service_name, service_config in data["services"].items(): + if not isinstance(service_config, dict): + continue + + # Extract image if present + if "image" in service_config: + image_spec = service_config["image"] + # Parse image:tag format + if ":" in image_spec: + name, tag = image_spec.rsplit(":", 1) + else: + name = image_spec + tag = "latest" + + # Handle registry prefixes (e.g., docker.io/library/postgres) + if "/" in name: + # Take the last part as the image name + name_parts = name.split("/") + if len(name_parts) >= 2: + # If it's library/image, use just image + if name_parts[-2] == "library": + name = name_parts[-1] + else: + # Keep org/image format + name = "/".join(name_parts[-2:]) + + deps.append({ + "name": name, + "version": tag, + "manager": "docker", + "files": [], + "source": path.name + }) + except (yaml.YAMLError, KeyError, AttributeError) as e: + print(f"Warning: Could not parse {path}: {e}") + + return deps + + +def _parse_dockerfile(path: Path) -> List[Dict[str, Any]]: + """Parse Docker base images from Dockerfile.""" + deps = [] + try: + with open(path, encoding="utf-8") as f: + for line in f: + line = line.strip() + # Look for FROM instructions + if line.upper().startswith("FROM "): + # Extract image spec after FROM + image_spec = line[5:].strip() + + # Handle multi-stage builds (FROM image AS stage) + if " AS " in image_spec.upper(): + image_spec = image_spec.split(" AS ")[0].strip() + elif " as " in image_spec: + image_spec = image_spec.split(" as ")[0].strip() + + # Skip scratch and build stages + if image_spec.lower() in ["scratch", "builder"]: + continue + + # Parse image:tag format + if ":" in image_spec: + name, tag = image_spec.rsplit(":", 1) + else: + name = image_spec + tag = "latest" + + # Handle registry prefixes + if "/" in name: + name_parts = name.split("/") + if len(name_parts) >= 2: + if name_parts[-2] == "library": + name = name_parts[-1] + else: + name = "/".join(name_parts[-2:]) + + deps.append({ + "name": name, + "version": tag, + "manager": "docker", + "files": [], + "source": str(path.relative_to(Path.cwd())) + }) + except Exception as e: + print(f"Warning: Could not parse {path}: {e}") + + return deps + + +def write_deps_json(deps: List[Dict[str, Any]], output_path: str = "./.pf/deps.json") -> None: + """Write dependencies to JSON file.""" + try: + output = sanitize_path(output_path, ".") + output.parent.mkdir(parents=True, exist_ok=True) + + with open(output, "w", encoding="utf-8") as f: + json.dump(deps, f, indent=2, sort_keys=True) + except SecurityError as e: + raise SecurityError(f"Invalid output path: {e}") + + +def check_latest_versions( + deps: List[Dict[str, Any]], + allow_net: bool = True, + offline: bool = False, + cache_file: str = "./.pf/deps_cache.json" +) -> Dict[str, Dict[str, Any]]: + """ + Check latest versions from registries with caching. 
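Both the compose parser and the Dockerfile parser reduce an image reference to the same dependency record shape that parse_dependencies returns for npm and Python packages. A condensed, standalone sketch of that normalization; the helper name image_to_dep is invented for illustration and is not part of the patch.

def image_to_dep(image_spec: str, source: str) -> dict:
    # Split "name:tag", defaulting the tag to "latest" when none is given.
    name, tag = image_spec.rsplit(":", 1) if ":" in image_spec else (image_spec, "latest")
    parts = name.split("/")
    if len(parts) >= 2:
        # docker.io/library/postgres -> postgres, ghcr.io/org/app -> org/app
        name = parts[-1] if parts[-2] == "library" else "/".join(parts[-2:])
    return {"name": name, "version": tag, "manager": "docker", "files": [], "source": source}

print(image_to_dep("docker.io/library/postgres:16-alpine", "docker-compose.yml"))
# {'name': 'postgres', 'version': '16-alpine', 'manager': 'docker', 'files': [], 'source': 'docker-compose.yml'}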
+ + Returns dict keyed by "manager:name" with: + { + "locked": str, + "latest": str, + "delta": str, + "is_outdated": bool, + "last_checked": str (ISO timestamp) + } + """ + if offline or not allow_net: + # Try to load from cache in offline mode + cached_data = _load_deps_cache(cache_file) + if cached_data: + # Update locked versions from current deps + for dep in deps: + key = f"{dep['manager']}:{dep['name']}" + if key in cached_data: + cached_data[key]["locked"] = dep["version"] + cached_data[key]["is_outdated"] = cached_data[key]["latest"] != dep["version"] + cached_data[key]["delta"] = _calculate_version_delta(dep["version"], cached_data[key]["latest"]) + return cached_data or {} + + # Load existing cache + cache = _load_deps_cache(cache_file) + latest_info = {} + needs_check = [] + + # FIRST PASS: Check what's in cache and still valid + for dep in deps: + key = f"{dep['manager']}:{dep['name']}" + if key in latest_info: + continue # Already processed + + # Check if we have valid cached data (24 hours for deps) + if key in cache and _is_cache_valid(cache[key], hours=24): + # Update locked version from current deps + cache[key]["locked"] = dep["version"] + cache[key]["is_outdated"] = cache[key]["latest"] != dep["version"] + cache[key]["delta"] = _calculate_version_delta(dep["version"], cache[key]["latest"]) + latest_info[key] = cache[key] + else: + needs_check.append(dep) + + # Early exit if everything is cached + if not needs_check: + return latest_info + + # SECOND PASS: Check only what needs updating, with per-service rate limiting + npm_rate_limited_until = 0 + pypi_rate_limited_until = 0 + docker_rate_limited_until = 0 + + for dep in needs_check: + key = f"{dep['manager']}:{dep['name']}" + current_time = time.time() + + # Skip if this service is rate limited + if dep["manager"] == "npm" and current_time < npm_rate_limited_until: + # Use cached data if available, even if expired + if key in cache: + latest_info[key] = cache[key] + continue + elif dep["manager"] == "py" and current_time < pypi_rate_limited_until: + if key in cache: + latest_info[key] = cache[key] + continue + elif dep["manager"] == "docker" and current_time < docker_rate_limited_until: + if key in cache: + latest_info[key] = cache[key] + continue + + try: + if dep["manager"] == "npm": + latest = _check_npm_latest(dep["name"]) + elif dep["manager"] == "py": + latest = _check_pypi_latest(dep["name"]) + elif dep["manager"] == "docker": + latest = _check_dockerhub_latest(dep["name"]) + else: + continue + + if latest: + locked = dep["version"] + delta = _calculate_version_delta(locked, latest) + latest_info[key] = { + "locked": locked, + "latest": latest, + "delta": delta, + "is_outdated": locked != latest, + "last_checked": datetime.now().isoformat() + } + # Rate limiting: service-specific delays for optimal performance + if dep["manager"] == "npm": + time.sleep(RATE_LIMIT_NPM) # 0.1s for npm + elif dep["manager"] == "py": + time.sleep(RATE_LIMIT_PYPI) # 0.2s for PyPI + elif dep["manager"] == "docker": + time.sleep(RATE_LIMIT_DOCKER) # 0.2s for Docker Hub + except (urllib.error.URLError, urllib.error.HTTPError, http.client.RemoteDisconnected, + TimeoutError, json.JSONDecodeError, KeyError, ValueError) as e: + error_msg = f"{type(e).__name__}: {str(e)[:50]}" + + # Handle rate limiting and connection errors specifically + if ("429" in str(e) or "rate" in str(e).lower() or + "RemoteDisconnected" in str(e) or "closed connection" in str(e).lower()): + # Set rate limit expiry for this service + if dep["manager"] == "npm": + 
npm_rate_limited_until = current_time + RATE_LIMIT_BACKOFF + elif dep["manager"] == "py": + pypi_rate_limited_until = current_time + RATE_LIMIT_BACKOFF + elif dep["manager"] == "docker": + docker_rate_limited_until = current_time + RATE_LIMIT_BACKOFF + + # Use cached data if available, even if expired + if key in cache: + latest_info[key] = cache[key] + latest_info[key]["error"] = error_msg + else: + latest_info[key] = { + "locked": dep["version"], + "latest": None, + "delta": None, + "is_outdated": False, + "error": error_msg, + "last_checked": datetime.now().isoformat() + } + continue + + # Save updated cache + _save_deps_cache(latest_info, cache_file) + + return latest_info + + +def _load_deps_cache(cache_file: str) -> Dict[str, Dict[str, Any]]: + """ + Load the dependency cache from disk. + Returns empty dict if cache doesn't exist or is invalid. + """ + try: + cache_path = Path(cache_file) + if cache_path.exists(): + with open(cache_path, 'r', encoding='utf-8') as f: + return json.load(f) + except (json.JSONDecodeError, OSError): + pass + return {} + + +def _save_deps_cache(latest_info: Dict[str, Dict[str, Any]], cache_file: str) -> None: + """ + Save the dependency cache to disk. + Merges with existing cache to preserve data for packages not in current check. + """ + try: + cache_path = Path(cache_file) + cache_path.parent.mkdir(parents=True, exist_ok=True) + + # Load existing cache to merge + existing = _load_deps_cache(cache_file) + + # Merge new data into existing (new data takes precedence) + existing.update(latest_info) + + # Write merged cache + with open(cache_path, 'w', encoding='utf-8') as f: + json.dump(existing, f, indent=2, sort_keys=True) + except OSError: + pass # Fail silently if can't write cache + + +def _is_cache_valid(cached_item: Dict[str, Any], hours: int = 24) -> bool: + """ + Check if a cached item is still valid based on age. + Default is 24 hours for dependency version checks. 
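The caching layer keys entries by "manager:name" and treats them as fresh for 24 hours, which is what lets repeated runs skip the registries entirely. A minimal standalone re-statement of that freshness check, equivalent in intent to _is_cache_valid and written here only for illustration:

from datetime import datetime, timedelta

def cache_entry_is_fresh(entry: dict, hours: int = 24) -> bool:
    """Return True if the entry was checked within the last `hours` hours."""
    try:
        last = datetime.fromisoformat(entry["last_checked"])
    except (KeyError, ValueError):
        return False
    return datetime.now() - last < timedelta(hours=hours)

fresh = {"last_checked": datetime.now().isoformat(), "latest": "1.2.3"}
stale = {"last_checked": (datetime.now() - timedelta(days=2)).isoformat(), "latest": "1.2.3"}
print(cache_entry_is_fresh(fresh), cache_entry_is_fresh(stale))  # True False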
+ """ + try: + if "last_checked" not in cached_item: + return False + last_checked = datetime.fromisoformat(cached_item["last_checked"]) + age = datetime.now() - last_checked + return age.total_seconds() < (hours * 3600) + except (ValueError, KeyError): + return False + + +def _check_npm_latest(package_name: str) -> Optional[str]: + """Fetch latest version from npm registry.""" + import urllib.request + import urllib.error + + # Validate and sanitize package name + if not validate_package_name(package_name, "npm"): + return None + + # URL-encode the package name for safety + safe_package_name = sanitize_url_component(package_name) + url = f"https://registry.npmjs.org/{safe_package_name}" + try: + with urllib.request.urlopen(url, timeout=10) as response: + data = json.loads(response.read()) + return data.get("dist-tags", {}).get("latest") + except (urllib.error.URLError, http.client.RemoteDisconnected, json.JSONDecodeError, KeyError): + return None + + +def _check_pypi_latest(package_name: str) -> Optional[str]: + """Fetch latest version from PyPI.""" + import urllib.request + import urllib.error + + # Validate package name + if not validate_package_name(package_name, "py"): + return None + + # Normalize package name for PyPI (replace underscores with hyphens) + normalized_name = package_name.replace('_', '-') + # Sanitize for URL + safe_package_name = sanitize_url_component(normalized_name) + url = f"https://pypi.org/pypi/{safe_package_name}/json" + try: + with urllib.request.urlopen(url, timeout=10) as response: + data = json.loads(response.read()) + return data.get("info", {}).get("version") + except (urllib.error.URLError, http.client.RemoteDisconnected, json.JSONDecodeError, KeyError): + return None + + +def _check_dockerhub_latest(image_name: str) -> Optional[str]: + """Fetch latest version from Docker Hub.""" + import urllib.request + import urllib.error + + # Validate image name + if not validate_package_name(image_name, "docker"): + return None + + # For official images, use library/ prefix + if "/" not in image_name: + image_name = f"library/{image_name}" + + # Sanitize image name for URL + safe_image_name = sanitize_url_component(image_name) + + # Docker Hub API endpoint for tags + url = f"https://hub.docker.com/v2/repositories/{safe_image_name}/tags" + + try: + # Create request with proper headers + req = urllib.request.Request(url) + req.add_header('User-Agent', 'TheAuditor/0.1.0') + + with urllib.request.urlopen(req, timeout=10) as response: + data = json.loads(response.read()) + + # Parse the results to find latest stable version + tags = data.get("results", []) + if not tags: + return None + + # Filter and sort tags to find the best "latest" version + version_tags = [] + for tag in tags: + tag_name = tag.get("name", "") + # Skip non-version tags + if tag_name in ["latest", "alpine", "slim", "bullseye", "bookworm"]: + continue + # Look for semantic version-like tags + if re.match(r'^\d+(\.\d+)*', tag_name): + version_tags.append(tag_name) + + if version_tags: + # Sort versions (simple string sort for now) + # More sophisticated version comparison could be added + version_tags.sort(reverse=True) + return version_tags[0] + + # Fallback to "latest" if no version tags found + for tag in tags: + if tag.get("name") == "latest": + return "latest" + + return None + + except (urllib.error.URLError, http.client.RemoteDisconnected, json.JSONDecodeError, KeyError) as e: + # Docker Hub API might require auth or have rate limits + return None + + +def _calculate_version_delta(locked: str, 
latest: str) -> str: + """ + Calculate semantic version delta. + Returns: "major", "minor", "patch", "equal", or "unknown" + """ + try: + locked_parts = [int(x) for x in locked.split(".")[:3]] + latest_parts = [int(x) for x in latest.split(".")[:3]] + + # Pad with zeros if needed + while len(locked_parts) < 3: + locked_parts.append(0) + while len(latest_parts) < 3: + latest_parts.append(0) + + if locked_parts == latest_parts: + return "equal" + elif latest_parts[0] > locked_parts[0]: + return "major" + elif latest_parts[1] > locked_parts[1]: + return "minor" + elif latest_parts[2] > locked_parts[2]: + return "patch" + else: + return "unknown" # locked is newer than latest? + except (ValueError, IndexError): + return "unknown" + + +def write_deps_latest_json( + latest_info: Dict[str, Dict[str, Any]], + output_path: str = "./.pf/deps_latest.json" +) -> None: + """Write latest version info to JSON file.""" + try: + output = sanitize_path(output_path, ".") + output.parent.mkdir(parents=True, exist_ok=True) + + with open(output, "w", encoding="utf-8") as f: + json.dump(latest_info, f, indent=2, sort_keys=True) + except SecurityError as e: + raise SecurityError(f"Invalid output path: {e}") + + +def upgrade_all_deps( + root_path: str, + latest_info: Dict[str, Dict[str, Any]], + deps_list: List[Dict[str, Any]] +) -> Dict[str, int]: + """ + YOLO MODE: Upgrade all dependencies to latest versions. + Rewrites requirements.txt, package.json, and pyproject.toml with latest versions. + + Returns dict with counts of upgraded packages per file type. + """ + import shutil + from datetime import datetime + + root = Path(root_path) + upgraded = { + "requirements.txt": 0, + "package.json": 0, + "pyproject.toml": 0 + } + + # Group deps by source file + deps_by_source = {} + for dep in deps_list: + source = dep.get("source", "") + if source not in deps_by_source: + deps_by_source[source] = [] + deps_by_source[source].append(dep) + + # Upgrade requirements*.txt files + for req_file in root.glob("requirements*.txt"): + if req_file.name in deps_by_source: + count = _upgrade_requirements_txt(req_file, latest_info, deps_by_source[req_file.name]) + upgraded["requirements.txt"] += count + + # Upgrade package.json + package_json = root / "package.json" + if package_json.exists() and "package.json" in deps_by_source: + count = _upgrade_package_json(package_json, latest_info, deps_by_source["package.json"]) + upgraded["package.json"] = count + + # Upgrade pyproject.toml + pyproject = root / "pyproject.toml" + if pyproject.exists() and "pyproject.toml" in deps_by_source: + count = _upgrade_pyproject_toml(pyproject, latest_info, deps_by_source["pyproject.toml"]) + upgraded["pyproject.toml"] = count + + return upgraded + + +def _upgrade_requirements_txt( + path: Path, + latest_info: Dict[str, Dict[str, Any]], + deps: List[Dict[str, Any]] +) -> int: + """Upgrade a requirements.txt file to latest versions.""" + # Sanitize path + try: + safe_path = sanitize_path(str(path), ".") + except SecurityError: + return 0 # Skip files outside project root + + # Create backup + backup_path = safe_path.with_suffix(safe_path.suffix + ".bak") + shutil.copy2(safe_path, backup_path) + + # Read current file + with open(safe_path, "r", encoding="utf-8") as f: + lines = f.readlines() + + # Build package name to latest version map + latest_versions = {} + for dep in deps: + key = f"py:{dep['name']}" + if key in latest_info: + latest_versions[dep['name']] = latest_info[key]['latest'] + + # Rewrite lines with latest versions + updated_lines = [] + 
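For reference, a few hypothetical inputs and the classification _calculate_version_delta gives them; this assumes theauditor.deps is importable and the version strings are made up.

from theauditor.deps import _calculate_version_delta

print(_calculate_version_delta("1.2.3", "2.0.0"))    # major
print(_calculate_version_delta("1.2.3", "1.3.0"))    # minor
print(_calculate_version_delta("1.2.3", "1.2.4"))    # patch
print(_calculate_version_delta("1.2.3", "1.2.3"))    # equal
print(_calculate_version_delta("1.2", "not-semver")) # unknown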
count = 0 + + for line in lines: + original_line = line + line = line.strip() + + # Skip comments and empty lines + if not line or line.startswith("#") or line.startswith("-"): + updated_lines.append(original_line) + continue + + # Parse package name + name, _ = _parse_python_dep_spec(line) + + if name and name in latest_versions: + # Replace with latest version + updated_lines.append(f"{name}=={latest_versions[name]}\n") + count += 1 + else: + updated_lines.append(original_line) + + # Write updated file + with open(safe_path, "w", encoding="utf-8") as f: + f.writelines(updated_lines) + + return count + + +def _upgrade_package_json( + path: Path, + latest_info: Dict[str, Dict[str, Any]], + deps: List[Dict[str, Any]] +) -> int: + """Upgrade package.json to latest versions.""" + import shutil + + # Sanitize path + try: + safe_path = sanitize_path(str(path), ".") + except SecurityError: + return 0 # Skip files outside project root + + # Create backup + backup_path = safe_path.with_suffix(safe_path.suffix + ".bak") + shutil.copy2(safe_path, backup_path) + + # Read current file + with open(safe_path, "r", encoding="utf-8") as f: + data = json.load(f) + + count = 0 + + # Update dependencies + if "dependencies" in data: + for name in data["dependencies"]: + key = f"npm:{name}" + if key in latest_info: + data["dependencies"][name] = latest_info[key]["latest"] + count += 1 + + # Update devDependencies + if "devDependencies" in data: + for name in data["devDependencies"]: + key = f"npm:{name}" + if key in latest_info: + data["devDependencies"][name] = latest_info[key]["latest"] + count += 1 + + # Write updated file + with open(safe_path, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2) + f.write("\n") # Add trailing newline + + return count + + +def _upgrade_pyproject_toml( + path: Path, + latest_info: Dict[str, Dict[str, Any]], + deps: List[Dict[str, Any]] +) -> int: + """Upgrade pyproject.toml to latest versions - handles ALL sections.""" + import shutil + import re + + # Sanitize path + try: + safe_path = sanitize_path(str(path), ".") + except SecurityError: + return 0 # Skip files outside project root + + # Create backup + backup_path = safe_path.with_suffix(safe_path.suffix + ".bak") + shutil.copy2(safe_path, backup_path) + + # Read entire file as string for regex replacement + with open(safe_path, "r", encoding="utf-8") as f: + content = f.read() + + count = 0 + updated_packages = {} # Track all updates: package -> [(old, new)] + + # For each package in latest_info + for key, info in latest_info.items(): + if not key.startswith("py:"): + continue + + package_name = key[3:] # Remove "py:" prefix + latest_version = info.get("latest") + + if not latest_version: + continue + + # Pattern to match this package anywhere in the file + # Matches: "package==X.Y.Z" with any version number + pattern = rf'"{package_name}==([^"]+)"' + + # Replace ALL occurrences at once using re.sub with a function + def replacer(match): + old_version = match.group(1) + if old_version != latest_version: + # Track the update + if package_name not in updated_packages: + updated_packages[package_name] = [] + updated_packages[package_name].append((old_version, latest_version)) + return f'"{package_name}=={latest_version}"' + return match.group(0) # No change + + # Replace all occurrences in one pass + new_content = re.sub(pattern, replacer, content) + + # Update count only if package was actually updated + if package_name in updated_packages and content != new_content: + count += 1 + content = new_content + + # Write 
updated content + with open(safe_path, "w", encoding="utf-8") as f: + f.write(content) + + # Report what was updated + total_occurrences = 0 + # Use ASCII characters on Windows + check_mark = "[OK]" if IS_WINDOWS else "✓" + arrow = "->" if IS_WINDOWS else "→" + for package, updates in updated_packages.items(): + total_occurrences += len(updates) + if len(updates) == 1: + print(f" {check_mark} {package}: {updates[0][0]} {arrow} {updates[0][1]}") + else: + print(f" {check_mark} {package}: {updates[0][0]} {arrow} {updates[0][1]} ({len(updates)} occurrences)") + + # Return total occurrences updated, not just unique packages + return total_occurrences \ No newline at end of file diff --git a/theauditor/docgen.py b/theauditor/docgen.py new file mode 100644 index 0000000..baa4255 --- /dev/null +++ b/theauditor/docgen.py @@ -0,0 +1,565 @@ +"""Documentation generator from index and capsules (optional feature).""" + +import hashlib +import json +import platform +import sqlite3 +import sys +from collections import defaultdict +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + +from theauditor import __version__ + + +def is_source_file(file_path: str) -> bool: + """Check if a file is a source code file (not test, config, or docs).""" + path = Path(file_path) + + # Skip test files and directories + if any(part in ['test', 'tests', '__tests__', 'spec', 'fixtures', 'fixture_repo', 'test_scaffold'] for part in path.parts): + return False + if path.name.startswith('test_') or path.name.endswith('_test.py') or '.test.' in path.name or '.spec.' in path.name: + return False + if 'test' in str(path).lower() and any(ext in str(path).lower() for ext in ['.spec.', '_test.', 'test_']): + return False + + # Skip documentation + if path.suffix.lower() in ['.md', '.rst', '.txt']: + return False + + # Skip configuration files + config_files = { + '.gitignore', '.gitattributes', '.editorconfig', + 'pyproject.toml', 'setup.py', 'setup.cfg', + 'package.json', 'package-lock.json', 'yarn.lock', + 'package-template.json', 'tsconfig.json', + 'Makefile', 'makefile', 'requirements.txt', + 'Dockerfile', 'docker-compose.yml', '.dockerignore', + 'manifest.json', 'repo_index.db' + } + if path.name.lower() in config_files: + return False + + # Skip build artifacts and caches + skip_dirs = {'docs', 'documentation', 'examples', 'samples', 'schemas', 'agent_templates'} + if any(part.lower() in skip_dirs for part in path.parts): + return False + + return True + + +def load_manifest(manifest_path: str) -> tuple[list[dict], str]: + """Load manifest and compute its hash.""" + with open(manifest_path, "rb") as f: + content = f.read() + manifest_hash = hashlib.sha256(content).hexdigest() + + manifest = json.loads(content) + return manifest, manifest_hash + + +def load_workset(workset_path: str) -> set[str]: + """Load workset file paths.""" + if not Path(workset_path).exists(): + return set() + + with open(workset_path) as f: + workset = json.load(f) + return {p["path"] for p in workset.get("paths", [])} + + +def load_capsules(capsules_dir: str, workset_paths: set[str] | None = None) -> list[dict]: + """Load capsules, optionally filtered by workset.""" + capsules = [] + capsules_path = Path(capsules_dir) + + if not capsules_path.exists(): + raise RuntimeError(f"Capsules directory not found: {capsules_dir}") + + for capsule_file in sorted(capsules_path.glob("*.json")): + with open(capsule_file) as f: + capsule = json.load(f) + + # Filter by workset if provided + if workset_paths is None or 
capsule.get("path") in workset_paths: + # Filter out non-source files + if is_source_file(capsule.get("path", "")): + capsules.append(capsule) + + return capsules + + +def get_routes(db_path: str, workset_paths: set[str] | None = None) -> list[dict]: + """Get routes from database, excluding test files.""" + if not Path(db_path).exists(): + return [] + + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + if workset_paths: + placeholders = ",".join("?" * len(workset_paths)) + query = f""" + SELECT method, pattern, file + FROM api_endpoints + WHERE file IN ({placeholders}) + ORDER BY file, pattern + """ + cursor.execute(query, tuple(workset_paths)) + else: + cursor.execute( + """ + SELECT method, pattern, file + FROM api_endpoints + ORDER BY file, pattern + """ + ) + + routes = [] + for row in cursor.fetchall(): + # Filter out test files + if is_source_file(row[2]): + routes.append({"method": row[0], "pattern": row[1], "file": row[2]}) + + conn.close() + return routes + + +def get_sql_objects(db_path: str, workset_paths: set[str] | None = None) -> list[dict]: + """Get SQL objects from database, excluding test files.""" + if not Path(db_path).exists(): + return [] + + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + if workset_paths: + placeholders = ",".join("?" * len(workset_paths)) + query = f""" + SELECT kind, name, file + FROM sql_objects + WHERE file IN ({placeholders}) + ORDER BY kind, name + """ + cursor.execute(query, tuple(workset_paths)) + else: + cursor.execute( + """ + SELECT kind, name, file + FROM sql_objects + ORDER BY kind, name + """ + ) + + objects = [] + for row in cursor.fetchall(): + # Filter out test files + if is_source_file(row[2]): + objects.append({"kind": row[0], "name": row[1], "file": row[2]}) + + conn.close() + return objects + + +def group_files_by_folder(capsules: list[dict]) -> dict[str, list[dict]]: + """Group files by their first directory segment.""" + groups = defaultdict(list) + + for capsule in capsules: + path = capsule.get("path", "") + if "/" in path: + folder = path.split("/")[0] + else: + folder = "." 
+ groups[folder].append(capsule) + + # Sort by folder name + return dict(sorted(groups.items())) + + +def generate_architecture_md( + routes: list[dict], + sql_objects: list[dict], + capsules: list[dict], + scope: str, +) -> str: + """Generate ARCHITECTURE.md content.""" + now = datetime.now(UTC).isoformat() + + content = [ + "# Architecture", + f"Generated at: {now}", + "", + "## Scope", + f"Mode: {scope}", + "", + ] + + # Routes table + if routes: + content.extend( + [ + "## Routes", + "", + "| Method | Pattern | File |", + "|--------|---------|------|", + ] + ) + for route in routes: + content.append(f"| {route['method']} | {route['pattern']} | {route['file']} |") + content.append("") + + # SQL Objects table + if sql_objects: + content.extend( + [ + "## SQL Objects", + "", + "| Kind | Name | File |", + "|------|------|------|", + ] + ) + for obj in sql_objects: + content.append(f"| {obj['kind']} | {obj['name']} | {obj['file']} |") + content.append("") + + # Core Modules (group by actual functionality) + groups = group_files_by_folder(capsules) + if groups: + content.extend( + [ + "## Core Modules", + "", + ] + ) + + # Filter and organize by purpose + module_categories = { + "Core CLI": {}, + "Analysis & Detection": {}, + "Code Generation": {}, + "Reporting": {}, + "Utilities": {}, + } + + for folder, folder_capsules in groups.items(): + if folder == "theauditor": + for capsule in folder_capsules: + path = Path(capsule.get("path", "")) + name = path.stem + + # Skip duplicates and internal modules + if name in ['__init__', 'parsers'] or name.endswith('.py.tpl'): + continue + + exports = capsule.get("interfaces", {}).get("exports", []) + functions = capsule.get("interfaces", {}).get("functions", []) + classes = capsule.get("interfaces", {}).get("classes", []) + + # Categorize based on filename + if name in ['cli', 'orchestrator', 'config', 'config_runtime']: + category = "Core CLI" + elif name in ['lint', 'ast_verify', 'universal_detector', 'pattern_loader', 'flow_analyzer', 'risk_scorer', 'pattern_rca', 'xgraph_analyzer']: + category = "Analysis & Detection" + elif name in ['scaffolder', 'test_generator', 'claude_setup', 'claude_autogen', 'venv_install']: + category = "Code Generation" + elif name in ['report', 'capsules', 'docgen', 'journal_view']: + category = "Reporting" + else: + # Skip certain utility files from main display + if name in ['utils', 'evidence', 'runner', 'contracts', 'tools']: + continue + category = "Utilities" + + # Build summary (only add if not already present) + if name not in module_categories[category]: + summary_parts = [] + if classes: + summary_parts.append(f"Classes: {', '.join(classes[:3])}") + elif functions: + summary_parts.append(f"Functions: {', '.join(functions[:3])}") + elif exports: + summary_parts.append(f"Exports: {', '.join(exports[:3])}") + + summary = " | ".join(summary_parts) if summary_parts else "Utility module" + module_categories[category][name] = f"- **{name}**: {summary}" + + # Output categorized modules + for category, modules_dict in module_categories.items(): + if modules_dict: + content.append(f"### {category}") + # Sort modules by name and get their descriptions + for name in sorted(modules_dict.keys()): + content.append(modules_dict[name]) + content.append("") + + return "\n".join(content) + + +def generate_features_md(capsules: list[dict]) -> str: + """Generate FEATURES.md content with meaningful feature descriptions.""" + content = [ + "# Features & Capabilities", + "", + "## Core Functionality", + "", + ] + + # Analyze capsules 
to extract features + features = { + "Code Analysis": [], + "Test Generation": [], + "Documentation": [], + "CI/CD Integration": [], + "ML Capabilities": [], + } + + cli_commands = set() + + for capsule in capsules: + path = Path(capsule.get("path", "")) + if path.parent.name != "theauditor": + continue + + name = path.stem + exports = capsule.get("interfaces", {}).get("exports", []) + functions = capsule.get("interfaces", {}).get("functions", []) + + # Extract features based on module + if name == "cli": + # Try to extract CLI commands from functions + for func in functions: + if func not in ['main', 'cli']: + cli_commands.add(func) + elif name == "lint": + features["Code Analysis"].append("- **Linting**: Custom security and quality rules") + elif name == "ast_verify": + features["Code Analysis"].append("- **AST Verification**: Contract-based code verification") + elif name == "universal_detector": + features["Code Analysis"].append("- **Pattern Detection**: Security and performance anti-patterns") + elif name == "flow_analyzer": + features["Code Analysis"].append("- **Flow Analysis**: Deadlock and race condition detection") + elif name == "risk_scorer": + features["Code Analysis"].append("- **Risk Scoring**: Automated risk assessment for files") + elif name == "test_generator": + features["Test Generation"].append("- **Test Scaffolding**: Generate test stubs from code") + elif name == "scaffolder": + features["Test Generation"].append("- **Contract Tests**: Generate DB/API contract tests") + elif name == "docgen": + features["Documentation"].append("- **Architecture Docs**: Auto-generate architecture documentation") + elif name == "capsules": + features["Documentation"].append("- **Code Capsules**: Compressed code summaries") + elif name == "report": + features["Documentation"].append("- **Audit Reports**: Comprehensive audit report generation") + elif name == "claude_setup": + features["CI/CD Integration"].append("- **Claude Code Integration**: Automated hooks for Claude AI") + elif name == "orchestrator": + features["CI/CD Integration"].append("- **Event-Driven Automation**: Git hooks and CI pipeline support") + elif name == "ml": + features["ML Capabilities"].append("- **ML-Based Suggestions**: Learn from codebase patterns") + features["ML Capabilities"].append("- **Root Cause Prediction**: Predict likely failure points") + + # Output features by category + for category, feature_list in features.items(): + if feature_list: + content.append(f"### {category}") + # Deduplicate + seen = set() + for feature in feature_list: + if feature not in seen: + content.append(feature) + seen.add(feature) + content.append("") + + # Add CLI commands summary + if cli_commands: + content.append("## Available Commands") + content.append("") + content.append("The following commands are available through the CLI:") + content.append("") + # Group commands by purpose + cmd_groups = { + "Analysis": ['lint', 'ast_verify', 'detect_patterns', 'flow_analyze', 'risk_score'], + "Generation": ['gen_tests', 'scaffold', 'suggest_fixes'], + "Reporting": ['report', 'journal', 'capsules'], + "Setup": ['init', 'setup_claude', 'deps'], + } + + for group, cmds in cmd_groups.items(): + group_cmds = [c for c in cli_commands if any(cmd in c for cmd in cmds)] + if group_cmds: + content.append(f"**{group}**: {', '.join(sorted(group_cmds)[:5])}") + content.append("") + + # Add configuration info + content.append("## Configuration") + content.append("") + content.append("- **Zero Dependencies**: Core functionality uses only Python 
stdlib") + content.append("- **Offline Mode**: All operations work without network access") + content.append("- **Per-Project**: No global state, everything is project-local") + content.append("") + + return "\n".join(content) + + +def generate_trace_md( + manifest_hash: str, + manifest: list[dict], + capsules: list[dict], + db_path: str, + workset_paths: set[str] | None, +) -> str: + """Generate TRACE.md content with meaningful metrics.""" + # Count database entries + routes_count = 0 + sql_objects_count = 0 + refs_count = 0 + imports_count = 0 + + if Path(db_path).exists(): + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + cursor.execute("SELECT COUNT(*) FROM api_endpoints") + routes_count = cursor.fetchone()[0] + + cursor.execute("SELECT COUNT(*) FROM sql_objects") + sql_objects_count = cursor.fetchone()[0] + + # Count refs (files table) + cursor.execute("SELECT COUNT(*) FROM files") + refs_count = cursor.fetchone()[0] + + # Count imports + try: + cursor.execute("SELECT COUNT(*) FROM imports") + imports_count = cursor.fetchone()[0] + except sqlite3.OperationalError: + imports_count = 0 + + conn.close() + + # Separate source files from all files + source_files = [f for f in manifest if is_source_file(f.get("path", ""))] + test_files = [f for f in manifest if 'test' in f.get("path", "").lower()] + doc_files = [f for f in manifest if f.get("path", "").endswith(('.md', '.rst', '.txt'))] + + # Calculate coverage + if workset_paths: + coverage = len(capsules) / len(workset_paths) * 100 if workset_paths else 0 + else: + coverage = len(capsules) / len(source_files) * 100 if source_files else 0 + + content = [ + "# Audit Trace", + "", + "## Repository Snapshot", + f"**Manifest Hash**: `{manifest_hash}`", + f"**Timestamp**: {datetime.now(UTC).strftime('%Y-%m-%d %H:%M:%S UTC')}", + "", + "## File Statistics", + f"- **Total Files**: {len(manifest)}", + f" - Source Files: {len(source_files)}", + f" - Test Files: {len(test_files)}", + f" - Documentation: {len(doc_files)}", + f" - Other: {len(manifest) - len(source_files) - len(test_files) - len(doc_files)}", + "", + "## Code Metrics", + f"- **Cross-References**: {refs_count}", + f"- **Import Statements**: {imports_count}", + f"- **HTTP Routes**: {routes_count}", + f"- **SQL Objects**: {sql_objects_count}", + "", + "## Analysis Coverage", + f"- **Coverage**: {coverage:.1f}% of source files", + f"- **Capsules Generated**: {len(capsules)}", + f"- **Scope**: {'Workset' if workset_paths else 'Full repository'}", + "", + "## Language Distribution", + ] + + # Count languages + lang_counts = defaultdict(int) + for capsule in capsules: + lang = capsule.get("language", "") # Empty not unknown + lang_counts[lang] += 1 + + for lang, count in sorted(lang_counts.items(), key=lambda x: x[1], reverse=True): + content.append(f"- {lang}: {count} files") + + content.extend([ + "", + "## Environment", + f"- **TheAuditor Version**: {__version__}", + f"- **Python**: {sys.version.split()[0]}", + f"- **Platform**: {platform.platform()}", + f"- **Processor**: {platform.processor() or 'Unknown'}", + "", + "## Audit Trail", + "This document provides cryptographic proof of the codebase state at audit time.", + "The manifest hash can be used to verify no files have been modified since analysis.", + "", + ]) + + return "\n".join(content) + + +# This function was moved above generate_trace_md + + +def generate_docs( + manifest_path: str = "manifest.json", + db_path: str = "repo_index.db", + capsules_dir: str = "./.pf/capsules", + workset_path: str = 
"./.pf/workset.json", + out_dir: str = "./.pf/docs", + full: bool = False, + print_stats: bool = False, +) -> dict[str, Any]: + """Generate documentation from index and capsules.""" + + # Load data + manifest, manifest_hash = load_manifest(manifest_path) + workset_paths = None if full else load_workset(workset_path) + + try: + capsules = load_capsules(capsules_dir, workset_paths) + except RuntimeError as e: + raise RuntimeError(f"Cannot generate docs: {e}. Run 'aud capsules' first.") from e + + # Get database data + routes = get_routes(db_path, workset_paths) + sql_objects = get_sql_objects(db_path, workset_paths) + + # Generate content + scope = "full" if full else "workset" + architecture_content = generate_architecture_md(routes, sql_objects, capsules, scope) + trace_content = generate_trace_md(manifest_hash, manifest, capsules, db_path, workset_paths) + features_content = generate_features_md(capsules) + + # Write files + out_path = Path(out_dir) + out_path.mkdir(parents=True, exist_ok=True) + + (out_path / "ARCHITECTURE.md").write_text(architecture_content) + (out_path / "TRACE.md").write_text(trace_content) + (out_path / "FEATURES.md").write_text(features_content) + + result = { + "files_written": 3, + "scope": scope, + "capsules_used": len(capsules), + "routes": len(routes), + "sql_objects": len(sql_objects), + } + + if print_stats: + print(f"Generated {result['files_written']} docs in {out_dir}") + print(f" Scope: {result['scope']}") + print(f" Capsules: {result['capsules_used']}") + print(f" Routes: {result['routes']}") + print(f" SQL Objects: {result['sql_objects']}") + + return result diff --git a/theauditor/docker_analyzer.py b/theauditor/docker_analyzer.py new file mode 100644 index 0000000..455809b --- /dev/null +++ b/theauditor/docker_analyzer.py @@ -0,0 +1,310 @@ +"""Docker container security analyzer module.""" + +import json +import logging +import re +import sqlite3 +from pathlib import Path +from typing import Any, Dict, List + +# Set up logger +logger = logging.getLogger(__name__) + + +def analyze_docker_images(db_path: str, check_vulnerabilities: bool = True) -> List[Dict[str, Any]]: + """ + Analyze indexed Docker images for security misconfigurations. 
+ + Args: + db_path: Path to the repo_index.db database + check_vulnerabilities: Whether to scan base images for vulnerabilities + + Returns: + List of security findings with severity levels + """ + findings = [] + + # Connect to the database + with sqlite3.connect(db_path) as conn: + conn.row_factory = sqlite3.Row + + # Run each security check + findings.extend(_find_root_containers(conn)) + findings.extend(_find_exposed_secrets(conn)) + + # Base image vulnerability check + if check_vulnerabilities: + base_images = _prepare_base_image_scan(conn) + if base_images: + # Import here to avoid circular dependency + from .vulnerability_scanner import scan_dependencies + + # Run vulnerability scan on Docker base images + vuln_findings = scan_dependencies(base_images, offline=False) + + # Convert vulnerability findings to Docker-specific format + for vuln in vuln_findings: + findings.append({ + 'type': 'docker_base_image_vulnerability', + 'severity': vuln.get('severity', 'medium'), + 'file': 'Dockerfile', + 'message': f"Base image {vuln.get('package', 'unknown')} has vulnerability: {vuln.get('title', 'Unknown vulnerability')}", + 'recommendation': vuln.get('recommendation', 'Update to latest secure version'), + 'details': vuln + }) + + return findings + + +def _find_root_containers(conn: sqlite3.Connection) -> List[Dict[str, Any]]: + """ + Detect containers running as root user (default or explicit). + + CIS Docker Benchmark: Running containers as root is a major security risk. + A container breakout would grant attacker root privileges on the host. + + Args: + conn: SQLite database connection + + Returns: + List of findings for containers running as root + """ + findings = [] + cursor = conn.cursor() + + # Query all Docker images + cursor.execute("SELECT file_path, env_vars FROM docker_images") + + for row in cursor: + file_path = row['file_path'] + env_vars_json = row['env_vars'] + + # Parse the JSON column + try: + env_vars = json.loads(env_vars_json) if env_vars_json else {} + except json.JSONDecodeError as e: + logger.debug(f"Non-critical error parsing Docker env vars JSON: {e}", exc_info=False) + continue + + # Check for _DOCKER_USER key (set by USER instruction) + docker_user = env_vars.get('_DOCKER_USER') + + # If no USER instruction or explicitly set to root + if docker_user is None or docker_user.lower() == 'root': + findings.append({ + 'type': 'docker_root_user', + 'severity': 'High', + 'file': file_path, + 'message': f"Container runs as root user (USER instruction {'not set' if docker_user is None else 'set to root'})", + 'recommendation': "Add 'USER ' instruction to Dockerfile after installing dependencies" + }) + + return findings + + +def _find_exposed_secrets(conn: sqlite3.Connection) -> List[Dict[str, Any]]: + """ + Detect hardcoded secrets in ENV and ARG instructions. + + ENV and ARG values are stored in image layers and can be inspected + by anyone with access to the image, making them unsuitable for secrets. 
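
To make the detection concrete, a small hedged illustration of the key-name and value patterns this check relies on; the ENV key and the AWS-documented example access key ID below are illustrative inputs, not real credentials.

```python
# Illustrative only: the kind of ENV key/value the patterns in this check flag.
import re

env_key = "DATABASE_PASSWORD"           # hypothetical key name
env_value = "AKIAIOSFODNN7EXAMPLE"      # AWS's documented example access key ID

assert re.search(r"(?i)password", env_key)          # sensitive key-name pattern
assert re.match(r"^AKIA[A-Z0-9]{16}$", env_value)   # known secret-value pattern
```
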
+ + Args: + conn: SQLite database connection + + Returns: + List of findings for exposed secrets + """ + findings = [] + cursor = conn.cursor() + + # Patterns for detecting sensitive keys + sensitive_key_patterns = [ + r'(?i)password', + r'(?i)secret', + r'(?i)api[_-]?key', + r'(?i)token', + r'(?i)auth', + r'(?i)credential', + r'(?i)private[_-]?key', + r'(?i)access[_-]?key' + ] + + # Common secret value patterns + secret_value_patterns = [ + r'^ghp_[A-Za-z0-9]{36}$', # GitHub personal access token + r'^ghs_[A-Za-z0-9]{36}$', # GitHub secret + r'^sk-[A-Za-z0-9]{48}$', # OpenAI API key + r'^xox[baprs]-[A-Za-z0-9-]+$', # Slack token + r'^AKIA[A-Z0-9]{16}$', # AWS access key ID + ] + + # Query all Docker images + cursor.execute("SELECT file_path, env_vars, build_args FROM docker_images") + + for row in cursor: + file_path = row['file_path'] + env_vars_json = row['env_vars'] + build_args_json = row['build_args'] + + # Parse JSON columns + try: + env_vars = json.loads(env_vars_json) if env_vars_json else {} + build_args = json.loads(build_args_json) if build_args_json else {} + except json.JSONDecodeError as e: + logger.debug(f"Non-critical error parsing Docker JSON columns: {e}", exc_info=False) + continue + + # Check ENV variables + for key, value in env_vars.items(): + # Skip internal tracking keys + if key.startswith('_DOCKER_'): + continue + + is_sensitive = False + + # Check if key name indicates sensitive data + for pattern in sensitive_key_patterns: + if re.search(pattern, key): + is_sensitive = True + findings.append({ + 'type': 'docker_exposed_secret', + 'severity': 'Critical', + 'file': file_path, + 'message': f"Potential secret exposed in ENV instruction: {key}", + 'recommendation': "Use Docker secrets or mount secrets at runtime instead of ENV" + }) + break + + # Check if value matches known secret patterns + if not is_sensitive and value: + for pattern in secret_value_patterns: + if re.match(pattern, str(value)): + findings.append({ + 'type': 'docker_exposed_secret', + 'severity': 'Critical', + 'file': file_path, + 'message': f"Detected secret pattern in ENV value for key: {key}", + 'recommendation': "Remove hardcoded secrets and use runtime secret injection" + }) + break + + # Check for high entropy strings (potential secrets) + if not is_sensitive and value and _is_high_entropy(str(value)): + findings.append({ + 'type': 'docker_possible_secret', + 'severity': 'Medium', + 'file': file_path, + 'message': f"High entropy value in ENV {key} - possible secret", + 'recommendation': "Review if this is a secret and move to secure storage if so" + }) + + # Check BUILD ARGs + for key, value in build_args.items(): + # Check if key name indicates sensitive data + for pattern in sensitive_key_patterns: + if re.search(pattern, key): + findings.append({ + 'type': 'docker_exposed_secret', + 'severity': 'High', # Slightly lower than ENV as ARGs are build-time only + 'file': file_path, + 'message': f"Potential secret exposed in ARG instruction: {key}", + 'recommendation': "Use --secret mount or BuildKit secrets instead of ARG for sensitive data" + }) + break + + return findings + + +def _prepare_base_image_scan(conn: sqlite3.Connection) -> List[Dict[str, Any]]: + """ + Prepare base image data for vulnerability scanning. + + This function extracts and parses base image information from the database, + preparing it in the format expected by vulnerability_scanner.scan_dependencies(). 
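
As a hedged illustration of the parsing described here (a re-statement of the rules implemented in this function, not the module's own code), base image references map to name/version pairs like this:

```python
# Sketch of the tag/digest parsing rules, mirroring the implementation below.
def split_image_ref(ref: str) -> tuple[str, str]:
    if "@" in ref:                     # digest form: image@sha256:...
        name, version = ref.split("@", 1)
    elif ":" in ref:                   # tag form: image:tag
        name, version = ref.rsplit(":", 1)
    else:                              # bare image name defaults to 'latest'
        name, version = ref, "latest"
    return name, version

assert split_image_ref("python:3.11-slim") == ("python", "3.11-slim")
assert split_image_ref("image@sha256:abc123") == ("image", "sha256:abc123")
assert split_image_ref("ubuntu") == ("ubuntu", "latest")
```
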
+ + Args: + conn: SQLite database connection + + Returns: + List of dependency dicts with manager='docker', name, and version + """ + dependencies = [] + cursor = conn.cursor() + + # Get all unique base images + cursor.execute("SELECT DISTINCT base_image FROM docker_images WHERE base_image IS NOT NULL") + + for row in cursor: + base_image = row[0] + + # Parse image string to extract name and version/tag + # Format examples: + # - python:3.11-slim + # - node:18-alpine + # - ubuntu:22.04 + # - gcr.io/project/image:tag + # - image@sha256:hash + + if '@' in base_image: + # Handle digest format (image@sha256:...) + name = base_image.split('@')[0] + version = base_image.split('@')[1] + elif ':' in base_image: + # Handle tag format (image:tag) + parts = base_image.rsplit(':', 1) + name = parts[0] + version = parts[1] + else: + # No tag specified, defaults to 'latest' + name = base_image + version = 'latest' + + # Create dependency dict in vulnerability scanner format + dependencies.append({ + 'manager': 'docker', + 'name': name, + 'version': version, + 'source_file': 'Dockerfile' # Could be enhanced to track actual file + }) + + return dependencies + + +def _is_high_entropy(value: str, threshold: float = 4.0) -> bool: + """ + Check if a string has high entropy (potential secret). + + Uses Shannon entropy calculation to detect random-looking strings + that might be secrets, API keys, or tokens. + + Args: + value: String to check + threshold: Entropy threshold (default 4.0) + + Returns: + True if entropy exceeds threshold + """ + import math + + # Skip short strings + if len(value) < 10: + return False + + # Skip strings with spaces (likely not secrets) + if ' ' in value: + return False + + # Calculate character frequency + char_freq = {} + for char in value: + char_freq[char] = char_freq.get(char, 0) + 1 + + # Calculate Shannon entropy + entropy = 0.0 + for freq in char_freq.values(): + probability = freq / len(value) + if probability > 0: + entropy -= probability * math.log2(probability) + + return entropy > threshold \ No newline at end of file diff --git a/theauditor/docs_fetch.py b/theauditor/docs_fetch.py new file mode 100644 index 0000000..9952ad4 --- /dev/null +++ b/theauditor/docs_fetch.py @@ -0,0 +1,793 @@ +"""Documentation fetcher for version-correct package docs.""" + +import json +import re +import time +import urllib.error +import urllib.request +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional +from theauditor.security import sanitize_path, sanitize_url_component, validate_package_name, SecurityError + + +# Default allowlist for registries +DEFAULT_ALLOWLIST = [ + "https://registry.npmjs.org/", + "https://pypi.org/", # Allow both API and web scraping + "https://raw.githubusercontent.com/", + "https://readthedocs.io/", + "https://readthedocs.org/", +] + +# Rate limiting configuration - optimized for minimal runtime +RATE_LIMIT_DELAY = 0.15 # Average delay between requests (balanced for npm/PyPI) +RATE_LIMIT_BACKOFF = 15 # Backoff on 429/disconnect (15s gives APIs time to reset) + + +def fetch_docs( + deps: List[Dict[str, Any]], + allow_net: bool = True, + allowlist: Optional[List[str]] = None, + offline: bool = False, + output_dir: str = "./.pf/context/docs" +) -> Dict[str, Any]: + """ + Fetch version-correct documentation for dependencies. 
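
A hedged usage sketch: the dependency dicts mirror the `name`/`version`/`manager` shape produced by deps.py, and the two packages shown are hypothetical.

```python
# Minimal sketch, assuming deps were parsed by deps.py elsewhere in the pipeline.
from theauditor.docs_fetch import fetch_docs

deps = [
    {"name": "requests", "version": "2.31.0", "manager": "py"},
    {"name": "express", "version": "4.18.2", "manager": "npm"},
]
stats = fetch_docs(deps, allow_net=True, output_dir="./.pf/context/docs")
print(stats["fetched"], stats["cached"], stats["skipped"], len(stats["errors"]))
```
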
+ + Args: + deps: List of dependency objects from deps.py + allow_net: Whether network access is allowed + allowlist: List of allowed URL prefixes (uses DEFAULT_ALLOWLIST if None) + offline: Force offline mode + output_dir: Base directory for cached docs + + Returns: + Summary of fetch operations + """ + if offline or not allow_net: + return { + "mode": "offline", + "fetched": 0, + "cached": 0, + "skipped": len(deps), + "errors": [] + } + + if allowlist is None: + allowlist = DEFAULT_ALLOWLIST + + try: + output_path = sanitize_path(output_dir, ".") + output_path.mkdir(parents=True, exist_ok=True) + except SecurityError as e: + return { + "mode": "error", + "error": f"Invalid output directory: {e}", + "fetched": 0, + "cached": 0, + "skipped": len(deps) + } + + stats = { + "mode": "online", + "fetched": 0, + "cached": 0, + "skipped": 0, + "errors": [] + } + + # FIRST PASS: Check what's cached + needs_fetch = [] + for dep in deps: + # Quick cache check without network + cache_result = _check_cache_for_dep(dep, output_path) + if cache_result["cached"]: + stats["cached"] += 1 + else: + needs_fetch.append(dep) + + # Early exit if everything is cached + if not needs_fetch: + return stats + + # SECOND PASS: Fetch only what we need, with per-service rate limiting + npm_rate_limited_until = 0 + pypi_rate_limited_until = 0 + + for i, dep in enumerate(needs_fetch): + try: + current_time = time.time() + + # Check if this service is rate limited + if dep["manager"] == "npm" and current_time < npm_rate_limited_until: + stats["skipped"] += 1 + stats["errors"].append(f"{dep['name']}: Skipped (npm rate limited)") + continue + elif dep["manager"] == "py" and current_time < pypi_rate_limited_until: + stats["skipped"] += 1 + stats["errors"].append(f"{dep['name']}: Skipped (PyPI rate limited)") + continue + + # Fetch the documentation + if dep["manager"] == "npm": + result = _fetch_npm_docs(dep, output_path, allowlist) + elif dep["manager"] == "py": + result = _fetch_pypi_docs(dep, output_path, allowlist) + else: + stats["skipped"] += 1 + continue + + if result["status"] == "fetched": + stats["fetched"] += 1 + # Rate limiting: delay after successful fetch to be server-friendly + # npm and PyPI both have rate limits (npm: 100/min, PyPI: 60/min) + time.sleep(RATE_LIMIT_DELAY) # Be server-friendly + elif result["status"] == "cached": + stats["cached"] += 1 # Shouldn't happen here but handle it + elif result.get("reason") == "rate_limited": + stats["errors"].append(f"{dep['name']}: Rate limited - backing off {RATE_LIMIT_BACKOFF}s") + stats["skipped"] += 1 + # Set rate limit expiry for this service + if dep["manager"] == "npm": + npm_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF + elif dep["manager"] == "py": + pypi_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF + else: + stats["skipped"] += 1 + + except Exception as e: + error_msg = str(e) + if "429" in error_msg or "rate" in error_msg.lower(): + stats["errors"].append(f"{dep['name']}: Rate limited - backing off {RATE_LIMIT_BACKOFF}s") + # Set rate limit expiry for this service + if dep["manager"] == "npm": + npm_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF + elif dep["manager"] == "py": + pypi_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF + else: + stats["errors"].append(f"{dep['name']}: {error_msg}") + + return stats + + +def _check_cache_for_dep(dep: Dict[str, Any], output_dir: Path) -> Dict[str, bool]: + """ + Quick cache check for a dependency without making network calls. 
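
For reference, a hedged sketch of the cache paths this check derives; the package names are hypothetical, and the sanitization shown for scoped npm names follows the replacement rules used below.

```python
# Illustrative cache paths only; a hit needs doc.md and meta.json to exist and
# meta.json's "last_checked" to be less than seven days old.
from pathlib import Path

docs_root = Path("./.pf/context/docs")
npm_doc    = docs_root / "npm" / "express@4.18.2" / "doc.md"
scoped_doc = docs_root / "npm" / "_at_scope_pkg@1.0.0" / "doc.md"   # from "@scope/pkg"
py_doc     = docs_root / "py"  / "requests@2.31.0" / "doc.md"
```
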
+ Returns {"cached": True/False} + """ + name = dep["name"] + version = dep["version"] + manager = dep["manager"] + + # Build the cache file path + if manager == "npm": + # Handle git versions + if version.startswith("git") or "://" in version: + import hashlib + version_hash = hashlib.md5(version.encode()).hexdigest()[:8] + safe_version = f"git-{version_hash}" + else: + safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_") + safe_name = name.replace("@", "_at_").replace("/", "_") + pkg_dir = output_dir / "npm" / f"{safe_name}@{safe_version}" + elif manager == "py": + safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_") + safe_name = name.replace("/", "_").replace("\\", "_") + pkg_dir = output_dir / "py" / f"{safe_name}@{safe_version}" + else: + return {"cached": False} + + doc_file = pkg_dir / "doc.md" + meta_file = pkg_dir / "meta.json" + + # Check cache validity + if doc_file.exists() and meta_file.exists(): + try: + with open(meta_file, encoding="utf-8") as f: + meta = json.load(f) + # Cache for 7 days + last_checked = datetime.fromisoformat(meta["last_checked"]) + if (datetime.now() - last_checked).days < 7: + return {"cached": True} + except (json.JSONDecodeError, KeyError): + pass + + return {"cached": False} + + +def _fetch_npm_docs( + dep: Dict[str, Any], + output_dir: Path, + allowlist: List[str] +) -> Dict[str, Any]: + """Fetch documentation for an npm package.""" + name = dep["name"] + version = dep["version"] + + # Validate package name + if not validate_package_name(name, "npm"): + return {"status": "skipped", "reason": "Invalid package name"} + + # Sanitize version for filesystem (handle git URLs) + if version.startswith("git") or "://" in version: + # For git dependencies, use a hash of the URL as version + import hashlib + version_hash = hashlib.md5(version.encode()).hexdigest()[:8] + safe_version = f"git-{version_hash}" + else: + # For normal versions, just replace problematic characters + safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_") + + # Create package-specific directory with sanitized name + # Replace @ and / in scoped packages for filesystem safety + safe_name = name.replace("@", "_at_").replace("/", "_") + try: + pkg_dir = output_dir / "npm" / f"{safe_name}@{safe_version}" + pkg_dir.mkdir(parents=True, exist_ok=True) + except (OSError, SecurityError) as e: + return {"status": "error", "error": f"Cannot create package directory: {e}"} + + doc_file = pkg_dir / "doc.md" + meta_file = pkg_dir / "meta.json" + + # Check cache + if doc_file.exists() and meta_file.exists(): + # Check if cache is still valid (simple time-based for now) + try: + with open(meta_file, encoding="utf-8") as f: + meta = json.load(f) + # Cache for 7 days + last_checked = datetime.fromisoformat(meta["last_checked"]) + if (datetime.now() - last_checked).days < 7: + return {"status": "cached"} + except (json.JSONDecodeError, KeyError): + pass # Invalid cache, refetch + + # Fetch from registry with sanitized package name + safe_url_name = sanitize_url_component(name) + safe_url_version = sanitize_url_component(version) + url = f"https://registry.npmjs.org/{safe_url_name}/{safe_url_version}" + if not _is_url_allowed(url, allowlist): + return {"status": "skipped", "reason": "URL not in allowlist"} + + try: + with urllib.request.urlopen(url, timeout=10) as response: + data = json.loads(response.read()) + + readme = data.get("readme", "") + repository = data.get("repository", {}) + homepage = data.get("homepage", "") + + # 
Priority 1: Try to get README from GitHub if available + github_fetched = False + if isinstance(repository, dict): + repo_url = repository.get("url", "") + github_readme = _fetch_github_readme(repo_url, allowlist) + if github_readme and len(github_readme) > 500: # Only use if substantial + readme = github_readme + github_fetched = True + + # Priority 2: If no good GitHub README, try homepage if it's GitHub + if not github_fetched and homepage and "github.com" in homepage: + github_readme = _fetch_github_readme(homepage, allowlist) + if github_readme and len(github_readme) > 500: + readme = github_readme + github_fetched = True + + # Priority 3: Use npm README if it's substantial + if not github_fetched and len(readme) < 500: + # The npm README is too short, try to enhance it + readme = _enhance_npm_readme(data, readme) + + # Write documentation + with open(doc_file, "w", encoding="utf-8") as f: + f.write(f"# {name}@{version}\n\n") + f.write(f"**Package**: [{name}](https://www.npmjs.com/package/{name})\n") + f.write(f"**Version**: {version}\n") + if homepage: + f.write(f"**Homepage**: {homepage}\n") + f.write("\n---\n\n") + f.write(readme) + + # Add usage examples if not in README + if "## Usage" not in readme and "## Example" not in readme: + f.write("\n\n## Installation\n\n```bash\nnpm install {name}\n```\n".format(name=name)) + + # Write metadata + meta = { + "source_url": url, + "last_checked": datetime.now().isoformat(), + "etag": response.headers.get("ETag"), + "repository": repository, + "from_github": github_fetched + } + with open(meta_file, "w", encoding="utf-8") as f: + json.dump(meta, f, indent=2) + + return {"status": "fetched"} + + except urllib.error.HTTPError as e: + if e.code == 429: + return {"status": "error", "reason": "rate_limited", "error": "HTTP 429: Rate limited"} + return {"status": "error", "error": f"HTTP {e.code}: {str(e)}"} + except (urllib.error.URLError, json.JSONDecodeError) as e: + return {"status": "error", "error": str(e)} + + +def _fetch_pypi_docs( + dep: Dict[str, Any], + output_dir: Path, + allowlist: List[str] +) -> Dict[str, Any]: + """Fetch documentation for a PyPI package.""" + name = dep["name"].strip() # Strip any whitespace from name + version = dep["version"] + + # Validate package name + if not validate_package_name(name, "py"): + return {"status": "skipped", "reason": "Invalid package name"} + + # Sanitize package name for URL + safe_url_name = sanitize_url_component(name) + + # Handle special versions + if version in ["latest", "git"]: + # For latest, fetch current version first + if version == "latest": + url = f"https://pypi.org/pypi/{safe_url_name}/json" + else: + return {"status": "skipped", "reason": "git dependency"} + else: + safe_url_version = sanitize_url_component(version) + url = f"https://pypi.org/pypi/{safe_url_name}/{safe_url_version}/json" + + if not _is_url_allowed(url, allowlist): + return {"status": "skipped", "reason": "URL not in allowlist"} + + # Sanitize version for filesystem + safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_") + + # Create package-specific directory with sanitized name + safe_name = name.replace("/", "_").replace("\\", "_") + try: + pkg_dir = output_dir / "py" / f"{safe_name}@{safe_version}" + pkg_dir.mkdir(parents=True, exist_ok=True) + except (OSError, SecurityError) as e: + return {"status": "error", "error": f"Cannot create package directory: {e}"} + + doc_file = pkg_dir / "doc.md" + meta_file = pkg_dir / "meta.json" + + # Check cache + if doc_file.exists() and 
meta_file.exists(): + try: + with open(meta_file, encoding="utf-8") as f: + meta = json.load(f) + last_checked = datetime.fromisoformat(meta["last_checked"]) + if (datetime.now() - last_checked).days < 7: + return {"status": "cached"} + except (json.JSONDecodeError, KeyError): + pass + + try: + with urllib.request.urlopen(url, timeout=10) as response: + data = json.loads(response.read()) + + info = data.get("info", {}) + description = info.get("description", "") + summary = info.get("summary", "") + + # Priority 1: Try to get README from project URLs (GitHub, GitLab, etc.) + github_fetched = False + project_urls = info.get("project_urls", {}) + + # Check all possible URL sources for GitHub + all_urls = [] + for key, proj_url in project_urls.items(): + if proj_url: + all_urls.append(proj_url) + + # Also check home_page and download_url + home_page = info.get("home_page", "") + if home_page: + all_urls.append(home_page) + download_url = info.get("download_url", "") + if download_url: + all_urls.append(download_url) + + # Try GitHub first + for url in all_urls: + if "github.com" in url.lower(): + github_readme = _fetch_github_readme(url, allowlist) + if github_readme and len(github_readme) > 500: + description = github_readme + github_fetched = True + break + + # Priority 2: Try ReadTheDocs if available + if not github_fetched: + for url in all_urls: + if "readthedocs" in url.lower(): + rtd_content = _fetch_readthedocs(url, allowlist) + if rtd_content and len(rtd_content) > 500: + description = rtd_content + github_fetched = True # Mark as fetched from external source + break + + # Priority 3: Try to scrape PyPI web page (not API) for full README + if not github_fetched and len(description) < 1000: + pypi_readme = _fetch_pypi_web_readme(name, version, allowlist) + if pypi_readme and len(pypi_readme) > len(description): + description = pypi_readme + github_fetched = True # Mark as fetched from external source + + # Priority 4: Use PyPI description (often contains full README) + # PyPI descriptions can be quite good if properly uploaded + if not github_fetched and len(description) < 500 and summary: + # If description is too short, enhance it + description = _enhance_pypi_description(info, description, summary) + + # Write documentation + with open(doc_file, "w", encoding="utf-8") as f: + f.write(f"# {name}@{version}\n\n") + f.write(f"**Package**: [{name}](https://pypi.org/project/{name}/)\n") + f.write(f"**Version**: {version}\n") + + # Add project URLs if available + if project_urls: + f.write("\n**Links**:\n") + for key, url in list(project_urls.items())[:5]: # Limit to 5 + if url: + f.write(f"- {key}: {url}\n") + + f.write("\n---\n\n") + + # Add summary if different from description + if summary and summary not in description: + f.write(f"**Summary**: {summary}\n\n") + + f.write(description) + + # Add installation instructions if not in description + if "pip install" not in description.lower(): + f.write(f"\n\n## Installation\n\n```bash\npip install {name}\n```\n") + + # Add basic usage if really minimal docs + if len(description) < 200: + f.write(f"\n\n## Basic Usage\n\n```python\nimport {name.replace('-', '_')}\n```\n") + + # Write metadata + meta = { + "source_url": url, + "last_checked": datetime.now().isoformat(), + "etag": response.headers.get("ETag"), + "project_urls": project_urls, + "from_github": github_fetched + } + with open(meta_file, "w", encoding="utf-8") as f: + json.dump(meta, f, indent=2) + + return {"status": "fetched"} + + except urllib.error.HTTPError as e: + if e.code 
== 429: + return {"status": "error", "reason": "rate_limited", "error": "HTTP 429: Rate limited"} + return {"status": "error", "error": f"HTTP {e.code}: {str(e)}"} + except (urllib.error.URLError, json.JSONDecodeError) as e: + return {"status": "error", "error": str(e)} + + +def _fetch_github_readme(repo_url: str, allowlist: List[str]) -> Optional[str]: + """ + Fetch README from GitHub repository. + Converts repository URL to raw GitHub URL for README. + """ + if not repo_url: + return None + + # Extract owner/repo from various GitHub URL formats + patterns = [ + r'github\.com[:/]([^/]+)/([^/\s]+)', + r'git\+https://github\.com/([^/]+)/([^/\s]+)', + ] + + for pattern in patterns: + match = re.search(pattern, repo_url) + if match: + owner, repo = match.groups() + # Clean repo name + repo = repo.replace(".git", "") + + # Try common README filenames + readme_files = ["README.md", "readme.md", "README.rst", "README.txt"] + + # Sanitize owner and repo for URL + safe_owner = sanitize_url_component(owner) + safe_repo = sanitize_url_component(repo) + + for readme_name in readme_files: + safe_readme = sanitize_url_component(readme_name) + raw_url = f"https://raw.githubusercontent.com/{safe_owner}/{safe_repo}/main/{safe_readme}" + + if not _is_url_allowed(raw_url, allowlist): + continue + + try: + with urllib.request.urlopen(raw_url, timeout=5) as response: + return response.read().decode("utf-8") + except urllib.error.HTTPError: + # Try master branch + raw_url = f"https://raw.githubusercontent.com/{safe_owner}/{safe_repo}/master/{safe_readme}" + try: + with urllib.request.urlopen(raw_url, timeout=5) as response: + return response.read().decode("utf-8") + except urllib.error.URLError: + continue + except urllib.error.URLError: + continue + + return None + + +def _is_url_allowed(url: str, allowlist: List[str]) -> bool: + """Check if URL is in the allowlist.""" + for allowed in allowlist: + if url.startswith(allowed): + return True + return False + + +def _enhance_npm_readme(data: Dict[str, Any], readme: str) -> str: + """Enhance minimal npm README with package metadata.""" + enhanced = readme if readme else "" + + # Add description if not in README + description = data.get("description", "") + if description and description not in enhanced: + enhanced = f"{description}\n\n{enhanced}" + + # Add keywords + keywords = data.get("keywords", []) + if keywords and "keywords" not in enhanced.lower(): + enhanced += f"\n\n## Keywords\n\n{', '.join(keywords)}" + + # Add main entry point info + main = data.get("main", "") + if main: + enhanced += f"\n\n## Entry Point\n\nMain file: `{main}`" + + # Add dependencies info if substantial + deps = data.get("dependencies", {}) + if len(deps) > 0 and len(deps) <= 10: # Only if reasonable number + enhanced += "\n\n## Dependencies\n\n" + for dep, ver in deps.items(): + enhanced += f"- {dep}: {ver}\n" + + return enhanced + + +def _fetch_readthedocs(url: str, allowlist: List[str]) -> Optional[str]: + """ + Fetch documentation from ReadTheDocs. + Tries to get the main index page content. 
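
A hedged illustration of the URL normalization this helper performs before fetching (the project name is hypothetical):

```python
# Sketch of the /en/latest/ normalization applied below.
url = "https://myproject.readthedocs.io"
if not url.endswith("/"):
    url += "/"
if "/en/latest" not in url and "/en/stable" not in url:
    url = url.rstrip("/") + "/en/latest/"
assert url == "https://myproject.readthedocs.io/en/latest/"
```
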
+ """ + if not url or not _is_url_allowed(url, allowlist): + return None + + # Ensure we're getting the latest version + if not url.endswith("/"): + url += "/" + + # Try to fetch the main page + try: + # Add en/latest if not already in URL + if "/en/latest" not in url and "/en/stable" not in url: + url = url.rstrip("/") + "/en/latest/" + + with urllib.request.urlopen(url, timeout=10) as response: + html_content = response.read().decode("utf-8") + + # Basic HTML to markdown conversion (very simplified) + # Remove script and style tags + html_content = re.sub(r']*>.*?', '', html_content, flags=re.DOTALL) + html_content = re.sub(r']*>.*?', '', html_content, flags=re.DOTALL) + + # Extract main content (look for common RTD content divs) + content_match = re.search(r']*class="[^"]*document[^"]*"[^>]*>(.*?)', html_content, re.DOTALL) + if content_match: + html_content = content_match.group(1) + + # Convert basic HTML tags to markdown + html_content = re.sub(r']*>(.*?)', r'# \1\n', html_content) + html_content = re.sub(r']*>(.*?)', r'## \1\n', html_content) + html_content = re.sub(r']*>(.*?)', r'### \1\n', html_content) + html_content = re.sub(r']*>(.*?)', r'`\1`', html_content) + html_content = re.sub(r']*>(.*?)', r'```\n\1\n```', html_content, flags=re.DOTALL) + html_content = re.sub(r']*>(.*?)

', r'\1\n\n', html_content) + html_content = re.sub(r']*href="([^"]*)"[^>]*>(.*?)', r'[\2](\1)', html_content) + html_content = re.sub(r'<[^>]+>', '', html_content) # Remove remaining HTML tags + + # Clean up whitespace + html_content = re.sub(r'\n{3,}', '\n\n', html_content) + + return html_content.strip() + except Exception: + return None + + +def _fetch_pypi_web_readme(name: str, version: str, allowlist: List[str]) -> Optional[str]: + """ + Fetch the rendered README from PyPI's web interface. + The web interface shows the full README that's often missing from the API. + """ + # Validate package name + if not validate_package_name(name, "py"): + return None + + # Sanitize for URL + safe_name = sanitize_url_component(name) + safe_version = sanitize_url_component(version) + + # PyPI web URLs + urls_to_try = [ + f"https://pypi.org/project/{safe_name}/{safe_version}/", + f"https://pypi.org/project/{safe_name}/" + ] + + for url in urls_to_try: + if not _is_url_allowed(url, allowlist): + continue + + try: + req = urllib.request.Request(url, headers={ + 'User-Agent': 'Mozilla/5.0 (compatible; TheAuditor/1.0)' + }) + with urllib.request.urlopen(req, timeout=10) as response: + html_content = response.read().decode("utf-8") + + # Look for the project description div + # PyPI uses a specific class for the README content + readme_match = re.search( + r']*class="[^"]*project-description[^"]*"[^>]*>(.*?)', + html_content, + re.DOTALL | re.IGNORECASE + ) + + if not readme_match: + # Try alternative patterns + readme_match = re.search( + r']*class="[^"]*description[^"]*"[^>]*>(.*?)', + html_content, + re.DOTALL | re.IGNORECASE + ) + + if readme_match: + readme_html = readme_match.group(1) + + # Convert HTML to markdown (simplified) + # Headers + readme_html = re.sub(r']*>(.*?)', r'# \1\n', readme_html, flags=re.IGNORECASE) + readme_html = re.sub(r']*>(.*?)', r'## \1\n', readme_html, flags=re.IGNORECASE) + readme_html = re.sub(r']*>(.*?)', r'### \1\n', readme_html, flags=re.IGNORECASE) + + # Code blocks + readme_html = re.sub(r']*>]*>(.*?)', r'```\n\1\n```', readme_html, flags=re.DOTALL | re.IGNORECASE) + readme_html = re.sub(r']*>(.*?)', r'`\1`', readme_html, flags=re.IGNORECASE) + + # Lists + readme_html = re.sub(r']*>(.*?)', r'- \1\n', readme_html, flags=re.IGNORECASE) + + # Links + readme_html = re.sub(r']*href="([^"]*)"[^>]*>(.*?)', r'[\2](\1)', readme_html, flags=re.IGNORECASE) + + # Paragraphs and line breaks + readme_html = re.sub(r']*>(.*?)

', r'\1\n\n', readme_html, flags=re.DOTALL | re.IGNORECASE) + readme_html = re.sub(r']*>', '\n', readme_html, flags=re.IGNORECASE) + + # Remove remaining HTML tags + readme_html = re.sub(r'<[^>]+>', '', readme_html) + + # Decode HTML entities + readme_html = readme_html.replace('<', '<') + readme_html = readme_html.replace('>', '>') + readme_html = readme_html.replace('&', '&') + readme_html = readme_html.replace('"', '"') + readme_html = readme_html.replace(''', "'") + + # Clean up whitespace + readme_html = re.sub(r'\n{3,}', '\n\n', readme_html) + readme_html = readme_html.strip() + + if len(readme_html) > 100: # Only return if we got substantial content + return readme_html + except Exception: + continue + + return None + + +def _enhance_pypi_description(info: Dict[str, Any], description: str, summary: str) -> str: + """Enhance minimal PyPI description with package metadata.""" + enhanced = description if description else "" + + # Start with summary if description is empty + if not enhanced and summary: + enhanced = f"{summary}\n\n" + + # Add author info + author = info.get("author", "") + author_email = info.get("author_email", "") + if author and "author" not in enhanced.lower(): + author_info = f"\n\n## Author\n\n{author}" + if author_email: + author_info += f" ({author_email})" + enhanced += author_info + + # Add license + license_info = info.get("license", "") + if license_info and "license" not in enhanced.lower(): + enhanced += f"\n\n## License\n\n{license_info}" + + # Add classifiers (limited) + classifiers = info.get("classifiers", []) + relevant_classifiers = [ + c for c in classifiers + if "Programming Language" in c or "Framework" in c or "Topic" in c + ][:5] # Limit to 5 + if relevant_classifiers: + enhanced += "\n\n## Classifiers\n\n" + for classifier in relevant_classifiers: + enhanced += f"- {classifier}\n" + + # Add requires_python if specified + requires_python = info.get("requires_python", "") + if requires_python: + enhanced += f"\n\n## Python Version\n\nRequires Python {requires_python}" + + return enhanced + + +def check_latest( + deps: List[Dict[str, Any]], + allow_net: bool = True, + offline: bool = False, + output_path: str = "./.pf/deps_latest.json" +) -> Dict[str, Any]: + """ + Check latest versions and compare to locked versions. + + This is a wrapper around deps.check_latest_versions for consistency. 
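
A hedged usage sketch of this wrapper; the dependency is hypothetical and the result keys shown are those returned by the online path below.

```python
# Minimal sketch; requires network access since it queries the registries.
from theauditor.docs_fetch import check_latest

deps = [{"name": "flask", "version": "2.3.0", "manager": "py"}]
result = check_latest(deps, allow_net=True, offline=False)
print(result)  # e.g. {"mode": "online", "checked": 1, "outdated": ..., "output": "./.pf/deps_latest.json"}
```
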
+ """ + from .deps import check_latest_versions, write_deps_latest_json + + if offline or not allow_net: + return { + "mode": "offline", + "checked": 0, + "outdated": 0 + } + + latest_info = check_latest_versions(deps, allow_net=allow_net, offline=offline) + + if latest_info: + # Sanitize output path before writing + try: + safe_output_path = str(sanitize_path(output_path, ".")) + write_deps_latest_json(latest_info, safe_output_path) + except SecurityError as e: + return { + "mode": "error", + "error": f"Invalid output path: {e}", + "checked": 0, + "outdated": 0 + } + + outdated = sum(1 for info in latest_info.values() if info["is_outdated"]) + + return { + "mode": "online", + "checked": len(latest_info), + "outdated": outdated, + "output": output_path + } diff --git a/theauditor/docs_summarize.py b/theauditor/docs_summarize.py new file mode 100644 index 0000000..e39b69f --- /dev/null +++ b/theauditor/docs_summarize.py @@ -0,0 +1,408 @@ +"""Documentation summarizer for creating concise doc capsules.""" + +import json +import re +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional, Set + + +def summarize_docs( + docs_dir: str = "./.pf/context/docs", + output_dir: str = "./.pf/context/doc_capsules", + workset_path: Optional[str] = None, + max_capsule_lines: int = 50 +) -> Dict[str, Any]: + """ + Generate concise doc capsules from fetched documentation. + + Args: + docs_dir: Directory containing fetched docs + output_dir: Directory for output capsules + workset_path: Optional workset to filter relevant deps + max_capsule_lines: Maximum lines per capsule + + Returns: + Summary statistics + """ + docs_path = Path(docs_dir) + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + # Load workset if provided + relevant_deps = None + if workset_path and Path(workset_path).exists(): + relevant_deps = _load_workset_deps(workset_path) + + stats = { + "total_docs": 0, + "capsules_created": 0, + "skipped": 0, + "errors": [] + } + + capsules_index = [] + + # Process npm docs + npm_dir = docs_path / "npm" + if npm_dir.exists(): + for pkg_dir in npm_dir.iterdir(): + if not pkg_dir.is_dir(): + continue + + # Extract package name and version + pkg_info = pkg_dir.name # format: name@version + if "@" not in pkg_info: + stats["skipped"] += 1 + continue + + name_version = pkg_info.rsplit("@", 1) + if len(name_version) != 2: + stats["skipped"] += 1 + continue + + name, version = name_version + + # Check if in workset + if relevant_deps and f"npm:{name}" not in relevant_deps: + stats["skipped"] += 1 + continue + + stats["total_docs"] += 1 + + # Create capsule + doc_file = pkg_dir / "doc.md" + meta_file = pkg_dir / "meta.json" + + if doc_file.exists(): + try: + capsule = _create_capsule( + doc_file, meta_file, name, version, "npm", max_capsule_lines + ) + + # Write capsule + capsule_file = output_path / f"npm__{name}@{version}.md" + with open(capsule_file, "w", encoding="utf-8") as f: + f.write(capsule) + + capsules_index.append({ + "name": name, + "version": version, + "ecosystem": "npm", + "path": str(capsule_file.relative_to(output_path)) + }) + + stats["capsules_created"] += 1 + + except Exception as e: + stats["errors"].append(f"{name}@{version}: {str(e)}") + + # Process Python docs + py_dir = docs_path / "py" + if py_dir.exists(): + for pkg_dir in py_dir.iterdir(): + if not pkg_dir.is_dir(): + continue + + # Extract package name and version + pkg_info = pkg_dir.name # format: name@version + if "@" not in pkg_info: + 
stats["skipped"] += 1 + continue + + name_version = pkg_info.rsplit("@", 1) + if len(name_version) != 2: + stats["skipped"] += 1 + continue + + name, version = name_version + + # Check if in workset + if relevant_deps and f"py:{name}" not in relevant_deps: + stats["skipped"] += 1 + continue + + stats["total_docs"] += 1 + + # Create capsule + doc_file = pkg_dir / "doc.md" + meta_file = pkg_dir / "meta.json" + + if doc_file.exists(): + try: + capsule = _create_capsule( + doc_file, meta_file, name, version, "py", max_capsule_lines + ) + + # Write capsule + capsule_file = output_path / f"py__{name}@{version}.md" + with open(capsule_file, "w", encoding="utf-8") as f: + f.write(capsule) + + capsules_index.append({ + "name": name, + "version": version, + "ecosystem": "py", + "path": str(capsule_file.relative_to(output_path)) + }) + + stats["capsules_created"] += 1 + + except Exception as e: + stats["errors"].append(f"{name}@{version}: {str(e)}") + + # Write index + index_file = output_path.parent / "doc_index.json" + with open(index_file, "w", encoding="utf-8") as f: + json.dump({ + "created_at": datetime.now().isoformat(), + "capsules": capsules_index, + "stats": stats + }, f, indent=2) + + return stats + + +def _load_workset_deps(workset_path: str) -> Set[str]: + """ + Load relevant dependencies from workset. + Returns set of "manager:name" keys. + """ + relevant = set() + + try: + with open(workset_path, encoding="utf-8") as f: + workset = json.load(f) + + # Extract imported packages from workset files + # This is a simplified version - would need more sophisticated parsing + for file_info in workset.get("files", []): + path = file_info.get("path", "") + + # Simple heuristic: look at file extension + if path.endswith((".js", ".ts", ".jsx", ".tsx")): + # Would parse imports/requires + # For now, include all npm deps + relevant.add("npm:*") + elif path.endswith(".py"): + # Would parse imports + # For now, include all py deps + relevant.add("py:*") + + except (json.JSONDecodeError, KeyError): + pass + + # If we couldn't determine specific deps, include all + if not relevant or "npm:*" in relevant or "py:*" in relevant: + return set() # Empty set means include all + + return relevant + + +def _create_capsule( + doc_file: Path, + meta_file: Path, + name: str, + version: str, + ecosystem: str, + max_lines: int +) -> str: + """Create a concise capsule from documentation.""" + + # Read documentation + with open(doc_file, encoding="utf-8") as f: + content = f.read() + + # Read metadata + meta = {} + if meta_file.exists(): + try: + with open(meta_file, encoding="utf-8") as f: + meta = json.load(f) + except json.JSONDecodeError: + pass + + # Extract key sections + sections = { + "init": _extract_initialization(content, ecosystem), + "apis": _extract_top_apis(content), + "examples": _extract_examples(content), + } + + # Build capsule + capsule_lines = [ + f"# {name}@{version} ({ecosystem})", + "", + "## Quick Start", + "" + ] + + if sections["init"]: + capsule_lines.extend(sections["init"][:10]) # Limit lines + capsule_lines.append("") + elif content: # If no structured init but has content, add some raw content + content_lines = content.split("\n")[:10] + capsule_lines.extend(content_lines) + capsule_lines.append("") + + if sections["apis"]: + capsule_lines.append("## Top APIs") + capsule_lines.append("") + capsule_lines.extend(sections["apis"][:15]) # Limit lines + capsule_lines.append("") + + if sections["examples"]: + capsule_lines.append("## Examples") + capsule_lines.append("") + 
capsule_lines.extend(sections["examples"][:15]) # Limit lines + capsule_lines.append("") + + # Add reference to full documentation + capsule_lines.append("## 📄 Full Documentation Available") + capsule_lines.append("") + # Calculate relative path from project root + full_doc_path = f"./.pf/context/docs/{ecosystem}/{name}@{version}/doc.md" + capsule_lines.append(f"**Full content**: `{full_doc_path}`") + + # Count lines in full doc if it exists + if doc_file.exists(): + try: + with open(doc_file, encoding="utf-8") as f: + line_count = len(f.readlines()) + capsule_lines.append(f"**Size**: {line_count} lines") + except Exception: + pass + + capsule_lines.append("") + + # Add source info + capsule_lines.append("## Source") + capsule_lines.append("") + capsule_lines.append(f"- URL: {meta.get('source_url', '')}") + capsule_lines.append(f"- Fetched: {meta.get('last_checked', '')}") + + # Truncate if too long + if len(capsule_lines) > max_lines: + # Keep the full doc reference even when truncating + keep_lines = capsule_lines[:max_lines-7] # Leave room for reference and truncation + ref_lines = [l for l in capsule_lines if "Full Documentation Available" in l or "Full content" in l or "Size" in l] + capsule_lines = keep_lines + ["", "...","(truncated)", ""] + ref_lines + + return "\n".join(capsule_lines) + + +def _extract_initialization(content: str, ecosystem: str) -> List[str]: + """Extract initialization/installation snippets.""" + lines = [] + + # Look for installation section + install_patterns = [ + r"## Install\w*", + r"## Getting Started", + r"## Quick Start", + r"### Install\w*", + ] + + for pattern in install_patterns: + match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE) + if match: + # Extract next code block + start = match.end() + code_match = re.search(r"```(\w*)\n(.*?)```", content[start:], re.DOTALL) + if code_match: + lines.append(f"```{code_match.group(1)}") + lines.extend(code_match.group(2).strip().split("\n")[:5]) + lines.append("```") + break + + # Fallback: look for common patterns + if not lines: + if ecosystem == "npm": + if "require(" in content: + match = re.search(r"(const|var|let)\s+\w+\s*=\s*require\([^)]+\)", content) + if match: + lines = ["```javascript", match.group(0), "```"] + elif "import " in content: + match = re.search(r"import\s+.*?from\s+['\"][^'\"]+['\"]", content) + if match: + lines = ["```javascript", match.group(0), "```"] + elif ecosystem == "py": + if "import " in content: + match = re.search(r"import\s+\w+", content) + if match: + lines = ["```python", match.group(0), "```"] + elif "from " in content: + match = re.search(r"from\s+\w+\s+import\s+\w+", content) + if match: + lines = ["```python", match.group(0), "```"] + + return lines + + +def _extract_top_apis(content: str) -> List[str]: + """Extract top API methods.""" + lines = [] + + # Look for API section + api_patterns = [ + r"## API", + r"## Methods", + r"## Functions", + r"### API", + ] + + for pattern in api_patterns: + match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE) + if match: + start = match.end() + # Extract next few method signatures + method_matches = re.findall( + r"^[\*\-]\s*`([^`]+)`", + content[start:start+2000], + re.MULTILINE + ) + for method in method_matches[:5]: # Top 5 methods + lines.append(f"- `{method}`") + break + + # Fallback: look for function definitions in code blocks + if not lines: + code_blocks = re.findall(r"```\w*\n(.*?)```", content, re.DOTALL) + for block in code_blocks[:2]: # Check first 2 code blocks + # Look for function 
signatures + funcs = re.findall(r"(?:function|def|const|let|var)\s+(\w+)\s*\(([^)]*)\)", block) + for func_name, params in funcs[:5]: + lines.append(f"- `{func_name}({params})`") + if lines: + break + + return lines + + +def _extract_examples(content: str) -> List[str]: + """Extract usage examples.""" + lines = [] + + # Look for examples section + example_patterns = [ + r"## Example", + r"## Usage", + r"### Example", + r"### Usage", + ] + + for pattern in example_patterns: + match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE) + if match: + start = match.end() + # Extract next code block + code_match = re.search(r"```(\w*)\n(.*?)```", content[start:], re.DOTALL) + if code_match: + lang = code_match.group(1) or "javascript" + code_lines = code_match.group(2).strip().split("\n")[:10] # Max 10 lines + lines.append(f"```{lang}") + lines.extend(code_lines) + lines.append("```") + break + + return lines \ No newline at end of file diff --git a/theauditor/extraction.py b/theauditor/extraction.py new file mode 100644 index 0000000..4d08c9f --- /dev/null +++ b/theauditor/extraction.py @@ -0,0 +1,493 @@ +"""Extraction module - pure courier model for data chunking. + +This module implements the courier model: takes raw tool output and chunks it +into manageable pieces for AI processing WITHOUT any filtering or interpretation. + +Pure Courier Principles: +- NO filtering by severity or importance +- NO deduplication or sampling +- NO interpretation of findings +- ONLY chunks files if they exceed 65KB +- ALL data preserved exactly as generated + +The AI consumer decides what's important, not TheAuditor. +""" + +import json +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple +from collections import defaultdict +from theauditor.config_runtime import load_runtime_config + + +# DELETED: All smart extraction functions removed +# Pure courier model - no filtering, only chunking if needed + + +def _chunk_large_file(raw_path: Path, max_chunk_size: Optional[int] = None) -> Optional[List[Tuple[Path, int]]]: + """Split large files into chunks of configured max size.""" + # Load config if not provided + if max_chunk_size is None: + config = load_runtime_config() + max_chunk_size = config["limits"]["max_chunk_size"] + + # Get max chunks per file from config + config = load_runtime_config() + max_chunks_per_file = config["limits"]["max_chunks_per_file"] + + chunks = [] + try: + # Handle non-JSON files (like .dot, .txt, etc.) 
+ if raw_path.suffix != '.json': + # Read as text and chunk if needed + with open(raw_path, 'r', encoding='utf-8', errors='ignore') as f: + content = f.read() + + # Check if file needs chunking + if len(content) <= max_chunk_size: + # Small enough, copy as-is + output_path = raw_path.parent.parent / 'readthis' / raw_path.name + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, 'w', encoding='utf-8') as f: + f.write(content) + size = output_path.stat().st_size + print(f" [COPIED] {raw_path.name} → {output_path.name} ({size:,} bytes)") + return [(output_path, size)] + else: + # Need to chunk text file + base_name = raw_path.stem + ext = raw_path.suffix + chunk_num = 0 + position = 0 + + while position < len(content) and chunk_num < max_chunks_per_file: + chunk_num += 1 + chunk_end = min(position + max_chunk_size, len(content)) + chunk_content = content[position:chunk_end] + + output_path = raw_path.parent.parent / 'readthis' / f"{base_name}_chunk{chunk_num:02d}{ext}" + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, 'w', encoding='utf-8') as f: + f.write(chunk_content) + size = output_path.stat().st_size + chunks.append((output_path, size)) + print(f" [CHUNKED] {raw_path.name} → {output_path.name} ({size:,} bytes)") + + position = chunk_end + + if position < len(content): + print(f" [TRUNCATED] {raw_path.name} - stopped at {max_chunks_per_file} chunks") + + return chunks + + # Handle JSON files + with open(raw_path, 'r', encoding='utf-8') as f: + data = json.load(f) + + # Check if file needs chunking + full_json = json.dumps(data, indent=2) + if len(full_json) <= max_chunk_size: + # Small enough, copy as-is + output_path = raw_path.parent.parent / 'readthis' / raw_path.name + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, 'w', encoding='utf-8') as f: + f.write(full_json) + size = output_path.stat().st_size + print(f" [COPIED] {raw_path.name} → {output_path.name} ({size:,} bytes)") + return [(output_path, size)] + + # File needs chunking + base_name = raw_path.stem + ext = raw_path.suffix + + # Handle different data structures + if isinstance(data, list): + # For lists, chunk by items + chunk_num = 0 + current_chunk = [] + current_size = 100 # Account for JSON structure overhead + + for item in data: + item_json = json.dumps(item, indent=2) + item_size = len(item_json) + + if current_size + item_size > max_chunk_size and current_chunk: + # Check chunk limit + if chunk_num >= max_chunks_per_file: + print(f" [TRUNCATED] {raw_path.name} - stopped at {max_chunks_per_file} chunks (would have created more)") + break + + # Write current chunk + chunk_num += 1 + output_path = raw_path.parent.parent / 'readthis' / f"{base_name}_chunk{chunk_num:02d}{ext}" + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(current_chunk, f, indent=2) + size = output_path.stat().st_size + chunks.append((output_path, size)) + print(f" [CHUNKED] {raw_path.name} → {output_path.name} ({size:,} bytes)") + + # Start new chunk + current_chunk = [item] + current_size = item_size + 100 + else: + current_chunk.append(item) + current_size += item_size + + # Write final chunk (only if under limit) + if current_chunk and chunk_num < max_chunks_per_file: + chunk_num += 1 + output_path = raw_path.parent.parent / 'readthis' / f"{base_name}_chunk{chunk_num:02d}{ext}" + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(current_chunk, f, indent=2) + size = 
output_path.stat().st_size + chunks.append((output_path, size)) + print(f" [CHUNKED] {raw_path.name} → {output_path.name} ({size:,} bytes)") + + elif isinstance(data, dict): + # For dicts with lists (like findings, paths), chunk the lists + # Determine the correct key to chunk on + if base_name == 'taint_analysis': + # For taint analysis, we need to merge ALL findings into one list + # because they're split across multiple keys + if 'taint_paths' in data or 'all_rule_findings' in data: + # Merge all findings into a single list for chunking + all_taint_items = [] + + # Add taint paths + if 'taint_paths' in data: + for item in data['taint_paths']: + item['finding_type'] = 'taint_path' + all_taint_items.append(item) + + # Add all rule findings + if 'all_rule_findings' in data: + for item in data['all_rule_findings']: + item['finding_type'] = 'rule_finding' + all_taint_items.append(item) + + # Add infrastructure issues only if they're different from all_rule_findings + # (to avoid duplicates when they're the same list) + if 'infrastructure_issues' in data: + # Check if they're different objects (not the same list) + if data['infrastructure_issues'] is not data.get('all_rule_findings'): + # Only add if they're actually different content + infra_set = {json.dumps(item, sort_keys=True) for item in data['infrastructure_issues']} + rules_set = {json.dumps(item, sort_keys=True) for item in data.get('all_rule_findings', [])} + if infra_set != rules_set: + for item in data['infrastructure_issues']: + item['finding_type'] = 'infrastructure' + all_taint_items.append(item) + + # Add paths (data flow paths) - these are often duplicates of taint_paths but may have extra info + if 'paths' in data: + # Check if different from taint_paths + paths_set = {json.dumps(item, sort_keys=True) for item in data['paths']} + taint_set = {json.dumps(item, sort_keys=True) for item in data.get('taint_paths', [])} + if paths_set != taint_set: + for item in data['paths']: + item['finding_type'] = 'path' + all_taint_items.append(item) + + # Add vulnerabilities - these are the final analyzed vulnerabilities + if 'vulnerabilities' in data: + for item in data['vulnerabilities']: + item['finding_type'] = 'vulnerability' + all_taint_items.append(item) + + # Create a new data structure with merged findings + data = { + 'success': data.get('success', True), + 'summary': data.get('summary', {}), + 'total_vulnerabilities': data.get('total_vulnerabilities', len(all_taint_items)), + 'sources_found': data.get('sources_found', 0), + 'sinks_found': data.get('sinks_found', 0), + 'merged_findings': all_taint_items + } + list_key = 'merged_findings' + else: + list_key = 'paths' + elif 'all_findings' in data: + # CRITICAL: FCE findings are pre-sorted by severity via finding_priority.py + # The order MUST be preserved during chunking to ensure critical issues + # appear in chunk01. DO NOT sort or shuffle these findings! 
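+ # Note: the chunking loop below walks this list in order, so chunk01 always carries the highest-severity findings.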
+ list_key = 'all_findings' + + # Log for verification + if data.get(list_key): + first_items = data[list_key][:3] if len(data[list_key]) >= 3 else data[list_key] + severities = [item.get('severity', 'unknown') for item in first_items] + print(f"[EXTRACTION] Processing FCE with {len(data[list_key])} pre-sorted findings") + print(f"[EXTRACTION] First 3 severities: {severities}") + elif 'findings' in data: + list_key = 'findings' + elif 'vulnerabilities' in data: + list_key = 'vulnerabilities' + elif 'issues' in data: + list_key = 'issues' + elif 'edges' in data: + list_key = 'edges' # For call_graph.json and import_graph.json + elif 'nodes' in data: + list_key = 'nodes' # For graph files with nodes + elif 'taint_paths' in data: + list_key = 'taint_paths' + elif 'paths' in data: + list_key = 'paths' + elif 'dependencies' in data: + list_key = 'dependencies' # For deps.json + elif 'files' in data: + list_key = 'files' # For file lists + elif 'results' in data: + list_key = 'results' # For analysis results + else: + list_key = None + + if list_key: + items = data.get(list_key, []) + + # Extract minimal metadata (don't duplicate everything) + metadata = {} + for key in ['success', 'summary', 'total_vulnerabilities', 'chunk_info']: + if key in data: + metadata[key] = data[key] + + # Calculate actual metadata size + metadata_json = json.dumps(metadata, indent=2) + metadata_size = len(metadata_json) + + chunk_num = 0 + chunk_items = [] + current_size = metadata_size + 200 # Actual metadata size + bracket overhead + + for item in items: + item_json = json.dumps(item, indent=2) + item_size = len(item_json) + + if current_size + item_size > max_chunk_size and chunk_items: + # Check chunk limit + if chunk_num >= max_chunks_per_file: + print(f" [TRUNCATED] {raw_path.name} - stopped at {max_chunks_per_file} chunks (would have created more)") + break + + # Write current chunk + chunk_num += 1 + chunk_data = metadata.copy() + chunk_data[list_key] = chunk_items + chunk_data['chunk_info'] = { + 'chunk_number': chunk_num, + 'total_items_in_chunk': len(chunk_items), + 'original_total_items': len(items), + 'list_key': list_key, + 'truncated': chunk_num >= max_chunks_per_file # Mark if this is the last allowed chunk + } + + output_path = raw_path.parent.parent / 'readthis' / f"{base_name}_chunk{chunk_num:02d}{ext}" + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(chunk_data, f, indent=2) + size = output_path.stat().st_size + chunks.append((output_path, size)) + print(f" [CHUNKED] {raw_path.name} → {output_path.name} ({len(chunk_items)} items, {size:,} bytes)") + + # Start new chunk + chunk_items = [item] + current_size = metadata_size + item_size + 200 + else: + chunk_items.append(item) + current_size += item_size + + # Write final chunk (only if under limit) + if chunk_items and chunk_num < max_chunks_per_file: + chunk_num += 1 + chunk_data = metadata.copy() + chunk_data[list_key] = chunk_items + chunk_data['chunk_info'] = { + 'chunk_number': chunk_num, + 'total_items_in_chunk': len(chunk_items), + 'original_total_items': len(items), + 'list_key': list_key, + 'truncated': False # This is the final chunk within limit + } + + output_path = raw_path.parent.parent / 'readthis' / f"{base_name}_chunk{chunk_num:02d}{ext}" + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(chunk_data, f, indent=2) + size = output_path.stat().st_size + chunks.append((output_path, size)) + print(f" [CHUNKED] {raw_path.name} → {output_path.name} 
({len(chunk_items)} items, {size:,} bytes)") + else: + # No recognized list key - shouldn't happen now with expanded list + # Log warning and copy as-is + print(f" [WARNING] No chunkable list found in {raw_path.name}, copying as-is") + output_path = raw_path.parent.parent / 'readthis' / raw_path.name + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2) + size = output_path.stat().st_size + chunks.append((output_path, size)) + print(f" [COPIED] {raw_path.name} → {output_path.name} ({size:,} bytes)") + + return chunks + + except Exception as e: + print(f" [ERROR] Failed to chunk {raw_path.name}: {e}") + return None # Return None to signal failure, not empty list + + +def _copy_as_is(raw_path: Path) -> Tuple[Optional[Path], int]: + """Copy small files as-is or chunk if >65KB.""" + chunks = _chunk_large_file(raw_path) + if chunks is None: + # Chunking failed + return None, -1 # Signal error with -1 + elif chunks: + # Return the first chunk info for compatibility + return chunks[0] if len(chunks) == 1 else (None, sum(s for _, s in chunks)) + return None, 0 + + +def extract_all_to_readthis(root_path_str: str, budget_kb: int = 1500) -> bool: + """Main function for extracting readthis chunks from raw data. + + Implements intelligent extraction with prioritization to stay within + budget while preserving all critical security findings. + + Args: + root_path_str: Root directory path as string + budget_kb: Maximum total size in KB for all readthis files (default 1000KB) + + Returns: + True if extraction completed successfully, False otherwise + """ + root_path = Path(root_path_str) + raw_dir = root_path / ".pf" / "raw" + readthis_dir = root_path / ".pf" / "readthis" + + print("\n" + "="*60) + print("[EXTRACTION] Smart extraction with 1MB budget") + print("="*60) + + # Check if raw directory exists + if not raw_dir.exists(): + print(f"[WARNING] Raw directory does not exist: {raw_dir}") + print("[INFO] No raw data to extract - skipping extraction phase") + return True + + # Ensure readthis directory exists + try: + readthis_dir.mkdir(parents=True, exist_ok=True) + print(f"[OK] Readthis directory ready: {readthis_dir}") + except Exception as e: + print(f"[ERROR] Failed to create readthis directory: {e}") + return False + + # Discover ALL files in raw directory dynamically (courier model) + raw_files = [] + for file_path in raw_dir.iterdir(): + if file_path.is_file(): + raw_files.append(file_path.name) + + print(f"[DISCOVERED] Found {len(raw_files)} files in raw directory") + + # Pure courier model - no smart extraction, just chunking if needed + # Build extraction strategy dynamically + extraction_strategy = [] + for filename in sorted(raw_files): + # All files get same treatment: chunk if needed + extraction_strategy.append((filename, 100, _copy_as_is)) + + total_budget = budget_kb * 1024 # Convert to bytes + total_used = 0 + extracted_files = [] + skipped_files = [] + failed_files = [] # Track failures + + print(f"[BUDGET] Total budget: {budget_kb}KB ({total_budget:,} bytes)") + print(f"[STRATEGY] Pure courier model - no filtering\n") + + for filename, file_budget_kb, extractor in extraction_strategy: + raw_path = raw_dir / filename + + if not raw_path.exists(): + continue + + print(f"[PROCESSING] {filename}") + + # Just chunk everything - ignore budget for chunking + # The whole point is to break large files into manageable pieces + chunks = _chunk_large_file(raw_path) + + if chunks is None: + # Chunking failed 
for this file + print(f" [FAILED] {filename} - chunking error") + failed_files.append(filename) + continue + + if chunks: + for chunk_path, chunk_size in chunks: + # Optionally check budget per chunk (or ignore completely) + if total_used + chunk_size > total_budget: + # Could skip remaining chunks or just ignore budget + # For now, let's just ignore budget and extract everything + pass + + total_used += chunk_size + extracted_files.append((chunk_path.name, chunk_size)) + + # Create extraction summary + summary = { + 'extraction_timestamp': str(Path(root_path_str).stat().st_mtime), + 'budget_kb': budget_kb, + 'total_used_bytes': total_used, + 'total_used_kb': total_used // 1024, + 'utilization_percent': (total_used / total_budget) * 100, + 'files_extracted': len(extracted_files), + 'files_skipped': len(skipped_files), + 'files_failed': len(failed_files), + 'extracted': [{'file': f, 'size': s} for f, s in extracted_files], + 'skipped': skipped_files, + 'failed': failed_files, + 'strategy': 'Pure courier model - chunk if needed, no filtering' + } + + summary_path = readthis_dir / 'extraction_summary.json' + with open(summary_path, 'w', encoding='utf-8') as f: + json.dump(summary, f, indent=2) + + # Summary report + print("\n" + "="*60) + print("[EXTRACTION COMPLETE]") + print(f" Files extracted: {len(extracted_files)}") + print(f" Files skipped: {len(skipped_files)}") + print(f" Files failed: {len(failed_files)}") + print(f" Total size: {total_used:,} bytes ({total_used//1024}KB)") + print(f" Budget used: {(total_used/total_budget)*100:.1f}%") + print(f" Summary saved: {summary_path}") + + # List what was extracted + print("\n[EXTRACTED FILES]") + for filename, size in extracted_files: + print(f" {filename:30} {size:8,} bytes ({size//1024:4}KB)") + + if skipped_files: + print("\n[SKIPPED FILES]") + for filename in skipped_files: + print(f" {filename}") + + if failed_files: + print("\n[FAILED FILES]") + for filename in failed_files: + print(f" {filename}") + + print("\n[KEY INSIGHTS]") + print(" ✓ All findings preserved - no filtering") + print(" ✓ Pure courier model - no interpretation") + print(" ✓ Files chunked only if >65KB") + print(" ✓ Complete data for AI consumption") + print("="*60) + + # Return False if any files failed, True only if all succeeded + if failed_files: + print(f"\n[ERROR] Extraction failed for {len(failed_files)} files") + return False + return True \ No newline at end of file diff --git a/theauditor/fce.py b/theauditor/fce.py new file mode 100644 index 0000000..5ddc3f8 --- /dev/null +++ b/theauditor/fce.py @@ -0,0 +1,784 @@ +"""Factual Correlation Engine - aggregates and correlates findings from all analysis tools.""" + +import json +import os +import re +import shlex +import sqlite3 +import subprocess +from collections import defaultdict, deque +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + +from theauditor.test_frameworks import detect_test_framework +from theauditor.correlations import CorrelationLoader + + + + +def scan_all_findings(raw_dir: Path) -> list[dict[str, Any]]: + """ + Scan ALL raw outputs for structured findings with line-level detail. + Extract findings from JSON outputs with file, line, rule, and tool information. 
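+ Each finding is standardized to the keys: file, line, rule, tool, message, severity.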
+ """ + all_findings = [] + + for output_file in raw_dir.glob('*.json'): + if not output_file.is_file(): + continue + + tool_name = output_file.stem + try: + with open(output_file, 'r', encoding='utf-8') as f: + data = json.load(f) + + # Handle different JSON structures based on tool + findings = [] + + # Standard findings structure (lint.json, patterns.json, etc.) + if isinstance(data, dict) and 'findings' in data: + findings = data['findings'] + # Vulnerabilities structure + elif isinstance(data, dict) and 'vulnerabilities' in data: + findings = data['vulnerabilities'] + # Taint analysis structure + elif isinstance(data, dict) and 'taint_paths' in data: + for path in data['taint_paths']: + # Create a finding for each taint path + if 'file' in path and 'line' in path: + findings.append({ + 'file': path['file'], + 'line': path['line'], + 'rule': f"taint-{path.get('sink_type', 'unknown')}", + 'message': path.get('message', 'Taint path detected') + }) + # Direct list of findings + elif isinstance(data, list): + findings = data + # RCA/test results structure + elif isinstance(data, dict) and 'failures' in data: + findings = data['failures'] + + # Process each finding + for finding in findings: + if isinstance(finding, dict): + # Ensure required fields exist + if 'file' in finding: + # Create standardized finding + standardized = { + 'file': finding.get('file', ''), + 'line': int(finding.get('line', 0)), + 'rule': finding.get('rule', finding.get('code', finding.get('pattern', 'unknown'))), + 'tool': finding.get('tool', tool_name), + 'message': finding.get('message', ''), + 'severity': finding.get('severity', 'warning') + } + all_findings.append(standardized) + + except (json.JSONDecodeError, KeyError, TypeError): + # Skip files that can't be parsed as JSON or don't have expected structure + continue + except Exception: + # Skip files with other errors + continue + + return all_findings + + +def run_tool(command: str, root_path: str, timeout: int = 600) -> tuple[int, str, str]: + """Run build/test tool with timeout and capture output.""" + try: + # Use deque as ring buffer to limit memory usage + max_lines = 10000 + stdout_buffer = deque(maxlen=max_lines) + stderr_buffer = deque(maxlen=max_lines) + + # Run command - safely split command string into arguments + cmd_args = shlex.split(command) + + # Write directly to temp files to avoid buffer overflow + import tempfile + with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='_stdout.txt') as out_tmp, \ + tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='_stderr.txt') as err_tmp: + + process = subprocess.Popen( + cmd_args, + cwd=root_path, + stdout=out_tmp, + stderr=err_tmp, + text=True, + env={**os.environ, "CI": "true"}, # Set CI env for tools + ) + + stdout_file = out_tmp.name + stderr_file = err_tmp.name + + # Stream output with timeout + try: + process.communicate(timeout=timeout) + + # Read back the outputs + with open(stdout_file, 'r') as f: + stdout = f.read() + with open(stderr_file, 'r') as f: + stderr = f.read() + + # Clean up temp files + os.unlink(stdout_file) + os.unlink(stderr_file) + + # Append any errors to the global error.log + if stderr: + from pathlib import Path + error_log = Path(root_path) / ".pf" / "error.log" + error_log.parent.mkdir(parents=True, exist_ok=True) + with open(error_log, 'a') as f: + f.write(f"\n=== RCA Subprocess Error ({command[:50]}) ===\n") + f.write(f"Timestamp: {datetime.now().isoformat()}\n") + f.write(stderr) + f.write("\n") + # Store in buffers + 
stdout_buffer.extend(stdout.splitlines()) + stderr_buffer.extend(stderr.splitlines()) + except subprocess.TimeoutExpired: + process.kill() + return 124, "Process timed out", f"Command exceeded {timeout}s timeout" + + # Join lines + stdout_text = "\n".join(stdout_buffer) + stderr_text = "\n".join(stderr_buffer) + + return process.returncode, stdout_text, stderr_text + + except Exception as e: + return 1, "", str(e) + + +def parse_typescript_errors(output: str) -> list[dict[str, Any]]: + """Parse TypeScript/TSNode compiler errors.""" + errors = [] + + # TypeScript error format: file:line:col - error CODE: message + pattern = ( + r"(?P<file>[^:\n]+):(?P<line>\d+):(?P<col>\d+) - error (?P<code>[A-Z]+\d+): (?P<msg>.+)" + ) + + for match in re.finditer(pattern, output): + errors.append( + { + "tool": "tsc", + "file": match.group("file"), + "line": int(match.group("line")), + "column": int(match.group("col")), + "message": match.group("msg"), + "code": match.group("code"), + "category": "type_error", + } + ) + + return errors + + +def parse_jest_errors(output: str) -> list[dict[str, Any]]: + """Parse Jest/Vitest test failures.""" + errors = [] + + # Jest failed test: ● Test Suite Name › test name + # Followed by stack trace: at Object.<anonymous> (file:line:col) + test_pattern = r"● (?P<testname>[^\n]+)" + stack_pattern = r"at .*? \((?P<file>[^:]+):(?P<line>\d+):(?P<col>\d+)\)" + + lines = output.splitlines() + for i, line in enumerate(lines): + test_match = re.match(test_pattern, line) + if test_match: + # Look for stack trace in next lines + for j in range(i + 1, min(i + 20, len(lines))): + stack_match = re.search(stack_pattern, lines[j]) + if stack_match: + errors.append( + { + "tool": "jest", + "file": stack_match.group("file"), + "line": int(stack_match.group("line")), + "column": int(stack_match.group("col")), + "message": f"Test failed: {test_match.group('testname')}", + "category": "test_failure", + } + ) + break + + return errors + + +def parse_pytest_errors(output: str) -> list[dict[str, Any]]: + """Parse pytest failures.""" + errors = [] + + # Pytest error format varies, but typically: + # FAILED path/to/test.py::TestClass::test_method - AssertionError: message + # Or: E AssertionError: message + # path/to/file.py:42: AssertionError + + failed_pattern = r"FAILED (?P<file>[^:]+)(?:::(?P<test>[^\s]+))?
- (?P<msg>.+)" + error_pattern = r"^E\s+(?P<msg>.+)\n.*?(?P<file>[^:]+):(?P<line>\d+):" + + for match in re.finditer(failed_pattern, output): + errors.append( + { + "tool": "pytest", + "file": match.group("file"), + "line": 0, # Line not in FAILED format + "message": match.group("msg"), + "category": "test_failure", + } + ) + + for match in re.finditer(error_pattern, output, re.MULTILINE): + errors.append( + { + "tool": "pytest", + "file": match.group("file"), + "line": int(match.group("line")), + "message": match.group("msg"), + "category": "test_failure", + } + ) + + return errors + + +def parse_python_compile_errors(output: str) -> list[dict[str, Any]]: + """Parse Python compilation errors from py_compile output.""" + errors = [] + + # Python compile error format: + # Traceback (most recent call last): + # File "path/to/file.py", line X, in <module> + # SyntaxError: invalid syntax + # Or: ModuleNotFoundError: No module named 'xxx' + + # Parse traceback format + lines = output.splitlines() + for i, line in enumerate(lines): + # Look for File references in tracebacks + if 'File "' in line and '", line ' in line: + # Extract file and line number + match = re.match(r'.*File "([^"]+)", line (\d+)', line) + if match and i + 1 < len(lines): + file_path = match.group(1) + line_num = int(match.group(2)) + + # Look for the error type in following lines + for j in range(i + 1, min(i + 5, len(lines))): + if 'Error:' in lines[j]: + error_msg = lines[j].strip() + errors.append({ + "tool": "py_compile", + "file": file_path, + "line": line_num, + "message": error_msg, + "category": "compile_error", + }) + break + + # Also catch simple error messages + if 'SyntaxError:' in line or 'ModuleNotFoundError:' in line or 'ImportError:' in line: + # Try to extract file info from previous lines + file_info = None + for j in range(max(0, i - 3), i): + if '***' in lines[j] and '.py' in lines[j]: + # py_compile format: *** path/to/file.py + file_match = re.match(r'\*\*\* (.+\.py)', lines[j]) + if file_match: + file_info = file_match.group(1) + break + + if file_info: + errors.append({ + "tool": "py_compile", + "file": file_info, + "line": 0, + "message": line.strip(), + "category": "compile_error", + }) + + return errors + + +def parse_errors(output: str, tool_name: str) -> list[dict[str, Any]]: + """Parse errors based on tool type.""" + all_errors = [] + + # Try all parsers + all_errors.extend(parse_typescript_errors(output)) + all_errors.extend(parse_jest_errors(output)) + all_errors.extend(parse_pytest_errors(output)) + all_errors.extend(parse_python_compile_errors(output)) + + return all_errors + + +def load_capsule(capsules_dir: str, file_hash: str) -> dict | None: + """Load capsule by file hash.""" + capsule_path = Path(capsules_dir) / f"{file_hash}.json" + if not capsule_path.exists(): + return None + + try: + with open(capsule_path) as f: + return json.load(f) + except json.JSONDecodeError: + return None + + + + +def correlate_failures( + errors: list[dict[str, Any]], + manifest_path: str, + workset_path: str, + capsules_dir: str, + db_path: str, +) -> list[dict[str, Any]]: + """Correlate failures with capsules for factual enrichment.""" + # Load manifest for hash lookup + file_hashes = {} + try: + with open(manifest_path) as f: + manifest = json.load(f) + for entry in manifest: + file_hashes[entry["path"]] = entry.get("sha256") + except (FileNotFoundError, json.JSONDecodeError): + pass + + # Load workset + workset_files = set() + try: + with open(workset_path) as f: + workset = json.load(f) + workset_files = {p["path"] for p in
workset.get("paths", [])} + except (FileNotFoundError, json.JSONDecodeError): + pass + + # Correlate each error + for error in errors: + file = error.get("file", "") + + # Load capsule if file in workset/manifest + if file in file_hashes: + file_hash = file_hashes[file] + capsule = load_capsule(capsules_dir, file_hash) + if capsule: + error["capsule"] = { + "path": capsule.get("path"), + "hash": capsule.get("sha256"), + "interfaces": capsule.get("interfaces", {}), + } + + + return errors + + +def generate_rca_json(failures: list[dict[str, Any]]) -> dict[str, Any]: + """Generate RCA JSON output.""" + return { + "completed_at": datetime.now(UTC).isoformat(), + "failures": failures, + } + + +def run_fce( + root_path: str = ".", + capsules_dir: str = "./.pf/capsules", + manifest_path: str = "manifest.json", + workset_path: str = "./.pf/workset.json", + db_path: str = "repo_index.db", + timeout: int = 600, + print_plan: bool = False, +) -> dict[str, Any]: + """Run factual correlation engine - NO interpretation, just facts.""" + try: + # Step A: Initialization + raw_dir = Path(root_path) / ".pf" / "raw" + results = { + "timestamp": datetime.now(UTC).isoformat(), + "all_findings": [], + "test_results": {}, + "correlations": {} + } + + # Step B: Phase 1 - Gather All Findings + if raw_dir.exists(): + results["all_findings"] = scan_all_findings(raw_dir) + + # Step B2: Load Optional Insights (ML predictions, etc.) + insights_dir = Path(root_path) / ".pf" / "insights" + if insights_dir.exists(): + # Load ML suggestions if available + ml_path = insights_dir / "ml_suggestions.json" + if ml_path.exists(): + try: + with open(ml_path) as f: + ml_data = json.load(f) + + # Convert ML predictions to correlatable findings + # ML has separate lists for root causes, risk scores, etc. 
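+ # Only predictions with score > 0.7 are promoted to findings; line stays 0 because these predictions are file-level.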
+ for root_cause in ml_data.get("likely_root_causes", [])[:5]: # Top 5 root causes + if root_cause.get("score", 0) > 0.7: + results["all_findings"].append({ + "file": root_cause["path"], + "line": 0, # ML doesn't provide line-level predictions + "rule": "ML_ROOT_CAUSE", + "tool": "ml", + "message": f"ML predicts {root_cause['score']:.1%} probability as root cause", + "severity": "high" + }) + + for risk_item in ml_data.get("risk", [])[:5]: # Top 5 risky files + if risk_item.get("score", 0) > 0.7: + results["all_findings"].append({ + "file": risk_item["path"], + "line": 0, + "rule": f"ML_RISK_{int(risk_item['score']*100)}", + "tool": "ml", + "message": f"ML predicts {risk_item['score']:.1%} risk score", + "severity": "high" if risk_item.get("score", 0) > 0.85 else "medium" + }) + except (json.JSONDecodeError, KeyError): + pass # ML insights are optional, continue if they fail + + # Load taint severity insights if available + taint_severity_path = insights_dir / "taint_severity.json" + if taint_severity_path.exists(): + try: + with open(taint_severity_path) as f: + taint_data = json.load(f) + + # Add severity-enhanced taint findings + for item in taint_data.get("severity_analysis", []): + if item.get("severity") in ["critical", "high"]: + results["all_findings"].append({ + "file": item.get("file", ""), + "line": item.get("line", 0), + "rule": f"TAINT_{item.get('vulnerability_type', 'UNKNOWN').upper().replace(' ', '_')}", + "tool": "taint-insights", + "message": f"{item.get('vulnerability_type')} with {item.get('severity')} severity", + "severity": item.get("severity") + }) + except (json.JSONDecodeError, KeyError): + pass # Insights are optional + + # Step C: Phase 2 - Execute Tests + # Detect test framework + framework_info = detect_test_framework(root_path) + + tools = [] + if framework_info["name"] != "unknown" and framework_info["cmd"]: + command = framework_info["cmd"] + + # Add quiet flags + if "pytest" in command: + command = "pytest -q -p no:cacheprovider" + elif "npm test" in command: + command = "npm test --silent" + elif "unittest" in command: + command = "python -m unittest discover -q" + + tools.append({ + "name": framework_info["name"], + "command": command, + "type": "test" + }) + + # Check for build scripts + package_json = Path(root_path) / "package.json" + if package_json.exists(): + try: + with open(package_json) as f: + package = json.load(f) + scripts = package.get("scripts", {}) + if "build" in scripts: + tools.append({ + "name": "npm build", + "command": "npm run build --silent", + "type": "build" + }) + except json.JSONDecodeError: + pass + + if print_plan: + print("Detected tools:") + for tool in tools: + print(f" - {tool['name']}: {tool['command']}") + return {"success": True, "printed_plan": True} + + if not tools: + tools = [] # No test tools, continue processing + + # Run tools and collect failures + all_failures = [] + + for tool in tools: + print(f"Running {tool['name']}...") + exit_code, stdout, stderr = run_tool(tool["command"], root_path, timeout) + + if exit_code != 0: + output = stdout + "\n" + stderr + errors = parse_errors(output, tool["name"]) + + # Special handling for pytest collection failures + if tool["name"] == "pytest" and exit_code == 2 and "ERROR collecting" in output: + print("Pytest collection failed. 
Falling back to Python compilation check...") + + py_files = [] + for py_file in Path(root_path).rglob("*.py"): + if "__pycache__" not in str(py_file) and not any(part.startswith('.') for part in py_file.parts): + py_files.append(str(py_file.relative_to(root_path))) + + if py_files: + print(f"Checking {len(py_files)} Python files for compilation errors...") + compile_errors = [] + + for py_file in py_files[:50]: + module_path = str(Path(py_file).as_posix()).replace('/', '.').replace('.py', '') + import_cmd = f'python3 -c "import {module_path}"' + comp_exit, comp_out, comp_err = run_tool(import_cmd, root_path, 10) + + if comp_exit != 0: + comp_output = comp_out + "\n" + comp_err + if comp_output.strip(): + error_lines = comp_output.strip().split('\n') + error_msg = "Import failed" + + for line in error_lines: + if 'ModuleNotFoundError:' in line: + error_msg = line.strip() + break + elif 'ImportError:' in line: + error_msg = line.strip() + break + elif 'SyntaxError:' in line: + error_msg = line.strip() + break + elif 'AttributeError:' in line: + error_msg = line.strip() + break + + compile_errors.append({ + "tool": "py_import", + "file": py_file, + "line": 0, + "message": error_msg, + "category": "compile_error", + }) + + if compile_errors: + print(f"Found {len(compile_errors)} compilation errors") + errors.extend(compile_errors) + + # If no errors parsed, create generic one + if not errors and exit_code != 0: + errors.append({ + "tool": tool["name"], + "file": "unknown", + "line": 0, + "message": f"Tool failed with exit code {exit_code}", + "category": "runtime", + }) + + all_failures.extend(errors) + + # Correlate with capsules + all_failures = correlate_failures( + all_failures, + Path(root_path) / manifest_path, + Path(root_path) / workset_path, + Path(root_path) / capsules_dir, + Path(root_path) / db_path, + ) + + # Store test results + results["test_results"] = { + "completed_at": datetime.now(UTC).isoformat(), + "failures": all_failures, + "tools_run": len(tools) + } + + # Step D: Consolidate Evidence + consolidated_findings = results["all_findings"].copy() + + # Add test failures to consolidated list + if all_failures: + for failure in all_failures: + if 'file' in failure and 'line' in failure: + consolidated_findings.append({ + 'file': failure['file'], + 'line': int(failure.get('line', 0)), + 'rule': failure.get('code', failure.get('category', 'test-failure')), + 'tool': failure.get('tool', 'test'), + 'message': failure.get('message', ''), + 'severity': failure.get('severity', 'error') + }) + + # Step E: Phase 3 - Line-Level Correlation (Hotspots) + # Group findings by file:line + line_groups = defaultdict(list) + for finding in consolidated_findings: + if finding['line'] > 0: + key = f"{finding['file']}:{finding['line']}" + line_groups[key].append(finding) + + # Find hotspots + hotspots = {} + for line_key, findings in line_groups.items(): + tools_on_line = set(f['tool'] for f in findings) + if len(tools_on_line) > 1: + hotspots[line_key] = findings + + # Enrich hotspots with symbol context + full_db_path = Path(root_path) / db_path + if hotspots and full_db_path.exists(): + try: + conn = sqlite3.connect(str(full_db_path)) + cursor = conn.cursor() + + enriched_hotspots = {} + for line_key, findings in hotspots.items(): + if ':' in line_key: + file_path, line_str = line_key.rsplit(':', 1) + try: + line_num = int(line_str) + + query = """ + SELECT name, type, line + FROM symbols + WHERE file = ? + AND line <= ? 
+ AND type IN ('function', 'class') + ORDER BY line DESC + LIMIT 1 + """ + cursor.execute(query, (file_path, line_num)) + result = cursor.fetchone() + + hotspot_data = {"findings": findings} + + if result: + symbol_name, symbol_type, symbol_line = result + hotspot_data["in_symbol"] = f"{symbol_type}: {symbol_name}" + + enriched_hotspots[line_key] = hotspot_data + except (ValueError, TypeError): + enriched_hotspots[line_key] = {"findings": findings} + else: + enriched_hotspots[line_key] = {"findings": findings} + + conn.close() + hotspots = enriched_hotspots + except (sqlite3.Error, Exception): + hotspots = {k: {"findings": v} for k, v in hotspots.items()} + else: + hotspots = {k: {"findings": v} for k, v in hotspots.items()} + + # Store hotspots in correlations + results["correlations"]["hotspots"] = hotspots + results["correlations"]["total_findings"] = len(consolidated_findings) + results["correlations"]["total_lines_with_findings"] = len(line_groups) + results["correlations"]["total_hotspots"] = len(hotspots) + + # Step F: Phase 4 - Factual Cluster Detection + factual_clusters = [] + + # Load correlation rules + correlation_loader = CorrelationLoader() + correlation_rules = correlation_loader.load_rules() + + if correlation_rules and consolidated_findings: + # Group findings by file + findings_by_file = defaultdict(list) + for finding in consolidated_findings: + if 'file' in finding: + findings_by_file[finding['file']].append(finding) + + # Check each file against each rule + for file_path, file_findings in findings_by_file.items(): + for rule in correlation_rules: + all_facts_matched = True + + for fact_index, fact in enumerate(rule.co_occurring_facts): + fact_matched = False + for finding in file_findings: + if rule.matches_finding(finding, fact_index): + fact_matched = True + break + + if not fact_matched: + all_facts_matched = False + break + + if all_facts_matched: + factual_clusters.append({ + "name": rule.name, + "file": file_path, + "description": rule.description, + "confidence": rule.confidence + }) + + # Store factual clusters + results["correlations"]["factual_clusters"] = factual_clusters + + # Step G: Finalization - Apply intelligent organization sorting + from theauditor.utils.finding_priority import sort_findings, normalize_severity + + # CRITICAL: Normalize all severities BEFORE sorting + # This handles Docker's integer severity and ESLint's "error" strings + if results.get("all_findings"): + # First pass: normalize severity in-place + for finding in results["all_findings"]: + original_severity = finding.get("severity") + finding["severity"] = normalize_severity(original_severity) + + # Debug log for unusual severities (helps catch new formats) + if original_severity and str(original_severity) != finding["severity"]: + if isinstance(original_severity, int): + # Expected for Docker, don't log + pass + else: + print(f"[FCE] Normalized severity: {original_severity} -> {finding['severity']}") + + # Second pass: sort using centralized logic + results["all_findings"] = sort_findings(results["all_findings"]) + + # Log sorting results for verification + if results["all_findings"]: + print(f"[FCE] Sorted {len(results['all_findings'])} findings") + first = results["all_findings"][0] + last = results["all_findings"][-1] if len(results["all_findings"]) > 1 else first + print(f"[FCE] First: {first.get('severity')} from {first.get('tool')}") + print(f"[FCE] Last: {last.get('severity')} from {last.get('tool')}") + + # Write results to JSON + raw_dir.mkdir(parents=True, exist_ok=True) 
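+ # fce.json carries the full correlation output: timestamp, the severity-sorted all_findings list, test_results, and correlations (hotspots, factual_clusters, totals).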
+ fce_path = raw_dir / "fce.json" + fce_path.write_text(json.dumps(results, indent=2)) + + # Count total failures/findings + failures_found = len(results.get("all_findings", [])) + + # Return success structure + return { + "success": True, + "failures_found": failures_found, + "output_files": [str(fce_path)], + "results": results + } + + except Exception as e: + # Step H: Error Handling + return { + "success": False, + "failures_found": 0, + "error": str(e) + } diff --git a/theauditor/framework_detector.py b/theauditor/framework_detector.py new file mode 100644 index 0000000..e0cbee8 --- /dev/null +++ b/theauditor/framework_detector.py @@ -0,0 +1,608 @@ +"""Framework detection for various languages and ecosystems.""" + +import json +import re +import glob +from pathlib import Path +from typing import Any +from theauditor.manifest_parser import ManifestParser +from theauditor.framework_registry import FRAMEWORK_REGISTRY + + +class FrameworkDetector: + """Detects frameworks and libraries used in a project.""" + + # Note: Framework detection now uses the centralized FRAMEWORK_REGISTRY + # from framework_registry.py instead of the old FRAMEWORK_SIGNATURES + + def __init__(self, project_path: Path, exclude_patterns: list[str] = None): + """Initialize detector with project path. + + Args: + project_path: Root directory of the project. + exclude_patterns: List of patterns to exclude from scanning. + """ + self.project_path = Path(project_path) + self.detected_frameworks = [] + self.deps_cache = None + self.exclude_patterns = exclude_patterns or [] + + def detect_all(self) -> list[dict[str, Any]]: + """Detect all frameworks in the project. + + Returns: + List of detected framework info dictionaries. + """ + self.detected_frameworks = [] + + # Load TheAuditor's deps.json if available for better version info + self._load_deps_cache() + + # Use unified manifest detection + self._detect_from_manifests() + + # Also detect from monorepo workspaces (keep existing logic) + self._detect_from_workspaces() + + # Store frameworks found in manifests for version lookup + manifest_frameworks = {} + for fw in self.detected_frameworks: + if fw["source"] != "imports": + key = (fw["framework"], fw["language"]) + manifest_frameworks[key] = fw["version"] + + # DISABLED: Import scanning causes too many false positives + # It detects framework names in strings, comments, and detection code itself + # Real dependencies should be in manifest files (package.json, requirements.txt, etc.) 
+ # self._scan_source_imports() + + # Check for framework-specific files + self._check_framework_files() + + # Update versions for frameworks detected from framework files only (imports disabled) + for fw in self.detected_frameworks: + if fw["version"] == "unknown" and fw["source"] == "framework_files": + key = (fw["framework"], fw["language"]) + # First try manifest frameworks + if key in manifest_frameworks: + fw["version"] = manifest_frameworks[key] + fw["source"] = f"{fw['source']} (version from manifest)" + # Then try deps cache + elif self.deps_cache and fw["framework"] in self.deps_cache: + cached_dep = self.deps_cache[fw["framework"]] + manager = cached_dep.get("manager", "") + # Match language to manager (py -> python, npm -> javascript) + if (fw["language"] == "python" and manager == "py") or \ + (fw["language"] in ["javascript", "typescript"] and manager == "npm"): + fw["version"] = cached_dep.get("version", "") # Empty not unknown + if fw["version"] != "unknown": + fw["source"] = f"{fw['source']} (version from deps cache)" + + # Deduplicate results, preferring entries with known versions + # Now we keep framework+language+path as unique key to support monorepos + seen = {} + for fw in self.detected_frameworks: + key = (fw["framework"], fw["language"], fw.get("path", ".")) + if key not in seen: + seen[key] = fw + elif fw["version"] != "unknown" and seen[key]["version"] == "unknown": + # Replace with version that has a known version + seen[key] = fw + + return list(seen.values()) + + def _detect_from_manifests(self): + """Unified manifest detection using registry and ManifestParser - now directory-aware.""" + parser = ManifestParser() + + # Manifest file names to search for + manifest_names = [ + "pyproject.toml", + "package.json", + "requirements.txt", + "requirements-dev.txt", + "requirements-test.txt", + "setup.py", + "setup.cfg", + "Gemfile", + "Gemfile.lock", + "go.mod", + "pom.xml", + "build.gradle", + "build.gradle.kts", + "composer.json", + ] + + # Recursively find all manifest files in the project + manifests = {} + for manifest_name in manifest_names: + # Use rglob to find all instances of this manifest file + for manifest_path in self.project_path.rglob(manifest_name): + # Skip excluded directories + try: + relative_path = manifest_path.relative_to(self.project_path) + should_skip = False + + # Check common skip directories + for part in relative_path.parts[:-1]: # Don't check the filename itself + if part in ["node_modules", "venv", ".venv", ".auditor_venv", "vendor", + "dist", "build", "__pycache__", ".git", ".tox", ".pytest_cache"]: + should_skip = True + break + + if should_skip: + continue + + # Calculate the directory path relative to project root + dir_path = manifest_path.parent.relative_to(self.project_path) + dir_str = str(dir_path) if dir_path != Path('.') else '.' + + # Create a unique key for this manifest + manifest_key = f"{dir_str}/{manifest_name}" if dir_str != '.' 
else manifest_name + manifests[manifest_key] = manifest_path + + except ValueError: + # File is outside project path somehow, skip it + continue + + # Parse each manifest that exists + parsed_data = {} + for manifest_key, path in manifests.items(): + if path.exists(): + try: + # Extract just the filename for parsing logic + filename = path.name + + if filename.endswith('.toml'): + parsed_data[manifest_key] = parser.parse_toml(path) + elif filename.endswith('.json'): + parsed_data[manifest_key] = parser.parse_json(path) + elif filename.endswith(('.yml', '.yaml')): + parsed_data[manifest_key] = parser.parse_yaml(path) + elif filename.endswith('.cfg'): + parsed_data[manifest_key] = parser.parse_ini(path) + elif filename.endswith('.txt'): + parsed_data[manifest_key] = parser.parse_requirements_txt(path) + elif filename == 'Gemfile' or filename == 'Gemfile.lock': + # Parse Gemfile as text for now + with open(path, 'r', encoding='utf-8') as f: + parsed_data[manifest_key] = f.read() + elif filename.endswith('.xml') or filename.endswith('.gradle') or filename.endswith('.kts') or filename.endswith('.mod'): + # Parse as text content for now + with open(path, 'r', encoding='utf-8') as f: + parsed_data[manifest_key] = f.read() + elif filename == 'setup.py': + with open(path, 'r', encoding='utf-8') as f: + parsed_data[manifest_key] = f.read() + except Exception as e: + print(f"Warning: Failed to parse {manifest_key}: {e}") + + # Check each framework against all manifests + for fw_name, fw_config in FRAMEWORK_REGISTRY.items(): + for required_manifest_name, search_configs in fw_config.get("detection_sources", {}).items(): + # Check all parsed manifests that match this manifest type + for manifest_key, manifest_data in parsed_data.items(): + # Check if this manifest matches the required type + if not manifest_key.endswith(required_manifest_name): + continue + + # Extract the directory path from the manifest key + if '/' in manifest_key: + dir_path = '/'.join(manifest_key.split('/')[:-1]) + else: + dir_path = '.' 
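+ # search_configs selects the matching strategy below: "line_search" scans requirements/Gemfile-style lines, "content_search" scans free-text build files, "exists" treats file presence as a match, and anything else is a list of key paths into the parsed manifest structure.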
+ + if search_configs == "line_search": + # Simple text search for requirements.txt style or Gemfile + if isinstance(manifest_data, list): + # Requirements.txt parsed as list + for line in manifest_data: + version = parser.check_package_in_deps([line], fw_name) + if version: + self.detected_frameworks.append({ + "framework": fw_name, + "version": version or "unknown", + "language": fw_config["language"], + "path": dir_path, + "source": manifest_key + }) + break + elif isinstance(manifest_data, str): + # Text file content + if fw_name in manifest_data or (fw_config.get("package_pattern") and fw_config["package_pattern"] in manifest_data): + # Try to extract version + version = "unknown" + import re + if fw_config.get("package_pattern"): + pattern = fw_config["package_pattern"] + else: + pattern = fw_name + + # Try different version patterns + version_match = re.search(rf'{re.escape(pattern)}["\']?\s*[,:]?\s*["\']?([\d.]+)', manifest_data) + if not version_match: + version_match = re.search(rf'{re.escape(pattern)}\s+v([\d.]+)', manifest_data) + if not version_match: + version_match = re.search(rf'gem\s+["\']?{re.escape(pattern)}["\']?\s*,\s*["\']([\d.]+)["\']', manifest_data) + + if version_match: + version = version_match.group(1) + + self.detected_frameworks.append({ + "framework": fw_name, + "version": version, + "language": fw_config["language"], + "path": dir_path, + "source": manifest_key + }) + + elif search_configs == "content_search": + # Content search for text-based files + if isinstance(manifest_data, str): + found = False + # Check package pattern first + if fw_config.get("package_pattern") and fw_config["package_pattern"] in manifest_data: + found = True + # Check content patterns + elif fw_config.get("content_patterns"): + for pattern in fw_config["content_patterns"]: + if pattern in manifest_data: + found = True + break + # Fallback to framework name + elif fw_name in manifest_data: + found = True + + if found: + # Try to extract version + version = "unknown" + import re + pattern = fw_config.get("package_pattern", fw_name) + version_match = re.search(rf'{re.escape(pattern)}.*?[>v]([\d.]+)', manifest_data, re.DOTALL) + if version_match: + version = version_match.group(1) + + self.detected_frameworks.append({ + "framework": fw_name, + "version": version, + "language": fw_config["language"], + "path": dir_path, + "source": manifest_key + }) + + elif search_configs == "exists": + # Just check if file exists (for go.mod with go test framework) + self.detected_frameworks.append({ + "framework": fw_name, + "version": "unknown", + "language": fw_config["language"], + "path": dir_path, + "source": manifest_key + }) + + else: + # Structured search for JSON/TOML/YAML + for key_path in search_configs: + deps = parser.extract_nested_value(manifest_data, key_path) + if deps: + # Check if framework is in dependencies + package_name = fw_config.get("package_pattern", fw_name) + version = parser.check_package_in_deps(deps, package_name) + if version: + self.detected_frameworks.append({ + "framework": fw_name, + "version": version, + "language": fw_config["language"], + "path": dir_path, + "source": manifest_key + }) + break + + def _detect_from_workspaces(self): + """Detect frameworks from monorepo workspace packages.""" + # This preserves the existing monorepo detection logic + package_json = self.project_path / "package.json" + if not package_json.exists(): + return + + parser = ManifestParser() + try: + data = parser.parse_json(package_json) + + # Check for workspaces field (Yarn/npm 
workspaces) + workspaces = data.get("workspaces", []) + + # Handle different workspace formats + if isinstance(workspaces, dict): + # npm 7+ format: {"packages": ["packages/*"]} + workspaces = workspaces.get("packages", []) + + if workspaces and isinstance(workspaces, list): + # This is a monorepo - check workspace packages + for pattern in workspaces: + # Convert workspace pattern to absolute path pattern + abs_pattern = str(self.project_path / pattern) + + # Handle glob patterns + if "*" in abs_pattern: + matched_paths = glob.glob(abs_pattern) + for matched_path in matched_paths: + matched_dir = Path(matched_path) + if matched_dir.is_dir(): + workspace_pkg = matched_dir / "package.json" + if workspace_pkg.exists(): + # Parse and check this workspace package + self._check_workspace_package(workspace_pkg, parser) + else: + # Direct path without glob + workspace_dir = self.project_path / pattern + if workspace_dir.is_dir(): + workspace_pkg = workspace_dir / "package.json" + if workspace_pkg.exists(): + self._check_workspace_package(workspace_pkg, parser) + except Exception as e: + print(f"Warning: Failed to check workspaces: {e}") + + def _check_workspace_package(self, pkg_path: Path, parser: ManifestParser): + """Check a single workspace package.json for frameworks.""" + try: + data = parser.parse_json(pkg_path) + + # Check dependencies + all_deps = {} + if "dependencies" in data: + all_deps.update(data["dependencies"]) + if "devDependencies" in data: + all_deps.update(data["devDependencies"]) + + # Check each JavaScript framework + for fw_name, fw_config in FRAMEWORK_REGISTRY.items(): + if fw_config["language"] != "javascript": + continue + + package_name = fw_config.get("package_pattern", fw_name) + if package_name in all_deps: + version = all_deps[package_name] + # Clean version + version = re.sub(r'^[~^>=<]+', '', str(version)).strip() + + # Calculate relative path for path field + try: + rel_path = pkg_path.parent.relative_to(self.project_path) + path = str(rel_path).replace("\\", "/") if rel_path != Path('.') else '.' + source = str(pkg_path.relative_to(self.project_path)).replace("\\", "/") + except ValueError: + path = '.' + source = str(pkg_path) + + self.detected_frameworks.append({ + "framework": fw_name, + "version": version, + "language": "javascript", + "path": path, + "source": source + }) + except Exception as e: + print(f"Warning: Failed to parse workspace package {pkg_path}: {e}") + + # Stub method kept for backward compatibility - actual logic moved to _detect_from_manifests + pass + + def _scan_source_imports(self): + """Scan source files for framework imports.""" + # Limit scanning to avoid performance issues + max_files = 100 + files_scanned = 0 + + # Language file extensions + lang_extensions = { + ".py": "python", + ".js": "javascript", + ".jsx": "javascript", + ".ts": "javascript", + ".tsx": "javascript", + ".go": "go", + ".java": "java", + ".rb": "ruby", + ".php": "php", + } + + for ext, language in lang_extensions.items(): + if files_scanned >= max_files: + break + + for file_path in self.project_path.rglob(f"*{ext}"): + if files_scanned >= max_files: + break + + # Skip node_modules, venv, etc. 
+ if any( + part in file_path.parts + for part in ["node_modules", "venv", ".venv", ".auditor_venv", "vendor", "dist", "build", "__pycache__", ".git"] + ): + continue + + # Check exclude patterns + relative_path = file_path.relative_to(self.project_path) + should_skip = False + for pattern in self.exclude_patterns: + # Handle directory patterns + if pattern.endswith('/'): + dir_pattern = pattern.rstrip('/') + if str(relative_path).startswith(dir_pattern + '/') or str(relative_path).startswith(dir_pattern + '\\'): + should_skip = True + break + # Handle glob patterns + elif '*' in pattern: + from fnmatch import fnmatch + if fnmatch(str(relative_path), pattern): + should_skip = True + break + # Handle exact matches + elif str(relative_path) == pattern: + should_skip = True + break + + if should_skip: + continue + + files_scanned += 1 + + try: + with open(file_path, encoding="utf-8", errors="ignore") as f: + content = f.read() + + # Check frameworks from registry + for fw_name, fw_config in FRAMEWORK_REGISTRY.items(): + # Only check frameworks for this language + if fw_config["language"] != language: + continue + + if "import_patterns" in fw_config: + for import_pattern in fw_config["import_patterns"]: + if import_pattern in content: + # Check if not already detected in this directory + file_dir = file_path.parent.relative_to(self.project_path) + dir_str = str(file_dir).replace("\\", "/") if file_dir != Path('.') else '.' + + if not any( + fw["framework"] == fw_name and fw["language"] == language and fw.get("path", ".") == dir_str + for fw in self.detected_frameworks + ): + self.detected_frameworks.append( + { + "framework": fw_name, + "version": "unknown", + "language": language, + "path": dir_str, + "source": "imports", + } + ) + break + + except Exception: + # Skip files that can't be read + continue + + def _check_framework_files(self): + """Check for framework-specific files.""" + # Check all frameworks in registry for file markers + for fw_name, fw_config in FRAMEWORK_REGISTRY.items(): + if "file_markers" in fw_config: + for file_marker in fw_config["file_markers"]: + # Handle wildcard patterns + if "*" in file_marker: + # Use glob for wildcard patterns + import glob + pattern = str(self.project_path / file_marker) + if glob.glob(pattern): + # Check if not already detected + if not any( + fw["framework"] == fw_name and fw["language"] == fw_config["language"] + for fw in self.detected_frameworks + ): + self.detected_frameworks.append( + { + "framework": fw_name, + "version": "unknown", + "language": fw_config["language"], + "path": ".", # Framework files typically at root + "source": "framework_files", + } + ) + break + else: + # Direct file path + if (self.project_path / file_marker).exists(): + # Check if not already detected + if not any( + fw["framework"] == fw_name and fw["language"] == fw_config["language"] + for fw in self.detected_frameworks + ): + self.detected_frameworks.append( + { + "framework": fw_name, + "version": "unknown", + "language": fw_config["language"], + "path": ".", # Framework files typically at root + "source": "framework_files", + } + ) + break + + def _load_deps_cache(self): + """Load TheAuditor's deps.json if available for version info.""" + deps_file = self.project_path / ".pf" / "deps.json" + if deps_file.exists(): + try: + with open(deps_file) as f: + data = json.load(f) + self.deps_cache = {} + # Handle both old format (list) and new format (dict with "dependencies" key) + if isinstance(data, list): + deps_list = data + else: + deps_list = 
data.get("dependencies", []) + + for dep in deps_list: + # Store by name for quick lookup + self.deps_cache[dep["name"]] = dep + except Exception as e: + # Log the error but continue + print(f"Warning: Could not load deps cache: {e}") + pass + + def format_table(self) -> str: + """Format detected frameworks as a table. + + Returns: + Formatted table string. + """ + if not self.detected_frameworks: + return "No frameworks detected." + + lines = [] + lines.append("FRAMEWORK LANGUAGE PATH VERSION SOURCE") + lines.append("-" * 80) + + imports_only = [] + for fw in self.detected_frameworks: + framework = fw["framework"][:18].ljust(18) + language = fw["language"][:12].ljust(12) + path = fw.get("path", ".")[:15].ljust(15) + version = fw["version"][:15].ljust(15) + source = fw["source"] + + lines.append(f"{framework} {language} {path} {version} {source}") + + # Track if any are from imports only + if fw["source"] == "imports" and fw["version"] == "unknown": + imports_only.append(fw["framework"]) + + # Add note if frameworks detected from imports without versions + if imports_only: + lines.append("\n" + "="*60) + lines.append("NOTE: Frameworks marked with 'imports' source were detected from") + lines.append("import statements in the codebase (possibly test files) but are") + lines.append("not listed as dependencies. Version shown as 'unknown' because") + lines.append("they are not in package.json, pyproject.toml, or requirements.txt.") + + return "\n".join(lines) + + def to_json(self) -> str: + """Export detected frameworks to JSON. + + Returns: + JSON string. + """ + return json.dumps(self.detected_frameworks, indent=2, sort_keys=True) + + def save_to_file(self, output_path: Path) -> None: + """Save detected frameworks to a JSON file. + + Args: + output_path: Path where the JSON file should be saved. 
+ """ + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(self.to_json()) \ No newline at end of file diff --git a/theauditor/framework_registry.py b/theauditor/framework_registry.py new file mode 100644 index 0000000..26ce19e --- /dev/null +++ b/theauditor/framework_registry.py @@ -0,0 +1,549 @@ +"""Registry of framework detection patterns and test framework configurations.""" + +# Framework detection registry - defines where to find each framework +FRAMEWORK_REGISTRY = { + # Python frameworks + "django": { + "language": "python", + "detection_sources": { + "pyproject.toml": [ + ["project", "dependencies"], + ["tool", "poetry", "dependencies"], + ["tool", "poetry", "group", "*", "dependencies"], + ["tool", "pdm", "dependencies"], + ["tool", "setuptools", "install_requires"], + ["project", "optional-dependencies", "*"], + ], + "requirements.txt": "line_search", + "requirements-dev.txt": "line_search", + "setup.py": "content_search", + "setup.cfg": ["options", "install_requires"], + }, + "import_patterns": ["from django", "import django"], + "file_markers": ["manage.py", "wsgi.py"], + }, + "flask": { + "language": "python", + "detection_sources": { + "pyproject.toml": [ + ["project", "dependencies"], + ["tool", "poetry", "dependencies"], + ["tool", "poetry", "group", "*", "dependencies"], + ["tool", "pdm", "dependencies"], + ["project", "optional-dependencies", "*"], + ], + "requirements.txt": "line_search", + "requirements-dev.txt": "line_search", + "setup.py": "content_search", + "setup.cfg": ["options", "install_requires"], + }, + "import_patterns": ["from flask", "import flask"], + }, + "fastapi": { + "language": "python", + "detection_sources": { + "pyproject.toml": [ + ["project", "dependencies"], + ["tool", "poetry", "dependencies"], + ["tool", "poetry", "group", "*", "dependencies"], + ["tool", "pdm", "dependencies"], + ["project", "optional-dependencies", "*"], + ], + "requirements.txt": "line_search", + "requirements-dev.txt": "line_search", + "setup.py": "content_search", + "setup.cfg": ["options", "install_requires"], + }, + "import_patterns": ["from fastapi", "import fastapi"], + }, + "pyramid": { + "language": "python", + "detection_sources": { + "pyproject.toml": [ + ["project", "dependencies"], + ["tool", "poetry", "dependencies"], + ["tool", "poetry", "group", "*", "dependencies"], + ["tool", "pdm", "dependencies"], + ["project", "optional-dependencies", "*"], + ], + "requirements.txt": "line_search", + "requirements-dev.txt": "line_search", + "setup.py": "content_search", + "setup.cfg": ["options", "install_requires"], + }, + "import_patterns": ["from pyramid", "import pyramid"], + }, + "tornado": { + "language": "python", + "detection_sources": { + "pyproject.toml": [ + ["project", "dependencies"], + ["tool", "poetry", "dependencies"], + ["tool", "poetry", "group", "*", "dependencies"], + ["tool", "pdm", "dependencies"], + ["project", "optional-dependencies", "*"], + ], + "requirements.txt": "line_search", + "requirements-dev.txt": "line_search", + "setup.py": "content_search", + "setup.cfg": ["options", "install_requires"], + }, + "import_patterns": ["from tornado", "import tornado"], + }, + "bottle": { + "language": "python", + "detection_sources": { + "pyproject.toml": [ + ["project", "dependencies"], + ["tool", "poetry", "dependencies"], + ["tool", "poetry", "group", "*", "dependencies"], + ["tool", "pdm", "dependencies"], + ["project", "optional-dependencies", "*"], + ], + "requirements.txt": 
"line_search", + "requirements-dev.txt": "line_search", + "setup.py": "content_search", + "setup.cfg": ["options", "install_requires"], + }, + "import_patterns": ["from bottle", "import bottle"], + }, + "aiohttp": { + "language": "python", + "detection_sources": { + "pyproject.toml": [ + ["project", "dependencies"], + ["tool", "poetry", "dependencies"], + ["tool", "poetry", "group", "*", "dependencies"], + ["tool", "pdm", "dependencies"], + ["project", "optional-dependencies", "*"], + ], + "requirements.txt": "line_search", + "requirements-dev.txt": "line_search", + "setup.py": "content_search", + "setup.cfg": ["options", "install_requires"], + }, + "import_patterns": ["from aiohttp", "import aiohttp"], + }, + "sanic": { + "language": "python", + "detection_sources": { + "pyproject.toml": [ + ["project", "dependencies"], + ["tool", "poetry", "dependencies"], + ["tool", "poetry", "group", "*", "dependencies"], + ["tool", "pdm", "dependencies"], + ["project", "optional-dependencies", "*"], + ], + "requirements.txt": "line_search", + "requirements-dev.txt": "line_search", + "setup.py": "content_search", + "setup.cfg": ["options", "install_requires"], + }, + "import_patterns": ["from sanic", "import sanic"], + }, + + # JavaScript/TypeScript frameworks + "express": { + "language": "javascript", + "detection_sources": { + "package.json": [ + ["dependencies"], + ["devDependencies"], + ], + }, + "import_patterns": ["express", "require('express')", "from 'express'"], + }, + "nestjs": { + "language": "javascript", + "detection_sources": { + "package.json": [ + ["dependencies"], + ["devDependencies"], + ], + }, + "package_pattern": "@nestjs/core", + "import_patterns": ["@nestjs"], + }, + "next": { + "language": "javascript", + "detection_sources": { + "package.json": [ + ["dependencies"], + ["devDependencies"], + ], + }, + "import_patterns": ["next/", "from 'next'"], + }, + "react": { + "language": "javascript", + "detection_sources": { + "package.json": [ + ["dependencies"], + ["devDependencies"], + ], + }, + "import_patterns": ["react", "from 'react'", "React"], + }, + "vue": { + "language": "javascript", + "detection_sources": { + "package.json": [ + ["dependencies"], + ["devDependencies"], + ], + }, + "import_patterns": ["vue", "from 'vue'"], + "file_markers": ["*.vue"], + }, + "angular": { + "language": "javascript", + "detection_sources": { + "package.json": [ + ["dependencies"], + ["devDependencies"], + ], + }, + "package_pattern": "@angular/core", + "import_patterns": ["@angular"], + "file_markers": ["angular.json"], + }, + "fastify": { + "language": "javascript", + "detection_sources": { + "package.json": [ + ["dependencies"], + ["devDependencies"], + ], + }, + "import_patterns": ["fastify"], + }, + "koa": { + "language": "javascript", + "detection_sources": { + "package.json": [ + ["dependencies"], + ["devDependencies"], + ], + }, + "import_patterns": ["koa", "require('koa')"], + }, + "vite": { + "language": "javascript", + "detection_sources": { + "package.json": [ + ["dependencies"], + ["devDependencies"], + ], + }, + "import_patterns": ["vite"], + "config_files": ["vite.config.js", "vite.config.ts"], + }, + + # PHP frameworks + "laravel": { + "language": "php", + "detection_sources": { + "composer.json": [ + ["require"], + ["require-dev"], + ], + }, + "package_pattern": "laravel/framework", + "file_markers": ["artisan", "bootstrap/app.php"], + }, + "symfony": { + "language": "php", + "detection_sources": { + "composer.json": [ + ["require"], + ["require-dev"], + ], + }, + 
"package_pattern": "symfony/framework-bundle", + "file_markers": ["bin/console", "config/bundles.php"], + }, + "slim": { + "language": "php", + "detection_sources": { + "composer.json": [ + ["require"], + ["require-dev"], + ], + }, + "package_pattern": "slim/slim", + }, + "lumen": { + "language": "php", + "detection_sources": { + "composer.json": [ + ["require"], + ["require-dev"], + ], + }, + "package_pattern": "laravel/lumen-framework", + "file_markers": ["artisan"], + }, + "codeigniter": { + "language": "php", + "detection_sources": { + "composer.json": [ + ["require"], + ["require-dev"], + ], + }, + "package_pattern": "codeigniter4/framework", + "file_markers": ["spark"], + }, + + # Go frameworks + "gin": { + "language": "go", + "detection_sources": { + "go.mod": "content_search", + }, + "package_pattern": "github.com/gin-gonic/gin", + "import_patterns": ["github.com/gin-gonic/gin"], + }, + "echo": { + "language": "go", + "detection_sources": { + "go.mod": "content_search", + }, + "package_pattern": "github.com/labstack/echo", + "import_patterns": ["github.com/labstack/echo"], + }, + "fiber": { + "language": "go", + "detection_sources": { + "go.mod": "content_search", + }, + "package_pattern": "github.com/gofiber/fiber", + "import_patterns": ["github.com/gofiber/fiber"], + }, + "beego": { + "language": "go", + "detection_sources": { + "go.mod": "content_search", + }, + "package_pattern": "github.com/beego/beego", + "import_patterns": ["github.com/beego/beego"], + }, + "chi": { + "language": "go", + "detection_sources": { + "go.mod": "content_search", + }, + "package_pattern": "github.com/go-chi/chi", + "import_patterns": ["github.com/go-chi/chi"], + }, + "gorilla": { + "language": "go", + "detection_sources": { + "go.mod": "content_search", + }, + "package_pattern": "github.com/gorilla/mux", + "import_patterns": ["github.com/gorilla/mux"], + }, + + # Java frameworks + "spring": { + "language": "java", + "detection_sources": { + "pom.xml": "content_search", + "build.gradle": "content_search", + "build.gradle.kts": "content_search", + }, + "package_pattern": "spring", + "content_patterns": ["spring-boot", "springframework"], + }, + "micronaut": { + "language": "java", + "detection_sources": { + "pom.xml": "content_search", + "build.gradle": "content_search", + "build.gradle.kts": "content_search", + }, + "package_pattern": "io.micronaut", + "content_patterns": ["io.micronaut"], + }, + "quarkus": { + "language": "java", + "detection_sources": { + "pom.xml": "content_search", + "build.gradle": "content_search", + "build.gradle.kts": "content_search", + }, + "package_pattern": "io.quarkus", + "content_patterns": ["io.quarkus"], + }, + "dropwizard": { + "language": "java", + "detection_sources": { + "pom.xml": "content_search", + "build.gradle": "content_search", + "build.gradle.kts": "content_search", + }, + "package_pattern": "io.dropwizard", + "content_patterns": ["io.dropwizard"], + }, + "play": { + "language": "java", + "detection_sources": { + "build.sbt": "content_search", + "build.gradle": "content_search", + }, + "package_pattern": "com.typesafe.play", + "content_patterns": ["com.typesafe.play"], + }, + + # Ruby frameworks + "rails": { + "language": "ruby", + "detection_sources": { + "Gemfile": "line_search", + "Gemfile.lock": "content_search", + }, + "package_pattern": "rails", + "file_markers": ["Rakefile", "config.ru", "bin/rails"], + }, + "sinatra": { + "language": "ruby", + "detection_sources": { + "Gemfile": "line_search", + "Gemfile.lock": "content_search", + }, + 
"package_pattern": "sinatra", + }, + "hanami": { + "language": "ruby", + "detection_sources": { + "Gemfile": "line_search", + "Gemfile.lock": "content_search", + }, + "package_pattern": "hanami", + }, + "grape": { + "language": "ruby", + "detection_sources": { + "Gemfile": "line_search", + "Gemfile.lock": "content_search", + }, + "package_pattern": "grape", + }, +} + + +# Test framework detection registry +TEST_FRAMEWORK_REGISTRY = { + "pytest": { + "language": "python", + "command": "pytest -q -p no:cacheprovider", + "detection_sources": { + "pyproject.toml": [ + ["project", "dependencies"], + ["project", "optional-dependencies", "test"], + ["project", "optional-dependencies", "dev"], + ["project", "optional-dependencies", "tests"], + ["tool", "poetry", "dependencies"], + ["tool", "poetry", "group", "dev", "dependencies"], + ["tool", "poetry", "group", "test", "dependencies"], + ["tool", "poetry", "dev-dependencies"], + ["tool", "pdm", "dev-dependencies"], + ["tool", "hatch", "envs", "default", "dependencies"], + ], + "requirements.txt": "line_search", + "requirements-dev.txt": "line_search", + "requirements-test.txt": "line_search", + "setup.cfg": ["options", "tests_require"], + "setup.py": "content_search", + "tox.ini": "content_search", + }, + "config_files": ["pytest.ini", ".pytest.ini", "pyproject.toml"], + "config_sections": { + "pyproject.toml": [ + ["tool", "pytest"], + ["tool", "pytest", "ini_options"], + ], + "setup.cfg": [ + ["tool:pytest"], + ["pytest"], + ], + }, + }, + "unittest": { + "language": "python", + "command": "python -m unittest discover -q", + "import_patterns": ["import unittest", "from unittest"], + "file_patterns": ["test*.py", "*_test.py"], + }, + "jest": { + "language": "javascript", + "command": "npm test --silent", + "detection_sources": { + "package.json": [ + ["dependencies"], + ["devDependencies"], + ], + }, + "config_files": ["jest.config.js", "jest.config.ts", "jest.config.json"], + "config_sections": { + "package.json": [["jest"]], + }, + "script_patterns": ["jest"], + }, + "vitest": { + "language": "javascript", + "command": "npm test --silent", + "detection_sources": { + "package.json": [ + ["dependencies"], + ["devDependencies"], + ], + }, + "config_files": ["vitest.config.js", "vitest.config.ts", "vite.config.js", "vite.config.ts"], + "script_patterns": ["vitest"], + }, + "mocha": { + "language": "javascript", + "command": "npm test --silent", + "detection_sources": { + "package.json": [ + ["dependencies"], + ["devDependencies"], + ], + }, + "config_files": [".mocharc.js", ".mocharc.json", ".mocharc.yaml", ".mocharc.yml"], + "script_patterns": ["mocha"], + }, + "go": { + "language": "go", + "command": "go test ./...", + "file_patterns": ["*_test.go"], + "detection_sources": { + "go.mod": "exists", + }, + }, + "junit": { + "language": "java", + "command_maven": "mvn test", + "command_gradle": "gradle test", + "detection_sources": { + "pom.xml": "content_search", + "build.gradle": "content_search", + "build.gradle.kts": "content_search", + }, + "content_patterns": ["junit", "testImplementation"], + "import_patterns": ["import org.junit"], + "file_patterns": ["*Test.java", "Test*.java"], + }, + "rspec": { + "language": "ruby", + "command": "rspec", + "detection_sources": { + "Gemfile": "line_search", + "Gemfile.lock": "content_search", + }, + "config_files": [".rspec", "spec/spec_helper.rb"], + "directory_markers": ["spec/"], + }, +} \ No newline at end of file diff --git a/theauditor/graph/__init__.py b/theauditor/graph/__init__.py new file mode 
100644 index 0000000..a7cd157 --- /dev/null +++ b/theauditor/graph/__init__.py @@ -0,0 +1,45 @@ +"""Graph package - dependency and call graph functionality. + +Core modules (always available): +- analyzer: Pure graph algorithms (cycles, paths, layers) +- builder: Graph construction from source code +- store: SQLite persistence + +Optional modules: +- insights: Interpretive metrics (health scores, recommendations, hotspots) +""" + +# Core exports (always available) +from .analyzer import XGraphAnalyzer +from .builder import XGraphBuilder, GraphNode, GraphEdge, Cycle, Hotspot, ImpactAnalysis +from .store import XGraphStore +from .visualizer import GraphVisualizer + +# Optional insights module +try: + from .insights import GraphInsights, check_insights_available, create_insights + INSIGHTS_AVAILABLE = True +except ImportError: + # Insights module is optional - similar to ml.py + INSIGHTS_AVAILABLE = False + GraphInsights = None + check_insights_available = lambda: False + create_insights = lambda weights=None: None + +__all__ = [ + # Core classes (always available) + "XGraphBuilder", + "XGraphAnalyzer", + "XGraphStore", + "GraphVisualizer", + "GraphNode", + "GraphEdge", + "Cycle", + "Hotspot", + "ImpactAnalysis", + # Optional insights + "GraphInsights", + "INSIGHTS_AVAILABLE", + "check_insights_available", + "create_insights", +] \ No newline at end of file diff --git a/theauditor/graph/analyzer.py b/theauditor/graph/analyzer.py new file mode 100644 index 0000000..bda4cf6 --- /dev/null +++ b/theauditor/graph/analyzer.py @@ -0,0 +1,421 @@ +"""Graph analyzer module - pure graph algorithms for dependency and call graphs. + +This module provides ONLY non-interpretive graph algorithms: +- Cycle detection (DFS) +- Shortest path finding (BFS) +- Layer identification (topological sort) +- Impact analysis (graph traversal) +- Statistical summaries (counts and grouping) + +For interpretive metrics like health scores, recommendations, and weighted +rankings, see the optional graph.insights module. +""" + +from collections import defaultdict +from pathlib import Path +from typing import Any + + +class XGraphAnalyzer: + """Analyze cross-project dependency and call graphs using pure algorithms.""" + + def detect_cycles(self, graph: dict[str, Any]) -> list[dict[str, Any]]: + """ + Detect cycles in the dependency graph using DFS. + + This is a pure graph algorithm that returns raw cycle data + without any interpretation or scoring. 
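A minimal usage sketch, assuming the package is importable; the three-file graph is illustrative, and the cycle it contains comes back with the starting node repeated at the end.

# Toy usage of detect_cycles(); graph contents are illustrative.
from theauditor.graph.analyzer import XGraphAnalyzer

graph = {
    "nodes": [{"id": "a.py"}, {"id": "b.py"}, {"id": "c.py"}],
    "edges": [
        {"source": "a.py", "target": "b.py"},
        {"source": "b.py", "target": "c.py"},
        {"source": "c.py", "target": "a.py"},   # closes the a -> b -> c -> a loop
    ],
}
cycles = XGraphAnalyzer().detect_cycles(graph)
# Expected shape: [{"nodes": ["a.py", "b.py", "c.py", "a.py"], "size": 3}]
print(cycles)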
+ + Args: + graph: Graph with 'nodes' and 'edges' keys + + Returns: + List of cycles, each with nodes and size + """ + # Build adjacency list + adj = defaultdict(list) + for edge in graph.get("edges", []): + adj[edge["source"]].append(edge["target"]) + + # Track visited nodes and recursion stack + visited = set() + rec_stack = set() + cycles = [] + + def dfs(node: str, path: list[str]) -> None: + """DFS to detect cycles.""" + visited.add(node) + rec_stack.add(node) + path.append(node) + + for neighbor in adj[node]: + if neighbor not in visited: + dfs(neighbor, path.copy()) + elif neighbor in rec_stack: + # Found a cycle + cycle_start = path.index(neighbor) + cycle_nodes = path[cycle_start:] + [neighbor] + cycles.append({ + "nodes": cycle_nodes, + "size": len(cycle_nodes) - 1, # Don't count repeated node + }) + + rec_stack.remove(node) + + # Run DFS from all unvisited nodes + for node in graph.get("nodes", []): + node_id = node["id"] + if node_id not in visited: + dfs(node_id, []) + + # Sort cycles by size (largest first) + cycles.sort(key=lambda c: c["size"], reverse=True) + + return cycles + + def impact_of_change( + self, + targets: list[str], + import_graph: dict[str, Any], + call_graph: dict[str, Any] | None = None, + max_depth: int = 3, + ) -> dict[str, Any]: + """ + Calculate the impact of changing target files using graph traversal. + + This is a pure graph algorithm that finds affected nodes + without interpreting or scoring the impact. + + Args: + targets: List of file/module IDs that will change + import_graph: Import/dependency graph + call_graph: Optional call graph + max_depth: Maximum traversal depth + + Returns: + Raw impact data with upstream and downstream effects + """ + # Build adjacency lists + upstream = defaultdict(list) # Who depends on X + downstream = defaultdict(list) # What X depends on + + for edge in import_graph.get("edges", []): + downstream[edge["source"]].append(edge["target"]) + upstream[edge["target"]].append(edge["source"]) + + if call_graph: + for edge in call_graph.get("edges", []): + downstream[edge["source"]].append(edge["target"]) + upstream[edge["target"]].append(edge["source"]) + + # Find upstream impact (what depends on targets) + upstream_impact = set() + to_visit = [(t, 0) for t in targets] + visited = set() + + while to_visit: + node, depth = to_visit.pop(0) + if node in visited or depth >= max_depth: + continue + visited.add(node) + + for dependent in upstream[node]: + upstream_impact.add(dependent) + to_visit.append((dependent, depth + 1)) + + # Find downstream impact (what targets depend on) + downstream_impact = set() + to_visit = [(t, 0) for t in targets] + visited = set() + + while to_visit: + node, depth = to_visit.pop(0) + if node in visited or depth >= max_depth: + continue + visited.add(node) + + for dependency in downstream[node]: + downstream_impact.add(dependency) + to_visit.append((dependency, depth + 1)) + + # Return raw counts without ratios or interpretations + all_impacted = set(targets) | upstream_impact | downstream_impact + + return { + "targets": targets, + "upstream": sorted(upstream_impact), + "downstream": sorted(downstream_impact), + "total_impacted": len(all_impacted), + "graph_nodes": len(import_graph.get("nodes", [])), + } + + def find_shortest_path( + self, + source: str, + target: str, + graph: dict[str, Any] + ) -> list[str] | None: + """ + Find shortest path between two nodes using BFS. + + Pure pathfinding algorithm without interpretation. 
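A toy sketch of the impact_of_change() traversal defined above, using an illustrative three-file import chain:

# Toy usage of impact_of_change(); file names are illustrative.
from theauditor.graph.analyzer import XGraphAnalyzer

import_graph = {
    "nodes": [{"id": "api.py"}, {"id": "service.py"}, {"id": "db.py"}],
    "edges": [
        {"source": "api.py", "target": "service.py"},   # api imports service
        {"source": "service.py", "target": "db.py"},    # service imports db
    ],
}
impact = XGraphAnalyzer().impact_of_change(["service.py"], import_graph)
# upstream -> ["api.py"] (depends on the target), downstream -> ["db.py"],
# total_impacted -> 3 (target plus both directions of the traversal).
print(impact)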
+ + Args: + source: Source node ID + target: Target node ID + graph: Graph with edges + + Returns: + List of node IDs forming the path, or None if no path exists + """ + # Build adjacency list + adj = defaultdict(list) + for edge in graph.get("edges", []): + adj[edge["source"]].append(edge["target"]) + + # BFS + queue = [(source, [source])] + visited = {source} + + while queue: + node, path = queue.pop(0) + + if node == target: + return path + + for neighbor in adj[node]: + if neighbor not in visited: + visited.add(neighbor) + queue.append((neighbor, path + [neighbor])) + + return None + + def identify_layers(self, graph: dict[str, Any]) -> dict[str, list[str]]: + """ + Identify architectural layers using topological sorting. + + Pure graph layering algorithm without interpretation. + + Args: + graph: Import/dependency graph + + Returns: + Dict mapping layer number to list of node IDs + """ + # Calculate in-degrees + in_degree = defaultdict(int) + nodes = {node["id"] for node in graph.get("nodes", [])} + + for edge in graph.get("edges", []): + in_degree[edge["target"]] += 1 + + # Find nodes with no dependencies (layer 0) + layers = {} + current_layer = [] + + for node_id in nodes: + if in_degree[node_id] == 0: + current_layer.append(node_id) + + # Build layers using modified topological sort + layer_num = 0 + adj = defaultdict(list) + + for edge in graph.get("edges", []): + adj[edge["source"]].append(edge["target"]) + + while current_layer: + layers[layer_num] = current_layer + next_layer = [] + + for node in current_layer: + for neighbor in adj[node]: + in_degree[neighbor] -= 1 + if in_degree[neighbor] == 0: + next_layer.append(neighbor) + + current_layer = next_layer + layer_num += 1 + + return layers + + def get_graph_summary(self, graph_data: dict[str, Any]) -> dict[str, Any]: + """ + Extract basic statistics from a graph without interpretation. + + This method provides raw counts and statistics only, + no subjective metrics or labels. 
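A toy sketch of identify_layers() from above; a simple import chain peels off into one layer per module (file names are illustrative):

# Toy usage of identify_layers(); graph contents are illustrative.
from theauditor.graph.analyzer import XGraphAnalyzer

graph = {
    "nodes": [{"id": "config.py"}, {"id": "models.py"}, {"id": "views.py"}],
    "edges": [
        {"source": "config.py", "target": "models.py"},
        {"source": "models.py", "target": "views.py"},
    ],
}
layers = XGraphAnalyzer().identify_layers(graph)
# Nodes with no incoming edges form layer 0, then each topological pass peels off
# the next layer: {0: ["config.py"], 1: ["models.py"], 2: ["views.py"]}
print(layers)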
+ + Args: + graph_data: Large graph dict with 'nodes' and 'edges' + + Returns: + Concise summary with raw statistics only + """ + # Basic statistics + nodes = graph_data.get("nodes", []) + edges = graph_data.get("edges", []) + + # Calculate in/out degrees + in_degree = defaultdict(int) + out_degree = defaultdict(int) + for edge in edges: + out_degree[edge["source"]] += 1 + in_degree[edge["target"]] += 1 + + # Find most connected nodes (raw data only) + connection_counts = [] + for node in nodes: # Process all nodes + node_id = node["id"] + total = in_degree[node_id] + out_degree[node_id] + if total > 0: + connection_counts.append({ + "id": node_id, + "in_degree": in_degree[node_id], + "out_degree": out_degree[node_id], + "total_connections": total + }) + + # Sort and get top 10 + connection_counts.sort(key=lambda x: x["total_connections"], reverse=True) + top_connected = connection_counts[:10] + + # Detect cycles (complete search) + cycles = self.detect_cycles({"nodes": nodes, "edges": edges}) + + # Calculate graph metrics + node_count = len(nodes) + edge_count = len(edges) + density = edge_count / (node_count * (node_count - 1)) if node_count > 1 else 0 + + # Find isolated nodes + connected_nodes = set() + for edge in edges: + connected_nodes.add(edge["source"]) + connected_nodes.add(edge["target"]) + isolated_count = len([n for n in nodes if n["id"] not in connected_nodes]) + + # Create summary with raw data only + summary = { + "statistics": { + "total_nodes": node_count, + "total_edges": edge_count, + "graph_density": round(density, 4), + "isolated_nodes": isolated_count, + "average_connections": round(edge_count / node_count, 2) if node_count > 0 else 0 + }, + "top_connected_nodes": top_connected, + "cycles_found": [ + { + "size": cycle["size"], + "nodes": cycle["nodes"][:5] + (["..."] if len(cycle["nodes"]) > 5 else []) + } + for cycle in cycles[:5] + ], + "file_types": self._count_file_types(nodes), + "connection_distribution": { + "nodes_with_20_plus_connections": len([c for c in connection_counts if c["total_connections"] > 20]), + "nodes_with_30_plus_inbound": len([c for c in connection_counts if c["in_degree"] > 30]), + "cycle_count": len(cycles) if len(nodes) < 500 else f"{len(cycles)}+ (limited search)", + } + } + + return summary + + def _count_file_types(self, nodes: list[dict]) -> dict[str, int]: + """Count nodes by file extension - pure counting, no interpretation.""" + ext_counts = defaultdict(int) + for node in nodes: # Process all nodes + if "file" in node: + ext = Path(node["file"]).suffix or "no_ext" + ext_counts[ext] += 1 + # Return top 10 extensions + sorted_exts = sorted(ext_counts.items(), key=lambda x: x[1], reverse=True) + return dict(sorted_exts[:10]) + + def identify_hotspots(self, graph: dict[str, Any], top_n: int = 10) -> list[dict[str, Any]]: + """ + Identify hotspot nodes based on connectivity (in/out degree). + + Pure graph algorithm that identifies most connected nodes + without interpretation or scoring. 
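The density figure reported by get_graph_summary() above is the standard directed-graph density, edges divided by the number of possible ordered pairs; a worked example with illustrative counts:

# Worked example of the density statistic: E / (N * (N - 1)) for a directed graph.
node_count, edge_count = 10, 18
density = edge_count / (node_count * (node_count - 1)) if node_count > 1 else 0
print(round(density, 4))  # 18 / 90 = 0.2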
+ + Args: + graph: Graph with 'nodes' and 'edges' + top_n: Number of top hotspots to return + + Returns: + List of hotspot nodes with their degree counts + """ + # Calculate in/out degrees + in_degree = defaultdict(int) + out_degree = defaultdict(int) + + for edge in graph.get("edges", []): + out_degree[edge["source"]] += 1 + in_degree[edge["target"]] += 1 + + # Calculate total connections for each node + hotspots = [] + for node in graph.get("nodes", []): + node_id = node["id"] + in_deg = in_degree[node_id] + out_deg = out_degree[node_id] + total = in_deg + out_deg + + if total > 0: # Only include connected nodes + hotspots.append({ + "id": node_id, + "in_degree": in_deg, + "out_degree": out_deg, + "total_connections": total, + "file": node.get("file", node_id), + "lang": node.get("lang", "unknown") + }) + + # Sort by total connections and return top N + hotspots.sort(key=lambda x: x["total_connections"], reverse=True) + return hotspots[:top_n] + + def calculate_node_degrees(self, graph: dict[str, Any]) -> dict[str, dict[str, int]]: + """ + Calculate in-degree and out-degree for all nodes. + + Pure counting algorithm without interpretation. + + Args: + graph: Graph with edges + + Returns: + Dict mapping node IDs to degree counts + """ + degrees = defaultdict(lambda: {"in_degree": 0, "out_degree": 0}) + + for edge in graph.get("edges", []): + degrees[edge["source"]]["out_degree"] += 1 + degrees[edge["target"]]["in_degree"] += 1 + + return dict(degrees) + + def analyze_impact(self, graph: dict[str, Any], targets: list[str], max_depth: int = 3) -> dict[str, Any]: + """ + Analyze impact of changes to target nodes. + + Wrapper method for impact_of_change to match expected API. + + Args: + graph: Graph with 'nodes' and 'edges' + targets: List of target node IDs + max_depth: Maximum traversal depth + + Returns: + Impact analysis results with upstream/downstream effects + """ + # Use existing impact_of_change method + result = self.impact_of_change(targets, graph, None, max_depth) + + # Add all_impacted field for compatibility + all_impacted = set(targets) | set(result.get("upstream", [])) | set(result.get("downstream", [])) + result["all_impacted"] = sorted(all_impacted) + + return result \ No newline at end of file diff --git a/theauditor/graph/builder.py b/theauditor/graph/builder.py new file mode 100644 index 0000000..ca21630 --- /dev/null +++ b/theauditor/graph/builder.py @@ -0,0 +1,1017 @@ +"""Graph builder module - constructs dependency and call graphs.""" + +import os +import platform +import re +import subprocess +import tempfile +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any + +# Windows compatibility +IS_WINDOWS = platform.system() == "Windows" + +import click + +from theauditor.indexer.config import SKIP_DIRS +from theauditor.module_resolver import ModuleResolver +from theauditor.ast_parser import ASTParser + + +@dataclass +class GraphNode: + """Represents a node in the dependency or call graph.""" + + id: str + file: str + lang: str | None = None + loc: int = 0 + churn: int | None = None # Git commit count if available + type: str = "module" # module, function, class + + +@dataclass +class GraphEdge: + """Represents an edge in the graph.""" + + source: str + target: str + type: str = "import" # import, call, extends, implements + file: str | None = None + line: int | None = None + + +@dataclass +class Cycle: + """Represents a cycle in the dependency graph.""" + + nodes: list[str] + size: int + + def __init__(self, nodes: list[str]): + 
self.nodes = nodes + self.size = len(nodes) + + +@dataclass +class Hotspot: + """Represents a hotspot node with high connectivity.""" + + id: str + in_degree: int + out_degree: int + centrality: float + score: float # Computed based on weights + + +@dataclass +class ImpactAnalysis: + """Results of change impact analysis.""" + + targets: list[str] + upstream: list[str] # What depends on targets + downstream: list[str] # What targets depend on + total_impacted: int + + +class XGraphBuilder: + """Build cross-project dependency and call graphs.""" + + # Import regex patterns for different languages + IMPORT_PATTERNS = { + "python": [ + r"^import\s+(\S+)", + r"^from\s+(\S+)\s+import", + ], + "javascript": [ + # Standard ES6 imports with 'from' + r"import\s+.*?\s+from\s+['\"]([^'\"]+)['\"]", + + # Side-effect imports (no 'from') + r"import\s+['\"]([^'\"]+)['\"]", + + # CommonJS require + r"require\(['\"]([^'\"]+)['\"]\)", + + # Dynamic imports + r"import\(['\"]([^'\"]+)['\"]\)", + + # Re-exports + r"export\s+.*?\s+from\s+['\"]([^'\"]+)['\"]", + ], + "typescript": [ + # Standard ES6 imports with 'from' + r"import\s+.*?\s+from\s+['\"]([^'\"]+)['\"]", + + # Side-effect imports (no 'from') + r"import\s+['\"]([^'\"]+)['\"]", + + # Type-only imports + r"import\s+type\s+.*?\s+from\s+['\"]([^'\"]+)['\"]", + + # CommonJS require + r"require\(['\"]([^'\"]+)['\"]\)", + + # Dynamic imports + r"import\(['\"]([^'\"]+)['\"]\)", + + # Re-exports + r"export\s+.*?\s+from\s+['\"]([^'\"]+)['\"]", + ], + "java": [ + r"^import\s+(\S+);", + r"^import\s+static\s+(\S+);", + ], + "go": [ + r'^import\s+"([^"]+)"', + r'^import\s+\(\s*"([^"]+)"', + ], + "c#": [ + r"^using\s+(\S+);", + r"^using\s+static\s+(\S+);", + ], + "php": [ + r"^use\s+(\S+);", + r"require_once\s*\(['\"]([^'\"]+)['\"]\)", + r"include_once\s*\(['\"]([^'\"]+)['\"]\)", + ], + "ruby": [ + r"^require\s+['\"]([^'\"]+)['\"]", + r"^require_relative\s+['\"]([^'\"]+)['\"]", + ], + } + + # Export patterns for different languages + EXPORT_PATTERNS = { + "python": [ + r"^def\s+(\w+)\s*\(", + r"^class\s+(\w+)", + r"^(\w+)\s*=", # Module-level variables + ], + "javascript": [ + r"export\s+(?:default\s+)?(?:function|class|const|let|var)\s+(\w+)", + r"exports\.(\w+)\s*=", + r"module\.exports\.(\w+)\s*=", + ], + "typescript": [ + r"export\s+(?:default\s+)?(?:function|class|const|let|var|interface|type)\s+(\w+)", + r"exports\.(\w+)\s*=", + ], + "java": [ + r"public\s+(?:static\s+)?(?:class|interface|enum)\s+(\w+)", + r"public\s+(?:static\s+)?(?:\w+\s+)?(\w+)\s*\(", # Public methods + ], + "go": [ + r"^func\s+(\w+)\s*\(", # Exported if capitalized + r"^type\s+(\w+)\s+", + r"^var\s+(\w+)\s+", + ], + } + + # Call patterns for different languages + CALL_PATTERNS = { + "python": [ + r"(\w+)\s*\(", # Function calls + r"(\w+)\.(\w+)\s*\(", # Method calls + ], + "javascript": [ + r"(\w+)\s*\(", + r"(\w+)\.(\w+)\s*\(", + r"new\s+(\w+)\s*\(", + ], + "typescript": [ + r"(\w+)\s*\(", + r"(\w+)\.(\w+)\s*\(", + r"new\s+(\w+)\s*\(", + ], + "java": [ + r"(\w+)\s*\(", + r"(\w+)\.(\w+)\s*\(", + r"new\s+(\w+)\s*\(", + ], + "go": [ + r"(\w+)\s*\(", + r"(\w+)\.(\w+)\s*\(", + ], + } + + def __init__(self, batch_size: int = 200, exclude_patterns: list[str] = None, project_root: str = "."): + """Initialize builder with configuration.""" + self.batch_size = batch_size + self.exclude_patterns = exclude_patterns or [] + self.checkpoint_file = Path(".pf/xgraph_checkpoint.json") + self.project_root = Path(project_root).resolve() + self.module_resolver = ModuleResolver() # No project_root - uses 
database! + self.ast_parser = ASTParser() # Initialize AST parser for structural analysis + + def detect_language(self, file_path: Path) -> str | None: + """Detect language from file extension.""" + ext_map = { + ".py": "python", + ".js": "javascript", + ".jsx": "javascript", + ".ts": "typescript", + ".tsx": "typescript", + ".java": "java", + ".go": "go", + ".cs": "c#", + ".php": "php", + ".rb": "ruby", + ".c": "c", + ".cpp": "c++", + ".h": "c", + ".hpp": "c++", + ".rs": "rust", + ".swift": "swift", + ".kt": "kotlin", + ".scala": "scala", + ".r": "r", + ".R": "r", + ".m": "objective-c", + ".mm": "objective-c++", + } + return ext_map.get(file_path.suffix.lower()) + + def should_skip(self, file_path: Path) -> bool: + """Check if file should be skipped based on exclude patterns.""" + # First, check if any component of the path is in SKIP_DIRS + for part in file_path.parts: + if part in SKIP_DIRS: + return True + + # Second, check against exclude_patterns + path_str = str(file_path) + for pattern in self.exclude_patterns: + if pattern in path_str: + return True + return False + + def extract_imports_from_db(self, rel_path: str) -> list[str]: + """Extract import statements from the database where indexer already stored them. + + Args: + rel_path: Relative path as stored in the database (e.g., "backend/src/app.ts") + + Returns: + List of import targets + """ + import sqlite3 + + # Query the refs table for imports + db_file = self.project_root / ".pf" / "repo_index.db" + if not db_file.exists(): + print(f"Warning: Database not found at {db_file}") + return [] + + try: + conn = sqlite3.connect(db_file) + cursor = conn.cursor() + + # Get all imports for this file from refs table + # The indexer stores imports with kind like 'import', 'require', etc. + cursor.execute( + "SELECT value FROM refs WHERE src = ? AND kind IN ('import', 'require', 'from', 'import_type', 'export')", + (rel_path,) + ) + + imports = [row[0] for row in cursor.fetchall()] + conn.close() + + return imports + + except sqlite3.Error as e: + print(f"Warning: Failed to read imports from database: {e}") + return [] + + def extract_imports(self, file_path: Path, lang: str) -> list[str]: + """Extract import statements from the database where indexer already stored them. + + The indexer has already extracted all imports and stored them in the refs table. + We should read from there instead of re-parsing files. + """ + import sqlite3 + + # Get relative path for database lookup + try: + rel_path = file_path.relative_to(self.project_root) + except ValueError: + # If file_path is already relative or from a different root + rel_path = file_path + + # Normalize path separators for database lookup + db_path = str(rel_path).replace("\\", "/") + + # Query the refs table for imports + db_file = self.project_root / ".pf" / "repo_index.db" + if not db_file.exists(): + print(f"Warning: Database not found at {db_file}") + return [] + + try: + conn = sqlite3.connect(db_file) + cursor = conn.cursor() + + # Get all imports for this file from refs table + # The indexer stores imports with kind like 'import', 'require', etc. + cursor.execute( + "SELECT value FROM refs WHERE src = ? 
AND kind IN ('import', 'require', 'from', 'import_type', 'export')", + (db_path,) + ) + + imports = [row[0] for row in cursor.fetchall()] + conn.close() + + return imports + + except sqlite3.Error as e: + print(f"Warning: Failed to read imports from database: {e}") + return [] + + def extract_exports_from_db(self, rel_path: str) -> list[str]: + """Extract exported symbols from the database where indexer already stored them. + + Args: + rel_path: Relative path as stored in the database + + Returns: + List of exported symbol names + """ + import sqlite3 + + db_file = self.project_root / ".pf" / "repo_index.db" + if not db_file.exists(): + return [] + + try: + conn = sqlite3.connect(db_file) + cursor = conn.cursor() + + # Get exported functions/classes from symbols table + # The indexer stores these as 'function' and 'class' types + cursor.execute( + "SELECT name FROM symbols WHERE path = ? AND type IN ('function', 'class')", + (rel_path,) + ) + + exports = [row[0] for row in cursor.fetchall()] + conn.close() + + return exports + + except sqlite3.Error: + return [] + + def extract_exports(self, file_path: Path, lang: str) -> list[str]: + """Extract exported symbols from a file using AST parser with regex fallback.""" + # Try AST parser first for supported languages + if self.ast_parser.supports_language(lang): + try: + # Check persistent cache first for JS/TS files + tree = None + if lang in ["javascript", "typescript"]: + # Compute file hash for cache lookup + import hashlib + with open(file_path, 'rb') as f: + file_hash = hashlib.sha256(f.read()).hexdigest() + + # Check cache + cache_dir = self.project_root / ".pf" / "ast_cache" + cache_file = cache_dir / f"{file_hash}.json" + if cache_file.exists(): + try: + import json + with open(cache_file, 'r', encoding='utf-8') as f: + tree = json.load(f) + except (json.JSONDecodeError, OSError): + pass # Cache read failed, parse fresh + + # Parse file if not in cache + if not tree: + tree = self.ast_parser.parse_file(file_path, lang) + # REMOVED: Cache write logic - only indexer.py should write to cache + + if tree and tree.get("type") != "regex_fallback": + # Extract exports using AST + export_dicts = self.ast_parser.extract_exports(tree, lang) + # Convert to list of export names + exports = [] + for exp in export_dicts: + name = exp.get('name') + if name and name != 'unknown': + exports.append(name) + if exports: # If we got results, return them + return exports + except Exception as e: + # Fall through to regex fallback + pass + + # Fallback to regex-based extraction + return self._extract_exports_regex(file_path, lang) + + def extract_calls_from_db(self, rel_path: str) -> list[tuple[str, str | None]]: + """Extract function calls from the database where indexer already stored them. + + Args: + rel_path: Relative path as stored in the database + + Returns: + List of (function_name, None) tuples for calls + """ + import sqlite3 + + db_file = self.project_root / ".pf" / "repo_index.db" + if not db_file.exists(): + return [] + + try: + conn = sqlite3.connect(db_file) + cursor = conn.cursor() + + # Get function calls from symbols table + # The indexer stores these as 'call' type + cursor.execute( + "SELECT name FROM symbols WHERE path = ? 
AND type = 'call'", + (rel_path,) + ) + + # Return as tuples with None for second element (no parent info) + calls = [(row[0], None) for row in cursor.fetchall()] + conn.close() + + return calls + + except sqlite3.Error: + return [] + + def extract_calls(self, file_path: Path, lang: str) -> list[tuple[str, str | None]]: + """Extract function/method calls from a file using AST parser with regex fallback.""" + # Try AST parser first for supported languages + if self.ast_parser.supports_language(lang): + try: + # Check persistent cache first for JS/TS files + tree = None + if lang in ["javascript", "typescript"]: + # Compute file hash for cache lookup + import hashlib + with open(file_path, 'rb') as f: + file_hash = hashlib.sha256(f.read()).hexdigest() + + # Check cache + cache_dir = self.project_root / ".pf" / "ast_cache" + cache_file = cache_dir / f"{file_hash}.json" + if cache_file.exists(): + try: + import json + with open(cache_file, 'r', encoding='utf-8') as f: + tree = json.load(f) + except (json.JSONDecodeError, OSError): + pass # Cache read failed, parse fresh + + # Parse file if not in cache + if not tree: + tree = self.ast_parser.parse_file(file_path, lang) + # REMOVED: Cache write logic - only indexer.py should write to cache + + if tree and tree.get("type") != "regex_fallback": + # Extract calls using AST + call_dicts = self.ast_parser.extract_calls(tree, lang) + # Convert to list of (function, method) tuples + calls = [] + for call in call_dicts: + name = call.get('name', '') + # Check if it's a method call (contains dot) + if '.' in name: + parts = name.rsplit('.', 1) + if len(parts) == 2: + calls.append((parts[0], parts[1])) + else: + calls.append((name, None)) + else: + calls.append((name, None)) + if calls: # If we got results, return them + return calls + except Exception as e: + # Fall through to regex fallback + pass + + # Fallback to regex-based extraction + return self._extract_calls_regex(file_path, lang) + + def resolve_import_path(self, import_str: str, source_file: Path, lang: str) -> str: + """Resolve import string to a normalized module path that matches actual files in the graph.""" + import sqlite3 + + # Clean up the import string (remove quotes, semicolons, etc.) + import_str = import_str.strip().strip('"\'`;') + + # Language-specific resolution + if lang == "python": + # Convert Python module path to file path + parts = import_str.split(".") + return "/".join(parts) + elif lang in ["javascript", "typescript"]: + # Get source file directory for relative imports + source_dir = source_file.parent + # Handle case where source_file might already be relative or might be from manifest + try: + source_rel = str(source_file.relative_to(self.project_root)).replace("\\", "/") + except ValueError: + # If source_file is already relative or from a different root, use it as is + source_rel = str(source_file).replace("\\", "/") + + # Handle different import patterns + resolved_path = None + + # 1. 
Handle TypeScript path aliases using ModuleResolver (database-driven) + if import_str.startswith("@"): + # Determine context from source file location + try: + source_rel = str(source_file.relative_to(self.project_root)).replace("\\", "/") + except ValueError: + source_rel = str(source_file).replace("\\", "/") + + # Determine which tsconfig context applies + if "backend/" in source_rel: + context = "backend" + elif "frontend/" in source_rel: + context = "frontend" + else: + context = "root" + + # Use ModuleResolver's context-aware resolution + resolved = self.module_resolver.resolve_with_context(import_str, str(source_file), context) + + # Check if resolution succeeded + if resolved != import_str: + # Resolution worked, now verify file exists in database + db_file = self.project_root / ".pf" / "repo_index.db" + if db_file.exists(): + try: + conn = sqlite3.connect(db_file) + cursor = conn.cursor() + + # Try with common extensions if no extension + test_paths = [resolved] + if not Path(resolved).suffix: + for ext in [".ts", ".tsx", ".js", ".jsx"]: + test_paths.append(resolved + ext) + test_paths.append(resolved + "/index.ts") + test_paths.append(resolved + "/index.js") + + for test_path in test_paths: + cursor.execute("SELECT 1 FROM files WHERE path = ? LIMIT 1", (test_path,)) + if cursor.fetchone(): + conn.close() + return test_path + + conn.close() + except sqlite3.Error: + pass + + # Return resolved even if file check failed + return resolved + + # 2. Handle relative imports (./foo, ../bar/baz) + elif import_str.startswith("."): + # Resolve relative to source file + try: + # Remove leading dots and slashes + rel_import = import_str.lstrip("./") + + # Go up directories for ../ + up_count = import_str.count("../") + current_dir = source_dir + for _ in range(up_count): + current_dir = current_dir.parent + + if up_count > 0: + rel_import = import_str.replace("../", "") + + # Build the target path + target_path = current_dir / rel_import + rel_target = str(target_path.relative_to(self.project_root)).replace("\\", "/") + + # Check if this file exists (try with extensions) + db_file = self.project_root / ".pf" / "repo_index.db" + if db_file.exists(): + try: + conn = sqlite3.connect(db_file) + cursor = conn.cursor() + + # Try with common extensions + for ext in ["", ".ts", ".tsx", ".js", ".jsx", "/index.ts", "/index.tsx", "/index.js"]: + test_path = rel_target + ext + cursor.execute("SELECT 1 FROM files WHERE path = ? LIMIT 1", (test_path,)) + if cursor.fetchone(): + conn.close() + return test_path + + conn.close() + except sqlite3.Error: + pass + + return rel_target + + except (ValueError, OSError): + pass + + # 3. 
Handle node_modules imports (just return as-is, they're external) + else: + # For npm packages, just return the package name + return import_str + + # If nothing worked, return original + return import_str + else: + # Default: return as-is + return import_str + + def get_file_metrics(self, file_path: Path) -> dict[str, Any]: + """Get basic metrics for a file.""" + metrics = {"loc": 0, "churn": None} + + # When working with manifest data, skip file reading + # The manifest already has loc and other metrics + if not file_path.exists(): + # File doesn't exist, we're working with manifest data + # Return default metrics - the caller should use manifest data instead + return metrics + + # Count lines of code + try: + with open(file_path, "r", encoding="utf-8", errors="ignore") as f: + metrics["loc"] = sum(1 for _ in f) + except (IOError, UnicodeDecodeError, OSError) as e: + print(f"Warning: Failed to read {file_path} for metrics: {e}") + # Still return default metrics but LOG the failure + + # Get git churn (commit count) + try: + # Use temp files to avoid buffer overflow + with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='_stdout.txt', encoding='utf-8') as stdout_fp, \ + tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='_stderr.txt', encoding='utf-8') as stderr_fp: + + stdout_path = stdout_fp.name + stderr_path = stderr_fp.name + + result = subprocess.run( + ["git", "log", "--oneline", str(file_path)], + stdout=stdout_fp, + stderr=stderr_fp, + text=True, + timeout=5, + cwd=Path.cwd(), + shell=IS_WINDOWS # Windows compatibility fix + ) + + with open(stdout_path, 'r', encoding='utf-8') as f: + result.stdout = f.read() + with open(stderr_path, 'r', encoding='utf-8') as f: + result.stderr = f.read() + + os.unlink(stdout_path) + os.unlink(stderr_path) + if result.returncode == 0: + metrics["churn"] = len(result.stdout.strip().split("\n")) + except (subprocess.TimeoutExpired, OSError, IOError) as e: + print(f"Warning: Failed to get git churn for {file_path}: {e}") + # Still return default metrics but LOG the failure + + return metrics + + def build_import_graph( + self, + root: str = ".", + langs: list[str] | None = None, + file_filter: str | None = None, + file_list: list[dict[str, Any]] | None = None, + ) -> dict[str, Any]: + """Build import/dependency graph for the project.""" + root_path = Path(root).resolve() + nodes = {} + edges = [] + + # Collect all source files + files = [] + manifest_lookup = {} # Map file paths to manifest items for metrics + + if file_list is not None: + # Use provided file list from manifest + # The manifest already contains all the file info we need + for item in file_list: + manifest_path = Path(item['path']) + + # Use the path from manifest directly - we don't need actual files + # The manifest has all the data (path, ext, content, etc.) 
+ file = root_path / manifest_path # Just for consistent path handling + + # Store manifest item for later metric lookup + manifest_lookup[str(file)] = item + + # Detect language from extension in manifest + lang = self.detect_language(manifest_path) # Use manifest path + if lang and (not langs or lang in langs): + files.append((file, lang)) + else: + # Fall back to original os.walk logic for backward compatibility + for dirpath, dirnames, filenames in os.walk(root_path): + # CRITICAL: Prune excluded directories before os.walk descends into them + # This prevents traversal into .venv and other SKIP_DIRS + dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS] + + # Also prune based on exclude_patterns + if self.exclude_patterns: + dirnames[:] = [d for d in dirnames + if not any(pattern in d for pattern in self.exclude_patterns)] + + # Process files in this directory + for filename in filenames: + file = Path(dirpath) / filename + if not self.should_skip(file): + lang = self.detect_language(file) + if lang and (not langs or lang in langs): + files.append((file, lang)) + + # Process files with progress bar + with click.progressbar( + files, + label="Building import graph", + show_pos=True, + show_percent=True, + show_eta=True, + item_show_func=lambda x: str(x[0].name) if x else None, + ) as bar: + for file_path, lang in bar: + # Create node for this file + rel_path = str(file_path.relative_to(root_path)).replace("\\", "/") # Normalize separators + node_id = rel_path # Already normalized + + # Get metrics from manifest if available, otherwise from file + if str(file_path) in manifest_lookup: + # Use manifest data which already has metrics + manifest_item = manifest_lookup[str(file_path)] + loc = manifest_item.get('loc', 0) + churn = None # Manifest doesn't have churn data + else: + # Fall back to reading file metrics + metrics = self.get_file_metrics(file_path) + loc = metrics["loc"] + churn = metrics["churn"] + + node = GraphNode( + id=node_id, + file=rel_path, # Already normalized + lang=lang, + loc=loc, + churn=churn, + type="module", + ) + nodes[node_id] = asdict(node) + + # Extract imports and create edges + # Pass the relative path that matches what's in the database + imports = self.extract_imports_from_db(rel_path) + for imp in imports: + target = self.resolve_import_path(imp, file_path, lang) + edge = GraphEdge( + source=node_id, + target=target, + type="import", + file=rel_path, # Already normalized + ) + edges.append(asdict(edge)) + + return { + "nodes": list(nodes.values()), + "edges": edges, + "metadata": { + "root": str(root_path), + "languages": list(set(n["lang"] for n in nodes.values())), + "total_files": len(nodes), + "total_imports": len(edges), + }, + } + + def build_call_graph( + self, + root: str = ".", + langs: list[str] | None = None, + file_filter: str | None = None, + file_list: list[dict[str, Any]] | None = None, + ) -> dict[str, Any]: + """Build call graph for the project.""" + root_path = Path(root).resolve() + nodes = {} + edges = [] + + # Collect all source files + files = [] + + if file_list is not None: + # Use provided file list from manifest + # The manifest already contains all the file info we need + for item in file_list: + manifest_path = Path(item['path']) + + # Use the path from manifest directly - we don't need actual files + # The manifest has all the data (path, ext, content, etc.) 
+ file = root_path / manifest_path # Just for consistent path handling + + # Detect language from extension in manifest + lang = self.detect_language(manifest_path) # Use manifest path + if lang and (not langs or lang in langs): + files.append((file, lang)) + else: + # Fall back to original os.walk logic for backward compatibility + for dirpath, dirnames, filenames in os.walk(root_path): + # CRITICAL: Prune excluded directories before os.walk descends into them + # This prevents traversal into .venv and other SKIP_DIRS + dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS] + + # Also prune based on exclude_patterns + if self.exclude_patterns: + dirnames[:] = [d for d in dirnames + if not any(pattern in d for pattern in self.exclude_patterns)] + + # Process files in this directory + for filename in filenames: + file = Path(dirpath) / filename + if not self.should_skip(file): + lang = self.detect_language(file) + if lang and (not langs or lang in langs): + files.append((file, lang)) + + # Process files with progress bar to extract functions and calls + with click.progressbar( + files, + label="Building call graph", + show_pos=True, + show_percent=True, + show_eta=True, + item_show_func=lambda x: str(x[0].name) if x else None, + ) as bar: + for file_path, lang in bar: + rel_path = str(file_path.relative_to(root_path)).replace("\\", "/") # Normalize separators + module_id = rel_path # Already normalized + + # Extract exported functions/classes from database + exports = self.extract_exports_from_db(rel_path) + for export in exports: + func_id = f"{module_id}::{export}" + node = GraphNode( + id=func_id, + file=rel_path, # Already normalized + lang=lang, + type="function", + ) + nodes[func_id] = asdict(node) + + # Extract calls from database + calls = self.extract_calls_from_db(rel_path) + for call, method in calls: + # Try to resolve the call target + if method: + # Method call + target_id = f"{call}.{method}" + else: + # Function call + target_id = call + + # Create edge from module to called function + edge = GraphEdge( + source=module_id, + target=target_id, + type="call", + file=rel_path, # Already normalized + ) + edges.append(asdict(edge)) + + return { + "nodes": list(nodes.values()), + "edges": edges, + "metadata": { + "root": str(root_path), + "languages": langs or [], + "total_functions": len(nodes), + "total_calls": len(edges), + }, + } + + def merge_graphs(self, import_graph: dict, call_graph: dict) -> dict[str, Any]: + """Merge import and call graphs into a unified graph.""" + # Combine nodes (dedup by id) + nodes = {} + for node in import_graph["nodes"]: + nodes[node["id"]] = node + for node in call_graph["nodes"]: + nodes[node["id"]] = node + + # Combine edges + edges = import_graph["edges"] + call_graph["edges"] + + return { + "nodes": list(nodes.values()), + "edges": edges, + "metadata": { + "root": import_graph["metadata"]["root"], + "languages": list( + set( + import_graph["metadata"]["languages"] + + call_graph["metadata"].get("languages", []) + ) + ), + "total_nodes": len(nodes), + "total_edges": len(edges), + }, + } + + def _extract_imports_regex(self, file_path: Path, lang: str) -> list[str]: + """Regex-based fallback for extracting imports. + + This method is used when AST parsing fails or is unavailable. 
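+
+        Example (illustrative; assumes IMPORT_PATTERNS maps each language to
+        single-group regexes such as r'^\s*import\s+([\w.]+)'):
+
+            imports = self._extract_imports_regex(Path("app/models.py"), "python")
+            # imports might look like ["os", "django.db"]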
+ """ + if lang not in self.IMPORT_PATTERNS: + return [] + + imports = [] + patterns = [re.compile(p, re.MULTILINE) for p in self.IMPORT_PATTERNS[lang]] + + try: + with open(file_path, "r", encoding="utf-8", errors="ignore") as f: + content = f.read() + + for pattern in patterns: + matches = pattern.findall(content) + imports.extend(matches) + + except (IOError, UnicodeDecodeError, OSError) as e: + print(f"Warning: Failed to extract imports from {file_path}: {e}") + # Return empty list but LOG the failure + + return imports + + def _extract_exports_regex(self, file_path: Path, lang: str) -> list[str]: + """Regex-based fallback for extracting exports. + + This method is used when AST parsing fails or is unavailable. + """ + if lang not in self.EXPORT_PATTERNS: + return [] + + exports = [] + patterns = [re.compile(p, re.MULTILINE) for p in self.EXPORT_PATTERNS[lang]] + + try: + with open(file_path, "r", encoding="utf-8", errors="ignore") as f: + content = f.read() + + for pattern in patterns: + matches = pattern.findall(content) + # Flatten tuples if regex has groups + for match in matches: + if isinstance(match, tuple): + exports.extend([m for m in match if m]) + else: + exports.append(match) + + except (IOError, UnicodeDecodeError, OSError) as e: + print(f"Warning: Failed to extract exports from {file_path}: {e}") + # Return empty list but LOG the failure + + # Filter exports for Go (only capitalized are public) + if lang == "go": + exports = [e for e in exports if e and e[0].isupper()] + + return exports + + def _extract_calls_regex(self, file_path: Path, lang: str) -> list[tuple[str, str | None]]: + """Regex-based fallback for extracting function calls. + + This method is used when AST parsing fails or is unavailable. + """ + if lang not in self.CALL_PATTERNS: + return [] + + calls = [] + patterns = [re.compile(p) for p in self.CALL_PATTERNS[lang]] + + try: + with open(file_path, "r", encoding="utf-8", errors="ignore") as f: + content = f.read() + + for pattern in patterns: + matches = pattern.findall(content) + for match in matches: + if isinstance(match, tuple): + # Method call: (object, method) + calls.append(match) + else: + # Function call + calls.append((match, None)) + + except (IOError, UnicodeDecodeError, OSError) as e: + print(f"Warning: Failed to extract calls from {file_path}: {e}") + # Return empty list but LOG the failure + + return calls \ No newline at end of file diff --git a/theauditor/graph/insights.py b/theauditor/graph/insights.py new file mode 100644 index 0000000..963c874 --- /dev/null +++ b/theauditor/graph/insights.py @@ -0,0 +1,17 @@ +"""Backward compatibility shim for graph insights. + +This file exists to maintain backward compatibility for code that imports +from theauditor.graph.insights directly. All functionality has been moved to +theauditor.insights.graph for better organization. 
+ +This ensures that: + - from theauditor.graph.insights import GraphInsights # STILL WORKS + - from theauditor.graph import insights # STILL WORKS + - import theauditor.graph.insights # STILL WORKS +""" + +# Import everything from the new location +from theauditor.insights.graph import * + +# This shim ensures 100% backward compatibility while the actual +# implementation is now in theauditor/insights/graph.py \ No newline at end of file diff --git a/theauditor/graph/store.py b/theauditor/graph/store.py new file mode 100644 index 0000000..10bb450 --- /dev/null +++ b/theauditor/graph/store.py @@ -0,0 +1,444 @@ +"""Graph store module - persistence and database operations for graphs.""" + +import json +import sqlite3 +from pathlib import Path +from typing import Any + + +class XGraphStore: + """Store and query cross-project graphs in SQLite.""" + + def __init__(self, db_path: str = "./.pf/graphs.db"): + """ + Initialize store with database path. + + Args: + db_path: Path to SQLite database + """ + self.db_path = Path(db_path) + self.db_path.parent.mkdir(parents=True, exist_ok=True) + self._init_schema() + + def _init_schema(self) -> None: + """Initialize database schema.""" + with sqlite3.connect(self.db_path) as conn: + # Nodes table + conn.execute(""" + CREATE TABLE IF NOT EXISTS nodes ( + id TEXT PRIMARY KEY, + file TEXT NOT NULL, + lang TEXT, + loc INTEGER DEFAULT 0, + churn INTEGER, + type TEXT DEFAULT 'module', + graph_type TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) + """) + + # Edges table + conn.execute(""" + CREATE TABLE IF NOT EXISTS edges ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source TEXT NOT NULL, + target TEXT NOT NULL, + type TEXT DEFAULT 'import', + file TEXT, + line INTEGER, + graph_type TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + UNIQUE(source, target, type, graph_type) + ) + """) + + # Analysis results table + conn.execute(""" + CREATE TABLE IF NOT EXISTS analysis_results ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + analysis_type TEXT NOT NULL, + result_json TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) + """) + + # Create indexes + conn.execute("CREATE INDEX IF NOT EXISTS idx_edges_source ON edges(source)") + conn.execute("CREATE INDEX IF NOT EXISTS idx_edges_target ON edges(target)") + conn.execute("CREATE INDEX IF NOT EXISTS idx_nodes_file ON nodes(file)") + conn.execute("CREATE INDEX IF NOT EXISTS idx_nodes_type ON nodes(type)") + + conn.commit() + + def save_import_graph(self, graph: dict[str, Any]) -> None: + """ + Save import graph to database. 
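+
+        Example (illustrative; node and edge keys mirror the columns below):
+
+            store = XGraphStore("./.pf/graphs.db")
+            store.save_import_graph({
+                "nodes": [{"id": "app/main.py", "file": "app/main.py", "lang": "python"}],
+                "edges": [{"source": "app/main.py", "target": "app/utils.py", "type": "import"}],
+            })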
+ + Args: + graph: Import graph with nodes and edges + """ + with sqlite3.connect(self.db_path) as conn: + # Clear existing import graph + conn.execute("DELETE FROM nodes WHERE graph_type = 'import'") + conn.execute("DELETE FROM edges WHERE graph_type = 'import'") + + # Insert nodes + for node in graph.get("nodes", []): + conn.execute( + """ + INSERT OR REPLACE INTO nodes + (id, file, lang, loc, churn, type, graph_type) + VALUES (?, ?, ?, ?, ?, ?, 'import') + """, + ( + node["id"], + node["file"], + node.get("lang"), + node.get("loc", 0), + node.get("churn"), + node.get("type", "module"), + ), + ) + + # Insert edges + for edge in graph.get("edges", []): + conn.execute( + """ + INSERT OR IGNORE INTO edges + (source, target, type, file, line, graph_type) + VALUES (?, ?, ?, ?, ?, 'import') + """, + ( + edge["source"], + edge["target"], + edge.get("type", "import"), + edge.get("file"), + edge.get("line"), + ), + ) + + conn.commit() + + def save_call_graph(self, graph: dict[str, Any]) -> None: + """ + Save call graph to database. + + Args: + graph: Call graph with nodes and edges + """ + with sqlite3.connect(self.db_path) as conn: + # Clear existing call graph + conn.execute("DELETE FROM nodes WHERE graph_type = 'call'") + conn.execute("DELETE FROM edges WHERE graph_type = 'call'") + + # Insert nodes + for node in graph.get("nodes", []): + conn.execute( + """ + INSERT OR REPLACE INTO nodes + (id, file, lang, loc, churn, type, graph_type) + VALUES (?, ?, ?, ?, ?, ?, 'call') + """, + ( + node["id"], + node["file"], + node.get("lang"), + node.get("loc", 0), + node.get("churn"), + node.get("type", "function"), + ), + ) + + # Insert edges + for edge in graph.get("edges", []): + conn.execute( + """ + INSERT OR IGNORE INTO edges + (source, target, type, file, line, graph_type) + VALUES (?, ?, ?, ?, ?, 'call') + """, + ( + edge["source"], + edge["target"], + edge.get("type", "call"), + edge.get("file"), + edge.get("line"), + ), + ) + + conn.commit() + + def load_import_graph(self) -> dict[str, Any]: + """ + Load import graph from database. + + Returns: + Import graph dict + """ + with sqlite3.connect(self.db_path) as conn: + conn.row_factory = sqlite3.Row + + # Load nodes + nodes = [] + for row in conn.execute( + "SELECT * FROM nodes WHERE graph_type = 'import'" + ): + nodes.append({ + "id": row["id"], + "file": row["file"], + "lang": row["lang"], + "loc": row["loc"], + "churn": row["churn"], + "type": row["type"], + }) + + # Load edges + edges = [] + for row in conn.execute( + "SELECT * FROM edges WHERE graph_type = 'import'" + ): + edges.append({ + "source": row["source"], + "target": row["target"], + "type": row["type"], + "file": row["file"], + "line": row["line"], + }) + + return {"nodes": nodes, "edges": edges} + + def load_call_graph(self) -> dict[str, Any]: + """ + Load call graph from database. 
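+
+        Example (illustrative):
+
+            graph = store.load_call_graph()
+            # graph == {"nodes": [...], "edges": [...]}, the same shape
+            # accepted by save_call_graph()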
+ + Returns: + Call graph dict + """ + with sqlite3.connect(self.db_path) as conn: + conn.row_factory = sqlite3.Row + + # Load nodes + nodes = [] + for row in conn.execute( + "SELECT * FROM nodes WHERE graph_type = 'call'" + ): + nodes.append({ + "id": row["id"], + "file": row["file"], + "lang": row["lang"], + "loc": row["loc"], + "churn": row["churn"], + "type": row["type"], + }) + + # Load edges + edges = [] + for row in conn.execute( + "SELECT * FROM edges WHERE graph_type = 'call'" + ): + edges.append({ + "source": row["source"], + "target": row["target"], + "type": row["type"], + "file": row["file"], + "line": row["line"], + }) + + return {"nodes": nodes, "edges": edges} + + def query_dependencies( + self, + node_id: str, + direction: str = "both", + graph_type: str = "import" + ) -> dict[str, list[str]]: + """ + Query dependencies of a node. + + Args: + node_id: Node to query + direction: 'upstream', 'downstream', or 'both' + graph_type: 'import' or 'call' + + Returns: + Dict with upstream and/or downstream dependencies + """ + result = {} + + with sqlite3.connect(self.db_path) as conn: + if direction in ["upstream", "both"]: + # Find who depends on this node + upstream = [] + for row in conn.execute( + "SELECT DISTINCT source FROM edges WHERE target = ? AND graph_type = ?", + (node_id, graph_type) + ): + upstream.append(row[0]) + result["upstream"] = upstream + + if direction in ["downstream", "both"]: + # Find what this node depends on + downstream = [] + for row in conn.execute( + "SELECT DISTINCT target FROM edges WHERE source = ? AND graph_type = ?", + (node_id, graph_type) + ): + downstream.append(row[0]) + result["downstream"] = downstream + + return result + + def query_calls( + self, + node_id: str, + direction: str = "both" + ) -> dict[str, list[str]]: + """ + Query function calls related to a node. + + Args: + node_id: Node to query + direction: 'callers', 'callees', or 'both' + + Returns: + Dict with callers and/or callees + """ + result = {} + + with sqlite3.connect(self.db_path) as conn: + if direction in ["callers", "both"]: + # Find who calls this function + callers = [] + for row in conn.execute( + "SELECT DISTINCT source FROM edges WHERE target = ? AND graph_type = 'call'", + (node_id,) + ): + callers.append(row[0]) + result["callers"] = callers + + if direction in ["callees", "both"]: + # Find what this function calls + callees = [] + for row in conn.execute( + "SELECT DISTINCT target FROM edges WHERE source = ? AND graph_type = 'call'", + (node_id,) + ): + callees.append(row[0]) + result["callees"] = callees + + return result + + def save_analysis_result( + self, + analysis_type: str, + result: dict[str, Any] + ) -> None: + """ + Save analysis result to database. + + Args: + analysis_type: Type of analysis (e.g., 'cycles', 'hotspots') + result: Analysis result dict + """ + with sqlite3.connect(self.db_path) as conn: + conn.execute( + """ + INSERT INTO analysis_results (analysis_type, result_json) + VALUES (?, ?) + """, + (analysis_type, json.dumps(result)) + ) + conn.commit() + + def get_latest_analysis(self, analysis_type: str) -> dict[str, Any] | None: + """ + Get most recent analysis result of given type. + + Args: + analysis_type: Type of analysis + + Returns: + Analysis result dict or None if not found + """ + with sqlite3.connect(self.db_path) as conn: + row = conn.execute( + """ + SELECT result_json FROM analysis_results + WHERE analysis_type = ? 
+ ORDER BY created_at DESC + LIMIT 1 + """, + (analysis_type,) + ).fetchone() + + if row: + return json.loads(row[0]) + return None + + def get_graph_stats(self) -> dict[str, Any]: + """ + Get summary statistics about stored graphs. + + Returns: + Dict with node and edge counts + """ + with sqlite3.connect(self.db_path) as conn: + stats = { + "import_nodes": conn.execute( + "SELECT COUNT(*) FROM nodes WHERE graph_type = 'import'" + ).fetchone()[0], + "import_edges": conn.execute( + "SELECT COUNT(*) FROM edges WHERE graph_type = 'import'" + ).fetchone()[0], + "call_nodes": conn.execute( + "SELECT COUNT(*) FROM nodes WHERE graph_type = 'call'" + ).fetchone()[0], + "call_edges": conn.execute( + "SELECT COUNT(*) FROM edges WHERE graph_type = 'call'" + ).fetchone()[0], + } + + return stats + + def get_high_risk_nodes(self, threshold: float = 0.5, limit: int = 10) -> list[dict[str, Any]]: + """ + Get nodes with high risk based on connectivity and churn. + + Args: + threshold: Risk threshold (0-1) + limit: Maximum number of nodes to return + + Returns: + List of high-risk nodes + """ + with sqlite3.connect(self.db_path) as conn: + conn.row_factory = sqlite3.Row + + # Calculate risk based on in-degree and churn + query = """ + SELECT + n.id, + n.file, + n.churn, + COUNT(DISTINCT e.source) as in_degree, + (COUNT(DISTINCT e.source) * COALESCE(n.churn, 1)) / 100.0 as risk_score + FROM nodes n + LEFT JOIN edges e ON n.id = e.target + WHERE n.graph_type = 'import' + GROUP BY n.id + HAVING risk_score > ? + ORDER BY risk_score DESC + LIMIT ? + """ + + nodes = [] + for row in conn.execute(query, (threshold, limit)): + nodes.append({ + "id": row["id"], + "file": row["file"], + "churn": row["churn"], + "in_degree": row["in_degree"], + "risk_score": row["risk_score"], + }) + + return nodes \ No newline at end of file diff --git a/theauditor/graph/visualizer.py b/theauditor/graph/visualizer.py new file mode 100644 index 0000000..59f0805 --- /dev/null +++ b/theauditor/graph/visualizer.py @@ -0,0 +1,937 @@ +"""Graph visualizer module - rich Graphviz visualization with visual intelligence. + +This module transforms raw graph data and analysis results into actionable +visualizations using Graphviz DOT format with intelligent visual encoding. 
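+
+Typical usage (illustrative):
+
+    visualizer = GraphVisualizer()
+    dot = visualizer.generate_dot(graph, analysis)
+    Path("graph.dot").write_text(dot)  # render with e.g. `dot -Tsvg graph.dot -o graph.svg`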
+ +Visual encoding strategy: +- Node color: Programming language +- Node size: Importance/connectivity (in-degree) +- Edge color: Red for cycles, gray for normal +- Edge style: Import type (solid/dashed/dotted) +- Node shape: Type (box=module, ellipse=function) +""" + +from collections import defaultdict +from pathlib import Path +from typing import Any, Dict, List, Set, Optional + + +class GraphVisualizer: + """Transform graph analysis into actionable visualizations.""" + + # Language colors - high contrast, colorblind-friendly palette + LANGUAGE_COLORS = { + 'python': '#3776AB', # Python blue + 'javascript': '#F7DF1E', # JS yellow + 'typescript': '#3178C6', # TS blue + 'java': '#007396', # Java blue-green + 'go': '#00ADD8', # Go cyan + 'rust': '#CE4E21', # Rust orange + 'c': '#A8B9CC', # C gray-blue + 'c++': '#00599C', # C++ dark blue + 'c#': '#239120', # C# green + 'ruby': '#CC342D', # Ruby red + 'php': '#777BB4', # PHP purple + 'default': '#808080', # Gray for unknown + } + + # Risk level colors for severity encoding + RISK_COLORS = { + 'critical': '#D32F2F', # Deep red + 'high': '#F57C00', # Orange + 'medium': '#FBC02D', # Yellow + 'low': '#689F38', # Green + 'info': '#1976D2', # Blue + } + + def __init__(self): + """Initialize the visualizer.""" + self.cycle_edges = set() # Track edges that are part of cycles + self.node_degrees = {} # Track in/out degrees for sizing + + def generate_dot( + self, + graph: Dict[str, Any], + analysis: Optional[Dict[str, Any]] = None, + options: Optional[Dict[str, Any]] = None, + ) -> str: + """ + Generate DOT format with visual intelligence encoding. + + Args: + graph: Graph dict with 'nodes' and 'edges' + analysis: Optional analysis results with cycles, hotspots, etc. + options: Optional visualization options + + Returns: + DOT format string ready for Graphviz + """ + options = options or {} + analysis = analysis or {} + + # Pre-process analysis data + self._process_analysis(graph, analysis) + + # Start DOT file + dot_lines = ['digraph G {'] + + # Global graph attributes + dot_lines.extend(self._generate_graph_attrs(options)) + + # Generate nodes with visual encoding + dot_lines.extend(self._generate_nodes(graph, analysis, options)) + + # Generate edges with visual encoding + dot_lines.extend(self._generate_edges(graph, analysis, options)) + + # Close graph + dot_lines.append('}') + + return '\n'.join(dot_lines) + + def _process_analysis( + self, + graph: Dict[str, Any], + analysis: Dict[str, Any] + ) -> None: + """Pre-process analysis data for quick lookup.""" + # Calculate node degrees + self.node_degrees.clear() + for edge in graph.get('edges', []): + source = edge.get('source', '') + target = edge.get('target', '') + + # Track out-degree + if source not in self.node_degrees: + self.node_degrees[source] = {'in': 0, 'out': 0} + self.node_degrees[source]['out'] += 1 + + # Track in-degree + if target not in self.node_degrees: + self.node_degrees[target] = {'in': 0, 'out': 0} + self.node_degrees[target]['in'] += 1 + + # Identify edges that are part of cycles + self.cycle_edges.clear() + cycles = analysis.get('cycles', []) + for cycle in cycles: + cycle_nodes = cycle.get('nodes', []) + # Mark edges between consecutive nodes in cycle + for i in range(len(cycle_nodes)): + source = cycle_nodes[i] + target = cycle_nodes[(i + 1) % len(cycle_nodes)] + self.cycle_edges.add((source, target)) + + def _generate_graph_attrs(self, options: Dict[str, Any]) -> List[str]: + """Generate global graph attributes.""" + attrs = [] + attrs.append(' rankdir=LR;') # Left to 
right layout + attrs.append(' bgcolor="white";') + attrs.append(' nodesep=0.5;') + attrs.append(' ranksep=1.0;') + attrs.append(' fontname="Arial";') + + # Default node attributes + attrs.append(' node [fontname="Arial", fontsize=10, style=filled];') + + # Default edge attributes + attrs.append(' edge [fontname="Arial", fontsize=8];') + + # Add title if provided + if options.get('title'): + attrs.append(f' label="{options["title"]}";') + attrs.append(' labelloc=t;') + attrs.append(' fontsize=14;') + + return attrs + + def _generate_nodes( + self, + graph: Dict[str, Any], + analysis: Dict[str, Any], + options: Dict[str, Any] + ) -> List[str]: + """Generate nodes with visual encoding.""" + node_lines = [] + nodes = graph.get('nodes', []) + + # Get hotspots for special highlighting + hotspots = analysis.get('hotspots', []) + hotspot_ids = {h['id']: h for h in hotspots[:10]} # Top 10 hotspots + + # Limit nodes if requested + max_nodes = options.get('max_nodes', 500) + if len(nodes) > max_nodes: + # Sort by importance (in-degree + out-degree) + nodes = sorted( + nodes, + key=lambda n: self.node_degrees.get( + n['id'], {'in': 0, 'out': 0} + )['in'] + self.node_degrees.get( + n['id'], {'in': 0, 'out': 0} + )['out'], + reverse=True + )[:max_nodes] + + for node in nodes: + node_id = node.get('id', '') + node_file = node.get('file', node_id) + node_lang = node.get('lang', 'default') + node_type = node.get('type', 'module') + + # Sanitize node ID for DOT format + safe_id = self._sanitize_id(node_id) + + # Determine node color based on language + color = self.LANGUAGE_COLORS.get(node_lang, self.LANGUAGE_COLORS['default']) + + # Determine node size based on in-degree (hotspot detection) + degrees = self.node_degrees.get(node_id, {'in': 0, 'out': 0}) + in_degree = degrees['in'] + + # Scale size based on in-degree (min 0.5, max 2.0) + if in_degree > 30: + size = 2.0 + elif in_degree > 20: + size = 1.5 + elif in_degree > 10: + size = 1.2 + elif in_degree > 5: + size = 1.0 + else: + size = 0.8 + + # Determine shape based on type + if node_type == 'function': + shape = 'ellipse' + elif node_type == 'class': + shape = 'diamond' + else: # module + shape = 'box' + + # Generate label (shortened for readability) + label = self._generate_node_label(node_id, node_file) + + # Build node attributes + attrs = [] + attrs.append(f'label="{label}"') + attrs.append(f'fillcolor="{color}"') + attrs.append(f'shape={shape}') + attrs.append(f'width={size}') + attrs.append(f'height={size * 0.7}') + + # Special styling for hotspots + if node_id in hotspot_ids: + attrs.append('penwidth=3') + attrs.append('fontsize=12') + attrs.append('fontcolor="black"') + # Add tooltip with hotspot info + hotspot = hotspot_ids[node_id] + tooltip = f"Hotspot: in={hotspot.get('in_degree', 0)}, out={hotspot.get('out_degree', 0)}" + attrs.append(f'tooltip="{tooltip}"') + else: + attrs.append('penwidth=1') + attrs.append('fontcolor="white"') + + # Create node line + node_line = f' {safe_id} [{", ".join(attrs)}];' + node_lines.append(node_line) + + return node_lines + + def _generate_edges( + self, + graph: Dict[str, Any], + analysis: Dict[str, Any], + options: Dict[str, Any] + ) -> List[str]: + """Generate edges with visual encoding.""" + edge_lines = [] + edges = graph.get('edges', []) + + # Get node IDs for filtering + node_ids = {n['id'] for n in graph.get('nodes', [])} + max_nodes = options.get('max_nodes', 500) + if len(node_ids) > max_nodes: + # Keep only edges between displayed nodes + important_nodes = set(list(node_ids)[:max_nodes]) + edges 
= [ + e for e in edges + if e.get('source') in important_nodes and e.get('target') in important_nodes + ] + + for edge in edges: + source = edge.get('source', '') + target = edge.get('target', '') + edge_type = edge.get('type', 'import') + + # Skip self-loops unless in options + if source == target and not options.get('show_self_loops'): + continue + + # Sanitize IDs + safe_source = self._sanitize_id(source) + safe_target = self._sanitize_id(target) + + # Build edge attributes + attrs = [] + + # Color red if part of a cycle + if (source, target) in self.cycle_edges: + attrs.append('color="#D32F2F"') # Red for cycles + attrs.append('penwidth=2') + attrs.append('fontcolor="#D32F2F"') + attrs.append('label="cycle"') + else: + attrs.append('color="#666666"') # Gray for normal + attrs.append('penwidth=1') + + # Style based on edge type + if edge_type == 'call': + attrs.append('style=dashed') + elif edge_type == 'extends' or edge_type == 'implements': + attrs.append('style=bold') + else: # import + attrs.append('style=solid') + + # Arrowhead style + if edge_type == 'extends': + attrs.append('arrowhead=empty') # Inheritance + elif edge_type == 'implements': + attrs.append('arrowhead=odiamond') # Interface + else: + attrs.append('arrowhead=normal') + + # Create edge line + if attrs: + edge_line = f' {safe_source} -> {safe_target} [{", ".join(attrs)}];' + else: + edge_line = f' {safe_source} -> {safe_target};' + + edge_lines.append(edge_line) + + return edge_lines + + def _sanitize_id(self, node_id: str) -> str: + """Sanitize node ID for DOT format.""" + # Replace problematic characters + safe_id = node_id.replace('.', '_') + safe_id = safe_id.replace('/', '_') + safe_id = safe_id.replace('\\', '_') + safe_id = safe_id.replace('-', '_') + safe_id = safe_id.replace(':', '_') + safe_id = safe_id.replace(' ', '_') + safe_id = safe_id.replace('(', '_') + safe_id = safe_id.replace(')', '_') + safe_id = safe_id.replace('[', '_') + safe_id = safe_id.replace(']', '_') + + # Ensure it starts with a letter or underscore + if safe_id and not safe_id[0].isalpha() and safe_id[0] != '_': + safe_id = '_' + safe_id + + # Quote if necessary + if safe_id and not safe_id.replace('_', '').isalnum(): + safe_id = f'"{safe_id}"' + + return safe_id + + def _generate_node_label(self, node_id: str, node_file: str) -> str: + """Generate readable label for a node.""" + # Use filename for modules, full ID for functions + if '::' in node_id: # Function node + # Show module::function + parts = node_id.split('::') + if len(parts) >= 2: + module = Path(parts[0]).stem # Just filename without extension + function = parts[1] + return f"{module}::{function}" + return node_id + else: # Module node + # Show just the filename without path + path = Path(node_file) + if path.parts: + # Show last 2 parts of path for context + if len(path.parts) > 2: + return f".../{path.parts[-2]}/{path.name}" + elif len(path.parts) > 1: + return f"{path.parts[-2]}/{path.name}" + else: + return path.name + return node_id + + def generate_dot_with_layers( + self, + graph: Dict[str, Any], + layers: Dict[int, List[str]], + analysis: Optional[Dict[str, Any]] = None, + options: Optional[Dict[str, Any]] = None, + ) -> str: + """ + Generate DOT format with architectural layers as subgraphs. 
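+
+        Example (illustrative; layer numbers map to lists of node ids):
+
+            layers = {0: ["app/models.py"], 1: ["app/views.py", "app/api.py"]}
+            dot = visualizer.generate_dot_with_layers(graph, layers)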
+ + Args: + graph: Graph dict with 'nodes' and 'edges' + layers: Dict mapping layer number to list of node IDs + analysis: Optional analysis results + options: Optional visualization options + + Returns: + DOT format string with layer subgraphs + """ + options = options or {} + analysis = analysis or {} + + # Pre-process analysis data + self._process_analysis(graph, analysis) + + # Build node lookup for efficiency + node_map = {n['id']: n for n in graph.get('nodes', []) if n.get('id') is not None} + + # Start DOT file + dot_lines = ['digraph G {'] + + # Global graph attributes + dot_lines.extend(self._generate_graph_attrs(options)) + dot_lines.append(' rankdir=TB;') # Top-to-bottom for layers + + # Generate layer subgraphs + # Filter out None keys and ensure all keys are comparable + valid_layer_nums = [k for k in layers.keys() if k is not None] + for layer_num in sorted(valid_layer_nums): + layer_nodes = layers[layer_num] + if not layer_nodes: + continue + + # Create subgraph for this layer + dot_lines.append(f' subgraph cluster_layer{layer_num} {{') + dot_lines.append(f' label="Layer {layer_num}";') + dot_lines.append(f' style=filled;') + dot_lines.append(f' fillcolor="#F0F0F0";') + dot_lines.append(f' color="#CCCCCC";') + dot_lines.append(f' fontsize=12;') + dot_lines.append(f' rank=same;') # Keep nodes at same level + + # Add nodes for this layer + for node_id in layer_nodes: + if node_id not in node_map: + continue + + node = node_map[node_id] + node_lang = node.get('lang', 'default') + node_type = node.get('type', 'module') + + # Sanitize node ID + safe_id = self._sanitize_id(node_id) + + # Determine node color based on language + color = self.LANGUAGE_COLORS.get(node_lang, self.LANGUAGE_COLORS['default']) + + # Determine node size based on in-degree + degrees = self.node_degrees.get(node_id, {'in': 0, 'out': 0}) + in_degree = degrees['in'] + + # Scale size based on in-degree + if in_degree > 30: + size = 2.0 + elif in_degree > 20: + size = 1.5 + elif in_degree > 10: + size = 1.2 + elif in_degree > 5: + size = 1.0 + else: + size = 0.8 + + # Determine shape based on type + if node_type == 'function': + shape = 'ellipse' + elif node_type == 'class': + shape = 'diamond' + else: # module + shape = 'box' + + # Generate label + label = self._generate_node_label(node_id, node.get('file', node_id)) + + # Check if node has churn data for border thickness + churn = node.get('churn', 0) + if churn is None: + churn = 0 + if churn > 100: + penwidth = 4 # Very high churn + elif churn > 50: + penwidth = 3 # High churn + elif churn > 20: + penwidth = 2 # Medium churn + else: + penwidth = 1 # Low/no churn + + # Build node attributes + attrs = [] + attrs.append(f'label="{label}"') + attrs.append(f'fillcolor="{color}"') + attrs.append(f'shape={shape}') + attrs.append(f'width={size}') + attrs.append(f'height={size * 0.7}') + attrs.append(f'penwidth={penwidth}') + attrs.append('fontcolor="white"') + attrs.append('style=filled') + + # Add tooltip with layer info + tooltip = f"Layer {layer_num}: {node_id}" + if churn > 0: + tooltip += f" (churn: {churn})" + attrs.append(f'tooltip="{tooltip}"') + + # Create node line + node_line = f' {safe_id} [{", ".join(attrs)}];' + dot_lines.append(node_line) + + dot_lines.append(' }') # Close subgraph + + # Generate edges (outside of subgraphs) + dot_lines.extend(self._generate_edges(graph, analysis, options)) + + # Close graph + dot_lines.append('}') + + return '\n'.join(dot_lines) + + def generate_impact_visualization( + self, + graph: Dict[str, Any], + impact: 
Dict[str, Any], + options: Optional[Dict[str, Any]] = None, + ) -> str: + """ + Generate DOT highlighting impact analysis results. + + Args: + graph: Graph dict with 'nodes' and 'edges' + impact: Impact analysis with targets, upstream, downstream + options: Optional visualization options + + Returns: + DOT format string with impact highlighting + """ + options = options or {} + + # Extract impact sets + targets = set(impact.get('targets', [])) + upstream = set(impact.get('upstream', [])) + downstream = set(impact.get('downstream', [])) + + # Pre-process analysis data + self._process_analysis(graph, {}) + + # Start DOT file + dot_lines = ['digraph G {'] + + # Global graph attributes + dot_lines.extend(self._generate_graph_attrs(options)) + + # Add legend for impact visualization + dot_lines.append(' subgraph cluster_legend {') + dot_lines.append(' label="Impact Analysis Legend";') + dot_lines.append(' style=filled;') + dot_lines.append(' fillcolor=white;') + dot_lines.append(' node [shape=box, style=filled];') + dot_lines.append(' legend_target [label="Target", fillcolor="#FF0000"];') + dot_lines.append(' legend_upstream [label="Upstream", fillcolor="#FF9800"];') + dot_lines.append(' legend_downstream [label="Downstream", fillcolor="#2196F3"];') + dot_lines.append(' legend_both [label="Both", fillcolor="#9C27B0"];') + dot_lines.append(' legend_unaffected [label="Unaffected", fillcolor="#808080"];') + dot_lines.append(' }') + + # Generate nodes with impact highlighting + node_lines = [] + for node in graph.get('nodes', []): + node_id = node.get('id', '') + node_file = node.get('file', node_id) + node_lang = node.get('lang', 'default') + node_type = node.get('type', 'module') + + # Sanitize node ID + safe_id = self._sanitize_id(node_id) + + # Determine impact color + if node_id in targets: + color = '#FF0000' # Red for target + fontcolor = 'white' + penwidth = 3 + elif node_id in upstream and node_id in downstream: + color = '#9C27B0' # Purple for both upstream and downstream + fontcolor = 'white' + penwidth = 2 + elif node_id in upstream: + color = '#FF9800' # Orange for upstream + fontcolor = 'white' + penwidth = 2 + elif node_id in downstream: + color = '#2196F3' # Blue for downstream + fontcolor = 'white' + penwidth = 2 + else: + color = '#E0E0E0' # Light gray for unaffected + fontcolor = 'black' + penwidth = 1 + + # Determine node size based on impact radius + degrees = self.node_degrees.get(node_id, {'in': 0, 'out': 0}) + if node_id in targets: + size = 1.5 # Targets are emphasized + elif node_id in upstream or node_id in downstream: + size = 1.2 # Affected nodes are slightly larger + else: + size = 0.8 # Unaffected nodes are smaller + + # Determine shape based on type + if node_type == 'function': + shape = 'ellipse' + elif node_type == 'class': + shape = 'diamond' + else: # module + shape = 'box' + + # Generate label + label = self._generate_node_label(node_id, node_file) + + # Build node attributes + attrs = [] + attrs.append(f'label="{label}"') + attrs.append(f'fillcolor="{color}"') + attrs.append(f'shape={shape}') + attrs.append(f'width={size}') + attrs.append(f'height={size * 0.7}') + attrs.append(f'penwidth={penwidth}') + attrs.append(f'fontcolor="{fontcolor}"') + attrs.append('style=filled') + + # Add tooltip with impact info + tooltip_parts = [] + if node_id in targets: + tooltip_parts.append("TARGET") + if node_id in upstream: + tooltip_parts.append("Upstream") + if node_id in downstream: + tooltip_parts.append("Downstream") + if tooltip_parts: + tooltip = f"{node_id}: {', 
'.join(tooltip_parts)}" + else: + tooltip = f"{node_id}: Unaffected" + attrs.append(f'tooltip="{tooltip}"') + + # Create node line + node_line = f' {safe_id} [{", ".join(attrs)}];' + node_lines.append(node_line) + + dot_lines.extend(node_lines) + + # Generate edges with impact highlighting + edge_lines = [] + for edge in graph.get('edges', []): + source = edge.get('source', '') + target = edge.get('target', '') + edge_type = edge.get('type', 'import') + + # Skip self-loops unless in options + if source == target and not options.get('show_self_loops'): + continue + + # Sanitize IDs + safe_source = self._sanitize_id(source) + safe_target = self._sanitize_id(target) + + # Build edge attributes + attrs = [] + + # Color edges based on impact path + if source in targets and target in downstream: + attrs.append('color="#FF0000"') # Red for direct impact + attrs.append('penwidth=3') + elif source in upstream and target in targets: + attrs.append('color="#FF9800"') # Orange for upstream to target + attrs.append('penwidth=2') + elif (source in targets or source in upstream or source in downstream) and \ + (target in targets or target in upstream or target in downstream): + attrs.append('color="#666666"') # Gray for affected connections + attrs.append('penwidth=1.5') + else: + attrs.append('color="#E0E0E0"') # Light gray for unaffected + attrs.append('penwidth=0.5') + attrs.append('style=dashed') + + # Arrowhead style + attrs.append('arrowhead=normal') + + # Create edge line + if attrs: + edge_line = f' {safe_source} -> {safe_target} [{", ".join(attrs)}];' + else: + edge_line = f' {safe_source} -> {safe_target};' + + edge_lines.append(edge_line) + + dot_lines.extend(edge_lines) + + # Close graph + dot_lines.append('}') + + return '\n'.join(dot_lines) + + def generate_cycles_only_view( + self, + graph: Dict[str, Any], + cycles: List[Dict[str, Any]], + options: Optional[Dict[str, Any]] = None, + ) -> str: + """ + Generate DOT format showing only nodes and edges involved in cycles. 
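+
+        Example (illustrative; each cycle is a dict carrying a 'nodes' list):
+
+            cycles = [{"nodes": ["a.py", "b.py", "c.py"]}]
+            dot = visualizer.generate_cycles_only_view(graph, cycles)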
+ + Args: + graph: Graph dict with 'nodes' and 'edges' + cycles: List of cycle dicts with 'nodes' lists + options: Optional visualization options + + Returns: + DOT format string with only cycle-related elements + """ + options = options or {} + + # Collect all nodes involved in cycles + cycle_nodes = set() + cycle_edges = set() + + for cycle in cycles: + nodes = cycle.get('nodes', []) + cycle_nodes.update(nodes) + + # Mark edges between consecutive nodes in cycle + for i in range(len(nodes)): + source = nodes[i] + target = nodes[(i + 1) % len(nodes)] + cycle_edges.add((source, target)) + + if not cycle_nodes: + # No cycles found + return 'digraph G {\n label="No cycles detected";\n}' + + # Filter graph to only cycle-related elements + filtered_graph = { + 'nodes': [n for n in graph.get('nodes', []) if n['id'] in cycle_nodes], + 'edges': [e for e in graph.get('edges', []) + if (e['source'], e['target']) in cycle_edges] + } + + # Pre-process for visualization + self.cycle_edges = cycle_edges # Mark for red highlighting + self._process_analysis(filtered_graph, {}) + + # Start DOT file + dot_lines = ['digraph G {'] + + # Global graph attributes + dot_lines.append(' label="Dependency Cycles Visualization";') + dot_lines.append(' labelloc=t;') + dot_lines.append(' fontsize=14;') + dot_lines.append(' bgcolor="white";') + dot_lines.append(' rankdir=LR;') + dot_lines.append(' node [fontname="Arial", fontsize=10, style=filled];') + dot_lines.append(' edge [fontname="Arial", fontsize=8];') + + # Group nodes by cycle for better visualization + for idx, cycle in enumerate(cycles): + cycle_node_set = set(cycle.get('nodes', [])) + + dot_lines.append(f' subgraph cluster_cycle{idx} {{') + dot_lines.append(f' label="Cycle {idx + 1} (size: {len(cycle_node_set)})";') + dot_lines.append(' style=filled;') + dot_lines.append(' fillcolor="#FFE0E0";') # Light red background + dot_lines.append(' color="#D32F2F";') # Red border + + # Add nodes for this cycle + for node in filtered_graph['nodes']: + if node['id'] not in cycle_node_set: + continue + + node_id = node['id'] + safe_id = self._sanitize_id(node_id) + label = self._generate_node_label(node_id, node.get('file', node_id)) + + # Node styling + attrs = [] + attrs.append(f'label="{label}"') + attrs.append('fillcolor="#FF5252"') # Red for cycle nodes + attrs.append('fontcolor="white"') + attrs.append('shape=box') + attrs.append('penwidth=2') + + node_line = f' {safe_id} [{", ".join(attrs)}];' + dot_lines.append(node_line) + + dot_lines.append(' }') + + # Add edges + for edge in filtered_graph['edges']: + source = edge['source'] + target = edge['target'] + + safe_source = self._sanitize_id(source) + safe_target = self._sanitize_id(target) + + attrs = [] + attrs.append('color="#D32F2F"') # Red for cycle edges + attrs.append('penwidth=2') + attrs.append('arrowhead=normal') + + edge_line = f' {safe_source} -> {safe_target} [{", ".join(attrs)}];' + dot_lines.append(edge_line) + + dot_lines.append('}') + + return '\n'.join(dot_lines) + + def generate_hotspots_only_view( + self, + graph: Dict[str, Any], + hotspots: List[Dict[str, Any]], + options: Optional[Dict[str, Any]] = None, + top_n: int = 10, + ) -> str: + """ + Generate DOT format showing only hotspot nodes and their connections. 
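+
+        Example (illustrative; hotspot entries carry 'id' plus degree counts):
+
+            hotspots = [{"id": "app/db.py", "in_degree": 42, "out_degree": 7}]
+            dot = visualizer.generate_hotspots_only_view(graph, hotspots, top_n=5)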
+ + Args: + graph: Graph dict with 'nodes' and 'edges' + hotspots: List of hotspot dicts with 'id' and metrics + options: Optional visualization options + top_n: Number of top hotspots to show (default: 10) + + Returns: + DOT format string with only hotspot-related elements + """ + options = options or {} + + # Get top N hotspots + top_hotspots = hotspots[:top_n] + hotspot_ids = {h['id'] for h in top_hotspots} + + if not hotspot_ids: + return 'digraph G {\n label="No hotspots detected";\n}' + + # Collect nodes connected to hotspots (1 degree of separation) + connected_nodes = set(hotspot_ids) + for edge in graph.get('edges', []): + if edge['source'] in hotspot_ids: + connected_nodes.add(edge['target']) + if edge['target'] in hotspot_ids: + connected_nodes.add(edge['source']) + + # Filter graph + filtered_graph = { + 'nodes': [n for n in graph.get('nodes', []) if n['id'] in connected_nodes], + 'edges': [e for e in graph.get('edges', []) + if e['source'] in connected_nodes and e['target'] in connected_nodes] + } + + # Pre-process + self._process_analysis(filtered_graph, {}) + + # Start DOT file + dot_lines = ['digraph G {'] + + # Global graph attributes + dot_lines.append(f' label="Top {top_n} Hotspots Visualization";') + dot_lines.append(' labelloc=t;') + dot_lines.append(' fontsize=14;') + dot_lines.append(' bgcolor="white";') + dot_lines.append(' rankdir=LR;') + dot_lines.append(' node [fontname="Arial", fontsize=10, style=filled];') + dot_lines.append(' edge [fontname="Arial", fontsize=8];') + + # Create hotspot lookup + hotspot_map = {h['id']: h for h in top_hotspots} + + # Generate nodes + for node in filtered_graph['nodes']: + node_id = node['id'] + safe_id = self._sanitize_id(node_id) + label = self._generate_node_label(node_id, node.get('file', node_id)) + + # Determine styling based on whether it's a hotspot + if node_id in hotspot_ids: + hotspot = hotspot_map[node_id] + in_degree = hotspot.get('in_degree', 0) + out_degree = hotspot.get('out_degree', 0) + + # Size based on total connections + total = in_degree + out_degree + if total > 50: + size = 2.5 + elif total > 30: + size = 2.0 + elif total > 20: + size = 1.5 + else: + size = 1.2 + + # Color intensity based on ranking + rank = list(hotspot_ids).index(node_id) + if rank == 0: + color = '#D32F2F' # Darkest red for #1 + elif rank < 3: + color = '#F44336' # Red for top 3 + elif rank < 5: + color = '#FF5722' # Deep orange for top 5 + else: + color = '#FF9800' # Orange for rest + + attrs = [] + attrs.append(f'label="{label}\\n[in:{in_degree} out:{out_degree}]"') + attrs.append(f'fillcolor="{color}"') + attrs.append('fontcolor="white"') + attrs.append('shape=box') + attrs.append(f'width={size}') + attrs.append(f'height={size * 0.7}') + attrs.append('penwidth=3') + + # Tooltip + tooltip = f"Hotspot #{rank+1}: in={in_degree}, out={out_degree}" + attrs.append(f'tooltip="{tooltip}"') + else: + # Connected node (not a hotspot) + attrs = [] + attrs.append(f'label="{label}"') + attrs.append('fillcolor="#E0E0E0"') + attrs.append('fontcolor="black"') + attrs.append('shape=box') + attrs.append('width=0.8') + attrs.append('height=0.6') + attrs.append('penwidth=1') + + node_line = f' {safe_id} [{", ".join(attrs)}];' + dot_lines.append(node_line) + + # Generate edges + for edge in filtered_graph['edges']: + source = edge['source'] + target = edge['target'] + + safe_source = self._sanitize_id(source) + safe_target = self._sanitize_id(target) + + # Highlight edges connected to hotspots + if source in hotspot_ids or target in hotspot_ids: + attrs = 
['color="#666666"', 'penwidth=1.5'] + else: + attrs = ['color="#CCCCCC"', 'penwidth=0.5'] + + attrs.append('arrowhead=normal') + + edge_line = f' {safe_source} -> {safe_target} [{", ".join(attrs)}];' + dot_lines.append(edge_line) + + dot_lines.append('}') + + return '\n'.join(dot_lines) \ No newline at end of file diff --git a/theauditor/impact_analyzer.py b/theauditor/impact_analyzer.py new file mode 100644 index 0000000..4ecd5f2 --- /dev/null +++ b/theauditor/impact_analyzer.py @@ -0,0 +1,683 @@ +"""Impact analysis engine for tracing code dependencies and change blast radius.""" + +import sqlite3 +from pathlib import Path +from typing import Dict, List, Optional, Any, Set, Tuple + + +def analyze_impact( + db_path: str, + target_file: str, + target_line: int, + trace_to_backend: bool = False +) -> Dict[str, Any]: + """ + Analyze the impact of changing code at a specific file and line. + + Traces both upstream dependencies (who calls this) and downstream + dependencies (what this calls) to understand the blast radius of changes. + + Args: + db_path: Path to the SQLite database + target_file: Path to the file containing the target code + target_line: Line number of the target code + + Returns: + Dictionary containing: + - target_symbol: Name and type of the symbol at target location + - upstream: List of symbols that call the target (callers) + - downstream: List of symbols called by the target (callees) + - impact_summary: Statistics about the blast radius + """ + # Connect to database + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + try: + # Normalize the target file path to match database format + target_file = Path(target_file).as_posix() + if target_file.startswith("./"): + target_file = target_file[2:] + + # Check if cross-stack analysis is requested + if trace_to_backend and target_file.endswith(('.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs')): + # Attempt cross-stack tracing + cross_stack_trace = trace_frontend_to_backend(cursor, target_file, target_line) + + if cross_stack_trace: + # Found a backend endpoint - analyze its downstream impact + backend_file = cross_stack_trace["backend"]["file"] + backend_line = cross_stack_trace["backend"]["line"] + + # Find the backend function/class at the traced location + cursor.execute(""" + SELECT name, type, line, col + FROM symbols + WHERE path = ? + AND type IN ('function', 'class') + AND line <= ? 
+ ORDER BY line DESC + LIMIT 1 + """, (backend_file, backend_line)) + + backend_result = cursor.fetchone() + + if backend_result: + backend_name, backend_type, backend_def_line, backend_col = backend_result + + # Only get downstream dependencies from backend (not upstream) + downstream = find_downstream_dependencies(cursor, backend_file, backend_def_line, backend_name) + downstream_transitive = calculate_transitive_impact(cursor, downstream, "downstream") + + # Build cross-stack response + return { + "cross_stack_trace": cross_stack_trace, + "target_symbol": { + "name": f"API Call to {cross_stack_trace['frontend']['url']}", + "type": "api_call", + "file": target_file, + "line": target_line, + "column": 0 + }, + "backend_symbol": { + "name": backend_name, + "type": backend_type, + "file": backend_file, + "line": backend_def_line, + "column": backend_col + }, + "upstream": [], # Frontend has no upstream in this context + "upstream_transitive": [], + "downstream": downstream, + "downstream_transitive": downstream_transitive, + "impact_summary": { + "direct_upstream": 0, + "direct_downstream": len(downstream), + "total_upstream": 0, + "total_downstream": len(downstream) + len(downstream_transitive), + "total_impact": len(downstream) + len(downstream_transitive), + "affected_files": len(set( + [d["file"] for d in downstream] + + [d["file"] for d in downstream_transitive] + )), + "cross_stack": True + } + } + + # Step 1: Find the target symbol at the specified location + # Look for function or class definition at or near the target line + cursor.execute(""" + SELECT name, type, line, col + FROM symbols + WHERE path = ? + AND type IN ('function', 'class') + AND line <= ? + ORDER BY line DESC + LIMIT 1 + """, (target_file, target_line)) + + target_result = cursor.fetchone() + + if not target_result: + # No function/class found, return empty analysis + return { + "target_symbol": None, + "error": f"No function or class found at {target_file}:{target_line}", + "upstream": [], + "downstream": [], + "impact_summary": { + "total_upstream": 0, + "total_downstream": 0, + "total_impact": 0 + } + } + + target_name, target_type, target_def_line, target_col = target_result + + # Step 2: Find upstream dependencies (who calls this symbol) + upstream = find_upstream_dependencies(cursor, target_file, target_name, target_type) + + # Step 3: Find downstream dependencies (what this symbol calls) + downstream = find_downstream_dependencies(cursor, target_file, target_def_line, target_name) + + # Step 4: Calculate transitive impact (recursive dependencies) + upstream_transitive = calculate_transitive_impact(cursor, upstream, "upstream") + downstream_transitive = calculate_transitive_impact(cursor, downstream, "downstream") + + # Build response + return { + "target_symbol": { + "name": target_name, + "type": target_type, + "file": target_file, + "line": target_def_line, + "column": target_col + }, + "upstream": upstream, + "upstream_transitive": upstream_transitive, + "downstream": downstream, + "downstream_transitive": downstream_transitive, + "impact_summary": { + "direct_upstream": len(upstream), + "direct_downstream": len(downstream), + "total_upstream": len(upstream) + len(upstream_transitive), + "total_downstream": len(downstream) + len(downstream_transitive), + "total_impact": len(upstream) + len(downstream) + len(upstream_transitive) + len(downstream_transitive), + "affected_files": len(set( + [u["file"] for u in upstream] + + [d["file"] for d in downstream] + + [u["file"] for u in upstream_transitive] + + 
[d["file"] for d in downstream_transitive] + )) + } + } + + finally: + conn.close() + + +def find_upstream_dependencies( + cursor: sqlite3.Cursor, + target_file: str, + target_name: str, + target_type: str +) -> List[Dict[str, Any]]: + """ + Find all symbols that call the target symbol (upstream dependencies). + + Args: + cursor: Database cursor + target_file: File containing the target symbol + target_name: Name of the target symbol + target_type: Type of the target symbol (function/class) + + Returns: + List of upstream dependency dictionaries + """ + upstream = [] + + # Find all calls to this symbol + # Match by name (simple matching, could be enhanced with qualified names) + cursor.execute(""" + SELECT DISTINCT s1.path, s1.name, s1.type, s1.line, s1.col + FROM symbols s1 + WHERE s1.type = 'call' + AND s1.name = ? + AND EXISTS ( + SELECT 1 FROM symbols s2 + WHERE s2.path = s1.path + AND s2.type IN ('function', 'class') + AND s2.line <= s1.line + AND s2.name != ? + ) + ORDER BY s1.path, s1.line + """, (target_name, target_name)) + + for row in cursor.fetchall(): + call_file, call_name, call_type, call_line, call_col = row + + # Find the containing function/class for this call + cursor.execute(""" + SELECT name, type, line + FROM symbols + WHERE path = ? + AND type IN ('function', 'class') + AND line <= ? + ORDER BY line DESC + LIMIT 1 + """, (call_file, call_line)) + + container = cursor.fetchone() + if container: + container_name, container_type, container_line = container + upstream.append({ + "file": call_file, + "symbol": container_name, + "type": container_type, + "line": container_line, + "call_line": call_line, + "calls": target_name + }) + + # Deduplicate by file+symbol combination + seen = set() + unique_upstream = [] + for dep in upstream: + key = (dep["file"], dep["symbol"]) + if key not in seen: + seen.add(key) + unique_upstream.append(dep) + + return unique_upstream + + +def find_downstream_dependencies( + cursor: sqlite3.Cursor, + target_file: str, + target_line: int, + target_name: str +) -> List[Dict[str, Any]]: + """ + Find all symbols called by the target symbol (downstream dependencies). + + Args: + cursor: Database cursor + target_file: File containing the target symbol + target_line: Line where target symbol is defined + target_name: Name of the target symbol + + Returns: + List of downstream dependency dictionaries + """ + downstream = [] + + # Find the end line of the target function/class + # Look for the next function/class definition in the same file + cursor.execute(""" + SELECT line + FROM symbols + WHERE path = ? + AND type IN ('function', 'class') + AND line > ? + ORDER BY line + LIMIT 1 + """, (target_file, target_line)) + + next_symbol = cursor.fetchone() + end_line = next_symbol[0] if next_symbol else 999999 + + # Find all calls within the target function/class body + cursor.execute(""" + SELECT DISTINCT name, line, col + FROM symbols + WHERE path = ? + AND type = 'call' + AND line > ? + AND line < ? + ORDER BY line + """, (target_file, target_line, end_line)) + + for row in cursor.fetchall(): + called_name, call_line, call_col = row + + # Skip recursive calls + if called_name == target_name: + continue + + # Try to find the definition of the called symbol + cursor.execute(""" + SELECT path, type, line + FROM symbols + WHERE name = ? 
+ AND type IN ('function', 'class') + LIMIT 1 + """, (called_name,)) + + definition = cursor.fetchone() + if definition: + def_file, def_type, def_line = definition + downstream.append({ + "file": def_file, + "symbol": called_name, + "type": def_type, + "line": def_line, + "called_from_line": call_line, + "called_by": target_name + }) + else: + # External or built-in function + downstream.append({ + "file": "external", + "symbol": called_name, + "type": "unknown", + "line": 0, + "called_from_line": call_line, + "called_by": target_name + }) + + # Deduplicate by symbol name + seen = set() + unique_downstream = [] + for dep in downstream: + if dep["symbol"] not in seen: + seen.add(dep["symbol"]) + unique_downstream.append(dep) + + return unique_downstream + + +def calculate_transitive_impact( + cursor: sqlite3.Cursor, + direct_deps: List[Dict[str, Any]], + direction: str, + max_depth: int = 2, + visited: Optional[Set[Tuple[str, str]]] = None +) -> List[Dict[str, Any]]: + """ + Calculate transitive dependencies up to max_depth. + + Args: + cursor: Database cursor + direct_deps: Direct dependencies to expand + direction: "upstream" or "downstream" + max_depth: Maximum recursion depth + visited: Set of already visited (file, symbol) pairs + + Returns: + List of transitive dependencies + """ + if max_depth <= 0 or not direct_deps: + return [] + + if visited is None: + visited = set() + + transitive = [] + + for dep in direct_deps: + # Skip external dependencies + if dep["file"] == "external": + continue + + dep_key = (dep["file"], dep["symbol"]) + if dep_key in visited: + continue + visited.add(dep_key) + + if direction == "upstream": + # Find who calls this dependency + next_level = find_upstream_dependencies( + cursor, dep["file"], dep["symbol"], dep["type"] + ) + else: + # Find what this dependency calls + next_level = find_downstream_dependencies( + cursor, dep["file"], dep["line"], dep["symbol"] + ) + + # Add current level + for next_dep in next_level: + next_dep["depth"] = max_depth + transitive.append(next_dep) + + # Recurse + recursive_deps = calculate_transitive_impact( + cursor, next_level, direction, max_depth - 1, visited + ) + transitive.extend(recursive_deps) + + return transitive + + +def trace_frontend_to_backend( + cursor: sqlite3.Cursor, + target_file: str, + target_line: int +) -> Optional[Dict[str, Any]]: + """ + Trace a frontend API call to its corresponding backend endpoint. 
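+
+    Example (illustrative; assumes the indexer has already populated the
+    api_endpoints table):
+
+        trace = trace_frontend_to_backend(cursor, "src/api/client.ts", 42)
+        # trace["backend"]["pattern"] might be "/api/users/:id"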
+
+    Args:
+        cursor: Database cursor
+        target_file: Frontend file containing API call
+        target_line: Line number of the API call
+
+    Returns:
+        Dictionary with cross-stack trace information or None if not found
+    """
+    import re
+    from pathlib import Path
+
+    # Read the target file to extract API call details
+    try:
+        file_path = Path(target_file)
+        if not file_path.exists():
+            # Try relative path
+            file_path = Path(".") / target_file
+            if not file_path.exists():
+                return None
+
+        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+            lines = f.readlines()
+
+        # Get context around the target line (5 lines before and after)
+        start_idx = max(0, target_line - 6)  # -6 because line numbers are 1-based
+        end_idx = min(len(lines), target_line + 5)
+        context_lines = lines[start_idx:end_idx]
+        context = ''.join(context_lines)
+
+        # Extract API call patterns
+        # Common patterns: axios.get('/api/users'), fetch('/api/users'), http.post('/api/items')
+        api_patterns = [
+            # axios patterns
+            r'axios\.(get|post|put|patch|delete)\s*\(\s*[\'"`]([^\'"`]+)[\'"`]',
+            # fetch patterns
+            r'fetch\s*\(\s*[\'"`]([^\'"`]+)[\'"`].*method:\s*[\'"`](GET|POST|PUT|PATCH|DELETE)[\'"`]',
+            # fetch with default GET
+            r'fetch\s*\(\s*[\'"`]([^\'"`]+)[\'"`]',
+            # http/request patterns
+            r'(http|request)\.(get|post|put|patch|delete)\s*\(\s*[\'"`]([^\'"`]+)[\'"`]',
+            # jQuery ajax
+            r'\$\.(ajax|get|post)\s*\(\s*\{[^}]*url:\s*[\'"`]([^\'"`]+)[\'"`]',
+        ]
+
+        method = None
+        url_path = None
+
+        for pattern in api_patterns:
+            match = re.search(pattern, context, re.IGNORECASE | re.MULTILINE)
+            if match:
+                groups = match.groups()
+                if 'fetch' in pattern and len(groups) == 2:
+                    # fetch with explicit method
+                    url_path = groups[0]
+                    method = groups[1].upper()
+                elif 'fetch' in pattern and len(groups) == 1:
+                    # fetch defaults to GET
+                    url_path = groups[0]
+                    method = 'GET'
+                elif len(groups) >= 2:
+                    # axios, http, request patterns
+                    if pattern.startswith(r'axios'):
+                        method = groups[0].upper()
+                        url_path = groups[1]
+                    elif pattern.startswith(r'(http|request)'):
+                        method = groups[1].upper()
+                        url_path = groups[2]
+                    elif pattern.startswith(r'\$'):
+                        # jQuery
+                        url_path = groups[1]
+                        if groups[0] == 'ajax':
+                            # Look for method in context
+                            method_match = re.search(r'type:\s*[\'"`](GET|POST|PUT|PATCH|DELETE)[\'"`]', context)
+                            method = method_match.group(1).upper() if method_match else 'GET'
+                        elif groups[0] == 'get':
+                            method = 'GET'
+                        elif groups[0] == 'post':
+                            method = 'POST'
+                break
+
+        if not url_path or not method:
+            return None
+
+        # Clean up the URL path
+        # Remove query parameters and fragments
+        url_path = url_path.split('?')[0].split('#')[0]
+        # Remove any template literals (${...})
+        url_path = re.sub(r'\$\{[^}]+\}', '*', url_path)
+
+        # Query the api_endpoints table to find matching backend endpoint
+        # Try exact match first
+        cursor.execute("""
+            SELECT file, method, pattern, controls
+            FROM api_endpoints
+            WHERE pattern = ? AND method = ?
+            LIMIT 1
+        """, (url_path, method))
+
+        backend_match = cursor.fetchone()
+
+        if not backend_match:
+            # Try pattern matching (e.g., /api/users/* matches /api/users/:id)
+            # Convert URL to SQL LIKE pattern
+            like_pattern = url_path.replace('*', '%')
+
+            cursor.execute("""
+                SELECT file, method, pattern, controls
+                FROM api_endpoints
+                WHERE ? LIKE REPLACE(REPLACE(pattern, ':id', '%'), ':{param}', '%')
+                AND method = ?
+ LIMIT 1 + """, (url_path, method)) + + backend_match = cursor.fetchone() + + if not backend_match: + # No matching backend endpoint found + return None + + backend_file, backend_method, backend_pattern, backend_controls = backend_match + + # Find the exact line number of the backend endpoint + cursor.execute(""" + SELECT line + FROM symbols + WHERE path = ? AND type = 'function' + ORDER BY line + LIMIT 1 + """, (backend_file,)) + + line_result = cursor.fetchone() + backend_line = line_result[0] if line_result else 1 + + return { + "frontend": { + "file": target_file, + "line": target_line, + "method": method, + "url": url_path + }, + "backend": { + "file": backend_file, + "line": backend_line, + "method": backend_method, + "pattern": backend_pattern, + "controls": backend_controls + } + } + + except Exception as e: + # Error reading file or parsing + return None + + +def format_impact_report(impact_data: Dict[str, Any]) -> str: + """ + Format impact analysis results into a human-readable report. + + Args: + impact_data: Results from analyze_impact + + Returns: + Formatted string report + """ + lines = [] + + # Header + lines.append("=" * 60) + lines.append("IMPACT ANALYSIS REPORT") + lines.append("=" * 60) + + # Target symbol + if impact_data.get("error"): + lines.append(f"\nError: {impact_data['error']}") + return "\n".join(lines) + + # Check for cross-stack trace + if impact_data.get("cross_stack_trace"): + trace = impact_data["cross_stack_trace"] + lines.append(f"\n{'─' * 40}") + lines.append("FRONTEND TO BACKEND TRACE") + lines.append(f"{'─' * 40}") + lines.append(f"Frontend API Call:") + lines.append(f" File: {trace['frontend']['file']}:{trace['frontend']['line']}") + lines.append(f" Method: {trace['frontend']['method']}") + lines.append(f" URL: {trace['frontend']['url']}") + lines.append(f"\nBackend Endpoint:") + lines.append(f" File: {trace['backend']['file']}:{trace['backend']['line']}") + lines.append(f" Method: {trace['backend']['method']}") + lines.append(f" Pattern: {trace['backend']['pattern']}") + if trace['backend'].get('controls') and trace['backend']['controls'] != '[]': + lines.append(f" Security Controls: {trace['backend']['controls']}") + + # Show backend symbol as the primary target + if impact_data.get("backend_symbol"): + backend = impact_data["backend_symbol"] + lines.append(f"\nBackend Function: {backend['name']} ({backend['type']})") + lines.append(f"Location: {backend['file']}:{backend['line']}") + else: + target = impact_data["target_symbol"] + lines.append(f"\nTarget Symbol: {target['name']} ({target['type']})") + lines.append(f"Location: {target['file']}:{target['line']}") + + # Impact summary + summary = impact_data["impact_summary"] + lines.append(f"\n{'─' * 40}") + lines.append("IMPACT SUMMARY") + lines.append(f"{'─' * 40}") + lines.append(f"Direct Upstream Dependencies: {summary['direct_upstream']}") + lines.append(f"Direct Downstream Dependencies: {summary['direct_downstream']}") + lines.append(f"Total Upstream (including transitive): {summary['total_upstream']}") + lines.append(f"Total Downstream (including transitive): {summary['total_downstream']}") + lines.append(f"Total Impact Radius: {summary['total_impact']} symbols") + lines.append(f"Affected Files: {summary['affected_files']}") + + # Upstream dependencies + if impact_data["upstream"]: + lines.append(f"\n{'─' * 40}") + lines.append("UPSTREAM DEPENDENCIES (Who calls this)") + lines.append(f"{'─' * 40}") + for dep in impact_data["upstream"][:10]: # Limit to first 10 + lines.append(f" • 
{dep['symbol']} ({dep['type']}) in {dep['file']}:{dep['line']}") + if len(impact_data["upstream"]) > 10: + lines.append(f" ... and {len(impact_data['upstream']) - 10} more") + + # Downstream dependencies + if impact_data["downstream"]: + lines.append(f"\n{'─' * 40}") + lines.append("DOWNSTREAM DEPENDENCIES (What this calls)") + lines.append(f"{'─' * 40}") + for dep in impact_data["downstream"][:10]: # Limit to first 10 + if dep["file"] != "external": + lines.append(f" • {dep['symbol']} ({dep['type']}) in {dep['file']}:{dep['line']}") + else: + lines.append(f" • {dep['symbol']} (external/built-in)") + if len(impact_data["downstream"]) > 10: + lines.append(f" ... and {len(impact_data['downstream']) - 10} more") + + # Risk assessment + lines.append(f"\n{'─' * 40}") + lines.append("RISK ASSESSMENT") + lines.append(f"{'─' * 40}") + + risk_level = "LOW" + if summary["total_impact"] > 20: + risk_level = "HIGH" + elif summary["total_impact"] > 10: + risk_level = "MEDIUM" + + lines.append(f"Change Risk Level: {risk_level}") + + if risk_level == "HIGH": + lines.append("⚠ WARNING: This change has a large blast radius!") + lines.append(" Consider:") + lines.append(" - Breaking the change into smaller, incremental steps") + lines.append(" - Adding comprehensive tests before refactoring") + lines.append(" - Reviewing all upstream dependencies for compatibility") + elif risk_level == "MEDIUM": + lines.append("⚠ CAUTION: This change affects multiple components") + lines.append(" Ensure all callers are updated if the interface changes") + + lines.append("=" * 60) + + return "\n".join(lines) \ No newline at end of file diff --git a/theauditor/indexer/__init__.py b/theauditor/indexer/__init__.py new file mode 100644 index 0000000..7c16f96 --- /dev/null +++ b/theauditor/indexer/__init__.py @@ -0,0 +1,393 @@ +"""TheAuditor Indexer Package. + +This package provides modular, extensible code indexing functionality. +It includes: +- FileWalker for directory traversal with monorepo support +- DatabaseManager for SQLite operations +- Pluggable language extractors +- AST caching for performance +""" + +import os +import sys +import json +import logging +from pathlib import Path +from typing import Dict, Any, List, Optional, Tuple + +from theauditor.config_runtime import load_runtime_config +from theauditor.ast_parser import ASTParser + +from .config import ( + DEFAULT_BATCH_SIZE, JS_BATCH_SIZE, + SUPPORTED_AST_EXTENSIONS, SQL_EXTENSIONS, + DOCKERFILE_PATTERNS +) +from .core import FileWalker, ASTCache +from .database import DatabaseManager +from .extractors import ExtractorRegistry +from .extractors.docker import DockerExtractor +from .extractors.generic import GenericExtractor + +logger = logging.getLogger(__name__) + + +class IndexerOrchestrator: + """Orchestrates the indexing process, coordinating all components.""" + + def __init__(self, root_path: Path, db_path: str, + batch_size: int = DEFAULT_BATCH_SIZE, + follow_symlinks: bool = False, + exclude_patterns: Optional[List[str]] = None): + """Initialize the indexer orchestrator. 
+ + Args: + root_path: Project root path + db_path: Path to SQLite database + batch_size: Batch size for database operations + follow_symlinks: Whether to follow symbolic links + exclude_patterns: Patterns to exclude from indexing + """ + self.root_path = root_path + self.config = load_runtime_config(str(root_path)) + + # Initialize components + self.ast_parser = ASTParser() + self.ast_cache = ASTCache(root_path) + self.db_manager = DatabaseManager(db_path, batch_size) + self.file_walker = FileWalker( + root_path, self.config, follow_symlinks, exclude_patterns + ) + self.extractor_registry = ExtractorRegistry(root_path, self.ast_parser) + + # Special extractors that don't follow standard extension mapping + self.docker_extractor = DockerExtractor(root_path, self.ast_parser) + self.generic_extractor = GenericExtractor(root_path, self.ast_parser) + + # Stats tracking + self.counts = { + "files": 0, + "refs": 0, + "routes": 0, + "sql": 0, + "sql_queries": 0, + "symbols": 0, + "docker": 0, + "orm": 0 + } + + def index(self) -> Tuple[Dict[str, int], Dict[str, Any]]: + """Run the complete indexing process. + + Returns: + Tuple of (counts, stats) dictionaries + """ + # Walk directory and collect files + files, stats = self.file_walker.walk() + + if not files: + print("[Indexer] No files found to index.") + return self.counts, stats + + print(f"[Indexer] Processing {len(files)} files...") + + # Separate JS/TS files for batch processing + js_ts_files = [] + js_ts_cache = {} + + for file_info in files: + if file_info['ext'] in ['.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs']: + file_path = self.root_path / file_info['path'] + js_ts_files.append(file_path) + + # Batch process JS/TS files if there are any + if js_ts_files: + print(f"[Indexer] Batch processing {len(js_ts_files)} JavaScript/TypeScript files...") + try: + # Process in batches for memory efficiency + for i in range(0, len(js_ts_files), JS_BATCH_SIZE): + batch = js_ts_files[i:i+JS_BATCH_SIZE] + batch_trees = self.ast_parser.parse_files_batch( + batch, root_path=str(self.root_path) + ) + + # Cache the results + for file_path in batch: + file_str = str(file_path).replace("\\", "/") # Normalize + if file_str in batch_trees: + js_ts_cache[file_str] = batch_trees[file_str] + + print(f"[Indexer] Successfully batch processed {len(js_ts_cache)} JS/TS files") + except Exception as e: + print(f"[Indexer] Batch processing failed, falling back to individual processing: {e}") + js_ts_cache = {} + + # Process all files + for idx, file_info in enumerate(files): + # Debug progress + if os.environ.get("THEAUDITOR_DEBUG") and idx % 50 == 0: + print(f"[INDEXER_DEBUG] Processing file {idx+1}/{len(files)}: {file_info['path']}", + file=sys.stderr) + + # Process the file + self._process_file(file_info, js_ts_cache) + + # Execute batch inserts periodically + if (idx + 1) % self.db_manager.batch_size == 0 or idx == len(files) - 1: + self.db_manager.flush_batch() + + # Final commit + self.db_manager.commit() + + # Report results with database location + print(f"[Indexer] Indexed {self.counts['files']} files, " + f"{self.counts['symbols']} symbols, {self.counts['refs']} imports, " + f"{self.counts['routes']} routes") + print(f"[Indexer] Database updated: {self.db_manager.db_path}") + + return self.counts, stats + + def _process_file(self, file_info: Dict[str, Any], js_ts_cache: Dict[str, Any]): + """Process a single file. 
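A minimal sketch of driving the orchestrator end to end; the .pf/repo_index.db location and the explicit schema call are assumptions for illustration rather than the pipeline's canonical wiring:

from pathlib import Path

from theauditor.indexer import IndexerOrchestrator

root = Path(".").resolve()
(root / ".pf").mkdir(exist_ok=True)                  # assumed output directory

orchestrator = IndexerOrchestrator(
    root_path=root,
    db_path=str(root / ".pf" / "repo_index.db"),     # assumed database location
    batch_size=200,
    exclude_patterns=["*.md"],
)
orchestrator.db_manager.create_schema()              # ensure tables exist before indexing
counts, stats = orchestrator.index()
print(counts["files"], counts["symbols"], stats["skipped_dirs"])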
+ + Args: + file_info: File metadata + js_ts_cache: Cache of pre-parsed JS/TS ASTs + """ + # Insert file record + self.db_manager.add_file( + file_info['path'], file_info['sha256'], file_info['ext'], + file_info['bytes'], file_info['loc'] + ) + self.counts['files'] += 1 + + # Read file content (cap at 256KB) + file_path = self.root_path / file_info['path'] + try: + with open(file_path, encoding="utf-8", errors="ignore") as f: + content = f.read(256 * 1024) + except Exception as e: + if os.environ.get("THEAUDITOR_DEBUG"): + print(f"Debug: Cannot read {file_path}: {e}") + return + + # Store configuration files for ModuleResolver + if file_info['path'].endswith('tsconfig.json'): + # Determine context from path + context_dir = None + if 'backend/' in file_info['path']: + context_dir = 'backend' + elif 'frontend/' in file_info['path']: + context_dir = 'frontend' + + self.db_manager.add_config_file( + file_info['path'], + content, + 'tsconfig', + context_dir + ) + if os.environ.get("THEAUDITOR_DEBUG"): + print(f"[DEBUG] Cached tsconfig: {file_info['path']} (context: {context_dir})") + + # Get or parse AST + tree = self._get_or_parse_ast(file_info, file_path, js_ts_cache) + + # Select appropriate extractor + extractor = self._select_extractor(file_info['path'], file_info['ext']) + if not extractor: + return # No extractor for this file type + + # Extract all information + try: + extracted = extractor.extract(file_info, content, tree) + except Exception as e: + if os.environ.get("THEAUDITOR_DEBUG"): + print(f"Debug: Extraction failed for {file_path}: {e}") + return + + # Store extracted data in database + self._store_extracted_data(file_info['path'], extracted) + + def _get_or_parse_ast(self, file_info: Dict[str, Any], + file_path: Path, js_ts_cache: Dict[str, Any]) -> Optional[Dict]: + """Get AST from cache or parse the file. + + Args: + file_info: File metadata + file_path: Path to the file + js_ts_cache: Cache of pre-parsed JS/TS ASTs + + Returns: + Parsed AST tree or None + """ + if file_info['ext'] not in SUPPORTED_AST_EXTENSIONS: + return None + + # Check JS/TS batch cache + file_str = str(file_path).replace("\\", "/") + if file_str in js_ts_cache: + return js_ts_cache[file_str] + + # Check persistent AST cache + cached_tree = self.ast_cache.get(file_info['sha256']) + if cached_tree: + return cached_tree + + # Parse the file + tree = self.ast_parser.parse_file(file_path, root_path=str(self.root_path)) + + # Cache the result if it's JSON-serializable + if tree and isinstance(tree, dict): + self.ast_cache.set(file_info['sha256'], tree) + + return tree + + def _select_extractor(self, file_path: str, file_ext: str): + """Select the appropriate extractor for a file. + + Args: + file_path: Path to the file + file_ext: File extension + + Returns: + Appropriate extractor instance or None + """ + # Check special extractors first (by filename pattern) + if self.docker_extractor.should_extract(file_path): + return self.docker_extractor + if self.generic_extractor.should_extract(file_path): + return self.generic_extractor + + # Use registry for standard extension-based extraction + return self.extractor_registry.get_extractor(file_ext) + + def _store_extracted_data(self, file_path: str, extracted: Dict[str, Any]): + """Store extracted data in the database. 
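Based on how this method unpacks its input, the `extracted` dictionary produced by an extractor looks roughly like the following; every concrete value here is hypothetical:

extracted = {
    "imports": [("import", "express"), ("require", "./db")],       # (kind, value) tuples
    "resolved_imports": {"./db": "src/db.ts"},                     # raw specifier -> resolved path
    "routes": [("GET", "/api/users", ["requireAuth"])],            # (method, pattern, controls)
    "sql_objects": [("table", "users")],                           # (kind, name)
    "sql_queries": [{"line": 42, "query_text": "SELECT ...", "command": "SELECT", "tables": ["users"]}],
    "symbols": [{"name": "getUsers", "type": "function", "line": 10, "col": 0}],
    "orm_queries": [{"line": 12, "query_type": "findAll", "includes": None,
                     "has_limit": False, "has_transaction": False}],
    "docker_info": None,        # populated only by the Docker extractor
    "docker_issues": [],
    "assignments": [],          # data-flow rows consumed by taint analysis
    "function_calls": [],
    "returns": [],
}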
+ + Args: + file_path: Path to the source file + extracted: Dictionary of extracted data + """ + # Store imports/references + if 'imports' in extracted: + for kind, value in extracted['imports']: + # Check for resolved import + resolved = extracted.get('resolved_imports', {}).get(value, value) + self.db_manager.add_ref(file_path, kind, resolved) + self.counts['refs'] += 1 + + # Store routes + if 'routes' in extracted: + for method, pattern, controls in extracted['routes']: + self.db_manager.add_endpoint(file_path, method, pattern, controls) + self.counts['routes'] += 1 + + # Store SQL objects + if 'sql_objects' in extracted: + for kind, name in extracted['sql_objects']: + self.db_manager.add_sql_object(file_path, kind, name) + self.counts['sql'] += 1 + + # Store SQL queries + if 'sql_queries' in extracted: + for query in extracted['sql_queries']: + self.db_manager.add_sql_query( + file_path, query['line'], query['query_text'], + query['command'], query['tables'] + ) + self.counts['sql_queries'] += 1 + + # Store symbols + if 'symbols' in extracted: + for symbol in extracted['symbols']: + self.db_manager.add_symbol( + file_path, symbol['name'], symbol['type'], + symbol['line'], symbol['col'] + ) + self.counts['symbols'] += 1 + + # Store ORM queries + if 'orm_queries' in extracted: + for query in extracted['orm_queries']: + self.db_manager.add_orm_query( + file_path, query['line'], query['query_type'], + query.get('includes'), query.get('has_limit', False), + query.get('has_transaction', False) + ) + self.counts['orm'] += 1 + + # Store Docker information + if 'docker_info' in extracted and extracted['docker_info']: + info = extracted['docker_info'] + self.db_manager.add_docker_image( + file_path, info.get('base_image'), info.get('exposed_ports', []), + info.get('env_vars', {}), info.get('build_args', {}), + info.get('user'), info.get('has_healthcheck', False) + ) + self.counts['docker'] += 1 + + # Store Docker security issues + if 'docker_issues' in extracted: + for issue in extracted['docker_issues']: + self.db_manager.add_docker_issue( + file_path, issue['line'], issue['issue_type'], issue['severity'] + ) + + # Store data flow information for taint analysis + if 'assignments' in extracted: + if extracted['assignments']: + logger.info(f"[DEBUG] Found {len(extracted['assignments'])} assignments in {file_path}") + # Log first assignment for debugging + if extracted['assignments']: + first = extracted['assignments'][0] + logger.info(f"[DEBUG] First assignment: line {first.get('line')}, {first.get('target_var')} = {first.get('source_expr', '')[:50]}") + for assignment in extracted['assignments']: + self.db_manager.add_assignment( + file_path, assignment['line'], assignment['target_var'], + assignment['source_expr'], assignment['source_vars'], + assignment['in_function'] + ) + + if 'function_calls' in extracted: + for call in extracted['function_calls']: + self.db_manager.add_function_call_arg( + file_path, call['line'], call['caller_function'], + call['callee_function'], call['argument_index'], + call['argument_expr'], call['param_name'] + ) + + if 'returns' in extracted: + for ret in extracted['returns']: + self.db_manager.add_function_return( + file_path, ret['line'], ret['function_name'], + ret['return_expr'], ret['return_vars'] + ) + + +# Import backward compatibility functions from the compat module +from ..indexer_compat import ( + build_index, + walk_directory, + populate_database, + extract_imports, + extract_routes, + extract_sql_objects, + extract_sql_queries +) + +# Backward 
compatibility exports +__all__ = [ + 'IndexerOrchestrator', + 'FileWalker', + 'DatabaseManager', + 'ASTCache', + 'ExtractorRegistry', + # Backward compat functions + 'build_index', + 'walk_directory', + 'populate_database', + 'extract_imports', + 'extract_routes', + 'extract_sql_objects', + 'extract_sql_queries' +] \ No newline at end of file diff --git a/theauditor/indexer/config.py b/theauditor/indexer/config.py new file mode 100644 index 0000000..bb30a72 --- /dev/null +++ b/theauditor/indexer/config.py @@ -0,0 +1,165 @@ +"""Centralized configuration for the indexer. + +All constants, patterns, and configuration values used across the indexer +package are defined here. +""" + +import re + +# Directories to skip (always ignored) +SKIP_DIRS = { + ".git", + ".hg", + ".svn", + "node_modules", + "dist", + "build", + "out", + ".venv", + ".auditor_venv", # TheAuditor's isolated virtual environment + ".venv_wsl", # WSL virtual environments + "venv", + "__pycache__", + ".pytest_cache", + ".mypy_cache", + ".ruff_cache", + "target", # Rust + ".next", # Next.js + ".nuxt", # Nuxt + "coverage", + ".coverage", + "htmlcov", + ".tox", + ".egg-info", + "__pycache__", + "*.egg-info", + ".pf", # TheAuditor's own output directory (contains all artifacts now) + ".claude", # Claude integration directory +} + +# Compiled regex patterns for extraction +IMPORT_PATTERNS = [ + # JavaScript/TypeScript + re.compile(r"import\s+.*?\s+from\s+['\"]([^'\"]+)['\"]"), + re.compile(r"import\s*\(['\"]([^'\"]+)['\"]\)"), + re.compile(r"require\s*\(['\"]([^'\"]+)['\"]\)"), + # Python + re.compile(r"from\s+([^\s]+)\s+import"), + re.compile(r"import\s+([^\s,]+)"), + # Go + re.compile(r'import\s+"([^"]+)"'), + re.compile(r"import\s+\(\s*[\"']([^\"']+)[\"']"), + # Java + re.compile(r"import\s+([^\s;]+);"), + re.compile(r"package\s+([^\s;]+);"), + # Ruby + re.compile(r"require\s+['\"]([^'\"]+)['\"]"), + re.compile(r"require_relative\s+['\"]([^'\"]+)['\"]"), +] + +ROUTE_PATTERNS = [ + # Express/Fastify style + re.compile(r"(?:app|router)\.(get|post|put|patch|delete|all)\s*\(['\"`]([^'\"`]+)['\"`]"), + # Decorator style (Python Flask, Java Spring, etc) + re.compile(r"@(Get|Post|Put|Patch|Delete|RequestMapping)\s*\(['\"`]([^'\"`]+)['\"`]\)"), + re.compile(r"@(GET|POST|PUT|PATCH|DELETE)\s*\(['\"`]([^'\"`]+)['\"`]\)"), +] + +SQL_PATTERNS = [ + re.compile(r"CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?(\w+)", re.IGNORECASE), + re.compile(r"CREATE\s+INDEX\s+(?:IF\s+NOT\s+EXISTS\s+)?(\w+)", re.IGNORECASE), + re.compile(r"CREATE\s+VIEW\s+(?:IF\s+NOT\s+EXISTS\s+)?(\w+)", re.IGNORECASE), + re.compile(r"CREATE\s+(?:OR\s+REPLACE\s+)?FUNCTION\s+(\w+)", re.IGNORECASE), + re.compile(r"CREATE\s+POLICY\s+(\w+)", re.IGNORECASE), + re.compile(r"CONSTRAINT\s+(\w+)", re.IGNORECASE), +] + +# Patterns to find SQL query strings in code +SQL_QUERY_PATTERNS = [ + # Multi-line SQL strings (Python, JS, etc.) 
+ re.compile(r'"""([^"]*(?:SELECT|INSERT|UPDATE|DELETE|MERGE|WITH)[^"]*)"""', re.IGNORECASE | re.DOTALL), + re.compile(r"'''([^']*(?:SELECT|INSERT|UPDATE|DELETE|MERGE|WITH)[^']*)'''", re.IGNORECASE | re.DOTALL), + re.compile(r'`([^`]*(?:SELECT|INSERT|UPDATE|DELETE|MERGE|WITH)[^`]*)`', re.IGNORECASE | re.DOTALL), + # Single-line SQL strings + re.compile(r'"([^"]*(?:SELECT|INSERT|UPDATE|DELETE|MERGE|WITH)[^"]*)"', re.IGNORECASE), + re.compile(r"'([^']*(?:SELECT|INSERT|UPDATE|DELETE|MERGE|WITH)[^']*)'", re.IGNORECASE), + # Common ORM/query builder patterns + re.compile(r'\.query\s*\(\s*["\']([^"\']+)["\']', re.IGNORECASE), + re.compile(r'\.execute\s*\(\s*["\']([^"\']+)["\']', re.IGNORECASE), + re.compile(r'\.raw\s*\(\s*["\']([^"\']+)["\']', re.IGNORECASE), +] + +# Default batch size for database operations +DEFAULT_BATCH_SIZE = 200 +MAX_BATCH_SIZE = 1000 + +# File processing batch size for JavaScript/TypeScript +JS_BATCH_SIZE = 20 + +# Standard monorepo structures to check +STANDARD_MONOREPO_PATHS = [ + ("backend", "src"), # backend/src + ("frontend", "src"), # frontend/src + ("mobile", "src"), # mobile/src + ("server", "src"), # server/src + ("client", "src"), # client/src + ("web", "src"), # web/src + ("api", "src"), # api/src + ("packages", None), # packages/* (for lerna/yarn workspaces) + ("apps", None), # apps/* (for nx/turborepo) +] + +# Common root-level entry files in monorepos +MONOREPO_ENTRY_FILES = ["app.ts", "app.js", "index.ts", "index.js", "server.ts", "server.js"] + +# File extensions supported for AST parsing +SUPPORTED_AST_EXTENSIONS = [".py", ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs"] + +# SQL file extensions +SQL_EXTENSIONS = [".sql", ".psql", ".ddl"] + +# Dockerfile name patterns +DOCKERFILE_PATTERNS = ['dockerfile', 'dockerfile.dev', 'dockerfile.prod', 'dockerfile.test'] + +# Docker Compose file patterns +COMPOSE_PATTERNS = [ + 'docker-compose.yml', 'docker-compose.yaml', + 'docker-compose.override.yml', 'docker-compose.override.yaml', + 'compose.yml', 'compose.yaml' +] + +# Nginx config file patterns +NGINX_PATTERNS = ['nginx.conf', 'default.conf', 'site.conf'] + +# Sensitive ports for Docker security analysis +SENSITIVE_PORTS = ['22', '23', '135', '139', '445', '3389'] # SSH, Telnet, SMB, RDP + +# Sensitive keywords for Docker ENV security analysis +SENSITIVE_ENV_KEYWORDS = ['SECRET', 'TOKEN', 'PASSWORD', 'API_KEY', 'PRIVATE_KEY', 'ACCESS_KEY'] + +# ORM method patterns to detect +SEQUELIZE_METHODS = { + 'findAll', 'findOne', 'findByPk', 'findOrCreate', + 'create', 'update', 'destroy', 'bulkCreate', 'bulkUpdate', + 'count', 'max', 'min', 'sum', 'findAndCountAll' +} + +PRISMA_METHODS = { + 'findMany', 'findFirst', 'findUnique', 'findUniqueOrThrow', + 'create', 'createMany', 'update', 'updateMany', 'upsert', + 'delete', 'deleteMany', 'count', 'aggregate', 'groupBy' +} + +TYPEORM_REPOSITORY_METHODS = { + 'find', 'findOne', 'findOneBy', 'findOneOrFail', 'findBy', + 'findAndCount', 'findAndCountBy', 'save', 'remove', 'delete', + 'update', 'insert', 'create', 'merge', 'preload', 'count', + 'increment', 'decrement', 'restore', 'softRemove' +} + +TYPEORM_QB_METHODS = { + 'createQueryBuilder', 'select', 'addSelect', 'where', 'andWhere', + 'orWhere', 'having', 'orderBy', 'groupBy', 'limit', 'take', + 'skip', 'offset', 'getMany', 'getOne', 'getRawMany', 'getRawOne', + 'getManyAndCount', 'getCount', 'execute', 'delete', 'update', 'insert' +} \ No newline at end of file diff --git a/theauditor/indexer/core.py b/theauditor/indexer/core.py new file mode 100644 index 0000000..954455c 
--- /dev/null +++ b/theauditor/indexer/core.py @@ -0,0 +1,409 @@ +"""Core functionality for file system operations and AST caching. + +This module contains the FileWalker class for directory traversal with monorepo +detection, and the ASTCache class for persistent AST caching. +""" + +import os +import json +import sqlite3 +import fnmatch +from pathlib import Path +from typing import Tuple, List, Dict, Any, Optional, Set + +from theauditor.utils import compute_file_hash, count_lines_in_file +from theauditor.config_runtime import load_runtime_config +from .config import ( + SKIP_DIRS, STANDARD_MONOREPO_PATHS, MONOREPO_ENTRY_FILES +) + + +class ASTCache: + """Manages persistent AST caching for improved performance.""" + + def __init__(self, root_path: Path): + """Initialize the AST cache. + + Args: + root_path: Project root path for cache directory + """ + self.cache_dir = root_path / ".pf" / "ast_cache" + self.cache_dir.mkdir(parents=True, exist_ok=True) + + def get(self, file_hash: str) -> Optional[Dict]: + """Get cached AST for a file by its hash. + + Args: + file_hash: SHA256 hash of the file content + + Returns: + Cached AST tree or None if not found + """ + cache_file = self.cache_dir / f"{file_hash}.json" + if cache_file.exists(): + try: + with open(cache_file, 'r', encoding='utf-8') as f: + return json.load(f) + except (json.JSONDecodeError, OSError): + # Cache corrupted, return None + return None + return None + + def set(self, file_hash: str, tree: Dict) -> None: + """Store an AST tree in the cache. + + Args: + file_hash: SHA256 hash of the file content + tree: AST tree to cache (must be JSON serializable) + """ + cache_file = self.cache_dir / f"{file_hash}.json" + try: + # Only cache if tree is JSON serializable (dict), not a Tree object + if isinstance(tree, dict): + with open(cache_file, 'w', encoding='utf-8') as f: + json.dump(tree, f) + except (OSError, PermissionError, TypeError): + # Cache write failed, non-critical + pass + + def invalidate(self, file_hash: str) -> None: + """Invalidate cache entry for a specific file. + + Args: + file_hash: SHA256 hash of the file content + """ + cache_file = self.cache_dir / f"{file_hash}.json" + if cache_file.exists(): + try: + cache_file.unlink() + except (OSError, PermissionError): + pass + + +def is_text_file(file_path: Path) -> bool: + """Check if file is text (not binary). + + Args: + file_path: Path to the file to check + + Returns: + True if file is text, False if binary + """ + try: + with open(file_path, "rb") as f: + chunk = f.read(8192) + if b"\0" in chunk: + return False + # Try to decode as UTF-8 + try: + chunk.decode("utf-8") + return True + except UnicodeDecodeError: + return False + except (FileNotFoundError, PermissionError, UnicodeDecodeError): + return False + + +def get_first_lines(file_path: Path, n: int = 2) -> List[str]: + """Get first n lines of a text file. + + Args: + file_path: Path to the file + n: Number of lines to read + + Returns: + List of first n lines from the file + """ + lines = [] + try: + with open(file_path, encoding="utf-8", errors="ignore") as f: + for i, line in enumerate(f): + if i >= n: + break + # Strip \r and truncate at 200 chars + line = line.replace("\r", "").rstrip("\n")[:200] + lines.append(line) + except (FileNotFoundError, PermissionError, UnicodeDecodeError): + # Gracefully skip unreadable files + pass + return lines + + +def load_gitignore_patterns(root_path: Path) -> Set[str]: + """Load patterns from .gitignore if it exists. 
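The cache above is keyed by content hash alone, so an unchanged file keeps its cached AST across runs and renames. A rough sketch of that lookup pattern; the file name, the hashing shortcut, and the stand-in tree are illustrative assumptions:

import hashlib
from pathlib import Path

from theauditor.indexer.core import ASTCache

cache = ASTCache(Path("."))                      # creates .pf/ast_cache if needed

source = Path("app.py")                          # hypothetical file
digest = hashlib.sha256(source.read_bytes()).hexdigest()

tree = cache.get(digest)
if tree is None:
    tree = {"type": "module", "children": []}    # stand-in for a real JSON-serializable AST
    cache.set(digest, tree)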
+ + Args: + root_path: Project root path + + Returns: + Set of directory patterns to ignore + """ + gitignore_path = root_path / ".gitignore" + patterns = set() + + if gitignore_path.exists(): + try: + with open(gitignore_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + # Skip comments and empty lines + if line and not line.startswith('#'): + # Convert gitignore patterns to simple dir names + # This is a simplified approach - just extract directory names + pattern = line.rstrip('/') + if '/' not in pattern and '*' not in pattern: + patterns.add(pattern) + except Exception: + pass # Ignore errors reading .gitignore + + return patterns + + +class FileWalker: + """Handles directory walking with monorepo detection and filtering.""" + + def __init__(self, root_path: Path, config: Dict[str, Any], + follow_symlinks: bool = False, exclude_patterns: Optional[List[str]] = None): + """Initialize the file walker. + + Args: + root_path: Root directory to walk + config: Runtime configuration + follow_symlinks: Whether to follow symbolic links + exclude_patterns: Additional patterns to exclude + """ + self.root_path = root_path + self.config = config + self.follow_symlinks = follow_symlinks + self.exclude_patterns = exclude_patterns or [] + + # Load gitignore patterns and combine with default skip dirs + gitignore_patterns = load_gitignore_patterns(root_path) + self.skip_dirs = SKIP_DIRS | gitignore_patterns + + # Stats tracking + self.stats = { + "total_files": 0, + "text_files": 0, + "binary_files": 0, + "large_files": 0, + "skipped_dirs": 0, + } + + def detect_monorepo(self) -> Tuple[bool, List[Path], List[Path]]: + """Detect if project is a monorepo and return source directories. + + Returns: + Tuple of (is_monorepo, src_directories, root_entry_files) + """ + monorepo_dirs = [] + monorepo_detected = False + + # Check which monorepo directories exist + for base_dir, src_dir in STANDARD_MONOREPO_PATHS: + base_path = self.root_path / base_dir + if base_path.exists() and base_path.is_dir(): + if src_dir: + # Check if src subdirectory exists + src_path = base_path / src_dir + if src_path.exists() and src_path.is_dir(): + monorepo_dirs.append(src_path) + monorepo_detected = True + else: + # For packages/apps directories, add all subdirectories with src folders + for subdir in base_path.iterdir(): + if subdir.is_dir() and not subdir.name.startswith('.'): + src_path = subdir / "src" + if src_path.exists() and src_path.is_dir(): + monorepo_dirs.append(src_path) + monorepo_detected = True + + # Check for root-level entry files in monorepo + root_entry_files = [] + if monorepo_detected: + for entry_file in MONOREPO_ENTRY_FILES: + entry_path = self.root_path / entry_file + if entry_path.exists() and entry_path.is_file(): + root_entry_files.append(entry_path) + + return monorepo_detected, monorepo_dirs, root_entry_files + + def process_file(self, file: Path, exclude_file_patterns: List[str]) -> Optional[Dict[str, Any]]: + """Process a single file and return its info. 
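The .gitignore handling above is deliberately shallow: comments, glob patterns, and nested paths are ignored, and only bare directory names are merged into the skip set. A self-contained mirror of that filtering, with hypothetical input:

def simplified_gitignore_dirs(lines):
    """Mirror of the filtering above: keep only bare directory names."""
    dirs = set()
    for line in lines:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        pattern = line.rstrip("/")
        if "/" not in pattern and "*" not in pattern:
            dirs.add(pattern)
    return dirs

print(simplified_gitignore_dirs(["build/", "logs", "*.log", "docs/generated", "# comment"]))
# {'build', 'logs'} (order may vary)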
+ + Args: + file: Path to the file to process + exclude_file_patterns: Patterns for files to exclude + + Returns: + File info dictionary or None if file should be skipped + """ + # Check if file matches any exclude pattern + if exclude_file_patterns: + filename = file.name + relative_path = file.relative_to(self.root_path).as_posix() + for pattern in exclude_file_patterns: + # Check both the filename and the full relative path + if fnmatch.fnmatch(filename, pattern) or fnmatch.fnmatch(relative_path, pattern): + return None + + # Skip symlinks if not following + try: + if not self.follow_symlinks and file.is_symlink(): + return None + except (OSError, PermissionError): + # On Windows, is_symlink() can fail on certain paths + return None + + try: + file_size = file.stat().st_size + + # Skip large files + if file_size >= self.config["limits"]["max_file_size"]: + self.stats["large_files"] += 1 + return None + + # Check if text file + if not is_text_file(file): + self.stats["binary_files"] += 1 + return None + + self.stats["text_files"] += 1 + + # Compute metadata + relative_path = file.relative_to(self.root_path) + posix_path = relative_path.as_posix() + + file_info = { + "path": posix_path, + "sha256": compute_file_hash(file), + "ext": file.suffix, + "bytes": file_size, + "loc": count_lines_in_file(file), + "first_lines": get_first_lines(file), + } + + return file_info + + except (FileNotFoundError, PermissionError, UnicodeDecodeError, sqlite3.Error, OSError): + # Skip files we can't read + return None + + def walk(self) -> Tuple[List[Dict], Dict[str, Any]]: + """Walk directory and collect file information. + + Returns: + Tuple of (files_list, statistics) + """ + files = [] + + # Separate file and directory patterns from exclude_patterns + exclude_file_patterns = [] + if self.exclude_patterns: + for pattern in self.exclude_patterns: + # Directory patterns + if pattern.endswith('/**'): + # Pattern like "theauditor/**" means skip the directory + self.skip_dirs.add(pattern.rstrip('/**')) + elif pattern.endswith('/'): + self.skip_dirs.add(pattern.rstrip('/')) + elif '/' in pattern and '*' not in pattern: + # Add the first directory component + self.skip_dirs.add(pattern.split('/')[0]) + else: + # File pattern (e.g., "*.md", "pyproject.toml") + exclude_file_patterns.append(pattern) + + # Detect if this is a monorepo + monorepo_detected, monorepo_dirs, root_entry_files = self.detect_monorepo() + + if monorepo_detected: + print(f"[Indexer] Monorepo detected. 
Using whitelist for {len(monorepo_dirs)} src directories") + + # Process whitelisted directories only + for src_dir in monorepo_dirs: + for dirpath, dirnames, filenames in os.walk(src_dir, followlinks=self.follow_symlinks): + # Still apply skip_dirs within the whitelisted paths + skipped_count = len([d for d in dirnames if d in self.skip_dirs]) + self.stats["skipped_dirs"] += skipped_count + dirnames[:] = [d for d in dirnames if d not in self.skip_dirs] + + # Process files in this directory + for filename in filenames: + self.stats["total_files"] += 1 + file = Path(dirpath) / filename + + file_info = self.process_file(file, exclude_file_patterns) + if file_info: + files.append(file_info) + + # CRITICAL: Also collect config files from monorepo directories + # These are outside src/ but essential for module resolution + config_patterns = ['tsconfig.json', 'tsconfig.*.json', 'package.json', + 'webpack.config.js', 'vite.config.ts', '.babelrc*'] + + for base_dir, _ in STANDARD_MONOREPO_PATHS: + base_path = self.root_path / base_dir + if base_path.exists() and base_path.is_dir(): + # Look for config files in the base directory (not just src) + for pattern in config_patterns: + for config_file in base_path.glob(pattern): + if config_file.is_file(): + self.stats["total_files"] += 1 + file_info = self.process_file(config_file, []) + if file_info: + files.append(file_info) + + # Also check root directory for configs + for pattern in config_patterns: + for config_file in self.root_path.glob(pattern): + if config_file.is_file() and config_file not in [f for f in files]: + self.stats["total_files"] += 1 + file_info = self.process_file(config_file, []) + if file_info: + files.append(file_info) + + # Also process root-level entry files + for entry_file in root_entry_files: + self.stats["total_files"] += 1 + file_info = self.process_file(entry_file, []) + if file_info: + files.append(file_info) + + else: + # Not a monorepo, use traditional approach + print("[Indexer] Standard project structure detected. Using traditional scanning.") + + for dirpath, dirnames, filenames in os.walk(self.root_path, followlinks=self.follow_symlinks): + # Count directories that will be skipped + skipped_count = len([d for d in dirnames if d in self.skip_dirs]) + self.stats["skipped_dirs"] += skipped_count + + # Skip ignored directories + dirnames[:] = [d for d in dirnames if d not in self.skip_dirs] + + # On Windows, skip problematic symlink directories in venv + current_path = Path(dirpath) + try: + if not os.access(dirpath, os.R_OK): + continue + # Skip known problematic symlinks in virtual environments + if any(part in [".venv", "venv", "virtualenv"] for part in current_path.parts): + if current_path.name in ["lib64", "bin64", "include64"]: + dirnames.clear() + continue + except (OSError, PermissionError): + continue + + for filename in filenames: + self.stats["total_files"] += 1 + file = Path(dirpath) / filename + + file_info = self.process_file(file, exclude_file_patterns) + if file_info: + files.append(file_info) + + # Sort by path for deterministic output + files.sort(key=lambda x: x["path"]) + + return files, self.stats \ No newline at end of file diff --git a/theauditor/indexer/database.py b/theauditor/indexer/database.py new file mode 100644 index 0000000..e9b5d59 --- /dev/null +++ b/theauditor/indexer/database.py @@ -0,0 +1,607 @@ +"""Database operations for the indexer. 
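A minimal sketch of the add, flush, commit cycle this module implements; the in-memory database and the record values are purely illustrative:

from theauditor.indexer.database import DatabaseManager

db = DatabaseManager(":memory:", batch_size=200)
db.create_schema()

db.add_file("src/app.py", "deadbeef" * 8, ".py", 1024, 40)   # hypothetical record
db.add_symbol("src/app.py", "main", "function", 1, 0)

db.flush_batch()   # runs executemany() over every pending batch list
db.commit()
db.close()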
+ +This module contains the DatabaseManager class which handles all database +operations including schema creation, batch inserts, and transaction management. +""" + +import sqlite3 +import json +from typing import Any, List, Dict, Optional +from pathlib import Path + +from .config import DEFAULT_BATCH_SIZE, MAX_BATCH_SIZE + + +class DatabaseManager: + """Manages database operations with batching and transactions.""" + + def __init__(self, db_path: str, batch_size: int = DEFAULT_BATCH_SIZE): + """Initialize the database manager. + + Args: + db_path: Path to the SQLite database file + batch_size: Size of batches for insert operations + """ + self.db_path = db_path + self.conn = sqlite3.connect(db_path) + + # Validate and set batch size + if batch_size <= 0: + self.batch_size = DEFAULT_BATCH_SIZE + elif batch_size > MAX_BATCH_SIZE: + self.batch_size = MAX_BATCH_SIZE + else: + self.batch_size = batch_size + + # Initialize batch lists + self.files_batch = [] + self.refs_batch = [] + self.endpoints_batch = [] + self.sql_objects_batch = [] + self.sql_queries_batch = [] + self.symbols_batch = [] + self.orm_queries_batch = [] + self.docker_images_batch = [] + self.docker_issues_batch = [] + self.assignments_batch = [] + self.function_call_args_batch = [] + self.function_returns_batch = [] + self.prisma_batch = [] + self.compose_batch = [] + self.nginx_batch = [] + + def begin_transaction(self): + """Start a new transaction.""" + self.conn.execute("BEGIN IMMEDIATE") + + def commit(self): + """Commit the current transaction.""" + try: + self.conn.commit() + except sqlite3.Error as e: + self.conn.rollback() + raise RuntimeError(f"Failed to commit database changes: {e}") + + def rollback(self): + """Rollback the current transaction.""" + self.conn.rollback() + + def close(self): + """Close the database connection.""" + self.conn.close() + + def create_schema(self): + """Create all database tables and indexes.""" + cursor = self.conn.cursor() + + # Create tables + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS files( + path TEXT PRIMARY KEY, + sha256 TEXT NOT NULL, + ext TEXT NOT NULL, + bytes INTEGER NOT NULL, + loc INTEGER NOT NULL + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS config_files( + path TEXT PRIMARY KEY, + content TEXT NOT NULL, + type TEXT NOT NULL, + context_dir TEXT, + FOREIGN KEY(path) REFERENCES files(path) + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS refs( + src TEXT NOT NULL, + kind TEXT NOT NULL, + value TEXT NOT NULL, + FOREIGN KEY(src) REFERENCES files(path) + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS api_endpoints( + file TEXT NOT NULL, + method TEXT NOT NULL, + pattern TEXT NOT NULL, + controls TEXT, + FOREIGN KEY(file) REFERENCES files(path) + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS sql_objects( + file TEXT NOT NULL, + kind TEXT NOT NULL, + name TEXT NOT NULL, + FOREIGN KEY(file) REFERENCES files(path) + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS symbols( + path TEXT NOT NULL, + name TEXT NOT NULL, + type TEXT NOT NULL, + line INTEGER NOT NULL, + col INTEGER NOT NULL, + FOREIGN KEY(path) REFERENCES files(path) + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS sql_queries( + file_path TEXT NOT NULL, + line_number INTEGER NOT NULL, + query_text TEXT NOT NULL, + command TEXT NOT NULL, + tables TEXT, + FOREIGN KEY(file_path) REFERENCES files(path) + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS docker_images( + 
file_path TEXT PRIMARY KEY, + base_image TEXT, + exposed_ports TEXT, + env_vars TEXT, + build_args TEXT, + user TEXT, + has_healthcheck BOOLEAN DEFAULT 0, + FOREIGN KEY(file_path) REFERENCES files(path) + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS docker_issues( + file TEXT NOT NULL, + line INTEGER NOT NULL, + issue_type TEXT NOT NULL, + severity TEXT NOT NULL, + FOREIGN KEY(file) REFERENCES files(path) + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS orm_queries( + file TEXT NOT NULL, + line INTEGER NOT NULL, + query_type TEXT NOT NULL, + includes TEXT, + has_limit BOOLEAN DEFAULT 0, + has_transaction BOOLEAN DEFAULT 0, + FOREIGN KEY(file) REFERENCES files(path) + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS prisma_models( + model_name TEXT NOT NULL, + field_name TEXT NOT NULL, + field_type TEXT NOT NULL, + is_indexed BOOLEAN DEFAULT 0, + is_unique BOOLEAN DEFAULT 0, + is_relation BOOLEAN DEFAULT 0, + PRIMARY KEY (model_name, field_name) + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS compose_services( + file_path TEXT NOT NULL, + service_name TEXT NOT NULL, + image TEXT, + ports TEXT, + volumes TEXT, + environment TEXT, + is_privileged BOOLEAN DEFAULT 0, + network_mode TEXT, + PRIMARY KEY (file_path, service_name), + FOREIGN KEY(file_path) REFERENCES files(path) + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS nginx_configs( + file_path TEXT NOT NULL, + block_type TEXT NOT NULL, + block_context TEXT, + directives TEXT, + level INTEGER DEFAULT 0, + PRIMARY KEY (file_path, block_type, block_context), + FOREIGN KEY(file_path) REFERENCES files(path) + ) + """ + ) + + # Data flow analysis tables for taint tracking + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS assignments ( + file TEXT NOT NULL, + line INTEGER NOT NULL, + target_var TEXT NOT NULL, + source_expr TEXT NOT NULL, + source_vars TEXT, + in_function TEXT NOT NULL, + FOREIGN KEY(file) REFERENCES files(path) + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS function_call_args ( + file TEXT NOT NULL, + line INTEGER NOT NULL, + caller_function TEXT NOT NULL, + callee_function TEXT NOT NULL, + argument_index INTEGER NOT NULL, + argument_expr TEXT NOT NULL, + param_name TEXT NOT NULL, + FOREIGN KEY(file) REFERENCES files(path) + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS function_returns ( + file TEXT NOT NULL, + line INTEGER NOT NULL, + function_name TEXT NOT NULL, + return_expr TEXT NOT NULL, + return_vars TEXT, + FOREIGN KEY(file) REFERENCES files(path) + ) + """ + ) + + # Create indexes + cursor.execute("CREATE INDEX IF NOT EXISTS idx_refs_src ON refs(src)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_api_endpoints_file ON api_endpoints(file)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_sql_file ON sql_objects(file)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_path ON symbols(path)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(type)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_sql_queries_file ON sql_queries(file_path)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_sql_queries_command ON sql_queries(command)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_docker_images_base ON docker_images(base_image)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_docker_issues_file ON docker_issues(file)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_docker_issues_severity ON docker_issues(severity)") + 
cursor.execute("CREATE INDEX IF NOT EXISTS idx_orm_queries_file ON orm_queries(file)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_orm_queries_type ON orm_queries(query_type)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_prisma_models_indexed ON prisma_models(is_indexed)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_compose_services_file ON compose_services(file_path)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_compose_services_privileged ON compose_services(is_privileged)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_nginx_configs_file ON nginx_configs(file_path)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_nginx_configs_type ON nginx_configs(block_type)") + + # Indexes for data flow tables + cursor.execute("CREATE INDEX IF NOT EXISTS idx_assignments_file ON assignments(file)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_assignments_function ON assignments(in_function)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_function_call_args_file ON function_call_args(file)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_function_call_args_caller ON function_call_args(caller_function)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_function_call_args_callee ON function_call_args(callee_function)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_function_returns_file ON function_returns(file)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_function_returns_function ON function_returns(function_name)") + + self.conn.commit() + + def clear_tables(self): + """Clear all existing data from tables.""" + cursor = self.conn.cursor() + + try: + cursor.execute("DELETE FROM files") + cursor.execute("DELETE FROM refs") + cursor.execute("DELETE FROM api_endpoints") + cursor.execute("DELETE FROM sql_objects") + cursor.execute("DELETE FROM symbols") + cursor.execute("DELETE FROM sql_queries") + cursor.execute("DELETE FROM docker_images") + cursor.execute("DELETE FROM docker_issues") + cursor.execute("DELETE FROM orm_queries") + cursor.execute("DELETE FROM prisma_models") + cursor.execute("DELETE FROM compose_services") + cursor.execute("DELETE FROM nginx_configs") + cursor.execute("DELETE FROM assignments") + cursor.execute("DELETE FROM function_call_args") + cursor.execute("DELETE FROM function_returns") + except sqlite3.Error as e: + self.conn.rollback() + raise RuntimeError(f"Failed to clear existing data: {e}") + + def add_file(self, path: str, sha256: str, ext: str, bytes_size: int, loc: int): + """Add a file record to the batch.""" + self.files_batch.append((path, sha256, ext, bytes_size, loc)) + + def add_ref(self, src: str, kind: str, value: str): + """Add a reference record to the batch.""" + self.refs_batch.append((src, kind, value)) + + def add_endpoint(self, file_path: str, method: str, pattern: str, controls: List[str]): + """Add an API endpoint record to the batch.""" + controls_json = json.dumps(controls) if controls else "[]" + self.endpoints_batch.append((file_path, method, pattern, controls_json)) + + def add_sql_object(self, file_path: str, kind: str, name: str): + """Add a SQL object record to the batch.""" + self.sql_objects_batch.append((file_path, kind, name)) + + def add_sql_query(self, file_path: str, line: int, query_text: str, command: str, tables: List[str]): + """Add a SQL query record to the batch.""" + tables_json = json.dumps(tables) if tables else "[]" + self.sql_queries_batch.append((file_path, line, query_text, command, tables_json)) + + def add_symbol(self, path: str, name: str, symbol_type: str, line: int, col: int): + 
"""Add a symbol record to the batch.""" + self.symbols_batch.append((path, name, symbol_type, line, col)) + + def add_orm_query(self, file_path: str, line: int, query_type: str, includes: Optional[str], + has_limit: bool, has_transaction: bool): + """Add an ORM query record to the batch.""" + self.orm_queries_batch.append((file_path, line, query_type, includes, has_limit, has_transaction)) + + def add_docker_image(self, file_path: str, base_image: Optional[str], exposed_ports: List[str], + env_vars: Dict, build_args: Dict, user: Optional[str], has_healthcheck: bool): + """Add a Docker image record to the batch.""" + ports_json = json.dumps(exposed_ports) + env_json = json.dumps(env_vars) + args_json = json.dumps(build_args) + self.docker_images_batch.append((file_path, base_image, ports_json, env_json, + args_json, user, has_healthcheck)) + + def add_docker_issue(self, file_path: str, line: int, issue_type: str, severity: str): + """Add a Docker security issue to the batch.""" + self.docker_issues_batch.append((file_path, line, issue_type, severity)) + + def add_assignment(self, file_path: str, line: int, target_var: str, source_expr: str, + source_vars: List[str], in_function: str): + """Add a variable assignment record to the batch.""" + source_vars_json = json.dumps(source_vars) + self.assignments_batch.append((file_path, line, target_var, source_expr, + source_vars_json, in_function)) + + def add_function_call_arg(self, file_path: str, line: int, caller_function: str, + callee_function: str, arg_index: int, arg_expr: str, param_name: str): + """Add a function call argument record to the batch.""" + self.function_call_args_batch.append((file_path, line, caller_function, callee_function, + arg_index, arg_expr, param_name)) + + def add_function_return(self, file_path: str, line: int, function_name: str, + return_expr: str, return_vars: List[str]): + """Add a function return statement record to the batch.""" + return_vars_json = json.dumps(return_vars) + self.function_returns_batch.append((file_path, line, function_name, + return_expr, return_vars_json)) + + def add_config_file(self, path: str, content: str, file_type: str, context: Optional[str] = None): + """Add a configuration file content to the batch.""" + if not hasattr(self, 'config_files_batch'): + self.config_files_batch = [] + self.config_files_batch.append((path, content, file_type, context)) + + def add_prisma_model(self, model_name: str, field_name: str, field_type: str, + is_indexed: bool, is_unique: bool, is_relation: bool): + """Add a Prisma model field record to the batch.""" + self.prisma_batch.append((model_name, field_name, field_type, + is_indexed, is_unique, is_relation)) + + def add_compose_service(self, file_path: str, service_name: str, image: Optional[str], + ports: List[str], volumes: List[str], environment: Dict, + is_privileged: bool, network_mode: str): + """Add a Docker Compose service record to the batch.""" + ports_json = json.dumps(ports) + volumes_json = json.dumps(volumes) + env_json = json.dumps(environment) + self.compose_batch.append((file_path, service_name, image, ports_json, + volumes_json, env_json, is_privileged, network_mode)) + + def add_nginx_config(self, file_path: str, block_type: str, block_context: str, + directives: Dict, level: int): + """Add an Nginx configuration block to the batch.""" + directives_json = json.dumps(directives) + # Use a default context if empty to avoid primary key issues + block_context = block_context or 'default' + + # Check for duplicates before adding + batch_key 
= (file_path, block_type, block_context) + if not any(b[:3] == batch_key for b in self.nginx_batch): + self.nginx_batch.append((file_path, block_type, block_context, + directives_json, level)) + + def flush_batch(self, batch_idx: Optional[int] = None): + """Execute all pending batch inserts.""" + cursor = self.conn.cursor() + + try: + if self.files_batch: + cursor.executemany( + "INSERT INTO files (path, sha256, ext, bytes, loc) VALUES (?, ?, ?, ?, ?)", + self.files_batch + ) + self.files_batch = [] + + if self.refs_batch: + cursor.executemany( + "INSERT INTO refs (src, kind, value) VALUES (?, ?, ?)", + self.refs_batch + ) + self.refs_batch = [] + + if self.endpoints_batch: + cursor.executemany( + "INSERT INTO api_endpoints (file, method, pattern, controls) VALUES (?, ?, ?, ?)", + self.endpoints_batch + ) + self.endpoints_batch = [] + + if self.sql_objects_batch: + cursor.executemany( + "INSERT INTO sql_objects (file, kind, name) VALUES (?, ?, ?)", + self.sql_objects_batch + ) + self.sql_objects_batch = [] + + if self.sql_queries_batch: + cursor.executemany( + "INSERT INTO sql_queries (file_path, line_number, query_text, command, tables) VALUES (?, ?, ?, ?, ?)", + self.sql_queries_batch + ) + self.sql_queries_batch = [] + + if self.symbols_batch: + cursor.executemany( + "INSERT INTO symbols (path, name, type, line, col) VALUES (?, ?, ?, ?, ?)", + self.symbols_batch + ) + self.symbols_batch = [] + + if self.orm_queries_batch: + cursor.executemany( + "INSERT INTO orm_queries (file, line, query_type, includes, has_limit, has_transaction) VALUES (?, ?, ?, ?, ?, ?)", + self.orm_queries_batch + ) + self.orm_queries_batch = [] + + if self.docker_images_batch: + cursor.executemany( + "INSERT INTO docker_images (file_path, base_image, exposed_ports, env_vars, build_args, user, has_healthcheck) VALUES (?, ?, ?, ?, ?, ?, ?)", + self.docker_images_batch + ) + self.docker_images_batch = [] + + if self.docker_issues_batch: + cursor.executemany( + "INSERT INTO docker_issues (file, line, issue_type, severity) VALUES (?, ?, ?, ?)", + self.docker_issues_batch + ) + self.docker_issues_batch = [] + + if self.assignments_batch: + cursor.executemany( + "INSERT INTO assignments (file, line, target_var, source_expr, source_vars, in_function) VALUES (?, ?, ?, ?, ?, ?)", + self.assignments_batch + ) + self.assignments_batch = [] + + if self.function_call_args_batch: + cursor.executemany( + "INSERT INTO function_call_args (file, line, caller_function, callee_function, argument_index, argument_expr, param_name) VALUES (?, ?, ?, ?, ?, ?, ?)", + self.function_call_args_batch + ) + self.function_call_args_batch = [] + + if self.function_returns_batch: + cursor.executemany( + "INSERT INTO function_returns (file, line, function_name, return_expr, return_vars) VALUES (?, ?, ?, ?, ?)", + self.function_returns_batch + ) + self.function_returns_batch = [] + + if self.prisma_batch: + cursor.executemany( + """INSERT INTO prisma_models + (model_name, field_name, field_type, is_indexed, is_unique, is_relation) + VALUES (?, ?, ?, ?, ?, ?)""", + self.prisma_batch + ) + self.prisma_batch = [] + + if self.compose_batch: + cursor.executemany( + """INSERT INTO compose_services + (file_path, service_name, image, ports, volumes, environment, + is_privileged, network_mode) + VALUES (?, ?, ?, ?, ?, ?, ?, ?)""", + self.compose_batch + ) + self.compose_batch = [] + + if self.nginx_batch: + cursor.executemany( + """INSERT INTO nginx_configs + (file_path, block_type, block_context, directives, level) + VALUES (?, ?, ?, ?, ?)""", + 
self.nginx_batch + ) + self.nginx_batch = [] + + if hasattr(self, 'config_files_batch') and self.config_files_batch: + cursor.executemany( + "INSERT OR REPLACE INTO config_files (path, content, type, context_dir) VALUES (?, ?, ?, ?)", + self.config_files_batch + ) + self.config_files_batch = [] + + except sqlite3.Error as e: + if batch_idx is not None: + raise RuntimeError(f"Batch insert failed at file index {batch_idx}: {e}") + else: + raise RuntimeError(f"Batch insert failed: {e}") + + +# Standalone function for backward compatibility +def create_database_schema(conn: sqlite3.Connection) -> None: + """Create SQLite database schema - backward compatibility wrapper. + + Args: + conn: SQLite connection (remains open after schema creation) + """ + # Use the existing connection to create schema + manager = DatabaseManager.__new__(DatabaseManager) + manager.conn = conn + manager.cursor = conn.cursor() + manager.batch_size = 200 + + # Initialize batch lists + manager.files_batch = [] + manager.refs_batch = [] + manager.endpoints_batch = [] + manager.sql_objects_batch = [] + manager.sql_queries_batch = [] + manager.symbols_batch = [] + manager.orm_queries_batch = [] + manager.docker_images_batch = [] + manager.docker_issues_batch = [] + manager.assignments_batch = [] + manager.function_calls_batch = [] + manager.returns_batch = [] + manager.prisma_batch = [] + manager.compose_batch = [] + manager.nginx_batch = [] + + # Create the schema using the existing connection + manager.create_schema() + # Don't close - let caller handle connection lifecycle \ No newline at end of file diff --git a/theauditor/indexer/extractors/__init__.py b/theauditor/indexer/extractors/__init__.py new file mode 100644 index 0000000..77d4368 --- /dev/null +++ b/theauditor/indexer/extractors/__init__.py @@ -0,0 +1,287 @@ +"""Extractor framework for the indexer. + +This module defines the BaseExtractor abstract class and the ExtractorRegistry +for dynamic discovery and registration of language-specific extractors. +""" + +import os +import re +import importlib +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Dict, Any, List, Optional, Tuple + +from ..config import ( + IMPORT_PATTERNS, ROUTE_PATTERNS, SQL_PATTERNS, SQL_QUERY_PATTERNS +) + +# Optional SQL parsing support +try: + import sqlparse + HAS_SQLPARSE = True +except ImportError: + HAS_SQLPARSE = False + + +class BaseExtractor(ABC): + """Abstract base class for all language extractors.""" + + def __init__(self, root_path: Path, ast_parser: Optional[Any] = None): + """Initialize the extractor. + + Args: + root_path: Project root path + ast_parser: Optional AST parser instance + """ + self.root_path = root_path + self.ast_parser = ast_parser + + @abstractmethod + def supported_extensions(self) -> List[str]: + """Return list of file extensions this extractor supports. + + Returns: + List of file extensions (e.g., ['.py', '.pyx']) + """ + pass + + @abstractmethod + def extract(self, file_info: Dict[str, Any], content: str, + tree: Optional[Any] = None) -> Dict[str, Any]: + """Extract all relevant information from a file. + + Args: + file_info: File metadata dictionary + content: File content + tree: Optional pre-parsed AST tree + + Returns: + Dictionary containing all extracted data + """ + pass + + def extract_imports(self, content: str, file_ext: str) -> List[Tuple[str, str]]: + """Extract import statements from file content. 
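Because the ExtractorRegistry defined later in this module discovers extractors by scanning the package for BaseExtractor subclasses, supporting a new language is a matter of dropping in one module. A hypothetical minimal extractor; the module name, extension, and returned data are illustrative and not part of the patch:

# theauditor/indexer/extractors/ruby.py  (hypothetical module)
from typing import Any, Dict, List, Optional

from . import BaseExtractor


class RubyExtractor(BaseExtractor):
    """Toy extractor that reuses the shared regex helpers from BaseExtractor."""

    def supported_extensions(self) -> List[str]:
        return [".rb"]

    def extract(self, file_info: Dict[str, Any], content: str,
                tree: Optional[Any] = None) -> Dict[str, Any]:
        return {
            "imports": self.extract_imports(content, file_info["ext"]),
            "routes": [],    # route extraction omitted in this sketch
            "symbols": [],
        }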
+ + Args: + content: File content + file_ext: File extension + + Returns: + List of (kind, value) tuples for imports + """ + imports = [] + for pattern in IMPORT_PATTERNS: + for match in pattern.finditer(content): + value = match.group(1) if match.lastindex else match.group(0) + # Determine kind based on pattern + if "require" in pattern.pattern: + kind = "require" + elif "from" in pattern.pattern and "import" in pattern.pattern: + kind = "from" + elif "package" in pattern.pattern: + kind = "package" + else: + kind = "import" + imports.append((kind, value)) + return imports + + def extract_routes(self, content: str) -> List[Tuple[str, str]]: + """Extract route definitions from file content. + + Args: + content: File content + + Returns: + List of (method, path) tuples + """ + routes = [] + for pattern in ROUTE_PATTERNS: + for match in pattern.finditer(content): + if match.lastindex == 2: + method = match.group(1).upper() + path = match.group(2) + else: + method = "ANY" + path = match.group(1) if match.lastindex else match.group(0) + routes.append((method, path)) + return routes + + def extract_sql_objects(self, content: str) -> List[Tuple[str, str]]: + """Extract SQL object definitions from file content. + + Args: + content: File content + + Returns: + List of (kind, name) tuples + """ + objects = [] + for pattern in SQL_PATTERNS: + for match in pattern.finditer(content): + name = match.group(1) + # Determine kind from pattern + pattern_text = pattern.pattern.lower() + if "table" in pattern_text: + kind = "table" + elif "index" in pattern_text: + kind = "index" + elif "view" in pattern_text: + kind = "view" + elif "function" in pattern_text: + kind = "function" + elif "policy" in pattern_text: + kind = "policy" + elif "constraint" in pattern_text: + kind = "constraint" + else: + kind = "unknown" + objects.append((kind, name)) + return objects + + def extract_sql_queries(self, content: str) -> List[Dict]: + """Extract and parse SQL queries from code. + + Args: + content: File content + + Returns: + List of query dictionaries + """ + if not HAS_SQLPARSE: + return [] + + queries = [] + + # Find all potential SQL query strings + for pattern in SQL_QUERY_PATTERNS: + for match in pattern.finditer(content): + query_text = match.group(1) if match.lastindex else match.group(0) + + # Calculate line number + line = content[:match.start()].count('\n') + 1 + + # Clean up the query text + query_text = query_text.strip() + if not query_text: + continue + + try: + # Parse the SQL query + parsed = sqlparse.parse(query_text) + if not parsed: + continue + + for statement in parsed: + # Extract command type + command = statement.get_type() + if not command: + # Try to extract manually from first token + tokens = statement.tokens + for token in tokens: + if not token.is_whitespace: + command = str(token).upper() + break + + # Extract table names + tables = [] + tokens = list(statement.flatten()) + for i, token in enumerate(tokens): + if token.ttype is None and token.value.upper() in ['FROM', 'INTO', 'UPDATE', 'TABLE', 'JOIN']: + # Look for the next non-whitespace token + for j in range(i + 1, len(tokens)): + next_token = tokens[j] + if not next_token.is_whitespace: + if next_token.ttype in [None, sqlparse.tokens.Name]: + table_name = next_token.value + # Clean up table name + table_name = table_name.strip('"\'`') + if '.' 
in table_name: + table_name = table_name.split('.')[-1] + if table_name and not table_name.upper() in ['SELECT', 'WHERE', 'SET', 'VALUES']: + tables.append(table_name) + break + + queries.append({ + 'line': line, + 'query_text': query_text[:1000], # Limit length + 'command': command or 'UNKNOWN', + 'tables': tables + }) + except Exception: + # Skip queries that can't be parsed + continue + + return queries + + +class ExtractorRegistry: + """Registry for dynamic discovery and management of extractors.""" + + def __init__(self, root_path: Path, ast_parser: Optional[Any] = None): + """Initialize the registry and discover extractors. + + Args: + root_path: Project root path + ast_parser: Optional AST parser instance + """ + self.root_path = root_path + self.ast_parser = ast_parser + self.extractors = {} + self._discover() + + def _discover(self): + """Auto-discover and register all extractor modules.""" + extractor_dir = Path(__file__).parent + + # Find all Python files in the extractors directory + for file_path in extractor_dir.glob("*.py"): + if file_path.name.startswith('_'): + continue # Skip __init__.py and private modules + + module_name = file_path.stem + + try: + # Import the module + module = importlib.import_module(f'.{module_name}', package='theauditor.indexer.extractors') + + # Find extractor class (looking for subclasses of BaseExtractor) + for attr_name in dir(module): + attr = getattr(module, attr_name) + if (isinstance(attr, type) and + issubclass(attr, BaseExtractor) and + attr != BaseExtractor): + + # Instantiate the extractor + extractor = attr(self.root_path, self.ast_parser) + + # Register for all supported extensions + for ext in extractor.supported_extensions(): + self.extractors[ext] = extractor + + break # One extractor per module + + except (ImportError, AttributeError) as e: + # Skip modules that can't be imported or don't have extractors + if os.environ.get("THEAUDITOR_DEBUG"): + print(f"Debug: Failed to load extractor {module_name}: {e}") + continue + + def get_extractor(self, file_extension: str) -> Optional[BaseExtractor]: + """Get the appropriate extractor for a file extension. + + Args: + file_extension: File extension (e.g., '.py') + + Returns: + Extractor instance or None if not supported + """ + return self.extractors.get(file_extension) + + def supported_extensions(self) -> List[str]: + """Get list of all supported file extensions. + + Returns: + List of supported extensions + """ + return list(self.extractors.keys()) \ No newline at end of file diff --git a/theauditor/indexer/extractors/docker.py b/theauditor/indexer/extractors/docker.py new file mode 100644 index 0000000..01a8188 --- /dev/null +++ b/theauditor/indexer/extractors/docker.py @@ -0,0 +1,279 @@ +"""Docker file extractor. + +Handles extraction of Docker-specific elements including: +- Base images and build stages +- Environment variables and build arguments +- Security issues (running as root, unpinned images, etc.) +""" + +import json +from pathlib import Path +from typing import Dict, Any, List, Optional + +from . 
import BaseExtractor +from ..config import SENSITIVE_PORTS, SENSITIVE_ENV_KEYWORDS + +# Check for optional Docker parsing libraries +try: + from dockerfile_parse import DockerfileParser as DFParser + HAS_DOCKERFILE_PARSE = True +except ImportError: + HAS_DOCKERFILE_PARSE = False + +try: + from theauditor.parsers.dockerfile_parser import DockerfileParser + HAS_CUSTOM_PARSERS = True +except ImportError: + HAS_CUSTOM_PARSERS = False + + +class DockerExtractor(BaseExtractor): + """Extractor for Docker files.""" + + def supported_extensions(self) -> List[str]: + """Return list of file extensions this extractor supports. + + Note: Dockerfiles don't have extensions, we match by filename. + """ + return [] # We handle this specially in should_extract + + def should_extract(self, file_path: str) -> bool: + """Check if this extractor should handle the file. + + Args: + file_path: Path to the file + + Returns: + True if this is a Dockerfile + """ + file_name_lower = Path(file_path).name.lower() + dockerfile_patterns = [ + 'dockerfile', 'dockerfile.dev', 'dockerfile.prod', + 'dockerfile.test', 'dockerfile.staging' + ] + return (file_name_lower in dockerfile_patterns or + file_name_lower.startswith('dockerfile.')) + + def extract(self, file_info: Dict[str, Any], content: str, + tree: Optional[Any] = None) -> Dict[str, Any]: + """Extract all relevant information from a Dockerfile. + + Args: + file_info: File metadata dictionary + content: File content + tree: Optional pre-parsed AST tree (not used for Docker) + + Returns: + Dictionary containing all extracted data + """ + result = { + 'docker_info': {}, + 'docker_issues': [] + } + + # Extract basic Docker info if dockerfile_parse available + if HAS_DOCKERFILE_PARSE: + result['docker_info'] = self._extract_docker_info(content) + + # Analyze for security issues if custom parser available + if HAS_CUSTOM_PARSERS: + file_path = self.root_path / file_info['path'] + result['docker_issues'] = self._analyze_security(file_path, content) + + return result + + def _extract_docker_info(self, content: str) -> Dict[str, Any]: + """Extract structured information from Dockerfile content. 
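+
+        For a minimal Dockerfile such as "FROM python:3.11" followed by
+        "EXPOSE 8000", the result would typically look like
+        {'base_image': 'python:3.11', 'exposed_ports': ['8000'],
+        'env_vars': {}, 'build_args': {}, 'user': None,
+        'has_healthcheck': False} (illustrative values).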
+ + Args: + content: Dockerfile content + + Returns: + Dict with Docker information + """ + info = { + 'base_image': None, + 'exposed_ports': [], + 'env_vars': {}, + 'build_args': {}, + 'user': None, + 'has_healthcheck': False + } + + try: + # Parse the Dockerfile + parser = DFParser() + parser.content = content + + # Extract base image + if parser.baseimage: + info['base_image'] = parser.baseimage + + # Extract exposed ports + for instruction in parser.structure: + if instruction['instruction'] == 'EXPOSE': + # Parse ports from the value + ports_str = instruction['value'] + ports = ports_str.split() + info['exposed_ports'].extend(ports) + + # Extract environment variables + elif instruction['instruction'] == 'ENV': + # Parse ENV key=value or ENV key value + env_str = instruction['value'] + # Handle both formats: KEY=value and KEY value + if '=' in env_str: + # Format: KEY=value KEY2=value2 + parts = env_str.split() + for part in parts: + if '=' in part: + key, value = part.split('=', 1) + info['env_vars'][key] = value + else: + # Format: KEY value + parts = env_str.split(None, 1) + if len(parts) == 2: + info['env_vars'][parts[0]] = parts[1] + + # Extract build arguments + elif instruction['instruction'] == 'ARG': + arg_str = instruction['value'] + # Handle ARG key=value or ARG key + if '=' in arg_str: + key, value = arg_str.split('=', 1) + info['build_args'][key] = value + else: + info['build_args'][arg_str] = None + + # Check for USER and HEALTHCHECK + elif instruction['instruction'] == 'USER': + info['user'] = instruction['value'] + elif instruction['instruction'] == 'HEALTHCHECK': + info['has_healthcheck'] = True + elif instruction['instruction'] == 'WORKDIR': + info['env_vars']['_DOCKER_WORKDIR'] = instruction['value'] + + except Exception: + # If parsing fails, return empty info + return {} + + return info + + def _analyze_security(self, file_path: Path, content: str) -> List[Dict]: + """Analyze Dockerfile for security issues. 
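+
+        Each issue is a dict with 'line', 'issue_type' and 'severity' keys;
+        the issue types emitted below include ROOT_USER, UNPINNED_IMAGE,
+        SECRET_IN_ENV, MISSING_HEALTHCHECK, DANGEROUS_COPY,
+        APT_UPGRADE_IN_PROD and SENSITIVE_PORT_EXPOSED.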
+ + Args: + file_path: Path to the Dockerfile + content: Dockerfile content + + Returns: + List of security issue dictionaries + """ + issues = [] + + try: + parser = DockerfileParser() + parsed_data = parser.parse_file(file_path) + + if 'instructions' not in parsed_data: + return issues + + instructions = parsed_data['instructions'] + + # Security Rule 1: Check for running as root + has_user = False + runs_as_root = False + for inst in instructions: + if inst['instruction'] == 'USER': + has_user = True + if inst['value'].strip().lower() == 'root': + runs_as_root = True + issues.append({ + 'line': inst['line'], + 'issue_type': 'ROOT_USER', + 'severity': 'critical' + }) + + if not has_user: + # No USER instruction means runs as root by default + issues.append({ + 'line': 1, + 'issue_type': 'ROOT_USER', + 'severity': 'critical' + }) + + # Security Rule 2: Check for unpinned images + for inst in instructions: + if inst['instruction'] == 'FROM': + image = inst['value'].strip() + # Check for :latest or no tag + if ':latest' in image or (':' not in image and ' as ' not in image.lower()): + issues.append({ + 'line': inst['line'], + 'issue_type': 'UNPINNED_IMAGE', + 'severity': 'high' + }) + + # Security Rule 3: Check for secrets in ENV + for inst in instructions: + if inst['instruction'] == 'ENV': + env_value = inst['value'].upper() + for keyword in SENSITIVE_ENV_KEYWORDS: + if keyword in env_value: + issues.append({ + 'line': inst['line'], + 'issue_type': 'SECRET_IN_ENV', + 'severity': 'critical' + }) + break + + # Security Rule 4: Check for missing healthcheck + has_healthcheck = any(inst['instruction'] == 'HEALTHCHECK' for inst in instructions) + if not has_healthcheck: + issues.append({ + 'line': 1, + 'issue_type': 'MISSING_HEALTHCHECK', + 'severity': 'medium' + }) + + # Security Rule 5: Check for dangerous COPY commands + for inst in instructions: + if inst['instruction'] == 'COPY': + copy_value = inst['value'].strip() + # Check for copying entire directory including potential secrets + if copy_value.startswith('. ') or copy_value == '.' or '.env' in copy_value: + issues.append({ + 'line': inst['line'], + 'issue_type': 'DANGEROUS_COPY', + 'severity': 'high' + }) + + # Security Rule 6: Check for apt-get upgrade in production + for inst in instructions: + if inst['instruction'] == 'RUN': + run_value = inst['value'].lower() + if 'apt-get upgrade' in run_value or 'apt upgrade' in run_value: + issues.append({ + 'line': inst['line'], + 'issue_type': 'APT_UPGRADE_IN_PROD', + 'severity': 'medium' + }) + + # Security Rule 7: Check for exposed sensitive ports + for inst in instructions: + if inst['instruction'] == 'EXPOSE': + ports = inst['value'].split() + for port in ports: + port_num = port.split('/')[0] # Handle "8080/tcp" format + if port_num in SENSITIVE_PORTS: + issues.append({ + 'line': inst['line'], + 'issue_type': 'SENSITIVE_PORT_EXPOSED', + 'severity': 'high' + }) + + except Exception: + # Silently fail security analysis + pass + + return issues \ No newline at end of file diff --git a/theauditor/indexer/extractors/generic.py b/theauditor/indexer/extractors/generic.py new file mode 100644 index 0000000..f6b4faa --- /dev/null +++ b/theauditor/indexer/extractors/generic.py @@ -0,0 +1,121 @@ +"""Generic file extractor. + +Handles extraction for files that don't have specialized extractors: +- Webpack configurations +- Docker Compose files +- Nginx configurations +- Other configuration files +""" + +import json +from pathlib import Path +from typing import Dict, Any, List, Optional + +from . 
import BaseExtractor +from ..config import COMPOSE_PATTERNS, NGINX_PATTERNS + +# Check for optional custom parsers +try: + from theauditor.parsers.webpack_config_parser import WebpackConfigParser + from theauditor.parsers.compose_parser import ComposeParser + from theauditor.parsers.nginx_parser import NginxParser + HAS_CUSTOM_PARSERS = True +except ImportError: + HAS_CUSTOM_PARSERS = False + + +class GenericExtractor(BaseExtractor): + """Generic extractor for configuration and other files.""" + + def supported_extensions(self) -> List[str]: + """Return list of file extensions this extractor supports.""" + # This extractor handles files by name pattern, not extension + return [] + + def should_extract(self, file_path: str) -> bool: + """Check if this extractor should handle the file. + + Args: + file_path: Path to the file + + Returns: + True if this extractor should handle the file + """ + file_name = Path(file_path).name.lower() + + # Check for specific file patterns + if file_name.endswith('webpack.config.js'): + return True + if file_name in COMPOSE_PATTERNS: + return True + if file_name in NGINX_PATTERNS or file_name.endswith('.conf'): + return True + + return False + + def extract(self, file_info: Dict[str, Any], content: str, + tree: Optional[Any] = None) -> Dict[str, Any]: + """Extract information from generic configuration files. + + Args: + file_info: File metadata dictionary + content: File content + tree: Optional pre-parsed AST tree + + Returns: + Dictionary containing all extracted data + """ + result = { + 'config_data': {}, + 'imports': [], + 'routes': [], + 'sql_queries': [] + } + + file_path = self.root_path / file_info['path'] + file_name = file_path.name.lower() + + # Handle webpack configuration + if HAS_CUSTOM_PARSERS and file_name.endswith('webpack.config.js'): + try: + parser = WebpackConfigParser() + webpack_data = parser.parse_file(file_path) + if webpack_data and not webpack_data.get('error'): + result['config_data']['webpack'] = webpack_data + except Exception: + pass + + # Handle Docker Compose files + if HAS_CUSTOM_PARSERS and file_name in COMPOSE_PATTERNS: + try: + parser = ComposeParser() + compose_data = parser.parse_file(file_path) + if compose_data and not compose_data.get('error'): + result['config_data']['docker_compose'] = compose_data + except Exception: + pass + + # Handle Nginx configuration + if HAS_CUSTOM_PARSERS and (file_name in NGINX_PATTERNS or file_name.endswith('.conf')): + try: + parser = NginxParser() + nginx_data = parser.parse_file(file_path) + if nginx_data and not nginx_data.get('error'): + result['config_data']['nginx'] = nginx_data + # Extract routes from nginx location blocks + if 'locations' in nginx_data: + for location in nginx_data['locations']: + result['routes'].append(( + 'ANY', # Nginx handles all methods by default + location.get('path', '/'), + [] # No middleware concept in nginx + )) + except Exception: + pass + + # For all files, try to extract common patterns + result['imports'].extend(self.extract_imports(content, file_info['ext'])) + result['routes'].extend([(m, p, []) for m, p in self.extract_routes(content)]) + result['sql_queries'].extend(self.extract_sql_queries(content)) + + return result \ No newline at end of file diff --git a/theauditor/indexer/extractors/javascript.py b/theauditor/indexer/extractors/javascript.py new file mode 100644 index 0000000..b4023d5 --- /dev/null +++ b/theauditor/indexer/extractors/javascript.py @@ -0,0 +1,345 @@ +"""JavaScript/TypeScript file extractor. 
+ +Handles extraction of JavaScript and TypeScript specific elements including: +- ES6/CommonJS imports and requires +- Express/Fastify routes with middleware +- ORM queries (Sequelize, Prisma, TypeORM) +- Property accesses for taint analysis +""" + +import re +import json +from pathlib import Path +from typing import Dict, Any, List, Optional + +from . import BaseExtractor +from ..config import ( + SEQUELIZE_METHODS, PRISMA_METHODS, + TYPEORM_REPOSITORY_METHODS, TYPEORM_QB_METHODS +) + + +class JavaScriptExtractor(BaseExtractor): + """Extractor for JavaScript and TypeScript files.""" + + def supported_extensions(self) -> List[str]: + """Return list of file extensions this extractor supports.""" + return ['.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs'] + + def extract(self, file_info: Dict[str, Any], content: str, + tree: Optional[Any] = None) -> Dict[str, Any]: + """Extract all relevant information from a JavaScript/TypeScript file. + + Args: + file_info: File metadata dictionary + content: File content + tree: Optional pre-parsed AST tree + + Returns: + Dictionary containing all extracted data + """ + result = { + 'imports': [], + 'resolved_imports': {}, + 'routes': [], + 'symbols': [], + 'assignments': [], + 'function_calls': [], + 'returns': [], + 'orm_queries': [] + } + + # Extract imports using regex patterns + result['imports'] = self.extract_imports(content, file_info['ext']) + + # Resolve imports if we have js_semantic_parser + if tree and tree.get('success'): + try: + from theauditor.js_semantic_parser import JSSemanticParser + js_parser = JSSemanticParser(project_root=str(self.root_path)) + result['resolved_imports'] = js_parser.resolve_imports( + tree, file_info['path'] + ) + except Exception: + # Resolution failed, keep unresolved imports + pass + + # Extract routes + if tree: + result['routes'] = self._extract_routes_ast(tree, content) + else: + result['routes'] = [(method, path, []) + for method, path in self.extract_routes(content)] + + # Extract symbols from AST if available + if tree and self.ast_parser: + # Functions + functions = self.ast_parser.extract_functions(tree) + for func in functions: + line = func.get('line', 0) + # Validate line numbers are reasonable + if line < 1 or line > 100000: + continue # Skip invalid symbols + + result['symbols'].append({ + 'name': func.get('name', ''), + 'type': 'function', + 'line': line, + 'col': func.get('col', 0) + }) + + # Classes + classes = self.ast_parser.extract_classes(tree) + for cls in classes: + line = cls.get('line', 0) + # Validate line numbers are reasonable + if line < 1 or line > 100000: + continue # Skip invalid symbols + + result['symbols'].append({ + 'name': cls.get('name', ''), + 'type': 'class', + 'line': line, + 'col': cls.get('col', 0) + }) + + # Calls and other symbols + symbols = self.ast_parser.extract_calls(tree) + for symbol in symbols: + line = symbol.get('line', 0) + # Validate line numbers are reasonable + if line < 1 or line > 100000: + continue # Skip invalid symbols + + result['symbols'].append({ + 'name': symbol.get('name', ''), + 'type': symbol.get('type', 'call'), + 'line': line, + 'col': symbol.get('col', symbol.get('column', 0)) + }) + + # CRITICAL: Extract property accesses for taint analysis + # This is needed to find patterns like req.body, req.query, etc. 
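+            # For example, for source like "const id = req.query.id" the
+            # parser would typically report a property access named
+            # 'req.query', stored below as a 'property' symbol so the taint
+            # stage can match it against known source patterns (illustrative;
+            # the exact name format depends on the AST parser).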
+ properties = self.ast_parser.extract_properties(tree) + for prop in properties: + line = prop.get('line', 0) + # Validate line numbers are reasonable + if line < 1 or line > 100000: + continue # Skip invalid symbols + + result['symbols'].append({ + 'name': prop.get('name', ''), + 'type': 'property', + 'line': line, + 'col': prop.get('col', prop.get('column', 0)) + }) + + # Extract data flow information + assignments = self.ast_parser.extract_assignments(tree) + for assignment in assignments: + result['assignments'].append({ + 'line': assignment.get('line', 0), + 'target_var': assignment.get('target_var', ''), + 'source_expr': assignment.get('source_expr', ''), + 'source_vars': assignment.get('source_vars', []), + 'in_function': assignment.get('in_function', 'global') + }) + + # Extract function calls with arguments + calls_with_args = self.ast_parser.extract_function_calls_with_args(tree) + for call in calls_with_args: + result['function_calls'].append({ + 'line': call.get('line', 0), + 'caller_function': call.get('caller_function', 'global'), + 'callee_function': call.get('callee_function', ''), + 'argument_index': call.get('argument_index', 0), + 'argument_expr': call.get('argument_expr', ''), + 'param_name': call.get('param_name', '') + }) + + # Extract return statements + return_statements = self.ast_parser.extract_returns(tree) + for ret in return_statements: + result['returns'].append({ + 'line': ret.get('line', 0), + 'function_name': ret.get('function_name', 'global'), + 'return_expr': ret.get('return_expr', ''), + 'return_vars': ret.get('return_vars', []) + }) + + # Extract ORM queries + result['orm_queries'] = self._extract_orm_queries(tree, content) + + # Extract SQL queries embedded in JavaScript code + result['sql_queries'] = self.extract_sql_queries(content) + + return result + + def _extract_routes_ast(self, tree: Dict[str, Any], content: str) -> List[tuple]: + """Extract Express/Fastify routes with middleware. + + Args: + tree: Parsed AST tree + content: File content for fallback extraction + + Returns: + List of (method, pattern, controls) tuples + """ + routes = [] + + # Enhanced regex to capture middleware + # Pattern: router.METHOD('/path', [middleware1, middleware2,] handler) + pattern = re.compile( + r'(?:app|router)\.(get|post|put|patch|delete|all)\s*\(\s*[\'\"\`]([^\'\"\`]+)[\'\"\`]\s*,\s*([^)]+)\)', + re.MULTILINE | re.DOTALL + ) + + for match in pattern.finditer(content): + method = match.group(1).upper() + path = match.group(2) + middleware_str = match.group(3) + + # Extract middleware function names + middleware = [] + # Look for function names before the final handler + middleware_pattern = re.compile(r'(\w+)(?:\s*,|\s*\))') + for m in middleware_pattern.finditer(middleware_str): + name = m.group(1) + # Filter out common non-middleware terms + if name not in ['req', 'res', 'next', 'async', 'function', 'err']: + middleware.append(name) + + # Remove the last item as it's likely the handler, not middleware + if len(middleware) > 1: + middleware = middleware[:-1] + + routes.append((method, path, middleware)) + + # If no routes found with enhanced regex, fallback to basic extraction + if not routes: + routes = [(method, path, []) + for method, path in self.extract_routes(content)] + + return routes + + def _extract_orm_queries(self, tree: Dict[str, Any], content: str) -> List[Dict]: + """Extract ORM query calls from JavaScript/TypeScript code. 
+ + Args: + tree: AST tree from ast_parser + content: File content for line extraction + + Returns: + List of ORM query dictionaries + """ + queries = [] + + if not tree or not self.ast_parser: + return queries + + # Handle wrapped tree format + if not isinstance(tree, dict) or tree.get("type") != "tree_sitter": + return queries + + try: + # Extract all function calls from the tree + calls = self.ast_parser.extract_calls(tree) + lines = content.split('\n') + + # All ORM methods to check + all_orm_methods = ( + SEQUELIZE_METHODS | PRISMA_METHODS | + TYPEORM_REPOSITORY_METHODS | TYPEORM_QB_METHODS + ) + + # Process each call + for call in calls: + method_name = call.get('name', '') + + # Check for ORM method patterns + if '.' in method_name: + parts = method_name.split('.') + method = parts[-1] + + if method in all_orm_methods: + line_num = call.get('line', 0) + + # Determine ORM type and extract context + orm_type = self._determine_orm_type(method, parts) + + # Try to extract options from context + has_include = False + has_limit = False + has_transaction = False + includes_json = None + + if 0 < line_num <= len(lines): + # Get context for multi-line calls + start_line = max(0, line_num - 1) + end_line = min(len(lines), line_num + 10) + context = '\n'.join(lines[start_line:end_line]) + + # Check for includes/relations (eager loading) + if 'include:' in context or 'include :' in context or 'relations:' in context: + has_include = True + # Check for death query pattern in Sequelize + if 'all: true' in context and 'nested: true' in context: + includes_json = json.dumps({"all": True, "nested": True}) + else: + # Try to extract include/relations specification + include_match = re.search( + r'(?:include|relations):\s*(\[.*?\]|\{.*?\})', + context, re.DOTALL + ) + if include_match: + includes_json = json.dumps({"raw": include_match.group(1)[:200]}) + + # Check for limit/take + if 'limit:' in context or 'limit :' in context or 'take:' in context: + has_limit = True + + # Check for transaction + if 'transaction:' in context or '.$transaction' in context: + has_transaction = True + + # Format query type with model name for Prisma + if orm_type == 'prisma' and len(parts) >= 3: + query_type = f'{parts[-2]}.{method}' # model.method + else: + query_type = method + + queries.append({ + 'line': line_num, + 'query_type': query_type, + 'includes': includes_json, + 'has_limit': has_limit, + 'has_transaction': has_transaction + }) + + except Exception: + # Silently fail ORM extraction + pass + + return queries + + def _determine_orm_type(self, method: str, parts: List[str]) -> str: + """Determine which ORM is being used based on method and call pattern. + + Args: + method: The method name + parts: The split call parts (e.g., ['prisma', 'user', 'findMany']) + + Returns: + ORM type string: 'sequelize', 'prisma', 'typeorm', or 'unknown' + """ + if method in SEQUELIZE_METHODS: + return 'sequelize' + elif method in PRISMA_METHODS: + # Prisma typically uses prisma.modelName.method pattern + if len(parts) >= 3 and parts[-3] in ['prisma', 'db', 'client']: + return 'prisma' + elif method in TYPEORM_REPOSITORY_METHODS: + return 'typeorm_repository' + elif method in TYPEORM_QB_METHODS: + return 'typeorm_qb' + return 'unknown' \ No newline at end of file diff --git a/theauditor/indexer/extractors/python.py b/theauditor/indexer/extractors/python.py new file mode 100644 index 0000000..87739d0 --- /dev/null +++ b/theauditor/indexer/extractors/python.py @@ -0,0 +1,189 @@ +"""Python file extractor. 
+ +Handles extraction of Python-specific elements including: +- Python imports (import/from statements) +- Flask/FastAPI route decorators with middleware +- AST-based symbol extraction +""" + +import ast +import json +from pathlib import Path +from typing import Dict, Any, List, Optional + +from . import BaseExtractor + + +class PythonExtractor(BaseExtractor): + """Extractor for Python files.""" + + def supported_extensions(self) -> List[str]: + """Return list of file extensions this extractor supports.""" + return ['.py', '.pyx'] + + def extract(self, file_info: Dict[str, Any], content: str, + tree: Optional[Any] = None) -> Dict[str, Any]: + """Extract all relevant information from a Python file. + + Args: + file_info: File metadata dictionary + content: File content + tree: Optional pre-parsed AST tree + + Returns: + Dictionary containing all extracted data + """ + result = { + 'imports': [], + 'routes': [], + 'symbols': [], + 'assignments': [], + 'function_calls': [], + 'returns': [] + } + + # Extract imports using regex patterns (for all types) + result['imports'] = self.extract_imports(content, file_info['ext']) + + # If we have an AST tree, extract Python-specific information + if tree and isinstance(tree, dict): + # Extract routes with decorators using AST + result['routes'] = self._extract_routes_ast(tree, file_info['path']) + + # Extract symbols from AST parser results + if self.ast_parser: + # Functions + functions = self.ast_parser.extract_functions(tree) + for func in functions: + result['symbols'].append({ + 'name': func.get('name', ''), + 'type': 'function', + 'line': func.get('line', 0), + 'col': func.get('col', 0) + }) + + # Classes + classes = self.ast_parser.extract_classes(tree) + for cls in classes: + result['symbols'].append({ + 'name': cls.get('name', ''), + 'type': 'class', + 'line': cls.get('line', 0), + 'col': cls.get('col', 0) + }) + + # Calls and other symbols + symbols = self.ast_parser.extract_calls(tree) + for symbol in symbols: + result['symbols'].append({ + 'name': symbol.get('name', ''), + 'type': symbol.get('type', 'call'), + 'line': symbol.get('line', 0), + 'col': symbol.get('col', symbol.get('column', 0)) + }) + + # Extract data flow information for taint analysis + assignments = self.ast_parser.extract_assignments(tree) + for assignment in assignments: + result['assignments'].append({ + 'line': assignment.get('line', 0), + 'target_var': assignment.get('target_var', ''), + 'source_expr': assignment.get('source_expr', ''), + 'source_vars': assignment.get('source_vars', []), + 'in_function': assignment.get('in_function', 'global') + }) + + # Extract function calls with arguments + calls_with_args = self.ast_parser.extract_function_calls_with_args(tree) + for call in calls_with_args: + result['function_calls'].append({ + 'line': call.get('line', 0), + 'caller_function': call.get('caller_function', 'global'), + 'callee_function': call.get('callee_function', ''), + 'argument_index': call.get('argument_index', 0), + 'argument_expr': call.get('argument_expr', ''), + 'param_name': call.get('param_name', '') + }) + + # Extract return statements + return_statements = self.ast_parser.extract_returns(tree) + for ret in return_statements: + result['returns'].append({ + 'line': ret.get('line', 0), + 'function_name': ret.get('function_name', 'global'), + 'return_expr': ret.get('return_expr', ''), + 'return_vars': ret.get('return_vars', []) + }) + else: + # Fallback to regex extraction for routes if no AST + result['routes'] = [(method, path, []) + for method, path in 
self.extract_routes(content)] + + # Extract SQL queries embedded in Python code + result['sql_queries'] = self.extract_sql_queries(content) + + return result + + def _extract_routes_ast(self, tree: Dict[str, Any], file_path: str) -> List[tuple]: + """Extract Flask/FastAPI routes using Python AST. + + Args: + tree: Parsed AST tree + file_path: Path to file being analyzed + + Returns: + List of (method, pattern, controls) tuples + """ + routes = [] + + # Check if we have a Python AST tree + if not isinstance(tree.get("tree"), ast.Module): + return routes + + # Walk the AST to find decorated functions + for node in ast.walk(tree["tree"]): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + decorators = [] + route_info = None + + # Extract all decorator names + for decorator in node.decorator_list: + dec_name = None + if isinstance(decorator, ast.Name): + dec_name = decorator.id + elif isinstance(decorator, ast.Attribute): + dec_name = decorator.attr + elif isinstance(decorator, ast.Call): + if isinstance(decorator.func, ast.Attribute): + # Handle @app.route('/path') or @router.get('/path') + method_name = decorator.func.attr + if method_name in ['route', 'get', 'post', 'put', 'patch', 'delete']: + # Extract path from first argument + if decorator.args and isinstance(decorator.args[0], ast.Constant): + path = decorator.args[0].value + # Determine HTTP method + if method_name == 'route': + # Check for methods argument + method = 'GET' # Default + for keyword in decorator.keywords: + if keyword.arg == 'methods': + if isinstance(keyword.value, ast.List): + if keyword.value.elts: + if isinstance(keyword.value.elts[0], ast.Constant): + method = keyword.value.elts[0].value.upper() + else: + method = method_name.upper() + route_info = (method, path) + dec_name = method_name + elif isinstance(decorator.func, ast.Name): + dec_name = decorator.func.id + + # Collect non-route decorators as potential middleware/controls + if dec_name and dec_name not in ['route', 'get', 'post', 'put', 'patch', 'delete']: + decorators.append(dec_name) + + # If we found a route, add it with its security decorators + if route_info: + routes.append((route_info[0], route_info[1], decorators)) + + return routes \ No newline at end of file diff --git a/theauditor/indexer/extractors/sql.py b/theauditor/indexer/extractors/sql.py new file mode 100644 index 0000000..358d90a --- /dev/null +++ b/theauditor/indexer/extractors/sql.py @@ -0,0 +1,44 @@ +"""SQL file extractor. + +Handles extraction of SQL-specific elements including: +- SQL object definitions (tables, indexes, views, functions) +- SQL queries and their structure +""" + +from pathlib import Path +from typing import Dict, Any, List, Optional + +from . import BaseExtractor + + +class SQLExtractor(BaseExtractor): + """Extractor for SQL files.""" + + def supported_extensions(self) -> List[str]: + """Return list of file extensions this extractor supports.""" + return ['.sql', '.psql', '.ddl'] + + def extract(self, file_info: Dict[str, Any], content: str, + tree: Optional[Any] = None) -> Dict[str, Any]: + """Extract all relevant information from a SQL file. 
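+
+        For instance, a migration containing "CREATE TABLE users (...)" would
+        typically contribute ('table', 'users') to sql_objects and a parsed
+        entry for the statement to sql_queries (both depend on the configured
+        SQL patterns, and query parsing requires sqlparse to be installed).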
+ + Args: + file_info: File metadata dictionary + content: File content + tree: Optional pre-parsed AST tree (not used for SQL) + + Returns: + Dictionary containing all extracted data + """ + result = { + 'sql_objects': [], + 'sql_queries': [] + } + + # Extract SQL objects (CREATE statements) + result['sql_objects'] = self.extract_sql_objects(content) + + # Extract and parse SQL queries + result['sql_queries'] = self.extract_sql_queries(content) + + return result \ No newline at end of file diff --git a/theauditor/indexer_compat.py b/theauditor/indexer_compat.py new file mode 100644 index 0000000..288457f --- /dev/null +++ b/theauditor/indexer_compat.py @@ -0,0 +1,321 @@ +"""Repository indexer - Backward Compatibility Shim. + +This module provides backward compatibility for code that imports from indexer.py. +All functionality has been refactored into the theauditor.indexer package. + +IMPORTANT: New code should import from theauditor.indexer package directly: + from theauditor.indexer import IndexerOrchestrator +""" + +import json +import sqlite3 +import time +from pathlib import Path +from typing import Any, Dict, List, Optional + +# Import from the new package structure +from theauditor.indexer import IndexerOrchestrator +from theauditor.indexer.config import ( + SKIP_DIRS, IMPORT_PATTERNS, ROUTE_PATTERNS, SQL_PATTERNS, + SQL_QUERY_PATTERNS, DEFAULT_BATCH_SIZE +) +from theauditor.indexer.core import ( + FileWalker, is_text_file, get_first_lines, load_gitignore_patterns +) +from theauditor.indexer.database import create_database_schema +from theauditor.config_runtime import load_runtime_config + +# Re-export commonly used items for backward compatibility +__all__ = [ + 'build_index', + 'walk_directory', + 'populate_database', + 'create_database_schema', + 'SKIP_DIRS', + 'extract_imports', + 'extract_routes', + 'extract_sql_objects', + 'extract_sql_queries' +] + + +def extract_imports(content: str, file_ext: str) -> List[tuple]: + """Extract import statements - backward compatibility wrapper.""" + imports = [] + for pattern in IMPORT_PATTERNS: + for match in pattern.finditer(content): + value = match.group(1) if match.lastindex else match.group(0) + # Determine kind based on pattern + if "require" in pattern.pattern: + kind = "require" + elif "from" in pattern.pattern and "import" in pattern.pattern: + kind = "from" + elif "package" in pattern.pattern: + kind = "package" + else: + kind = "import" + imports.append((kind, value)) + return imports + + +def extract_routes(content: str) -> List[tuple]: + """Extract route definitions - backward compatibility wrapper.""" + routes = [] + for pattern in ROUTE_PATTERNS: + for match in pattern.finditer(content): + if match.lastindex == 2: + method = match.group(1).upper() + path = match.group(2) + else: + method = "ANY" + path = match.group(1) if match.lastindex else match.group(0) + routes.append((method, path)) + return routes + + +def extract_sql_objects(content: str) -> List[tuple]: + """Extract SQL object definitions - backward compatibility wrapper.""" + objects = [] + for pattern in SQL_PATTERNS: + for match in pattern.finditer(content): + name = match.group(1) + # Determine kind from pattern + pattern_text = pattern.pattern.lower() + if "table" in pattern_text: + kind = "table" + elif "index" in pattern_text: + kind = "index" + elif "view" in pattern_text: + kind = "view" + elif "function" in pattern_text: + kind = "function" + elif "policy" in pattern_text: + kind = "policy" + elif "constraint" in pattern_text: + kind = "constraint" + else: 
+ kind = "unknown" + objects.append((kind, name)) + return objects + + +def extract_sql_queries(content: str) -> List[dict]: + """Extract SQL queries - backward compatibility wrapper. + + Note: This requires sqlparse to be installed for full functionality. + """ + try: + import sqlparse + except ImportError: + return [] + + queries = [] + for pattern in SQL_QUERY_PATTERNS: + for match in pattern.finditer(content): + query_text = match.group(1) if match.lastindex else match.group(0) + + # Calculate line number + line = content[:match.start()].count('\n') + 1 + + # Clean up the query text + query_text = query_text.strip() + if not query_text: + continue + + try: + # Parse the SQL query + parsed = sqlparse.parse(query_text) + if not parsed: + continue + + for statement in parsed: + # Extract command type + command = statement.get_type() + if not command: + # Try to extract manually from first token + tokens = statement.tokens + for token in tokens: + if not token.is_whitespace: + command = str(token).upper() + break + + # Extract table names + tables = [] + tokens = list(statement.flatten()) + for i, token in enumerate(tokens): + if token.ttype is None and token.value.upper() in ['FROM', 'INTO', 'UPDATE', 'TABLE', 'JOIN']: + # Look for the next non-whitespace token + for j in range(i + 1, len(tokens)): + next_token = tokens[j] + if not next_token.is_whitespace: + if next_token.ttype in [None, sqlparse.tokens.Name]: + table_name = next_token.value + # Clean up table name + table_name = table_name.strip('"\'`') + if '.' in table_name: + table_name = table_name.split('.')[-1] + if table_name and not table_name.upper() in ['SELECT', 'WHERE', 'SET', 'VALUES']: + tables.append(table_name) + break + + queries.append({ + 'line': line, + 'query_text': query_text[:1000], # Limit length + 'command': command or 'UNKNOWN', + 'tables': tables + }) + except Exception: + # Skip queries that can't be parsed + continue + + return queries + + +def walk_directory( + root_path: Path, + follow_symlinks: bool = False, + exclude_patterns: Optional[List[str]] = None +) -> tuple[List[dict], Dict[str, Any]]: + """Walk directory and collect file information - backward compatibility wrapper. + + Args: + root_path: Root directory to walk + follow_symlinks: Whether to follow symbolic links + exclude_patterns: Additional patterns to exclude + + Returns: + Tuple of (files_list, statistics) + """ + config = load_runtime_config(str(root_path)) + walker = FileWalker(root_path, config, follow_symlinks, exclude_patterns) + return walker.walk() + + +def populate_database( + conn: sqlite3.Connection, + files: List[dict], + root_path: Path, + batch_size: int = DEFAULT_BATCH_SIZE +) -> Dict[str, int]: + """Populate SQLite database - backward compatibility wrapper. 
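+
+    Note: the passed connection is only used to discover the database path;
+    it is closed here and the IndexerOrchestrator opens its own connection to
+    the same file, so callers should not reuse conn afterwards.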
+ + Args: + conn: SQLite connection + files: List of file dictionaries + root_path: Project root path + batch_size: Batch size for database operations + + Returns: + Dictionary of extraction counts + """ + # Create orchestrator with the existing connection's path + db_path = conn.execute("PRAGMA database_list").fetchone()[2] + orchestrator = IndexerOrchestrator(root_path, db_path, batch_size) + + # Close the passed connection as orchestrator creates its own + conn.close() + + # Run the indexing + counts, _ = orchestrator.index() + return counts + + +def build_index( + root_path: str = ".", + manifest_path: str = "manifest.json", + db_path: str = "repo_index.db", + print_stats: bool = False, + dry_run: bool = False, + follow_symlinks: bool = False, + exclude_patterns: Optional[List[str]] = None, +) -> Dict[str, Any]: + """Build repository index - main entry point for backward compatibility. + + Args: + root_path: Root directory to index + manifest_path: Path to write manifest JSON + db_path: Path to SQLite database + print_stats: Whether to print statistics + dry_run: If True, only scan files without creating database + follow_symlinks: Whether to follow symbolic links + exclude_patterns: Patterns to exclude from indexing + + Returns: + Dictionary with success status and statistics + """ + start_time = time.time() + root = Path(root_path).resolve() + + if not root.exists(): + return {"error": f"Root path does not exist: {root_path}"} + + # Walk directory and collect files + config = load_runtime_config(str(root)) + walker = FileWalker(root, config, follow_symlinks, exclude_patterns) + files, walk_stats = walker.walk() + + if dry_run: + if print_stats: + elapsed_ms = int((time.time() - start_time) * 1000) + print(f"Files scanned: {walk_stats['total_files']}") + print(f"Text files indexed: {walk_stats['text_files']}") + print(f"Binary files skipped: {walk_stats['binary_files']}") + print(f"Large files skipped: {walk_stats['large_files']}") + print(f"Elapsed: {elapsed_ms}ms") + return {"success": True, "dry_run": True, "stats": walk_stats} + + # Write manifest + try: + with open(manifest_path, "w", encoding="utf-8") as f: + json.dump(files, f, indent=2, sort_keys=True) + except Exception as e: + return {"error": f"Failed to write manifest: {e}"} + + # Create and populate database + try: + # Check if database already exists + db_exists = Path(db_path).exists() + + # Create database schema + conn = sqlite3.connect(db_path) + conn.execute("BEGIN IMMEDIATE") + create_database_schema(conn) + conn.commit() + conn.close() + + # Report database creation if new + if not db_exists: + print(f"[Indexer] Created database: {db_path}") + + # Use orchestrator to populate the database + orchestrator = IndexerOrchestrator(root, db_path, DEFAULT_BATCH_SIZE, + follow_symlinks, exclude_patterns) + + # Clear existing data to avoid UNIQUE constraint errors + orchestrator.db_manager.clear_tables() + + extract_counts, _ = orchestrator.index() + + except Exception as e: + return {"error": f"Failed to create database: {e}"} + + if print_stats: + elapsed_ms = int((time.time() - start_time) * 1000) + print(f"Files scanned: {walk_stats['total_files']}") + print(f"Text files indexed: {walk_stats['text_files']}") + print(f"Binary files skipped: {walk_stats['binary_files']}") + print(f"Large files skipped: {walk_stats['large_files']}") + print(f"Refs extracted: {extract_counts['refs']}") + print(f"Routes extracted: {extract_counts['routes']}") + print(f"SQL objects extracted: {extract_counts['sql']}") + print(f"SQL queries 
extracted: {extract_counts['sql_queries']}") + print(f"Docker images analyzed: {extract_counts['docker']}") + print(f"Symbols extracted: {extract_counts['symbols']}") + print(f"Elapsed: {elapsed_ms}ms") + + return { + "success": True, + "stats": walk_stats, + "extract_counts": extract_counts, + "elapsed_ms": int((time.time() - start_time) * 1000), + } \ No newline at end of file diff --git a/theauditor/init.py b/theauditor/init.py new file mode 100644 index 0000000..7e2b0aa --- /dev/null +++ b/theauditor/init.py @@ -0,0 +1,182 @@ +"""Initialization module for TheAuditor - handles project setup and initialization.""" + +from pathlib import Path +from typing import Dict, Any +from theauditor.security import sanitize_config_path, SecurityError + + +def initialize_project( + offline: bool = False, + skip_docs: bool = False, + skip_deps: bool = False +) -> Dict[str, Any]: + """ + Initialize TheAuditor for first-time use by running all setup steps. + + This function handles the sequence of operations: + 1. Index repository + 2. Create workset + 3. Check dependencies (unless skipped/offline) + 4. Fetch documentation (unless skipped/offline) + + Args: + offline: Skip network operations (deps check, docs fetch) + skip_docs: Skip documentation fetching + skip_deps: Skip dependency checking + + Returns: + Dict containing: + - stats: Statistics for each step + - success: Overall success status + - has_failures: Whether any steps failed + - next_steps: List of recommended next commands + """ + from theauditor.indexer import build_index + from theauditor.workset import compute_workset + from theauditor.deps import parse_dependencies, check_latest_versions + from theauditor.docs_fetch import fetch_docs + from theauditor.docs_summarize import summarize_docs + from theauditor.config_runtime import load_runtime_config + + # Load configuration + config = load_runtime_config(".") + stats = {} + + # 1. Index + try: + # Sanitize paths from config before use + manifest_path = str(sanitize_config_path(config["paths"]["manifest"], "paths", "manifest", ".")) + db_path = str(sanitize_config_path(config["paths"]["db"], "paths", "db", ".")) + + result = build_index( + root_path=".", + manifest_path=manifest_path, + db_path=db_path, + print_stats=False, + dry_run=False, + follow_symlinks=False + ) + if result.get("error"): + raise Exception(result["error"]) + # Extract stats from nested structure + index_stats = result.get("stats", {}) + stats["index"] = { + "files": index_stats.get("total_files", 0), + "text_files": index_stats.get("text_files", 0), + "success": True + } + except SecurityError as e: + stats["index"] = {"success": False, "error": f"Security violation: {str(e)}"} + except Exception as e: + stats["index"] = {"success": False, "error": str(e)} + + # 2. 
Workset + try: + # Skip if indexing failed or found no files + if not stats.get("index", {}).get("success"): + raise Exception("Skipping - indexing failed") + if stats.get("index", {}).get("text_files", 0) == 0: + stats["workset"] = {"success": False, "files": 0} + else: + # Sanitize paths from config before use + db_path = str(sanitize_config_path(config["paths"]["db"], "paths", "db", ".")) + manifest_path = str(sanitize_config_path(config["paths"]["manifest"], "paths", "manifest", ".")) + output_path = str(sanitize_config_path(config["paths"]["workset"], "paths", "workset", ".")) + + result = compute_workset( + all_files=True, + root_path=".", + db_path=db_path, + manifest_path=manifest_path, + output_path=output_path, + max_depth=2, + print_stats=False + ) + stats["workset"] = { + "files": result.get("expanded_count", 0), + "coverage": result.get("coverage", 0), + "success": True + } + except SecurityError as e: + stats["workset"] = {"success": False, "error": f"Security violation: {str(e)}"} + except Exception as e: + stats["workset"] = {"success": False, "error": str(e)} + + # 3. Dependencies + if not skip_deps and not offline: + try: + deps_list = parse_dependencies(root_path=".") + + if deps_list: + latest_info = check_latest_versions(deps_list, allow_net=True, offline=False) + outdated = sum(1 for info in latest_info.values() if info["is_outdated"]) + stats["deps"] = { + "total": len(deps_list), + "outdated": outdated, + "success": True + } + else: + stats["deps"] = {"total": 0, "success": True} + except Exception as e: + stats["deps"] = {"success": False, "error": str(e)} + else: + stats["deps"] = {"skipped": True} + + # 4. Documentation + if not skip_docs and not offline: + try: + deps_list = parse_dependencies(root_path=".") + + if deps_list: + # Limit to first 50 deps for init command to avoid hanging + if len(deps_list) > 50: + deps_list = deps_list[:50] + + # Fetch with progress indicator + fetch_result = fetch_docs(deps_list) + fetched = fetch_result.get('fetched', 0) + cached = fetch_result.get('cached', 0) + errors = fetch_result.get('errors', []) + + # Summarize + summarize_result = summarize_docs() + stats["docs"] = { + "fetched": fetched, + "cached": cached, + "capsules": summarize_result.get('capsules_created', 0), + "success": True, + "errors": errors + } + else: + stats["docs"] = {"success": True, "fetched": 0, "capsules": 0} + except KeyboardInterrupt: + stats["docs"] = {"success": False, "error": "Interrupted by user"} + except Exception as e: + stats["docs"] = {"success": False, "error": str(e)} + else: + stats["docs"] = {"skipped": True} + + # Code capsules feature has been removed - the command was deleted + # Doc capsules (for dependency documentation) are handled by 'aud docs summarize' + + # Check if initialization was successful + has_failures = any( + not stats.get(step, {}).get("success", False) and not stats.get(step, {}).get("skipped", False) + for step in ["index", "workset", "deps", "docs"] + ) + + # Determine next steps + next_steps = [] + if stats.get("workset", {}).get("files", 0) > 0: + next_steps = [ + "aud lint --workset", + "aud ast-verify --workset", + "aud report" + ] + + return { + "stats": stats, + "success": not has_failures, + "has_failures": has_failures, + "next_steps": next_steps + } \ No newline at end of file diff --git a/theauditor/insights/__init__.py b/theauditor/insights/__init__.py new file mode 100644 index 0000000..e061548 --- /dev/null +++ b/theauditor/insights/__init__.py @@ -0,0 +1,86 @@ +"""TheAuditor insights package - 
optional interpretive intelligence. + +This package contains all optional scoring, classification, and +recommendation modules that add interpretation on top of facts. + +The insights package follows the Truth Courier principle - all modules +here are OPTIONAL and add subjective analysis on top of objective facts. +The core audit pipeline works without any of these modules. + +Modules: + - ml: Machine learning predictions and risk scoring + - graph: Architecture health metrics and recommendations + - taint: Security vulnerability severity classification +""" + +# ML Insights - predictions and risk scoring +from theauditor.insights.ml import ( + check_ml_available, + learn, + suggest, + build_feature_matrix, + build_labels, + train_models, + save_models, + load_models, + is_source_file, + load_journal_stats, + load_rca_stats, + load_ast_stats, + load_graph_stats, + load_git_churn, + load_semantic_import_features, + load_ast_complexity_metrics, + extract_text_features, + fowler_noll_hash, +) + +# Graph Insights - health metrics and recommendations +from theauditor.insights.graph import ( + GraphInsights, + check_insights_available, + create_insights, +) + +# Taint Insights - severity scoring and classification +from theauditor.insights.taint import ( + calculate_severity, + classify_vulnerability, + generate_summary, + format_taint_report, + get_taint_summary, + is_vulnerable_sink, +) + +__all__ = [ + # ML exports + 'check_ml_available', + 'learn', + 'suggest', + 'build_feature_matrix', + 'build_labels', + 'train_models', + 'save_models', + 'load_models', + 'is_source_file', + 'load_journal_stats', + 'load_rca_stats', + 'load_ast_stats', + 'load_graph_stats', + 'load_git_churn', + 'load_semantic_import_features', + 'load_ast_complexity_metrics', + 'extract_text_features', + 'fowler_noll_hash', + # Graph exports + 'GraphInsights', + 'check_insights_available', + 'create_insights', + # Taint exports + 'calculate_severity', + 'classify_vulnerability', + 'generate_summary', + 'format_taint_report', + 'get_taint_summary', + 'is_vulnerable_sink', +] \ No newline at end of file diff --git a/theauditor/insights/graph.py b/theauditor/insights/graph.py new file mode 100644 index 0000000..3e6f107 --- /dev/null +++ b/theauditor/insights/graph.py @@ -0,0 +1,470 @@ +"""Graph insights module - OPTIONAL interpretive analysis for dependency graphs. + +This module provides interpretive metrics like health scores, recommendations, +and weighted rankings. It's completely optional and decoupled from core graph +analysis - similar to how ml.py works. + +IMPORTANT: This module performs interpretation and scoring, which goes beyond +pure data extraction. It's designed for teams that want actionable insights +and are willing to accept some subjective analysis. +""" + +from collections import defaultdict +from typing import Any + + +class GraphInsights: + """Optional graph interpretation and scoring. + + This class provides subjective metrics and recommendations based on + graph topology. All methods here involve interpretation and scoring, + not just raw data extraction. + """ + + # Weights for hotspot scoring (configurable) + DEFAULT_WEIGHTS = { + "in_degree": 0.3, + "out_degree": 0.2, + "centrality": 0.3, + "churn": 0.1, + "loc": 0.1, + } + + def __init__(self, weights: dict[str, float] | None = None): + """ + Initialize insights analyzer with optional weight configuration. 
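+
+        A custom weights dict should provide every key used in
+        DEFAULT_WEIGHTS (in_degree, out_degree, centrality, churn, loc);
+        rank_hotspots indexes each of these keys directly, so a partial dict
+        would raise a KeyError during scoring.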
+ + Args: + weights: Custom weights for hotspot scoring + """ + self.weights = weights or self.DEFAULT_WEIGHTS + + def rank_hotspots( + self, + import_graph: dict[str, Any], + call_graph: dict[str, Any] | None = None + ) -> list[dict[str, Any]]: + """ + Rank nodes by their importance as hotspots using weighted scoring. + + This is an INTERPRETIVE method that assigns subjective importance + scores based on configurable weights. + + Args: + import_graph: Import/dependency graph + call_graph: Optional call graph for additional signals + + Returns: + List of hotspot nodes sorted by interpreted score + """ + # Calculate in/out degrees for import graph + in_degree = defaultdict(int) + out_degree = defaultdict(int) + + for edge in import_graph.get("edges", []): + out_degree[edge["source"]] += 1 + in_degree[edge["target"]] += 1 + + # Add call graph degrees if available + if call_graph: + for edge in call_graph.get("edges", []): + out_degree[edge["source"]] += 1 + in_degree[edge["target"]] += 1 + + # Calculate centrality (simplified betweenness centrality approximation) + centrality = self._calculate_centrality(import_graph) + + # Build node metrics with INTERPRETED SCORING + hotspots = [] + for node in import_graph.get("nodes", []): + node_id = node["id"] + + # Normalize metrics + in_deg = in_degree[node_id] + out_deg = out_degree[node_id] + cent = centrality.get(node_id, 0) + churn = node.get("churn", 0) or 0 + loc = node.get("loc", 0) or 0 + + # INTERPRETATION: Calculate weighted score + score = ( + self.weights["in_degree"] * in_deg + + self.weights["out_degree"] * out_deg + + self.weights["centrality"] * cent + + self.weights["churn"] * (churn / 100) + # Normalize churn + self.weights["loc"] * (loc / 1000) # Normalize LOC + ) + + hotspots.append({ + "id": node_id, + "in_degree": in_deg, + "out_degree": out_deg, + "centrality": cent, + "churn": churn, + "loc": loc, + "score": score, # INTERPRETED METRIC + }) + + # Sort by interpreted score (highest first) + hotspots.sort(key=lambda h: h["score"], reverse=True) + + return hotspots + + def _calculate_centrality(self, graph: dict[str, Any]) -> dict[str, float]: + """ + Calculate centrality scores using PageRank-like algorithm. + + This is an INTERPRETIVE scoring algorithm that assigns importance + based on graph topology. + + Args: + graph: Graph with nodes and edges + + Returns: + Dict mapping node IDs to centrality scores [0, 1] + """ + # Build adjacency list + adj = defaultdict(list) + nodes = set() + + for edge in graph.get("edges", []): + adj[edge["source"]].append(edge["target"]) + nodes.add(edge["source"]) + nodes.add(edge["target"]) + + # Initialize scores + scores = {node: 1.0 for node in nodes} + damping = 0.85 + iterations = 10 + + # Power iteration (PageRank algorithm) + for _ in range(iterations): + new_scores = {} + for node in nodes: + score = (1 - damping) + for source in nodes: + if node in adj[source]: + out_count = len(adj[source]) or 1 + score += damping * scores[source] / out_count + new_scores[node] = score + scores = new_scores + + # Normalize scores to [0, 1] + if scores: + max_score = max(scores.values()) + if max_score > 0: + scores = {k: v / max_score for k, v in scores.items()} + + return scores + + def calculate_health_metrics( + self, + import_graph: dict[str, Any], + cycles: list[dict] | None = None, + hotspots: list[dict] | None = None, + layers: dict[str, list[str]] | None = None, + ) -> dict[str, Any]: + """ + Calculate interpreted health metrics and grades. 
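+
+        Scoring starts from 100 and subtracts penalties for cycles, high
+        graph density and dominant hotspots; 90+ maps to grade 'A', 80+ to
+        'B', 70+ to 'C', 60+ to 'D', and anything lower to 'F'.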
+ + This method provides SUBJECTIVE health scoring based on + architectural best practices. The scoring is opinionated + and may not apply to all codebases. + + Args: + import_graph: Import/dependency graph + cycles: Pre-computed cycles (optional) + hotspots: Pre-computed hotspots (optional) + layers: Pre-computed layers (optional) + + Returns: + Dict with health scores, grades, and metrics + """ + # Calculate graph density + nodes_count = len(import_graph.get("nodes", [])) + edges_count = len(import_graph.get("edges", [])) + max_edges = nodes_count * (nodes_count - 1) if nodes_count > 1 else 1 + density = edges_count / max_edges if max_edges > 0 else 0 + + # INTERPRETATION: Calculate health score + health_score = 100 + + # SUBJECTIVE PENALTY: Penalize for cycles + if cycles: + cycle_penalty = min(len(cycles) * 5, 30) + health_score -= cycle_penalty + + # SUBJECTIVE PENALTY: Penalize for high density (too coupled) + if density > 0.3: + density_penalty = min((density - 0.3) * 100, 20) + health_score -= density_penalty + + # SUBJECTIVE PENALTY: Penalize for hotspots with very high degree + if hotspots and hotspots[0]["in_degree"] > 50: + hotspot_penalty = min(hotspots[0]["in_degree"] // 10, 20) + health_score -= hotspot_penalty + + # INTERPRETATION: Assign letter grade + health_grade = ( + "A" if health_score >= 90 + else "B" if health_score >= 80 + else "C" if health_score >= 70 + else "D" if health_score >= 60 + else "F" + ) + + # INTERPRETATION: Calculate fragility score (0-100, higher is worse) + fragility = 0 + + # Hotspots increase fragility + if hotspots: + top_hotspot_score = hotspots[0]["score"] + fragility += min(top_hotspot_score * 10, 40) + + # Cycles increase fragility + if cycles: + fragility += min(len(cycles) * 3, 30) + + # High coupling increases fragility + fragility += min(density * 100, 30) + + return { + "health_score": max(health_score, 0), + "health_grade": health_grade, + "fragility_score": min(fragility, 100), + "density": density, + "cycle_free": len(cycles) == 0 if cycles else True, + "well_layered": len(layers) > 2 and max(layers.keys()) < 10 if layers else False, + "loosely_coupled": density < 0.2, + "no_god_objects": not hotspots or hotspots[0]["in_degree"] < 30, + } + + def generate_recommendations( + self, + import_graph: dict[str, Any], + cycles: list[dict] | None = None, + hotspots: list[dict] | None = None, + layers: dict[str, list[str]] | None = None, + ) -> list[str]: + """ + Generate actionable recommendations based on graph analysis. + + These are OPINIONATED suggestions based on common architectural + best practices. They may not apply to all projects. 
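+
+        A minimal calling sketch (import_graph is assumed to be a dict with
+        "nodes" and "edges" lists, the same shape the other methods here use;
+        the return value is a plain list of strings):
+
+            tips = GraphInsights().generate_recommendations(import_graph, cycles=[])
+            for tip in tips:
+                print("-", tip)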
+ + Args: + import_graph: Import/dependency graph + cycles: Pre-computed cycles (optional) + hotspots: Pre-computed hotspots (optional) + layers: Pre-computed layers (optional) + + Returns: + List of recommendation strings + """ + recommendations = [] + + # Calculate density for recommendations + nodes_count = len(import_graph.get("nodes", [])) + edges_count = len(import_graph.get("edges", [])) + max_edges = nodes_count * (nodes_count - 1) if nodes_count > 1 else 1 + density = edges_count / max_edges if max_edges > 0 else 0 + + # INTERPRETATION: Generate recommendations + if cycles and len(cycles) > 0: + recommendations.append( + f"Break {len(cycles)} dependency cycles to improve maintainability" + ) + + if density > 0.3: + recommendations.append( + "Reduce coupling between modules (current density: {:.2f})".format( + density + ) + ) + + if hotspots and len(hotspots) > 0 and hotspots[0]["in_degree"] > 30: + recommendations.append( + f"Refactor hotspot '{hotspots[0]['id']}' with {hotspots[0]['in_degree']} dependencies" + ) + + if layers and len(layers) <= 2: + recommendations.append( + "Consider introducing more architectural layers for better separation" + ) + + return recommendations + + def summarize( + self, + import_graph: dict[str, Any], + call_graph: dict[str, Any] | None = None, + cycles: list[dict] | None = None, + hotspots: list[dict] | None = None, + ) -> dict[str, Any]: + """ + Generate comprehensive INTERPRETED summary of graph analysis. + + This method combines objective metrics with subjective scoring + and recommendations. It's designed for teams that want actionable + insights beyond raw data. + + Args: + import_graph: Import/dependency graph + call_graph: Optional call graph + cycles: Pre-computed cycles (optional) + hotspots: Pre-computed hotspots (optional) + + Returns: + Summary dict with metrics, health scores, and recommendations + """ + from theauditor.graph.analyzer import XGraphAnalyzer + + # Use base analyzer for pure algorithms + analyzer = XGraphAnalyzer() + + # Get pure metrics + summary = { + "import_graph": { + "nodes": len(import_graph.get("nodes", [])), + "edges": len(import_graph.get("edges", [])), + } + } + + # Add call graph metrics if available + if call_graph: + summary["call_graph"] = { + "nodes": len(call_graph.get("nodes", [])), + "edges": len(call_graph.get("edges", [])), + } + + # Calculate graph density + nodes_count = len(import_graph.get("nodes", [])) + edges_count = len(import_graph.get("edges", [])) + max_edges = nodes_count * (nodes_count - 1) if nodes_count > 1 else 1 + density = edges_count / max_edges if max_edges > 0 else 0 + summary["import_graph"]["density"] = density + + # Add cycle metrics + if cycles is None: + cycles = analyzer.detect_cycles(import_graph) + + summary["cycles"] = { + "total": len(cycles), + "largest": cycles[0]["size"] if cycles else 0, + "nodes_in_cycles": len( + set(node for cycle in cycles for node in cycle["nodes"]) + ), + } + + # Add hotspot metrics + if hotspots is None: + hotspots = self.rank_hotspots(import_graph, call_graph) + + summary["hotspots"] = { + "top_5": [h["id"] for h in hotspots[:5]], + "max_in_degree": max((h["in_degree"] for h in hotspots), default=0), + "max_out_degree": max((h["out_degree"] for h in hotspots), default=0), + } + + # Identify layers + layers = analyzer.identify_layers(import_graph) + summary["layers"] = { + "count": len(layers), + "distribution": {k: len(v) for k, v in layers.items()}, + } + + # Add INTERPRETED health metrics + summary["health_metrics"] = 
self.calculate_health_metrics( + import_graph, cycles, hotspots, layers + ) + + # Add INTERPRETED recommendations + summary["recommendations"] = self.generate_recommendations( + import_graph, cycles, hotspots, layers + ) + + return summary + + def interpret_graph_summary(self, graph_data: dict[str, Any]) -> dict[str, Any]: + """ + Add interpretive labels to graph summary data. + + This method adds subjective interpretations to raw graph statistics, + such as coupling levels and architectural insights. + + Args: + graph_data: Raw graph summary from analyzer + + Returns: + Enhanced summary with interpretive insights + """ + # Get base statistics + stats = graph_data.get("statistics", {}) + density = stats.get("graph_density", 0) + hotspots = graph_data.get("top_hotspots", []) + + # Add interpretive insights + insights = { + "coupling_level": ( + "high" if density > 0.3 + else "medium" if density > 0.1 + else "low" + ), + "potential_god_objects": len([ + h for h in hotspots + if h.get("in_degree", 0) > 30 + ]), + "highly_connected": len([ + h for h in hotspots + if h.get("total_connections", 0) > 20 + ]), + } + + # Merge with original data + graph_data["architectural_insights"] = insights + + return graph_data + + def calculate_impact_ratio( + self, + targets: list[str], + all_impacted: set[str], + total_nodes: int, + ) -> float: + """ + Calculate interpreted impact ratio for change analysis. + + This is a SUBJECTIVE metric that interprets the scope of impact + as a ratio of total system size. + + Args: + targets: Original target nodes + all_impacted: All impacted nodes (targets + upstream + downstream) + total_nodes: Total number of nodes in graph + + Returns: + Impact ratio [0, 1] + """ + if total_nodes == 0: + return 0.0 + + return len(all_impacted) / total_nodes + + +# Module-level function for backward compatibility +def check_insights_available() -> bool: + """Check if insights module is available (always True).""" + return True + + +def create_insights(weights: dict[str, float] | None = None) -> GraphInsights: + """ + Factory function to create GraphInsights instance. + + Args: + weights: Optional custom weights for scoring + + Returns: + GraphInsights instance + """ + return GraphInsights(weights) \ No newline at end of file diff --git a/theauditor/insights/ml.py b/theauditor/insights/ml.py new file mode 100644 index 0000000..ed156b0 --- /dev/null +++ b/theauditor/insights/ml.py @@ -0,0 +1,1241 @@ +"""Offline ML signals for TheAuditor - manual trigger, non-blocking.""" + +import json +import os +import sqlite3 +import subprocess +import tempfile +from collections import defaultdict +from datetime import UTC, datetime +from pathlib import Path +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + import numpy as np + +# Safe import of ML dependencies +ML_AVAILABLE = False +try: + import joblib + import numpy as np + from sklearn.isotonic import IsotonicRegression + from sklearn.linear_model import Ridge, SGDClassifier + from sklearn.ensemble import GradientBoostingClassifier + from sklearn.preprocessing import StandardScaler + + ML_AVAILABLE = True +except ImportError: + pass + + +def check_ml_available(): + """Check if ML dependencies are available.""" + if not ML_AVAILABLE: + print("ML disabled. 
Install extras: pip install -e .[ml]") + return False + return True + + +def fowler_noll_hash(text: str, dim: int = 2000) -> int: + """Simple FNV-1a hash for text feature hashing.""" + FNV_PRIME = 0x01000193 + FNV_OFFSET = 0x811C9DC5 + + hash_val = FNV_OFFSET + for char in text.encode("utf-8"): + hash_val ^= char + hash_val = (hash_val * FNV_PRIME) & 0xFFFFFFFF + + return hash_val % dim + + +def extract_text_features( + path: str, rca_messages: list[str] = None, dim: int = 2000 +) -> dict[int, float]: + """Extract hashed text features from path and RCA messages.""" + features = defaultdict(float) + + # Hash path components + parts = Path(path).parts + for part in parts: + idx = fowler_noll_hash(part, dim) + features[idx] += 1.0 + + # Hash basename + basename = Path(path).name + idx = fowler_noll_hash(basename, dim) + features[idx] += 2.0 + + # Hash RCA messages if present + if rca_messages: + for msg in rca_messages[:5]: # Limit to recent 5 + tokens = msg.lower().split()[:10] # First 10 tokens + for token in tokens: + idx = fowler_noll_hash(token, dim) + features[idx] += 0.5 + + return dict(features) + + +def load_journal_stats(history_dir: Path, window: int = 50, run_type: str = "full") -> dict[str, dict]: + """ + Load and aggregate stats from all historical journal files. + + Args: + history_dir: Base history directory + window: Number of recent entries to analyze per file + run_type: Type of runs to load ("full", "diff", or "all") + """ + if not history_dir.exists(): + return {} + + stats = defaultdict( + lambda: { + "touches": 0, + "failures": 0, + "successes": 0, + "recent_phases": [], + } + ) + + try: + # Find historical journal files based on run type + if run_type == "full": + journal_files = list(history_dir.glob('full/*/journal.ndjson')) + elif run_type == "diff": + journal_files = list(history_dir.glob('diff/*/journal.ndjson')) + else: # run_type == "all" + journal_files = list(history_dir.glob('*/*/journal.ndjson')) + + # If no journal files found, fallback to FCE data + if not journal_files: + print("Warning: No journal.ndjson files found. 
Using FCE and AST failure data as fallback for training.") + + # Load from FCE files instead + if run_type == "full": + fce_files = list(history_dir.glob('full/*/raw/fce.json')) + elif run_type == "diff": + fce_files = list(history_dir.glob('diff/*/raw/fce.json')) + else: # run_type == "all" + fce_files = list(history_dir.glob('*/*/raw/fce.json')) + + # Process FCE files as proxy for journal data + for fce_path in fce_files: + try: + with open(fce_path) as f: + data = json.load(f) + + # Treat each finding as a "touch" and errors/criticals as "failures" + for finding in data.get("all_findings", []): + file = finding.get("file", "") + if file: + stats[file]["touches"] += 1 + severity = finding.get("severity", "") + if severity in ["error", "critical"]: + stats[file]["failures"] += 1 + else: + stats[file]["successes"] += 1 + except Exception: + continue # Skip files that can't be read + + return dict(stats) + + for journal_path in journal_files: + try: + with open(journal_path) as f: + lines = f.readlines()[-window * 20 :] # Approximate last N runs per file + + for line in lines: + try: + event = json.loads(line) + + if event.get("phase") == "apply_patch" and "file" in event: + file = event["file"] + stats[file]["touches"] += 1 + + if "result" in event: + for file_path in stats: + if event["result"] == "fail": + stats[file_path]["failures"] += 1 + else: + stats[file_path]["successes"] += 1 + + except json.JSONDecodeError: + continue + except Exception: + continue # Skip files that can't be read + except (ImportError, ValueError, AttributeError): + pass # ML unavailable - gracefully skip + + return dict(stats) + + +def load_rca_stats(history_dir: Path, run_type: str = "full") -> dict[str, dict]: + """ + Load RCA failure stats from all historical RCA files. + + Args: + history_dir: Base history directory + run_type: Type of runs to load ("full", "diff", or "all") + """ + if not history_dir.exists(): + return {} + + stats = defaultdict( + lambda: { + "fail_count": 0, + "categories": [], + "messages": [], + } + ) + + try: + # Find historical FCE files based on run type + if run_type == "full": + fce_files = list(history_dir.glob('full/*/fce.json')) + elif run_type == "diff": + fce_files = list(history_dir.glob('diff/*/fce.json')) + else: # run_type == "all" + fce_files = list(history_dir.glob('*/*/fce.json')) + + for fce_path in fce_files: + try: + with open(fce_path) as f: + data = json.load(f) + + for failure in data.get("failures", []): + file = failure.get("file", "") + if file: + stats[file]["fail_count"] += 1 + if "category" in failure: + stats[file]["categories"].append(failure["category"]) + if "message" in failure: + stats[file]["messages"].append(failure["message"][:100]) + except Exception: + continue # Skip files that can't be read + except (ImportError, ValueError, AttributeError): + pass # ML unavailable - gracefully skip + + return dict(stats) + + +def load_ast_stats(history_dir: Path, run_type: str = "full") -> dict[str, dict]: + """ + Load AST proof stats from all historical AST files. 
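+
+    A minimal calling sketch (the returned mapping is keyed by file path, with
+    "invariant_fails", "invariant_passes", and "failed_checks" per file):
+
+        ast_stats = load_ast_stats(Path("./.pf/history"), run_type="all")
+        worst = max(ast_stats.items(),
+                    key=lambda kv: kv[1]["invariant_fails"], default=None)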
+ + Args: + history_dir: Base history directory + run_type: Type of runs to load ("full", "diff", or "all") + """ + if not history_dir.exists(): + return {} + + stats = defaultdict( + lambda: { + "invariant_fails": 0, + "invariant_passes": 0, + "failed_checks": [], + } + ) + + try: + # Find historical AST proof files based on run type + if run_type == "full": + ast_files = list(history_dir.glob('full/*/ast_proofs.json')) + elif run_type == "diff": + ast_files = list(history_dir.glob('diff/*/ast_proofs.json')) + else: # run_type == "all" + ast_files = list(history_dir.glob('*/*/ast_proofs.json')) + + for ast_path in ast_files: + try: + with open(ast_path) as f: + data = json.load(f) + + for result in data.get("results", []): + file = result.get("path", "") + for check in result.get("checks", []): + if check["status"] == "FAIL": + stats[file]["invariant_fails"] += 1 + stats[file]["failed_checks"].append(check["id"]) + elif check["status"] == "PASS": + stats[file]["invariant_passes"] += 1 + except Exception: + continue # Skip files that can't be read + except (ImportError, ValueError, AttributeError): + pass # ML unavailable - gracefully skip + + return dict(stats) + + +def load_graph_stats(db_path: str, file_paths: list[str]) -> dict[str, dict]: + """Load graph topology stats from index DB.""" + if not Path(db_path).exists() or not file_paths: + return {} + + stats = defaultdict( + lambda: { + "in_degree": 0, + "out_degree": 0, + "has_routes": False, + "has_sql": False, + } + ) + + try: + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + # Get refs (imports/exports) + placeholders = ",".join("?" * len(file_paths)) + + # In-degree: files that import this file + cursor.execute( + f""" + SELECT value, COUNT(*) as count + FROM refs + WHERE value IN ({placeholders}) + GROUP BY value + """, + file_paths, + ) + + for file_path, count in cursor.fetchall(): + stats[file_path]["in_degree"] = count + + # Out-degree: files this file imports + cursor.execute( + f""" + SELECT src, COUNT(*) as count + FROM refs + WHERE src IN ({placeholders}) + GROUP BY src + """, + file_paths, + ) + + for file_path, count in cursor.fetchall(): + stats[file_path]["out_degree"] = count + + # Check for routes (now stored in api_endpoints table after refactor) + cursor.execute( + f""" + SELECT DISTINCT file + FROM api_endpoints + WHERE file IN ({placeholders}) + """, + file_paths, + ) + + for (file_path,) in cursor.fetchall(): + stats[file_path]["has_routes"] = True + + # Check for SQL objects + cursor.execute( + f""" + SELECT DISTINCT file + FROM sql_objects + WHERE file IN ({placeholders}) + """, + file_paths, + ) + + for (file_path,) in cursor.fetchall(): + stats[file_path]["has_sql"] = True + + conn.close() + except (ImportError, ValueError, AttributeError): + pass # ML unavailable - gracefully skip + + return dict(stats) + + +def load_git_churn(file_paths: list[str], window_days: int = 30) -> dict[str, int]: + """Load git churn counts if available.""" + if not Path(".git").exists(): + return {} + + churn = defaultdict(int) + + try: + # Use temp files to avoid buffer overflow + with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='_stdout.txt', encoding='utf-8') as stdout_fp, \ + tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='_stderr.txt', encoding='utf-8') as stderr_fp: + + stdout_path = stdout_fp.name + stderr_path = stderr_fp.name + + result = subprocess.run( + ["git", "log", "--name-only", "--pretty=format:", f"--since={window_days} days ago"], + stdout=stdout_fp, + 
stderr=stderr_fp, + text=True, + timeout=10, + ) + + with open(stdout_path, 'r', encoding='utf-8') as f: + result.stdout = f.read() + with open(stderr_path, 'r', encoding='utf-8') as f: + result.stderr = f.read() + + os.unlink(stdout_path) + os.unlink(stderr_path) + + if result.returncode == 0: + for line in result.stdout.split("\n"): + line = line.strip() + if line and line in file_paths: + churn[line] += 1 + except (ImportError, ValueError, AttributeError): + pass # ML unavailable - gracefully skip + + return dict(churn) + + +def load_semantic_import_features(db_path: str, file_paths: list[str]) -> dict[str, dict]: + """ + Extract semantic import features to understand file purpose. + + Returns dict with keys: has_http_import, has_db_import, has_auth_import, has_test_import + """ + if not Path(db_path).exists() or not file_paths: + return {} + + # Common library patterns for different purposes + HTTP_LIBS = { + 'requests', 'aiohttp', 'httpx', 'urllib', 'axios', 'fetch', 'superagent', + 'express', 'fastapi', 'flask', 'django.http', 'tornado', 'starlette' + } + + DB_LIBS = { + 'sqlalchemy', 'psycopg2', 'psycopg', 'pymongo', 'redis', 'django.db', + 'peewee', 'tortoise', 'databases', 'asyncpg', 'sqlite3', 'mysql', + 'mongoose', 'sequelize', 'typeorm', 'prisma', 'knex', 'pg' + } + + AUTH_LIBS = { + 'jwt', 'pyjwt', 'passlib', 'oauth', 'oauth2', 'authlib', 'django.contrib.auth', + 'flask_login', 'flask_jwt', 'bcrypt', 'cryptography', 'passport', + 'jsonwebtoken', 'express-jwt', 'firebase-auth', 'auth0' + } + + TEST_LIBS = { + 'pytest', 'unittest', 'mock', 'faker', 'factory_boy', 'hypothesis', + 'jest', 'mocha', 'chai', 'sinon', 'enzyme', 'vitest', 'testing-library' + } + + stats = defaultdict(lambda: { + "has_http_import": False, + "has_db_import": False, + "has_auth_import": False, + "has_test_import": False, + }) + + try: + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + placeholders = ",".join("?" * len(file_paths)) + + # Get all imports for the specified files + cursor.execute( + f""" + SELECT src, value + FROM refs + WHERE src IN ({placeholders}) + AND kind IN ('import', 'from', 'require') + """, + file_paths, + ) + + for file_path, import_value in cursor.fetchall(): + # Normalize import value (strip quotes, extract package name) + import_name = import_value.lower().strip('"\'') + # Handle scoped packages like @angular/core + if '/' in import_name: + import_name = import_name.split('/')[0].lstrip('@') + # Handle sub-modules like django.contrib.auth + base_import = import_name.split('.')[0] + + # Check against our semantic categories + if any(lib in import_name or base_import == lib for lib in HTTP_LIBS): + stats[file_path]["has_http_import"] = True + + if any(lib in import_name or base_import == lib for lib in DB_LIBS): + stats[file_path]["has_db_import"] = True + + if any(lib in import_name or base_import == lib for lib in AUTH_LIBS): + stats[file_path]["has_auth_import"] = True + + if any(lib in import_name or base_import == lib for lib in TEST_LIBS): + stats[file_path]["has_test_import"] = True + + conn.close() + except Exception: + pass # Gracefully skip on error + + return dict(stats) + + +def load_ast_complexity_metrics(db_path: str, file_paths: list[str]) -> dict[str, dict]: + """ + Extract AST-based complexity metrics from the symbols table. 
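+
+    A minimal calling sketch (the file path is illustrative and must match how
+    files are recorded in the symbols table; files with no indexed symbols are
+    simply absent from the result):
+
+        metrics = load_ast_complexity_metrics("./.pf/repo_index.db",
+                                              ["theauditor/cli.py"])
+        counts = metrics.get("theauditor/cli.py", {})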
+ + Returns dict with keys: function_count, class_count, call_count, try_except_count, async_def_count + """ + if not Path(db_path).exists() or not file_paths: + return {} + + stats = defaultdict(lambda: { + "function_count": 0, + "class_count": 0, + "call_count": 0, + "try_except_count": 0, + "async_def_count": 0, + }) + + try: + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + placeholders = ",".join("?" * len(file_paths)) + + # Count different symbol types per file + cursor.execute( + f""" + SELECT path, type, COUNT(*) as count + FROM symbols + WHERE path IN ({placeholders}) + GROUP BY path, type + """, + file_paths, + ) + + for file_path, symbol_type, count in cursor.fetchall(): + if symbol_type == "function": + stats[file_path]["function_count"] = count + elif symbol_type == "class": + stats[file_path]["class_count"] = count + elif symbol_type == "call": + stats[file_path]["call_count"] = count + + # Count async functions (those with 'async' in the name) + # This is a heuristic since we don't have a dedicated async flag + cursor.execute( + f""" + SELECT path, COUNT(*) as count + FROM symbols + WHERE path IN ({placeholders}) + AND type = 'function' + AND (name LIKE 'async%' OR name LIKE '%async%') + GROUP BY path + """, + file_paths, + ) + + for file_path, count in cursor.fetchall(): + stats[file_path]["async_def_count"] = count + + # Count try/except patterns - look for exception handling calls + # Common patterns: catch, except, rescue, error + cursor.execute( + f""" + SELECT path, COUNT(*) as count + FROM symbols + WHERE path IN ({placeholders}) + AND type = 'call' + AND (name IN ('catch', 'except', 'rescue', 'error', 'try', 'finally')) + GROUP BY path + """, + file_paths, + ) + + for file_path, count in cursor.fetchall(): + stats[file_path]["try_except_count"] = count + + conn.close() + except Exception: + pass # Gracefully skip on error + + return dict(stats) + + +def build_feature_matrix( + file_paths: list[str], + manifest_path: str, + db_path: str, + journal_stats: dict = None, + rca_stats: dict = None, + ast_stats: dict = None, + enable_git: bool = False, +) -> tuple["np.ndarray", dict[str, int]]: + """Build feature matrix for files.""" + if not ML_AVAILABLE: + return None, {} + + # Load manifest for file metadata + manifest_map = {} + try: + with open(manifest_path) as f: + manifest = json.load(f) + for entry in manifest: + manifest_map[entry["path"]] = entry + except (ImportError, ValueError, AttributeError): + pass # ML unavailable - gracefully skip + + # Use provided stats or default to empty dicts + journal_stats = journal_stats if journal_stats is not None else {} + rca_stats = rca_stats if rca_stats is not None else {} + ast_stats = ast_stats if ast_stats is not None else {} + graph_stats = load_graph_stats(db_path, file_paths) + + # Load centrality from graph metrics if available + try: + metrics_path = Path("./.pf/raw/graph_metrics.json") + if metrics_path.exists(): + with open(metrics_path) as f: + graph_metrics = json.load(f) + # Merge into existing stats + for path in file_paths: + if path in graph_metrics: + if path not in graph_stats: + graph_stats[path] = { + "in_degree": 0, + "out_degree": 0, + "has_routes": False, + "has_sql": False, + } + graph_stats[path]["centrality"] = graph_metrics[path] + except (json.JSONDecodeError, IOError): + pass # Proceed without centrality scores + + git_churn = load_git_churn(file_paths) if enable_git else {} + + # Load new advanced features + semantic_imports = load_semantic_import_features(db_path, file_paths) + 
complexity_metrics = load_ast_complexity_metrics(db_path, file_paths) + + # Build feature vectors + feature_names = [] + features = [] + + for file_path in file_paths: + feat = [] + + # Basic metadata features + meta = manifest_map.get(file_path, {}) + feat.append(meta.get("bytes", 0) / 10000.0) # Normalized + feat.append(meta.get("loc", 0) / 100.0) # Normalized + + # Extension as categorical + ext = meta.get("ext", "") + feat.append(1.0 if ext in [".ts", ".tsx", ".js", ".jsx"] else 0.0) + feat.append(1.0 if ext == ".py" else 0.0) + + # Graph topology + graph = graph_stats.get(file_path, {}) + feat.append(graph.get("in_degree", 0) / 10.0) + feat.append(graph.get("out_degree", 0) / 10.0) + feat.append(1.0 if graph.get("has_routes") else 0.0) + feat.append(1.0 if graph.get("has_sql") else 0.0) + feat.append(graph.get("centrality", 0.0)) # Already normalized [0,1] + + # Journal history + journal = journal_stats.get(file_path, {}) + feat.append(journal.get("touches", 0) / 10.0) + feat.append(journal.get("failures", 0) / 5.0) + feat.append(journal.get("successes", 0) / 5.0) + + # RCA history + rca = rca_stats.get(file_path, {}) + feat.append(rca.get("fail_count", 0) / 5.0) + + # AST checks + ast = ast_stats.get(file_path, {}) + feat.append(ast.get("invariant_fails", 0) / 3.0) + feat.append(ast.get("invariant_passes", 0) / 3.0) + + # Git churn + feat.append(git_churn.get(file_path, 0) / 5.0) + + # NEW: Semantic import features + semantic = semantic_imports.get(file_path, {}) + feat.append(1.0 if semantic.get("has_http_import") else 0.0) + feat.append(1.0 if semantic.get("has_db_import") else 0.0) + feat.append(1.0 if semantic.get("has_auth_import") else 0.0) + feat.append(1.0 if semantic.get("has_test_import") else 0.0) + + # NEW: AST complexity metrics + complexity = complexity_metrics.get(file_path, {}) + feat.append(complexity.get("function_count", 0) / 20.0) # Normalized + feat.append(complexity.get("class_count", 0) / 10.0) # Normalized + feat.append(complexity.get("call_count", 0) / 50.0) # Normalized + feat.append(complexity.get("try_except_count", 0) / 5.0) # Normalized + feat.append(complexity.get("async_def_count", 0) / 5.0) # Normalized + + # Text features (simplified - just path hash) + text_feats = extract_text_features( + file_path, rca.get("messages", []), dim=50 # Small for speed + ) + text_vec = [0.0] * 50 + for idx, val in text_feats.items(): + if idx < 50: + text_vec[idx] = val + feat.extend(text_vec) + + features.append(feat) + + # Feature names for debugging + feature_names = [ + "bytes_norm", + "loc_norm", + "is_js", + "is_py", + "in_degree", + "out_degree", + "has_routes", + "has_sql", + "centrality", + "touches", + "failures", + "successes", + "rca_fails", + "ast_fails", + "ast_passes", + "git_churn", + # New semantic import features + "has_http_import", + "has_db_import", + "has_auth_import", + "has_test_import", + # New complexity metrics + "function_count", + "class_count", + "call_count", + "try_except_count", + "async_def_count", + ] + [f"text_{i}" for i in range(50)] + + feature_name_map = {name: i for i, name in enumerate(feature_names)} + + return np.array(features), feature_name_map + + +def build_labels( + file_paths: list[str], + journal_stats: dict, + rca_stats: dict, +) -> tuple["np.ndarray", "np.ndarray", "np.ndarray"]: + """Build label vectors for training.""" + if not ML_AVAILABLE: + return None, None, None + + # Root cause labels (binary): file failed in RCA + root_cause_labels = np.array( + [1.0 if rca_stats.get(fp, {}).get("fail_count", 0) > 0 else 0.0 
for fp in file_paths] + ) + + # Next edit labels (binary): file was edited in journal + next_edit_labels = np.array( + [1.0 if journal_stats.get(fp, {}).get("touches", 0) > 0 else 0.0 for fp in file_paths] + ) + + # Risk scores (continuous): failure ratio + risk_labels = np.array( + [ + min( + 1.0, + journal_stats.get(fp, {}).get("failures", 0) + / max(1, journal_stats.get(fp, {}).get("touches", 1)), + ) + for fp in file_paths + ] + ) + + return root_cause_labels, next_edit_labels, risk_labels + + +def train_models( + features: "np.ndarray", + root_cause_labels: "np.ndarray", + next_edit_labels: "np.ndarray", + risk_labels: "np.ndarray", + seed: int = 13, + sample_weight: "np.ndarray" = None, +) -> tuple[Any, Any, Any, Any]: + """Train the three models with optional sample weighting for human feedback.""" + if not ML_AVAILABLE: + return None, None, None, None + + # Handle empty or all-same labels + if len(np.unique(root_cause_labels)) < 2: + root_cause_labels[0] = 1 - root_cause_labels[0] # Flip one for training + if len(np.unique(next_edit_labels)) < 2: + next_edit_labels[0] = 1 - next_edit_labels[0] + + # Scale features + scaler = StandardScaler() + features_scaled = scaler.fit_transform(features) + + # Train root cause classifier with GradientBoostingClassifier + # More powerful ensemble model that captures non-linear relationships + root_cause_clf = GradientBoostingClassifier( + n_estimators=50, # Reduced for speed + learning_rate=0.1, + max_depth=3, + random_state=seed, + subsample=0.8, # Stochastic gradient boosting + min_samples_split=5, # Prevent overfitting + ) + root_cause_clf.fit(features_scaled, root_cause_labels, sample_weight=sample_weight) + + # Train next edit classifier with GradientBoostingClassifier + next_edit_clf = GradientBoostingClassifier( + n_estimators=50, + learning_rate=0.1, + max_depth=3, + random_state=seed, + subsample=0.8, + min_samples_split=5, + ) + next_edit_clf.fit(features_scaled, next_edit_labels, sample_weight=sample_weight) + + # Train risk regressor (keep Ridge for regression task) + risk_reg = Ridge(alpha=1.0, random_state=seed) + risk_reg.fit(features_scaled, risk_labels, sample_weight=sample_weight) + + return root_cause_clf, next_edit_clf, risk_reg, scaler + + +def save_models( + model_dir: str, + root_cause_clf: Any, + next_edit_clf: Any, + risk_reg: Any, + scaler: Any, + feature_name_map: dict, + stats: dict, +): + """Save trained models and metadata.""" + if not ML_AVAILABLE: + return + + Path(model_dir).mkdir(parents=True, exist_ok=True) + + # Save models + model_data = { + "root_cause_clf": root_cause_clf, + "next_edit_clf": next_edit_clf, + "risk_reg": risk_reg, + "scaler": scaler, + } + joblib.dump(model_data, Path(model_dir) / "model.joblib") + + # Save feature map + with open(Path(model_dir) / "feature_map.json", "w") as f: + json.dump(feature_name_map, f, indent=2) + + # Save training stats + with open(Path(model_dir) / "training_stats.json", "w") as f: + json.dump(stats, f, indent=2) + + +def is_source_file(file_path: str) -> bool: + """Check if a file is a source code file (not test, config, or docs).""" + path = Path(file_path) + + # Skip test files and test directories + if any(part in ['test', 'tests', '__tests__', 'spec'] for part in path.parts): + return False + if path.name.startswith('test_') or path.name.endswith('_test.py') or '.test.' in path.name or '.spec.' 
in path.name: + return False + + # Skip documentation + if path.suffix.lower() in ['.md', '.rst', '.txt', '.yaml', '.yml']: + return False + + # Skip configuration files + config_files = { + '.gitignore', '.gitattributes', '.editorconfig', + 'pyproject.toml', 'setup.py', 'setup.cfg', + 'package.json', 'package-lock.json', 'yarn.lock', + 'Makefile', 'makefile', 'requirements.txt', + 'Dockerfile', 'docker-compose.yml', '.dockerignore', + '.env', '.env.example', 'tsconfig.json', 'jest.config.js', + 'webpack.config.js', 'babel.config.js', '.eslintrc.js', + '.prettierrc', 'tox.ini', 'pytest.ini' + } + if path.name.lower() in config_files: + return False + + # Skip non-source extensions + non_source_exts = { + '.json', '.xml', '.lock', '.log', '.bak', + '.tmp', '.temp', '.cache', '.pid', '.sock' + } + if path.suffix.lower() in non_source_exts and path.name != 'manifest.json': + return False + + # Skip directories that are typically not source + skip_dirs = {'docs', 'documentation', 'examples', 'samples', 'fixtures'} + if any(part.lower() in skip_dirs for part in path.parts): + return False + + # Accept common source file extensions + source_exts = { + '.py', '.js', '.jsx', '.ts', '.tsx', '.java', '.go', + '.cs', '.cpp', '.cc', '.c', '.h', '.hpp', '.rs', + '.rb', '.php', '.swift', '.kt', '.scala', '.lua', + '.sh', '.bash', '.ps1', '.sql' + } + + return path.suffix.lower() in source_exts + + +def load_models(model_dir: str) -> tuple[Any, Any, Any, Any, dict]: + """Load trained models.""" + if not ML_AVAILABLE: + return None, None, None, None, {} + + model_path = Path(model_dir) / "model.joblib" + if not model_path.exists(): + return None, None, None, None, {} + + try: + model_data = joblib.load(model_path) + + with open(Path(model_dir) / "feature_map.json") as f: + feature_map = json.load(f) + + return ( + model_data["root_cause_clf"], + model_data["next_edit_clf"], + model_data["risk_reg"], + model_data["scaler"], + feature_map, + ) + except (ImportError, ValueError, AttributeError): + # ML unavailable - return graceful defaults + return None, None, None, None, {} + + +def learn( + db_path: str = "./.pf/repo_index.db", + manifest_path: str = "./.pf/manifest.json", + journal_path: str = "./.pf/journal.ndjson", + fce_path: str = "./.pf/fce.json", + ast_path: str = "./.pf/ast_proofs.json", + enable_git: bool = False, + model_dir: str = "./.pf/ml", + window: int = 50, + seed: int = 13, + print_stats: bool = False, + feedback_path: str = None, + train_on: str = "full", +) -> dict[str, Any]: + """Train ML models from artifacts.""" + if not check_ml_available(): + return {"success": False, "error": "ML not available"} + + # Get all files from manifest + try: + with open(manifest_path) as f: + manifest = json.load(f) + all_file_paths = [entry["path"] for entry in manifest] + + # Filter to only source files + file_paths = [fp for fp in all_file_paths if is_source_file(fp)] + + if print_stats: + excluded_count = len(all_file_paths) - len(file_paths) + if excluded_count > 0: + print(f"Excluded {excluded_count} non-source files (tests, docs, configs)") + + except Exception as e: + return {"success": False, "error": f"Failed to load manifest: {e}"} + + if not file_paths: + return {"success": False, "error": "No source files found in manifest"} + + # Define history directory + history_dir = Path("./.pf/history") + + # Load historical data based on train_on parameter + journal_stats = load_journal_stats(history_dir, window, run_type=train_on) + rca_stats = load_rca_stats(history_dir, run_type=train_on) + 
ast_stats = load_ast_stats(history_dir, run_type=train_on) + + # Build features with loaded stats + features, feature_name_map = build_feature_matrix( + file_paths, + manifest_path, + db_path, + journal_stats, + rca_stats, + ast_stats, + enable_git, + ) + + # Build labels with loaded stats + root_cause_labels, next_edit_labels, risk_labels = build_labels( + file_paths, + journal_stats, + rca_stats, + ) + + # Load human feedback if provided + sample_weight = None + if feedback_path and Path(feedback_path).exists(): + try: + with open(feedback_path) as f: + feedback_data = json.load(f) + + # Create sample weights array + sample_weight = np.ones(len(file_paths)) + + # Increase weight for files with human feedback + for i, fp in enumerate(file_paths): + if fp in feedback_data: + # Weight human-reviewed files 5x higher + sample_weight[i] = 5.0 + + # Also update labels based on feedback + feedback = feedback_data[fp] + if "is_risky" in feedback: + # Human says file is risky - treat as positive for risk + risk_labels[i] = 1.0 if feedback["is_risky"] else 0.0 + if "is_root_cause" in feedback: + # Human says file is root cause + root_cause_labels[i] = 1.0 if feedback["is_root_cause"] else 0.0 + if "will_need_edit" in feedback: + # Human says file will need editing + next_edit_labels[i] = 1.0 if feedback["will_need_edit"] else 0.0 + + if print_stats: + feedback_count = sum(1 for fp in file_paths if fp in feedback_data) + print(f"Incorporating human feedback for {feedback_count} files") + + except Exception as e: + if print_stats: + print(f"Warning: Could not load feedback file: {e}") + + # Check data size + n_samples = len(file_paths) + cold_start = n_samples < 500 + + if print_stats: + print(f"Training on {n_samples} files") + print(f"Features: {features.shape[1]} dimensions") + print(f"Root cause positive: {np.sum(root_cause_labels)}/{n_samples}") + print(f"Next edit positive: {np.sum(next_edit_labels)}/{n_samples}") + print(f"Mean risk: {np.mean(risk_labels):.3f}") + if cold_start: + print("WARNING: Cold-start with <500 samples, expect noisy signals") + + # Train models with optional sample weights from human feedback + root_cause_clf, next_edit_clf, risk_reg, scaler = train_models( + features, + root_cause_labels, + next_edit_labels, + risk_labels, + seed, + sample_weight=sample_weight, + ) + + # Calculate simple metrics + stats = { + "n_samples": n_samples, + "n_features": features.shape[1], + "root_cause_positive_ratio": float(np.mean(root_cause_labels)), + "next_edit_positive_ratio": float(np.mean(next_edit_labels)), + "mean_risk": float(np.mean(risk_labels)), + "cold_start": cold_start, + "timestamp": datetime.now(UTC).isoformat(), + } + + # Save models + save_models( + model_dir, + root_cause_clf, + next_edit_clf, + risk_reg, + scaler, + feature_name_map, + stats, + ) + + if print_stats: + print(f"Models saved to {model_dir}") + + return { + "success": True, + "stats": stats, + "model_dir": model_dir, + "source_files": len(file_paths), + "total_files": len(all_file_paths), + "excluded_count": len(all_file_paths) - len(file_paths), + } + + +def suggest( + db_path: str = "./.pf/repo_index.db", + manifest_path: str = "./.pf/manifest.json", + workset_path: str = "./.pf/workset.json", + fce_path: str = "./.pf/fce.json", + ast_path: str = "./.pf/ast_proofs.json", + model_dir: str = "./.pf/ml", + topk: int = 10, + out_path: str = "./.pf/insights/ml_suggestions.json", + print_plan: bool = False, +) -> dict[str, Any]: + """Generate ML suggestions for workset files.""" + if not check_ml_available(): 
+ return {"success": False, "error": "ML not available"} + + # Load models + root_cause_clf, next_edit_clf, risk_reg, scaler, feature_map = load_models(model_dir) + + if root_cause_clf is None: + print(f"No models found in {model_dir}. Run 'aud learn' first.") + return {"success": False, "error": "Models not found"} + + # Load workset + try: + with open(workset_path) as f: + workset = json.load(f) + all_file_paths = [p["path"] for p in workset.get("paths", [])] + + # Filter to only source files + file_paths = [fp for fp in all_file_paths if is_source_file(fp)] + + if print_plan: + excluded_count = len(all_file_paths) - len(file_paths) + if excluded_count > 0: + print(f"Excluded {excluded_count} non-source files from suggestions") + + except Exception as e: + return {"success": False, "error": f"Failed to load workset: {e}"} + + if not file_paths: + return {"success": False, "error": "No source files in workset"} + + # Load current FCE and AST stats if available + current_fce_stats = {} + if fce_path and Path(fce_path).exists(): + try: + with open(fce_path) as f: + data = json.load(f) + for failure in data.get("failures", []): + file = failure.get("file", "") + if file: + if file not in current_fce_stats: + current_fce_stats[file] = {"fail_count": 0, "categories": [], "messages": []} + current_fce_stats[file]["fail_count"] += 1 + if "category" in failure: + current_fce_stats[file]["categories"].append(failure["category"]) + if "message" in failure: + current_fce_stats[file]["messages"].append(failure["message"][:100]) + except Exception: + pass + + current_ast_stats = {} + if ast_path and Path(ast_path).exists(): + try: + with open(ast_path) as f: + data = json.load(f) + for result in data.get("results", []): + file = result.get("path", "") + if file: + if file not in current_ast_stats: + current_ast_stats[file] = {"invariant_fails": 0, "invariant_passes": 0, "failed_checks": []} + for check in result.get("checks", []): + if check["status"] == "FAIL": + current_ast_stats[file]["invariant_fails"] += 1 + current_ast_stats[file]["failed_checks"].append(check["id"]) + elif check["status"] == "PASS": + current_ast_stats[file]["invariant_passes"] += 1 + except Exception: + pass + + # Build features for workset files + features, _ = build_feature_matrix( + file_paths, + manifest_path, + db_path, + None, # No journal for prediction + current_fce_stats, # Use current FCE if available + current_ast_stats, # Use current AST if available + False, # No git for speed + ) + + # Scale features + features_scaled = scaler.transform(features) + + # Get predictions + root_cause_scores = root_cause_clf.predict_proba(features_scaled)[:, 1] + next_edit_scores = next_edit_clf.predict_proba(features_scaled)[:, 1] + risk_scores = np.clip(risk_reg.predict(features_scaled), 0, 1) + + # Rank files + root_cause_ranked = sorted( + zip(file_paths, root_cause_scores, strict=False), + key=lambda x: x[1], + reverse=True, + )[:topk] + + next_edit_ranked = sorted( + zip(file_paths, next_edit_scores, strict=False), + key=lambda x: x[1], + reverse=True, + )[:topk] + + risk_ranked = sorted( + zip(file_paths, risk_scores, strict=False), + key=lambda x: x[1], + reverse=True, + )[:topk] + + # Build output + output = { + "generated_at": datetime.now(UTC).isoformat(), + "workset_size": len(file_paths), + "likely_root_causes": [ + {"path": path, "score": float(score)} for path, score in root_cause_ranked + ], + "next_files_to_edit": [ + {"path": path, "score": float(score)} for path, score in next_edit_ranked + ], + "risk": [{"path": 
path, "score": float(score)} for path, score in risk_ranked], + } + + # Ensure output directory exists + Path(out_path).parent.mkdir(parents=True, exist_ok=True) + + # Write output atomically + tmp_path = f"{out_path}.tmp" + with open(tmp_path, "w") as f: + json.dump(output, f, indent=2, sort_keys=True) + os.replace(tmp_path, out_path) + + if print_plan: + print(f"Workset: {len(file_paths)} files") + print(f"\nTop {min(5, topk)} likely root causes:") + for item in output["likely_root_causes"][:5]: + print(f" {item['score']:.3f} - {item['path']}") + + print(f"\nTop {min(5, topk)} next files to edit:") + for item in output["next_files_to_edit"][:5]: + print(f" {item['score']:.3f} - {item['path']}") + + print(f"\nTop {min(5, topk)} risk scores:") + for item in output["risk"][:5]: + print(f" {item['score']:.3f} - {item['path']}") + + return { + "success": True, + "out_path": out_path, + "workset_size": len(file_paths), + "original_size": len(all_file_paths), + "excluded_count": len(all_file_paths) - len(file_paths), + "topk": topk, + } diff --git a/theauditor/insights/taint.py b/theauditor/insights/taint.py new file mode 100644 index 0000000..9f3a45d --- /dev/null +++ b/theauditor/insights/taint.py @@ -0,0 +1,446 @@ +"""Interpretive intelligence layer for taint analysis - optional severity scoring and vulnerability classification.""" + +import sqlite3 +import platform +from typing import Dict, List, Any +from collections import defaultdict + +# Detect if running on Windows for character encoding +IS_WINDOWS = platform.system() == "Windows" + + +def calculate_severity(path_data: Dict[str, Any]) -> str: + """ + Calculate severity based on vulnerability type and path complexity. + This is interpretive logic that assigns risk levels. + + Args: + path_data: Dictionary with vulnerability_type and path information + + Returns: + Severity level: "critical", "high", "medium", or "low" + """ + vulnerability_type = path_data.get("vulnerability_type", "") + path_length = len(path_data.get("path", [])) + + high_severity = ["SQL Injection", "Command Injection", "NoSQL Injection"] + medium_severity = ["Cross-Site Scripting (XSS)", "Path Traversal", "LDAP Injection"] + + if vulnerability_type in high_severity: + return "critical" if path_length <= 2 else "high" + elif vulnerability_type in medium_severity: + return "high" if path_length <= 2 else "medium" + else: + return "medium" if path_length <= 3 else "low" + + +def classify_vulnerability(sink: Dict[str, Any], security_sinks: Dict[str, List[str]]) -> str: + """ + Classify the vulnerability based on sink type. + This is interpretive logic that categorizes vulnerabilities. + + Args: + sink: Sink dictionary with name + security_sinks: Mapping of vulnerability types to sink patterns + + Returns: + Human-readable vulnerability type + """ + sink_name = sink["name"].lower() if "name" in sink else "" + + for vuln_type, sinks in security_sinks.items(): + if any(s.lower() in sink_name for s in sinks): + return { + "sql": "SQL Injection", + "command": "Command Injection", + "xss": "Cross-Site Scripting (XSS)", + "path": "Path Traversal", + "ldap": "LDAP Injection", + "nosql": "NoSQL Injection" + }.get(vuln_type, vuln_type.upper()) + + return "Data Exposure" + + +def is_vulnerable_sink(cursor: sqlite3.Cursor, sink: Dict[str, Any], context: Dict[str, Any]) -> bool: + """ + Check if a sink is actually vulnerable based on context. + This is interpretive logic that makes security judgments. + + For example, parameterized queries are safe even if they use execute(). 
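+
+    A minimal calling sketch (the sink dict must carry "name", "category",
+    "file", and "line"; the file and line values here are purely illustrative):
+
+        conn = sqlite3.connect("./.pf/repo_index.db")
+        sink = {"name": "cursor.execute", "category": "sql",
+                "file": "app/db.py", "line": 42}
+        vulnerable = is_vulnerable_sink(conn.cursor(), sink, context={})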
+ + Args: + cursor: SQLite cursor for querying code patterns + sink: Sink dictionary with name and category + context: Context information about the sink + + Returns: + True if the sink is judged to be vulnerable, False if safe + """ + # Direct access - sinks must have name and category + sink_name = sink["name"].lower() if "name" in sink else "" + sink_category = sink["category"] if "category" in sink else "" + + # SQL injection context checking + if sink_category == "sql" or "execute" in sink_name or "query" in sink_name: + # Check if this is a parameterized query + # Look for patterns that indicate parameterization + cursor.execute(""" + SELECT name + FROM symbols + WHERE path = ? + AND type = 'call' + AND line = ? + """, (sink["file"], sink["line"])) + + call_at_line = cursor.fetchone() + if call_at_line: + call_text = call_at_line[0] + # Heuristics for parameterized queries + # If using ? or %s placeholders, it's likely parameterized + # If using prepare/bind patterns, it's safe + safe_patterns = [ + "prepare", + "bind", + "execute(", # With parameters + "executemany", + "format(", # SQL formatting functions + "sql.SQL", + "sql.Identifier", + "text(", # SQLAlchemy safe text + ] + + for pattern in safe_patterns: + if pattern in call_text: + return False # Not vulnerable - using safe pattern + + # Check for dangerous patterns (string concatenation) + dangerous_patterns = [ + "+", # String concatenation + ".format", # String formatting (when not SQL.format) + "f\"", # F-strings + "%", # Old-style formatting + ] + + # Get the actual code around the sink to check for concatenation + cursor.execute(""" + SELECT name + FROM symbols + WHERE path = ? + AND type = 'call' + AND line >= ? + AND line <= ? + """, (sink["file"], sink["line"] - 1, sink["line"] + 1)) + + nearby_calls = cursor.fetchall() + for call in nearby_calls: + call_str = str(call[0]) + for pattern in dangerous_patterns: + if pattern in call_str and "sql" not in call_str.lower(): + return True # Vulnerable - using dangerous pattern + + # Command injection context checking + elif sink_category == "command" or any(cmd in sink_name for cmd in ["system", "exec", "spawn"]): + # Check if using shell=False or proper escaping + cursor.execute(""" + SELECT name + FROM symbols + WHERE path = ? + AND line = ? + """, (sink["file"], sink["line"])) + + call_details = cursor.fetchone() + if call_details: + call_text = call_details[0] + # Safe patterns for command execution + if "shell=False" in call_text or "shlex" in call_text: + return False # Not vulnerable - using safe execution + + # Path traversal context checking + elif sink_category == "path": + # Check if path is validated/sanitized + cursor.execute(""" + SELECT name + FROM symbols + WHERE path = ? + AND type = 'call' + AND line >= ? + AND line <= ? + """, (sink["file"], sink["line"] - 3, sink["line"])) + + recent_calls = cursor.fetchall() + for call in recent_calls: + if any(san in str(call[0]) for san in ["basename", "secure_filename", "normalize"]): + return False # Path is sanitized + + # Default: consider it vulnerable if we can't prove it's safe + return True + + +def generate_summary(paths: List[Dict[str, Any]]) -> Dict[str, Any]: + """ + Generate a summary of the taint analysis results. + This is interpretive logic that creates risk assessments and recommendations. 
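+
+    A minimal calling sketch (each path dict needs "vulnerability_type" and a
+    "path" list; the two illustrative steps below form a short path, which
+    calculate_severity grades as critical for SQL Injection):
+
+        summary = generate_summary([{
+            "vulnerability_type": "SQL Injection",
+            "path": [{"name": "req.query.id"}, {"name": "cursor.execute"}],
+        }])
+        summary["risk_level"]   # "critical"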
+ + Args: + paths: List of taint path dictionaries + + Returns: + Summary with risk levels and recommendations + """ + if not paths: + return { + "risk_level": "low", + "critical_count": 0, + "high_count": 0, + "medium_count": 0, + "low_count": 0, + "recommendation": "No direct taint paths found. Continue monitoring for indirect flows." + } + + severity_counts = defaultdict(int) + for path in paths: + # Calculate severity for each path + severity = calculate_severity(path) + severity_counts[severity] += 1 + + # Determine overall risk level with clear explanation + critical_count = severity_counts.get("critical", 0) + high_count = severity_counts.get("high", 0) + medium_count = severity_counts.get("medium", 0) + low_count = severity_counts.get("low", 0) + + if critical_count > 0: + risk_level = "critical" + recommendation = f"URGENT: Critical risk level assigned due to {critical_count} critical-severity vulnerability(ies). Immediate remediation required!" + elif high_count > 2: + risk_level = "high" + recommendation = f"High risk level assigned due to {high_count} high-severity vulnerabilities. Priority remediation needed." + elif high_count > 0: + risk_level = "medium" + recommendation = f"Medium risk level assigned due to {high_count} high-severity vulnerability(ies) found. Schedule remediation in next sprint." + elif medium_count > 5: + risk_level = "medium" + recommendation = f"Medium risk level assigned due to high volume ({medium_count}) of medium-severity findings. Review and prioritize fixes." + else: + risk_level = "low" + recommendation = f"Low risk level assigned. Found {medium_count} medium and {low_count} low severity issues. Review and address as time permits." + + return { + "risk_level": risk_level, + "critical_count": severity_counts.get("critical", 0), + "high_count": severity_counts.get("high", 0), + "medium_count": severity_counts.get("medium", 0), + "low_count": severity_counts.get("low", 0), + "recommendation": recommendation, + "most_common_vulnerability": max( + [(v, k) for k, v in severity_counts.items()], + default=(0, "None") + )[1] if paths else "None" + } + + +def format_taint_report(analysis_result: Dict[str, Any]) -> str: + """ + Format taint analysis results into a human-readable report. + This is interpretive presentation logic. 
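+
+    A minimal calling sketch (only "success" is strictly required; the other
+    keys shown are the ones the report reads, and default to empty or zero):
+
+        text = format_taint_report({
+            "success": True,
+            "summary": {"risk_level": "low", "recommendation": "No action needed."},
+            "sources_found": 3, "sinks_found": 2, "total_vulnerabilities": 0,
+        })
+        print(text)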
+ + Args: + analysis_result: Raw analysis results from trace_taint + + Returns: + Formatted string report + """ + lines = [] + + # Use ASCII characters on Windows, Unicode elsewhere + if IS_WINDOWS: + border_char = "=" + section_char = "-" + arrow = "->" + else: + border_char = "=" + section_char = "─" + arrow = "→" + + # Header + lines.append(border_char * 60) + lines.append("TAINT ANALYSIS SECURITY REPORT") + lines.append(border_char * 60) + + if not analysis_result.get("success"): + lines.append(f"\nError: {analysis_result.get('error', 'Unknown error')}") + return "\n".join(lines) + + # Summary + summary = analysis_result.get("summary", {}) + lines.append(f"\nRisk Level: {summary.get('risk_level', '').upper()}") + lines.append(f"Recommendation: {summary.get('recommendation', '')}") + + # Statistics + lines.append(f"\n{section_char * 40}") + lines.append("SCAN STATISTICS") + lines.append(f"{section_char * 40}") + lines.append(f"Taint Sources Found: {analysis_result.get('sources_found', 0)}") + lines.append(f"Security Sinks Found: {analysis_result.get('sinks_found', 0)}") + lines.append(f"Total Vulnerabilities: {analysis_result.get('total_vulnerabilities', 0)}") + + # Vulnerabilities by type + vuln_types = analysis_result.get("vulnerabilities_by_type", {}) + if vuln_types: + lines.append(f"\n{section_char * 40}") + lines.append("VULNERABILITIES BY TYPE") + lines.append(f"{section_char * 40}") + for vuln_type, count in sorted(vuln_types.items(), key=lambda x: x[1], reverse=True): + lines.append(f" {vuln_type}: {count}") + + # Severity breakdown + lines.append(f"\n{section_char * 40}") + lines.append("SEVERITY BREAKDOWN") + lines.append(f"{section_char * 40}") + lines.append(f" CRITICAL: {summary.get('critical_count', 0)}") + lines.append(f" HIGH: {summary.get('high_count', 0)}") + lines.append(f" MEDIUM: {summary.get('medium_count', 0)}") + lines.append(f" LOW: {summary.get('low_count', 0)}") + + # Detailed paths (limit to top 10) + # Handle both "taint_paths" and "paths" keys for compatibility + paths = analysis_result.get("taint_paths", analysis_result.get("paths", [])) + if paths: + lines.append(f"\n{section_char * 40}") + lines.append("TOP VULNERABILITY PATHS") + lines.append(f"{section_char * 40}") + + # Sort by severity + sorted_paths = sorted(paths, key=lambda p: ( + {"critical": 0, "high": 1, "medium": 2, "low": 3}.get(p.get("severity", "unknown"), 4), + p.get("path_length", 0) + )) + + for i, path in enumerate(sorted_paths[:10], 1): + lines.append(f"\n{i}. {path.get('vulnerability_type', 'Unknown')} ({path.get('severity', 'unknown').upper()})") + lines.append(f" Source: {path.get('source', {}).get('name', '')} at {path.get('source', {}).get('file', '')}:{path.get('source', {}).get('line', 0)}") + lines.append(f" Sink: {path.get('sink', {}).get('name', '')} at {path.get('sink', {}).get('file', '')}:{path.get('sink', {}).get('line', 0)}") + lines.append(f" Path Length: {path.get('path_length', 0)} steps") + + if len(path.get('path', [])) <= 4: + lines.append(" Flow:") + for step in path.get('path', []): + if isinstance(step, dict): + lines.append(f" {arrow} {step.get('name', '')}") + + lines.append("\n" + border_char * 60) + + return "\n".join(lines) + + +def get_taint_summary(taint_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Distill potentially large taint analysis data into a concise, AI-readable summary. + This is interpretive intelligence that extracts key insights. 
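+
+    A minimal calling sketch (expects a dict with a "vulnerabilities" list;
+    an empty list still yields a complete, zeroed summary):
+
+        summary = get_taint_summary({"vulnerabilities": [], "total_paths": 0})
+        summary["statistics"]["total_vulnerabilities"]   # 0
+        summary["security_insights"]["risk_level"]       # "low"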
+ + This function solves the "200MB file paradox" by extracting key insights + from large taint analysis results that the AI cannot read directly. + + Args: + taint_data: Large taint analysis dict with vulnerability paths + + Returns: + Concise summary (<1MB) with key security insights + """ + vulnerabilities = taint_data.get("vulnerabilities", []) + + # Count vulnerabilities by type + vuln_by_type = defaultdict(int) + vuln_by_severity = defaultdict(int) + source_files = set() + sink_files = set() + + for vuln in vulnerabilities: + # Categorize by type + vuln_type = vuln.get("vulnerability_type", "") # Empty not unknown + vuln_by_type[vuln_type] += 1 + + # Categorize by severity + severity = vuln.get("severity", "medium") + vuln_by_severity[severity] += 1 + + # Track source and sink files + if "source" in vuln: + source_files.add(vuln["source"].get("file", "")) # Empty not unknown + if "sink" in vuln: + sink_files.add(vuln["sink"].get("file", "")) # Empty not unknown + + # Find top risky source files (files that originate the most vulnerabilities) + source_file_counts = defaultdict(int) + for vuln in vulnerabilities[:100]: # Limit for efficiency + if "source" in vuln: + source_file = vuln["source"].get("file", "") # Empty not unknown + source_file_counts[source_file] += 1 + + top_source_files = sorted( + source_file_counts.items(), + key=lambda x: x[1], + reverse=True + )[:5] + + # Find top vulnerable sinks (functions that are most frequently vulnerable) + sink_counts = defaultdict(int) + for vuln in vulnerabilities[:100]: # Limit for efficiency + if "sink" in vuln: + sink_name = vuln["sink"].get("name", "") # Empty not unknown + sink_counts[sink_name] += 1 + + top_sinks = sorted( + sink_counts.items(), + key=lambda x: x[1], + reverse=True + )[:5] + + # Extract critical vulnerabilities (first 5 high/critical severity) + critical_vulns = [] + for vuln in vulnerabilities: + if vuln.get("severity") in ["critical", "high"] and len(critical_vulns) < 5: + # Create a condensed version + critical_vulns.append({ + "type": vuln.get("vulnerability_type", ""), # Empty not unknown + "severity": vuln.get("severity"), + "source": f"{vuln.get('source', {}).get('file', '')}:{vuln.get('source', {}).get('line', 0)}", # Empty not unknown + "sink": f"{vuln.get('sink', {}).get('file', '')}:{vuln.get('sink', {}).get('line', 0)}", # Empty not unknown + "path_length": len(vuln.get("path", [])) + }) + + # Create summary + summary = { + "statistics": { + "total_vulnerabilities": len(vulnerabilities), + "unique_source_files": len(source_files), + "unique_sink_files": len(sink_files), + "total_paths_analyzed": taint_data.get("total_paths", 0) + }, + "vulnerabilities_by_type": dict(vuln_by_type), + "vulnerabilities_by_severity": dict(vuln_by_severity), + "top_risky_source_files": [ + {"file": file, "vulnerability_count": count} + for file, count in top_source_files + ], + "top_vulnerable_sinks": [ + {"sink": sink, "occurrence_count": count} + for sink, count in top_sinks + ], + "critical_vulnerabilities": critical_vulns, + "security_insights": { + "has_sql_injection": vuln_by_type.get("sql_injection", 0) > 0, + "has_xss": vuln_by_type.get("xss", 0) > 0, + "has_command_injection": vuln_by_type.get("command_injection", 0) > 0, + "has_path_traversal": vuln_by_type.get("path_traversal", 0) > 0, + "critical_count": vuln_by_severity.get("critical", 0), + "high_count": vuln_by_severity.get("high", 0), + "risk_level": "critical" if vuln_by_severity.get("critical", 0) > 0 + else "high" if vuln_by_severity.get("high", 0) > 5 + 
else "medium" if len(vulnerabilities) > 10 + else "low" + } + } + + return summary \ No newline at end of file diff --git a/theauditor/journal.py b/theauditor/journal.py new file mode 100644 index 0000000..9907b78 --- /dev/null +++ b/theauditor/journal.py @@ -0,0 +1,446 @@ +"""Journal system for tracking audit execution history. + +This module provides functionality to write and read execution journals in NDJSON format. +The journal tracks all pipeline events, file touches, and results for ML training. +""" + +import json +import os +from datetime import datetime, UTC +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + + +class JournalWriter: + """Writes execution events to journal.ndjson file.""" + + def __init__(self, journal_path: str = "./.pf/journal.ndjson", history_dir: Optional[str] = None): + """Initialize journal writer. + + Args: + journal_path: Path to the journal file + history_dir: Optional history directory for archival copies + """ + self.journal_path = Path(journal_path) + self.history_dir = Path(history_dir) if history_dir else None + self.session_id = datetime.now(UTC).strftime("%Y%m%d_%H%M%S") + + # Ensure parent directory exists + self.journal_path.parent.mkdir(parents=True, exist_ok=True) + + # Open file in append mode for continuous writing + self.file_handle = None + self._open_journal() + + def _open_journal(self): + """Open journal file for writing.""" + try: + self.file_handle = open(self.journal_path, 'a', encoding='utf-8', buffering=1) + except Exception as e: + print(f"[WARNING] Could not open journal file {self.journal_path}: {e}") + self.file_handle = None + + def write_event(self, event_type: str, data: Dict[str, Any]) -> bool: + """Write an event to the journal. + + Args: + event_type: Type of event (phase, file_touch, result, error, etc.) + data: Event data dictionary + + Returns: + True if written successfully, False otherwise + """ + if not self.file_handle: + return False + + try: + event = { + "timestamp": datetime.now(UTC).isoformat(), + "session_id": self.session_id, + "event_type": event_type, + **data + } + + # Write as NDJSON (one JSON object per line) + json.dump(event, self.file_handle) + self.file_handle.write('\n') + self.file_handle.flush() # Force write to disk + return True + + except Exception as e: + print(f"[WARNING] Failed to write journal event: {e}") + return False + + def phase_start(self, phase_name: str, command: str, phase_num: int = 0) -> bool: + """Record the start of a pipeline phase. + + Args: + phase_name: Human-readable phase name + command: Command being executed + phase_num: Phase number in sequence + """ + return self.write_event("phase_start", { + "phase": phase_name, + "command": command, + "phase_num": phase_num + }) + + def phase_end(self, phase_name: str, success: bool, elapsed: float, + exit_code: int = 0, error_msg: Optional[str] = None) -> bool: + """Record the end of a pipeline phase. + + Args: + phase_name: Human-readable phase name + success: Whether phase succeeded + elapsed: Execution time in seconds + exit_code: Process exit code + error_msg: Optional error message + """ + return self.write_event("phase_end", { + "phase": phase_name, + "result": "success" if success else "fail", + "elapsed": elapsed, + "exit_code": exit_code, + "error": error_msg + }) + + def file_touch(self, file_path: str, operation: str = "analyze", + success: bool = True, findings: int = 0) -> bool: + """Record a file being touched/analyzed. 
+ + Args: + file_path: Path to the file + operation: Type of operation (analyze, modify, create, etc.) + success: Whether operation succeeded + findings: Number of findings/issues found + """ + return self.write_event("file_touch", { + "file": file_path, + "operation": operation, + "result": "success" if success else "fail", + "findings": findings + }) + + def finding(self, file_path: str, severity: str, category: str, + message: str, line: Optional[int] = None) -> bool: + """Record a specific finding/issue. + + Args: + file_path: File where finding was detected + severity: Severity level (critical, high, medium, low) + category: Category of finding + message: Finding message + line: Optional line number + """ + return self.write_event("finding", { + "file": file_path, + "severity": severity, + "category": category, + "message": message, + "line": line + }) + + def apply_patch(self, file_path: str, success: bool, + patch_type: str = "fix", error_msg: Optional[str] = None) -> bool: + """Record a patch/fix being applied to a file. + + Args: + file_path: File being patched + success: Whether patch succeeded + patch_type: Type of patch (fix, refactor, update, etc.) + error_msg: Optional error message + """ + return self.write_event("apply_patch", { + "file": file_path, + "result": "success" if success else "fail", + "patch_type": patch_type, + "error": error_msg + }) + + def pipeline_summary(self, total_phases: int, failed_phases: int, + total_files: int, total_findings: int, + elapsed: float, status: str = "complete") -> bool: + """Record pipeline execution summary. + + Args: + total_phases: Total number of phases executed + failed_phases: Number of failed phases + total_files: Total files analyzed + total_findings: Total findings detected + elapsed: Total execution time + status: Overall status (complete, partial, failed) + """ + return self.write_event("pipeline_summary", { + "total_phases": total_phases, + "failed_phases": failed_phases, + "total_files": total_files, + "total_findings": total_findings, + "elapsed": elapsed, + "status": status + }) + + def close(self, copy_to_history: bool = True): + """Close the journal file and optionally copy to history. + + Args: + copy_to_history: Whether to copy journal to history directory + """ + if self.file_handle: + try: + self.file_handle.close() + except: + pass + self.file_handle = None + + # Copy to history if requested and history_dir is set + if copy_to_history and self.history_dir and self.journal_path.exists(): + try: + import shutil + self.history_dir.mkdir(parents=True, exist_ok=True) + dest_path = self.history_dir / f"journal_{self.session_id}.ndjson" + shutil.copy2(self.journal_path, dest_path) + print(f"[INFO] Journal copied to history: {dest_path}") + except Exception as e: + print(f"[WARNING] Could not copy journal to history: {e}") + + def __enter__(self): + """Context manager entry.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit - close journal.""" + self.close() + + +class JournalReader: + """Reads and queries journal.ndjson files.""" + + def __init__(self, journal_path: str = "./.pf/journal.ndjson"): + """Initialize journal reader. + + Args: + journal_path: Path to the journal file + """ + self.journal_path = Path(journal_path) + + def read_events(self, event_type: Optional[str] = None, + since: Optional[datetime] = None, + session_id: Optional[str] = None) -> List[Dict[str, Any]]: + """Read events from journal with optional filtering. 
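+
+        A brief usage sketch (the journal path is the default; the time
+        window is illustrative):
+
+            from datetime import datetime, UTC, timedelta
+
+            reader = JournalReader("./.pf/journal.ndjson")
+            recent_touches = reader.read_events(
+                event_type="file_touch",
+                since=datetime.now(UTC) - timedelta(hours=1),
+            )
+            # Only file_touch events written in the last hour are returned;
+            # malformed JSON lines are skipped with a warning.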
+ + Args: + event_type: Filter by event type + since: Only events after this timestamp + session_id: Filter by session ID + + Returns: + List of matching events + """ + if not self.journal_path.exists(): + return [] + + events = [] + try: + with open(self.journal_path, 'r', encoding='utf-8') as f: + for line_num, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + + try: + event = json.loads(line) + + # Apply filters + if event_type and event.get("event_type") != event_type: + continue + + if session_id and event.get("session_id") != session_id: + continue + + if since: + event_time = datetime.fromisoformat(event.get("timestamp", "")) + if event_time < since: + continue + + events.append(event) + + except json.JSONDecodeError: + print(f"[WARNING] Skipping malformed JSON at line {line_num}") + continue + + except Exception as e: + print(f"[WARNING] Error reading journal: {e}") + + return events + + def get_file_stats(self) -> Dict[str, Dict[str, int]]: + """Get statistics for file touches and failures. + + Returns: + Dict mapping file paths to stats (touches, failures, successes) + """ + stats = {} + + for event in self.read_events(event_type="file_touch"): + file_path = event.get("file", "") + if not file_path: + continue + + if file_path not in stats: + stats[file_path] = { + "touches": 0, + "failures": 0, + "successes": 0, + "findings": 0 + } + + stats[file_path]["touches"] += 1 + + if event.get("result") == "fail": + stats[file_path]["failures"] += 1 + else: + stats[file_path]["successes"] += 1 + + stats[file_path]["findings"] += event.get("findings", 0) + + # Also count apply_patch events + for event in self.read_events(event_type="apply_patch"): + file_path = event.get("file", "") + if not file_path: + continue + + if file_path not in stats: + stats[file_path] = { + "touches": 0, + "failures": 0, + "successes": 0, + "findings": 0 + } + + stats[file_path]["touches"] += 1 + + if event.get("result") == "fail": + stats[file_path]["failures"] += 1 + else: + stats[file_path]["successes"] += 1 + + return stats + + def get_phase_stats(self) -> Dict[str, Dict[str, Any]]: + """Get statistics for pipeline phases. + + Returns: + Dict mapping phase names to execution stats + """ + stats = {} + + # Track phase starts + for event in self.read_events(event_type="phase_start"): + phase = event.get("phase", "") + if not phase: + continue + + if phase not in stats: + stats[phase] = { + "executions": 0, + "failures": 0, + "total_elapsed": 0.0, + "last_executed": None + } + + stats[phase]["executions"] += 1 + stats[phase]["last_executed"] = event.get("timestamp") + + # Track phase ends + for event in self.read_events(event_type="phase_end"): + phase = event.get("phase", "") + if not phase or phase not in stats: + continue + + if event.get("result") == "fail": + stats[phase]["failures"] += 1 + + stats[phase]["total_elapsed"] += event.get("elapsed", 0.0) + + return stats + + def get_recent_failures(self, limit: int = 10) -> List[Dict[str, Any]]: + """Get recent failure events. 
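+
+        A brief usage sketch:
+
+            reader = JournalReader("./.pf/journal.ndjson")
+            for event in reader.get_recent_failures(limit=5):
+                print(event.get("event_type"), event.get("phase") or event.get("file"))
+            # Yields up to five of the newest events whose result is "fail"
+            # or whose event_type is "error", newest first.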
+ + Args: + limit: Maximum number of failures to return + + Returns: + List of recent failure events + """ + failures = [] + + # Get all failure events + for event in self.read_events(): + if event.get("result") == "fail" or event.get("event_type") == "error": + failures.append(event) + + # Sort by timestamp (most recent first) + failures.sort(key=lambda x: x.get("timestamp", ""), reverse=True) + + return failures[:limit] + + +# Integration functions for pipeline +def get_journal_writer(run_type: str = "full") -> JournalWriter: + """Get a journal writer for the current run. + + Args: + run_type: Type of run (full, diff, etc.) + + Returns: + JournalWriter instance + """ + # Determine history directory based on run type + history_dir = Path("./.pf/history") / run_type / datetime.now(UTC).strftime("%Y%m%d_%H%M%S") + + return JournalWriter( + journal_path="./.pf/journal.ndjson", + history_dir=str(history_dir) + ) + + +def integrate_with_pipeline(pipeline_func): + """Decorator to integrate journal writing with pipeline execution. + + This decorator wraps pipeline functions to automatically write journal events. + """ + def wrapper(*args, **kwargs): + # Get or create journal writer + journal = kwargs.pop('journal', None) + close_journal = False + + if journal is None: + journal = get_journal_writer(kwargs.get('run_type', 'full')) + close_journal = True + + try: + # Inject journal into kwargs + kwargs['journal'] = journal + + # Execute pipeline + result = pipeline_func(*args, **kwargs) + + # Write summary if available + if isinstance(result, dict): + journal.pipeline_summary( + total_phases=result.get('total_phases', 0), + failed_phases=result.get('failed_phases', 0), + total_files=len(result.get('created_files', [])), + total_findings=result.get('findings', {}).get('total_vulnerabilities', 0), + elapsed=result.get('elapsed_time', 0.0), + status='complete' if result.get('success') else 'failed' + ) + + return result + + finally: + if close_journal: + journal.close() + + return wrapper \ No newline at end of file diff --git a/theauditor/js_init.py b/theauditor/js_init.py new file mode 100644 index 0000000..8ce893f --- /dev/null +++ b/theauditor/js_init.py @@ -0,0 +1,154 @@ +"""JavaScript/TypeScript project initialization.""" + +import json +from pathlib import Path + + +def deep_merge(base: dict, overlay: dict) -> dict: + """ + Deep merge overlay into base, only adding missing keys. + + Existing values in base are never overwritten. + """ + result = base.copy() + + for key, value in overlay.items(): + if key not in result: + result[key] = value + elif isinstance(value, dict) and isinstance(result[key], dict): + # Recursively merge nested dicts + result[key] = deep_merge(result[key], value) + + return result + + +def ensure_package_json(path: str) -> dict[str, str]: + """ + Create or merge minimal package.json for lint/typecheck. 
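+
+    A minimal usage sketch (the path is illustrative):
+
+        result = ensure_package_json("frontend/package.json")
+        if result["status"] == "created":
+            print("Wrote a new minimal package.json")
+        elif result["status"] == "merged":
+            print("Added missing lint/typecheck entries to the existing file")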
+ + Returns: + {"status": "created"} if new file created + {"status": "merged"} if existing file updated + {"status": "unchanged"} if no changes needed + """ + package_path = Path(path) + + # Template with PIN_ME placeholders + template = { + "private": True, + "devDependencies": { + "eslint": "", + "@typescript-eslint/parser": "", + "@typescript-eslint/eslint-plugin": "", + "typescript": "", + "prettier": "", + }, + "scripts": { + "lint": "eslint .", + "typecheck": "tsc --noEmit", + "format": "prettier -c .", + }, + } + + if package_path.exists(): + # Load existing + with open(package_path) as f: + existing = json.load(f) + + # Deep merge + merged = deep_merge(existing, template) + + if merged == existing: + return {"status": "unchanged"} + + # Write merged version + with open(package_path, "w") as f: + json.dump(merged, f, indent=2) + + return {"status": "merged"} + else: + # Create new file + with open(package_path, "w") as f: + json.dump(template, f, indent=2) + + return {"status": "created"} + + +def add_auditor_hooks(path: str) -> dict[str, str]: + """ + Add TheAuditor hooks to package.json scripts non-destructively. + + Adds the following hooks: + - pretest: aud lint --workset + - prebuild: aud ast-verify + - prepush: aud taint-analyze + + If hooks already exist, prepends Auditor commands with &&. + + Args: + path: Path to package.json file + + Returns: + {"status": "hooks_added", "details": } if hooks were added + {"status": "unchanged"} if all hooks already present + {"status": "error", "message": } if error occurred + """ + package_path = Path(path) + + # Check if file exists + if not package_path.exists(): + return {"status": "error", "message": f"File not found: {path}"} + + try: + # Read existing package.json + with open(package_path, 'r') as f: + package_data = json.load(f) + + # Ensure scripts object exists + if "scripts" not in package_data: + package_data["scripts"] = {} + + scripts = package_data["scripts"] + + # Define desired Auditor hooks + auditor_hooks = { + "pretest": "aud lint --workset", + "prebuild": "aud ast-verify", + "prepush": "aud taint-analyze" + } + + changes = [] + + for hook_name, auditor_cmd in auditor_hooks.items(): + if hook_name not in scripts: + # Hook doesn't exist, add it + scripts[hook_name] = auditor_cmd + changes.append(f"Added {hook_name}: {auditor_cmd}") + else: + existing_cmd = scripts[hook_name] + + # Check if Auditor command is already present + if auditor_cmd in existing_cmd: + # Already has the command, skip + continue + + # Prepend Auditor command with && + new_cmd = f"{auditor_cmd} && {existing_cmd}" + scripts[hook_name] = new_cmd + changes.append(f"Modified {hook_name}: prepended {auditor_cmd}") + + if not changes: + return {"status": "unchanged"} + + # Write modified package.json with 2-space indent + with open(package_path, 'w') as f: + json.dump(package_data, f, indent=2) + # Add trailing newline for consistency with npm + f.write('\n') + + return {"status": "hooks_added", "details": changes} + + except json.JSONDecodeError as e: + return {"status": "error", "message": f"Invalid JSON in {path}: {e}"} + except Exception as e: + return {"status": "error", "message": f"Error processing {path}: {e}"} diff --git a/theauditor/js_semantic_parser.py b/theauditor/js_semantic_parser.py new file mode 100644 index 0000000..218a9c3 --- /dev/null +++ b/theauditor/js_semantic_parser.py @@ -0,0 +1,1270 @@ +"""JavaScript/TypeScript semantic parser using the TypeScript Compiler API. 
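+
+Typical usage (a sketch; the paths are illustrative and assume the sandboxed
+Node.js/TypeScript toolchain has already been installed):
+
+    parser = JSSemanticParser(project_root="/path/to/project")
+    result = parser.get_semantic_ast("src/app.ts")
+    if result["success"]:
+        print(result["nodeCount"], "AST nodes,", len(result["symbols"]), "symbols")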
+ +This module replaces Tree-sitter's syntactic parsing with true semantic analysis +using the TypeScript compiler, enabling accurate type analysis, symbol resolution, +and cross-file understanding for JavaScript and TypeScript projects. +""" + +import json +import os +import platform +import re +import subprocess +import sys +import tempfile +from pathlib import Path +from typing import Dict, Optional, Any, List, Tuple + +# Import our custom temp manager to avoid WSL2/Windows issues +try: + from theauditor.utils.temp_manager import TempManager +except ImportError: + # Fallback to regular tempfile if custom manager not available + TempManager = None + +# Windows compatibility for subprocess calls +IS_WINDOWS = platform.system() == "Windows" + +# Module-level cache for resolver (it's stateless now) +_module_resolver_cache = None + + +class JSSemanticParser: + """Semantic parser for JavaScript/TypeScript using the TypeScript Compiler API.""" + + def __init__(self, project_root: str = None): + """Initialize the semantic parser. + + Args: + project_root: Absolute path to project root. If not provided, uses current directory. + """ + self.project_root = Path(project_root).resolve() if project_root else Path.cwd().resolve() + self.using_windows_node = False # Track if we're using Windows node.exe from WSL + self.tsc_path = None # Path to TypeScript compiler + self.node_modules_path = None # Path to sandbox node_modules + + # CRITICAL: Reuse cached ModuleResolver (stateless, database-driven) + global _module_resolver_cache + if _module_resolver_cache is None: + from theauditor.module_resolver import ModuleResolver + _module_resolver_cache = ModuleResolver() # No project_root needed! + print("[DEBUG] Created singleton ModuleResolver instance") + + self.module_resolver = _module_resolver_cache + + # CRITICAL FIX: Find the sandboxed node executable (like linters do) + # Platform-agnostic: Check multiple possible locations + sandbox_base = self.project_root / ".auditor_venv" / ".theauditor_tools" + node_runtime = sandbox_base / "node-runtime" + + # Check all possible node locations (Windows or Unix layout) + possible_node_paths = [ + node_runtime / "node.exe", # Windows binary in root + node_runtime / "node", # Unix binary in root + node_runtime / "bin" / "node", # Unix binary in bin/ + node_runtime / "bin" / "node.exe", # Windows binary in bin/ (unusual but possible) + ] + + self.node_exe = None + for node_path in possible_node_paths: + if node_path.exists(): + self.node_exe = node_path + # Track if we're using Windows node on WSL + self.using_windows_node = str(node_path).endswith('.exe') and str(node_path).startswith('/') + break + + # If not found, will trigger proper error messages + + self.tsc_available = self._check_tsc_availability() + self.helper_script = self._create_helper_script() + self.batch_helper_script = self._create_batch_helper_script() # NEW: Batch processing helper + + def _convert_path_for_node(self, path: Path) -> str: + """Convert path to appropriate format for node execution. + + If using Windows node.exe from WSL, converts to Windows path. + Otherwise returns the path as-is. 
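+
+        A hedged example, given a JSSemanticParser instance `parser`
+        (hypothetical paths, assuming a WSL host where the Windows node.exe
+        was detected and `wslpath` is on PATH):
+
+            win_path = parser._convert_path_for_node(Path("/mnt/c/proj/src/app.ts"))
+            # -> "C:\\proj\\src\\app.ts" via `wslpath -w`; on any other
+            # platform the original POSIX string is returned unchanged.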
+ """ + path_str = str(path) + if self.using_windows_node: + try: + import subprocess as sp + result = sp.run(['wslpath', '-w', path_str], + capture_output=True, text=True, timeout=2) + if result.returncode == 0: + return result.stdout.strip() + except: + pass # Fall back to original path + return path_str + + def _check_tsc_availability(self) -> bool: + """Check if TypeScript compiler is available in our sandbox. + + CRITICAL: We ONLY use our own sandboxed TypeScript installation. + We do not check or use any user-installed versions. + """ + # Check our sandbox location ONLY - no invasive checking of user's environment + # CRITICAL: Use absolute path from project root to avoid finding wrong sandboxes + sandbox_base = self.project_root / ".auditor_venv" / ".theauditor_tools" / "node_modules" + + # Check if sandbox exists at the absolute location + sandbox_locations = [sandbox_base] + + for sandbox_base in sandbox_locations: + if not sandbox_base.exists(): + continue + + # Check for TypeScript in sandbox + tsc_paths = [ + sandbox_base / ".bin" / "tsc", + sandbox_base / ".bin" / "tsc.cmd", # Windows + ] + + # Also check for the actual TypeScript compiler JS file + tsc_js_path = sandbox_base / "typescript" / "lib" / "tsc.js" + + # If we have node and the TypeScript compiler JS file, we can use it + if self.node_exe and tsc_js_path.exists(): + try: + # Verify it actually works by running through node + # CRITICAL: Use absolute path for NODE_PATH + absolute_sandbox = sandbox_base.resolve() + # Use temp files to avoid buffer overflow + if TempManager: + stdout_path, stderr_path = TempManager.create_temp_files_for_subprocess( + str(self.project_root), "tsc_verify" + ) + with open(stdout_path, 'w+', encoding='utf-8') as stdout_fp, \ + open(stderr_path, 'w+', encoding='utf-8') as stderr_fp: + pass # File handles created, will be used below + else: + # Fallback to regular tempfile + with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='_stdout.txt', encoding='utf-8') as stdout_fp, \ + tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='_stderr.txt', encoding='utf-8') as stderr_fp: + stdout_path = stdout_fp.name + stderr_path = stderr_fp.name + + with open(stdout_path, 'w+', encoding='utf-8') as stdout_fp, \ + open(stderr_path, 'w+', encoding='utf-8') as stderr_fp: + + # Convert paths for Windows node if needed + tsc_path_str = self._convert_path_for_node(tsc_js_path) + + # Run TypeScript through node.exe + result = subprocess.run( + [str(self.node_exe), tsc_path_str, "--version"], + stdout=stdout_fp, + stderr=stderr_fp, + text=True, + timeout=5, + env={**os.environ, "NODE_PATH": str(absolute_sandbox)}, + shell=False # Never use shell when we have full path + ) + + with open(stdout_path, 'r', encoding='utf-8') as f: + result.stdout = f.read() + with open(stderr_path, 'r', encoding='utf-8') as f: + result.stderr = f.read() + + os.unlink(stdout_path) + os.unlink(stderr_path) + if result.returncode == 0: + self.tsc_path = tsc_js_path # Store the JS file path, not the shell script + self.node_modules_path = absolute_sandbox # Store absolute path + return True + except (subprocess.SubprocessError, FileNotFoundError, OSError): + pass # TypeScript check failed + + # No sandbox TypeScript found - this is expected on first run + return False + + def _extract_vue_blocks(self, content: str) -> Tuple[Optional[str], Optional[str]]: + """Extract script and template blocks from Vue SFC content. 
+ + Args: + content: The raw Vue SFC file content + + Returns: + Tuple of (script_content, template_content) or (None, None) if not found + """ + # Extract ' + script_match = re.search(script_pattern, content, re.DOTALL | re.IGNORECASE) + script_content = script_match.group(1).strip() if script_match else None + + # Extract ' + template_match = re.search(template_pattern, content, re.DOTALL | re.IGNORECASE) + template_content = template_match.group(1).strip() if template_match else None + + return script_content, template_content + + def _create_helper_script(self) -> Path: + """Create a Node.js helper script for TypeScript AST extraction. + + Returns: + Path to the created helper script + """ + # CRITICAL: Create helper script with relative path resolution + # Always create in project root's .pf directory + pf_dir = self.project_root / ".pf" + pf_dir.mkdir(exist_ok=True) + + helper_path = pf_dir / "tsc_ast_helper.js" + + # Check if TypeScript module exists in our sandbox + typescript_exists = False + if self.node_modules_path: + # The TypeScript module is at node_modules/typescript/lib/typescript.js + ts_path = self.node_modules_path / "typescript" / "lib" / "typescript.js" + typescript_exists = ts_path.exists() + + # Write the helper script that uses TypeScript Compiler API + # CRITICAL: Use relative path from helper script location to find TypeScript + helper_content = ''' +// Use TypeScript from our sandbox location with RELATIVE PATH +// This is portable - works on any machine in any location +const path = require('path'); +const fs = require('fs'); + +// Find project root by going up from .pf directory +const projectRoot = path.resolve(__dirname, '..'); + +// Build path to TypeScript module relative to project root +const tsPath = path.join(projectRoot, '.auditor_venv', '.theauditor_tools', 'node_modules', 'typescript', 'lib', 'typescript.js'); + +// Try to load TypeScript with helpful error message +let ts; +try { + if (!fs.existsSync(tsPath)) { + throw new Error(`TypeScript not found at expected location: ${tsPath}. Run 'aud setup-claude' to install tools.`); + } + ts = require(tsPath); +} catch (error) { + console.error(JSON.stringify({ + success: false, + error: `Failed to load TypeScript: ${error.message}`, + expectedPath: tsPath, + projectRoot: projectRoot + })); + process.exit(1); +} + +// Get file path and output path from command line arguments +const filePath = process.argv[2]; +const outputPath = process.argv[3]; + +if (!filePath || !outputPath) { + console.error(JSON.stringify({ error: "File path and output path required" })); + process.exit(1); +} + +try { + // Read the source file + const sourceCode = fs.readFileSync(filePath, 'utf8'); + + // Create a source file object + const sourceFile = ts.createSourceFile( + filePath, + sourceCode, + ts.ScriptTarget.Latest, + true, // setParentNodes - important for full AST traversal + ts.ScriptKind.TSX // Support both TS and TSX + ); + + // Helper function to serialize AST nodes + function serializeNode(node, depth = 0) { + if (depth > 100) { // Prevent infinite recursion + return { kind: "TooDeep" }; + } + + const result = { + kind: node.kind !== undefined ? 
(ts.SyntaxKind[node.kind] || node.kind) : 'Unknown', + kindValue: node.kind || 0, + pos: node.pos || 0, + end: node.end || 0, + flags: node.flags || 0 + }; + + // Add text content for leaf nodes + if (node.text !== undefined) { + result.text = node.text; + } + + // Add identifier name + if (node.name) { + if (typeof node.name === 'object') { + // Handle both escapedName and regular name + if (node.name.escapedText !== undefined) { + result.name = node.name.escapedText; + } else if (node.name.text !== undefined) { + result.name = node.name.text; + } else { + result.name = serializeNode(node.name, depth + 1); + } + } else { + result.name = node.name; + } + } + + // Add type information if available + if (node.type) { + result.type = serializeNode(node.type, depth + 1); + } + + // Add children - handle nodes with members property + const children = []; + if (node.members && Array.isArray(node.members)) { + // Handle nodes with members (interfaces, enums, etc.) + node.members.forEach(member => { + if (member) children.push(serializeNode(member, depth + 1)); + }); + } + ts.forEachChild(node, child => { + if (child) children.push(serializeNode(child, depth + 1)); + }); + + if (children.length > 0) { + result.children = children; + } + + // Get line and column information + // CRITICAL FIX: Use getStart() to exclude leading trivia for accurate line numbers + const actualStart = node.getStart ? node.getStart(sourceFile) : node.pos; + const { line, character } = sourceFile.getLineAndCharacterOfPosition(actualStart); + result.line = line + 1; // Convert to 1-indexed + result.column = character; + + // RESTORED: Text extraction needed for accurate symbol names in taint analysis + result.text = sourceCode.substring(node.pos, node.end).trim(); + + return result; + } + + // Collect diagnostics (errors, warnings) + const diagnostics = []; + const program = ts.createProgram([filePath], { + target: ts.ScriptTarget.Latest, + module: ts.ModuleKind.ESNext, + jsx: ts.JsxEmit.Preserve, + allowJs: true, + checkJs: false, + noEmit: true, + skipLibCheck: true // Skip checking .d.ts files for speed + }); + + const allDiagnostics = ts.getPreEmitDiagnostics(program); + allDiagnostics.forEach(diagnostic => { + const message = ts.flattenDiagnosticMessageText(diagnostic.messageText, '\\n'); + const location = diagnostic.file && diagnostic.start + ? diagnostic.file.getLineAndCharacterOfPosition(diagnostic.start) + : null; + + diagnostics.push({ + message, + category: ts.DiagnosticCategory[diagnostic.category], + code: diagnostic.code, + line: location ? location.line + 1 : null, + column: location ? location.character : null + }); + }); + + // Collect symbols and type information + const checker = program.getTypeChecker(); + const symbols = []; + + // Visit nodes to collect symbols + function visit(node) { + try { + const symbol = checker.getSymbolAtLocation(node); + if (symbol && symbol.getName) { + const type = checker.getTypeOfSymbolAtLocation(symbol, node); + const typeString = checker.typeToString(type); + + symbols.push({ + name: symbol.getName ? symbol.getName() : 'anonymous', + kind: symbol.flags ? (ts.SymbolFlags[symbol.flags] || symbol.flags) : 0, + type: typeString || 'unknown', + line: node.pos !== undefined ? 
sourceFile.getLineAndCharacterOfPosition(node.pos).line + 1 : 0 + }); + } + } catch (e) { + // Log error for debugging + console.error(`[ERROR] Symbol extraction failed at ${filePath}:${node.pos}: ${e.message}`); + } + + ts.forEachChild(node, visit); + } + + visit(sourceFile); + + // Log symbol extraction results + console.error(`[INFO] Found ${symbols.length} symbols in ${filePath}`); + + // Output the complete AST with metadata + const result = { + success: true, + fileName: filePath, + languageVersion: ts.ScriptTarget[sourceFile.languageVersion], + ast: serializeNode(sourceFile), + diagnostics: diagnostics, + symbols: symbols, + nodeCount: 0, + hasTypes: symbols.some(s => s.type && s.type !== 'any') + }; + + // Count nodes + function countNodes(node) { + if (!node) return; + result.nodeCount++; + if (node.children && Array.isArray(node.children)) { + node.children.forEach(countNodes); + } + } + if (result.ast) countNodes(result.ast); + + // Write output to file instead of stdout to avoid pipe buffer limits + fs.writeFileSync(outputPath, JSON.stringify(result, null, 2), 'utf8'); + process.exit(0); // CRITICAL: Ensure clean exit on success + +} catch (error) { + console.error(JSON.stringify({ + success: false, + error: error.message, + stack: error.stack + })); + process.exit(1); +} +''' + + helper_path.write_text(helper_content, encoding='utf-8') + return helper_path + + def _create_batch_helper_script(self) -> Path: + """Create a Node.js helper script for batch TypeScript AST extraction. + + This script processes multiple files in a single TypeScript program, + dramatically improving performance by reusing the dependency cache. + + Returns: + Path to the created batch helper script + """ + pf_dir = self.project_root / ".pf" + pf_dir.mkdir(exist_ok=True) + + batch_helper_path = pf_dir / "tsc_batch_helper.js" + + batch_helper_content = ''' +// Batch TypeScript AST extraction - processes multiple files in one program +const path = require('path'); +const fs = require('fs'); + +// Find project root by going up from .pf directory +const projectRoot = path.resolve(__dirname, '..'); + +// Build path to TypeScript module +const tsPath = path.join(projectRoot, '.auditor_venv', '.theauditor_tools', 'node_modules', 'typescript', 'lib', 'typescript.js'); + +// Load TypeScript +let ts; +try { + if (!fs.existsSync(tsPath)) { + throw new Error(`TypeScript not found at: ${tsPath}`); + } + ts = require(tsPath); +} catch (error) { + console.error(JSON.stringify({ + success: false, + error: `Failed to load TypeScript: ${error.message}` + })); + process.exit(1); +} + +// Get request and output paths from command line +const requestPath = process.argv[2]; +const outputPath = process.argv[3]; + +if (!requestPath || !outputPath) { + console.error(JSON.stringify({ error: "Request and output paths required" })); + process.exit(1); +} + +try { + // Read batch request + const request = JSON.parse(fs.readFileSync(requestPath, 'utf8')); + const filePaths = request.files || []; + + if (filePaths.length === 0) { + fs.writeFileSync(outputPath, JSON.stringify({}), 'utf8'); + process.exit(0); + } + + // Create a SINGLE TypeScript program with ALL files + // This is the key optimization - TypeScript will parse dependencies ONCE + const program = ts.createProgram(filePaths, { + target: ts.ScriptTarget.Latest, + module: ts.ModuleKind.ESNext, + jsx: ts.JsxEmit.Preserve, + allowJs: true, + checkJs: false, + noEmit: true, + skipLibCheck: true, // Skip checking .d.ts files for speed + moduleResolution: 
ts.ModuleResolutionKind.NodeJs + }); + + const checker = program.getTypeChecker(); + const results = {}; + + // Process each file using the SHARED program + for (const filePath of filePaths) { + try { + const sourceFile = program.getSourceFile(filePath); + if (!sourceFile) { + results[filePath] = { + success: false, + error: `Could not load source file: ${filePath}` + }; + continue; + } + + const sourceCode = sourceFile.text; + + // Helper function to serialize AST nodes (same as single-file version) + function serializeNode(node, depth = 0) { + if (depth > 100) return { kind: "TooDeep" }; + + const result = { + kind: node.kind !== undefined ? (ts.SyntaxKind[node.kind] || node.kind) : 'Unknown', + kindValue: node.kind || 0, + pos: node.pos || 0, + end: node.end || 0, + flags: node.flags || 0 + }; + + if (node.text !== undefined) result.text = node.text; + + if (node.name) { + if (typeof node.name === 'object') { + if (node.name.escapedText !== undefined) { + result.name = node.name.escapedText; + } else if (node.name.text !== undefined) { + result.name = node.name.text; + } else { + result.name = serializeNode(node.name, depth + 1); + } + } else { + result.name = node.name; + } + } + + if (node.type) { + result.type = serializeNode(node.type, depth + 1); + } + + const children = []; + if (node.members && Array.isArray(node.members)) { + node.members.forEach(member => { + if (member) children.push(serializeNode(member, depth + 1)); + }); + } + ts.forEachChild(node, child => { + if (child) children.push(serializeNode(child, depth + 1)); + }); + + if (children.length > 0) { + result.children = children; + } + + // CRITICAL FIX: Use getStart() to exclude leading trivia for accurate line numbers + const actualStart = node.getStart ? node.getStart(sourceFile) : node.pos; + const { line, character } = sourceFile.getLineAndCharacterOfPosition(actualStart); + result.line = line + 1; + result.column = character; + // RESTORED: Text extraction needed for accurate symbol names in taint analysis + result.text = sourceCode.substring(node.pos, node.end).trim(); + + return result; + } + + // Collect diagnostics for this file + const diagnostics = []; + const fileDiagnostics = ts.getPreEmitDiagnostics(program, sourceFile); + fileDiagnostics.forEach(diagnostic => { + const message = ts.flattenDiagnosticMessageText(diagnostic.messageText, '\\n'); + const location = diagnostic.file && diagnostic.start + ? diagnostic.file.getLineAndCharacterOfPosition(diagnostic.start) + : null; + + diagnostics.push({ + message, + category: ts.DiagnosticCategory[diagnostic.category], + code: diagnostic.code, + line: location ? location.line + 1 : null, + column: location ? location.character : null + }); + }); + + // Collect symbols for this file + const symbols = []; + function visit(node) { + try { + const symbol = checker.getSymbolAtLocation(node); + if (symbol && symbol.getName) { + const type = checker.getTypeOfSymbolAtLocation(symbol, node); + const typeString = checker.typeToString(type); + + symbols.push({ + name: symbol.getName ? symbol.getName() : 'anonymous', + kind: symbol.flags ? (ts.SymbolFlags[symbol.flags] || symbol.flags) : 0, + type: typeString || 'unknown', + line: node.pos !== undefined ? 
sourceFile.getLineAndCharacterOfPosition(node.pos).line + 1 : 0 + }); + } + } catch (e) { + // Log error for debugging + console.error(`[ERROR] Symbol extraction failed at ${filePath}:${node.pos}: ${e.message}`); + } + ts.forEachChild(node, visit); + } + visit(sourceFile); + + // Log symbol extraction results + console.error(`[INFO] Found ${symbols.length} symbols in ${filePath}`); + + // Build result for this file + const result = { + success: true, + fileName: filePath, + languageVersion: ts.ScriptTarget[sourceFile.languageVersion], + ast: serializeNode(sourceFile), + diagnostics: diagnostics, + symbols: symbols, + nodeCount: 0, + hasTypes: symbols.some(s => s.type && s.type !== 'any') + }; + + // Count nodes + function countNodes(node) { + if (!node) return; + result.nodeCount++; + if (node.children && Array.isArray(node.children)) { + node.children.forEach(countNodes); + } + } + if (result.ast) countNodes(result.ast); + + results[filePath] = result; + + } catch (error) { + results[filePath] = { + success: false, + error: `Error processing file: ${error.message}`, + ast: null, + diagnostics: [], + symbols: [] + }; + } + } + + // Write all results to output file + fs.writeFileSync(outputPath, JSON.stringify(results, null, 2), 'utf8'); + process.exit(0); + +} catch (error) { + console.error(JSON.stringify({ + success: false, + error: error.message, + stack: error.stack + })); + process.exit(1); +} +''' + + batch_helper_path.write_text(batch_helper_content, encoding='utf-8') + return batch_helper_path + + def get_semantic_ast_batch(self, file_paths: List[str]) -> Dict[str, Dict[str, Any]]: + """Get semantic ASTs for multiple JavaScript/TypeScript files in a single process. + + This dramatically improves performance by reusing the TypeScript program + and dependency cache across multiple files. + + Args: + file_paths: List of paths to JavaScript or TypeScript files to parse + + Returns: + Dictionary mapping file paths to their AST results + """ + # Validate all files exist + results = {} + valid_files = [] + + for file_path in file_paths: + file = Path(file_path).resolve() + if not file.exists(): + results[file_path] = { + "success": False, + "error": f"File not found: {file_path}", + "ast": None, + "diagnostics": [], + "symbols": [] + } + elif file.suffix.lower() not in ['.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs', '.vue']: + results[file_path] = { + "success": False, + "error": f"Not a JavaScript/TypeScript file: {file_path}", + "ast": None, + "diagnostics": [], + "symbols": [] + } + else: + valid_files.append(str(file.resolve())) + + if not valid_files: + return results + + if not self.tsc_available: + for file_path in valid_files: + results[file_path] = { + "success": False, + "error": "TypeScript compiler not available in TheAuditor sandbox. 
Run 'aud setup-claude' to install tools.", + "ast": None, + "diagnostics": [], + "symbols": [] + } + return results + + try: + # Create batch request + batch_request = { + "files": valid_files, + "projectRoot": str(self.project_root) + } + + # Write batch request to temp file + if TempManager: + request_path, req_fd = TempManager.create_temp_file(str(self.project_root), suffix='_request.json') + os.close(req_fd) + output_path, out_fd = TempManager.create_temp_file(str(self.project_root), suffix='_output.json') + os.close(out_fd) + else: + # Fallback to regular tempfile + with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False, encoding='utf-8') as tmp_req: + request_path = tmp_req.name + with tempfile.NamedTemporaryFile(mode='w+', suffix='.json', delete=False, encoding='utf-8') as tmp_out: + output_path = tmp_out.name + + # Write batch request data + with open(request_path, 'w', encoding='utf-8') as f: + json.dump(batch_request, f) + + # Calculate timeout based on batch size + # 5 seconds base + 2 seconds per file + dynamic_timeout = min(5 + (len(valid_files) * 2), 120) + + try: + # Run batch helper script + # Convert paths for Windows node if needed + helper_path = self._convert_path_for_node(self.batch_helper_script.resolve()) + request_path_converted = self._convert_path_for_node(Path(request_path)) + output_path_converted = self._convert_path_for_node(Path(output_path)) + + # CRITICAL FIX: Use sandboxed node executable, not system "node" + if not self.node_exe: + raise RuntimeError("Node.js runtime not found. Run 'aud setup-claude' to install tools.") + + result = subprocess.run( + [str(self.node_exe), helper_path, request_path_converted, output_path_converted], + capture_output=False, + stderr=subprocess.PIPE, + text=True, + timeout=dynamic_timeout, + cwd=self.project_root, + shell=IS_WINDOWS # Windows compatibility fix + ) + + if result.returncode != 0: + error_msg = f"Batch TypeScript compiler failed (exit code {result.returncode})" + if result.stderr: + error_msg += f": {result.stderr.strip()[:500]}" + + for file_path in valid_files: + results[file_path] = { + "success": False, + "error": error_msg, + "ast": None, + "diagnostics": [], + "symbols": [] + } + else: + # Read batch results + if Path(output_path).exists(): + with open(output_path, 'r', encoding='utf-8') as f: + batch_results = json.load(f) + + # Map results back to original file paths + for file_path in file_paths: + resolved_path = str(Path(file_path).resolve()) + if resolved_path in batch_results: + results[file_path] = batch_results[resolved_path] + elif file_path not in results: + results[file_path] = { + "success": False, + "error": "File not processed in batch", + "ast": None, + "diagnostics": [], + "symbols": [] + } + else: + for file_path in valid_files: + results[file_path] = { + "success": False, + "error": "Batch output file not created", + "ast": None, + "diagnostics": [], + "symbols": [] + } + finally: + # Clean up temp files + for temp_path in [request_path, output_path]: + if Path(temp_path).exists(): + Path(temp_path).unlink() + + except subprocess.TimeoutExpired: + for file_path in valid_files: + results[file_path] = { + "success": False, + "error": f"Batch timeout: Files too large or complex to parse within {dynamic_timeout:.0f} seconds", + "ast": None, + "diagnostics": [], + "symbols": [] + } + except Exception as e: + for file_path in valid_files: + results[file_path] = { + "success": False, + "error": f"Unexpected error in batch processing: {e}", + "ast": None, + "diagnostics": [], + 
"symbols": [] + } + + return results + + def get_semantic_ast(self, file_path: str) -> Dict[str, Any]: + """Get semantic AST for a JavaScript/TypeScript file using the TypeScript compiler. + + Args: + file_path: Path to the JavaScript or TypeScript file to parse + + Returns: + Dictionary containing the semantic AST and metadata: + - success: Boolean indicating if parsing was successful + - ast: The full AST tree with semantic information + - diagnostics: List of errors/warnings from TypeScript + - symbols: List of symbols with type information + - nodeCount: Total number of AST nodes + - hasTypes: Boolean indicating if type information is available + - error: Error message if parsing failed + """ + # Validate file exists + file = Path(file_path).resolve() + if not file.exists(): + return { + "success": False, + "error": f"File not found: {file_path}", + "ast": None, + "diagnostics": [], + "symbols": [] + } + + # Check if it's a JavaScript, TypeScript, or Vue file + if file.suffix.lower() not in ['.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs', '.vue']: + return { + "success": False, + "error": f"Not a JavaScript/TypeScript file: {file_path}", + "ast": None, + "diagnostics": [], + "symbols": [] + } + + # CRITICAL: No fallbacks allowed - fail fast with clear error + if not self.tsc_available: + return { + "success": False, + "error": "TypeScript compiler not available in TheAuditor sandbox. Run 'aud setup-claude' to install tools.", + "ast": None, + "diagnostics": [], + "symbols": [] + } + + try: + # CRITICAL: No automatic installation - user must install TypeScript manually + # This enforces fail-fast philosophy + + # Handle Vue SFC files specially + actual_file_to_parse = file_path + vue_metadata = None + temp_file = None + + if file.suffix.lower() == '.vue': + # Read Vue SFC content + vue_content = file.read_text(encoding='utf-8') + script_content, template_content = self._extract_vue_blocks(vue_content) + + if script_content is None: + return { + "success": False, + "error": "No