# Auditor/theauditor/taint/sources.py

"""Taint source, sink, and sanitizer definitions.

This module contains all the constant definitions for taint analysis:
- TAINT_SOURCES: Where untrusted data originates
- SECURITY_SINKS: Where untrusted data should not flow
- SANITIZERS: Functions that clean/validate data
"""

import platform

# Platform flag: True only when platform.system() reports "Windows".
# Per the original note it exists to support character-encoding decisions;
# the code that reads it is not part of this section.
IS_WINDOWS: bool = (platform.system() == "Windows")


# Define taint sources (where untrusted data originates)
# Refined to focus on truly external/untrusted input sources
TAINT_SOURCES = {
    # JavaScript/TypeScript sources - Web request data only
    "js": [
        "req.body",
        "req.query",
        "req.params",
        "req.headers",
        "req.cookies",
        "request.body",
        "request.query",
        "request.params",
        "ctx.request.body",
        "ctx.query",
        "ctx.params",
        "document.location",
        "window.location",
        "document.URL",
        "document.referrer",
        "localStorage.getItem",
        "sessionStorage.getItem",
        "URLSearchParams",
        "postMessage",
    ],
    # Python sources - Web and CLI input only
    "python": [
        "request.args",
        "request.form",
        "request.json",
        "request.data",
        "request.values",
        "request.files",
        "request.cookies",
        "request.headers",
        "request.get_json",
        "request.get_data",
        "input",  # User console input
        "raw_input",  # Python 2 user input
        "sys.argv",  # Command line arguments
        "click.argument",  # Click CLI arguments
        "click.option",  # Click CLI options
        "argparse.parse_args",  # Argparse arguments
    ],
    # Network sources only - removed generic file operations
    "network": [
        "socket.recv",
        "socket.recvfrom",
        "websocket.receive",
        "stdin.read",  # Console input
    ],
    # Web scraping and data extraction sources
    "web_scraping": [
        # Requests library
        "requests.get",
        "requests.post",
        "requests.put",
        "requests.patch",
        "requests.delete",
        "response.text",
        "response.content",
        "response.json",
        "resp.text",
        "resp.content",
        "resp.json",

        # urllib
        "urlopen",
        "urllib.request.urlopen",
        "urllib2.urlopen",

        # BeautifulSoup HTML parsing
        "BeautifulSoup",
        "soup.find",
        "soup.find_all",
        "soup.select",
        "soup.select_one",
        "element.text",
        "element.get_text",
        "element.string",
        "tag.text",
        "tag.get_text",

        # Playwright browser automation
        "page.content",
        "page.inner_text",
        "page.inner_html",
        "page.locator",
        "page.text_content",
        "element.inner_text",
        "element.inner_html",
        "element.text_content",

        # Selenium browser automation
        "driver.page_source",
        "driver.find_element",
        "element.text",
        "element.get_attribute",
        "webdriver.page_source",

        # Scrapy framework
        "response.body",
        "response.text",
        "response.css",
        "response.xpath",
        "selector.get",
        "selector.getall",
    ],
    # File I/O and data loading sources
    "file_io": [
        # Basic file operations
        "open",
        "file.read",
        "file.readline",
        "file.readlines",

        # JSON operations
        "json.load",
        "json.loads",
        "json.JSONDecoder",

        # CSV/Excel operations
        "csv.reader",
        "csv.DictReader",
        "pd.read_csv",
        "pd.read_excel",
        "pd.read_json",
        "pd.read_html",
        "pd.read_sql",
        "pandas.read_csv",
        "pandas.read_excel",

        # YAML operations
        "yaml.load",
        "yaml.safe_load",
        "yaml.full_load",

        # XML operations
        "etree.parse",
        "etree.fromstring",
        "xml.parse",
        "ElementTree.parse",

        # Environment variables
        "os.getenv",
        "os.environ.get",
        "environ.get",
    ]
    # Database category REMOVED - internal database data is trusted, not a taint source
}

# Sanitizers: function names treated as cleaning or validating data, keyed by
# the vulnerability type they apply to. Entry order within each list is
# preserved from the original definition.
SANITIZERS = {
    # SQL - escaping and query-parameterization helpers
    "sql": [
        # string escaping
        "escape_string", "mysql_real_escape_string",
        "mysqli_real_escape_string", "pg_escape_string",
        "sqlite3.escape_string",
        # parameterized / prepared statements
        "sqlalchemy.text", "db.prepare", "parameterize",
        "prepared_statement", "bind_param", "execute_prepared",
        # psycopg2 SQL composition objects
        "psycopg2.sql.SQL", "psycopg2.sql.Identifier", "psycopg2.sql.Literal",
    ],
    # XSS - HTML / JavaScript / URL escaping functions
    "xss": [
        "escape_html", "html.escape", "cgi.escape", "markupsafe.escape",
        "DOMPurify.sanitize", "bleach.clean", "strip_tags", "sanitize_html",
        "escape_javascript",
        "json.dumps",  # When used for JSON encoding
        "JSON.stringify", "encodeURIComponent", "encodeURI",
        "_.escape",  # Lodash escape
        "escapeHtml", "htmlspecialchars", "htmlentities",
    ],
    # Path traversal - filename reduction and path normalization
    "path": [
        "os.path.basename", "Path.basename",
        "secure_filename", "sanitize_filename", "normalize_path",
        "realpath", "abspath", "path.resolve", "path.normalize",
        "werkzeug.utils.secure_filename",
    ],
    # Command injection - shell quoting / escaping
    "command": [
        "shlex.quote", "pipes.quote",
        "escapeshellarg", "escapeshellcmd",
        "shell_escape", "quote", "escape_shell",
    ],
    # General-purpose validation function names
    "validation": [
        "validate", "validator", "is_valid", "check_input",
        "sanitize", "clean", "filter_var", "assert_valid", "verify",
    ],
}

# Define security sinks (functions where external data flows are tracked)
# Categories are for organizational purposes only - Truth Couriers don't classify vulnerabilities
#
# Each value is a list of pattern strings that is unique within its category,
# so consumers iterating a list never evaluate the same sink twice. A pattern
# may still appear under more than one category (e.g. "collection.find" is in
# both "sql" and "nosql").
SECURITY_SINKS = {
    # SQL-related sinks (factual: functions that interact with databases)
    "sql": [
        "db.query",
        "db.execute",
        "db.exec",
        "db.raw",
        "cursor.execute",
        "connection.execute",
        "query",
        "execute",
        "executemany",
        "rawQuery",
        "knex.raw",
        "sequelize.query",
        "mongoose.find",
        "collection.find",
        # Async Python ORMs
        "asyncpg.execute",
        "asyncpg.executemany",
        "asyncpg.fetch",
        "asyncpg.fetchrow",
        "asyncpg.fetchval",
        "tortoise.execute_query",
        "tortoise.execute_sql",
        "databases.execute",
        "databases.fetch_all",
        "databases.fetch_one",
        # Modern JS ORMs ("knex.raw" is already listed above)
        "prisma.$queryRaw",
        "prisma.$executeRaw",
        "prisma.$queryRawUnsafe",
        "prisma.$executeRawUnsafe",
        "typeorm.query",
        "typeorm.createQueryBuilder",
        "objection.raw",
    ],
    # Command execution sinks (factual: functions that execute system commands)
    "command": [
        "os.system",
        "os.popen",
        "subprocess.run",
        "subprocess.call",
        "subprocess.Popen",
        "subprocess.check_call",
        "subprocess.check_output",
        "exec",
        "eval",
        "child_process.exec",
        "child_process.spawn",
        "child_process.execFile",
        "shell.exec",
    ],
    # HTML/Response output sinks (factual: functions that output to HTML/HTTP responses)
    "xss": [
        "innerHTML",
        "outerHTML",
        "document.write",
        "document.writeln",
        "dangerouslySetInnerHTML",
        "insertAdjacentHTML",
        "response.write",
        "res.send",
        "res.render",
        "res.json",
    ],
    # File system operation sinks (factual: functions that interact with file system)
    "path": [
        "fs.readFile",
        "fs.readFileSync",
        "fs.writeFile",
        "fs.writeFileSync",
        "fs.createReadStream",
        "fs.createWriteStream",
        "open",
        "file.open",
        "Path.join",
        "path.join",
        "os.path.join",
    ],
    # LDAP injection sinks
    "ldap": [
        "ldap.search",
        "ldap.bind",
        "ldap.modify",
        "ldap.add",
        "ldap.delete",
    ],
    # NoSQL injection sinks
    "nosql": [
        "$where",
        "$regex",
        "collection.find",
        "collection.findOne",
        "collection.update",
        "collection.remove",
        "collection.aggregate",
    ],
}