"""Taint source, sink, and sanitizer definitions. This module contains all the constant definitions for taint analysis: - TAINT_SOURCES: Where untrusted data originates - SECURITY_SINKS: Where untrusted data should not flow - SANITIZERS: Functions that clean/validate data """ import platform # Detect if running on Windows for character encoding IS_WINDOWS = platform.system() == "Windows" # Define taint sources (where untrusted data originates) # Refined to focus on truly external/untrusted input sources TAINT_SOURCES = { # JavaScript/TypeScript sources - Web request data only "js": [ "req.body", "req.query", "req.params", "req.headers", "req.cookies", "request.body", "request.query", "request.params", "ctx.request.body", "ctx.query", "ctx.params", "document.location", "window.location", "document.URL", "document.referrer", "localStorage.getItem", "sessionStorage.getItem", "URLSearchParams", "postMessage", ], # Python sources - Web and CLI input only "python": [ "request.args", "request.form", "request.json", "request.data", "request.values", "request.files", "request.cookies", "request.headers", "request.get_json", "request.get_data", "input", # User console input "raw_input", # Python 2 user input "sys.argv", # Command line arguments "click.argument", # Click CLI arguments "click.option", # Click CLI options "argparse.parse_args", # Argparse arguments ], # Network sources only - removed generic file operations "network": [ "socket.recv", "socket.recvfrom", "websocket.receive", "stdin.read", # Console input ], # Web scraping and data extraction sources "web_scraping": [ # Requests library "requests.get", "requests.post", "requests.put", "requests.patch", "requests.delete", "response.text", "response.content", "response.json", "resp.text", "resp.content", "resp.json", # urllib "urlopen", "urllib.request.urlopen", "urllib2.urlopen", # BeautifulSoup HTML parsing "BeautifulSoup", "soup.find", "soup.find_all", "soup.select", "soup.select_one", "element.text", "element.get_text", "element.string", "tag.text", "tag.get_text", # Playwright browser automation "page.content", "page.inner_text", "page.inner_html", "page.locator", "page.text_content", "element.inner_text", "element.inner_html", "element.text_content", # Selenium browser automation "driver.page_source", "driver.find_element", "element.text", "element.get_attribute", "webdriver.page_source", # Scrapy framework "response.body", "response.text", "response.css", "response.xpath", "selector.get", "selector.getall", ], # File I/O and data loading sources "file_io": [ # Basic file operations "open", "file.read", "file.readline", "file.readlines", # JSON operations "json.load", "json.loads", "json.JSONDecoder", # CSV/Excel operations "csv.reader", "csv.DictReader", "pd.read_csv", "pd.read_excel", "pd.read_json", "pd.read_html", "pd.read_sql", "pandas.read_csv", "pandas.read_excel", # YAML operations "yaml.load", "yaml.safe_load", "yaml.full_load", # XML operations "etree.parse", "etree.fromstring", "xml.parse", "ElementTree.parse", # Environment variables "os.getenv", "os.environ.get", "environ.get", ] # Database category REMOVED - internal database data is trusted, not a taint source } # Define sanitizers that clean/validate data for different vulnerability types SANITIZERS = { # SQL sanitizers - Functions that properly escape or parameterize queries "sql": [ "escape_string", "mysql_real_escape_string", "mysqli_real_escape_string", "pg_escape_string", "sqlite3.escape_string", "sqlalchemy.text", "db.prepare", "parameterize", "prepared_statement", "bind_param", "execute_prepared", "psycopg2.sql.SQL", "psycopg2.sql.Identifier", "psycopg2.sql.Literal", ], # XSS sanitizers - HTML escaping functions "xss": [ "escape_html", "html.escape", "cgi.escape", "markupsafe.escape", "DOMPurify.sanitize", "bleach.clean", "strip_tags", "sanitize_html", "escape_javascript", "json.dumps", # When used for JSON encoding "JSON.stringify", "encodeURIComponent", "encodeURI", "_.escape", # Lodash escape "escapeHtml", "htmlspecialchars", "htmlentities", ], # Path traversal sanitizers "path": [ "os.path.basename", "Path.basename", "secure_filename", "sanitize_filename", "normalize_path", "realpath", "abspath", "path.resolve", "path.normalize", "werkzeug.utils.secure_filename", ], # Command injection sanitizers "command": [ "shlex.quote", "pipes.quote", "escapeshellarg", "escapeshellcmd", "shell_escape", "quote", "escape_shell", ], # General validation functions "validation": [ "validate", "validator", "is_valid", "check_input", "sanitize", "clean", "filter_var", "assert_valid", "verify", ] } # Define security sinks (functions where external data flows are tracked) # Categories are for organizational purposes only - Truth Couriers don't classify vulnerabilities SECURITY_SINKS = { # SQL-related sinks (factual: functions that interact with databases) "sql": [ "db.query", "db.execute", "db.exec", "db.raw", "cursor.execute", "connection.execute", "query", "execute", "executemany", "rawQuery", "knex.raw", "sequelize.query", "mongoose.find", "collection.find", # Async Python ORMs "asyncpg.execute", "asyncpg.executemany", "asyncpg.fetch", "asyncpg.fetchrow", "asyncpg.fetchval", "tortoise.execute_query", "tortoise.execute_sql", "databases.execute", "databases.fetch_all", "databases.fetch_one", # Modern JS ORMs "prisma.$queryRaw", "prisma.$executeRaw", "prisma.$queryRawUnsafe", "prisma.$executeRawUnsafe", "typeorm.query", "typeorm.createQueryBuilder", "objection.raw", "knex.raw", ], # Command execution sinks (factual: functions that execute system commands) "command": [ "os.system", "os.popen", "subprocess.run", "subprocess.call", "subprocess.Popen", "subprocess.check_call", "subprocess.check_output", "exec", "eval", "child_process.exec", "child_process.spawn", "child_process.execFile", "shell.exec", ], # HTML/Response output sinks (factual: functions that output to HTML/HTTP responses) "xss": [ "innerHTML", "outerHTML", "document.write", "document.writeln", "dangerouslySetInnerHTML", "insertAdjacentHTML", "response.write", "res.send", "res.render", "res.json", ], # File system operation sinks (factual: functions that interact with file system) "path": [ "fs.readFile", "fs.readFileSync", "fs.writeFile", "fs.writeFileSync", "fs.createReadStream", "fs.createWriteStream", "open", "file.open", "Path.join", "path.join", "os.path.join", ], # LDAP injection sinks "ldap": [ "ldap.search", "ldap.bind", "ldap.modify", "ldap.add", "ldap.delete", ], # NoSQL injection sinks "nosql": [ "$where", "$regex", "collection.find", "collection.findOne", "collection.update", "collection.remove", "collection.aggregate", ] }