diff --git a/.gitignore b/.gitignore index b851e8025..294d5a6dc 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,4 @@ profile.json.gz simulator-output/ &1 +bisected.sql \ No newline at end of file diff --git a/scripts/clean_interactions.sh b/scripts/clean_interactions.sh new file mode 100755 index 000000000..360ee31e8 --- /dev/null +++ b/scripts/clean_interactions.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Clean lines from simulator output by: +# 1) Removing everything up to and including "interaction=" +# 2) Replacing everything from "}:" to the end with a single semicolon +# 3) Only retaining lines containing CREATE/INSERT/UPDATE/DELETE/DROP (the rest are usually meaningless for debugging) +# +# The purpose of this is to transform the interaction plan into a list of executable SQL statements +# in cases where: +# 1. Shrinking the plan failed +# 2. We know the point at which the simulator failure occurred. +# +# I use this script like this in the simulator directory: +# cargo run &> raw_output.txt +# manually edit out the shrinking parts and the WarGames intro graphics etc and save the file +# then run: +# ./clean_interactions.sh raw_output.txt > interactions.sql +# +# Usage: +# clean_interactions.sh INPUT [OUTPUT] +# +# If OUTPUT is omitted, the result is written to stdout. 
+ +if [[ $# -lt 1 || $# -gt 2 ]]; then + echo "Usage: $0 INPUT [OUTPUT]" >&2 + exit 1 +fi + +input_path="$1" +output_path="${2:-}" + +if [[ -z "${output_path}" ]]; then + awk '{ line=$0; sub(/^[^\n]*interaction=/, "", line); sub(/}:.*/, ";", line); print line }' "${input_path}" | grep -E 'CREATE|INSERT|UPDATE|DELETE|DROP' +else + awk '{ line=$0; sub(/^[^\n]*interaction=/, "", line); sub(/}:.*/, ";", line); print line }' "${input_path}" | grep -E 'CREATE|INSERT|UPDATE|DELETE|DROP' > "${output_path}" +fi + + diff --git a/scripts/corruption_bisecter.py b/scripts/corruption_bisecter.py new file mode 100644 index 000000000..38f512a67 --- /dev/null +++ b/scripts/corruption_bisecter.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +# Usage e.g.: uv run scripts/corruption_bisecter.py -i corruption.sql -o bisected.sql +# To clean up input data for this script, consider using `scripts/clean_interactions.sh` +import argparse +import shutil +import subprocess +import sys +import tempfile +from pathlib import Path +from typing import Callable, List, Literal, Sequence + +def read_statements(input_path: Path) -> List[str]: + with input_path.open("r", encoding="utf-8", errors="replace") as f: + lines = [line.rstrip("\n") for line in f] + return [line for line in lines if line.strip()] + + +# Run a set of SQL statements using tursodb and then run integrity_check on the given db file using sqlite3. +# Return whether the integrity check passed or failed. 
def run_sql_and_do_integrity_check(
    workspace_root: Path,
    db_path: Path,
    statements: Sequence[str],
) -> bool:
    """Apply SQL statements with tursodb, then integrity-check the file with sqlite3.

    The statements (if any) are piped on stdin to
    ``cargo run --quiet -- <db_path>``; afterwards
    ``sqlite3 <db_path> "pragma integrity_check;"`` is run on the same file.
    Both commands are executed with ``workspace_root`` as working directory.

    Args:
        workspace_root: Working directory for both subprocesses.
        db_path: Database file to modify and then check.
        statements: SQL statements, one per element. When empty, the
            ``cargo run`` step is skipped and only the check is performed.

    Returns:
        True when sqlite3 printed exactly ``ok`` (compared case-insensitively
        after stripping whitespace), False otherwise.

    Raises:
        RuntimeError: If ``cargo run`` or ``sqlite3`` exits with a non-zero
            status. The message carries the captured stderr, when there is any.
    """
    if statements:
        sql_input = "\n".join(statements) + "\n"
        run_proc = subprocess.run(
            ["cargo", "run", "--quiet", "--", str(db_path)],
            input=sql_input,
            text=True,
            capture_output=True,
            cwd=str(workspace_root),
            check=False,
        )
        if run_proc.returncode != 0:
            # Keep the captured stderr in the message; without it the reason
            # for the failure is thrown away together with the pipe.
            run_stderr = (run_proc.stderr or "").strip()
            raise RuntimeError(
                f"cargo run failed (code {run_proc.returncode}) for candidate "
                f"with {len(statements)} statements"
                + (f": {run_stderr}" if run_stderr else "")
            )

    sqlite_proc = subprocess.run(
        ["sqlite3", str(db_path), "pragma integrity_check;"],
        text=True,
        capture_output=True,
        cwd=str(workspace_root),
        check=False,
    )

    output = (sqlite_proc.stdout or "").strip()
    if sqlite_proc.returncode != 0:
        sqlite_stderr = (sqlite_proc.stderr or "").strip()
        raise RuntimeError(
            f"sqlite3 returned code {sqlite_proc.returncode} with output: {output}"
            + (f" (stderr: {sqlite_stderr})" if sqlite_stderr else "")
        )

    return output.lower() == "ok"


def find_min_failing_prefix(
    workspace_root: Path,
    statements: Sequence[str],
    check_integrity: Callable[
        [Path, Path, Sequence[str]], bool
    ] = run_sql_and_do_integrity_check,
) -> List[str]:
    """Binary-search the shortest prefix of ``statements`` that fails the check.

    Only prefixes are considered. A prefix P that fails while a longer prefix
    P' passes again is not handled specially; the search simply returns some
    failing prefix that the bisection converges on.

    For performance the database of the longest passing prefix seen so far is
    kept as a snapshot, and later (longer) candidates only apply the
    statements that come after that snapshot.

    Args:
        workspace_root: Passed through unchanged to ``check_integrity``.
        statements: Candidate statements, in execution order.
        check_integrity: Callable ``(workspace_root, db_path, statements)``
            that applies ``statements`` to the database at ``db_path`` and
            returns True when the database is still healthy. Defaults to
            ``run_sql_and_do_integrity_check``.

    Returns:
        The failing prefix found, as a new list.

    Raises:
        RuntimeError: If no tested prefix fails (including when
            ``statements`` is empty).
    """
    with tempfile.TemporaryDirectory(prefix="limbo-bisect-") as tmpdir:
        tmpdir_path = Path(tmpdir)
        db_pass = tmpdir_path / "pass.db"
        db_work = tmpdir_path / "work.db"
        # A database is the main file plus its optional "-wal"/"-shm" siblings.
        db_suffixes = ("", "-wal", "-shm")

        def delete_db(base: Path) -> None:
            for suffix in db_suffixes:
                Path(str(base) + suffix).unlink(missing_ok=True)

        def copy_db(src: Path, dst: Path) -> None:
            # Clear the destination first so stale sibling files never survive.
            delete_db(dst)
            for suffix in db_suffixes:
                src_file = Path(str(src) + suffix)
                if src_file.exists():
                    shutil.copy2(src_file, Path(str(dst) + suffix))

        last_pass_len = 0

        def check_prefix(k: int) -> bool:
            """Return True when the prefix of length ``k`` FAILS the check."""
            nonlocal last_pass_len
            if last_pass_len > 0 and k > last_pass_len and db_pass.exists():
                # Resume from the passing snapshot and apply only the delta.
                copy_db(db_pass, db_work)
                pending = statements[last_pass_len:k]
            else:
                # No usable snapshot: replay the whole prefix on a fresh DB.
                delete_db(db_work)
                pending = statements[:k]
            integrity_check_ok = check_integrity(workspace_root, db_work, pending)

            sys.stderr.write(f"Test prefix {k} -> {integrity_check_ok}\n")
            if integrity_check_ok:
                copy_db(db_work, db_pass)
                last_pass_len = k
            return not integrity_check_ok

        # Binary search for the minimal k such that the prefix of length k FAILS.
        low = 1
        high = len(statements)
        answer_k = None
        while low <= high:
            mid = (low + high) // 2
            if check_prefix(mid):
                answer_k = mid
                high = mid - 1
            else:
                low = mid + 1

        if answer_k is None:
            raise RuntimeError("Could not find a failing prefix despite full set failing.")

        return list(statements[:answer_k])


def main(argv: List[str]) -> int:
    """Command-line entry point: minimize a failing SQL script to a failing prefix.

    Reads one statement per line from ``--input``, confirms that the complete
    input fails the integrity check on a fresh database, bisects it with
    ``find_min_failing_prefix`` and writes the resulting prefix to ``--output``.

    Args:
        argv: Command-line arguments without the program name.

    Returns:
        0 on success.

    Raises:
        RuntimeError: If the input file is missing, contains no statements, or
            the full input does not fail the integrity check.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Find a minimal subset of SQL statements that still FAILS pragma integrity_check."
        )
    )
    parser.add_argument(
        "-i",
        "--input",
        type=Path,
        help="Path to input SQL file (one statement per line)",
        required=True,
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=Path("bisected.sql"),
        help="Path to write the minimized failing prefix (default: bisected.sql)",
    )

    args = parser.parse_args(argv)

    # Assume the script is placed under /scripts/ and use repo root as workspace
    workspace_root = Path(__file__).resolve().parent.parent

    if not args.input.exists():
        raise RuntimeError(f"Input file not found: {args.input}")

    statements = read_statements(args.input)
    if not statements:
        raise RuntimeError("Input file has no statements after filtering empty lines.")

    # Confirm the full input fails integrity check on a fresh DB
    with tempfile.TemporaryDirectory(prefix="limbo-bisect-precheck-") as pretmp:
        pre_db = Path(pretmp) / "check.db"
        if run_sql_and_do_integrity_check(workspace_root, pre_db, statements):
            raise RuntimeError("Full input did not FAIL integrity check")

    result_lines = find_min_failing_prefix(workspace_root, statements)
    summary = (
        f"Reduced failing subset to {len(result_lines)} of {len(statements)} statements.\n"
    )

    args.output.parent.mkdir(parents=True, exist_ok=True)
    with args.output.open("w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in result_lines)
    sys.stderr.write(f"Wrote minimized failing prefix to {args.output}\n")

    sys.stderr.write(summary)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))