Merge 'perf/throughput: Improve reproducibility' from Pekka Enberg

Improve reproducibility by documenting the steps needed to run the
benchmarks and generate the plots. Also simplify plot generation a bit.

Closes #3843
Author: Pekka Enberg
Date: 2025-10-29 18:10:34 +02:00
Committed by: GitHub
3 changed files with 41 additions and 18 deletions

perf/throughput/README.md (new file, 25 lines)

@@ -0,0 +1,25 @@
# Turso throughput benchmark
This directory contains the Turso throughput benchmark.
First, run the benchmarks:
```console
cd rusqlite
./scripts/bench.sh > ../plot/sqlite.csv
cd ../turso
./scripts/bench.sh > ../plot/turso.csv
```
Then, generate the plots:
```console
cd plot
uv run plot-thread-scaling.py turso.csv sqlite.csv
uv run plot-compute-impact.py turso.csv sqlite.csv
```
This will generate:
- `thread-scaling.pdf`: Write throughput vs. number of threads (scalability test)
- `compute-impact.pdf`: How CPU-bound work affects write throughput
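
As the thread-scaling diff further down shows, the plot scripts filter on `threads` and `compute` columns, so both must be present in the generated CSVs. A minimal sanity check, assuming only those two column names; the script name `check-csv.py` and any other columns are illustrative, not part of this commit:

```python
import sys
import pandas as pd

if len(sys.argv) != 2:
    sys.exit("Usage: python check-csv.py <csv_filename>")

df = pd.read_csv(sys.argv[1])

# The plot scripts in this commit filter on these two columns.
missing = {"threads", "compute"} - set(df.columns)
if missing:
    sys.exit(f"{sys.argv[1]} is missing expected columns: {sorted(missing)}")

# Show how many rows exist per (threads, compute) combination.
print(df.groupby(["threads", "compute"]).size())
```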

perf/throughput/plot/plot-compute-impact.py

```diff
@@ -1,4 +1,3 @@
-import os
 import sys
 import matplotlib.pyplot as plt
@@ -12,19 +11,19 @@ plt.rcParams.update({
     "font.serif": ["Times"],
 })
-# Get CSV filename from command line argument
+# Get CSV filenames from command line arguments
 if len(sys.argv) < 2:
-    print("Usage: python script.py <csv_filename>")
+    print("Usage: python script.py <csv_filename> [<csv_filename> ...]")
     sys.exit(1)
-csv_filename = sys.argv[1]
+csv_filenames = sys.argv[1:]
-# Get basename without extension for output filename
-basename = os.path.splitext(csv_filename)[0]
-output_filename = f"{basename}-compute.png"
+# Output filename
+output_filename = "compute-impact.pdf"
-# Read data from CSV file
-df = pd.read_csv(csv_filename)
+# Read data from all CSV files and concatenate
+dfs = [pd.read_csv(filename) for filename in csv_filenames]
+df = pd.concat(dfs, ignore_index=True)
 # Create figure and axis
 fig, ax = plt.subplots(figsize=(10, 6))
```
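
The change to both scripts follows the same pattern: accept one or more CSV paths on the command line, read each with pandas, and concatenate them into a single DataFrame before plotting. A standalone sketch of that pattern, with an illustrative script name:

```python
import sys
import pandas as pd

if len(sys.argv) < 2:
    print("Usage: python combine.py <csv_filename> [<csv_filename> ...]")
    sys.exit(1)

# Read every CSV named on the command line and stack the rows into one frame;
# ignore_index=True renumbers the rows so the combined frame has a clean index.
frames = [pd.read_csv(path) for path in sys.argv[1:]]
combined = pd.concat(frames, ignore_index=True)
print(combined.head())
```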

perf/throughput/plot/plot-thread-scaling.py

```diff
@@ -1,4 +1,3 @@
-import os
 import sys
 import matplotlib.pyplot as plt
@@ -8,19 +7,19 @@ import scienceplots # noqa: F401
 plt.style.use(["science"])
-# Get CSV filename from command line argument
+# Get CSV filenames from command line arguments
 if len(sys.argv) < 2:
-    print("Usage: python script.py <csv_filename>")
+    print("Usage: python script.py <csv_filename> [<csv_filename> ...]")
     sys.exit(1)
-csv_filename = sys.argv[1]
+csv_filenames = sys.argv[1:]
-# Get basename without extension for output filename
-basename = os.path.splitext(csv_filename)[0]
-output_filename = f"{basename}-write.png"
+# Output filename
+output_filename = "thread-scaling.pdf"
-# Read data from CSV file
-df = pd.read_csv(csv_filename)
+# Read data from all CSV files and concatenate
+dfs = [pd.read_csv(filename) for filename in csv_filenames]
+df = pd.concat(dfs, ignore_index=True)
 # Filter for compute time = 0
 df_filtered = df[df["compute"] == 0].sort_values("threads")
```
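
The last lines of this diff show the step the thread-scaling plot relies on: keep only rows where no extra compute work was added, ordered by thread count. A small sketch of that step in isolation, using an illustrative CSV path:

```python
import pandas as pd

# Illustrative path; any benchmark CSV produced by bench.sh should work here.
df = pd.read_csv("turso.csv")

# Thread-scaling view: drop runs that added artificial compute work,
# then order the remaining rows by thread count.
scaling = df[df["compute"] == 0].sort_values("threads")
print(scaling.head())
```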