
Commit b1840bc

Add script for benchmarking simulators with different parameters (#621)
Add scripts and plotting notebook for benchmarking (#454)
1 parent 1f3dc2e commit b1840bc

File tree

8 files changed: +316 -11 lines

.pre-commit-config.yaml

Lines changed: 5 additions & 5 deletions

```diff
@@ -4,12 +4,12 @@ repos:
     hooks:
       - id: black
         language_version: python3
-        exclude: "^autoemulate/experimental/|^tests/experimental/"
+        exclude: "^autoemulate/experimental/|^tests/experimental/|^benchmarks/"
   - repo: https://github.com/asottile/reorder-python-imports
     rev: v3.12.0
     hooks:
       - id: reorder-python-imports
-        exclude: "^autoemulate/experimental/|^tests/experimental/"
+        exclude: "^autoemulate/experimental/|^tests/experimental/|^benchmarks/"
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
     rev: v0.11.4
@@ -18,13 +18,13 @@ repos:
       - id: ruff
         types_or: [ python, pyi ]
         args: [ --fix ]
-        files: ^autoemulate/experimental/|^tests/experimental/
+        files: ^autoemulate/experimental/|^tests/experimental/|^benchmarks/
       # Run the formatter.
       - id: ruff-format
         types_or: [ python, pyi ]
-        files: ^autoemulate/experimental/|^tests/experimental/
+        files: ^autoemulate/experimental/|^tests/experimental/|^benchmarks/
   - repo: https://github.com/RobertCraigie/pyright-python
     rev: v1.1.398
     hooks:
       - id: pyright
-        files: ^autoemulate/experimental/|^tests/experimental/
+        files: ^autoemulate/experimental/|^tests/experimental/|^benchmarks/
```
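Note the asymmetry: `^benchmarks/` is added to the `exclude` patterns for black and reorder-python-imports, but to the `files` patterns for ruff, ruff-format, and pyright, so the new scripts are formatted, linted, and type-checked by the ruff/pyright toolchain only. To exercise the updated hooks locally (assuming pre-commit is installed in the active environment):

```bash
pre-commit run --files benchmarks/benchmark.py
```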
autoemulate/experimental/simulations/__init__.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -1,8 +1,9 @@
 from .epidemic import Epidemic
+from .flow_problem import FlowProblem
 from .projectile import Projectile, ProjectileMultioutput
 
-ALL_SIMULATORS = [Epidemic, Projectile, ProjectileMultioutput]
+ALL_SIMULATORS = [Epidemic, FlowProblem, Projectile, ProjectileMultioutput]
 
-__all__ = ["Epidemic", "Projectile", "ProjectileMultioutput"]
+__all__ = ["Epidemic", "FlowProblem", "Projectile", "ProjectileMultioutput"]
 
 SIMULATOR_REGISTRY = dict(zip(__all__, ALL_SIMULATORS, strict=False))
```
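For context, adding `FlowProblem` to `__all__` and `ALL_SIMULATORS` makes it resolvable by name through `SIMULATOR_REGISTRY`, which is how `benchmarks/benchmark.py` instantiates simulators. A minimal sketch of the lookup:

```python
from autoemulate.experimental.simulations import SIMULATOR_REGISTRY

# Resolve the simulator class by its __all__ name and build it with defaults
simulator = SIMULATOR_REGISTRY["FlowProblem"]()
```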

autoemulate/experimental/simulations/flow_problem.py

Lines changed: 25 additions & 2 deletions

```diff
@@ -19,8 +19,8 @@ class FlowProblem(Simulator):
 
     def __init__(
         self,
-        parameters_range: dict[str, tuple[float, float]],
-        output_names: list[str],
+        parameters_range: dict[str, tuple[float, float]] | None = None,
+        output_names: list[str] | None = None,
         log_level: str = "progress_bar",
         ncycles: int = 10,
         ncomp: int = 10,
@@ -47,6 +47,29 @@ def __init__(
         ncomp: int
             Number of compartments in the tube.
         """
+        if parameters_range is None:
+            parameters_range = {
+                # Cardiac cycle period (s)
+                "T": (0.5, 2.0),
+                # Pulse duration (s)
+                "td": (0.1, 0.5),
+                # Amplitude (e.g., pressure or flow rate)
+                "amp": (100.0, 1000.0),
+                # Time step (s)
+                "dt": (0.0001, 0.01),
+                # Compliance (unit varies based on context)
+                "C": (20.0, 60.0),
+                # Resistance (unit varies based on context)
+                "R": (0.01, 0.1),
+                # Inductance (unit varies based on context)
+                "L": (0.001, 0.005),
+                # Outflow resistance (unit varies based on context)
+                "R_o": (0.01, 0.05),
+                # Initial pressure (unit varies based on context)
+                "p_o": (5.0, 15.0),
+            }
+        if output_names is None:
+            output_names = ["pressure"]
         super().__init__(parameters_range, output_names, log_level)
         self.ncycles = ncycles
         self.ncomp = ncomp
```
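Making both arguments optional is what allows the no-argument construction used by the benchmark registry. A usage sketch, assuming the `sample_inputs`/`forward_batch` API as called in `benchmarks/benchmark.py`:

```python
import torch

from autoemulate.experimental.simulations import FlowProblem

sim = FlowProblem()  # falls back to the default parameter ranges above
x = sim.sample_inputs(10, random_seed=42).to(torch.float32)  # 10 parameter sets
y = sim.forward_batch(x).to(torch.float32)  # run the simulator on each set
```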

benchmarks/README.md

Lines changed: 14 additions & 0 deletions (new file)

````markdown
# Benchmarks

- [benchmark.py](./benchmark.py): a script with a CLI for running batches of simulations with AutoEmulate for different numbers of tuning iterations
- [run_benchmark.sh](./run_benchmark.sh): runs batches of simulations with some parallelisation
- [plot_benchmark.ipynb](./plot_benchmark.ipynb): notebook for plotting results

## Quickstart
- Install [pueue](https://github.com/Nukesor/pueue): it is used by [run_benchmark.sh](./run_benchmark.sh) and simplifies running multiple Python scripts
- Run:
```bash
./run_benchmark.sh
```
````
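A minimal sequence for the quickstart, assuming a default pueue installation (commands as documented in the pueue README; the daemon must be running before jobs can be enqueued):

```bash
pueued -d          # start the pueue daemon in the background
./run_benchmark.sh # enqueue one benchmark job per parameter combination
pueue status       # inspect queued/running/finished jobs
pueue log          # view the output of finished jobs
```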

benchmarks/benchmark.py

Lines changed: 118 additions & 0 deletions (new file)

```python
import itertools
from typing import cast

import click
import numpy as np
import pandas as pd
import torch
from autoemulate.experimental.compare import AutoEmulate
from autoemulate.experimental.emulators import ALL_EMULATORS
from autoemulate.experimental.emulators.base import Emulator
from autoemulate.experimental.simulations import SIMULATOR_REGISTRY
from autoemulate.experimental.simulations.base import Simulator
from tqdm import tqdm


def run_benchmark(
    x: torch.Tensor, y: torch.Tensor, n_iter: int, n_splits: int, log_level: str
) -> pd.DataFrame:
    ae = AutoEmulate(
        x,
        y,
        models=cast(list[type[Emulator] | str], ALL_EMULATORS),
        n_iter=n_iter,
        n_splits=n_splits,
        log_level=log_level,
    )
    return ae.summarise()


@click.command()
@click.option(
    "--simulators",
    type=str,
    multiple=True,
    default=["ProjectileMultioutput"],
    help="Simulators to benchmark",
)
@click.option(
    "--n_samples_list",
    type=int,
    multiple=True,
    default=[20, 50, 100, 200, 500],
    help="Number of samples to generate",
)
@click.option(
    "--n_iter_list",
    type=int,
    multiple=True,
    default=[10, 50, 100],
    help="Number of iterations to run",
)
@click.option(
    "--n_splits_list",
    type=int,
    multiple=True,
    default=[2, 5],
    help="Number of splits for cross-validation",
)
@click.option(
    "--seed",
    type=int,
    default=42,
    help="Seed for the permutations over params",
)
@click.option(
    "--output_file",
    type=str,
    default="benchmark_results.csv",
    help="File name for output",
)
@click.option("--log_level", default="progress_bar", help="Logging level")
def main(  # noqa: PLR0913
    simulators, n_samples_list, n_iter_list, n_splits_list, seed, output_file, log_level
):
    print(f"Running benchmark with simulators: {simulators}")
    print(f"Number of samples: {n_samples_list}")
    print(f"Number of iterations: {n_iter_list}")
    print(f"Number of splits: {n_splits_list}")
    print(f"Seed: {seed}")
    print(f"Output file: {output_file}")
    print(f"Log level: {log_level}")
    print("-" * 50)

    dfs = []
    for simulator_str in simulators:
        # Generate samples
        simulator: Simulator = SIMULATOR_REGISTRY[simulator_str]()
        max_samples = max(n_samples_list)
        x_all = simulator.sample_inputs(max_samples, random_seed=seed).to(torch.float32)
        y_all = simulator.forward_batch(x_all).to(torch.float32)

        params = list(itertools.product(n_samples_list, n_iter_list, n_splits_list))
        np.random.seed(seed)
        params = np.random.permutation(params)
        for n_samples, n_iter, n_splits in tqdm(params):
            print(
                f"Running benchmark for {simulator_str} with {n_samples} samples, "
                f"{n_iter} iterations, and {n_splits} splits"
            )
            try:
                x = x_all[:n_samples]
                y = y_all[:n_samples]
                df = run_benchmark(x, y, n_iter, n_splits, log_level)
                df["simulator"] = simulator_str
                df["n_samples"] = n_samples
                df["n_iter"] = n_iter
                df["n_splits"] = n_splits
                dfs.append(df)
                final_df = pd.concat(dfs, ignore_index=True)
                final_df.sort_values("r2_test", ascending=False).to_csv(
                    output_file, index=False
                )
            except Exception as e:
                print(f"Error raised while testing:\n{e}")


if __name__ == "__main__":
    main()
```
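Because the list options are declared with `multiple=True`, click expects each value as a repeated flag. A hypothetical invocation over a reduced grid:

```bash
python benchmarks/benchmark.py \
    --simulators Epidemic --simulators FlowProblem \
    --n_samples_list 20 --n_samples_list 50 \
    --n_iter_list 10 \
    --n_splits_list 2 \
    --log_level info \
    --output_file small_benchmark.csv
```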

benchmarks/plot_benchmark.ipynb

Lines changed: 127 additions & 0 deletions (new file)

The notebook contains four code cells (Python 3.12 kernel; the notebook JSON scaffolding is omitted here, with `# %%` marking cell boundaries):

```python
# %% Load benchmark results
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

df = pd.read_csv("https://github.com/user-attachments/files/21469860/benchmark_results.csv")

# %% Plotting helper
N_BOOTSTRAPS = 100

def generate_plots(df, metric="r2_train", exclude=["SupportVectorMachine", "LightGBM"], fontsize="small"):
    simulator_list = sorted(df["simulator"].unique().tolist())
    n_iter_list = sorted(df["n_iter"].unique().tolist())
    n_splits_list = sorted(df["n_splits"].unique().tolist())
    color = {name: f"C{idx}" for idx, name in enumerate(sorted(df["model_name"].unique().tolist()))}
    for plot_idx, simulator in enumerate(simulator_list):
        fig, axs = plt.subplots(len(n_splits_list), len(n_iter_list), figsize=(12, 6), squeeze=False)
        handles = []
        labels = []
        for row_idx, n_splits in enumerate(n_splits_list):
            for col_idx, n_iter in enumerate(n_iter_list):
                subset = df[df["simulator"].eq(simulator) & df["n_splits"].eq(n_splits) & df["n_iter"].eq(n_iter)]
                ax = axs[row_idx][col_idx]
                for idx, ((name,), group) in enumerate(subset.groupby(["model_name"], sort=True)):
                    if name in exclude:
                        continue
                    group_sorted = group.sort_values("n_samples")
                    line = ax.plot(group_sorted["n_samples"], group_sorted[metric], label=name, c=color[name])

                    if row_idx == 0 and col_idx == 0:
                        handles.append(line[0])
                        labels.append(name)

                    # Shaded band: standard error of the mean over bootstrap resamples
                    mean = group_sorted[metric]
                    ste = group_sorted[f"{metric}_std"] / np.sqrt(N_BOOTSTRAPS)
                    ax.fill_between(group_sorted["n_samples"], mean - ste, mean + ste, alpha=0.2, lw=0, color=color[name])
                ax.set_ylim(-0.1, 1.05)
                # ax.set_xlim(df["n_samples"].min(), df["n_samples"].max())
                ax.set_xlim(10, df["n_samples"].max())
                ax.axhline(0.0, lw=0.5, ls="--", c="grey", alpha=0.5, zorder=-1)

                ax.set_xscale("log")
                # ax.set_yscale("log")
                if col_idx == 0:
                    ax.set_ylabel(metric, size=fontsize)
                if row_idx == len(n_splits_list) - 1:
                    ax.set_xlabel("n_samples", size=fontsize)
                ax.tick_params(labelsize=fontsize)
                ax.set_title(f"{simulator} (n_iter={n_iter}, n_splits={n_splits})", size=fontsize)
                ax.grid(True, which="both", linestyle=":", linewidth=0.5, alpha=0.7)
        fig.legend(handles, labels, loc="upper center", bbox_to_anchor=(0.5, 0.98), ncol=df["model_name"].nunique() - len(exclude), fontsize=fontsize)

        # Adjust layout to make room for legend
        plt.tight_layout()
        plt.subplots_adjust(top=0.88)

        plt.show()

# %% All models
generate_plots(df, metric="r2_test", exclude=[])

# %% GPs, ensembles and MLPs only
generate_plots(df, metric="r2_test", exclude=["RandomForest", "LightGBM", "SupportVectorMachine", "RadialBasisFunctions"])
```
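The shaded bands are standard errors of the mean, `std / sqrt(N_BOOTSTRAPS)`, so the results CSV is expected to carry a `<metric>_std` column alongside each metric. To plot locally generated results rather than the hosted example file, point `read_csv` at your own combined CSV (the path below is hypothetical):

```python
df = pd.read_csv("benchmarks/data/2025-01-01_000000/benchmark_results.csv")
generate_plots(df, metric="r2_test", exclude=[])
```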

benchmarks/run_benchmark.sh

Lines changed: 22 additions & 0 deletions (new file)

```bash
#!/bin/bash
set -e
source .venv/bin/activate

# Run the benchmark script with the specified parameters
date_time=$(date +"%Y-%m-%d_%H%M%S")
outpath="./benchmarks/data/${date_time}/"
mkdir -p "$outpath"
for simulator in Epidemic FlowProblem Projectile ProjectileMultioutput; do
    for n_iter_pair in "10 100" "150 50" "200 20"; do
        for n_splits in 5 2; do
            n_iter_array=($n_iter_pair)
            n_iter1=${n_iter_array[0]}
            n_iter2=${n_iter_array[1]}
            echo "Running benchmark for simulator: $simulator, n_splits: $n_splits, n_iter: $n_iter1 $n_iter2"
            pueue add "python benchmarks/benchmark.py --simulators \"$simulator\" --n_splits_list \"$n_splits\" --n_iter_list \"$n_iter1\" --n_iter_list \"$n_iter2\" --log_level info --output_file \"${outpath}/benchmark_results_${simulator}_n_splits_${n_splits}_n_iter_${n_iter1}_${n_iter2}.csv\""
        done
    done
done

# Combine outputs with:
# xsv cat rows benchmarks/data/${date_time}/benchmark_*.csv > benchmark_results.csv
```
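If xsv is not available, the per-run CSVs can be combined with pandas instead — a sketch assuming the `benchmarks/data/<date_time>/` layout created by the script above:

```python
import glob

import pandas as pd

# Stack the rows of every per-run CSV written by run_benchmark.sh
files = sorted(glob.glob("benchmarks/data/*/benchmark_results_*.csv"))
combined = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)
combined.to_csv("benchmark_results.csv", index=False)
```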

pyproject.toml

Lines changed: 2 additions & 2 deletions

```diff
@@ -64,12 +64,12 @@ source = [".", "/tmp"]
 [tool.pyright]
 venvPath = "."
 venv = ".venv"
-include = ["autoemulate/experimental/*", "tests/experimental/*"]
+include = ["autoemulate/experimental/*", "tests/experimental/*", "benchmarks/*"]
 
 [tool.ruff]
 src = ["autoemulate/"]
 line-length = 88
-include = ["autoemulate/experimental/**/*.py", "tests/experimental/**/*.py"]
+include = ["autoemulate/experimental/**/*.py", "tests/experimental/**/*.py", "benchmarks/**/*.py"]
 target-version = "py310"
 
 [tool.ruff.format]
```
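With `benchmarks/` added to both tools' `include` lists, the new code can also be checked directly, assuming ruff and pyright are installed in the environment:

```bash
ruff check benchmarks/
pyright benchmarks/
```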
