diff --git a/Cargo.lock b/Cargo.lock
index d5de42fb3..2d0bc6ce2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3101,6 +3101,7 @@ dependencies = [
"miette",
"nix",
"openshell-core",
+ "polling",
"prost-types",
"tar",
"tokio",
@@ -3672,6 +3673,20 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
+[[package]]
+name = "polling"
+version = "3.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218"
+dependencies = [
+ "cfg-if",
+ "concurrent-queue",
+ "hermit-abi",
+ "pin-project-lite",
+ "rustix 1.1.4",
+ "windows-sys 0.61.2",
+]
+
[[package]]
name = "poly1305"
version = "0.8.0"
diff --git a/architecture/custom-vm-runtime.md b/architecture/custom-vm-runtime.md
index 548b86d17..4cafe424f 100644
--- a/architecture/custom-vm-runtime.md
+++ b/architecture/custom-vm-runtime.md
@@ -1,140 +1,161 @@
# Custom libkrunfw VM Runtime
-> Status: Experimental and work in progress (WIP). VM support is under active development and may change.
+> Status: Experimental and work in progress (WIP). The VM compute driver is
+> under active development and may change.
## Overview
-The OpenShell gateway VM uses [libkrun](https://github.com/containers/libkrun) to boot a
-lightweight microVM with Apple Hypervisor.framework (macOS) or KVM (Linux). The kernel
-is embedded inside `libkrunfw`, a companion library that packages a pre-built Linux kernel.
+The OpenShell gateway uses [libkrun](https://github.com/containers/libkrun) via the
+`openshell-driver-vm` compute driver to boot a lightweight microVM per sandbox.
+Each VM runs on Apple Hypervisor.framework (macOS) or KVM (Linux), with the guest
+kernel embedded inside `libkrunfw`.
-The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, netfilter, or
-conntrack support. This is insufficient for Kubernetes pod networking.
+The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge,
+netfilter, or conntrack support. That is insufficient for the sandbox supervisor's
+per-sandbox network namespace primitives (veth pair + iptables, see
+`crates/openshell-sandbox/src/sandbox/linux/netns.rs`). The custom libkrunfw
+runtime adds bridge, iptables/nftables, and conntrack support to the guest
+kernel.
-The custom libkrunfw runtime adds bridge CNI, iptables/nftables, and conntrack support to
-the VM kernel, enabling standard Kubernetes networking.
+The driver is spawned by `openshell-gateway` as a subprocess, talks to it over a
+Unix domain socket (`compute-driver.sock`) with the
+`openshell.compute.v1.ComputeDriver` gRPC surface, and manages per-sandbox
+microVMs. The runtime (libkrun + libkrunfw + gvproxy) and the sandbox rootfs are
+embedded directly in the driver binary — no sibling files required at runtime.
## Architecture
```mermaid
graph TD
subgraph Host["Host (macOS / Linux)"]
- BIN[openshell-vm binary]
- EMB["Embedded runtime (zstd-compressed)\nlibkrun · libkrunfw · gvproxy"]
- CACHE["~/.local/share/openshell/vm-runtime/{version}/"]
- PROV[Runtime provenance logging]
- GVP[gvproxy networking proxy]
-
- BIN --> EMB
- BIN -->|extracts to| CACHE
- BIN --> PROV
- BIN -->|spawns| GVP
+ GATEWAY["openshell-gateway
(compute::vm::spawn)"]
+ DRIVER["openshell-driver-vm
(compute-driver.sock)"]
+ EMB["Embedded runtime (zstd)
libkrun · libkrunfw · gvproxy
+ sandbox rootfs.tar.zst"]
+ GVP["gvproxy (per sandbox)
virtio-net · DHCP · DNS"]
+
+ GATEWAY <-->|gRPC over UDS| DRIVER
+ DRIVER --> EMB
+ DRIVER -->|spawns one per sandbox| GVP
end
- subgraph Guest["Guest VM"]
- INIT["openshell-vm-init.sh (PID 1)"]
- VAL[Validates kernel capabilities]
- CNI[Configures bridge CNI]
- EXECA["Starts exec agent\nvsock port 10777"]
- PKI[Generates mTLS PKI]
- K3S[Execs k3s server]
- EXECPY["openshell-vm-exec-agent.py"]
- CHK["check-vm-capabilities.sh"]
-
- INIT --> VAL --> CNI --> EXECA --> PKI --> K3S
+ subgraph Guest["Per-sandbox microVM"]
+ SBXINIT["/srv/openshell-vm-sandbox-init.sh"]
+ SBX["/opt/openshell/bin/openshell-sandbox
(PID 1, supervisor)"]
+ SBXINIT --> SBX
end
- BIN -- "fork + krun_start_enter" --> INIT
- GVP -- "virtio-net" --> Guest
+ DRIVER -- "fork + krun_start_enter" --> SBXINIT
+ GVP -- "virtio-net eth0" --> Guest
+ SBX -.->|"outbound ConnectSupervisor
gRPC stream"| GATEWAY
+ CLIENT["openshell-cli"] -->|SSH over supervisor relay| GATEWAY
```
+The driver spawns **one microVM per sandbox**. Each VM boots directly into
+`openshell-sandbox` as PID 1. All gateway ingress — SSH, exec, connect — rides
+the supervisor-initiated `ConnectSupervisor` gRPC stream opened from inside the
+guest back out to the gateway, so gvproxy is configured with `-ssh-port -1` and
+never binds a host-side TCP listener.
+
## Embedded Runtime
-The openshell-vm binary is fully self-contained, embedding both the VM runtime libraries
-and a minimal rootfs as zstd-compressed byte arrays. On first use, the binary extracts
-these to XDG cache directories with progress bars:
+`openshell-driver-vm` embeds the VM runtime libraries and the sandbox rootfs as
+zstd-compressed byte arrays, extracting on demand:
```
-~/.local/share/openshell/vm-runtime/{version}/
+~/.local/share/openshell/vm-runtime/{version}/   # libkrun / libkrunfw / gvproxy
├── libkrun.{dylib,so}
├── libkrunfw.{5.dylib,so.5}
└── gvproxy
-~/.local/share/openshell/openshell-vm/{version}/instances//rootfs/
-├── usr/local/bin/k3s
-├── opt/openshell/bin/openshell-sandbox
-├── opt/openshell/manifests/
-└── ...
+{state-dir}/sandboxes/{sandbox-id}/rootfs/       # per-sandbox rootfs
```
-This eliminates the need for separate bundles or downloads - a single ~120MB binary
-provides everything needed to run the VM. Old cache versions are automatically
-cleaned up when a new version is extracted.
+Old runtime cache versions are cleaned up when a new version is extracted.
-### Hybrid Approach
+### Sandbox rootfs preparation
-The embedded rootfs uses a "minimal" configuration:
-- Includes: Base Ubuntu, k3s binary, supervisor binary, helm charts, manifests
-- Excludes: Pre-loaded container images (~1GB savings)
+The rootfs tarball the driver embeds starts from the same minimal Ubuntu base
+used across the project, and is **rewritten into a supervisor-only sandbox
+guest** during extraction:
-Container images are pulled on demand when sandboxes are created. First boot takes
-~30-60s as k3s initializes; subsequent boots use cached state for ~3-5s startup.
+- k3s state and Kubernetes manifests are stripped out
+- `/srv/openshell-vm-sandbox-init.sh` is installed as the guest entrypoint
+- the guest boots directly into `openshell-sandbox` — no k3s, no kube-proxy,
+ no CNI plugins
-For the VM compute driver, the same embedded rootfs is rewritten into a
-supervisor-only sandbox guest before boot:
+See `crates/openshell-driver-vm/src/rootfs.rs` for the rewrite logic and
+`crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh` for the init
+script that gets installed.
-- removes k3s state and Kubernetes manifests from the extracted rootfs
-- installs `/srv/openshell-vm-sandbox-init.sh`
-- boots directly into `openshell-sandbox` instead of `openshell-vm-init.sh`
-- keeps the same embedded libkrun/libkrunfw kernel/runtime bundle
+### `--internal-run-vm` helper
-`openshell-driver-vm` now embeds the sandbox rootfs tarball independently so it can
-prepare sandbox guests without linking against the `openshell-vm` Rust crate.
-It now also embeds the minimal libkrun/libkrunfw bundle it needs for sandbox
-boots and launches sandbox guests via a hidden helper mode in the
-`openshell-driver-vm` binary itself, without depending on the `openshell-vm`
-binary. The helper still starts its own embedded `gvproxy` instance to provide
-virtio-net guest egress plus the single inbound SSH port forward used by the
-compute driver.
+The driver binary has two modes: the default mode is the gRPC server; when
+launched with `--internal-run-vm` it becomes a per-sandbox launcher. The driver
+spawns one launcher per sandbox as a subprocess, which in turn starts `gvproxy`
+and calls `krun_start_enter` to boot the guest. Keeping the launcher in the
+same binary means the driver ships a single artifact for both roles.
-For fully air-gapped environments requiring pre-loaded images, build with:
-```bash
-mise run vm:rootfs # Full rootfs (~2GB, includes images)
-mise run vm:build # Rebuild binary with full rootfs
-```
+## Network Plane
+
+The driver launches a **dedicated `gvproxy` instance per sandbox** to provide the
+guest's networking plane:
+
+- virtio-net backend over a Unix SOCK_STREAM (Linux) or SOCK_DGRAM (macOS vfkit)
+ socket, which surfaces as `eth0` inside the guest
+- DHCP server + default router (192.168.127.1 / 192.168.127.2) for the guest's
+ udhcpc client
+- DNS for host aliases: the guest init script seeds `/etc/hosts` with
+ `host.openshell.internal` → 192.168.127.1, while leaving gvproxy's legacy
+ `host.containers.internal` / `host.docker.internal` resolution intact
+
+The `-listen` API socket and the `-ssh-port` forwarder are both intentionally
+omitted. After the supervisor-initiated relay migration the driver does not
+enqueue any host-side port forwards, and the guest's SSH listener lives on a
+Unix socket at `/run/openshell/ssh.sock` inside the VM that is reached over the
+outbound `ConnectSupervisor` gRPC stream. Binding a host listener would race
+concurrent sandboxes for port 2222 and surface a misleading "sshd is reachable"
+endpoint.
+
+The sandbox supervisor's per-sandbox netns (veth pair + iptables) branches off
+of this plane. libkrun's built-in TSI socket impersonation would not satisfy
+those kernel-level primitives, which is why we need the custom libkrunfw.
-## Network Profile
+## Process Lifecycle Cleanup
-The VM uses the bridge CNI profile, which requires a custom libkrunfw with bridge and
-netfilter kernel support. The init script validates these capabilities at boot and fails
-fast with an actionable error if they are missing.
+`openshell-driver-vm` installs a cross-platform "die when my parent dies"
+primitive (`procguard`) in every link of the spawn chain so that killing
+`openshell-gateway` (SIGTERM, SIGKILL, or crash) reaps the driver, per-sandbox
+launcher, gvproxy, and the libkrun worker:
-### Bridge Profile
+- Linux: `nix::sys::prctl::set_pdeathsig(SIGKILL)`
+- macOS / BSDs: `smol-rs/polling` with `ProcessOps::Exit` on a helper thread
+- gvproxy (the one non-Rust child) gets `PR_SET_PDEATHSIG` via `pre_exec` on
+ Linux, and is SIGTERM'd from the launcher's procguard cleanup callback on
+ macOS
-- CNI: bridge plugin with `cni0` interface
-- IP masquerade: enabled (iptables-legacy via CNI bridge plugin)
-- kube-proxy: enabled (nftables mode)
-- Service VIPs: functional (ClusterIP, NodePort)
-- hostNetwork workarounds: not required
+See `crates/openshell-driver-vm/src/procguard.rs` for the implementation and
+`tasks/scripts/vm/smoke-orphan-cleanup.sh` (exposed as
+`mise run vm:smoke:orphan-cleanup`) for the regression test that covers both
+SIGTERM and SIGKILL paths.
## Runtime Provenance
-At boot, the openshell-vm binary logs provenance metadata about the loaded runtime bundle:
+At driver startup the loaded runtime bundle is logged with:
- Library paths and SHA-256 hashes
- Whether the runtime is custom-built or stock
- For custom runtimes: libkrunfw commit, kernel version, build timestamp
-This information is sourced from `provenance.json` (generated by the build script)
-and makes it straightforward to correlate VM behavior with a specific runtime artifact.
+This information is sourced from `provenance.json` (generated by the build
+script) and makes it straightforward to correlate sandbox VM behavior with a
+specific runtime artifact.
## Build Pipeline
```mermaid
graph LR
subgraph Source["crates/openshell-vm/runtime/"]
- KCONF["kernel/openshell.kconfig\nKernel config fragment"]
- README["README.md\nOperator documentation"]
+ KCONF["kernel/openshell.kconfig
Kernel config fragment"]
end
subgraph Linux["Linux CI (build-libkrun.sh)"]
@@ -145,101 +166,87 @@ graph LR
BUILD_M["Build libkrunfw.dylib + libkrun.dylib"]
end
- subgraph Output["target/libkrun-build/"]
- LIB_SO["libkrunfw.so + libkrun.so\n(Linux)"]
- LIB_DY["libkrunfw.dylib + libkrun.dylib\n(macOS)"]
+ subgraph Output["vm-runtime-<platform>.tar.zst"]
+ LIB_SO["libkrunfw.so + libkrun.so + gvproxy
(Linux)"]
+ LIB_DY["libkrunfw.dylib + libkrun.dylib + gvproxy
(macOS)"]
end
- KCONF --> BUILD_L
- BUILD_L --> LIB_SO
- KCONF --> BUILD_M
- BUILD_M --> LIB_DY
+ KCONF --> BUILD_L --> LIB_SO
+ KCONF --> BUILD_M --> LIB_DY
```
+The `vm-runtime-{platform}.tar.zst` artifact is consumed by
+`openshell-driver-vm`'s `build.rs`, which embeds the library set into the
+binary via `include_bytes!()`. Setting `OPENSHELL_VM_RUNTIME_COMPRESSED_DIR`
+at build time (wired up by `crates/openshell-driver-vm/start.sh`) points the
+build at the staged artifacts.
+
## Kernel Config Fragment
-The `openshell.kconfig` fragment enables these kernel features on top of the stock
-libkrunfw kernel:
+The `openshell.kconfig` fragment enables these kernel features on top of the
+stock libkrunfw kernel:
| Feature | Key Configs | Purpose |
|---------|-------------|---------|
-| Network namespaces | `CONFIG_NET_NS`, `CONFIG_NAMESPACES` | Pod isolation |
-| veth | `CONFIG_VETH` | Pod network namespace pairs |
-| Bridge device | `CONFIG_BRIDGE`, `CONFIG_BRIDGE_NETFILTER` | cni0 bridge for pod networking, kube-proxy bridge traffic visibility |
+| Network namespaces | `CONFIG_NET_NS`, `CONFIG_NAMESPACES` | Sandbox netns isolation |
+| veth | `CONFIG_VETH` | Sandbox network namespace pairs |
+| Bridge device | `CONFIG_BRIDGE`, `CONFIG_BRIDGE_NETFILTER` | Bridge support + iptables visibility into bridge traffic |
| Netfilter framework | `CONFIG_NETFILTER`, `CONFIG_NETFILTER_ADVANCED`, `CONFIG_NETFILTER_XTABLES` | iptables/nftables framework |
-| xtables match modules | `CONFIG_NETFILTER_XT_MATCH_CONNTRACK`, `_COMMENT`, `_MULTIPORT`, `_MARK`, `_STATISTIC`, `_ADDRTYPE`, `_RECENT`, `_LIMIT` | kube-proxy and kubelet iptables rules |
+| xtables match modules | `CONFIG_NETFILTER_XT_MATCH_CONNTRACK`, `_COMMENT`, `_MULTIPORT`, `_MARK`, `_STATISTIC`, `_ADDRTYPE`, `_RECENT`, `_LIMIT` | Sandbox supervisor iptables rules |
| Connection tracking | `CONFIG_NF_CONNTRACK`, `CONFIG_NF_CT_NETLINK` | NAT state tracking |
-| NAT | `CONFIG_NF_NAT` | Service VIP DNAT/SNAT |
-| iptables | `CONFIG_IP_NF_IPTABLES`, `CONFIG_IP_NF_FILTER`, `CONFIG_IP_NF_NAT`, `CONFIG_IP_NF_MANGLE` | CNI bridge masquerade and compat |
-| nftables | `CONFIG_NF_TABLES`, `CONFIG_NFT_CT`, `CONFIG_NFT_NAT`, `CONFIG_NFT_MASQ`, `CONFIG_NFT_NUMGEN`, `CONFIG_NFT_FIB_IPV4` | kube-proxy nftables mode (primary) |
-| IP forwarding | `CONFIG_IP_ADVANCED_ROUTER`, `CONFIG_IP_MULTIPLE_TABLES` | Pod-to-pod routing |
-| IPVS | `CONFIG_IP_VS`, `CONFIG_IP_VS_RR`, `CONFIG_IP_VS_NFCT` | kube-proxy IPVS mode (optional) |
-| Traffic control | `CONFIG_NET_SCH_HTB`, `CONFIG_NET_CLS_CGROUP` | Kubernetes QoS |
-| Cgroups | `CONFIG_CGROUPS`, `CONFIG_CGROUP_DEVICE`, `CONFIG_MEMCG`, `CONFIG_CGROUP_PIDS` | Container resource limits |
-| TUN/TAP | `CONFIG_TUN` | CNI plugin support |
+| NAT | `CONFIG_NF_NAT` | Sandbox egress DNAT/SNAT |
+| iptables | `CONFIG_IP_NF_IPTABLES`, `CONFIG_IP_NF_FILTER`, `CONFIG_IP_NF_NAT`, `CONFIG_IP_NF_MANGLE` | Masquerade and compat |
+| nftables | `CONFIG_NF_TABLES`, `CONFIG_NFT_CT`, `CONFIG_NFT_NAT`, `CONFIG_NFT_MASQ`, `CONFIG_NFT_NUMGEN`, `CONFIG_NFT_FIB_IPV4` | nftables path |
+| IP forwarding | `CONFIG_IP_ADVANCED_ROUTER`, `CONFIG_IP_MULTIPLE_TABLES` | Sandbox-to-host routing |
+| Traffic control | `CONFIG_NET_SCH_HTB`, `CONFIG_NET_CLS_CGROUP` | QoS |
+| Cgroups | `CONFIG_CGROUPS`, `CONFIG_CGROUP_DEVICE`, `CONFIG_MEMCG`, `CONFIG_CGROUP_PIDS` | Sandbox resource limits |
+| TUN/TAP | `CONFIG_TUN` | CNI plugin compatibility; inherited from the shared kconfig, not exercised by the driver. |
| Dummy interface | `CONFIG_DUMMY` | Fallback networking |
-| Landlock | `CONFIG_SECURITY_LANDLOCK` | Filesystem sandboxing support |
-| Seccomp filter | `CONFIG_SECCOMP_FILTER` | Syscall filtering support |
+| Landlock | `CONFIG_SECURITY_LANDLOCK` | Sandbox supervisor filesystem sandboxing |
+| Seccomp filter | `CONFIG_SECCOMP_FILTER` | Sandbox supervisor syscall filtering |
-See `crates/openshell-vm/runtime/kernel/openshell.kconfig` for the full fragment with
-inline comments explaining why each option is needed.
+See `crates/openshell-vm/runtime/kernel/openshell.kconfig` for the full
+fragment with inline comments explaining why each option is needed.
## Verification
-One verification tool is provided:
-
-1. **Capability checker** (`check-vm-capabilities.sh`): Runs inside the VM to verify
- kernel capabilities. Produces pass/fail results for each required feature.
-
-## Running Commands In A Live VM
-
-The standalone `openshell-vm` binary supports `openshell-vm exec -- ` for a running VM.
-
-- Each VM instance stores local runtime state next to its instance rootfs
-- libkrun maps a per-instance host Unix socket into the guest on vsock port `10777`
-- `openshell-vm-init.sh` starts `openshell-vm-exec-agent.py` during boot
-- `openshell-vm exec` connects to the host socket, which libkrun forwards into the guest exec agent
-- The guest exec agent spawns the command, then streams stdout, stderr, and exit status back
-- The host-side bootstrap also uses the exec agent to read PKI cert files from the guest
- (via `cat /opt/openshell/pki/`) instead of requiring a separate vsock server
-
-`openshell-vm exec` also injects `KUBECONFIG=/etc/rancher/k3s/k3s.yaml` by default so kubectl-style
-commands work the same way they would inside the VM shell.
+- **Capability checker** (`check-vm-capabilities.sh`): runs inside a sandbox VM
+ to verify kernel capabilities. Produces pass/fail results for each required
+ feature.
+- **Orphan-cleanup smoke test**: `mise run vm:smoke:orphan-cleanup` asserts
+ that killing the gateway leaves zero driver, launcher, gvproxy, or libkrun
+ survivors.
## Build Commands
-```bash
+```shell
# One-time setup: download pre-built runtime (~30s)
mise run vm:setup
-# Build and run
-mise run vm
-
-# Build embedded binary with base rootfs (~120MB, recommended)
-mise run vm:rootfs -- --base # Build base rootfs tarball
-mise run vm:build # Build binary with embedded rootfs
-
-# Build with full rootfs (air-gapped, ~2GB+)
-mise run vm:rootfs # Build full rootfs tarball
-mise run vm:build # Rebuild binary
+# Start openshell-gateway with the VM compute driver
+mise run gateway:vm
# With custom kernel (optional, adds ~20 min)
-FROM_SOURCE=1 mise run vm:setup # Build runtime from source
-mise run vm:build # Then build embedded binary
+FROM_SOURCE=1 mise run vm:setup
# Wipe everything and start over
mise run vm:clean
```
+See `crates/openshell-driver-vm/README.md` for the full driver workflow,
+including multi-gateway development, CLI registration, and sandbox creation
+examples.
+
## CI/CD
-The openshell-vm build is split into two GitHub Actions workflows that publish to a
-rolling `vm-dev` GitHub Release:
+Two GitHub Actions workflows back the driver's release artifacts, both
+publishing to a rolling `vm-dev` GitHub Release:
### Kernel Runtime (`release-vm-kernel.yml`)
-Builds the custom libkrunfw (kernel firmware), libkrun (VMM), and gvproxy for all
-supported platforms. Runs on-demand or when the kernel config / pinned versions change.
+Builds the custom libkrunfw (kernel firmware), libkrun (VMM), and gvproxy for
+all supported platforms. Runs on-demand or when the kernel config / pinned
+versions change.
| Platform | Runner | Build Method |
|----------|--------|-------------|
@@ -247,43 +254,36 @@ supported platforms. Runs on-demand or when the kernel config / pinned versions
| Linux x86_64 | `build-amd64` (self-hosted) | Native `build-libkrun.sh` |
| macOS ARM64 | `macos-latest-xlarge` (GitHub-hosted) | `build-libkrun-macos.sh` |
-Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, gvproxy, and
-provenance metadata.
-
-Each platform builds its own libkrunfw and libkrun natively. The kernel inside
-libkrunfw is always Linux regardless of host platform.
+Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw,
+gvproxy, and provenance metadata. Each platform builds its own libkrunfw and
+libkrun natively; the kernel inside libkrunfw is always Linux regardless of
+host platform.
-### VM Binary (`release-vm-dev.yml`)
+### Driver Binary (`release-vm-dev.yml`)
-Builds the self-extracting openshell-vm binary for all platforms. Runs on every push
-to `main` that touches VM-related crates.
+Builds the self-contained `openshell-driver-vm` binary for every platform,
+with the kernel runtime + sandbox rootfs embedded. Runs on every push to
+`main` that touches VM-related crates.
-```mermaid
-graph TD
- CV[compute-versions] --> DL[download-kernel-runtime\nfrom vm-dev release]
- DL --> RFS_ARM[build-rootfs arm64]
- DL --> RFS_AMD[build-rootfs amd64]
- RFS_ARM --> VM_ARM[build-vm linux-arm64]
- RFS_AMD --> VM_AMD[build-vm linux-amd64]
- RFS_ARM --> VM_MAC["build-vm-macos\n(osxcross, reuses arm64 rootfs)"]
- VM_ARM --> REL[release-vm-dev\nupload to rolling release]
- VM_AMD --> REL
- VM_MAC --> REL
-```
+The `download-kernel-runtime` job pulls the current `vm-runtime-{platform}.tar.zst`
+from the `vm-dev` release; the `build-openshell-driver-vm` jobs set
+`OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed` and
+run `cargo build --release -p openshell-driver-vm`. The macOS driver is
+cross-compiled via osxcross (no macOS runner needed for the binary build —
+only for the kernel build).
-The macOS binary is cross-compiled via osxcross (no macOS runner needed for the binary
-build — only for the kernel build). The macOS VM guest is always Linux ARM64, so it
-reuses the arm64 rootfs.
-
-macOS binaries produced via osxcross are not codesigned. Users must self-sign:
-```bash
-codesign --entitlements crates/openshell-vm/entitlements.plist --force -s - ./openshell-vm
-```
+macOS driver binaries produced via osxcross are not codesigned. Development
+builds are signed automatically by `crates/openshell-driver-vm/start.sh`; a
+packaged release needs signing in CI.
## Rollout Strategy
-1. Custom runtime is embedded by default when building with `mise run vm:build`.
-2. The init script validates kernel capabilities at boot and fails fast if missing.
-3. For development, override with `OPENSHELL_VM_RUNTIME_DIR` to use a local directory.
-4. In CI, kernel runtime is pre-built and cached in the `vm-dev` release. The binary
- build downloads it via `download-kernel-runtime.sh`.
+1. Custom runtime is embedded by default when building `openshell-driver-vm`
+ with `OPENSHELL_VM_RUNTIME_COMPRESSED_DIR` set (wired up by
+ `crates/openshell-driver-vm/start.sh`).
+2. The sandbox init script validates kernel capabilities at boot and fails
+ fast if missing.
+3. For development, override with `OPENSHELL_VM_RUNTIME_DIR` to use a local
+ directory instead of the extracted cache.
+4. In CI, the kernel runtime is pre-built and cached in the `vm-dev` release.
+ The driver build downloads it via `download-kernel-runtime.sh`.
diff --git a/architecture/gateway.md b/architecture/gateway.md
index 5dd2419af..9e9da6785 100644
--- a/architecture/gateway.md
+++ b/architecture/gateway.md
@@ -605,7 +605,7 @@ The gateway reaches the sandbox exclusively through the supervisor-initiated `Co
- **Create**: The VM driver process allocates a sandbox-specific rootfs from its own embedded `rootfs.tar.zst`, injects an explicitly configured guest mTLS bundle when the gateway callback endpoint is `https://`, then re-execs itself in a hidden helper mode that loads libkrun directly and boots the supervisor.
- **Networking**: The helper starts an embedded `gvproxy`, wires it into libkrun as virtio-net, and gives the guest outbound connectivity. No inbound TCP listener is needed — the supervisor reaches the gateway over its outbound `ConnectSupervisor` stream.
-- **Gateway callback**: The guest init script configures `eth0` for gvproxy networking, prefers the configured `OPENSHELL_GRPC_ENDPOINT`, and falls back to host aliases or the gvproxy gateway IP (`192.168.127.1`) when local hostname resolution is unavailable on macOS.
+- **Gateway callback**: The guest init script configures `eth0` for gvproxy networking, seeds `/etc/hosts` so `host.openshell.internal` resolves to the gvproxy gateway IP (`192.168.127.1`), preserves gvproxy's legacy `host.containers.internal` / `host.docker.internal` DNS answers, prefers the configured `OPENSHELL_GRPC_ENDPOINT`, and falls back to those aliases or the raw gateway IP when local hostname resolution is unavailable on macOS.
- **Guest boot**: The sandbox guest runs a minimal init script that starts `openshell-sandbox` directly as PID 1 inside the VM.
- **Watch stream**: Emits provisioning, ready, error, deleting, deleted, and platform-event updates so the gateway store remains the durable source of truth.
diff --git a/crates/openshell-driver-vm/Cargo.toml b/crates/openshell-driver-vm/Cargo.toml
index 368716ef9..b4d92b0fc 100644
--- a/crates/openshell-driver-vm/Cargo.toml
+++ b/crates/openshell-driver-vm/Cargo.toml
@@ -37,5 +37,13 @@ libloading = "0.8"
tar = "0.4"
zstd = "0.13"
+# smol-rs/polling drives the BSD/macOS parent-death detection in
+# procguard via kqueue's EVFILT_PROC / NOTE_EXIT filter. We could use
+# it on Linux too (via epoll + pidfd) but sticking with
+# nix::sys::prctl::set_pdeathsig there keeps the Linux path a single
+# syscall with no helper thread.
+[target.'cfg(any(target_os = "macos", target_os = "ios", target_os = "freebsd", target_os = "netbsd", target_os = "openbsd", target_os = "dragonfly"))'.dependencies]
+polling = "3.11"
+
[lints]
workspace = true
diff --git a/crates/openshell-driver-vm/Makefile b/crates/openshell-driver-vm/Makefile
deleted file mode 100644
index e1c360f3d..000000000
--- a/crates/openshell-driver-vm/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-.PHONY: start
-
-start:
- ./start.sh
diff --git a/crates/openshell-driver-vm/README.md b/crates/openshell-driver-vm/README.md
index a95462695..8808b25d9 100644
--- a/crates/openshell-driver-vm/README.md
+++ b/crates/openshell-driver-vm/README.md
@@ -31,19 +31,15 @@ Sandbox guests execute `/opt/openshell/bin/openshell-sandbox` as PID 1 inside th
## Quick start (recommended)
-`start.sh` handles runtime setup, builds, codesigning, and environment wiring. From the repo root:
```shell
-crates/openshell-driver-vm/start.sh
+mise run gateway:vm
```
-or equivalently:
-
-```shell
-make -C crates/openshell-driver-vm start
-```
-First run takes a few minutes while `mise run vm:setup` stages libkrun/libkrunfw/gvproxy and `mise run vm:rootfs -- --base` builds the embedded rootfs. Subsequent runs are cached. State lives under `target/openshell-vm-driver-dev/` (SQLite DB + per-sandbox rootfs + `compute-driver.sock`).
+First run takes a few minutes while `mise run vm:setup` stages libkrun/libkrunfw/gvproxy and `mise run vm:rootfs -- --base` builds the embedded rootfs. Subsequent runs are cached. To keep the Unix socket path under macOS `SUN_LEN`, `mise run gateway:vm` and `start.sh` default the state dir to `/tmp/openshell-vm-driver-dev-$USER-port-$PORT/` (SQLite DB + per-sandbox rootfs + `compute-driver.sock`) unless `OPENSHELL_VM_DRIVER_STATE_DIR` is set.
+The wrapper also prints the recommended gateway name (`vm-driver-port-$PORT` by default) plus the exact repo-local `scripts/bin/openshell gateway add` and `scripts/bin/openshell gateway select` commands to use from another terminal. This avoids accidentally hitting an older `openshell` binary elsewhere on your `PATH`.
+It also exports `OPENSHELL_DRIVER_DIR=$PWD/target/debug` before starting the gateway so local dev runs use the freshly built `openshell-driver-vm` instead of an older installed copy from `~/.local/libexec/openshell` or `/usr/local/libexec`.
Override via environment:
@@ -53,10 +49,33 @@ OPENSHELL_SSH_HANDSHAKE_SECRET=$(openssl rand -hex 32) \
crates/openshell-driver-vm/start.sh
```
+Run multiple dev gateways side by side by giving each one a unique port. The wrapper derives a distinct default state dir from that port automatically:
+
+```shell
+OPENSHELL_SERVER_PORT=8080 mise run gateway:vm
+OPENSHELL_SERVER_PORT=8081 mise run gateway:vm
+```
+
+If you want a custom suffix instead of `port-$PORT`, set `OPENSHELL_VM_INSTANCE`:
+
+```shell
+OPENSHELL_SERVER_PORT=8082 \
+OPENSHELL_VM_INSTANCE=feature-a \
+mise run gateway:vm
+```
+
+If you want a custom CLI gateway name, set `OPENSHELL_VM_GATEWAY_NAME`:
+
+```shell
+OPENSHELL_SERVER_PORT=8082 \
+OPENSHELL_VM_GATEWAY_NAME=vm-feature-a \
+mise run gateway:vm
+```
+
Teardown:
```shell
-rm -rf target/openshell-vm-driver-dev
+rm -rf /tmp/openshell-vm-driver-dev-$USER-port-8080
```
## Manual equivalent
@@ -78,16 +97,17 @@ codesign \
--force -s - target/debug/openshell-driver-vm
# 4. Start the gateway with the VM driver
-mkdir -p target/openshell-vm-driver-dev
+mkdir -p /tmp/openshell-vm-driver-dev-$USER-port-8080
target/debug/openshell-gateway \
--drivers vm \
--disable-tls \
- --database-url sqlite:target/openshell-vm-driver-dev/openshell.db \
+ --database-url sqlite:/tmp/openshell-vm-driver-dev-$USER-port-8080/openshell.db \
+ --driver-dir $PWD/target/debug \
--grpc-endpoint http://host.containers.internal:8080 \
--ssh-handshake-secret dev-vm-driver-secret \
--ssh-gateway-host 127.0.0.1 \
--ssh-gateway-port 8080 \
- --vm-driver-state-dir $PWD/target/openshell-vm-driver-dev
+ --vm-driver-state-dir /tmp/openshell-vm-driver-dev-$USER-port-8080
```
The gateway resolves `openshell-driver-vm` in this order: `--driver-dir`, conventional install locations (`~/.local/libexec/openshell`, `/usr/local/libexec/openshell`, `/usr/local/libexec`), then a sibling of the gateway binary.
@@ -97,7 +117,7 @@ The gateway resolves `openshell-driver-vm` in this order: `--driver-dir`, conven
| Flag | Env var | Default | Purpose |
|---|---|---|---|
| `--drivers vm` | `OPENSHELL_DRIVERS` | `kubernetes` | Select the VM compute driver. |
-| `--grpc-endpoint URL` | `OPENSHELL_GRPC_ENDPOINT` | — | Required. URL the sandbox guest calls back to. Use a host alias that resolves to the gateway's host from inside the VM (gvproxy answers `host.containers.internal` and `host.openshell.internal` to `192.168.127.1`). |
+| `--grpc-endpoint URL` | `OPENSHELL_GRPC_ENDPOINT` | — | Required. URL the sandbox guest calls back to. Use a host alias that resolves to the gateway's host from inside the VM (`host.containers.internal` comes from gvproxy DNS; the guest init script also seeds `host.openshell.internal` to `192.168.127.1`). |
| `--vm-driver-state-dir DIR` | `OPENSHELL_VM_DRIVER_STATE_DIR` | `target/openshell-vm-driver` | Per-sandbox rootfs, console logs, and the `compute-driver.sock` UDS. |
| `--driver-dir DIR` | `OPENSHELL_DRIVER_DIR` | unset | Override the directory searched for `openshell-driver-vm`. |
| `--vm-driver-vcpus N` | `OPENSHELL_VM_DRIVER_VCPUS` | `2` | vCPUs per sandbox. |
diff --git a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh
index 70dda5acb..e449003f9 100644
--- a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh
+++ b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh
@@ -9,6 +9,7 @@
set -euo pipefail
BOOT_START=$(date +%s%3N 2>/dev/null || date +%s)
+GVPROXY_GATEWAY_IP="192.168.127.1"
ts() {
local now
@@ -72,6 +73,20 @@ tcp_probe() {
fi
}
+ensure_host_gateway_aliases() {
+ local hosts_tmp="/tmp/openshell-hosts.$$"
+
+ if [ -f /etc/hosts ]; then
+ grep -vE '(^|[[:space:]])host\.openshell\.internal([[:space:]]|$)' /etc/hosts > "$hosts_tmp" || true
+ else
+ : > "$hosts_tmp"
+ fi
+
+ printf '%s host.openshell.internal\n' "$GVPROXY_GATEWAY_IP" >> "$hosts_tmp"
+ cat "$hosts_tmp" > /etc/hosts
+ rm -f "$hosts_tmp"
+}
+
rewrite_openshell_endpoint_if_needed() {
local endpoint="${OPENSHELL_ENDPOINT:-}"
[ -n "$endpoint" ] || return 0
@@ -92,7 +107,7 @@ rewrite_openshell_endpoint_if_needed() {
return 0
fi
- for candidate in host.containers.internal host.docker.internal 192.168.127.1; do
+ for candidate in host.openshell.internal host.containers.internal host.docker.internal "$GVPROXY_GATEWAY_IP"; do
if [ "$candidate" = "$host" ]; then
continue
fi
@@ -163,18 +178,20 @@ DHCP_SCRIPT
if ! udhcpc -i eth0 -f -q -n -T 1 -t 3 -A 1 -s "$UDHCPC_SCRIPT" 2>&1; then
ts "WARNING: DHCP failed, falling back to static config"
ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true
- ip route add default via 192.168.127.1 2>/dev/null || true
+ ip route add default via "$GVPROXY_GATEWAY_IP" 2>/dev/null || true
fi
else
ts "no DHCP client, using static config"
ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true
- ip route add default via 192.168.127.1 2>/dev/null || true
+ ip route add default via "$GVPROXY_GATEWAY_IP" 2>/dev/null || true
fi
if [ ! -s /etc/resolv.conf ]; then
echo "nameserver 8.8.8.8" > /etc/resolv.conf
echo "nameserver 8.8.4.4" >> /etc/resolv.conf
fi
+
+ ensure_host_gateway_aliases
else
ts "WARNING: eth0 not found; supervisor will start without guest egress"
fi
diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs
index 8237ba03c..d649a585a 100644
--- a/crates/openshell-driver-vm/src/driver.rs
+++ b/crates/openshell-driver-vm/src/driver.rs
@@ -33,6 +33,8 @@ const DRIVER_NAME: &str = "openshell-driver-vm";
const WATCH_BUFFER: usize = 256;
const DEFAULT_VCPUS: u8 = 2;
const DEFAULT_MEM_MIB: u32 = 2048;
+const GVPROXY_GATEWAY_IP: &str = "192.168.127.1";
+const OPENSHELL_HOST_GATEWAY_ALIAS: &str = "host.openshell.internal";
const GUEST_SSH_SOCKET_PATH: &str = "/run/openshell/ssh.sock";
const GUEST_TLS_DIR: &str = "/opt/openshell/tls";
const GUEST_TLS_CA_PATH: &str = "/opt/openshell/tls/ca.crt";
@@ -147,7 +149,7 @@ fn validate_openshell_endpoint(endpoint: &str) -> Result<(), String> {
if invalid_from_vm {
return Err(format!(
- "openshell endpoint '{endpoint}' is not reachable from sandbox VMs; use a concrete host such as 127.0.0.1, host.containers.internal, or another routable address"
+ "openshell endpoint '{endpoint}' is not reachable from sandbox VMs; use a concrete host such as 127.0.0.1, {OPENSHELL_HOST_GATEWAY_ALIAS}, or another routable address"
));
}
@@ -256,7 +258,19 @@ impl VmDriver {
let console_output = state_dir.join("rootfs-console.log");
let mut command = Command::new(&self.launcher_bin);
- command.kill_on_drop(true);
+ // Intentionally DO NOT set kill_on_drop(true). On a signal-driven
+ // driver exit (SIGKILL, SIGTERM without a handler, panic),
+ // tokio's Drop is racy with the launcher's procguard-initiated
+ // cleanup: if kill_on_drop SIGKILLs the launcher first, its
+ // cleanup callback never gets to SIGTERM gvproxy, and gvproxy is
+ // reparented to init as an orphan. Instead the whole cleanup
+ // cascade runs via procguard:
+ // driver exits → launcher's kqueue (macOS) or PR_SET_PDEATHSIG
+ // (Linux) fires → launcher kills gvproxy + libkrun fork →
+ // launcher exits → its own children die under pdeathsig.
+ // The explicit Drop path in VmProcess::terminate_vm_process still
+ // handles voluntary `delete_sandbox` teardown cleanly, where we
+ // do want SIGTERM + wait + SIGKILL semantics.
command.stdin(Stdio::null());
command.stdout(Stdio::inherit());
command.stderr(Stdio::inherit());
@@ -403,16 +417,23 @@ impl VmDriver {
snapshots
}
+ /// Watch the launcher child process and surface errors as driver
+ /// conditions.
+ ///
+ /// The driver no longer owns the `Ready` transition — the gateway
+ /// promotes a sandbox to `Ready` the moment its supervisor session
+ /// lands (see `openshell-server/src/compute/mod.rs`). This loop only
+ /// handles the sad paths: the child process failing to start, exiting
+ /// abnormally, or becoming unpollable. Those still surface as driver
+ /// `Error` conditions so the gateway can reason about a dead VM.
async fn monitor_sandbox(&self, sandbox_id: String) {
- let mut ready_emitted = false;
-
loop {
- let (process, state_dir) = {
+ let process = {
let registry = self.registry.lock().await;
let Some(record) = registry.get(&sandbox_id) else {
return;
};
- (record.process.clone(), record.state_dir.clone())
+ record.process.clone()
};
let exit_status = {
@@ -469,16 +490,6 @@ impl VmDriver {
return;
}
- if !ready_emitted && guest_ssh_ready(&state_dir).await {
- if let Some(snapshot) = self
- .set_snapshot_condition(&sandbox_id, ready_condition(), false)
- .await
- {
- self.publish_snapshot(snapshot);
- }
- ready_emitted = true;
- }
-
tokio::time::sleep(Duration::from_millis(250)).await;
}
}
@@ -726,7 +737,7 @@ fn guest_visible_openshell_endpoint(endpoint: &str) -> String {
None => false,
};
- if should_rewrite && url.set_host(Some("192.168.127.1")).is_ok() {
+ if should_rewrite && url.set_host(Some(GVPROXY_GATEWAY_IP)).is_ok() {
return url.to_string();
}
@@ -843,16 +854,6 @@ async fn terminate_vm_process(child: &mut Child) -> Result<(), std::io::Error> {
}
}
-async fn guest_ssh_ready(state_dir: &Path) -> bool {
- let console_log = state_dir.join("rootfs-console.log");
- let Ok(contents) = tokio::fs::read_to_string(console_log).await else {
- return false;
- };
-
- contents.contains("SSH server is ready to accept connections")
- || contents.contains("SSH server listening")
-}
-
fn sandbox_snapshot(sandbox: &Sandbox, condition: SandboxCondition, deleting: bool) -> Sandbox {
Sandbox {
id: sandbox.id.clone(),
@@ -895,16 +896,6 @@ fn provisioning_condition() -> SandboxCondition {
}
}
-fn ready_condition() -> SandboxCondition {
- SandboxCondition {
- r#type: "Ready".to_string(),
- status: "True".to_string(),
- reason: "Listening".to_string(),
- message: "Supervisor is listening for SSH connections".to_string(),
- last_transition_time: String::new(),
- }
-}
-
fn deleting_condition() -> SandboxCondition {
SandboxCondition {
r#type: "Ready".to_string(),
@@ -1030,19 +1021,47 @@ mod tests {
let env = build_guest_environment(&sandbox, &config);
assert!(env.contains(&"HOME=/root".to_string()));
- assert!(env.contains(&"OPENSHELL_ENDPOINT=http://192.168.127.1:8080/".to_string()));
+ assert!(env.contains(&format!(
+ "OPENSHELL_ENDPOINT=http://{GVPROXY_GATEWAY_IP}:8080/"
+ )));
assert!(env.contains(&"OPENSHELL_SANDBOX_ID=sandbox-123".to_string()));
assert!(env.contains(&format!(
"OPENSHELL_SSH_SOCKET_PATH={GUEST_SSH_SOCKET_PATH}"
)));
}
+ #[test]
+ fn guest_visible_openshell_endpoint_rewrites_loopback_hosts_to_gvproxy_gateway() {
+ assert_eq!(
+ guest_visible_openshell_endpoint("http://127.0.0.1:8080"),
+ format!("http://{GVPROXY_GATEWAY_IP}:8080/")
+ );
+ assert_eq!(
+ guest_visible_openshell_endpoint("http://localhost:8080"),
+ format!("http://{GVPROXY_GATEWAY_IP}:8080/")
+ );
+ assert_eq!(
+ guest_visible_openshell_endpoint("https://[::1]:8443"),
+ format!("https://{GVPROXY_GATEWAY_IP}:8443/")
+ );
+ }
+
#[test]
fn guest_visible_openshell_endpoint_preserves_non_loopback_hosts() {
+ assert_eq!(
+ guest_visible_openshell_endpoint(&format!(
+ "http://{OPENSHELL_HOST_GATEWAY_ALIAS}:8080"
+ )),
+ format!("http://{OPENSHELL_HOST_GATEWAY_ALIAS}:8080")
+ );
assert_eq!(
guest_visible_openshell_endpoint("http://host.containers.internal:8080"),
"http://host.containers.internal:8080"
);
+ assert_eq!(
+ guest_visible_openshell_endpoint(&format!("http://{GVPROXY_GATEWAY_IP}:8080")),
+ format!("http://{GVPROXY_GATEWAY_IP}:8080")
+ );
assert_eq!(
guest_visible_openshell_endpoint("https://gateway.internal:8443"),
"https://gateway.internal:8443"
@@ -1157,9 +1176,9 @@ mod tests {
fn validate_openshell_endpoint_accepts_host_gateway() {
validate_openshell_endpoint("http://host.containers.internal:8080")
.expect("guest-reachable host alias should be accepted");
- validate_openshell_endpoint("http://192.168.127.1:8080")
+ validate_openshell_endpoint(&format!("http://{GVPROXY_GATEWAY_IP}:8080"))
.expect("gateway IP should be accepted");
- validate_openshell_endpoint("http://host.openshell.internal:8080")
+ validate_openshell_endpoint(&format!("http://{OPENSHELL_HOST_GATEWAY_ALIAS}:8080"))
.expect("openshell host alias should be accepted");
validate_openshell_endpoint("https://gateway.internal:8443")
.expect("dns endpoint should be accepted");
@@ -1214,32 +1233,6 @@ mod tests {
let _ = std::fs::remove_dir_all(base);
}
- #[tokio::test]
- async fn guest_ssh_ready_detects_guest_console_marker() {
- let base = unique_temp_dir();
- std::fs::create_dir_all(&base).unwrap();
- std::fs::write(
- base.join("rootfs-console.log"),
- "...\nINFO openshell_sandbox: SSH server is ready to accept connections\n",
- )
- .unwrap();
-
- assert!(guest_ssh_ready(&base).await);
-
- let _ = std::fs::remove_dir_all(base);
- }
-
- #[tokio::test]
- async fn guest_ssh_ready_is_false_without_marker() {
- let base = unique_temp_dir();
- std::fs::create_dir_all(&base).unwrap();
- std::fs::write(base.join("rootfs-console.log"), "sandbox booting\n").unwrap();
-
- assert!(!guest_ssh_ready(&base).await);
-
- let _ = std::fs::remove_dir_all(base);
- }
-
fn unique_temp_dir() -> PathBuf {
static COUNTER: AtomicU64 = AtomicU64::new(0);
let nanos = SystemTime::now()
diff --git a/crates/openshell-driver-vm/src/ffi.rs b/crates/openshell-driver-vm/src/ffi.rs
index 750788ac1..a81b150af 100644
--- a/crates/openshell-driver-vm/src/ffi.rs
+++ b/crates/openshell-driver-vm/src/ffi.rs
@@ -37,7 +37,6 @@ type KrunSetExec = unsafe extern "C" fn(
argv: *const *const c_char,
envp: *const *const c_char,
) -> i32;
-type KrunSetPortMap = unsafe extern "C" fn(ctx_id: u32, port_map: *const *const c_char) -> i32;
type KrunSetConsoleOutput = unsafe extern "C" fn(ctx_id: u32, filepath: *const c_char) -> i32;
type KrunStartEnter = unsafe extern "C" fn(ctx_id: u32) -> i32;
type KrunDisableImplicitVsock = unsafe extern "C" fn(ctx_id: u32) -> i32;
@@ -68,7 +67,6 @@ pub struct LibKrun {
pub krun_set_root: KrunSetRoot,
pub krun_set_workdir: KrunSetWorkdir,
pub krun_set_exec: KrunSetExec,
- pub krun_set_port_map: KrunSetPortMap,
pub krun_set_console_output: KrunSetConsoleOutput,
pub krun_start_enter: KrunStartEnter,
pub krun_disable_implicit_vsock: KrunDisableImplicitVsock,
@@ -121,7 +119,6 @@ impl LibKrun {
krun_set_root: load_symbol(library, b"krun_set_root\0", &libkrun_path)?,
krun_set_workdir: load_symbol(library, b"krun_set_workdir\0", &libkrun_path)?,
krun_set_exec: load_symbol(library, b"krun_set_exec\0", &libkrun_path)?,
- krun_set_port_map: load_symbol(library, b"krun_set_port_map\0", &libkrun_path)?,
krun_set_console_output: load_symbol(
library,
b"krun_set_console_output\0",
diff --git a/crates/openshell-driver-vm/src/lib.rs b/crates/openshell-driver-vm/src/lib.rs
index 1c424deeb..772db47b3 100644
--- a/crates/openshell-driver-vm/src/lib.rs
+++ b/crates/openshell-driver-vm/src/lib.rs
@@ -4,10 +4,9 @@
pub mod driver;
mod embedded_runtime;
mod ffi;
+pub mod procguard;
mod rootfs;
mod runtime;
-pub const GUEST_SSH_PORT: u16 = 2222;
-
pub use driver::{VmDriver, VmDriverConfig};
pub use runtime::{VM_RUNTIME_DIR_ENV, VmLaunchConfig, configured_runtime_dir, run_vm};
diff --git a/crates/openshell-driver-vm/src/main.rs b/crates/openshell-driver-vm/src/main.rs
index 3a7976273..5a675e78a 100644
--- a/crates/openshell-driver-vm/src/main.rs
+++ b/crates/openshell-driver-vm/src/main.rs
@@ -6,7 +6,8 @@ use miette::{IntoDiagnostic, Result};
use openshell_core::VERSION;
use openshell_core::proto::compute::v1::compute_driver_server::ComputeDriverServer;
use openshell_driver_vm::{
- VM_RUNTIME_DIR_ENV, VmDriver, VmDriverConfig, VmLaunchConfig, configured_runtime_dir, run_vm,
+ VM_RUNTIME_DIR_ENV, VmDriver, VmDriverConfig, VmLaunchConfig, configured_runtime_dir,
+ procguard, run_vm,
};
use std::net::SocketAddr;
use std::path::PathBuf;
@@ -34,9 +35,6 @@ struct Args {
#[arg(long, hide = true)]
vm_env: Vec<String>,
- #[arg(long, hide = true)]
vm_port: Vec<String>,
-
#[arg(long, hide = true)]
vm_console_output: Option<PathBuf>,
@@ -101,6 +99,14 @@ struct Args {
async fn main() -> Result<()> {
let args = Args::parse();
if args.internal_run_vm {
+ // We intentionally defer procguard arming until `run_vm()` so
+ // that the only arm is the one that knows how to clean up
+ // gvproxy. Racing two watchers against the same parent-death
+ // event causes the bare arm's `exit(1)` to win, skipping the
+ // gvproxy cleanup and leaking the helper. The risk window
+ // before `run_vm` arms procguard is ~a few syscalls long
+ // (`build_vm_launch_config`, `configured_runtime_dir`), which
+ // is negligible next to the parent gRPC server's uptime.
maybe_reexec_internal_vm_with_runtime_env()?;
let config = build_vm_launch_config(&args).map_err(|err| miette::miette!("{err}"))?;
run_vm(&config).map_err(|err| miette::miette!("{err}"))?;
@@ -113,6 +119,18 @@ async fn main() -> Result<()> {
)
.init();
+ // Arm procguard so that if the gateway is killed (SIGKILL or crash)
+ // we also die. Without this the driver is reparented to init and
+ // keeps its per-sandbox VM launchers alive forever. Launchers have
+ // their own procguards (armed in `run_vm`) which cascade cleanup of
+ // gvproxy and the libkrun worker the moment this driver exits.
+ if let Err(err) = procguard::die_with_parent() {
+ tracing::warn!(
+ error = %err,
+ "procguard arm failed; gateway crashes may orphan this driver"
+ );
+ }
+
let driver = VmDriver::new(VmDriverConfig {
openshell_endpoint: args
.openshell_endpoint
@@ -183,7 +201,6 @@ fn build_vm_launch_config(args: &Args) -> std::result::Result<VmLaunchConfig, String> {
@@ fn maybe_reexec_internal_vm_with_runtime_env() -> Result<()> {
+ use std::os::unix::process::CommandExt as _;
+
const REEXEC_ENV: &str = "__OPENSHELL_DRIVER_VM_REEXEC";
if std::env::var_os(REEXEC_ENV).is_some() {
@@ -213,14 +232,23 @@ fn maybe_reexec_internal_vm_with_runtime_env() -> Result<()> {
.map_err(|err| miette::miette!("join DYLD_LIBRARY_PATH: {err}"))?;
let exe = std::env::current_exe().into_diagnostic()?;
let args: Vec<String> = std::env::args().skip(1).collect();
- let status = std::process::Command::new(exe)
+
+ // Use execvp() so the current process is *replaced* by the re-exec'd
+ // binary — no wrapper process sits between the compute driver and
+ // the actually-running VM launcher. That avoids two problems:
+ // 1. An extra process level that survives SIGKILL of the driver
+ // (the wrapper was reparenting the re-exec'd child to init).
+ // 2. Signal forwarding: with a wrapper, a SIGTERM to the wrapper
+ // doesn't reach the child unless we hand-roll forwarding.
+ // After exec, the child inherits our PID and our procguard arming.
+ let err = std::process::Command::new(exe)
.args(&args)
.env("DYLD_LIBRARY_PATH", &joined)
.env(VM_RUNTIME_DIR_ENV, runtime_dir)
.env(REEXEC_ENV, "1")
- .status()
- .into_diagnostic()?;
- std::process::exit(status.code().unwrap_or(1));
+ .exec();
+ // `exec()` only returns on failure.
+ Err(miette::miette!("failed to re-exec with runtime env: {err}"))
}
#[cfg(not(target_os = "macos"))]
diff --git a/crates/openshell-driver-vm/src/procguard.rs b/crates/openshell-driver-vm/src/procguard.rs
new file mode 100644
index 000000000..1d91880f7
--- /dev/null
+++ b/crates/openshell-driver-vm/src/procguard.rs
@@ -0,0 +1,196 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Cross-platform "die when my parent dies" primitive.
+//!
+//! The VM driver spawns a chain of subprocesses (compute driver → `--internal-run-vm`
+//! launcher → gvproxy + libkrun fork). If any link in that chain is killed
+//! with SIGKILL — or simply crashes — the children are reparented to init
+//! and survive indefinitely, leaking libkrun workers and gvproxy
+//! instances.
+//!
+//! This module exposes two functions:
+//! * [`die_with_parent`] — configure the kernel (Linux) or a helper
+//! thread (BSDs, incl. macOS) to SIGKILL the current process when its
+//! parent dies. Call it from `main` in every subprocess we spawn
+//! along the chain. Idempotent-ish (each call is a full setup — see
+//! the runtime.rs comment at the single call site).
+//! * [`die_with_parent_cleanup`] — same as above, but on the BSD path a
+//! best-effort cleanup callback runs *before* this process exits.
+//! This matters when we own a non-Rust child (e.g. gvproxy) that
+//! cannot arm its own procguard; the callback lets us SIGTERM it
+//! first.
+//!
+//! The Linux path uses `nix::sys::prctl::set_pdeathsig(SIGKILL)`, and
+//! the BSD path uses `smol-rs/polling` with its `kqueue::Process` +
+//! `ProcessOps::Exit` filter. Both are well-tested library surfaces;
+//! we keep only the glue code and the pre-arming parent-liveness
+//! re-check.
+
+/// Arrange for the current process to receive SIGKILL if its parent dies.
+///
+/// On Linux this sets `PR_SET_PDEATHSIG` to SIGKILL (via
+/// `nix::sys::prctl`). The kernel delivers SIGKILL the moment
+/// `getppid()` changes away from the original parent.
+///
+/// On the BSD family (macOS, FreeBSD, etc.) this spawns a detached
+/// helper thread that uses `kqueue` with `EVFILT_PROC | NOTE_EXIT` on
+/// the parent PID. When the parent exits the thread calls `exit(1)`,
+/// which is sufficient for our use case — we are not a critical daemon
+/// that needs to drain state; we are a VM launcher / gRPC driver whose
+/// entire job is tied to the parent's lifetime.
+pub fn die_with_parent() -> Result<(), String> {
+ die_with_parent_cleanup(|| ())
+}
+
+/// Like [`die_with_parent`], but run `cleanup` (best-effort,
+/// async-signal-unsafe — it runs on the helper thread) immediately
+/// before terminating the process. Use this when we own children that
+/// cannot arm their own procguard; the cleanup hook is the only chance
+/// we get to send them SIGTERM after the kernel reparents us.
+///
+/// On Linux the cleanup is a no-op: `PR_SET_PDEATHSIG` delivers SIGKILL
+/// directly to us, there is no Rust-controlled moment between "parent
+/// died" and "we die" in which we could run a callback.
+pub fn die_with_parent_cleanup<F>(cleanup: F) -> Result<(), String>
+where
+ F: FnOnce() + Send + 'static,
+{
+ #[cfg(target_os = "linux")]
+ {
+ // Linux has no opportunity for a cleanup hook — the kernel
+ // delivers SIGKILL directly. Callers that need pre-exit cleanup
+ // must combine this with a `pre_exec` PR_SET_PDEATHSIG on their
+ // children (so the kernel cascades) or rely on process-group
+ // killpg from a signal handler in the parent.
+ let _ = cleanup; // intentionally dropped
+ install_linux_pdeathsig()
+ }
+
+ #[cfg(any(
+ target_os = "macos",
+ target_os = "ios",
+ target_os = "freebsd",
+ target_os = "netbsd",
+ target_os = "openbsd",
+ target_os = "dragonfly",
+ ))]
+ {
+ install_bsd_kqueue_watcher(cleanup)
+ }
+
+ #[cfg(not(any(
+ target_os = "linux",
+ target_os = "macos",
+ target_os = "ios",
+ target_os = "freebsd",
+ target_os = "netbsd",
+ target_os = "openbsd",
+ target_os = "dragonfly",
+ )))]
+ {
+ let _ = cleanup;
+ Ok(())
+ }
+}
+
+#[cfg(target_os = "linux")]
+fn install_linux_pdeathsig() -> Result<(), String> {
+ use nix::sys::signal::Signal;
+ use nix::unistd::getppid;
+
+ // Race: if the parent already died between fork/exec and this call,
+ // `getppid()` now returns 1 and PR_SET_PDEATHSIG will never fire.
+ // Read the current parent first so we can detect that case and exit.
+ let original_ppid = getppid();
+ if original_ppid == nix::unistd::Pid::from_raw(1) {
+ return Err("process was already orphaned before procguard armed".to_string());
+ }
+
+ nix::sys::prctl::set_pdeathsig(Signal::SIGKILL)
+ .map_err(|err| format!("prctl(PR_SET_PDEATHSIG) failed: {err}"))?;
+
+ // Re-check after arming: the parent may have died between getppid()
+ // and prctl(). If so, PR_SET_PDEATHSIG missed its window.
+ if getppid() != original_ppid {
+ return Err("parent exited before procguard could arm".to_string());
+ }
+
+ Ok(())
+}
+
+#[cfg(any(
+ target_os = "macos",
+ target_os = "ios",
+ target_os = "freebsd",
+ target_os = "netbsd",
+ target_os = "openbsd",
+ target_os = "dragonfly",
+))]
+fn install_bsd_kqueue_watcher<F>(cleanup: F) -> Result<(), String>
+where
+ F: FnOnce() + Send + 'static,
+{
+ use nix::unistd::getppid;
+ use polling::os::kqueue::{PollerKqueueExt, Process, ProcessOps};
+ use polling::{Events, PollMode, Poller};
+
+ let parent_pid = getppid();
+ if parent_pid == nix::unistd::Pid::from_raw(1) {
+ return Err("process was already orphaned before procguard armed".to_string());
+ }
+ let parent_pid_nz = std::num::NonZeroI32::new(parent_pid.as_raw())
+ .ok_or_else(|| "getppid returned 0 unexpectedly".to_string())?;
+
+ // Build the poller on the caller's thread so any setup error
+ // surfaces synchronously. `EVFILT_PROC | NOTE_EXIT` is a one-shot
+ // filter, so `PollMode::Oneshot` matches the kernel semantics.
+ //
+ // SAFETY: `Process::from_pid` requires the PID to "be tied to an
+ // actual child process". Our parent is alive at this point — we
+ // re-check `getppid()` immediately after registration to close the
+ // race where the parent dies between the read above and the
+ // `add_filter` call. The BSD kqueue implementation accepts any
+ // live PID, not just our own children; the "child" wording in the
+ // polling docs is carried over from historical terminology in the
+ // kqueue(2) manpage. The kernel guarantees NOTE_EXIT fires if the
+ // PID is valid at registration.
+ let poller = Poller::new().map_err(|err| format!("polling: Poller::new failed: {err}"))?;
+ let key = 1;
+ #[allow(unsafe_code)]
+ // SAFETY requirement is documented on the enclosing function: the
+ // PID was just read from `getppid()` and re-checked below, so it
+ // points at a live process. `Process::from_pid` is an
+ // entry-in-the-kernel-table registration — the kernel validates
+ // the PID when the filter is added.
+ let filter = unsafe { Process::from_pid(parent_pid_nz, ProcessOps::Exit) };
+ poller
+ .add_filter(filter, key, PollMode::Oneshot)
+ .map_err(|err| format!("polling: add_filter(NOTE_EXIT, {parent_pid_nz}) failed: {err}"))?;
+
+ // Between getppid() and the registered filter the parent may
+ // already have died. Detect that and abort so the caller can bail.
+ if getppid() != parent_pid {
+ return Err("parent exited before procguard could arm".to_string());
+ }
+
+ // Hand off to a dedicated OS thread. Block in `poller.wait()`
+ // until the single NOTE_EXIT event fires, run the cleanup, then
+ // exit. We prefer `exit(1)` over `kill(getpid, SIGKILL)` so the
+ // callback gets to complete — SIGKILL would race it. Our children
+ // have their own procguards armed and will notice `getppid() ==
+ // 1` shortly after, so we do not need Linux-semantics exactness.
+ std::thread::Builder::new()
+ .name("procguard".to_string())
+ .spawn(move || {
+ let mut events = Events::new();
+ // Block indefinitely; the filter is Oneshot so we expect
+ // exactly one event (parent's NOTE_EXIT) or a spurious
+ // wakeup we treat the same way.
+ let _ = poller.wait(&mut events, None);
+ let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(cleanup));
+ std::process::exit(1);
+ })
+ .map(|_| ())
+ .map_err(|e| format!("failed to spawn procguard thread: {e}"))
+}
diff --git a/crates/openshell-driver-vm/src/runtime.rs b/crates/openshell-driver-vm/src/runtime.rs
index 9888feb18..e20c7d4e5 100644
--- a/crates/openshell-driver-vm/src/runtime.rs
+++ b/crates/openshell-driver-vm/src/runtime.rs
@@ -4,20 +4,25 @@
#![allow(unsafe_code)]
use std::ffi::CString;
-use std::io::{Read, Write};
-use std::os::unix::net::UnixStream;
use std::path::{Path, PathBuf};
use std::process::{Child as StdChild, Command as StdCommand, Stdio};
use std::ptr;
use std::sync::atomic::{AtomicI32, Ordering};
use std::time::{Duration, Instant};
-use crate::{GUEST_SSH_PORT, embedded_runtime, ffi};
+use crate::{embedded_runtime, ffi, procguard};
pub const VM_RUNTIME_DIR_ENV: &str = "OPENSHELL_VM_RUNTIME_DIR";
+/// PID of the forked libkrun worker (the VM's PID 1). Zero when not running.
+/// Used by the SIGTERM/SIGINT handler to forward signals to the VM.
static CHILD_PID: AtomicI32 = AtomicI32::new(0);
+/// PID of the gvproxy helper process. Zero when not running. Used by the
+/// SIGTERM/SIGINT handler to make sure gvproxy doesn't survive the
+/// launcher on macOS (where we can't use `PR_SET_PDEATHSIG`).
+static GVPROXY_PID: AtomicI32 = AtomicI32::new(0);
+
pub struct VmLaunchConfig {
pub rootfs: PathBuf,
pub vcpus: u8,
@@ -26,23 +31,10 @@ pub struct VmLaunchConfig {
pub args: Vec<String>,
pub env: Vec<String>,
pub workdir: String,
- pub port_map: Vec<String>,
pub log_level: u32,
pub console_output: PathBuf,
}
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-struct PortMapping {
- host_port: u16,
- guest_port: u16,
-}
-
-#[derive(Debug, Clone, PartialEq, Eq)]
-struct GvproxyPortPlan {
- ssh_port: u16,
- forwarded_ports: Vec,
-}
-
pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> {
if !config.rootfs.is_dir() {
return Err(format!(
@@ -51,6 +43,47 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> {
));
}
+ // Arm procguard first, BEFORE we spawn gvproxy or fork libkrun, so
+ // that the launcher can't be orphaned during setup. The cleanup
+ // callback reads the GVPROXY_PID atomic (initially 0 — no-op) and
+ // the CHILD_PID atomic (the libkrun fork), so it stays correct as
+ // those slots get populated later in this function. Only ONE arm
+ // per process: racing two watchers for the same NOTE_EXIT event
+ // would cause whichever wins to skip the cleanup.
+ if let Err(err) = procguard::die_with_parent_cleanup(|| {
+ // Cleanup order: SIGTERM gvproxy and the libkrun fork first so
+ // they can drain cleanly, then SIGKILL after a brief grace
+ // window. We can't rely on Rust destructors here; when
+ // procguard's watcher thread returns we call `std::process::exit`
+ // and the process tears down. Only async-signal-safe calls here:
+ // atomic loads and `kill(2)` are both on the POSIX list.
+ let gv_pid = GVPROXY_PID.load(Ordering::Relaxed);
+ let child_pid = CHILD_PID.load(Ordering::Relaxed);
+ if gv_pid > 0 {
+ unsafe {
+ libc::kill(gv_pid, libc::SIGTERM);
+ }
+ }
+ if child_pid > 0 {
+ unsafe {
+ libc::kill(child_pid, libc::SIGTERM);
+ }
+ }
+ std::thread::sleep(Duration::from_millis(200));
+ if gv_pid > 0 {
+ unsafe {
+ libc::kill(gv_pid, libc::SIGKILL);
+ }
+ }
+ if child_pid > 0 {
+ unsafe {
+ libc::kill(child_pid, libc::SIGKILL);
+ }
+ }
+ }) {
+ return Err(format!("procguard arm failed: {err}"));
+ }
+
#[cfg(target_os = "linux")]
check_kvm_access()?;
@@ -64,10 +97,40 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> {
vm.set_root(&config.rootfs)?;
vm.set_workdir(&config.workdir)?;
- let mut forwarded_port_map = config.port_map.clone();
- let mut gvproxy_guard = None;
- let mut gvproxy_api_sock = None;
- if !config.port_map.is_empty() {
+ // Run gvproxy strictly as the guest's virtual NIC / DHCP / router.
+ //
+ // After the supervisor-initiated relay migration (#867), the driver
+ // no longer forwards any host-side ports into the guest — all ingress
+ // traffic for SSH and exec rides the outbound `ConnectSupervisor`
+ // gRPC stream the guest opens to the gateway. What gvproxy still
+ // provides here is the TCP/IP *plane* the guest kernel needs:
+ //
+ // * a virtio-net backend attached to libkrun via a Unix
+ // SOCK_STREAM (Linux) or SOCK_DGRAM (macOS vfkit), which
+ // surfaces as `eth0` inside the guest;
+ // * the DHCP server + default router the guest's udhcpc client
+ // talks to on boot (IPs 192.168.127.1 / .2, defaults for
+ // gvisor-tap-vsock);
+ // * the host-facing gateway identity the guest uses for callbacks:
+ // the init script seeds `/etc/hosts` with
+ // `host.openshell.internal` pointing at 192.168.127.1 while
+ // leaving gvproxy's legacy `host.containers.internal` /
+ // `host.docker.internal` DNS answers intact, which is how the guest's
+ // `rewrite_openshell_endpoint_if_needed` probe reaches the host
+ // gateway when the bare loopback address doesn't resolve from
+ // inside the VM.
+ //
+ // That network plane is also what the sandbox supervisor's
+ // per-sandbox netns (veth pair + iptables, see
+ // `openshell-sandbox/src/sandbox/linux/netns.rs`) branches off of;
+ // libkrun's built-in TSI socket impersonation would not satisfy
+ // those kernel-level primitives.
+ //
+ // The `-listen` API socket and `-ssh-port` forwarder are both
+ // deliberately omitted: nothing in the driver enqueues port
+ // forwards on the API any more, and the host-side SSH listener is
+ // dead plumbing.
+ let gvproxy_guard = {
let gvproxy_binary = runtime_dir.join("gvproxy");
if !gvproxy_binary.is_file() {
return Err(format!(
@@ -76,13 +139,9 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> {
));
}
- kill_stale_gvproxy_by_port_map(&config.port_map);
-
let sock_base = gvproxy_socket_base(&config.rootfs)?;
let net_sock = sock_base.with_extension("v");
- let api_sock = sock_base.with_extension("a");
let _ = std::fs::remove_file(&net_sock);
- let _ = std::fs::remove_file(&api_sock);
let _ = std::fs::remove_file(sock_base.with_extension("v-krun.sock"));
let run_dir = config.rootfs.parent().unwrap_or(&config.rootfs);
@@ -90,9 +149,6 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> {
let gvproxy_log_file = std::fs::File::create(&gvproxy_log)
.map_err(|e| format!("create gvproxy log {}: {e}", gvproxy_log.display()))?;
- let gvproxy_ports = plan_gvproxy_ports(&config.port_map)?;
- forwarded_port_map = gvproxy_ports.forwarded_ports;
-
#[cfg(target_os = "linux")]
let (gvproxy_net_flag, gvproxy_net_url) =
("-listen-qemu", format!("unix://{}", net_sock.display()));
@@ -102,18 +158,51 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> {
format!("unixgram://{}", net_sock.display()),
);
- let child = StdCommand::new(&gvproxy_binary)
+ // `-ssh-port -1` tells gvproxy to skip its default SSH forward
+ // (127.0.0.1:2222 → guest:22). We don't use it — all gateway
+ // ingress rides the supervisor-initiated relay — and leaving
+ // the default on would bind a host-side TCP listener per
+ // sandbox, racing concurrent sandboxes for port 2222 and
+ // surfacing a misleading "sshd is reachable" endpoint. See
+ // https://github.com/containers/gvisor-tap-vsock `cmd/gvproxy/main.go`
+ // (`getForwardsMap` returns an empty map when `sshPort == -1`).
+ let mut gvproxy_cmd = StdCommand::new(&gvproxy_binary);
+ gvproxy_cmd
.arg(gvproxy_net_flag)
.arg(&gvproxy_net_url)
- .arg("-listen")
- .arg(format!("unix://{}", api_sock.display()))
.arg("-ssh-port")
- .arg(gvproxy_ports.ssh_port.to_string())
+ .arg("-1")
.stdin(Stdio::null())
.stdout(Stdio::null())
- .stderr(gvproxy_log_file)
+ .stderr(gvproxy_log_file);
+
+ // On Linux the kernel will SIGKILL gvproxy the moment this
+ // launcher dies (or is SIGKILLed). `pre_exec` runs in the child
+ // between fork and execve, so the PR_SET_PDEATHSIG flag is
+ // inherited across execve and applies to gvproxy proper. On
+ // macOS/BSDs there is no equivalent; we fall back to killing
+ // gvproxy explicitly from the launcher's procguard cleanup
+ // callback (see `run_vm` above) and SIGTERM handler
+ // (see `install_signal_forwarding` below).
+ #[cfg(target_os = "linux")]
+ {
+ use nix::sys::signal::Signal;
+ use std::os::unix::process::CommandExt as _;
+ unsafe {
+ gvproxy_cmd.pre_exec(|| {
+ nix::sys::prctl::set_pdeathsig(Signal::SIGKILL)
+ .map_err(|err| std::io::Error::other(format!("pdeathsig: {err}")))
+ });
+ }
+ }
+
+ let child = gvproxy_cmd
.spawn()
.map_err(|e| format!("failed to start gvproxy {}: {e}", gvproxy_binary.display()))?;
+ // The procguard cleanup reads GVPROXY_PID atomically. Storing it
+ // here makes the callback able to SIGTERM gvproxy if the driver
+ // dies from this moment onward.
+ GVPROXY_PID.store(child.id() as i32, Ordering::Relaxed);
wait_for_path(&net_sock, Duration::from_secs(5), "gvproxy data socket")?;
@@ -142,13 +231,9 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> {
vm.add_net_unixgram(&net_sock, &mac, COMPAT_NET_FEATURES, NET_FLAG_VFKIT)?;
}
- gvproxy_guard = Some(GvproxyGuard::new(child));
- gvproxy_api_sock = Some(api_sock);
- }
+ Some(GvproxyGuard::new(child))
+ };
- if !config.port_map.is_empty() && gvproxy_api_sock.is_none() {
- vm.set_port_map(&config.port_map)?;
- }
vm.set_console_output(&config.console_output)?;
let env = if config.env.is_empty() {
@@ -166,6 +251,20 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> {
match pid {
-1 => Err(format!("fork failed: {}", std::io::Error::last_os_error())),
0 => {
+ // We are the libkrun worker (the VM's PID 1 inside the guest
+ // kernel, but a normal host process until krun_start_enter
+ // fires). Arm procguard so this fork is SIGKILLed if the
+ // parent launcher dies abruptly. On Linux this uses
+ // `PR_SET_PDEATHSIG`; on macOS this spawns a kqueue
+ // NOTE_EXIT watcher thread. Either way it closes the same
+ // leak gvproxy does above.
+ //
+ // We also SIGKILL ourselves if arming fails — there's no
+ // safe way to continue if we can't guarantee cleanup.
+ if let Err(err) = procguard::die_with_parent() {
+ eprintln!("libkrun worker: procguard arm failed: {err}");
+ std::process::exit(1);
+ }
let ret = vm.start_enter();
eprintln!("krun_start_enter failed: {ret}");
std::process::exit(1);
@@ -173,24 +272,10 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> {
_ => {
install_signal_forwarding(pid);
- let port_forward_result = if let Some(api_sock) = gvproxy_api_sock.as_ref() {
- expose_port_map(api_sock, &forwarded_port_map)
- } else {
- Ok(())
- };
-
- if let Err(err) = port_forward_result {
- unsafe {
- libc::kill(pid, libc::SIGTERM);
- }
- let _ = wait_for_child(pid);
- cleanup_gvproxy(gvproxy_guard);
- return Err(err);
- }
-
let status = wait_for_child(pid)?;
CHILD_PID.store(0, Ordering::Relaxed);
cleanup_gvproxy(gvproxy_guard);
+ GVPROXY_PID.store(0, Ordering::Relaxed);
if libc::WIFEXITED(status) {
match libc::WEXITSTATUS(status) {
@@ -399,15 +484,6 @@ impl VmContext {
)
}
- fn set_port_map(&self, port_map: &[String]) -> Result<(), String> {
- let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect();
- let (_owners, ptrs) = c_string_array(&port_strs)?;
- check(
- unsafe { (self.krun.krun_set_port_map)(self.ctx_id, ptrs.as_ptr()) },
- "krun_set_port_map",
- )
- }
-
fn set_console_output(&self, path: &Path) -> Result<(), String> {
let console_c = path_to_cstring(path)?;
check(
@@ -476,126 +552,6 @@ impl Drop for GvproxyGuard {
}
}
-fn expose_port_map(api_sock: &Path, port_map: &[String]) -> Result<(), String> {
- wait_for_path(api_sock, Duration::from_secs(2), "gvproxy API socket")?;
- let guest_ip = "192.168.127.2";
-
- for pm in port_map {
- let mapping = parse_port_mapping(pm)?;
-
- let expose_body = format!(
- r#"{{"local":":{}","remote":"{guest_ip}:{}","protocol":"tcp"}}"#,
- mapping.host_port, mapping.guest_port
- );
-
- let deadline = Instant::now() + Duration::from_secs(10);
- let mut retry_interval = Duration::from_millis(100);
- loop {
- match gvproxy_expose(api_sock, &expose_body) {
- Ok(()) => break,
- Err(err) if Instant::now() < deadline => {
- std::thread::sleep(retry_interval);
- retry_interval = (retry_interval * 2).min(Duration::from_secs(1));
- if retry_interval == Duration::from_secs(1) {
- eprintln!("retrying gvproxy port expose {pm}: {err}");
- }
- }
- Err(err) => {
- return Err(format!(
- "failed to forward port {} via gvproxy: {err}",
- mapping.host_port
- ));
- }
- }
- }
- }
-
- Ok(())
-}
-
-fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> {
- let mut stream =
- UnixStream::connect(api_sock).map_err(|e| format!("connect to gvproxy API socket: {e}"))?;
-
- let request = format!(
- "POST /services/forwarder/expose HTTP/1.1\r\n\
- Host: localhost\r\n\
- Content-Type: application/json\r\n\
- Content-Length: {}\r\n\
- Connection: close\r\n\
- \r\n\
- {}",
- body.len(),
- body,
- );
-
- stream
- .write_all(request.as_bytes())
- .map_err(|e| format!("write to gvproxy API: {e}"))?;
-
- let mut buf = [0u8; 1024];
- let n = stream
- .read(&mut buf)
- .map_err(|e| format!("read from gvproxy API: {e}"))?;
- let response = String::from_utf8_lossy(&buf[..n]);
- let status = response
- .lines()
- .next()
- .and_then(|line| line.split_whitespace().nth(1))
- .unwrap_or("0");
-
- match status {
- "200" | "204" => Ok(()),
- _ => Err(format!(
- "gvproxy API: {}",
- response.lines().next().unwrap_or("")
- )),
- }
-}
-
-fn plan_gvproxy_ports(port_map: &[String]) -> Result {
- let mut ssh_port = None;
- let mut forwarded_ports = Vec::with_capacity(port_map.len());
-
- for pm in port_map {
- let mapping = parse_port_mapping(pm)?;
- if ssh_port.is_none() && mapping.guest_port == GUEST_SSH_PORT && mapping.host_port >= 1024 {
- ssh_port = Some(mapping.host_port);
- continue;
- }
- forwarded_ports.push(pm.clone());
- }
-
- Ok(GvproxyPortPlan {
- ssh_port: match ssh_port {
- Some(port) => port,
- None => pick_gvproxy_ssh_port()?,
- },
- forwarded_ports,
- })
-}
-
-fn parse_port_mapping(pm: &str) -> Result {
- let parts: Vec<&str> = pm.split(':').collect();
- let (host, guest) = match parts.as_slice() {
- [host, guest] => (*host, *guest),
- [port] => (*port, *port),
- _ => return Err(format!("invalid port mapping '{pm}'")),
- };
-
- let host_port = host
- .parse::()
- .map_err(|_| format!("invalid port mapping '{pm}'"))?;
- let guest_port = guest
- .parse::()
- .map_err(|_| format!("invalid port mapping '{pm}'"))?;
-
- Ok(PortMapping {
- host_port,
- guest_port,
- })
-}
-
fn wait_for_path(path: &Path, timeout: Duration, label: &str) -> Result<(), String> {
let deadline = Instant::now() + timeout;
let mut interval = Duration::from_millis(5);
@@ -674,92 +630,6 @@ fn gvproxy_socket_base(rootfs: &Path) -> Result {
Ok(secure_socket_base("osd-gv")?.join(hash_path_id(rootfs)))
}
-fn pick_gvproxy_ssh_port() -> Result {
- let listener = std::net::TcpListener::bind(("127.0.0.1", 0))
- .map_err(|e| format!("allocate gvproxy ssh port on localhost: {e}"))?;
- let port = listener
- .local_addr()
- .map_err(|e| format!("read gvproxy ssh port: {e}"))?
- .port();
- drop(listener);
- Ok(port)
-}
-
-fn kill_stale_gvproxy_by_port_map(port_map: &[String]) {
- for pm in port_map {
- if let Some(host_port) = pm
- .split(':')
- .next()
- .and_then(|port| port.parse::().ok())
- {
- kill_stale_gvproxy_by_port(host_port);
- }
- }
-}
-
-fn kill_stale_gvproxy_by_port(port: u16) {
- let output = StdCommand::new("lsof")
- .args(["-ti", &format!(":{port}")])
- .output();
-
- let pids = match output {
- Ok(output) if output.status.success() => {
- String::from_utf8_lossy(&output.stdout).to_string()
- }
- _ => return,
- };
-
- for line in pids.lines() {
- if let Ok(pid) = line.trim().parse::()
- && is_process_named(pid as libc::pid_t, "gvproxy")
- {
- kill_gvproxy_pid(pid);
- }
- }
-}
-
-fn kill_gvproxy_pid(pid: u32) {
- let pid = pid as libc::pid_t;
- if unsafe { libc::kill(pid, 0) } != 0 {
- return;
- }
- if !is_process_named(pid, "gvproxy") {
- return;
- }
- unsafe {
- libc::kill(pid, libc::SIGTERM);
- }
- std::thread::sleep(Duration::from_millis(200));
-}
-
-#[cfg(target_os = "macos")]
-fn is_process_named(pid: libc::pid_t, expected: &str) -> bool {
- StdCommand::new("ps")
- .args(["-p", &pid.to_string(), "-o", "comm="])
- .output()
- .ok()
- .and_then(|output| {
- if output.status.success() {
- String::from_utf8(output.stdout).ok()
- } else {
- None
- }
- })
- .is_some_and(|name| name.trim().contains(expected))
-}
-
-#[cfg(target_os = "linux")]
-fn is_process_named(pid: libc::pid_t, expected: &str) -> bool {
- std::fs::read_to_string(format!("/proc/{pid}/comm"))
- .map(|name| name.trim().contains(expected))
- .unwrap_or(false)
-}
-
-#[cfg(not(any(target_os = "macos", target_os = "linux")))]
-fn is_process_named(_pid: libc::pid_t, _expected: &str) -> bool {
- false
-}
-
fn install_signal_forwarding(pid: i32) {
unsafe {
libc::signal(
@@ -774,11 +644,28 @@ fn install_signal_forwarding(pid: i32) {
CHILD_PID.store(pid, Ordering::Relaxed);
}
+/// Async-signal-safe handler that forwards SIGTERM to every process we
+/// own: the libkrun VM worker and the gvproxy helper. We cannot rely on
+/// Rust destructors (`GvproxyGuard::drop`, `ManagedDriverProcess::drop`)
+/// running on signal-driven exit, so we explicitly deliver the signal
+/// here. The `wait_for_child` loop reaps libkrun and `cleanup_gvproxy`
+/// reaps gvproxy before `run_vm` returns.
+///
+/// Only async-signal-safe libc calls are used — `kill(2)` is listed in
+/// POSIX.1-2017 as async-signal-safe, atomic loads are lock-free on the
+/// platforms we target.
extern "C" fn forward_signal(_sig: libc::c_int) {
- let pid = CHILD_PID.load(Ordering::Relaxed);
- if pid > 0 {
+ let vm_pid = CHILD_PID.load(Ordering::Relaxed);
+ if vm_pid > 0 {
unsafe {
- libc::kill(pid, libc::SIGTERM);
+ libc::kill(vm_pid, libc::SIGTERM);
+ }
+ }
+ let gv_pid = GVPROXY_PID.load(Ordering::Relaxed);
+ if gv_pid > 0 {
+ // gvproxy handles SIGTERM cleanly; no need for SIGKILL.
+ unsafe {
+ libc::kill(gv_pid, libc::SIGTERM);
}
}
}
@@ -840,38 +727,3 @@ fn check_kvm_access() -> Result<(), String> {
format!("cannot open /dev/kvm: {e}\nKVM access is required to run microVMs on Linux.")
})
}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn plan_gvproxy_ports_reuses_sandbox_ssh_mapping() {
- let plan = plan_gvproxy_ports(&["64739:2222".to_string()]).expect("plan should succeed");
-
- assert_eq!(plan.ssh_port, 64739);
- assert!(plan.forwarded_ports.is_empty());
- }
-
- #[test]
- fn plan_gvproxy_ports_keeps_non_ssh_mappings_for_forwarder() {
- let plan = plan_gvproxy_ports(&["64739:8080".to_string()]).expect("plan should succeed");
-
- assert_ne!(plan.ssh_port, 64739);
- assert_eq!(plan.forwarded_ports, vec!["64739:8080".to_string()]);
- }
-
- #[test]
- fn plan_gvproxy_ports_ignores_privileged_host_ports_for_direct_ssh() {
- let plan = plan_gvproxy_ports(&["22:2222".to_string()]).expect("plan should succeed");
-
- assert_ne!(plan.ssh_port, 22);
- assert_eq!(plan.forwarded_ports, vec!["22:2222".to_string()]);
- }
-
- #[test]
- fn parse_port_mapping_rejects_invalid_entries() {
- let err = parse_port_mapping("bad:mapping").expect_err("invalid mapping should fail");
- assert!(err.contains("invalid port mapping"));
- }
-}
diff --git a/crates/openshell-driver-vm/start.sh b/crates/openshell-driver-vm/start.sh
index 155136c78..b5aebbefd 100755
--- a/crates/openshell-driver-vm/start.sh
+++ b/crates/openshell-driver-vm/start.sh
@@ -5,12 +5,26 @@
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+CLI_BIN="${ROOT}/scripts/bin/openshell"
COMPRESSED_DIR="${ROOT}/target/vm-runtime-compressed"
-STATE_DIR_DEFAULT="${ROOT}/target/openshell-vm-driver-dev"
+SERVER_PORT="${OPENSHELL_SERVER_PORT:-8080}"
+# Keep the driver socket path under AF_UNIX SUN_LEN on macOS.
+STATE_DIR_ROOT="${OPENSHELL_VM_DRIVER_STATE_ROOT:-/tmp}"
+STATE_LABEL_RAW="${OPENSHELL_VM_INSTANCE:-port-${SERVER_PORT}}"
+STATE_LABEL="$(printf '%s' "${STATE_LABEL_RAW}" | tr -cs '[:alnum:]._-' '-')"
+if [ -z "${STATE_LABEL}" ]; then
+ STATE_LABEL="port-${SERVER_PORT}"
+fi
+STATE_DIR_DEFAULT="${STATE_DIR_ROOT}/openshell-vm-driver-dev-${USER:-user}-${STATE_LABEL}"
STATE_DIR="${OPENSHELL_VM_DRIVER_STATE_DIR:-${STATE_DIR_DEFAULT}}"
DB_PATH_DEFAULT="${STATE_DIR}/openshell.db"
-SERVER_PORT="${OPENSHELL_SERVER_PORT:-8080}"
VM_HOST_GATEWAY_DEFAULT="${OPENSHELL_VM_HOST_GATEWAY:-host.containers.internal}"
+LOCAL_GATEWAY_ENDPOINT_DEFAULT="http://127.0.0.1:${SERVER_PORT}"
+LOCAL_GATEWAY_ENDPOINT="${OPENSHELL_VM_LOCAL_GATEWAY_ENDPOINT:-${LOCAL_GATEWAY_ENDPOINT_DEFAULT}}"
+GATEWAY_NAME_DEFAULT="vm-driver-${STATE_LABEL}"
+GATEWAY_NAME="${OPENSHELL_VM_GATEWAY_NAME:-${GATEWAY_NAME_DEFAULT}}"
+DRIVER_DIR_DEFAULT="${ROOT}/target/debug"
+DRIVER_DIR="${OPENSHELL_DRIVER_DIR:-${DRIVER_DIR_DEFAULT}}"
export OPENSHELL_VM_RUNTIME_COMPRESSED_DIR="${OPENSHELL_VM_RUNTIME_COMPRESSED_DIR:-${COMPRESSED_DIR}}"
@@ -52,11 +66,19 @@ fi
export OPENSHELL_DISABLE_TLS="$(normalize_bool "${OPENSHELL_DISABLE_TLS:-true}")"
export OPENSHELL_DB_URL="${OPENSHELL_DB_URL:-sqlite:${DB_PATH_DEFAULT}}"
export OPENSHELL_DRIVERS="${OPENSHELL_DRIVERS:-vm}"
+export OPENSHELL_DRIVER_DIR="${DRIVER_DIR}"
export OPENSHELL_GRPC_ENDPOINT="${OPENSHELL_GRPC_ENDPOINT:-http://${VM_HOST_GATEWAY_DEFAULT}:${SERVER_PORT}}"
export OPENSHELL_SSH_GATEWAY_HOST="${OPENSHELL_SSH_GATEWAY_HOST:-127.0.0.1}"
export OPENSHELL_SSH_GATEWAY_PORT="${OPENSHELL_SSH_GATEWAY_PORT:-${SERVER_PORT}}"
export OPENSHELL_SSH_HANDSHAKE_SECRET="${OPENSHELL_SSH_HANDSHAKE_SECRET:-dev-vm-driver-secret}"
export OPENSHELL_VM_DRIVER_STATE_DIR="${STATE_DIR}"
+echo "==> Gateway registration"
+echo " Name: ${GATEWAY_NAME}"
+echo " Endpoint: ${LOCAL_GATEWAY_ENDPOINT}"
+echo " Register: ${CLI_BIN} gateway add --name ${GATEWAY_NAME} ${LOCAL_GATEWAY_ENDPOINT}"
+echo " Select: ${CLI_BIN} gateway select ${GATEWAY_NAME}"
+echo " Driver: ${OPENSHELL_DRIVER_DIR}/openshell-driver-vm"
+
echo "==> Starting OpenShell server with VM compute driver"
exec "${ROOT}/target/debug/openshell-gateway"
diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs
index 95ffbfaa4..35c72f80c 100644
--- a/crates/openshell-server/src/compute/mod.rs
+++ b/crates/openshell-server/src/compute/mod.rs
@@ -11,6 +11,7 @@ use crate::grpc::policy::{SANDBOX_SETTINGS_OBJECT_TYPE, sandbox_settings_id};
use crate::persistence::{ObjectId, ObjectName, ObjectRecord, ObjectType, Store};
use crate::sandbox_index::SandboxIndex;
use crate::sandbox_watch::SandboxWatchBus;
+use crate::supervisor_session::SupervisorSessionRegistry;
use crate::tracing_bus::TracingLogBus;
use futures::{Stream, StreamExt};
use openshell_core::proto::compute::v1::{
@@ -188,6 +189,7 @@ pub struct ComputeRuntime {
sandbox_index: SandboxIndex,
sandbox_watch_bus: SandboxWatchBus,
tracing_log_bus: TracingLogBus,
+ supervisor_sessions: Arc,
sync_lock: Arc>,
}
@@ -205,6 +207,7 @@ impl ComputeRuntime {
sandbox_index: SandboxIndex,
sandbox_watch_bus: SandboxWatchBus,
tracing_log_bus: TracingLogBus,
+ supervisor_sessions: Arc,
) -> Result {
let default_image = driver
.get_capabilities(Request::new(GetCapabilitiesRequest {}))
@@ -220,6 +223,7 @@ impl ComputeRuntime {
sandbox_index,
sandbox_watch_bus,
tracing_log_bus,
+ supervisor_sessions,
sync_lock: Arc::new(Mutex::new(())),
})
}
@@ -230,6 +234,7 @@ impl ComputeRuntime {
sandbox_index: SandboxIndex,
sandbox_watch_bus: SandboxWatchBus,
tracing_log_bus: TracingLogBus,
+ supervisor_sessions: Arc,
) -> Result {
let driver = KubernetesComputeDriver::new(config)
.await
@@ -242,6 +247,7 @@ impl ComputeRuntime {
sandbox_index,
sandbox_watch_bus,
tracing_log_bus,
+ supervisor_sessions,
)
.await
}
@@ -253,6 +259,7 @@ impl ComputeRuntime {
sandbox_index: SandboxIndex,
sandbox_watch_bus: SandboxWatchBus,
tracing_log_bus: TracingLogBus,
+ supervisor_sessions: Arc,
) -> Result {
let driver: SharedComputeDriver = Arc::new(RemoteComputeDriver::new(channel));
Self::from_driver(
@@ -262,6 +269,7 @@ impl ComputeRuntime {
sandbox_index,
sandbox_watch_bus,
tracing_log_bus,
+ supervisor_sessions,
)
.await
}
@@ -563,7 +571,8 @@ impl ComputeRuntime {
existing.as_ref().and_then(|sandbox| sandbox.spec.as_ref()),
);
- let phase = derive_phase(incoming.status.as_ref());
+ let session_connected = self.supervisor_sessions.has_session(&incoming.id);
+ let mut phase = derive_phase(incoming.status.as_ref());
let mut sandbox = existing.unwrap_or_else(|| Sandbox {
id: incoming.id.clone(),
name: incoming.name.clone(),
@@ -574,6 +583,12 @@ impl ComputeRuntime {
..Default::default()
});
+ if session_connected && matches!(phase, SandboxPhase::Provisioning | SandboxPhase::Unknown)
+ {
+ ensure_supervisor_ready_status(&mut status, &sandbox.name);
+ phase = SandboxPhase::Ready;
+ }
+
let old_phase = SandboxPhase::try_from(sandbox.phase).unwrap_or(SandboxPhase::Unknown);
if old_phase != phase {
info!(
@@ -622,6 +637,55 @@ impl ComputeRuntime {
Ok(())
}
+ pub async fn supervisor_session_connected(&self, sandbox_id: &str) -> Result<(), String> {
+ self.set_supervisor_session_state(sandbox_id, true).await
+ }
+
+ pub async fn supervisor_session_disconnected(&self, sandbox_id: &str) -> Result<(), String> {
+ self.set_supervisor_session_state(sandbox_id, false).await
+ }
+
+ async fn set_supervisor_session_state(
+ &self,
+ sandbox_id: &str,
+ connected: bool,
+ ) -> Result<(), String> {
+ let _guard = self.sync_lock.lock().await;
+ let Some(record) = self
+ .store
+ .get(Sandbox::object_type(), sandbox_id)
+ .await
+ .map_err(|e| e.to_string())?
+ else {
+ return Ok(());
+ };
+
+ let mut sandbox = decode_sandbox_record(&record)?;
+ let current_phase = SandboxPhase::try_from(sandbox.phase).unwrap_or(SandboxPhase::Unknown);
+
+ if current_phase == SandboxPhase::Deleting || current_phase == SandboxPhase::Error {
+ return Ok(());
+ }
+
+ if connected {
+ ensure_supervisor_ready_status(&mut sandbox.status, &sandbox.name);
+ sandbox.phase = SandboxPhase::Ready as i32;
+ } else if current_phase == SandboxPhase::Ready {
+ ensure_supervisor_not_ready_status(&mut sandbox.status, &sandbox.name);
+ sandbox.phase = SandboxPhase::Provisioning as i32;
+ } else {
+ return Ok(());
+ }
+
+ self.sandbox_index.update_from_sandbox(&sandbox);
+ self.store
+ .put_message(&sandbox)
+ .await
+ .map_err(|e| e.to_string())?;
+ self.sandbox_watch_bus.notify(sandbox_id);
+ Ok(())
+ }
+
async fn apply_deleted(&self, sandbox_id: &str) -> Result<(), String> {
let _guard = self.sync_lock.lock().await;
self.apply_deleted_locked(sandbox_id).await
@@ -963,6 +1027,58 @@ fn public_status_from_driver(status: &DriverSandboxStatus) -> SandboxStatus {
}
}
+fn ensure_supervisor_ready_status(status: &mut Option, sandbox_name: &str) {
+ upsert_ready_condition(
+ status,
+ sandbox_name,
+ SandboxCondition {
+ r#type: "Ready".to_string(),
+ status: "True".to_string(),
+ reason: "DependenciesReady".to_string(),
+ message: "Supervisor session connected".to_string(),
+ last_transition_time: String::new(),
+ },
+ );
+}
+
+fn ensure_supervisor_not_ready_status(status: &mut Option, sandbox_name: &str) {
+ upsert_ready_condition(
+ status,
+ sandbox_name,
+ SandboxCondition {
+ r#type: "Ready".to_string(),
+ status: "False".to_string(),
+ reason: "DependenciesNotReady".to_string(),
+ message: "Supervisor session disconnected".to_string(),
+ last_transition_time: String::new(),
+ },
+ );
+}
+
+fn upsert_ready_condition(
+ status: &mut Option,
+ sandbox_name: &str,
+ condition: SandboxCondition,
+) {
+ let status = status.get_or_insert_with(|| SandboxStatus {
+ sandbox_name: sandbox_name.to_string(),
+ agent_pod: String::new(),
+ agent_fd: String::new(),
+ sandbox_fd: String::new(),
+ conditions: Vec::new(),
+ });
+
+ if let Some(existing) = status
+ .conditions
+ .iter_mut()
+ .find(|existing| existing.r#type == "Ready")
+ {
+ *existing = condition;
+ } else {
+ status.conditions.push(condition);
+ }
+}
+
fn public_condition_from_driver(condition: &DriverCondition) -> SandboxCondition {
SandboxCondition {
r#type: condition.r#type.clone(),
@@ -1044,6 +1160,7 @@ mod tests {
GetSandboxResponse, StopSandboxRequest, StopSandboxResponse, ValidateSandboxCreateResponse,
};
use std::sync::Arc;
+ use tokio::sync::{mpsc, oneshot};
#[derive(Debug, Default)]
struct TestDriver {
@@ -1159,10 +1276,22 @@ mod tests {
sandbox_index: SandboxIndex::new(),
sandbox_watch_bus: SandboxWatchBus::new(),
tracing_log_bus: TracingLogBus::new(),
+ supervisor_sessions: Arc::new(SupervisorSessionRegistry::new()),
sync_lock: Arc::new(Mutex::new(())),
}
}
+ fn register_test_supervisor_session(runtime: &ComputeRuntime, sandbox_id: &str) {
+ let (tx, _rx) = mpsc::channel(1);
+ let (shutdown_tx, _shutdown_rx) = oneshot::channel();
+ runtime.supervisor_sessions.register(
+ sandbox_id.to_string(),
+ "session-1".to_string(),
+ tx,
+ shutdown_tx,
+ );
+ }
+
fn sandbox_record(id: &str, name: &str, phase: SandboxPhase) -> Sandbox {
Sandbox {
id: id.to_string(),
@@ -1417,6 +1546,122 @@ mod tests {
);
}
+ #[tokio::test]
+ async fn apply_sandbox_update_promotes_connected_supervisor_session_to_ready() {
+ let runtime = test_runtime(Arc::new(TestDriver::default())).await;
+ let sandbox = sandbox_record("sb-1", "sandbox-a", SandboxPhase::Provisioning);
+ runtime.store.put_message(&sandbox).await.unwrap();
+
+ register_test_supervisor_session(&runtime, "sb-1");
+
+ runtime
+ .apply_sandbox_update(DriverSandbox {
+ id: "sb-1".to_string(),
+ name: "sandbox-a".to_string(),
+ namespace: "default".to_string(),
+ spec: None,
+ status: Some(make_driver_status(make_driver_condition(
+ "Starting",
+ "VM is starting",
+ ))),
+ })
+ .await
+ .unwrap();
+
+ let stored = runtime
+ .store
+ .get_message::("sb-1")
+ .await
+ .unwrap()
+ .unwrap();
+ assert_eq!(
+ SandboxPhase::try_from(stored.phase).unwrap(),
+ SandboxPhase::Ready
+ );
+ let ready = stored
+ .status
+ .as_ref()
+ .and_then(|status| {
+ status
+ .conditions
+ .iter()
+ .find(|condition| condition.r#type == "Ready")
+ })
+ .unwrap();
+ assert_eq!(ready.status, "True");
+ assert_eq!(ready.reason, "DependenciesReady");
+ assert_eq!(ready.message, "Supervisor session connected");
+ }
+
+ #[tokio::test]
+ async fn supervisor_session_connected_promotes_store_state_without_driver_refresh() {
+ let runtime = test_runtime(Arc::new(TestDriver::default())).await;
+ let sandbox = sandbox_record("sb-1", "sandbox-a", SandboxPhase::Provisioning);
+ runtime.store.put_message(&sandbox).await.unwrap();
+
+ runtime.supervisor_session_connected("sb-1").await.unwrap();
+
+ let stored = runtime
+ .store
+ .get_message::("sb-1")
+ .await
+ .unwrap()
+ .unwrap();
+ assert_eq!(
+ SandboxPhase::try_from(stored.phase).unwrap(),
+ SandboxPhase::Ready
+ );
+ }
+
+ #[tokio::test]
+ async fn supervisor_session_disconnected_demotes_ready_sandbox() {
+ let runtime = test_runtime(Arc::new(TestDriver::default())).await;
+ let mut sandbox = sandbox_record("sb-1", "sandbox-a", SandboxPhase::Ready);
+ sandbox.status = Some(SandboxStatus {
+ sandbox_name: "sandbox-a".to_string(),
+ agent_pod: String::new(),
+ agent_fd: String::new(),
+ sandbox_fd: String::new(),
+ conditions: vec![SandboxCondition {
+ r#type: "Ready".to_string(),
+ status: "True".to_string(),
+ reason: "DependenciesReady".to_string(),
+ message: "Supervisor session connected".to_string(),
+ last_transition_time: String::new(),
+ }],
+ });
+ runtime.store.put_message(&sandbox).await.unwrap();
+
+ runtime
+ .supervisor_session_disconnected("sb-1")
+ .await
+ .unwrap();
+
+ let stored = runtime
+ .store
+ .get_message::("sb-1")
+ .await
+ .unwrap()
+ .unwrap();
+ assert_eq!(
+ SandboxPhase::try_from(stored.phase).unwrap(),
+ SandboxPhase::Provisioning
+ );
+ let ready = stored
+ .status
+ .as_ref()
+ .and_then(|status| {
+ status
+ .conditions
+ .iter()
+ .find(|condition| condition.r#type == "Ready")
+ })
+ .unwrap();
+ assert_eq!(ready.status, "False");
+ assert_eq!(ready.reason, "DependenciesNotReady");
+ assert_eq!(ready.message, "Supervisor session disconnected");
+ }
+
#[tokio::test]
async fn reconcile_store_with_backend_applies_driver_snapshot() {
let runtime = test_runtime(Arc::new(TestDriver {
diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs
index 9501ea3b2..a40794037 100644
--- a/crates/openshell-server/src/lib.rs
+++ b/crates/openshell-server/src/lib.rs
@@ -88,7 +88,7 @@ pub struct ServerState {
pub settings_mutex: tokio::sync::Mutex<()>,
/// Registry of active supervisor sessions and pending relay channels.
- pub supervisor_sessions: supervisor_session::SupervisorSessionRegistry,
+ pub supervisor_sessions: Arc,
}
fn is_benign_tls_handshake_failure(error: &std::io::Error) -> bool {
@@ -108,6 +108,7 @@ impl ServerState {
sandbox_index: SandboxIndex,
sandbox_watch_bus: SandboxWatchBus,
tracing_log_bus: TracingLogBus,
+ supervisor_sessions: Arc,
) -> Self {
Self {
config,
@@ -119,7 +120,7 @@ impl ServerState {
ssh_connections_by_token: Mutex::new(HashMap::new()),
ssh_connections_by_sandbox: Mutex::new(HashMap::new()),
settings_mutex: tokio::sync::Mutex::new(()),
- supervisor_sessions: supervisor_session::SupervisorSessionRegistry::new(),
+ supervisor_sessions,
}
}
}
@@ -150,6 +151,7 @@ pub async fn run_server(
let sandbox_index = SandboxIndex::new();
let sandbox_watch_bus = SandboxWatchBus::new();
+ let supervisor_sessions = Arc::new(supervisor_session::SupervisorSessionRegistry::new());
let compute = build_compute_runtime(
&config,
&vm_config,
@@ -157,6 +159,7 @@ pub async fn run_server(
sandbox_index.clone(),
sandbox_watch_bus.clone(),
tracing_log_bus.clone(),
+ supervisor_sessions.clone(),
)
.await?;
let state = Arc::new(ServerState::new(
@@ -166,6 +169,7 @@ pub async fn run_server(
sandbox_index,
sandbox_watch_bus,
tracing_log_bus,
+ supervisor_sessions,
));
state.compute.spawn_watchers();
@@ -261,6 +265,7 @@ async fn build_compute_runtime(
sandbox_index: SandboxIndex,
sandbox_watch_bus: SandboxWatchBus,
tracing_log_bus: TracingLogBus,
+ supervisor_sessions: Arc,
) -> Result {
let driver = configured_compute_driver(config)?;
info!(driver = %driver, "Using compute driver");
@@ -288,6 +293,7 @@ async fn build_compute_runtime(
sandbox_index,
sandbox_watch_bus,
tracing_log_bus,
+ supervisor_sessions.clone(),
)
.await
.map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))),
@@ -300,6 +306,7 @@ async fn build_compute_runtime(
sandbox_index,
sandbox_watch_bus,
tracing_log_bus,
+ supervisor_sessions,
)
.await
.map_err(|e| Error::execution(format!("failed to create compute runtime: {e}")))
diff --git a/crates/openshell-server/src/supervisor_session.rs b/crates/openshell-server/src/supervisor_session.rs
index f81ee9e3c..d130bf71d 100644
--- a/crates/openshell-server/src/supervisor_session.rs
+++ b/crates/openshell-server/src/supervisor_session.rs
@@ -180,6 +180,10 @@ impl SupervisorSessionRegistry {
.map(|s| s.tx.clone())
}
+ pub fn has_session(&self, sandbox_id: &str) -> bool {
+ self.sessions.lock().unwrap().contains_key(sandbox_id)
+ }
+
fn pending_channel_ids(&self, sandbox_id: &str) -> Vec {
self.pending_relays
.lock()
@@ -547,6 +551,19 @@ pub async fn handle_connect_supervisor(
.await;
}
+ if let Err(err) = state
+ .compute
+ .supervisor_session_connected(&sandbox_id)
+ .await
+ {
+ warn!(
+ sandbox_id = %sandbox_id,
+ session_id = %session_id,
+ error = %err,
+ "supervisor session: failed to mark sandbox ready"
+ );
+ }
+
// Step 4: Spawn the session loop that reads inbound messages.
let state_clone = Arc::clone(state);
let sandbox_id_clone = sandbox_id.clone();
@@ -565,6 +582,18 @@ pub async fn handle_connect_supervisor(
.remove_if_current(&sandbox_id_clone, &session_id);
if still_ours {
info!(sandbox_id = %sandbox_id_clone, session_id = %session_id, "supervisor session: ended");
+ if let Err(err) = state_clone
+ .compute
+ .supervisor_session_disconnected(&sandbox_id_clone)
+ .await
+ {
+ warn!(
+ sandbox_id = %sandbox_id_clone,
+ session_id = %session_id,
+ error = %err,
+ "supervisor session: failed to mark sandbox disconnected"
+ );
+ }
} else {
info!(sandbox_id = %sandbox_id_clone, session_id = %session_id, "supervisor session: ended (already superseded)");
}
diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh
index d43046d4f..566b32141 100755
--- a/crates/openshell-vm/scripts/build-rootfs.sh
+++ b/crates/openshell-vm/scripts/build-rootfs.sh
@@ -119,6 +119,65 @@ verify_checksum() {
fi
}
+ensure_build_nofile_limit() {
+ local desired="${OPENSHELL_VM_BUILD_NOFILE_LIMIT:-8192}"
+ local minimum=1024
+ local current=""
+ local hard=""
+ local target=""
+
+ [ "$(uname -s)" = "Darwin" ] || return 0
+ command -v cargo-zigbuild >/dev/null 2>&1 || return 0
+
+ current="$(ulimit -n 2>/dev/null || echo "")"
+ case "${current}" in
+ ''|*[!0-9]*)
+ return 0
+ ;;
+ esac
+
+ if [ "${current}" -ge "${desired}" ]; then
+ return 0
+ fi
+
+ hard="$(ulimit -Hn 2>/dev/null || echo "")"
+ target="${desired}"
+ case "${hard}" in
+ ''|unlimited|infinity)
+ ;;
+ *[!0-9]*)
+ ;;
+ *)
+ if [ "${hard}" -lt "${target}" ]; then
+ target="${hard}"
+ fi
+ ;;
+ esac
+
+ if [ "${target}" -gt "${current}" ] && ulimit -n "${target}" 2>/dev/null; then
+ echo "==> Raised open file limit for cargo-zigbuild: ${current} -> $(ulimit -n)"
+ fi
+
+ current="$(ulimit -n 2>/dev/null || echo "${current}")"
+ case "${current}" in
+ ''|*[!0-9]*)
+ return 0
+ ;;
+ esac
+
+ if [ "${current}" -lt "${desired}" ]; then
+ echo "WARNING: Open file limit is ${current}; cargo-zigbuild is more reliable at ${desired}+ on macOS."
+ fi
+
+ if [ "${current}" -lt "${minimum}" ]; then
+ echo "ERROR: Open file limit (${current}) is too low for cargo-zigbuild on macOS."
+ echo " Zig 0.14+ can fail with ProcessFdQuotaExceeded while linking large binaries."
+ echo " Run: ulimit -n ${desired}"
+ echo " Then re-run this script."
+ exit 1
+ fi
+}
+
if [ "$BASE_ONLY" = true ]; then
echo "==> Building base openshell-vm rootfs"
echo " Guest arch: ${GUEST_ARCH}"
@@ -135,6 +194,10 @@ else
fi
echo ""
+# cargo-zigbuild on macOS can exhaust the default per-process file descriptor
+# limit while linking larger targets with Zig 0.14+.
+ensure_build_nofile_limit
+
# ── Check for running VM ────────────────────────────────────────────────
# If an openshell-vm is using this rootfs via virtio-fs, wiping the rootfs
# corrupts the VM's filesystem (e.g. /var disappears) causing cascading
diff --git a/e2e/rust/e2e-vm.sh b/e2e/rust/e2e-vm.sh
index 5fd055036..5990d8db6 100755
--- a/e2e/rust/e2e-vm.sh
+++ b/e2e/rust/e2e-vm.sh
@@ -2,245 +2,227 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
-# Run the Rust e2e smoke test against an openshell-vm gateway.
+# Run the Rust e2e smoke test against an openshell-gateway running the
+# standalone VM compute driver (`openshell-driver-vm`).
#
-# Usage:
-# mise run e2e:vm # start new named VM on random port
-# mise run e2e:vm -- --vm-port=30051 # reuse existing VM on port 30051
-# mise run e2e:vm -- --vm-port=30051 --vm-name=my-vm # reuse existing named VM and run exec check
-#
-# Options:
-# --vm-port=PORT Skip VM startup and test against this port.
-# --vm-name=NAME VM instance name. Auto-generated for fresh VMs.
+# Architecture (post supervisor-initiated relay, PR #867):
+# * The gateway never dials the sandbox. Instead, the in-guest
+# supervisor opens an outbound `ConnectSupervisor` gRPC stream to
+# the gateway on startup and keeps it alive for the sandbox
+# lifetime. SSH (`/connect/ssh`) and `ExecSandbox` traffic ride the
+# same TCP+TLS+HTTP/2 connection as multiplexed HTTP/2 streams.
+# * There is no host-side SSH port forward. gvproxy still provides
+# guest egress so the supervisor can reach the gateway, but it no
+# longer forwards any TCP port back to the guest.
+# * Readiness is authoritative on the gateway: a sandbox's phase
+# flips to `Ready` the moment `ConnectSupervisor` registers, and
+# back to `Provisioning` when the session drops. The VM driver
+# only reports `Error` conditions for dead launcher processes.
#
-# When --vm-port is omitted:
-# 1. Picks a random free host port
-# 2. Starts the VM with --name --port :30051
-# 3. Waits for the VM to fully bootstrap (mTLS certs + gRPC health)
-# 4. Verifies `openshell-vm exec` works
-# 5. Runs the Rust smoke test
-# 6. Tears down the VM
+# Usage:
+# mise run e2e:vm
#
-# When --vm-port is given the script assumes the VM is already running
-# on that port and runs the smoke test. The VM exec check runs only when
-# --vm-name is provided (so the script can target the correct instance).
+# What the script does:
+# 1. Ensures the VM runtime (libkrun + gvproxy + rootfs) is staged.
+# 2. Builds `openshell-gateway`, `openshell-driver-vm`, and the
+# `openshell` CLI with the embedded runtime.
+# 3. On macOS, codesigns the VM driver (libkrun needs the
+# `com.apple.security.hypervisor` entitlement).
+# 4. Starts the gateway with `--drivers vm --disable-tls
+# --disable-gateway-auth --db-url sqlite::memory:` on a random
+# free port, waits for `Server listening`, then runs the
+# cluster-agnostic Rust smoke test.
+# 5. Tears the gateway down and (on failure) preserves the gateway
+# log and every VM serial console log for post-mortem.
#
-# Prerequisites (when starting a new VM): `mise run vm:build` must already
-# be done (the e2e:vm mise task handles this via depends).
+# Prerequisites (handled automatically by this script if missing):
+# - `mise run vm:setup` — downloads / builds the libkrun runtime.
+# - `mise run vm:rootfs -- --base` — builds the sandbox rootfs tarball.
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
-RUNTIME_DIR="${ROOT}/target/debug/openshell-vm.runtime"
-GATEWAY_BIN="${ROOT}/target/debug/openshell-vm"
-VM_GATEWAY_IMAGE="${IMAGE_REPO_BASE:-openshell}/gateway:${IMAGE_TAG:-dev}"
-VM_GATEWAY_TAR_REL="var/lib/rancher/k3s/agent/images/openshell-server.tar.zst"
-GUEST_PORT=30051
-TIMEOUT=180
-
-named_vm_rootfs() {
- local vm_version
-
- vm_version=$("${GATEWAY_BIN}" --version | awk '{print $2}')
- printf '%s\n' "${XDG_DATA_HOME:-${HOME}/.local/share}/openshell/openshell-vm/${vm_version}/instances/${VM_NAME}/rootfs"
-}
-
-vm_exec() {
- local rootfs_args=()
- if [ -n "${VM_ROOTFS_DIR:-}" ]; then
- rootfs_args=(--rootfs "${VM_ROOTFS_DIR}")
- fi
- "${GATEWAY_BIN}" "${rootfs_args[@]}" --name "${VM_NAME}" exec -- "$@"
-}
+COMPRESSED_DIR="${ROOT}/target/vm-runtime-compressed"
+GATEWAY_BIN="${ROOT}/target/debug/openshell-gateway"
+DRIVER_BIN="${ROOT}/target/debug/openshell-driver-vm"
+
+# The VM driver places `compute-driver.sock` under --vm-driver-state-dir.
+# AF_UNIX SUN_LEN is 104 bytes on macOS (108 on Linux), so paths anchored
+# in the workspace's `target/` blow the limit on typical developer
+# machines — e.g. a ~100-char `~/.superset/worktrees/.../target/...`
+# prefix plus the `compute-driver.sock` leaf leaves no room. macOS'
+# per-user `$TMPDIR` (`/var/folders/xx/.../T/`) can be 50+ chars too,
+# so root state under `/tmp` unconditionally to keep UDS paths short.
+STATE_DIR_ROOT="/tmp"
+
+# Smoke test timeouts. First boot extracts the embedded libkrun runtime
+# (~60–90MB of zstd per architecture) and the sandbox rootfs (~200MB).
+# The guest then runs k3s-free sandbox supervisor startup; a cold
+# microVM is typically ready within ~15s.
+GATEWAY_READY_TIMEOUT=60
+SANDBOX_PROVISION_TIMEOUT=180
+
+# ── Build prerequisites ──────────────────────────────────────────────
+
+if [ ! -f "${COMPRESSED_DIR}/rootfs.tar.zst" ]; then
+ echo "==> Building base VM rootfs tarball (mise run vm:rootfs -- --base)"
+ mise run vm:rootfs -- --base
+fi
-prepare_named_vm_rootfs() {
- if [ -z "${VM_NAME}" ]; then
- return 0
- fi
+if [ ! -f "${COMPRESSED_DIR}/rootfs.tar.zst" ] \
+ || ! find "${COMPRESSED_DIR}" -maxdepth 1 -name 'libkrun*.zst' | grep -q .; then
+ echo "==> Preparing embedded VM runtime (mise run vm:setup)"
+ mise run vm:setup
+fi
- echo "Preparing named VM rootfs '${VM_NAME}'..."
- VM_ROOTFS_DIR="$("${ROOT}/tasks/scripts/vm/ensure-vm-rootfs.sh" --name "${VM_NAME}" \
- | tail -n 1 | sed 's/^using openshell-vm rootfs at //')"
- "${ROOT}/tasks/scripts/vm/sync-vm-rootfs.sh" --name "${VM_NAME}"
-}
+export OPENSHELL_VM_RUNTIME_COMPRESSED_DIR="${OPENSHELL_VM_RUNTIME_COMPRESSED_DIR:-${COMPRESSED_DIR}}"
+
+echo "==> Building openshell-gateway, openshell-driver-vm, openshell (CLI)"
+cargo build \
+ -p openshell-server \
+ -p openshell-driver-vm \
+ -p openshell-cli \
+ --features openshell-core/dev-settings
+
+if [ "$(uname -s)" = "Darwin" ]; then
+ echo "==> Codesigning openshell-driver-vm (Hypervisor entitlement)"
+ codesign \
+ --entitlements "${ROOT}/crates/openshell-driver-vm/entitlements.plist" \
+ --force \
+ -s - \
+ "${DRIVER_BIN}"
+fi
-refresh_vm_gateway() {
- if [ -z "${VM_NAME}" ]; then
- return 0
+# ── Pick a random free host port for the gateway ─────────────────────
+
+HOST_PORT="$(python3 -c 'import socket
+s = socket.socket()
+s.bind(("", 0))
+print(s.getsockname()[1])
+s.close()')"
+
+# Per-run state dir so concurrent e2e runs don't collide on the UDS or
+# sandbox state. The VM driver creates `<state-dir>/compute-driver.sock`
+# and `<state-dir>/sandboxes/<sandbox-id>/rootfs/` under here. Keep the
+# basename short — see the SUN_LEN comment above.
+RUN_STATE_DIR="${STATE_DIR_ROOT}/os-vm-e2e-${HOST_PORT}-$$"
+mkdir -p "${RUN_STATE_DIR}"
+
+GATEWAY_LOG="$(mktemp /tmp/openshell-gateway-e2e.XXXXXX)"
+
+# ── Cleanup (trap) ───────────────────────────────────────────────────
+
+cleanup() {
+ local exit_code=$?
+
+ if [ -n "${GATEWAY_PID:-}" ] && kill -0 "${GATEWAY_PID}" 2>/dev/null; then
+ echo "Stopping openshell-gateway (pid ${GATEWAY_PID})..."
+ # SIGTERM first; gateway drops ManagedDriverProcess which SIGKILLs
+ # the driver and removes the UDS. Wait briefly, then force-kill.
+ kill -TERM "${GATEWAY_PID}" 2>/dev/null || true
+ for _ in 1 2 3 4 5 6 7 8 9 10; do
+ kill -0 "${GATEWAY_PID}" 2>/dev/null || break
+ sleep 0.5
+ done
+ kill -KILL "${GATEWAY_PID}" 2>/dev/null || true
+ wait "${GATEWAY_PID}" 2>/dev/null || true
fi
- echo "Refreshing VM gateway StatefulSet image to ${VM_GATEWAY_IMAGE}..."
- # Re-import the host-synced :dev image into the VM's containerd, then
- # force a rollout when the StatefulSet already points at the same tag.
- vm_exec sh -lc "set -eu; \
- image_tar='/${VM_GATEWAY_TAR_REL}'; \
- k3s ctr -n k8s.io images import \"\${image_tar}\" >/dev/null; \
- current_image=\$(kubectl -n openshell get statefulset/openshell -o jsonpath='{.spec.template.spec.containers[?(@.name==\"openshell\")].image}'); \
- if [ \"\${current_image}\" = \"${VM_GATEWAY_IMAGE}\" ]; then \
- kubectl -n openshell rollout restart statefulset/openshell >/dev/null; \
- else \
- kubectl -n openshell set image statefulset/openshell openshell=${VM_GATEWAY_IMAGE} >/dev/null; \
- fi; \
- kubectl -n openshell rollout status statefulset/openshell --timeout=300s"
- echo "Gateway rollout complete."
-}
-
-wait_for_gateway_health() {
- local elapsed=0 timeout=60 consecutive_ok=0
-
- echo "Waiting for refreshed gateway health..."
- while [ "${elapsed}" -lt "${timeout}" ]; do
- if "${ROOT}/target/debug/openshell" status >/dev/null 2>&1; then
- consecutive_ok=$((consecutive_ok + 1))
- if [ "${consecutive_ok}" -ge 3 ]; then
- echo "Gateway health confirmed after refresh."
- return 0
- fi
- else
- consecutive_ok=0
- fi
-
- sleep 2
- elapsed=$((elapsed + 2))
- done
-
- echo "ERROR: refreshed gateway did not become healthy after ${timeout}s"
- return 1
-}
-
-# ── Parse arguments ──────────────────────────────────────────────────
-VM_PORT=""
-VM_NAME=""
-VM_ROOTFS_DIR=""
-for arg in "$@"; do
- case "$arg" in
- --vm-port=*) VM_PORT="${arg#--vm-port=}" ;;
- --vm-name=*) VM_NAME="${arg#--vm-name=}" ;;
- *) echo "Unknown argument: $arg"; exit 1 ;;
- esac
-done
+ # On failure, print the gateway log and any VM console logs for
+ # debugging. We deliberately print them instead of relying on the
+ # on-disk copies because the gateway log is removed unconditionally below.
+ if [ "${exit_code}" -ne 0 ]; then
+ echo "=== gateway log (preserved for debugging) ==="
+ cat "${GATEWAY_LOG}" 2>/dev/null || true
+ echo "=== end gateway log ==="
+
+ local console
+ while IFS= read -r -d '' console; do
+ echo "=== VM console log: ${console} ==="
+ cat "${console}" 2>/dev/null || true
+ echo "=== end VM console log ==="
+ done < <(find "${RUN_STATE_DIR}/sandboxes" -name 'rootfs-console.log' -print0 2>/dev/null)
+ fi
-# ── Determine mode ───────────────────────────────────────────────────
-if [ -n "${VM_PORT}" ]; then
- # Point at an already-running VM.
- HOST_PORT="${VM_PORT}"
- echo "Using existing VM on port ${HOST_PORT}."
- if [ -n "${VM_NAME}" ]; then
- prepare_named_vm_rootfs
+ rm -f "${GATEWAY_LOG}" 2>/dev/null || true
+ # Only wipe the per-run state dir on success. On failure, leave it for
+ # post-mortem (serial console logs, gvproxy logs, rootfs dumps).
+ if [ "${exit_code}" -eq 0 ]; then
+ rm -rf "${RUN_STATE_DIR}" 2>/dev/null || true
+ else
+ echo "NOTE: preserving ${RUN_STATE_DIR} for debugging"
fi
-else
- # Pick a random free port and start a new VM.
- HOST_PORT=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()')
- if [ -z "${VM_NAME}" ]; then
- VM_NAME="e2e-${HOST_PORT}-$$"
+}
+trap cleanup EXIT
+
+# ── Launch the gateway + VM driver ───────────────────────────────────
+
+SSH_HANDSHAKE_SECRET="$(openssl rand -hex 32)"
+
+echo "==> Starting openshell-gateway on 127.0.0.1:${HOST_PORT} (state: ${RUN_STATE_DIR})"
+
+# Pin --driver-dir to the workspace `target/debug/` so we always pick up
+# the driver we just cargo-built. Without this, the gateway's
+# `resolve_compute_driver_bin` fallback prefers
+# `~/.local/libexec/openshell/openshell-driver-vm` when present
+# (install-vm.sh installs there), which silently shadows development
+# builds — a subtle source of stale-binary bugs in e2e runs.
+"${GATEWAY_BIN}" \
+ --drivers vm \
+ --disable-tls \
+ --disable-gateway-auth \
+ --db-url 'sqlite::memory:' \
+ --port "${HOST_PORT}" \
+ --grpc-endpoint "http://127.0.0.1:${HOST_PORT}" \
+ --ssh-handshake-secret "${SSH_HANDSHAKE_SECRET}" \
+ --driver-dir "${ROOT}/target/debug" \
+ --vm-driver-state-dir "${RUN_STATE_DIR}" \
+ >"${GATEWAY_LOG}" 2>&1 &
+GATEWAY_PID=$!
+
+# ── Wait for gateway readiness ───────────────────────────────────────
+#
+# The gateway logs `INFO openshell_server: Server listening
+# address=0.0.0.0:<port>` after its tonic listener is up. That is the
+# only signal the smoke test needs — the VM driver is spawned eagerly
+# but sandboxes are created on demand, so "Server listening" is the
+# right gate here.
+
+echo "==> Waiting for gateway readiness (timeout ${GATEWAY_READY_TIMEOUT}s)"
+elapsed=0
+while ! grep -q 'Server listening' "${GATEWAY_LOG}" 2>/dev/null; do
+ if ! kill -0 "${GATEWAY_PID}" 2>/dev/null; then
+ echo "ERROR: openshell-gateway exited before becoming ready"
+ exit 1
fi
-
- cleanup() {
- local exit_code=$?
- if [ -n "${VM_PID:-}" ] && kill -0 "$VM_PID" 2>/dev/null; then
- echo "Stopping openshell-vm (pid ${VM_PID})..."
- kill "$VM_PID" 2>/dev/null || true
- wait "$VM_PID" 2>/dev/null || true
- fi
- # On failure, preserve the VM console log for post-mortem debugging.
- if [ "$exit_code" -ne 0 ] && [ -n "${VM_NAME:-}" ]; then
- local console_log
- console_log="$(named_vm_rootfs)-console.log"
- if [ -f "$console_log" ]; then
- echo "=== VM console log (preserved for debugging) ==="
- cat "$console_log"
- echo "=== end VM console log ==="
- fi
- fi
- rm -f "${VM_LOG:-}" 2>/dev/null || true
- if [ -n "${VM_NAME:-}" ]; then
- rm -rf "$(dirname "$(named_vm_rootfs)")" 2>/dev/null || true
- fi
- }
- trap cleanup EXIT
-
- prepare_named_vm_rootfs
-
- echo "Starting openshell-vm '${VM_NAME}' on port ${HOST_PORT}..."
- if [ "$(uname -s)" = "Darwin" ]; then
- export DYLD_FALLBACK_LIBRARY_PATH="${RUNTIME_DIR}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}"
+ if [ "${elapsed}" -ge "${GATEWAY_READY_TIMEOUT}" ]; then
+ echo "ERROR: openshell-gateway did not become ready after ${GATEWAY_READY_TIMEOUT}s"
+ exit 1
fi
+ sleep 1
+ elapsed=$((elapsed + 1))
+done
- VM_LOG=$(mktemp /tmp/openshell-vm-e2e.XXXXXX)
- rootfs_args=()
- if [ -n "${VM_ROOTFS_DIR}" ]; then
- rootfs_args=(--rootfs "${VM_ROOTFS_DIR}")
- fi
- "${GATEWAY_BIN}" "${rootfs_args[@]}" --name "${VM_NAME}" --port "${HOST_PORT}:${GUEST_PORT}" 2>"${VM_LOG}" &
- VM_PID=$!
-
- # ── Wait for full bootstrap (mTLS certs + gRPC health) ─────────────
- # The VM prints "Ready [Xs total]" to stderr after bootstrap_gateway()
- # stores mTLS certs and wait_for_gateway_ready() confirms the gRPC
- # service is responding. Waiting only for TCP port reachability (nc -z)
- # is insufficient because port forwarding is established before the
- # mTLS certs are written, causing `openshell status` to fail.
- echo "Waiting for VM bootstrap to complete (timeout ${TIMEOUT}s)..."
- elapsed=0
- while ! grep -q "^Ready " "${VM_LOG}" 2>/dev/null; do
- if ! kill -0 "$VM_PID" 2>/dev/null; then
- echo "ERROR: openshell-vm exited before becoming ready"
- echo "VM log:"
- cat "${VM_LOG}"
- exit 1
- fi
- if [ "$elapsed" -ge "$TIMEOUT" ]; then
- echo "ERROR: openshell-vm did not become ready after ${TIMEOUT}s"
- echo "VM log:"
- cat "${VM_LOG}"
- exit 1
- fi
- sleep 2
- elapsed=$((elapsed + 2))
- done
- echo "Gateway is ready (${elapsed}s)."
- echo "VM log:"
- cat "${VM_LOG}"
-fi
+echo "==> Gateway ready after ${elapsed}s"
-# ── Exec into the VM (when instance name is known) ───────────────────
-if [ -n "${VM_NAME}" ]; then
- echo "Verifying openshell-vm exec for '${VM_NAME}'..."
- exec_elapsed=0
- exec_timeout=60
- until vm_exec /bin/true; do
- if [ "$exec_elapsed" -ge "$exec_timeout" ]; then
- echo "ERROR: openshell-vm exec did not become ready after ${exec_timeout}s"
- exit 1
- fi
- sleep 2
- exec_elapsed=$((exec_elapsed + 2))
- done
- echo "VM exec succeeded."
-else
- echo "Skipping openshell-vm exec check (provide --vm-name for existing VMs)."
-fi
+# ── Run the smoke test ───────────────────────────────────────────────
+#
+# The CLI takes OPENSHELL_GATEWAY_ENDPOINT directly; no gateway
+# metadata lookup needed when TLS is disabled.
-refresh_vm_gateway
+export OPENSHELL_GATEWAY_ENDPOINT="http://127.0.0.1:${HOST_PORT}"
-# ── Run the smoke test ───────────────────────────────────────────────
-# The openshell CLI reads OPENSHELL_GATEWAY_ENDPOINT to connect to the
-# gateway directly, and OPENSHELL_GATEWAY to resolve mTLS certs from
-# ~/.config/openshell/gateways//mtls/.
-# In the VM, the overlayfs snapshotter re-extracts all image layers on
-# every boot. The 1GB sandbox base image extraction can take >300s
-# under contention, so allow 600s for sandbox provisioning.
-export OPENSHELL_PROVISION_TIMEOUT=600
-export OPENSHELL_GATEWAY_ENDPOINT="https://127.0.0.1:${HOST_PORT}"
-if [ -n "${VM_NAME}" ]; then
- export OPENSHELL_GATEWAY="openshell-vm-${VM_NAME}"
-else
- export OPENSHELL_GATEWAY="openshell-vm"
-fi
+# The VM driver creates each sandbox VM from scratch — the embedded
+# rootfs is extracted per sandbox, and the guest's sandbox supervisor
+# then initializes policy, netns, Landlock, and sshd. On a cold host
+# this is ~15s; allow 180s for slower CI runners.
+export OPENSHELL_PROVISION_TIMEOUT="${SANDBOX_PROVISION_TIMEOUT}"
-echo "Running e2e smoke test (gateway: ${OPENSHELL_GATEWAY}, endpoint: ${OPENSHELL_GATEWAY_ENDPOINT})..."
-cargo build -p openshell-cli --features openshell-core/dev-settings
-wait_for_gateway_health
-cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test smoke -- --nocapture
+echo "==> Running e2e smoke test (endpoint: ${OPENSHELL_GATEWAY_ENDPOINT})"
+cargo test \
+ --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \
+ --features e2e \
+ --test smoke \
+ -- --nocapture
-echo "Smoke test passed."
+echo "==> Smoke test passed."
diff --git a/tasks/scripts/vm/smoke-orphan-cleanup.sh b/tasks/scripts/vm/smoke-orphan-cleanup.sh
new file mode 100755
index 000000000..9a37861a0
--- /dev/null
+++ b/tasks/scripts/vm/smoke-orphan-cleanup.sh
@@ -0,0 +1,204 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Smoke test: start the gateway with the VM driver, create a sandbox, then
+# signal the gateway (SIGTERM then SIGKILL) and verify that no driver,
+# launcher, gvproxy, or libkrun worker processes survive.
+#
+# Exit codes:
+# 0 — both SIGTERM and SIGKILL cleanup passed
+# 1 — one or more scenarios leaked survivors
+
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"
+cd "$ROOT"
+
+PORT="${OPENSHELL_SERVER_PORT:-8091}"
+XDG="${TMPDIR:-/tmp}/vm-orphan-xdg-$$"
+STATE_DIR="${TMPDIR:-/tmp}/openshell-vm-orphan-$$"
+LOG="${TMPDIR:-/tmp}/vm-orphan-$$.log"
+
+cleanup_stray() {
+ # Best-effort: kill anything left over from our sandbox ids so repeated
+ # runs don't accumulate.
+ pkill -9 -f "openshell-vm-orphan-$$" 2>/dev/null || true
+ rm -rf "$XDG" "$STATE_DIR" 2>/dev/null || true
+ # Preserve the gateway log only on failure so operators can diagnose.
+ if [ "${EXIT_CODE:-0}" -ne 0 ]; then
+ echo "(log preserved at $LOG)" >&2
+ else
+ rm -f "$LOG" "$LOG.create" 2>/dev/null || true
+ fi
+}
+trap cleanup_stray EXIT
+
+build_binaries() {
+ echo "==> Ensuring binaries are built"
+ if [ ! -x "$ROOT/target/debug/openshell-gateway" ] || [ ! -x "$ROOT/target/debug/openshell-driver-vm" ]; then
+ cargo build -p openshell-server -p openshell-driver-vm >&2
+ fi
+ if [ "$(uname -s)" = "Darwin" ]; then
+ codesign \
+ --entitlements "$ROOT/crates/openshell-driver-vm/entitlements.plist" \
+ --force -s - \
+ "$ROOT/target/debug/openshell-driver-vm" >/dev/null 2>&1 || true
+ fi
+}
+
+start_gateway() {
+ local health_port=$((PORT + 1))
+ echo "==> Starting gateway on port $PORT (state=$STATE_DIR, health=$health_port)"
+ mkdir -p "$STATE_DIR"
+ OPENSHELL_SERVER_PORT="$PORT" \
+ OPENSHELL_HEALTH_PORT="$health_port" \
+ OPENSHELL_DB_URL="sqlite:$STATE_DIR/openshell.db" \
+ OPENSHELL_DRIVERS=vm \
+ OPENSHELL_DRIVER_DIR="$ROOT/target/debug" \
+ OPENSHELL_GRPC_ENDPOINT="http://host.containers.internal:$PORT" \
+ OPENSHELL_SSH_GATEWAY_HOST=127.0.0.1 \
+ OPENSHELL_SSH_GATEWAY_PORT="$PORT" \
+ OPENSHELL_SSH_HANDSHAKE_SECRET=dev-vm-driver-secret \
+ OPENSHELL_VM_DRIVER_STATE_DIR="$STATE_DIR" \
+ OPENSHELL_VM_RUNTIME_COMPRESSED_DIR="$ROOT/target/vm-runtime-compressed" \
+ nohup "$ROOT/target/debug/openshell-gateway" --disable-tls \
+ > "$LOG" 2>&1 &
+ GATEWAY_PID=$!
+ echo "gateway pid=$GATEWAY_PID"
+
+ for _ in $(seq 1 60); do
+ if grep -q "Server listening" "$LOG" 2>/dev/null; then
+ return 0
+ fi
+ if ! kill -0 "$GATEWAY_PID" 2>/dev/null; then
+ echo "!! gateway died before ready"
+ tail -40 "$LOG" >&2
+ return 1
+ fi
+ sleep 1
+ done
+ echo "!! gateway never reported ready"
+ tail -40 "$LOG" >&2
+ return 1
+}
+
+create_sandbox() {
+ echo "==> Creating sandbox (--keep, long-running)"
+ mkdir -p "$XDG"
+ XDG_CONFIG_HOME="$XDG" "$ROOT/scripts/bin/openshell" gateway add \
+ --name vm-orphan http://127.0.0.1:"$PORT" >/dev/null
+ XDG_CONFIG_HOME="$XDG" "$ROOT/scripts/bin/openshell" gateway select vm-orphan >/dev/null
+
+ # Run the CLI in the background; it blocks waiting for sleep to finish.
+ XDG_CONFIG_HOME="$XDG" "$ROOT/scripts/bin/openshell" sandbox create \
+ --name "orphan-$$" --keep -- sleep 99999 \
+ > "$LOG.create" 2>&1 &
+ CLI_PID=$!
+
+ for _ in $(seq 1 60); do
+ if pgrep -f "openshell-vm-orphan-$$|$STATE_DIR/sandboxes/" >/dev/null 2>&1; then
+ if pgrep -f gvproxy >/dev/null 2>&1; then
+ echo "sandbox came up (cli pid=$CLI_PID)"
+ return 0
+ fi
+ fi
+ sleep 2
+ done
+ echo "!! sandbox never came up"
+ tail -40 "$LOG" "$LOG.create" >&2 2>/dev/null || true
+ return 1
+}
+
+snapshot_kids() {
+ # Return all PIDs whose --state-dir or --vm-rootfs references our
+ # per-run directory, plus any gvproxy that mentions our socket base.
+ pgrep -fl "state-dir $STATE_DIR|$STATE_DIR/sandboxes" 2>/dev/null || true
+ pgrep -fl "gvproxy" 2>/dev/null | grep "osd-gv" || true
+}
+
+count_alive() {
+ local alive
+ alive=$(pgrep -f "state-dir $STATE_DIR|$STATE_DIR/sandboxes" 2>/dev/null | wc -l | tr -d ' ')
+ local gv
+ gv=$(pgrep -f 'gvproxy' 2>/dev/null | xargs -r ps -o pid=,command= -p 2>/dev/null | grep -c 'osd-gv' || true)
+ echo $((alive + gv))
+}
+
+verify_cleanup() {
+ local label="$1"
+ local deadline="$2"
+ local waited=0
+ while [ "$waited" -lt "$deadline" ]; do
+ local n
+ n=$(count_alive)
+ if [ "$n" = "0" ]; then
+ echo " PASS ($label): all descendants gone after ${waited}s"
+ return 0
+ fi
+ sleep 1
+ waited=$((waited + 1))
+ done
+ echo " FAIL ($label): $(count_alive) descendants still alive after ${deadline}s:"
+ snapshot_kids | sed 's/^/ /'
+ return 1
+}
+
+run_scenario() {
+ local signal="$1"
+ local label="$2"
+ echo "======================================================"
+ echo "Scenario: $label (signal $signal)"
+ echo "======================================================"
+
+ start_gateway || return 1
+ create_sandbox || { kill -9 "$GATEWAY_PID" 2>/dev/null; return 1; }
+
+ echo "-- process tree before signal --"
+ snapshot_kids | sed 's/^/ /'
+ echo
+
+ echo "-> kill -$signal $GATEWAY_PID"
+ kill "-$signal" "$GATEWAY_PID" 2>/dev/null || true
+
+ verify_cleanup "$label" 15
+ local rc=$?
+
+ # Belt-and-braces teardown between scenarios.
+ pkill -9 -f "$STATE_DIR/sandboxes|$STATE_DIR " 2>/dev/null || true
+ pkill -9 -f 'gvproxy.*osd-gv' 2>/dev/null || true
+ rm -rf "$STATE_DIR" /tmp/osd-gv "$XDG" 2>/dev/null || true
+ # CLI may still be running; reap it.
+ kill "${CLI_PID:-0}" 2>/dev/null || true
+ sleep 1
+
+ return $rc
+}
+
+main() {
+ build_binaries
+ local overall=0
+
+ # Clean starting state.
+ pkill -9 -f 'openshell-gateway|openshell-driver-vm' 2>/dev/null || true
+ pkill -9 -f 'gvproxy.*osd-gv' 2>/dev/null || true
+ sleep 1
+
+ if ! run_scenario TERM "graceful SIGTERM"; then
+ overall=1
+ fi
+
+ if ! run_scenario KILL "abrupt SIGKILL"; then
+ overall=1
+ fi
+
+ if [ "$overall" -eq 0 ]; then
+ echo "ALL SCENARIOS PASSED"
+ else
+ echo "ONE OR MORE SCENARIOS FAILED"
+ fi
+ EXIT_CODE=$overall
+ return $overall
+}
+
+main "$@"
diff --git a/tasks/scripts/vm/vm-setup.sh b/tasks/scripts/vm/vm-setup.sh
index e7ae06d08..bccb7f754 100755
--- a/tasks/scripts/vm/vm-setup.sh
+++ b/tasks/scripts/vm/vm-setup.sh
@@ -128,4 +128,4 @@ echo " Compressed artifacts in: ${OUTPUT_DIR}"
echo ""
echo "Next steps:"
echo " mise run vm:rootfs --base # build rootfs (requires Docker)"
-echo " mise run vm # build and run the VM"
+echo " mise run gateway:vm # start openshell-gateway with the VM driver"
diff --git a/tasks/test.toml b/tasks/test.toml
index f24ea6f2b..cf45d2b6b 100644
--- a/tasks/test.toml
+++ b/tasks/test.toml
@@ -49,6 +49,5 @@ env = { UV_NO_SYNC = "1", PYTHONPATH = "python" }
run = "uv run pytest -o python_files='test_*.py' -m gpu -n ${E2E_PARALLEL:-1} e2e/python"
["e2e:vm"]
-description = "Boot openshell-vm and run smoke e2e (macOS ARM64; pass -- --vm-port=N [--vm-name=NAME] to reuse)"
-depends = ["build:docker:gateway", "vm:build"]
+description = "Start openshell-gateway with the VM compute driver and run the cluster-agnostic smoke e2e"
run = "e2e/rust/e2e-vm.sh"
diff --git a/tasks/vm.toml b/tasks/vm.toml
index ca06b08c1..0a44b4ff7 100644
--- a/tasks/vm.toml
+++ b/tasks/vm.toml
@@ -5,22 +5,25 @@
#
# Workflow:
# mise run vm:setup # one-time: download pre-built runtime (~30s)
-# mise run vm # build + run the VM
+# mise run gateway:vm # start openshell-gateway with the VM driver
+# mise run vm # build + run the standalone openshell-vm microVM
# mise run vm:clean # wipe everything and start over
#
-# See crates/openshell-vm/README.md for full documentation.
+# See crates/openshell-driver-vm/README.md for the `gateway:vm` flow and
+# crates/openshell-vm/README.md for the standalone microVM path.
# ═══════════════════════════════════════════════════════════════════════════
# Main Commands
# ═══════════════════════════════════════════════════════════════════════════
+["gateway:vm"]
+description = "Build openshell-gateway + openshell-driver-vm and start the gateway with the VM driver"
+run = "crates/openshell-driver-vm/start.sh"
+
[vm]
-description = "Build and run the openshell-vm microVM"
+description = "Build and run the standalone openshell-vm microVM"
depends = ["build:docker:gateway"]
-run = [
- "mise run vm:build",
- "tasks/scripts/vm/run-vm.sh",
-]
+run = ["mise run vm:build", "tasks/scripts/vm/run-vm.sh"]
["vm:build"]
description = "Build the openshell-vm binary with embedded runtime"
@@ -42,3 +45,7 @@ run = "tasks/scripts/vm/build-rootfs-tarball.sh"
["vm:clean"]
description = "Remove all VM cached artifacts (runtime, rootfs, builds)"
run = "tasks/scripts/vm/vm-clean.sh"
+
+["vm:smoke:orphan-cleanup"]
+description = "Smoke test: start gateway+driver, create a sandbox, signal the gateway, assert no orphaned processes survive"
+run = "tasks/scripts/vm/smoke-orphan-cleanup.sh"