diff --git a/Cargo.lock b/Cargo.lock index d5de42fb3..2d0bc6ce2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3101,6 +3101,7 @@ dependencies = [ "miette", "nix", "openshell-core", + "polling", "prost-types", "tar", "tokio", @@ -3672,6 +3673,20 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" +[[package]] +name = "polling" +version = "3.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218" +dependencies = [ + "cfg-if", + "concurrent-queue", + "hermit-abi", + "pin-project-lite", + "rustix 1.1.4", + "windows-sys 0.61.2", +] + [[package]] name = "poly1305" version = "0.8.0" diff --git a/architecture/custom-vm-runtime.md b/architecture/custom-vm-runtime.md index 548b86d17..4cafe424f 100644 --- a/architecture/custom-vm-runtime.md +++ b/architecture/custom-vm-runtime.md @@ -1,140 +1,161 @@ # Custom libkrunfw VM Runtime -> Status: Experimental and work in progress (WIP). VM support is under active development and may change. +> Status: Experimental and work in progress (WIP). The VM compute driver is +> under active development and may change. ## Overview -The OpenShell gateway VM uses [libkrun](https://github.com/containers/libkrun) to boot a -lightweight microVM with Apple Hypervisor.framework (macOS) or KVM (Linux). The kernel -is embedded inside `libkrunfw`, a companion library that packages a pre-built Linux kernel. +The OpenShell gateway uses [libkrun](https://github.com/containers/libkrun) via the +`openshell-driver-vm` compute driver to boot a lightweight microVM per sandbox. +Each VM runs on Apple Hypervisor.framework (macOS) or KVM (Linux), with the guest +kernel embedded inside `libkrunfw`. -The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, netfilter, or -conntrack support. This is insufficient for Kubernetes pod networking. 
+The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, +netfilter, or conntrack support. That is insufficient for the sandbox supervisor's +per-sandbox network namespace primitives (veth pair + iptables, see +`crates/openshell-sandbox/src/sandbox/linux/netns.rs`). The custom libkrunfw +runtime adds bridge, iptables/nftables, and conntrack support to the guest +kernel. -The custom libkrunfw runtime adds bridge CNI, iptables/nftables, and conntrack support to -the VM kernel, enabling standard Kubernetes networking. +The driver is spawned by `openshell-gateway` as a subprocess, talks to it over a +Unix domain socket (`compute-driver.sock`) with the +`openshell.compute.v1.ComputeDriver` gRPC surface, and manages per-sandbox +microVMs. The runtime (libkrun + libkrunfw + gvproxy) and the sandbox rootfs are +embedded directly in the driver binary — no sibling files required at runtime. ## Architecture ```mermaid graph TD subgraph Host["Host (macOS / Linux)"] - BIN[openshell-vm binary] - EMB["Embedded runtime (zstd-compressed)\nlibkrun · libkrunfw · gvproxy"] - CACHE["~/.local/share/openshell/vm-runtime/{version}/"] - PROV[Runtime provenance logging] - GVP[gvproxy networking proxy] - - BIN --> EMB - BIN -->|extracts to| CACHE - BIN --> PROV - BIN -->|spawns| GVP + GATEWAY["openshell-gateway
(compute::vm::spawn)"] + DRIVER["openshell-driver-vm
(compute-driver.sock)"] + EMB["Embedded runtime (zstd)
libkrun · libkrunfw · gvproxy
+ sandbox rootfs.tar.zst"] + GVP["gvproxy (per sandbox)
virtio-net · DHCP · DNS"] + + GATEWAY <-->|gRPC over UDS| DRIVER + DRIVER --> EMB + DRIVER -->|spawns one per sandbox| GVP end - subgraph Guest["Guest VM"] - INIT["openshell-vm-init.sh (PID 1)"] - VAL[Validates kernel capabilities] - CNI[Configures bridge CNI] - EXECA["Starts exec agent\nvsock port 10777"] - PKI[Generates mTLS PKI] - K3S[Execs k3s server] - EXECPY["openshell-vm-exec-agent.py"] - CHK["check-vm-capabilities.sh"] - - INIT --> VAL --> CNI --> EXECA --> PKI --> K3S + subgraph Guest["Per-sandbox microVM"] + SBXINIT["/srv/openshell-vm-sandbox-init.sh"] + SBX["/opt/openshell/bin/openshell-sandbox
(PID 1, supervisor)"] + SBXINIT --> SBX end - BIN -- "fork + krun_start_enter" --> INIT - GVP -- "virtio-net" --> Guest + DRIVER -- "fork + krun_start_enter" --> SBXINIT + GVP -- "virtio-net eth0" --> Guest + SBX -.->|"outbound ConnectSupervisor
gRPC stream"| GATEWAY CLIENT["openshell-cli"] -->|SSH over supervisor relay| GATEWAY ``` +The driver spawns **one microVM per sandbox**. Each VM boots directly into +`openshell-sandbox` as PID 1. All gateway ingress — SSH, exec, connect — rides +the supervisor-initiated `ConnectSupervisor` gRPC stream opened from inside the +guest back out to the gateway, so gvproxy is configured with `-ssh-port -1` and +never binds a host-side TCP listener. + ## Embedded Runtime -The openshell-vm binary is fully self-contained, embedding both the VM runtime libraries -and a minimal rootfs as zstd-compressed byte arrays. On first use, the binary extracts -these to XDG cache directories with progress bars: +`openshell-driver-vm` embeds the VM runtime libraries and the sandbox rootfs as +zstd-compressed byte arrays, extracting on demand: ``` -~/.local/share/openshell/vm-runtime/{version}/ +~/.local/share/openshell/vm-runtime/<version>/  # libkrun / libkrunfw / gvproxy ├── libkrun.{dylib,so} ├── libkrunfw.{5.dylib,so.5} └── gvproxy -~/.local/share/openshell/openshell-vm/{version}/instances//rootfs/ -├── usr/local/bin/k3s -├── opt/openshell/bin/openshell-sandbox -├── opt/openshell/manifests/ -└── ... +/sandboxes/<id>/rootfs/  # per-sandbox rootfs ``` -This eliminates the need for separate bundles or downloads - a single ~120MB binary -provides everything needed to run the VM. Old cache versions are automatically -cleaned up when a new version is extracted. +Old runtime cache versions are cleaned up when a new version is extracted. 
-### Hybrid Approach +### Sandbox rootfs preparation -The embedded rootfs uses a "minimal" configuration: -- Includes: Base Ubuntu, k3s binary, supervisor binary, helm charts, manifests -- Excludes: Pre-loaded container images (~1GB savings) +The rootfs tarball the driver embeds starts from the same minimal Ubuntu base +used across the project, and is **rewritten into a supervisor-only sandbox +guest** during extraction: -Container images are pulled on demand when sandboxes are created. First boot takes -~30-60s as k3s initializes; subsequent boots use cached state for ~3-5s startup. +- k3s state and Kubernetes manifests are stripped out +- `/srv/openshell-vm-sandbox-init.sh` is installed as the guest entrypoint +- the guest boots directly into `openshell-sandbox` — no k3s, no kube-proxy, + no CNI plugins -For the VM compute driver, the same embedded rootfs is rewritten into a -supervisor-only sandbox guest before boot: +See `crates/openshell-driver-vm/src/rootfs.rs` for the rewrite logic and +`crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh` for the init +script that gets installed. -- removes k3s state and Kubernetes manifests from the extracted rootfs -- installs `/srv/openshell-vm-sandbox-init.sh` -- boots directly into `openshell-sandbox` instead of `openshell-vm-init.sh` -- keeps the same embedded libkrun/libkrunfw kernel/runtime bundle +### `--internal-run-vm` helper -`openshell-driver-vm` now embeds the sandbox rootfs tarball independently so it can -prepare sandbox guests without linking against the `openshell-vm` Rust crate. -It now also embeds the minimal libkrun/libkrunfw bundle it needs for sandbox -boots and launches sandbox guests via a hidden helper mode in the -`openshell-driver-vm` binary itself, without depending on the `openshell-vm` -binary. The helper still starts its own embedded `gvproxy` instance to provide -virtio-net guest egress plus the single inbound SSH port forward used by the -compute driver. 
+The driver binary has two modes: the default mode is the gRPC server; when +launched with `--internal-run-vm` it becomes a per-sandbox launcher. The driver +spawns one launcher per sandbox as a subprocess, which in turn starts `gvproxy` +and calls `krun_start_enter` to boot the guest. Keeping the launcher in the +same binary means the driver ships a single artifact for both roles. -For fully air-gapped environments requiring pre-loaded images, build with: -```bash -mise run vm:rootfs # Full rootfs (~2GB, includes images) -mise run vm:build # Rebuild binary with full rootfs -``` +## Network Plane + +The driver launches a **dedicated `gvproxy` instance per sandbox** to provide the +guest's networking plane: + +- virtio-net backend over a Unix SOCK_STREAM (Linux) or SOCK_DGRAM (macOS vfkit) + socket, which surfaces as `eth0` inside the guest +- DHCP server + default router (192.168.127.1 / 192.168.127.2) for the guest's + udhcpc client +- DNS for host aliases: the guest init script seeds `/etc/hosts` with + `host.openshell.internal` → 192.168.127.1, while leaving gvproxy's legacy + `host.containers.internal` / `host.docker.internal` resolution intact + +The `-listen` API socket and the `-ssh-port` forwarder are both intentionally +omitted. After the supervisor-initiated relay migration the driver does not +enqueue any host-side port forwards, and the guest's SSH listener lives on a +Unix socket at `/run/openshell/ssh.sock` inside the VM that is reached over the +outbound `ConnectSupervisor` gRPC stream. Binding a host listener would race +concurrent sandboxes for port 2222 and surface a misleading "sshd is reachable" +endpoint. + +The sandbox supervisor's per-sandbox netns (veth pair + iptables) branches off +of this plane. libkrun's built-in TSI socket impersonation would not satisfy +those kernel-level primitives, which is why we need the custom libkrunfw. 
-## Network Profile +## Process Lifecycle Cleanup -The VM uses the bridge CNI profile, which requires a custom libkrunfw with bridge and -netfilter kernel support. The init script validates these capabilities at boot and fails -fast with an actionable error if they are missing. +`openshell-driver-vm` installs a cross-platform "die when my parent dies" +primitive (`procguard`) in every link of the spawn chain so that killing +`openshell-gateway` (SIGTERM, SIGKILL, or crash) reaps the driver, per-sandbox +launcher, gvproxy, and the libkrun worker: -### Bridge Profile +- Linux: `nix::sys::prctl::set_pdeathsig(SIGKILL)` +- macOS / BSDs: `smol-rs/polling` with `ProcessOps::Exit` on a helper thread +- gvproxy (the one non-Rust child) gets `PR_SET_PDEATHSIG` via `pre_exec` on + Linux, and is SIGTERM'd from the launcher's procguard cleanup callback on + macOS -- CNI: bridge plugin with `cni0` interface -- IP masquerade: enabled (iptables-legacy via CNI bridge plugin) -- kube-proxy: enabled (nftables mode) -- Service VIPs: functional (ClusterIP, NodePort) -- hostNetwork workarounds: not required +See `crates/openshell-driver-vm/src/procguard.rs` for the implementation and +`tasks/scripts/vm/smoke-orphan-cleanup.sh` (exposed as +`mise run vm:smoke:orphan-cleanup`) for the regression test that covers both +SIGTERM and SIGKILL paths. ## Runtime Provenance -At boot, the openshell-vm binary logs provenance metadata about the loaded runtime bundle: +At driver startup the loaded runtime bundle is logged with: - Library paths and SHA-256 hashes - Whether the runtime is custom-built or stock - For custom runtimes: libkrunfw commit, kernel version, build timestamp -This information is sourced from `provenance.json` (generated by the build script) -and makes it straightforward to correlate VM behavior with a specific runtime artifact. 
+This information is sourced from `provenance.json` (generated by the build +script) and makes it straightforward to correlate sandbox VM behavior with a +specific runtime artifact. ## Build Pipeline ```mermaid graph LR subgraph Source["crates/openshell-vm/runtime/"] - KCONF["kernel/openshell.kconfig\nKernel config fragment"] - README["README.md\nOperator documentation"] + KCONF["kernel/openshell.kconfig
Kernel config fragment"] end subgraph Linux["Linux CI (build-libkrun.sh)"] @@ -145,101 +166,87 @@ graph LR BUILD_M["Build libkrunfw.dylib + libkrun.dylib"] end - subgraph Output["target/libkrun-build/"] - LIB_SO["libkrunfw.so + libkrun.so\n(Linux)"] - LIB_DY["libkrunfw.dylib + libkrun.dylib\n(macOS)"] + subgraph Output["vm-runtime-<platform>.tar.zst"] + LIB_SO["libkrunfw.so + libkrun.so + gvproxy
(Linux)"] + LIB_DY["libkrunfw.dylib + libkrun.dylib + gvproxy
(macOS)"] end - KCONF --> BUILD_L - BUILD_L --> LIB_SO - KCONF --> BUILD_M - BUILD_M --> LIB_DY + KCONF --> BUILD_L --> LIB_SO + KCONF --> BUILD_M --> LIB_DY ``` +The `vm-runtime-<platform>.tar.zst` artifact is consumed by +`openshell-driver-vm`'s `build.rs`, which embeds the library set into the +binary via `include_bytes!()`. Setting `OPENSHELL_VM_RUNTIME_COMPRESSED_DIR` +at build time (wired up by `crates/openshell-driver-vm/start.sh`) points the +build at the staged artifacts. + ## Kernel Config Fragment -The `openshell.kconfig` fragment enables these kernel features on top of the stock -libkrunfw kernel: +The `openshell.kconfig` fragment enables these kernel features on top of the +stock libkrunfw kernel: | Feature | Key Configs | Purpose | |---------|-------------|---------| -| Network namespaces | `CONFIG_NET_NS`, `CONFIG_NAMESPACES` | Pod isolation | -| veth | `CONFIG_VETH` | Pod network namespace pairs | -| Bridge device | `CONFIG_BRIDGE`, `CONFIG_BRIDGE_NETFILTER` | cni0 bridge for pod networking, kube-proxy bridge traffic visibility | +| Network namespaces | `CONFIG_NET_NS`, `CONFIG_NAMESPACES` | Sandbox netns isolation | +| veth | `CONFIG_VETH` | Sandbox network namespace pairs | +| Bridge device | `CONFIG_BRIDGE`, `CONFIG_BRIDGE_NETFILTER` | Bridge support + iptables visibility into bridge traffic | | Netfilter framework | `CONFIG_NETFILTER`, `CONFIG_NETFILTER_ADVANCED`, `CONFIG_NETFILTER_XTABLES` | iptables/nftables framework | -| xtables match modules | `CONFIG_NETFILTER_XT_MATCH_CONNTRACK`, `_COMMENT`, `_MULTIPORT`, `_MARK`, `_STATISTIC`, `_ADDRTYPE`, `_RECENT`, `_LIMIT` | kube-proxy and kubelet iptables rules | +| xtables match modules | `CONFIG_NETFILTER_XT_MATCH_CONNTRACK`, `_COMMENT`, `_MULTIPORT`, `_MARK`, `_STATISTIC`, `_ADDRTYPE`, `_RECENT`, `_LIMIT` | Sandbox supervisor iptables rules | | Connection tracking | `CONFIG_NF_CONNTRACK`, `CONFIG_NF_CT_NETLINK` | NAT state tracking | -| NAT | `CONFIG_NF_NAT` | Service VIP DNAT/SNAT | -| iptables | 
`CONFIG_IP_NF_IPTABLES`, `CONFIG_IP_NF_FILTER`, `CONFIG_IP_NF_NAT`, `CONFIG_IP_NF_MANGLE` | CNI bridge masquerade and compat | -| nftables | `CONFIG_NF_TABLES`, `CONFIG_NFT_CT`, `CONFIG_NFT_NAT`, `CONFIG_NFT_MASQ`, `CONFIG_NFT_NUMGEN`, `CONFIG_NFT_FIB_IPV4` | kube-proxy nftables mode (primary) | -| IP forwarding | `CONFIG_IP_ADVANCED_ROUTER`, `CONFIG_IP_MULTIPLE_TABLES` | Pod-to-pod routing | -| IPVS | `CONFIG_IP_VS`, `CONFIG_IP_VS_RR`, `CONFIG_IP_VS_NFCT` | kube-proxy IPVS mode (optional) | -| Traffic control | `CONFIG_NET_SCH_HTB`, `CONFIG_NET_CLS_CGROUP` | Kubernetes QoS | -| Cgroups | `CONFIG_CGROUPS`, `CONFIG_CGROUP_DEVICE`, `CONFIG_MEMCG`, `CONFIG_CGROUP_PIDS` | Container resource limits | -| TUN/TAP | `CONFIG_TUN` | CNI plugin support | +| NAT | `CONFIG_NF_NAT` | Sandbox egress DNAT/SNAT | +| iptables | `CONFIG_IP_NF_IPTABLES`, `CONFIG_IP_NF_FILTER`, `CONFIG_IP_NF_NAT`, `CONFIG_IP_NF_MANGLE` | Masquerade and compat | +| nftables | `CONFIG_NF_TABLES`, `CONFIG_NFT_CT`, `CONFIG_NFT_NAT`, `CONFIG_NFT_MASQ`, `CONFIG_NFT_NUMGEN`, `CONFIG_NFT_FIB_IPV4` | nftables path | +| IP forwarding | `CONFIG_IP_ADVANCED_ROUTER`, `CONFIG_IP_MULTIPLE_TABLES` | Sandbox-to-host routing | +| Traffic control | `CONFIG_NET_SCH_HTB`, `CONFIG_NET_CLS_CGROUP` | QoS | +| Cgroups | `CONFIG_CGROUPS`, `CONFIG_CGROUP_DEVICE`, `CONFIG_MEMCG`, `CONFIG_CGROUP_PIDS` | Sandbox resource limits | +| TUN/TAP | `CONFIG_TUN` | CNI plugin compatibility; inherited from the shared kconfig, not exercised by the driver. 
| | Dummy interface | `CONFIG_DUMMY` | Fallback networking | -| Landlock | `CONFIG_SECURITY_LANDLOCK` | Filesystem sandboxing support | -| Seccomp filter | `CONFIG_SECCOMP_FILTER` | Syscall filtering support | +| Landlock | `CONFIG_SECURITY_LANDLOCK` | Sandbox supervisor filesystem sandboxing | +| Seccomp filter | `CONFIG_SECCOMP_FILTER` | Sandbox supervisor syscall filtering | -See `crates/openshell-vm/runtime/kernel/openshell.kconfig` for the full fragment with -inline comments explaining why each option is needed. +See `crates/openshell-vm/runtime/kernel/openshell.kconfig` for the full +fragment with inline comments explaining why each option is needed. ## Verification -One verification tool is provided: - -1. **Capability checker** (`check-vm-capabilities.sh`): Runs inside the VM to verify - kernel capabilities. Produces pass/fail results for each required feature. - -## Running Commands In A Live VM - -The standalone `openshell-vm` binary supports `openshell-vm exec -- ` for a running VM. - -- Each VM instance stores local runtime state next to its instance rootfs -- libkrun maps a per-instance host Unix socket into the guest on vsock port `10777` -- `openshell-vm-init.sh` starts `openshell-vm-exec-agent.py` during boot -- `openshell-vm exec` connects to the host socket, which libkrun forwards into the guest exec agent -- The guest exec agent spawns the command, then streams stdout, stderr, and exit status back -- The host-side bootstrap also uses the exec agent to read PKI cert files from the guest - (via `cat /opt/openshell/pki/`) instead of requiring a separate vsock server - -`openshell-vm exec` also injects `KUBECONFIG=/etc/rancher/k3s/k3s.yaml` by default so kubectl-style -commands work the same way they would inside the VM shell. +- **Capability checker** (`check-vm-capabilities.sh`): runs inside a sandbox VM + to verify kernel capabilities. Produces pass/fail results for each required + feature. 
+- **Orphan-cleanup smoke test**: `mise run vm:smoke:orphan-cleanup` asserts + that killing the gateway leaves zero driver, launcher, gvproxy, or libkrun + survivors. ## Build Commands -```bash +```shell # One-time setup: download pre-built runtime (~30s) mise run vm:setup -# Build and run -mise run vm - -# Build embedded binary with base rootfs (~120MB, recommended) -mise run vm:rootfs -- --base # Build base rootfs tarball -mise run vm:build # Build binary with embedded rootfs - -# Build with full rootfs (air-gapped, ~2GB+) -mise run vm:rootfs # Build full rootfs tarball -mise run vm:build # Rebuild binary +# Start openshell-gateway with the VM compute driver +mise run gateway:vm # With custom kernel (optional, adds ~20 min) -FROM_SOURCE=1 mise run vm:setup # Build runtime from source -mise run vm:build # Then build embedded binary +FROM_SOURCE=1 mise run vm:setup # Wipe everything and start over mise run vm:clean ``` +See `crates/openshell-driver-vm/README.md` for the full driver workflow, +including multi-gateway development, CLI registration, and sandbox creation +examples. + ## CI/CD -The openshell-vm build is split into two GitHub Actions workflows that publish to a -rolling `vm-dev` GitHub Release: +Two GitHub Actions workflows back the driver's release artifacts, both +publishing to a rolling `vm-dev` GitHub Release: ### Kernel Runtime (`release-vm-kernel.yml`) -Builds the custom libkrunfw (kernel firmware), libkrun (VMM), and gvproxy for all -supported platforms. Runs on-demand or when the kernel config / pinned versions change. +Builds the custom libkrunfw (kernel firmware), libkrun (VMM), and gvproxy for +all supported platforms. Runs on-demand or when the kernel config / pinned +versions change. | Platform | Runner | Build Method | |----------|--------|-------------| @@ -247,43 +254,36 @@ supported platforms. 
Runs on-demand or when the kernel config / pinned versions | Linux x86_64 | `build-amd64` (self-hosted) | Native `build-libkrun.sh` | | macOS ARM64 | `macos-latest-xlarge` (GitHub-hosted) | `build-libkrun-macos.sh` | -Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, gvproxy, and -provenance metadata. - -Each platform builds its own libkrunfw and libkrun natively. The kernel inside -libkrunfw is always Linux regardless of host platform. +Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, +gvproxy, and provenance metadata. Each platform builds its own libkrunfw and +libkrun natively; the kernel inside libkrunfw is always Linux regardless of +host platform. -### VM Binary (`release-vm-dev.yml`) +### Driver Binary (`release-vm-dev.yml`) -Builds the self-extracting openshell-vm binary for all platforms. Runs on every push -to `main` that touches VM-related crates. +Builds the self-contained `openshell-driver-vm` binary for every platform, +with the kernel runtime + sandbox rootfs embedded. Runs on every push to +`main` that touches VM-related crates. -```mermaid -graph TD - CV[compute-versions] --> DL[download-kernel-runtime\nfrom vm-dev release] - DL --> RFS_ARM[build-rootfs arm64] - DL --> RFS_AMD[build-rootfs amd64] - RFS_ARM --> VM_ARM[build-vm linux-arm64] - RFS_AMD --> VM_AMD[build-vm linux-amd64] - RFS_ARM --> VM_MAC["build-vm-macos\n(osxcross, reuses arm64 rootfs)"] - VM_ARM --> REL[release-vm-dev\nupload to rolling release] - VM_AMD --> REL - VM_MAC --> REL -``` +The `download-kernel-runtime` job pulls the current `vm-runtime-<platform>.tar.zst` +from the `vm-dev` release; the `build-openshell-driver-vm` jobs set +`OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed` and +run `cargo build --release -p openshell-driver-vm`. The macOS driver is +cross-compiled via osxcross (no macOS runner needed for the binary build — +only for the kernel build). 
-The macOS binary is cross-compiled via osxcross (no macOS runner needed for the binary -build — only for the kernel build). The macOS VM guest is always Linux ARM64, so it -reuses the arm64 rootfs. - -macOS binaries produced via osxcross are not codesigned. Users must self-sign: -```bash -codesign --entitlements crates/openshell-vm/entitlements.plist --force -s - ./openshell-vm -``` +macOS driver binaries produced via osxcross are not codesigned. Development +builds are signed automatically by `crates/openshell-driver-vm/start.sh`; a +packaged release needs signing in CI. ## Rollout Strategy -1. Custom runtime is embedded by default when building with `mise run vm:build`. -2. The init script validates kernel capabilities at boot and fails fast if missing. -3. For development, override with `OPENSHELL_VM_RUNTIME_DIR` to use a local directory. -4. In CI, kernel runtime is pre-built and cached in the `vm-dev` release. The binary - build downloads it via `download-kernel-runtime.sh`. +1. Custom runtime is embedded by default when building `openshell-driver-vm` + with `OPENSHELL_VM_RUNTIME_COMPRESSED_DIR` set (wired up by + `crates/openshell-driver-vm/start.sh`). +2. The sandbox init script validates kernel capabilities at boot and fails + fast if missing. +3. For development, override with `OPENSHELL_VM_RUNTIME_DIR` to use a local + directory instead of the extracted cache. +4. In CI, the kernel runtime is pre-built and cached in the `vm-dev` release. + The driver build downloads it via `download-kernel-runtime.sh`. 
diff --git a/architecture/gateway.md b/architecture/gateway.md index 5dd2419af..9e9da6785 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -605,7 +605,7 @@ The gateway reaches the sandbox exclusively through the supervisor-initiated `Co - **Create**: The VM driver process allocates a sandbox-specific rootfs from its own embedded `rootfs.tar.zst`, injects an explicitly configured guest mTLS bundle when the gateway callback endpoint is `https://`, then re-execs itself in a hidden helper mode that loads libkrun directly and boots the supervisor. - **Networking**: The helper starts an embedded `gvproxy`, wires it into libkrun as virtio-net, and gives the guest outbound connectivity. No inbound TCP listener is needed — the supervisor reaches the gateway over its outbound `ConnectSupervisor` stream. -- **Gateway callback**: The guest init script configures `eth0` for gvproxy networking, prefers the configured `OPENSHELL_GRPC_ENDPOINT`, and falls back to host aliases or the gvproxy gateway IP (`192.168.127.1`) when local hostname resolution is unavailable on macOS. +- **Gateway callback**: The guest init script configures `eth0` for gvproxy networking, seeds `/etc/hosts` so `host.openshell.internal` resolves to the gvproxy gateway IP (`192.168.127.1`), preserves gvproxy's legacy `host.containers.internal` / `host.docker.internal` DNS answers, prefers the configured `OPENSHELL_GRPC_ENDPOINT`, and falls back to those aliases or the raw gateway IP when local hostname resolution is unavailable on macOS. - **Guest boot**: The sandbox guest runs a minimal init script that starts `openshell-sandbox` directly as PID 1 inside the VM. - **Watch stream**: Emits provisioning, ready, error, deleting, deleted, and platform-event updates so the gateway store remains the durable source of truth. 
diff --git a/crates/openshell-driver-vm/Cargo.toml b/crates/openshell-driver-vm/Cargo.toml index 368716ef9..b4d92b0fc 100644 --- a/crates/openshell-driver-vm/Cargo.toml +++ b/crates/openshell-driver-vm/Cargo.toml @@ -37,5 +37,13 @@ libloading = "0.8" tar = "0.4" zstd = "0.13" +# smol-rs/polling drives the BSD/macOS parent-death detection in +# procguard via kqueue's EVFILT_PROC / NOTE_EXIT filter. We could use +# it on Linux too (via epoll + pidfd) but sticking with +# nix::sys::prctl::set_pdeathsig there keeps the Linux path a single +# syscall with no helper thread. +[target.'cfg(any(target_os = "macos", target_os = "ios", target_os = "freebsd", target_os = "netbsd", target_os = "openbsd", target_os = "dragonfly"))'.dependencies] +polling = "3.11" + [lints] workspace = true diff --git a/crates/openshell-driver-vm/Makefile b/crates/openshell-driver-vm/Makefile deleted file mode 100644 index e1c360f3d..000000000 --- a/crates/openshell-driver-vm/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -.PHONY: start - -start: - ./start.sh diff --git a/crates/openshell-driver-vm/README.md b/crates/openshell-driver-vm/README.md index a95462695..8808b25d9 100644 --- a/crates/openshell-driver-vm/README.md +++ b/crates/openshell-driver-vm/README.md @@ -31,19 +31,15 @@ Sandbox guests execute `/opt/openshell/bin/openshell-sandbox` as PID 1 inside th ## Quick start (recommended) -`start.sh` handles runtime setup, builds, codesigning, and environment wiring. From the repo root: ```shell -crates/openshell-driver-vm/start.sh +mise run gateway:vm ``` -or equivalently: - -```shell -make -C crates/openshell-driver-vm start -``` -First run takes a few minutes while `mise run vm:setup` stages libkrun/libkrunfw/gvproxy and `mise run vm:rootfs -- --base` builds the embedded rootfs. Subsequent runs are cached. 
State lives under `target/openshell-vm-driver-dev/` (SQLite DB + per-sandbox rootfs + `compute-driver.sock`). +First run takes a few minutes while `mise run vm:setup` stages libkrun/libkrunfw/gvproxy and `mise run vm:rootfs -- --base` builds the embedded rootfs. Subsequent runs are cached. To keep the Unix socket path under macOS `SUN_LEN`, `mise run gateway:vm` and `start.sh` default the state dir to `/tmp/openshell-vm-driver-dev-$USER-port-$PORT/` (SQLite DB + per-sandbox rootfs + `compute-driver.sock`) unless `OPENSHELL_VM_DRIVER_STATE_DIR` is set. +The wrapper also prints the recommended gateway name (`vm-driver-port-$PORT` by default) plus the exact repo-local `scripts/bin/openshell gateway add` and `scripts/bin/openshell gateway select` commands to use from another terminal. This avoids accidentally hitting an older `openshell` binary elsewhere on your `PATH`. +It also exports `OPENSHELL_DRIVER_DIR=$PWD/target/debug` before starting the gateway so local dev runs use the freshly built `openshell-driver-vm` instead of an older installed copy from `~/.local/libexec/openshell` or `/usr/local/libexec`. Override via environment: @@ -53,10 +49,33 @@ OPENSHELL_SSH_HANDSHAKE_SECRET=$(openssl rand -hex 32) \ crates/openshell-driver-vm/start.sh ``` +Run multiple dev gateways side by side by giving each one a unique port. 
The wrapper derives a distinct default state dir from that port automatically: + +```shell +OPENSHELL_SERVER_PORT=8080 mise run gateway:vm +OPENSHELL_SERVER_PORT=8081 mise run gateway:vm +``` + +If you want a custom suffix instead of `port-$PORT`, set `OPENSHELL_VM_INSTANCE`: + +```shell +OPENSHELL_SERVER_PORT=8082 \ +OPENSHELL_VM_INSTANCE=feature-a \ +mise run gateway:vm +``` + +If you want a custom CLI gateway name, set `OPENSHELL_VM_GATEWAY_NAME`: + +```shell +OPENSHELL_SERVER_PORT=8082 \ +OPENSHELL_VM_GATEWAY_NAME=vm-feature-a \ +mise run gateway:vm +``` + Teardown: ```shell -rm -rf target/openshell-vm-driver-dev +rm -rf /tmp/openshell-vm-driver-dev-$USER-port-8080 ``` ## Manual equivalent @@ -78,16 +97,17 @@ codesign \ --force -s - target/debug/openshell-driver-vm # 4. Start the gateway with the VM driver -mkdir -p target/openshell-vm-driver-dev +mkdir -p /tmp/openshell-vm-driver-dev-$USER-port-8080 target/debug/openshell-gateway \ --drivers vm \ --disable-tls \ - --database-url sqlite:target/openshell-vm-driver-dev/openshell.db \ + --database-url sqlite:/tmp/openshell-vm-driver-dev-$USER-port-8080/openshell.db \ + --driver-dir $PWD/target/debug \ --grpc-endpoint http://host.containers.internal:8080 \ --ssh-handshake-secret dev-vm-driver-secret \ --ssh-gateway-host 127.0.0.1 \ --ssh-gateway-port 8080 \ - --vm-driver-state-dir $PWD/target/openshell-vm-driver-dev + --vm-driver-state-dir /tmp/openshell-vm-driver-dev-$USER-port-8080 ``` The gateway resolves `openshell-driver-vm` in this order: `--driver-dir`, conventional install locations (`~/.local/libexec/openshell`, `/usr/local/libexec/openshell`, `/usr/local/libexec`), then a sibling of the gateway binary. @@ -97,7 +117,7 @@ The gateway resolves `openshell-driver-vm` in this order: `--driver-dir`, conven | Flag | Env var | Default | Purpose | |---|---|---|---| | `--drivers vm` | `OPENSHELL_DRIVERS` | `kubernetes` | Select the VM compute driver. 
| -| `--grpc-endpoint URL` | `OPENSHELL_GRPC_ENDPOINT` | — | Required. URL the sandbox guest calls back to. Use a host alias that resolves to the gateway's host from inside the VM (gvproxy answers `host.containers.internal` and `host.openshell.internal` to `192.168.127.1`). | +| `--grpc-endpoint URL` | `OPENSHELL_GRPC_ENDPOINT` | — | Required. URL the sandbox guest calls back to. Use a host alias that resolves to the gateway's host from inside the VM (`host.containers.internal` comes from gvproxy DNS; the guest init script also seeds `host.openshell.internal` to `192.168.127.1`). | | `--vm-driver-state-dir DIR` | `OPENSHELL_VM_DRIVER_STATE_DIR` | `target/openshell-vm-driver` | Per-sandbox rootfs, console logs, and the `compute-driver.sock` UDS. | | `--driver-dir DIR` | `OPENSHELL_DRIVER_DIR` | unset | Override the directory searched for `openshell-driver-vm`. | | `--vm-driver-vcpus N` | `OPENSHELL_VM_DRIVER_VCPUS` | `2` | vCPUs per sandbox. | diff --git a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh index 70dda5acb..e449003f9 100644 --- a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh +++ b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh @@ -9,6 +9,7 @@ set -euo pipefail BOOT_START=$(date +%s%3N 2>/dev/null || date +%s) +GVPROXY_GATEWAY_IP="192.168.127.1" ts() { local now @@ -72,6 +73,20 @@ tcp_probe() { fi } +ensure_host_gateway_aliases() { + local hosts_tmp="/tmp/openshell-hosts.$$" + + if [ -f /etc/hosts ]; then + grep -vE '(^|[[:space:]])host\.openshell\.internal([[:space:]]|$)' /etc/hosts > "$hosts_tmp" || true + else + : > "$hosts_tmp" + fi + + printf '%s host.openshell.internal\n' "$GVPROXY_GATEWAY_IP" >> "$hosts_tmp" + cat "$hosts_tmp" > /etc/hosts + rm -f "$hosts_tmp" +} + rewrite_openshell_endpoint_if_needed() { local endpoint="${OPENSHELL_ENDPOINT:-}" [ -n "$endpoint" ] || return 0 @@ -92,7 +107,7 @@ rewrite_openshell_endpoint_if_needed() 
{ return 0 fi - for candidate in host.containers.internal host.docker.internal 192.168.127.1; do + for candidate in host.openshell.internal host.containers.internal host.docker.internal "$GVPROXY_GATEWAY_IP"; do if [ "$candidate" = "$host" ]; then continue fi @@ -163,18 +178,20 @@ DHCP_SCRIPT if ! udhcpc -i eth0 -f -q -n -T 1 -t 3 -A 1 -s "$UDHCPC_SCRIPT" 2>&1; then ts "WARNING: DHCP failed, falling back to static config" ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 2>/dev/null || true + ip route add default via "$GVPROXY_GATEWAY_IP" 2>/dev/null || true fi else ts "no DHCP client, using static config" ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 2>/dev/null || true + ip route add default via "$GVPROXY_GATEWAY_IP" 2>/dev/null || true fi if [ ! -s /etc/resolv.conf ]; then echo "nameserver 8.8.8.8" > /etc/resolv.conf echo "nameserver 8.8.4.4" >> /etc/resolv.conf fi + + ensure_host_gateway_aliases else ts "WARNING: eth0 not found; supervisor will start without guest egress" fi diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index 8237ba03c..d649a585a 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -33,6 +33,8 @@ const DRIVER_NAME: &str = "openshell-driver-vm"; const WATCH_BUFFER: usize = 256; const DEFAULT_VCPUS: u8 = 2; const DEFAULT_MEM_MIB: u32 = 2048; +const GVPROXY_GATEWAY_IP: &str = "192.168.127.1"; +const OPENSHELL_HOST_GATEWAY_ALIAS: &str = "host.openshell.internal"; const GUEST_SSH_SOCKET_PATH: &str = "/run/openshell/ssh.sock"; const GUEST_TLS_DIR: &str = "/opt/openshell/tls"; const GUEST_TLS_CA_PATH: &str = "/opt/openshell/tls/ca.crt"; @@ -147,7 +149,7 @@ fn validate_openshell_endpoint(endpoint: &str) -> Result<(), String> { if invalid_from_vm { return Err(format!( - "openshell endpoint '{endpoint}' is not reachable from sandbox VMs; use a concrete 
host such as 127.0.0.1, host.containers.internal, or another routable address" + "openshell endpoint '{endpoint}' is not reachable from sandbox VMs; use a concrete host such as 127.0.0.1, {OPENSHELL_HOST_GATEWAY_ALIAS}, or another routable address" )); } @@ -256,7 +258,19 @@ impl VmDriver { let console_output = state_dir.join("rootfs-console.log"); let mut command = Command::new(&self.launcher_bin); - command.kill_on_drop(true); + // Intentionally DO NOT set kill_on_drop(true). On a signal-driven + // driver exit (SIGKILL, SIGTERM without a handler, panic), + // tokio's Drop is racy with the launcher's procguard-initiated + // cleanup: if kill_on_drop SIGKILLs the launcher first, its + // cleanup callback never gets to SIGTERM gvproxy, and gvproxy is + // reparented to init as an orphan. Instead the whole cleanup + // cascade runs via procguard: + // driver exits → launcher's kqueue (macOS) or PR_SET_PDEATHSIG + // (Linux) fires → launcher kills gvproxy + libkrun fork → + // launcher exits → its own children die under pdeathsig. + // The explicit Drop path in VmProcess::terminate_vm_process still + // handles voluntary `delete_sandbox` teardown cleanly, where we + // do want SIGTERM + wait + SIGKILL semantics. command.stdin(Stdio::null()); command.stdout(Stdio::inherit()); command.stderr(Stdio::inherit()); @@ -403,16 +417,23 @@ impl VmDriver { snapshots } + /// Watch the launcher child process and surface errors as driver + /// conditions. + /// + /// The driver no longer owns the `Ready` transition — the gateway + /// promotes a sandbox to `Ready` the moment its supervisor session + /// lands (see `openshell-server/src/compute/mod.rs`). This loop only + /// handles the sad paths: the child process failing to start, exiting + /// abnormally, or becoming unpollable. Those still surface as driver + /// `Error` conditions so the gateway can reason about a dead VM. 
async fn monitor_sandbox(&self, sandbox_id: String) { - let mut ready_emitted = false; - loop { - let (process, state_dir) = { + let process = { let registry = self.registry.lock().await; let Some(record) = registry.get(&sandbox_id) else { return; }; - (record.process.clone(), record.state_dir.clone()) + record.process.clone() }; let exit_status = { @@ -469,16 +490,6 @@ impl VmDriver { return; } - if !ready_emitted && guest_ssh_ready(&state_dir).await { - if let Some(snapshot) = self - .set_snapshot_condition(&sandbox_id, ready_condition(), false) - .await - { - self.publish_snapshot(snapshot); - } - ready_emitted = true; - } - tokio::time::sleep(Duration::from_millis(250)).await; } } @@ -726,7 +737,7 @@ fn guest_visible_openshell_endpoint(endpoint: &str) -> String { None => false, }; - if should_rewrite && url.set_host(Some("192.168.127.1")).is_ok() { + if should_rewrite && url.set_host(Some(GVPROXY_GATEWAY_IP)).is_ok() { return url.to_string(); } @@ -843,16 +854,6 @@ async fn terminate_vm_process(child: &mut Child) -> Result<(), std::io::Error> { } } -async fn guest_ssh_ready(state_dir: &Path) -> bool { - let console_log = state_dir.join("rootfs-console.log"); - let Ok(contents) = tokio::fs::read_to_string(console_log).await else { - return false; - }; - - contents.contains("SSH server is ready to accept connections") - || contents.contains("SSH server listening") -} - fn sandbox_snapshot(sandbox: &Sandbox, condition: SandboxCondition, deleting: bool) -> Sandbox { Sandbox { id: sandbox.id.clone(), @@ -895,16 +896,6 @@ fn provisioning_condition() -> SandboxCondition { } } -fn ready_condition() -> SandboxCondition { - SandboxCondition { - r#type: "Ready".to_string(), - status: "True".to_string(), - reason: "Listening".to_string(), - message: "Supervisor is listening for SSH connections".to_string(), - last_transition_time: String::new(), - } -} - fn deleting_condition() -> SandboxCondition { SandboxCondition { r#type: "Ready".to_string(), @@ -1030,19 +1021,47 @@ 
mod tests { let env = build_guest_environment(&sandbox, &config); assert!(env.contains(&"HOME=/root".to_string())); - assert!(env.contains(&"OPENSHELL_ENDPOINT=http://192.168.127.1:8080/".to_string())); + assert!(env.contains(&format!( + "OPENSHELL_ENDPOINT=http://{GVPROXY_GATEWAY_IP}:8080/" + ))); assert!(env.contains(&"OPENSHELL_SANDBOX_ID=sandbox-123".to_string())); assert!(env.contains(&format!( "OPENSHELL_SSH_SOCKET_PATH={GUEST_SSH_SOCKET_PATH}" ))); } + #[test] + fn guest_visible_openshell_endpoint_rewrites_loopback_hosts_to_gvproxy_gateway() { + assert_eq!( + guest_visible_openshell_endpoint("http://127.0.0.1:8080"), + format!("http://{GVPROXY_GATEWAY_IP}:8080/") + ); + assert_eq!( + guest_visible_openshell_endpoint("http://localhost:8080"), + format!("http://{GVPROXY_GATEWAY_IP}:8080/") + ); + assert_eq!( + guest_visible_openshell_endpoint("https://[::1]:8443"), + format!("https://{GVPROXY_GATEWAY_IP}:8443/") + ); + } + #[test] fn guest_visible_openshell_endpoint_preserves_non_loopback_hosts() { + assert_eq!( + guest_visible_openshell_endpoint(&format!( + "http://{OPENSHELL_HOST_GATEWAY_ALIAS}:8080" + )), + format!("http://{OPENSHELL_HOST_GATEWAY_ALIAS}:8080") + ); assert_eq!( guest_visible_openshell_endpoint("http://host.containers.internal:8080"), "http://host.containers.internal:8080" ); + assert_eq!( + guest_visible_openshell_endpoint(&format!("http://{GVPROXY_GATEWAY_IP}:8080")), + format!("http://{GVPROXY_GATEWAY_IP}:8080") + ); assert_eq!( guest_visible_openshell_endpoint("https://gateway.internal:8443"), "https://gateway.internal:8443" @@ -1157,9 +1176,9 @@ mod tests { fn validate_openshell_endpoint_accepts_host_gateway() { validate_openshell_endpoint("http://host.containers.internal:8080") .expect("guest-reachable host alias should be accepted"); - validate_openshell_endpoint("http://192.168.127.1:8080") + validate_openshell_endpoint(&format!("http://{GVPROXY_GATEWAY_IP}:8080")) .expect("gateway IP should be accepted"); - 
validate_openshell_endpoint("http://host.openshell.internal:8080") + validate_openshell_endpoint(&format!("http://{OPENSHELL_HOST_GATEWAY_ALIAS}:8080")) .expect("openshell host alias should be accepted"); validate_openshell_endpoint("https://gateway.internal:8443") .expect("dns endpoint should be accepted"); @@ -1214,32 +1233,6 @@ mod tests { let _ = std::fs::remove_dir_all(base); } - #[tokio::test] - async fn guest_ssh_ready_detects_guest_console_marker() { - let base = unique_temp_dir(); - std::fs::create_dir_all(&base).unwrap(); - std::fs::write( - base.join("rootfs-console.log"), - "...\nINFO openshell_sandbox: SSH server is ready to accept connections\n", - ) - .unwrap(); - - assert!(guest_ssh_ready(&base).await); - - let _ = std::fs::remove_dir_all(base); - } - - #[tokio::test] - async fn guest_ssh_ready_is_false_without_marker() { - let base = unique_temp_dir(); - std::fs::create_dir_all(&base).unwrap(); - std::fs::write(base.join("rootfs-console.log"), "sandbox booting\n").unwrap(); - - assert!(!guest_ssh_ready(&base).await); - - let _ = std::fs::remove_dir_all(base); - } - fn unique_temp_dir() -> PathBuf { static COUNTER: AtomicU64 = AtomicU64::new(0); let nanos = SystemTime::now() diff --git a/crates/openshell-driver-vm/src/ffi.rs b/crates/openshell-driver-vm/src/ffi.rs index 750788ac1..a81b150af 100644 --- a/crates/openshell-driver-vm/src/ffi.rs +++ b/crates/openshell-driver-vm/src/ffi.rs @@ -37,7 +37,6 @@ type KrunSetExec = unsafe extern "C" fn( argv: *const *const c_char, envp: *const *const c_char, ) -> i32; -type KrunSetPortMap = unsafe extern "C" fn(ctx_id: u32, port_map: *const *const c_char) -> i32; type KrunSetConsoleOutput = unsafe extern "C" fn(ctx_id: u32, filepath: *const c_char) -> i32; type KrunStartEnter = unsafe extern "C" fn(ctx_id: u32) -> i32; type KrunDisableImplicitVsock = unsafe extern "C" fn(ctx_id: u32) -> i32; @@ -68,7 +67,6 @@ pub struct LibKrun { pub krun_set_root: KrunSetRoot, pub krun_set_workdir: KrunSetWorkdir, pub 
krun_set_exec: KrunSetExec, - pub krun_set_port_map: KrunSetPortMap, pub krun_set_console_output: KrunSetConsoleOutput, pub krun_start_enter: KrunStartEnter, pub krun_disable_implicit_vsock: KrunDisableImplicitVsock, @@ -121,7 +119,6 @@ impl LibKrun { krun_set_root: load_symbol(library, b"krun_set_root\0", &libkrun_path)?, krun_set_workdir: load_symbol(library, b"krun_set_workdir\0", &libkrun_path)?, krun_set_exec: load_symbol(library, b"krun_set_exec\0", &libkrun_path)?, - krun_set_port_map: load_symbol(library, b"krun_set_port_map\0", &libkrun_path)?, krun_set_console_output: load_symbol( library, b"krun_set_console_output\0", diff --git a/crates/openshell-driver-vm/src/lib.rs b/crates/openshell-driver-vm/src/lib.rs index 1c424deeb..772db47b3 100644 --- a/crates/openshell-driver-vm/src/lib.rs +++ b/crates/openshell-driver-vm/src/lib.rs @@ -4,10 +4,9 @@ pub mod driver; mod embedded_runtime; mod ffi; +pub mod procguard; mod rootfs; mod runtime; -pub const GUEST_SSH_PORT: u16 = 2222; - pub use driver::{VmDriver, VmDriverConfig}; pub use runtime::{VM_RUNTIME_DIR_ENV, VmLaunchConfig, configured_runtime_dir, run_vm}; diff --git a/crates/openshell-driver-vm/src/main.rs b/crates/openshell-driver-vm/src/main.rs index 3a7976273..5a675e78a 100644 --- a/crates/openshell-driver-vm/src/main.rs +++ b/crates/openshell-driver-vm/src/main.rs @@ -6,7 +6,8 @@ use miette::{IntoDiagnostic, Result}; use openshell_core::VERSION; use openshell_core::proto::compute::v1::compute_driver_server::ComputeDriverServer; use openshell_driver_vm::{ - VM_RUNTIME_DIR_ENV, VmDriver, VmDriverConfig, VmLaunchConfig, configured_runtime_dir, run_vm, + VM_RUNTIME_DIR_ENV, VmDriver, VmDriverConfig, VmLaunchConfig, configured_runtime_dir, + procguard, run_vm, }; use std::net::SocketAddr; use std::path::PathBuf; @@ -34,9 +35,6 @@ struct Args { #[arg(long, hide = true)] vm_env: Vec, - #[arg(long, hide = true)] - vm_port: Vec, - #[arg(long, hide = true)] vm_console_output: Option, @@ -101,6 +99,14 @@ struct 
Args { async fn main() -> Result<()> { let args = Args::parse(); if args.internal_run_vm { + // We intentionally defer procguard arming until `run_vm()` so + // that the only arm is the one that knows how to clean up + // gvproxy. Racing two watchers against the same parent-death + // event causes the bare arm's `exit(1)` to win, skipping the + // gvproxy cleanup and leaking the helper. The risk window + // before `run_vm` arms procguard is ~a few syscalls long + // (`build_vm_launch_config`, `configured_runtime_dir`), which + // is negligible next to the parent gRPC server's uptime. maybe_reexec_internal_vm_with_runtime_env()?; let config = build_vm_launch_config(&args).map_err(|err| miette::miette!("{err}"))?; run_vm(&config).map_err(|err| miette::miette!("{err}"))?; @@ -113,6 +119,18 @@ async fn main() -> Result<()> { ) .init(); + // Arm procguard so that if the gateway is killed (SIGKILL or crash) + // we also die. Without this the driver is reparented to init and + // keeps its per-sandbox VM launchers alive forever. Launchers have + // their own procguards (armed in `run_vm`) which cascade cleanup of + // gvproxy and the libkrun worker the moment this driver exits. 
+ if let Err(err) = procguard::die_with_parent() { + tracing::warn!( + error = %err, + "procguard arm failed; gateway crashes may orphan this driver" + ); + } + let driver = VmDriver::new(VmDriverConfig { openshell_endpoint: args .openshell_endpoint @@ -183,7 +201,6 @@ fn build_vm_launch_config(args: &Args) -> std::result::Result std::result::Result Result<()> { + use std::os::unix::process::CommandExt as _; + const REEXEC_ENV: &str = "__OPENSHELL_DRIVER_VM_REEXEC"; if std::env::var_os(REEXEC_ENV).is_some() { @@ -213,14 +232,23 @@ fn maybe_reexec_internal_vm_with_runtime_env() -> Result<()> { .map_err(|err| miette::miette!("join DYLD_LIBRARY_PATH: {err}"))?; let exe = std::env::current_exe().into_diagnostic()?; let args: Vec = std::env::args().skip(1).collect(); - let status = std::process::Command::new(exe) + + // Use execvp() so the current process is *replaced* by the re-exec'd + // binary — no wrapper process sits between the compute driver and + // the actually-running VM launcher. That avoids two problems: + // 1. An extra process level that survives SIGKILL of the driver + // (the wrapper was reparenting the re-exec'd child to init). + // 2. Signal forwarding: with a wrapper, a SIGTERM to the wrapper + // doesn't reach the child unless we hand-roll forwarding. + // After exec, the child inherits our PID and our procguard arming. + let err = std::process::Command::new(exe) .args(&args) .env("DYLD_LIBRARY_PATH", &joined) .env(VM_RUNTIME_DIR_ENV, runtime_dir) .env(REEXEC_ENV, "1") - .status() - .into_diagnostic()?; - std::process::exit(status.code().unwrap_or(1)); + .exec(); + // `exec()` only returns on failure. 
+ Err(miette::miette!("failed to re-exec with runtime env: {err}")) } #[cfg(not(target_os = "macos"))] diff --git a/crates/openshell-driver-vm/src/procguard.rs b/crates/openshell-driver-vm/src/procguard.rs new file mode 100644 index 000000000..1d91880f7 --- /dev/null +++ b/crates/openshell-driver-vm/src/procguard.rs @@ -0,0 +1,196 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Cross-platform "die when my parent dies" primitive. +//! +//! The VM driver spawns a chain of subprocesses (compute driver → `--internal-run-vm` +//! launcher → gvproxy + libkrun fork). If any link in that chain is killed +//! with SIGKILL — or simply crashes — the children are reparented to init +//! and survive indefinitely, leaking libkrun workers and gvproxy +//! instances. +//! +//! This module exposes two functions: +//! * [`die_with_parent`] — configure the kernel (Linux) or a helper +//! thread (BSDs, incl. macOS) to SIGKILL the current process when its +//! parent dies. Call it from `main` in every subprocess we spawn +//! along the chain. Idempotent-ish (each call is a full setup — see +//! the runtime.rs comment at the single call site). +//! * [`die_with_parent_cleanup`] — same as above, but on the BSD path a +//! best-effort cleanup callback runs *before* this process exits. +//! This matters when we own a non-Rust child (e.g. gvproxy) that +//! cannot arm its own procguard; the callback lets us SIGTERM it +//! first. +//! +//! The Linux path uses `nix::sys::prctl::set_pdeathsig(SIGKILL)`, and +//! the BSD path uses `smol-rs/polling` with its `kqueue::Process` + +//! `ProcessOps::Exit` filter. Both are well-tested library surfaces; +//! we keep only the glue code and the pre-arming parent-liveness +//! re-check. + +/// Arrange for the current process to receive SIGKILL if its parent dies. 
+/// +/// On Linux this sets `PR_SET_PDEATHSIG` to SIGKILL (via +/// `nix::sys::prctl`). The kernel delivers SIGKILL the moment +/// `getppid()` changes away from the original parent. +/// +/// On the BSD family (macOS, FreeBSD, etc.) this spawns a detached +/// helper thread that uses `kqueue` with `EVFILT_PROC | NOTE_EXIT` on +/// the parent PID. When the parent exits the thread calls `exit(1)`, +/// which is sufficient for our use case — we are not a critical daemon +/// that needs to drain state; we are a VM launcher / gRPC driver whose +/// entire job is tied to the parent's lifetime. +pub fn die_with_parent() -> Result<(), String> { + die_with_parent_cleanup(|| ()) +} + +/// Like [`die_with_parent`], but run `cleanup` (best-effort, +/// async-signal-unsafe — it runs on the helper thread) immediately +/// before terminating the process. Use this when we own children that +/// cannot arm their own procguard; the cleanup hook is the only chance +/// we get to send them SIGTERM after the kernel reparents us. +/// +/// On Linux the cleanup is a no-op: `PR_SET_PDEATHSIG` delivers SIGKILL +/// directly to us, there is no Rust-controlled moment between "parent +/// died" and "we die" in which we could run a callback. +pub fn die_with_parent_cleanup(cleanup: F) -> Result<(), String> +where + F: FnOnce() + Send + 'static, +{ + #[cfg(target_os = "linux")] + { + // Linux has no opportunity for a cleanup hook — the kernel + // delivers SIGKILL directly. Callers that need pre-exit cleanup + // must combine this with a `pre_exec` PR_SET_PDEATHSIG on their + // children (so the kernel cascades) or rely on process-group + // killpg from a signal handler in the parent. 
+ let _ = cleanup; // intentionally dropped + install_linux_pdeathsig() + } + + #[cfg(any( + target_os = "macos", + target_os = "ios", + target_os = "freebsd", + target_os = "netbsd", + target_os = "openbsd", + target_os = "dragonfly", + ))] + { + install_bsd_kqueue_watcher(cleanup) + } + + #[cfg(not(any( + target_os = "linux", + target_os = "macos", + target_os = "ios", + target_os = "freebsd", + target_os = "netbsd", + target_os = "openbsd", + target_os = "dragonfly", + )))] + { + let _ = cleanup; + Ok(()) + } +} + +#[cfg(target_os = "linux")] +fn install_linux_pdeathsig() -> Result<(), String> { + use nix::sys::signal::Signal; + use nix::unistd::getppid; + + // Race: if the parent already died between fork/exec and this call, + // `getppid()` now returns 1 and PR_SET_PDEATHSIG will never fire. + // Read the current parent first so we can detect that case and exit. + let original_ppid = getppid(); + if original_ppid == nix::unistd::Pid::from_raw(1) { + return Err("process was already orphaned before procguard armed".to_string()); + } + + nix::sys::prctl::set_pdeathsig(Signal::SIGKILL) + .map_err(|err| format!("prctl(PR_SET_PDEATHSIG) failed: {err}"))?; + + // Re-check after arming: the parent may have died between getppid() + // and prctl(). If so, PR_SET_PDEATHSIG missed its window. 
+ if getppid() != original_ppid { + return Err("parent exited before procguard could arm".to_string()); + } + + Ok(()) +} + +#[cfg(any( + target_os = "macos", + target_os = "ios", + target_os = "freebsd", + target_os = "netbsd", + target_os = "openbsd", + target_os = "dragonfly", +))] +fn install_bsd_kqueue_watcher(cleanup: F) -> Result<(), String> +where + F: FnOnce() + Send + 'static, +{ + use nix::unistd::getppid; + use polling::os::kqueue::{PollerKqueueExt, Process, ProcessOps}; + use polling::{Events, PollMode, Poller}; + + let parent_pid = getppid(); + if parent_pid == nix::unistd::Pid::from_raw(1) { + return Err("process was already orphaned before procguard armed".to_string()); + } + let parent_pid_nz = std::num::NonZeroI32::new(parent_pid.as_raw()) + .ok_or_else(|| "getppid returned 0 unexpectedly".to_string())?; + + // Build the poller on the caller's thread so any setup error + // surfaces synchronously. `EVFILT_PROC | NOTE_EXIT` is a one-shot + // filter, so `PollMode::Oneshot` matches the kernel semantics. + // + // SAFETY: `Process::from_pid` requires the PID to "be tied to an + // actual child process". Our parent is alive at this point — we + // re-check `getppid()` immediately after registration to close the + // race where the parent dies between the read above and the + // `add_filter` call. The BSD kqueue implementation accepts any + // live PID, not just our own children; the "child" wording in the + // polling docs is carried over from historical terminology in the + // kqueue(2) manpage. The kernel guarantees NOTE_EXIT fires if the + // PID is valid at registration. + let poller = Poller::new().map_err(|err| format!("polling: Poller::new failed: {err}"))?; + let key = 1; + #[allow(unsafe_code)] + // SAFETY requirement is documented on the enclosing function: the + // PID was just read from `getppid()` and re-checked below, so it + // points at a live process. 
`Process::from_pid` is an + // entry-in-the-kernel-table registration — the kernel validates + // the PID when the filter is added. + let filter = unsafe { Process::from_pid(parent_pid_nz, ProcessOps::Exit) }; + poller + .add_filter(filter, key, PollMode::Oneshot) + .map_err(|err| format!("polling: add_filter(NOTE_EXIT, {parent_pid_nz}) failed: {err}"))?; + + // Between getppid() and the registered filter the parent may + // already have died. Detect that and abort so the caller can bail. + if getppid() != parent_pid { + return Err("parent exited before procguard could arm".to_string()); + } + + // Hand off to a dedicated OS thread. Block in `poller.wait()` + // until the single NOTE_EXIT event fires, run the cleanup, then + // exit. We prefer `exit(1)` over `kill(getpid, SIGKILL)` so the + // callback gets to complete — SIGKILL would race it. Our children + // have their own procguards armed and will notice `getppid() == + // 1` shortly after, so we do not need Linux-semantics exactness. + std::thread::Builder::new() + .name("procguard".to_string()) + .spawn(move || { + let mut events = Events::new(); + // Block indefinitely; the filter is Oneshot so we expect + // exactly one event (parent's NOTE_EXIT) or a spurious + // wakeup we treat the same way. 
+ let _ = poller.wait(&mut events, None); + let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(cleanup)); + std::process::exit(1); + }) + .map(|_| ()) + .map_err(|e| format!("failed to spawn procguard thread: {e}")) +} diff --git a/crates/openshell-driver-vm/src/runtime.rs b/crates/openshell-driver-vm/src/runtime.rs index 9888feb18..e20c7d4e5 100644 --- a/crates/openshell-driver-vm/src/runtime.rs +++ b/crates/openshell-driver-vm/src/runtime.rs @@ -4,20 +4,25 @@ #![allow(unsafe_code)] use std::ffi::CString; -use std::io::{Read, Write}; -use std::os::unix::net::UnixStream; use std::path::{Path, PathBuf}; use std::process::{Child as StdChild, Command as StdCommand, Stdio}; use std::ptr; use std::sync::atomic::{AtomicI32, Ordering}; use std::time::{Duration, Instant}; -use crate::{GUEST_SSH_PORT, embedded_runtime, ffi}; +use crate::{embedded_runtime, ffi, procguard}; pub const VM_RUNTIME_DIR_ENV: &str = "OPENSHELL_VM_RUNTIME_DIR"; +/// PID of the forked libkrun worker (the VM's PID 1). Zero when not running. +/// Used by the SIGTERM/SIGINT handler to forward signals to the VM. static CHILD_PID: AtomicI32 = AtomicI32::new(0); +/// PID of the gvproxy helper process. Zero when not running. Used by the +/// SIGTERM/SIGINT handler to make sure gvproxy doesn't survive the +/// launcher on macOS (where we can't use `PR_SET_PDEATHSIG`). 
+static GVPROXY_PID: AtomicI32 = AtomicI32::new(0); + pub struct VmLaunchConfig { pub rootfs: PathBuf, pub vcpus: u8, @@ -26,23 +31,10 @@ pub struct VmLaunchConfig { pub args: Vec, pub env: Vec, pub workdir: String, - pub port_map: Vec, pub log_level: u32, pub console_output: PathBuf, } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -struct PortMapping { - host_port: u16, - guest_port: u16, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -struct GvproxyPortPlan { - ssh_port: u16, - forwarded_ports: Vec, -} - pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { if !config.rootfs.is_dir() { return Err(format!( @@ -51,6 +43,47 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { )); } + // Arm procguard first, BEFORE we spawn gvproxy or fork libkrun, so + // that the launcher can't be orphaned during setup. The cleanup + // callback reads the GVPROXY_PID atomic (initially 0 — no-op) and + // the CHILD_PID atomic (the libkrun fork), so it stays correct as + // those slots get populated later in this function. Only ONE arm + // per process: racing two watchers for the same NOTE_EXIT event + // would cause whichever wins to skip the cleanup. + if let Err(err) = procguard::die_with_parent_cleanup(|| { + // Cleanup order: SIGTERM gvproxy and the libkrun fork first so + // they can drain cleanly, then SIGKILL after a brief grace + // window. We can't rely on Rust destructors here; when + // procguard's watcher thread returns we call `std::process::exit` + // and the process tears down. Only async-signal-safe calls here: + // atomic loads and `kill(2)` are both on the POSIX list. 
+ let gv_pid = GVPROXY_PID.load(Ordering::Relaxed); + let child_pid = CHILD_PID.load(Ordering::Relaxed); + if gv_pid > 0 { + unsafe { + libc::kill(gv_pid, libc::SIGTERM); + } + } + if child_pid > 0 { + unsafe { + libc::kill(child_pid, libc::SIGTERM); + } + } + std::thread::sleep(Duration::from_millis(200)); + if gv_pid > 0 { + unsafe { + libc::kill(gv_pid, libc::SIGKILL); + } + } + if child_pid > 0 { + unsafe { + libc::kill(child_pid, libc::SIGKILL); + } + } + }) { + return Err(format!("procguard arm failed: {err}")); + } + #[cfg(target_os = "linux")] check_kvm_access()?; @@ -64,10 +97,40 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { vm.set_root(&config.rootfs)?; vm.set_workdir(&config.workdir)?; - let mut forwarded_port_map = config.port_map.clone(); - let mut gvproxy_guard = None; - let mut gvproxy_api_sock = None; - if !config.port_map.is_empty() { + // Run gvproxy strictly as the guest's virtual NIC / DHCP / router. + // + // After the supervisor-initiated relay migration (#867), the driver + // no longer forwards any host-side ports into the guest — all ingress + // traffic for SSH and exec rides the outbound `ConnectSupervisor` + // gRPC stream the guest opens to the gateway. 
What gvproxy still + // provides here is the TCP/IP *plane* the guest kernel needs: + // + // * a virtio-net backend attached to libkrun via a Unix + // SOCK_STREAM (Linux) or SOCK_DGRAM (macOS vfkit), which + // surfaces as `eth0` inside the guest; + // * the DHCP server + default router the guest's udhcpc client + // talks to on boot (IPs 192.168.127.1 / .2, defaults for + // gvisor-tap-vsock); + // * the host-facing gateway identity the guest uses for callbacks: + // the init script seeds `/etc/hosts` with + // `host.openshell.internal` pointing at 192.168.127.1 while + // leaving gvproxy's legacy `host.containers.internal` / + // `host.docker.internal` DNS answers intact, which is how the guest's + // `rewrite_openshell_endpoint_if_needed` probe reaches the host + // gateway when the bare loopback address doesn't resolve from + // inside the VM. + // + // That network plane is also what the sandbox supervisor's + // per-sandbox netns (veth pair + iptables, see + // `openshell-sandbox/src/sandbox/linux/netns.rs`) branches off of; + // libkrun's built-in TSI socket impersonation would not satisfy + // those kernel-level primitives. + // + // The `-listen` API socket and `-ssh-port` forwarder are both + // deliberately omitted: nothing in the driver enqueues port + // forwards on the API any more, and the host-side SSH listener is + // dead plumbing. 
+ let gvproxy_guard = { let gvproxy_binary = runtime_dir.join("gvproxy"); if !gvproxy_binary.is_file() { return Err(format!( @@ -76,13 +139,9 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { )); } - kill_stale_gvproxy_by_port_map(&config.port_map); - let sock_base = gvproxy_socket_base(&config.rootfs)?; let net_sock = sock_base.with_extension("v"); - let api_sock = sock_base.with_extension("a"); let _ = std::fs::remove_file(&net_sock); - let _ = std::fs::remove_file(&api_sock); let _ = std::fs::remove_file(sock_base.with_extension("v-krun.sock")); let run_dir = config.rootfs.parent().unwrap_or(&config.rootfs); @@ -90,9 +149,6 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { let gvproxy_log_file = std::fs::File::create(&gvproxy_log) .map_err(|e| format!("create gvproxy log {}: {e}", gvproxy_log.display()))?; - let gvproxy_ports = plan_gvproxy_ports(&config.port_map)?; - forwarded_port_map = gvproxy_ports.forwarded_ports; - #[cfg(target_os = "linux")] let (gvproxy_net_flag, gvproxy_net_url) = ("-listen-qemu", format!("unix://{}", net_sock.display())); @@ -102,18 +158,51 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { format!("unixgram://{}", net_sock.display()), ); - let child = StdCommand::new(&gvproxy_binary) + // `-ssh-port -1` tells gvproxy to skip its default SSH forward + // (127.0.0.1:2222 → guest:22). We don't use it — all gateway + // ingress rides the supervisor-initiated relay — and leaving + // the default on would bind a host-side TCP listener per + // sandbox, racing concurrent sandboxes for port 2222 and + // surfacing a misleading "sshd is reachable" endpoint. See + // https://github.com/containers/gvisor-tap-vsock `cmd/gvproxy/main.go` + // (`getForwardsMap` returns an empty map when `sshPort == -1`). 
+ let mut gvproxy_cmd = StdCommand::new(&gvproxy_binary); + gvproxy_cmd .arg(gvproxy_net_flag) .arg(&gvproxy_net_url) - .arg("-listen") - .arg(format!("unix://{}", api_sock.display())) .arg("-ssh-port") - .arg(gvproxy_ports.ssh_port.to_string()) + .arg("-1") .stdin(Stdio::null()) .stdout(Stdio::null()) - .stderr(gvproxy_log_file) + .stderr(gvproxy_log_file); + + // On Linux the kernel will SIGKILL gvproxy the moment this + // launcher dies (or is SIGKILLed). `pre_exec` runs in the child + // between fork and execve, so the PR_SET_PDEATHSIG flag is + // inherited across execve and applies to gvproxy proper. On + // macOS/BSDs there is no equivalent; we fall back to killing + // gvproxy explicitly from the launcher's procguard cleanup + // callback (see `run_vm` above) and SIGTERM handler + // (see `install_signal_forwarding` below). + #[cfg(target_os = "linux")] + { + use nix::sys::signal::Signal; + use std::os::unix::process::CommandExt as _; + unsafe { + gvproxy_cmd.pre_exec(|| { + nix::sys::prctl::set_pdeathsig(Signal::SIGKILL) + .map_err(|err| std::io::Error::other(format!("pdeathsig: {err}"))) + }); + } + } + + let child = gvproxy_cmd .spawn() .map_err(|e| format!("failed to start gvproxy {}: {e}", gvproxy_binary.display()))?; + // The procguard cleanup reads GVPROXY_PID atomically. Storing it + // here makes the callback able to SIGTERM gvproxy if the driver + // dies from this moment onward. 
+ GVPROXY_PID.store(child.id() as i32, Ordering::Relaxed); wait_for_path(&net_sock, Duration::from_secs(5), "gvproxy data socket")?; @@ -142,13 +231,9 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { vm.add_net_unixgram(&net_sock, &mac, COMPAT_NET_FEATURES, NET_FLAG_VFKIT)?; } - gvproxy_guard = Some(GvproxyGuard::new(child)); - gvproxy_api_sock = Some(api_sock); - } + Some(GvproxyGuard::new(child)) + }; - if !config.port_map.is_empty() && gvproxy_api_sock.is_none() { - vm.set_port_map(&config.port_map)?; - } vm.set_console_output(&config.console_output)?; let env = if config.env.is_empty() { @@ -166,6 +251,20 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { match pid { -1 => Err(format!("fork failed: {}", std::io::Error::last_os_error())), 0 => { + // We are the libkrun worker (the VM's PID 1 inside the guest + // kernel, but a normal host process until krun_start_enter + // fires). Arm procguard so this fork is SIGKILLed if the + // parent launcher dies abruptly. On Linux this uses + // `PR_SET_PDEATHSIG`; on macOS this spawns a kqueue + // NOTE_EXIT watcher thread. Either way it closes the same + // leak gvproxy does above. + // + // We also SIGKILL ourselves if arming fails — there's no + // safe way to continue if we can't guarantee cleanup. 
+ if let Err(err) = procguard::die_with_parent() { + eprintln!("libkrun worker: procguard arm failed: {err}"); + std::process::exit(1); + } let ret = vm.start_enter(); eprintln!("krun_start_enter failed: {ret}"); std::process::exit(1); @@ -173,24 +272,10 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { _ => { install_signal_forwarding(pid); - let port_forward_result = if let Some(api_sock) = gvproxy_api_sock.as_ref() { - expose_port_map(api_sock, &forwarded_port_map) - } else { - Ok(()) - }; - - if let Err(err) = port_forward_result { - unsafe { - libc::kill(pid, libc::SIGTERM); - } - let _ = wait_for_child(pid); - cleanup_gvproxy(gvproxy_guard); - return Err(err); - } - let status = wait_for_child(pid)?; CHILD_PID.store(0, Ordering::Relaxed); cleanup_gvproxy(gvproxy_guard); + GVPROXY_PID.store(0, Ordering::Relaxed); if libc::WIFEXITED(status) { match libc::WEXITSTATUS(status) { @@ -399,15 +484,6 @@ impl VmContext { ) } - fn set_port_map(&self, port_map: &[String]) -> Result<(), String> { - let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect(); - let (_owners, ptrs) = c_string_array(&port_strs)?; - check( - unsafe { (self.krun.krun_set_port_map)(self.ctx_id, ptrs.as_ptr()) }, - "krun_set_port_map", - ) - } - fn set_console_output(&self, path: &Path) -> Result<(), String> { let console_c = path_to_cstring(path)?; check( @@ -476,126 +552,6 @@ impl Drop for GvproxyGuard { } } -fn expose_port_map(api_sock: &Path, port_map: &[String]) -> Result<(), String> { - wait_for_path(api_sock, Duration::from_secs(2), "gvproxy API socket")?; - let guest_ip = "192.168.127.2"; - - for pm in port_map { - let mapping = parse_port_mapping(pm)?; - - let expose_body = format!( - r#"{{"local":":{}","remote":"{guest_ip}:{}","protocol":"tcp"}}"#, - mapping.host_port, mapping.guest_port - ); - - let deadline = Instant::now() + Duration::from_secs(10); - let mut retry_interval = Duration::from_millis(100); - loop { - match gvproxy_expose(api_sock, 
&expose_body) { - Ok(()) => break, - Err(err) if Instant::now() < deadline => { - std::thread::sleep(retry_interval); - retry_interval = (retry_interval * 2).min(Duration::from_secs(1)); - if retry_interval == Duration::from_secs(1) { - eprintln!("retrying gvproxy port expose {pm}: {err}"); - } - } - Err(err) => { - return Err(format!( - "failed to forward port {} via gvproxy: {err}", - mapping.host_port - )); - } - } - } - } - - Ok(()) -} - -fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { - let mut stream = - UnixStream::connect(api_sock).map_err(|e| format!("connect to gvproxy API socket: {e}"))?; - - let request = format!( - "POST /services/forwarder/expose HTTP/1.1\r\n\ - Host: localhost\r\n\ - Content-Type: application/json\r\n\ - Content-Length: {}\r\n\ - Connection: close\r\n\ - \r\n\ - {}", - body.len(), - body, - ); - - stream - .write_all(request.as_bytes()) - .map_err(|e| format!("write to gvproxy API: {e}"))?; - - let mut buf = [0u8; 1024]; - let n = stream - .read(&mut buf) - .map_err(|e| format!("read from gvproxy API: {e}"))?; - let response = String::from_utf8_lossy(&buf[..n]); - let status = response - .lines() - .next() - .and_then(|line| line.split_whitespace().nth(1)) - .unwrap_or("0"); - - match status { - "200" | "204" => Ok(()), - _ => Err(format!( - "gvproxy API: {}", - response.lines().next().unwrap_or("") - )), - } -} - -fn plan_gvproxy_ports(port_map: &[String]) -> Result { - let mut ssh_port = None; - let mut forwarded_ports = Vec::with_capacity(port_map.len()); - - for pm in port_map { - let mapping = parse_port_mapping(pm)?; - if ssh_port.is_none() && mapping.guest_port == GUEST_SSH_PORT && mapping.host_port >= 1024 { - ssh_port = Some(mapping.host_port); - continue; - } - forwarded_ports.push(pm.clone()); - } - - Ok(GvproxyPortPlan { - ssh_port: match ssh_port { - Some(port) => port, - None => pick_gvproxy_ssh_port()?, - }, - forwarded_ports, - }) -} - -fn parse_port_mapping(pm: &str) -> Result { - let parts: 
Vec<&str> = pm.split(':').collect(); - let (host, guest) = match parts.as_slice() { - [host, guest] => (*host, *guest), - [port] => (*port, *port), - _ => return Err(format!("invalid port mapping '{pm}'")), - }; - - let host_port = host - .parse::() - .map_err(|_| format!("invalid port mapping '{pm}'"))?; - let guest_port = guest - .parse::() - .map_err(|_| format!("invalid port mapping '{pm}'"))?; - - Ok(PortMapping { - host_port, - guest_port, - }) -} - fn wait_for_path(path: &Path, timeout: Duration, label: &str) -> Result<(), String> { let deadline = Instant::now() + timeout; let mut interval = Duration::from_millis(5); @@ -674,92 +630,6 @@ fn gvproxy_socket_base(rootfs: &Path) -> Result { Ok(secure_socket_base("osd-gv")?.join(hash_path_id(rootfs))) } -fn pick_gvproxy_ssh_port() -> Result { - let listener = std::net::TcpListener::bind(("127.0.0.1", 0)) - .map_err(|e| format!("allocate gvproxy ssh port on localhost: {e}"))?; - let port = listener - .local_addr() - .map_err(|e| format!("read gvproxy ssh port: {e}"))? 
- .port(); - drop(listener); - Ok(port) -} - -fn kill_stale_gvproxy_by_port_map(port_map: &[String]) { - for pm in port_map { - if let Some(host_port) = pm - .split(':') - .next() - .and_then(|port| port.parse::().ok()) - { - kill_stale_gvproxy_by_port(host_port); - } - } -} - -fn kill_stale_gvproxy_by_port(port: u16) { - let output = StdCommand::new("lsof") - .args(["-ti", &format!(":{port}")]) - .output(); - - let pids = match output { - Ok(output) if output.status.success() => { - String::from_utf8_lossy(&output.stdout).to_string() - } - _ => return, - }; - - for line in pids.lines() { - if let Ok(pid) = line.trim().parse::() - && is_process_named(pid as libc::pid_t, "gvproxy") - { - kill_gvproxy_pid(pid); - } - } -} - -fn kill_gvproxy_pid(pid: u32) { - let pid = pid as libc::pid_t; - if unsafe { libc::kill(pid, 0) } != 0 { - return; - } - if !is_process_named(pid, "gvproxy") { - return; - } - unsafe { - libc::kill(pid, libc::SIGTERM); - } - std::thread::sleep(Duration::from_millis(200)); -} - -#[cfg(target_os = "macos")] -fn is_process_named(pid: libc::pid_t, expected: &str) -> bool { - StdCommand::new("ps") - .args(["-p", &pid.to_string(), "-o", "comm="]) - .output() - .ok() - .and_then(|output| { - if output.status.success() { - String::from_utf8(output.stdout).ok() - } else { - None - } - }) - .is_some_and(|name| name.trim().contains(expected)) -} - -#[cfg(target_os = "linux")] -fn is_process_named(pid: libc::pid_t, expected: &str) -> bool { - std::fs::read_to_string(format!("/proc/{pid}/comm")) - .map(|name| name.trim().contains(expected)) - .unwrap_or(false) -} - -#[cfg(not(any(target_os = "macos", target_os = "linux")))] -fn is_process_named(_pid: libc::pid_t, _expected: &str) -> bool { - false -} - fn install_signal_forwarding(pid: i32) { unsafe { libc::signal( @@ -774,11 +644,28 @@ fn install_signal_forwarding(pid: i32) { CHILD_PID.store(pid, Ordering::Relaxed); } +/// Async-signal-safe handler that forwards SIGTERM to every process we +/// own: the 
libkrun VM worker and the gvproxy helper. We cannot rely on +/// Rust destructors (`GvproxyGuard::drop`, `ManagedDriverProcess::drop`) +/// running on signal-driven exit, so we explicitly deliver the signal +/// here. The `wait_for_child` loop reaps libkrun and `cleanup_gvproxy` +/// reaps gvproxy before `run_vm` returns. +/// +/// Only async-signal-safe libc calls are used — `kill(2)` is listed in +/// POSIX.1-2017 as async-signal-safe, atomic loads are lock-free on the +/// platforms we target. extern "C" fn forward_signal(_sig: libc::c_int) { - let pid = CHILD_PID.load(Ordering::Relaxed); - if pid > 0 { + let vm_pid = CHILD_PID.load(Ordering::Relaxed); + if vm_pid > 0 { unsafe { - libc::kill(pid, libc::SIGTERM); + libc::kill(vm_pid, libc::SIGTERM); + } + } + let gv_pid = GVPROXY_PID.load(Ordering::Relaxed); + if gv_pid > 0 { + // gvproxy handles SIGTERM cleanly; no need for SIGKILL. + unsafe { + libc::kill(gv_pid, libc::SIGTERM); } } } @@ -840,38 +727,3 @@ fn check_kvm_access() -> Result<(), String> { format!("cannot open /dev/kvm: {e}\nKVM access is required to run microVMs on Linux.") }) } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn plan_gvproxy_ports_reuses_sandbox_ssh_mapping() { - let plan = plan_gvproxy_ports(&["64739:2222".to_string()]).expect("plan should succeed"); - - assert_eq!(plan.ssh_port, 64739); - assert!(plan.forwarded_ports.is_empty()); - } - - #[test] - fn plan_gvproxy_ports_keeps_non_ssh_mappings_for_forwarder() { - let plan = plan_gvproxy_ports(&["64739:8080".to_string()]).expect("plan should succeed"); - - assert_ne!(plan.ssh_port, 64739); - assert_eq!(plan.forwarded_ports, vec!["64739:8080".to_string()]); - } - - #[test] - fn plan_gvproxy_ports_ignores_privileged_host_ports_for_direct_ssh() { - let plan = plan_gvproxy_ports(&["22:2222".to_string()]).expect("plan should succeed"); - - assert_ne!(plan.ssh_port, 22); - assert_eq!(plan.forwarded_ports, vec!["22:2222".to_string()]); - } - - #[test] - fn 
parse_port_mapping_rejects_invalid_entries() { - let err = parse_port_mapping("bad:mapping").expect_err("invalid mapping should fail"); - assert!(err.contains("invalid port mapping")); - } -} diff --git a/crates/openshell-driver-vm/start.sh b/crates/openshell-driver-vm/start.sh index 155136c78..b5aebbefd 100755 --- a/crates/openshell-driver-vm/start.sh +++ b/crates/openshell-driver-vm/start.sh @@ -5,12 +5,26 @@ set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +CLI_BIN="${ROOT}/scripts/bin/openshell" COMPRESSED_DIR="${ROOT}/target/vm-runtime-compressed" -STATE_DIR_DEFAULT="${ROOT}/target/openshell-vm-driver-dev" +SERVER_PORT="${OPENSHELL_SERVER_PORT:-8080}" +# Keep the driver socket path under AF_UNIX SUN_LEN on macOS. +STATE_DIR_ROOT="${OPENSHELL_VM_DRIVER_STATE_ROOT:-/tmp}" +STATE_LABEL_RAW="${OPENSHELL_VM_INSTANCE:-port-${SERVER_PORT}}" +STATE_LABEL="$(printf '%s' "${STATE_LABEL_RAW}" | tr -cs '[:alnum:]._-' '-')" +if [ -z "${STATE_LABEL}" ]; then + STATE_LABEL="port-${SERVER_PORT}" +fi +STATE_DIR_DEFAULT="${STATE_DIR_ROOT}/openshell-vm-driver-dev-${USER:-user}-${STATE_LABEL}" STATE_DIR="${OPENSHELL_VM_DRIVER_STATE_DIR:-${STATE_DIR_DEFAULT}}" DB_PATH_DEFAULT="${STATE_DIR}/openshell.db" -SERVER_PORT="${OPENSHELL_SERVER_PORT:-8080}" VM_HOST_GATEWAY_DEFAULT="${OPENSHELL_VM_HOST_GATEWAY:-host.containers.internal}" +LOCAL_GATEWAY_ENDPOINT_DEFAULT="http://127.0.0.1:${SERVER_PORT}" +LOCAL_GATEWAY_ENDPOINT="${OPENSHELL_VM_LOCAL_GATEWAY_ENDPOINT:-${LOCAL_GATEWAY_ENDPOINT_DEFAULT}}" +GATEWAY_NAME_DEFAULT="vm-driver-${STATE_LABEL}" +GATEWAY_NAME="${OPENSHELL_VM_GATEWAY_NAME:-${GATEWAY_NAME_DEFAULT}}" +DRIVER_DIR_DEFAULT="${ROOT}/target/debug" +DRIVER_DIR="${OPENSHELL_DRIVER_DIR:-${DRIVER_DIR_DEFAULT}}" export OPENSHELL_VM_RUNTIME_COMPRESSED_DIR="${OPENSHELL_VM_RUNTIME_COMPRESSED_DIR:-${COMPRESSED_DIR}}" @@ -52,11 +66,19 @@ fi export OPENSHELL_DISABLE_TLS="$(normalize_bool "${OPENSHELL_DISABLE_TLS:-true}")" export 
OPENSHELL_DB_URL="${OPENSHELL_DB_URL:-sqlite:${DB_PATH_DEFAULT}}" export OPENSHELL_DRIVERS="${OPENSHELL_DRIVERS:-vm}" +export OPENSHELL_DRIVER_DIR="${DRIVER_DIR}" export OPENSHELL_GRPC_ENDPOINT="${OPENSHELL_GRPC_ENDPOINT:-http://${VM_HOST_GATEWAY_DEFAULT}:${SERVER_PORT}}" export OPENSHELL_SSH_GATEWAY_HOST="${OPENSHELL_SSH_GATEWAY_HOST:-127.0.0.1}" export OPENSHELL_SSH_GATEWAY_PORT="${OPENSHELL_SSH_GATEWAY_PORT:-${SERVER_PORT}}" export OPENSHELL_SSH_HANDSHAKE_SECRET="${OPENSHELL_SSH_HANDSHAKE_SECRET:-dev-vm-driver-secret}" export OPENSHELL_VM_DRIVER_STATE_DIR="${STATE_DIR}" +echo "==> Gateway registration" +echo " Name: ${GATEWAY_NAME}" +echo " Endpoint: ${LOCAL_GATEWAY_ENDPOINT}" +echo " Register: ${CLI_BIN} gateway add --name ${GATEWAY_NAME} ${LOCAL_GATEWAY_ENDPOINT}" +echo " Select: ${CLI_BIN} gateway select ${GATEWAY_NAME}" +echo " Driver: ${OPENSHELL_DRIVER_DIR}/openshell-driver-vm" + echo "==> Starting OpenShell server with VM compute driver" exec "${ROOT}/target/debug/openshell-gateway" diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index 95ffbfaa4..35c72f80c 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -11,6 +11,7 @@ use crate::grpc::policy::{SANDBOX_SETTINGS_OBJECT_TYPE, sandbox_settings_id}; use crate::persistence::{ObjectId, ObjectName, ObjectRecord, ObjectType, Store}; use crate::sandbox_index::SandboxIndex; use crate::sandbox_watch::SandboxWatchBus; +use crate::supervisor_session::SupervisorSessionRegistry; use crate::tracing_bus::TracingLogBus; use futures::{Stream, StreamExt}; use openshell_core::proto::compute::v1::{ @@ -188,6 +189,7 @@ pub struct ComputeRuntime { sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: TracingLogBus, + supervisor_sessions: Arc, sync_lock: Arc>, } @@ -205,6 +207,7 @@ impl ComputeRuntime { sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: 
TracingLogBus, + supervisor_sessions: Arc, ) -> Result { let default_image = driver .get_capabilities(Request::new(GetCapabilitiesRequest {})) @@ -220,6 +223,7 @@ impl ComputeRuntime { sandbox_index, sandbox_watch_bus, tracing_log_bus, + supervisor_sessions, sync_lock: Arc::new(Mutex::new(())), }) } @@ -230,6 +234,7 @@ impl ComputeRuntime { sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: TracingLogBus, + supervisor_sessions: Arc, ) -> Result { let driver = KubernetesComputeDriver::new(config) .await @@ -242,6 +247,7 @@ impl ComputeRuntime { sandbox_index, sandbox_watch_bus, tracing_log_bus, + supervisor_sessions, ) .await } @@ -253,6 +259,7 @@ impl ComputeRuntime { sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: TracingLogBus, + supervisor_sessions: Arc, ) -> Result { let driver: SharedComputeDriver = Arc::new(RemoteComputeDriver::new(channel)); Self::from_driver( @@ -262,6 +269,7 @@ impl ComputeRuntime { sandbox_index, sandbox_watch_bus, tracing_log_bus, + supervisor_sessions, ) .await } @@ -563,7 +571,8 @@ impl ComputeRuntime { existing.as_ref().and_then(|sandbox| sandbox.spec.as_ref()), ); - let phase = derive_phase(incoming.status.as_ref()); + let session_connected = self.supervisor_sessions.has_session(&incoming.id); + let mut phase = derive_phase(incoming.status.as_ref()); let mut sandbox = existing.unwrap_or_else(|| Sandbox { id: incoming.id.clone(), name: incoming.name.clone(), @@ -574,6 +583,12 @@ impl ComputeRuntime { ..Default::default() }); + if session_connected && matches!(phase, SandboxPhase::Provisioning | SandboxPhase::Unknown) + { + ensure_supervisor_ready_status(&mut status, &sandbox.name); + phase = SandboxPhase::Ready; + } + let old_phase = SandboxPhase::try_from(sandbox.phase).unwrap_or(SandboxPhase::Unknown); if old_phase != phase { info!( @@ -622,6 +637,55 @@ impl ComputeRuntime { Ok(()) } + pub async fn supervisor_session_connected(&self, sandbox_id: &str) -> Result<(), 
String> { + self.set_supervisor_session_state(sandbox_id, true).await + } + + pub async fn supervisor_session_disconnected(&self, sandbox_id: &str) -> Result<(), String> { + self.set_supervisor_session_state(sandbox_id, false).await + } + + async fn set_supervisor_session_state( + &self, + sandbox_id: &str, + connected: bool, + ) -> Result<(), String> { + let _guard = self.sync_lock.lock().await; + let Some(record) = self + .store + .get(Sandbox::object_type(), sandbox_id) + .await + .map_err(|e| e.to_string())? + else { + return Ok(()); + }; + + let mut sandbox = decode_sandbox_record(&record)?; + let current_phase = SandboxPhase::try_from(sandbox.phase).unwrap_or(SandboxPhase::Unknown); + + if current_phase == SandboxPhase::Deleting || current_phase == SandboxPhase::Error { + return Ok(()); + } + + if connected { + ensure_supervisor_ready_status(&mut sandbox.status, &sandbox.name); + sandbox.phase = SandboxPhase::Ready as i32; + } else if current_phase == SandboxPhase::Ready { + ensure_supervisor_not_ready_status(&mut sandbox.status, &sandbox.name); + sandbox.phase = SandboxPhase::Provisioning as i32; + } else { + return Ok(()); + } + + self.sandbox_index.update_from_sandbox(&sandbox); + self.store + .put_message(&sandbox) + .await + .map_err(|e| e.to_string())?; + self.sandbox_watch_bus.notify(sandbox_id); + Ok(()) + } + async fn apply_deleted(&self, sandbox_id: &str) -> Result<(), String> { let _guard = self.sync_lock.lock().await; self.apply_deleted_locked(sandbox_id).await @@ -963,6 +1027,58 @@ fn public_status_from_driver(status: &DriverSandboxStatus) -> SandboxStatus { } } +fn ensure_supervisor_ready_status(status: &mut Option, sandbox_name: &str) { + upsert_ready_condition( + status, + sandbox_name, + SandboxCondition { + r#type: "Ready".to_string(), + status: "True".to_string(), + reason: "DependenciesReady".to_string(), + message: "Supervisor session connected".to_string(), + last_transition_time: String::new(), + }, + ); +} + +fn 
ensure_supervisor_not_ready_status(status: &mut Option, sandbox_name: &str) { + upsert_ready_condition( + status, + sandbox_name, + SandboxCondition { + r#type: "Ready".to_string(), + status: "False".to_string(), + reason: "DependenciesNotReady".to_string(), + message: "Supervisor session disconnected".to_string(), + last_transition_time: String::new(), + }, + ); +} + +fn upsert_ready_condition( + status: &mut Option, + sandbox_name: &str, + condition: SandboxCondition, +) { + let status = status.get_or_insert_with(|| SandboxStatus { + sandbox_name: sandbox_name.to_string(), + agent_pod: String::new(), + agent_fd: String::new(), + sandbox_fd: String::new(), + conditions: Vec::new(), + }); + + if let Some(existing) = status + .conditions + .iter_mut() + .find(|existing| existing.r#type == "Ready") + { + *existing = condition; + } else { + status.conditions.push(condition); + } +} + fn public_condition_from_driver(condition: &DriverCondition) -> SandboxCondition { SandboxCondition { r#type: condition.r#type.clone(), @@ -1044,6 +1160,7 @@ mod tests { GetSandboxResponse, StopSandboxRequest, StopSandboxResponse, ValidateSandboxCreateResponse, }; use std::sync::Arc; + use tokio::sync::{mpsc, oneshot}; #[derive(Debug, Default)] struct TestDriver { @@ -1159,10 +1276,22 @@ mod tests { sandbox_index: SandboxIndex::new(), sandbox_watch_bus: SandboxWatchBus::new(), tracing_log_bus: TracingLogBus::new(), + supervisor_sessions: Arc::new(SupervisorSessionRegistry::new()), sync_lock: Arc::new(Mutex::new(())), } } + fn register_test_supervisor_session(runtime: &ComputeRuntime, sandbox_id: &str) { + let (tx, _rx) = mpsc::channel(1); + let (shutdown_tx, _shutdown_rx) = oneshot::channel(); + runtime.supervisor_sessions.register( + sandbox_id.to_string(), + "session-1".to_string(), + tx, + shutdown_tx, + ); + } + fn sandbox_record(id: &str, name: &str, phase: SandboxPhase) -> Sandbox { Sandbox { id: id.to_string(), @@ -1417,6 +1546,122 @@ mod tests { ); } + #[tokio::test] + async fn 
apply_sandbox_update_promotes_connected_supervisor_session_to_ready() { + let runtime = test_runtime(Arc::new(TestDriver::default())).await; + let sandbox = sandbox_record("sb-1", "sandbox-a", SandboxPhase::Provisioning); + runtime.store.put_message(&sandbox).await.unwrap(); + + register_test_supervisor_session(&runtime, "sb-1"); + + runtime + .apply_sandbox_update(DriverSandbox { + id: "sb-1".to_string(), + name: "sandbox-a".to_string(), + namespace: "default".to_string(), + spec: None, + status: Some(make_driver_status(make_driver_condition( + "Starting", + "VM is starting", + ))), + }) + .await + .unwrap(); + + let stored = runtime + .store + .get_message::("sb-1") + .await + .unwrap() + .unwrap(); + assert_eq!( + SandboxPhase::try_from(stored.phase).unwrap(), + SandboxPhase::Ready + ); + let ready = stored + .status + .as_ref() + .and_then(|status| { + status + .conditions + .iter() + .find(|condition| condition.r#type == "Ready") + }) + .unwrap(); + assert_eq!(ready.status, "True"); + assert_eq!(ready.reason, "DependenciesReady"); + assert_eq!(ready.message, "Supervisor session connected"); + } + + #[tokio::test] + async fn supervisor_session_connected_promotes_store_state_without_driver_refresh() { + let runtime = test_runtime(Arc::new(TestDriver::default())).await; + let sandbox = sandbox_record("sb-1", "sandbox-a", SandboxPhase::Provisioning); + runtime.store.put_message(&sandbox).await.unwrap(); + + runtime.supervisor_session_connected("sb-1").await.unwrap(); + + let stored = runtime + .store + .get_message::("sb-1") + .await + .unwrap() + .unwrap(); + assert_eq!( + SandboxPhase::try_from(stored.phase).unwrap(), + SandboxPhase::Ready + ); + } + + #[tokio::test] + async fn supervisor_session_disconnected_demotes_ready_sandbox() { + let runtime = test_runtime(Arc::new(TestDriver::default())).await; + let mut sandbox = sandbox_record("sb-1", "sandbox-a", SandboxPhase::Ready); + sandbox.status = Some(SandboxStatus { + sandbox_name: "sandbox-a".to_string(), + 
agent_pod: String::new(), + agent_fd: String::new(), + sandbox_fd: String::new(), + conditions: vec![SandboxCondition { + r#type: "Ready".to_string(), + status: "True".to_string(), + reason: "DependenciesReady".to_string(), + message: "Supervisor session connected".to_string(), + last_transition_time: String::new(), + }], + }); + runtime.store.put_message(&sandbox).await.unwrap(); + + runtime + .supervisor_session_disconnected("sb-1") + .await + .unwrap(); + + let stored = runtime + .store + .get_message::("sb-1") + .await + .unwrap() + .unwrap(); + assert_eq!( + SandboxPhase::try_from(stored.phase).unwrap(), + SandboxPhase::Provisioning + ); + let ready = stored + .status + .as_ref() + .and_then(|status| { + status + .conditions + .iter() + .find(|condition| condition.r#type == "Ready") + }) + .unwrap(); + assert_eq!(ready.status, "False"); + assert_eq!(ready.reason, "DependenciesNotReady"); + assert_eq!(ready.message, "Supervisor session disconnected"); + } + #[tokio::test] async fn reconcile_store_with_backend_applies_driver_snapshot() { let runtime = test_runtime(Arc::new(TestDriver { diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index 9501ea3b2..a40794037 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -88,7 +88,7 @@ pub struct ServerState { pub settings_mutex: tokio::sync::Mutex<()>, /// Registry of active supervisor sessions and pending relay channels. 
- pub supervisor_sessions: supervisor_session::SupervisorSessionRegistry, + pub supervisor_sessions: Arc, } fn is_benign_tls_handshake_failure(error: &std::io::Error) -> bool { @@ -108,6 +108,7 @@ impl ServerState { sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: TracingLogBus, + supervisor_sessions: Arc, ) -> Self { Self { config, @@ -119,7 +120,7 @@ impl ServerState { ssh_connections_by_token: Mutex::new(HashMap::new()), ssh_connections_by_sandbox: Mutex::new(HashMap::new()), settings_mutex: tokio::sync::Mutex::new(()), - supervisor_sessions: supervisor_session::SupervisorSessionRegistry::new(), + supervisor_sessions, } } } @@ -150,6 +151,7 @@ pub async fn run_server( let sandbox_index = SandboxIndex::new(); let sandbox_watch_bus = SandboxWatchBus::new(); + let supervisor_sessions = Arc::new(supervisor_session::SupervisorSessionRegistry::new()); let compute = build_compute_runtime( &config, &vm_config, @@ -157,6 +159,7 @@ pub async fn run_server( sandbox_index.clone(), sandbox_watch_bus.clone(), tracing_log_bus.clone(), + supervisor_sessions.clone(), ) .await?; let state = Arc::new(ServerState::new( @@ -166,6 +169,7 @@ pub async fn run_server( sandbox_index, sandbox_watch_bus, tracing_log_bus, + supervisor_sessions, )); state.compute.spawn_watchers(); @@ -261,6 +265,7 @@ async fn build_compute_runtime( sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: TracingLogBus, + supervisor_sessions: Arc, ) -> Result { let driver = configured_compute_driver(config)?; info!(driver = %driver, "Using compute driver"); @@ -288,6 +293,7 @@ async fn build_compute_runtime( sandbox_index, sandbox_watch_bus, tracing_log_bus, + supervisor_sessions.clone(), ) .await .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))), @@ -300,6 +306,7 @@ async fn build_compute_runtime( sandbox_index, sandbox_watch_bus, tracing_log_bus, + supervisor_sessions, ) .await .map_err(|e| 
Error::execution(format!("failed to create compute runtime: {e}"))) diff --git a/crates/openshell-server/src/supervisor_session.rs b/crates/openshell-server/src/supervisor_session.rs index f81ee9e3c..d130bf71d 100644 --- a/crates/openshell-server/src/supervisor_session.rs +++ b/crates/openshell-server/src/supervisor_session.rs @@ -180,6 +180,10 @@ impl SupervisorSessionRegistry { .map(|s| s.tx.clone()) } + pub fn has_session(&self, sandbox_id: &str) -> bool { + self.sessions.lock().unwrap().contains_key(sandbox_id) + } + fn pending_channel_ids(&self, sandbox_id: &str) -> Vec { self.pending_relays .lock() @@ -547,6 +551,19 @@ pub async fn handle_connect_supervisor( .await; } + if let Err(err) = state + .compute + .supervisor_session_connected(&sandbox_id) + .await + { + warn!( + sandbox_id = %sandbox_id, + session_id = %session_id, + error = %err, + "supervisor session: failed to mark sandbox ready" + ); + } + // Step 4: Spawn the session loop that reads inbound messages. let state_clone = Arc::clone(state); let sandbox_id_clone = sandbox_id.clone(); @@ -565,6 +582,18 @@ pub async fn handle_connect_supervisor( .remove_if_current(&sandbox_id_clone, &session_id); if still_ours { info!(sandbox_id = %sandbox_id_clone, session_id = %session_id, "supervisor session: ended"); + if let Err(err) = state_clone + .compute + .supervisor_session_disconnected(&sandbox_id_clone) + .await + { + warn!( + sandbox_id = %sandbox_id_clone, + session_id = %session_id, + error = %err, + "supervisor session: failed to mark sandbox disconnected" + ); + } } else { info!(sandbox_id = %sandbox_id_clone, session_id = %session_id, "supervisor session: ended (already superseded)"); } diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh index d43046d4f..566b32141 100755 --- a/crates/openshell-vm/scripts/build-rootfs.sh +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -119,6 +119,65 @@ verify_checksum() { fi } +ensure_build_nofile_limit() { + 
local desired="${OPENSHELL_VM_BUILD_NOFILE_LIMIT:-8192}" + local minimum=1024 + local current="" + local hard="" + local target="" + + [ "$(uname -s)" = "Darwin" ] || return 0 + command -v cargo-zigbuild >/dev/null 2>&1 || return 0 + + current="$(ulimit -n 2>/dev/null || echo "")" + case "${current}" in + ''|*[!0-9]*) + return 0 + ;; + esac + + if [ "${current}" -ge "${desired}" ]; then + return 0 + fi + + hard="$(ulimit -Hn 2>/dev/null || echo "")" + target="${desired}" + case "${hard}" in + ''|unlimited|infinity) + ;; + *[!0-9]*) + ;; + *) + if [ "${hard}" -lt "${target}" ]; then + target="${hard}" + fi + ;; + esac + + if [ "${target}" -gt "${current}" ] && ulimit -n "${target}" 2>/dev/null; then + echo "==> Raised open file limit for cargo-zigbuild: ${current} -> $(ulimit -n)" + fi + + current="$(ulimit -n 2>/dev/null || echo "${current}")" + case "${current}" in + ''|*[!0-9]*) + return 0 + ;; + esac + + if [ "${current}" -lt "${desired}" ]; then + echo "WARNING: Open file limit is ${current}; cargo-zigbuild is more reliable at ${desired}+ on macOS." + fi + + if [ "${current}" -lt "${minimum}" ]; then + echo "ERROR: Open file limit (${current}) is too low for cargo-zigbuild on macOS." + echo " Zig 0.14+ can fail with ProcessFdQuotaExceeded while linking large binaries." + echo " Run: ulimit -n ${desired}" + echo " Then re-run this script." + exit 1 + fi +} + if [ "$BASE_ONLY" = true ]; then echo "==> Building base openshell-vm rootfs" echo " Guest arch: ${GUEST_ARCH}" @@ -135,6 +194,10 @@ else fi echo "" +# cargo-zigbuild on macOS can exhaust the default per-process file descriptor +# limit while linking larger targets with Zig 0.14+. +ensure_build_nofile_limit + # ── Check for running VM ──────────────────────────────────────────────── # If an openshell-vm is using this rootfs via virtio-fs, wiping the rootfs # corrupts the VM's filesystem (e.g. 
/var disappears) causing cascading diff --git a/e2e/rust/e2e-vm.sh b/e2e/rust/e2e-vm.sh index 5fd055036..5990d8db6 100755 --- a/e2e/rust/e2e-vm.sh +++ b/e2e/rust/e2e-vm.sh @@ -2,245 +2,227 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Run the Rust e2e smoke test against an openshell-vm gateway. +# Run the Rust e2e smoke test against an openshell-gateway running the +# standalone VM compute driver (`openshell-driver-vm`). # -# Usage: -# mise run e2e:vm # start new named VM on random port -# mise run e2e:vm -- --vm-port=30051 # reuse existing VM on port 30051 -# mise run e2e:vm -- --vm-port=30051 --vm-name=my-vm # reuse existing named VM and run exec check -# -# Options: -# --vm-port=PORT Skip VM startup and test against this port. -# --vm-name=NAME VM instance name. Auto-generated for fresh VMs. +# Architecture (post supervisor-initiated relay, PR #867): +# * The gateway never dials the sandbox. Instead, the in-guest +# supervisor opens an outbound `ConnectSupervisor` gRPC stream to +# the gateway on startup and keeps it alive for the sandbox +# lifetime. SSH (`/connect/ssh`) and `ExecSandbox` traffic ride the +# same TCP+TLS+HTTP/2 connection as multiplexed HTTP/2 streams. +# * There is no host-side SSH port forward. gvproxy still provides +# guest egress so the supervisor can reach the gateway, but it no +# longer forwards any TCP port back to the guest. +# * Readiness is authoritative on the gateway: a sandbox's phase +# flips to `Ready` the moment `ConnectSupervisor` registers, and +# back to `Provisioning` when the session drops. The VM driver +# only reports `Error` conditions for dead launcher processes. # -# When --vm-port is omitted: -# 1. Picks a random free host port -# 2. Starts the VM with --name --port :30051 -# 3. Waits for the VM to fully bootstrap (mTLS certs + gRPC health) -# 4. Verifies `openshell-vm exec` works -# 5. 
Runs the Rust smoke test -# 6. Tears down the VM +# Usage: +# mise run e2e:vm # -# When --vm-port is given the script assumes the VM is already running -# on that port and runs the smoke test. The VM exec check runs only when -# --vm-name is provided (so the script can target the correct instance). +# What the script does: +# 1. Ensures the VM runtime (libkrun + gvproxy + rootfs) is staged. +# 2. Builds `openshell-gateway`, `openshell-driver-vm`, and the +# `openshell` CLI with the embedded runtime. +# 3. On macOS, codesigns the VM driver (libkrun needs the +# `com.apple.security.hypervisor` entitlement). +# 4. Starts the gateway with `--drivers vm --disable-tls +# --disable-gateway-auth --db-url sqlite::memory:` on a random +# free port, waits for `Server listening`, then runs the +# cluster-agnostic Rust smoke test. +# 5. Tears the gateway down and (on failure) preserves the gateway +# log and every VM serial console log for post-mortem. # -# Prerequisites (when starting a new VM): `mise run vm:build` must already -# be done (the e2e:vm mise task handles this via depends). +# Prerequisites (handled automatically by this script if missing): +# - `mise run vm:setup` — downloads / builds the libkrun runtime. +# - `mise run vm:rootfs -- --base` — builds the sandbox rootfs tarball. set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" -RUNTIME_DIR="${ROOT}/target/debug/openshell-vm.runtime" -GATEWAY_BIN="${ROOT}/target/debug/openshell-vm" -VM_GATEWAY_IMAGE="${IMAGE_REPO_BASE:-openshell}/gateway:${IMAGE_TAG:-dev}" -VM_GATEWAY_TAR_REL="var/lib/rancher/k3s/agent/images/openshell-server.tar.zst" -GUEST_PORT=30051 -TIMEOUT=180 - -named_vm_rootfs() { - local vm_version - - vm_version=$("${GATEWAY_BIN}" --version | awk '{print $2}') - printf '%s\n' "${XDG_DATA_HOME:-${HOME}/.local/share}/openshell/openshell-vm/${vm_version}/instances/${VM_NAME}/rootfs" -} - -vm_exec() { - local rootfs_args=() - if [ -n "${VM_ROOTFS_DIR:-}" ]; then - rootfs_args=(--rootfs "${VM_ROOTFS_DIR}") - fi - "${GATEWAY_BIN}" "${rootfs_args[@]}" --name "${VM_NAME}" exec -- "$@" -} +COMPRESSED_DIR="${ROOT}/target/vm-runtime-compressed" +GATEWAY_BIN="${ROOT}/target/debug/openshell-gateway" +DRIVER_BIN="${ROOT}/target/debug/openshell-driver-vm" + +# The VM driver places `compute-driver.sock` under --vm-driver-state-dir. +# AF_UNIX SUN_LEN is 104 bytes on macOS (108 on Linux), so paths anchored +# in the workspace's `target/` blow the limit on typical developer +# machines — e.g. a ~100-char `~/.superset/worktrees/.../target/...` +# prefix plus the `compute-driver.sock` leaf leaves no room. macOS' +# per-user `$TMPDIR` (`/var/folders/xx/.../T/`) can be 50+ chars too, +# so root state under `/tmp` unconditionally to keep UDS paths short. +STATE_DIR_ROOT="/tmp" + +# Smoke test timeouts. First boot extracts the embedded libkrun runtime +# (~60–90MB of zstd per architecture) and the sandbox rootfs (~200MB). +# The guest then runs k3s-free sandbox supervisor startup; a cold +# microVM is typically ready within ~15s. +GATEWAY_READY_TIMEOUT=60 +SANDBOX_PROVISION_TIMEOUT=180 + +# ── Build prerequisites ────────────────────────────────────────────── + +if [ ! 
-f "${COMPRESSED_DIR}/rootfs.tar.zst" ]; then
+  echo "==> Building base VM rootfs tarball (mise run vm:rootfs -- --base)"
+  mise run vm:rootfs -- --base
+fi
 
-prepare_named_vm_rootfs() {
-  if [ -z "${VM_NAME}" ]; then
-    return 0
-  fi
+if [ ! -f "${COMPRESSED_DIR}/rootfs.tar.zst" ] \
+  || ! find "${COMPRESSED_DIR}" -maxdepth 1 -name 'libkrun*.zst' | grep -q .; then
+  echo "==> Preparing embedded VM runtime (mise run vm:setup)"
+  mise run vm:setup
+fi
 
-  echo "Preparing named VM rootfs '${VM_NAME}'..."
-  VM_ROOTFS_DIR="$("${ROOT}/tasks/scripts/vm/ensure-vm-rootfs.sh" --name "${VM_NAME}" \
-    | tail -n 1 | sed 's/^using openshell-vm rootfs at //')"
-  "${ROOT}/tasks/scripts/vm/sync-vm-rootfs.sh" --name "${VM_NAME}"
-}
+export OPENSHELL_VM_RUNTIME_COMPRESSED_DIR="${OPENSHELL_VM_RUNTIME_COMPRESSED_DIR:-${COMPRESSED_DIR}}"
+
+echo "==> Building openshell-gateway, openshell-driver-vm, openshell (CLI)"
+cargo build \
+  -p openshell-server \
+  -p openshell-driver-vm \
+  -p openshell-cli \
+  --features openshell-core/dev-settings
+
+if [ "$(uname -s)" = "Darwin" ]; then
+  echo "==> Codesigning openshell-driver-vm (Hypervisor entitlement)"
+  codesign \
+    --entitlements "${ROOT}/crates/openshell-driver-vm/entitlements.plist" \
+    --force \
+    -s - \
+    "${DRIVER_BIN}"
+fi
 
-refresh_vm_gateway() {
-  if [ -z "${VM_NAME}" ]; then
-    return 0
+# ── Pick a random free host port for the gateway ─────────────────────
+
+HOST_PORT="$(python3 -c 'import socket
+s = socket.socket()
+s.bind(("", 0))
+print(s.getsockname()[1])
+s.close()')"
+
+# Per-run state dir so concurrent e2e runs don't collide on the UDS or
+# sandbox state. The VM driver creates `<state-dir>/compute-driver.sock`
+# and `<state-dir>/sandboxes/<sandbox-id>/rootfs/` under here. Keep the
+# basename short — see the SUN_LEN comment above.
+RUN_STATE_DIR="${STATE_DIR_ROOT}/os-vm-e2e-${HOST_PORT}-$$" +mkdir -p "${RUN_STATE_DIR}" + +GATEWAY_LOG="$(mktemp /tmp/openshell-gateway-e2e.XXXXXX)" + +# ── Cleanup (trap) ─────────────────────────────────────────────────── + +cleanup() { + local exit_code=$? + + if [ -n "${GATEWAY_PID:-}" ] && kill -0 "${GATEWAY_PID}" 2>/dev/null; then + echo "Stopping openshell-gateway (pid ${GATEWAY_PID})..." + # SIGTERM first; gateway drops ManagedDriverProcess which SIGKILLs + # the driver and removes the UDS. Wait briefly, then force-kill. + kill -TERM "${GATEWAY_PID}" 2>/dev/null || true + for _ in 1 2 3 4 5 6 7 8 9 10; do + kill -0 "${GATEWAY_PID}" 2>/dev/null || break + sleep 0.5 + done + kill -KILL "${GATEWAY_PID}" 2>/dev/null || true + wait "${GATEWAY_PID}" 2>/dev/null || true fi - echo "Refreshing VM gateway StatefulSet image to ${VM_GATEWAY_IMAGE}..." - # Re-import the host-synced :dev image into the VM's containerd, then - # force a rollout when the StatefulSet already points at the same tag. - vm_exec sh -lc "set -eu; \ - image_tar='/${VM_GATEWAY_TAR_REL}'; \ - k3s ctr -n k8s.io images import \"\${image_tar}\" >/dev/null; \ - current_image=\$(kubectl -n openshell get statefulset/openshell -o jsonpath='{.spec.template.spec.containers[?(@.name==\"openshell\")].image}'); \ - if [ \"\${current_image}\" = \"${VM_GATEWAY_IMAGE}\" ]; then \ - kubectl -n openshell rollout restart statefulset/openshell >/dev/null; \ - else \ - kubectl -n openshell set image statefulset/openshell openshell=${VM_GATEWAY_IMAGE} >/dev/null; \ - fi; \ - kubectl -n openshell rollout status statefulset/openshell --timeout=300s" - echo "Gateway rollout complete." -} - -wait_for_gateway_health() { - local elapsed=0 timeout=60 consecutive_ok=0 - - echo "Waiting for refreshed gateway health..." 
- while [ "${elapsed}" -lt "${timeout}" ]; do - if "${ROOT}/target/debug/openshell" status >/dev/null 2>&1; then - consecutive_ok=$((consecutive_ok + 1)) - if [ "${consecutive_ok}" -ge 3 ]; then - echo "Gateway health confirmed after refresh." - return 0 - fi - else - consecutive_ok=0 - fi - - sleep 2 - elapsed=$((elapsed + 2)) - done - - echo "ERROR: refreshed gateway did not become healthy after ${timeout}s" - return 1 -} - -# ── Parse arguments ────────────────────────────────────────────────── -VM_PORT="" -VM_NAME="" -VM_ROOTFS_DIR="" -for arg in "$@"; do - case "$arg" in - --vm-port=*) VM_PORT="${arg#--vm-port=}" ;; - --vm-name=*) VM_NAME="${arg#--vm-name=}" ;; - *) echo "Unknown argument: $arg"; exit 1 ;; - esac -done + # On failure, keep the VM console log for debugging. We deliberately + # print it instead of leaving it on disk because the state dir gets + # wiped on success. + if [ "${exit_code}" -ne 0 ]; then + echo "=== gateway log (preserved for debugging) ===" + cat "${GATEWAY_LOG}" 2>/dev/null || true + echo "=== end gateway log ===" + + local console + while IFS= read -r -d '' console; do + echo "=== VM console log: ${console} ===" + cat "${console}" 2>/dev/null || true + echo "=== end VM console log ===" + done < <(find "${RUN_STATE_DIR}/sandboxes" -name 'rootfs-console.log' -print0 2>/dev/null) + fi -# ── Determine mode ─────────────────────────────────────────────────── -if [ -n "${VM_PORT}" ]; then - # Point at an already-running VM. - HOST_PORT="${VM_PORT}" - echo "Using existing VM on port ${HOST_PORT}." - if [ -n "${VM_NAME}" ]; then - prepare_named_vm_rootfs + rm -f "${GATEWAY_LOG}" 2>/dev/null || true + # Only wipe the per-run state dir on success. On failure, leave it for + # post-mortem (serial console logs, gvproxy logs, rootfs dumps). + if [ "${exit_code}" -eq 0 ]; then + rm -rf "${RUN_STATE_DIR}" 2>/dev/null || true + else + echo "NOTE: preserving ${RUN_STATE_DIR} for debugging" fi -else - # Pick a random free port and start a new VM. 
-  HOST_PORT=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()')
-  if [ -z "${VM_NAME}" ]; then
-    VM_NAME="e2e-${HOST_PORT}-$$"
+}
+trap cleanup EXIT
+
+# ── Launch the gateway + VM driver ───────────────────────────────────
+
+SSH_HANDSHAKE_SECRET="$(openssl rand -hex 32)"
+
+echo "==> Starting openshell-gateway on 127.0.0.1:${HOST_PORT} (state: ${RUN_STATE_DIR})"
+
+# Pin --driver-dir to the workspace `target/debug/` so we always pick up
+# the driver we just cargo-built. Without this, the gateway's
+# `resolve_compute_driver_bin` fallback prefers
+# `~/.local/libexec/openshell/openshell-driver-vm` when present
+# (install-vm.sh installs there), which silently shadows development
+# builds — a subtle source of stale-binary bugs in e2e runs.
+"${GATEWAY_BIN}" \
+  --drivers vm \
+  --disable-tls \
+  --disable-gateway-auth \
+  --db-url 'sqlite::memory:' \
+  --port "${HOST_PORT}" \
+  --grpc-endpoint "http://127.0.0.1:${HOST_PORT}" \
+  --ssh-handshake-secret "${SSH_HANDSHAKE_SECRET}" \
+  --driver-dir "${ROOT}/target/debug" \
+  --vm-driver-state-dir "${RUN_STATE_DIR}" \
+  >"${GATEWAY_LOG}" 2>&1 &
+GATEWAY_PID=$!
+
+# ── Wait for gateway readiness ─────────────────────────────────────
+#
+# The gateway logs `INFO openshell_server: Server listening
+# address=0.0.0.0:<port>` after its tonic listener is up. That is the
+# only signal the smoke test needs — the VM driver is spawned eagerly
+# but sandboxes are created on demand, so "Server listening" is the
+# right gate here.
+
+echo "==> Waiting for gateway readiness (timeout ${GATEWAY_READY_TIMEOUT}s)"
+elapsed=0
+while ! grep -q 'Server listening' "${GATEWAY_LOG}" 2>/dev/null; do
+  if ! kill -0 "${GATEWAY_PID}" 2>/dev/null; then
+    echo "ERROR: openshell-gateway exited before becoming ready"
+    exit 1
   fi
-
-  cleanup() {
-    local exit_code=$?
-    if [ -n "${VM_PID:-}" ] && kill -0 "$VM_PID" 2>/dev/null; then
-      echo "Stopping openshell-vm (pid ${VM_PID})..."
- kill "$VM_PID" 2>/dev/null || true - wait "$VM_PID" 2>/dev/null || true - fi - # On failure, preserve the VM console log for post-mortem debugging. - if [ "$exit_code" -ne 0 ] && [ -n "${VM_NAME:-}" ]; then - local console_log - console_log="$(named_vm_rootfs)-console.log" - if [ -f "$console_log" ]; then - echo "=== VM console log (preserved for debugging) ===" - cat "$console_log" - echo "=== end VM console log ===" - fi - fi - rm -f "${VM_LOG:-}" 2>/dev/null || true - if [ -n "${VM_NAME:-}" ]; then - rm -rf "$(dirname "$(named_vm_rootfs)")" 2>/dev/null || true - fi - } - trap cleanup EXIT - - prepare_named_vm_rootfs - - echo "Starting openshell-vm '${VM_NAME}' on port ${HOST_PORT}..." - if [ "$(uname -s)" = "Darwin" ]; then - export DYLD_FALLBACK_LIBRARY_PATH="${RUNTIME_DIR}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}" + if [ "${elapsed}" -ge "${GATEWAY_READY_TIMEOUT}" ]; then + echo "ERROR: openshell-gateway did not become ready after ${GATEWAY_READY_TIMEOUT}s" + exit 1 fi + sleep 1 + elapsed=$((elapsed + 1)) +done - VM_LOG=$(mktemp /tmp/openshell-vm-e2e.XXXXXX) - rootfs_args=() - if [ -n "${VM_ROOTFS_DIR}" ]; then - rootfs_args=(--rootfs "${VM_ROOTFS_DIR}") - fi - "${GATEWAY_BIN}" "${rootfs_args[@]}" --name "${VM_NAME}" --port "${HOST_PORT}:${GUEST_PORT}" 2>"${VM_LOG}" & - VM_PID=$! - - # ── Wait for full bootstrap (mTLS certs + gRPC health) ───────────── - # The VM prints "Ready [Xs total]" to stderr after bootstrap_gateway() - # stores mTLS certs and wait_for_gateway_ready() confirms the gRPC - # service is responding. Waiting only for TCP port reachability (nc -z) - # is insufficient because port forwarding is established before the - # mTLS certs are written, causing `openshell status` to fail. - echo "Waiting for VM bootstrap to complete (timeout ${TIMEOUT}s)..." - elapsed=0 - while ! grep -q "^Ready " "${VM_LOG}" 2>/dev/null; do - if ! 
kill -0 "$VM_PID" 2>/dev/null; then - echo "ERROR: openshell-vm exited before becoming ready" - echo "VM log:" - cat "${VM_LOG}" - exit 1 - fi - if [ "$elapsed" -ge "$TIMEOUT" ]; then - echo "ERROR: openshell-vm did not become ready after ${TIMEOUT}s" - echo "VM log:" - cat "${VM_LOG}" - exit 1 - fi - sleep 2 - elapsed=$((elapsed + 2)) - done - echo "Gateway is ready (${elapsed}s)." - echo "VM log:" - cat "${VM_LOG}" -fi +echo "==> Gateway ready after ${elapsed}s" -# ── Exec into the VM (when instance name is known) ─────────────────── -if [ -n "${VM_NAME}" ]; then - echo "Verifying openshell-vm exec for '${VM_NAME}'..." - exec_elapsed=0 - exec_timeout=60 - until vm_exec /bin/true; do - if [ "$exec_elapsed" -ge "$exec_timeout" ]; then - echo "ERROR: openshell-vm exec did not become ready after ${exec_timeout}s" - exit 1 - fi - sleep 2 - exec_elapsed=$((exec_elapsed + 2)) - done - echo "VM exec succeeded." -else - echo "Skipping openshell-vm exec check (provide --vm-name for existing VMs)." -fi +# ── Run the smoke test ─────────────────────────────────────────────── +# +# The CLI takes OPENSHELL_GATEWAY_ENDPOINT directly; no gateway +# metadata lookup needed when TLS is disabled. -refresh_vm_gateway +export OPENSHELL_GATEWAY_ENDPOINT="http://127.0.0.1:${HOST_PORT}" -# ── Run the smoke test ─────────────────────────────────────────────── -# The openshell CLI reads OPENSHELL_GATEWAY_ENDPOINT to connect to the -# gateway directly, and OPENSHELL_GATEWAY to resolve mTLS certs from -# ~/.config/openshell/gateways//mtls/. -# In the VM, the overlayfs snapshotter re-extracts all image layers on -# every boot. The 1GB sandbox base image extraction can take >300s -# under contention, so allow 600s for sandbox provisioning. 
-export OPENSHELL_PROVISION_TIMEOUT=600 -export OPENSHELL_GATEWAY_ENDPOINT="https://127.0.0.1:${HOST_PORT}" -if [ -n "${VM_NAME}" ]; then - export OPENSHELL_GATEWAY="openshell-vm-${VM_NAME}" -else - export OPENSHELL_GATEWAY="openshell-vm" -fi +# The VM driver creates each sandbox VM from scratch — the embedded +# rootfs is extracted per sandbox, and the guest's sandbox supervisor +# then initializes policy, netns, Landlock, and sshd. On a cold host +# this is ~15s; allow 180s for slower CI runners. +export OPENSHELL_PROVISION_TIMEOUT="${SANDBOX_PROVISION_TIMEOUT}" -echo "Running e2e smoke test (gateway: ${OPENSHELL_GATEWAY}, endpoint: ${OPENSHELL_GATEWAY_ENDPOINT})..." -cargo build -p openshell-cli --features openshell-core/dev-settings -wait_for_gateway_health -cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test smoke -- --nocapture +echo "==> Running e2e smoke test (endpoint: ${OPENSHELL_GATEWAY_ENDPOINT})" +cargo test \ + --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \ + --features e2e \ + --test smoke \ + -- --nocapture -echo "Smoke test passed." +echo "==> Smoke test passed." diff --git a/tasks/scripts/vm/smoke-orphan-cleanup.sh b/tasks/scripts/vm/smoke-orphan-cleanup.sh new file mode 100755 index 000000000..9a37861a0 --- /dev/null +++ b/tasks/scripts/vm/smoke-orphan-cleanup.sh @@ -0,0 +1,204 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Smoke test: start the gateway with the VM driver, create a sandbox, then +# signal the gateway (SIGTERM then SIGKILL) and verify that no driver, +# launcher, gvproxy, or libkrun worker processes survive. +# +# Exit codes: +# 0 — both SIGTERM and SIGKILL cleanup passed +# 1 — one or more scenarios leaked survivors + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." 
&& pwd)" +cd "$ROOT" + +PORT="${OPENSHELL_SERVER_PORT:-8091}" +XDG="${TMPDIR:-/tmp}/vm-orphan-xdg-$$" +STATE_DIR="${TMPDIR:-/tmp}/openshell-vm-orphan-$$" +LOG="${TMPDIR:-/tmp}/vm-orphan-$$.log" + +cleanup_stray() { + # Best-effort: kill anything left over from our sandbox ids so repeated + # runs don't accumulate. + pkill -9 -f "openshell-vm-orphan-$$" 2>/dev/null || true + rm -rf "$XDG" "$STATE_DIR" 2>/dev/null || true + # Preserve the gateway log only on failure so operators can diagnose. + if [ "${EXIT_CODE:-0}" -ne 0 ]; then + echo "(log preserved at $LOG)" >&2 + else + rm -f "$LOG" "$LOG.create" 2>/dev/null || true + fi +} +trap cleanup_stray EXIT + +build_binaries() { + echo "==> Ensuring binaries are built" + if [ ! -x "$ROOT/target/debug/openshell-gateway" ] || [ ! -x "$ROOT/target/debug/openshell-driver-vm" ]; then + cargo build -p openshell-server -p openshell-driver-vm >&2 + fi + if [ "$(uname -s)" = "Darwin" ]; then + codesign \ + --entitlements "$ROOT/crates/openshell-driver-vm/entitlements.plist" \ + --force -s - \ + "$ROOT/target/debug/openshell-driver-vm" >/dev/null 2>&1 || true + fi +} + +start_gateway() { + local health_port=$((PORT + 1)) + echo "==> Starting gateway on port $PORT (state=$STATE_DIR, health=$health_port)" + mkdir -p "$STATE_DIR" + OPENSHELL_SERVER_PORT="$PORT" \ + OPENSHELL_HEALTH_PORT="$health_port" \ + OPENSHELL_DB_URL="sqlite:$STATE_DIR/openshell.db" \ + OPENSHELL_DRIVERS=vm \ + OPENSHELL_DRIVER_DIR="$ROOT/target/debug" \ + OPENSHELL_GRPC_ENDPOINT="http://host.containers.internal:$PORT" \ + OPENSHELL_SSH_GATEWAY_HOST=127.0.0.1 \ + OPENSHELL_SSH_GATEWAY_PORT="$PORT" \ + OPENSHELL_SSH_HANDSHAKE_SECRET=dev-vm-driver-secret \ + OPENSHELL_VM_DRIVER_STATE_DIR="$STATE_DIR" \ + OPENSHELL_VM_RUNTIME_COMPRESSED_DIR="$ROOT/target/vm-runtime-compressed" \ + nohup "$ROOT/target/debug/openshell-gateway" --disable-tls \ + > "$LOG" 2>&1 & + GATEWAY_PID=$! 
+ echo "gateway pid=$GATEWAY_PID" + + for _ in $(seq 1 60); do + if grep -q "Server listening" "$LOG" 2>/dev/null; then + return 0 + fi + if ! kill -0 "$GATEWAY_PID" 2>/dev/null; then + echo "!! gateway died before ready" + tail -40 "$LOG" >&2 + return 1 + fi + sleep 1 + done + echo "!! gateway never reported ready" + tail -40 "$LOG" >&2 + return 1 +} + +create_sandbox() { + echo "==> Creating sandbox (--keep, long-running)" + mkdir -p "$XDG" + XDG_CONFIG_HOME="$XDG" "$ROOT/scripts/bin/openshell" gateway add \ + --name vm-orphan http://127.0.0.1:"$PORT" >/dev/null + XDG_CONFIG_HOME="$XDG" "$ROOT/scripts/bin/openshell" gateway select vm-orphan >/dev/null + + # Run the CLI in the background; it blocks waiting for sleep to finish. + XDG_CONFIG_HOME="$XDG" "$ROOT/scripts/bin/openshell" sandbox create \ + --name "orphan-$$" --keep -- sleep 99999 \ + > "$LOG.create" 2>&1 & + CLI_PID=$! + + for _ in $(seq 1 60); do + if pgrep -f "openshell-vm-orphan-$$|$STATE_DIR/sandboxes/" >/dev/null 2>&1; then + if pgrep -f gvproxy >/dev/null 2>&1; then + echo "sandbox came up (cli pid=$CLI_PID)" + return 0 + fi + fi + sleep 2 + done + echo "!! sandbox never came up" + tail -40 "$LOG" "$LOG.create" >&2 2>/dev/null || true + return 1 +} + +snapshot_kids() { + # Return all PIDs whose --state-dir or --vm-rootfs references our + # per-run directory, plus any gvproxy that mentions our socket base. 
+ pgrep -fl "state-dir $STATE_DIR|$STATE_DIR/sandboxes" 2>/dev/null || true + pgrep -fl "gvproxy" 2>/dev/null | grep "osd-gv" || true +} + +count_alive() { + local alive + alive=$(pgrep -f "state-dir $STATE_DIR|$STATE_DIR/sandboxes" 2>/dev/null | wc -l | tr -d ' ') + local gv + gv=$(pgrep -f 'gvproxy' 2>/dev/null | xargs -r ps -o pid=,command= -p 2>/dev/null | grep -c 'osd-gv' || true) + echo $((alive + gv)) +} + +verify_cleanup() { + local label="$1" + local deadline="$2" + local waited=0 + while [ "$waited" -lt "$deadline" ]; do + local n + n=$(count_alive) + if [ "$n" = "0" ]; then + echo " PASS ($label): all descendants gone after ${waited}s" + return 0 + fi + sleep 1 + waited=$((waited + 1)) + done + echo " FAIL ($label): $(count_alive) descendants still alive after ${deadline}s:" + snapshot_kids | sed 's/^/ /' + return 1 +} + +run_scenario() { + local signal="$1" + local label="$2" + echo "======================================================" + echo "Scenario: $label (signal $signal)" + echo "======================================================" + + start_gateway || return 1 + create_sandbox || { kill -9 "$GATEWAY_PID" 2>/dev/null; return 1; } + + echo "-- process tree before signal --" + snapshot_kids | sed 's/^/ /' + echo + + echo "-> kill -$signal $GATEWAY_PID" + kill "-$signal" "$GATEWAY_PID" 2>/dev/null || true + + verify_cleanup "$label" 15 + local rc=$? + + # Belt-and-braces teardown between scenarios. + pkill -9 -f "$STATE_DIR/sandboxes|$STATE_DIR " 2>/dev/null || true + pkill -9 -f 'gvproxy.*osd-gv' 2>/dev/null || true + rm -rf "$STATE_DIR" /tmp/osd-gv "$XDG" 2>/dev/null || true + # CLI may still be running; reap it. + kill "${CLI_PID:-0}" 2>/dev/null || true + sleep 1 + + return $rc +} + +main() { + build_binaries + local overall=0 + + # Clean starting state. + pkill -9 -f 'openshell-gateway|openshell-driver-vm' 2>/dev/null || true + pkill -9 -f 'gvproxy.*osd-gv' 2>/dev/null || true + sleep 1 + + if ! 
run_scenario TERM "graceful SIGTERM"; then + overall=1 + fi + + if ! run_scenario KILL "abrupt SIGKILL"; then + overall=1 + fi + + if [ "$overall" -eq 0 ]; then + echo "ALL SCENARIOS PASSED" + else + echo "ONE OR MORE SCENARIOS FAILED" + fi + EXIT_CODE=$overall + return $overall +} + +main "$@" diff --git a/tasks/scripts/vm/vm-setup.sh b/tasks/scripts/vm/vm-setup.sh index e7ae06d08..bccb7f754 100755 --- a/tasks/scripts/vm/vm-setup.sh +++ b/tasks/scripts/vm/vm-setup.sh @@ -128,4 +128,4 @@ echo " Compressed artifacts in: ${OUTPUT_DIR}" echo "" echo "Next steps:" echo " mise run vm:rootfs --base # build rootfs (requires Docker)" -echo " mise run vm # build and run the VM" +echo " mise run gateway:vm # start openshell-gateway with the VM driver" diff --git a/tasks/test.toml b/tasks/test.toml index f24ea6f2b..cf45d2b6b 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -49,6 +49,5 @@ env = { UV_NO_SYNC = "1", PYTHONPATH = "python" } run = "uv run pytest -o python_files='test_*.py' -m gpu -n ${E2E_PARALLEL:-1} e2e/python" ["e2e:vm"] -description = "Boot openshell-vm and run smoke e2e (macOS ARM64; pass -- --vm-port=N [--vm-name=NAME] to reuse)" -depends = ["build:docker:gateway", "vm:build"] +description = "Start openshell-gateway with the VM compute driver and run the cluster-agnostic smoke e2e" run = "e2e/rust/e2e-vm.sh" diff --git a/tasks/vm.toml b/tasks/vm.toml index ca06b08c1..0a44b4ff7 100644 --- a/tasks/vm.toml +++ b/tasks/vm.toml @@ -5,22 +5,25 @@ # # Workflow: # mise run vm:setup # one-time: download pre-built runtime (~30s) -# mise run vm # build + run the VM +# mise run gateway:vm # start openshell-gateway with the VM driver +# mise run vm # build + run the standalone openshell-vm microVM # mise run vm:clean # wipe everything and start over # -# See crates/openshell-vm/README.md for full documentation. +# See crates/openshell-driver-vm/README.md for the `gateway:vm` flow and +# crates/openshell-vm/README.md for the standalone microVM path. 
# ═══════════════════════════════════════════════════════════════════════════ # Main Commands # ═══════════════════════════════════════════════════════════════════════════ +["gateway:vm"] +description = "Build openshell-gateway + openshell-driver-vm and start the gateway with the VM driver" +run = "crates/openshell-driver-vm/start.sh" + [vm] -description = "Build and run the openshell-vm microVM" +description = "Build and run the standalone openshell-vm microVM" depends = ["build:docker:gateway"] -run = [ - "mise run vm:build", - "tasks/scripts/vm/run-vm.sh", -] +run = ["mise run vm:build", "tasks/scripts/vm/run-vm.sh"] ["vm:build"] description = "Build the openshell-vm binary with embedded runtime" @@ -42,3 +45,7 @@ run = "tasks/scripts/vm/build-rootfs-tarball.sh" ["vm:clean"] description = "Remove all VM cached artifacts (runtime, rootfs, builds)" run = "tasks/scripts/vm/vm-clean.sh" + +["vm:smoke:orphan-cleanup"] +description = "Smoke test: start gateway+driver, create a sandbox, signal the gateway, assert no orphaned processes survive" +run = "tasks/scripts/vm/smoke-orphan-cleanup.sh"