From 94f1bc075b836f918378731964167c5b840e8713 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sun, 19 Apr 2026 21:47:00 -0700 Subject: [PATCH 1/2] feat(vm): add OCI container execution via overlay and pivot_root Add host-side OCI pipeline to the VM compute driver so sandboxes can boot from a user-specified `template.image` without shipping Docker inside the guest. The driver pulls and flattens the image, injects OpenShell compatibility files (sandbox user, /sandbox, /tmp, stub /etc/{hosts,resolv.conf}), and builds a read-only squashfs cached per `(manifest digest, platform)`. Sandbox-create attaches that RO base plus a per-sandbox raw state disk; the guest init mounts both as an overlay, bind-mounts the workspace over `/sandbox`, pivot_roots into the merged view, then execs an unmodified `openshell-sandbox` with the OCI argv/env/workdir. Supervisor gains a container-mode clean-env baseline gated on `OPENSHELL_CONTAINER_MODE=1`: the child process starts with an empty environ, then receives only the documented allowlist (container env from the OCI merge, provider env, proxy env, TLS env, minimal shell defaults), so control-plane `OPENSHELL_*` vars never leak to workloads. The gateway plumbs `--default-image` (from `sandbox_image`) and `--mksquashfs-bin` into the VM-driver subprocess so `GetCapabilities.default_image` stays in sync and OCI sandboxes work without relying on env inheritance. Guest init resolves block devices by libkrun-assigned serial under `/sys/block/vd*/serial` instead of hardcoded `/dev/vda`/`/dev/vdb`, with the older behavior kept as a fallback for guest kernels that don't expose serials. Scope and limits (v1): - Public OCI registries only, linux/amd64 or linux/arm64 matching the host. The OCI `User` field is ignored; workloads always run as `sandbox:sandbox`. - The shared RO base cache is not GC'd automatically; operators manage `/oci-cache/` themselves. 
- The fixed guest VM rootfs stays as the control-plane image; we never boot the user's OCI image as the guest OS. Unit and integration tests cover: layer flattening with whiteouts, compat injection idempotence, squashfs build + cache round-trip, OCI-config precedence rules (Entrypoint+Cmd, workdir fallback, env merge), driver argv wiring for `--default-image` and `--mksquashfs-bin`, and `resolve_oci_launch` preflight error paths (unsupported host, missing mksquashfs, no image requested). --- Cargo.lock | 213 ++++++++- architecture/vm-driver.md | 196 ++++++++ crates/openshell-driver-vm/Cargo.toml | 10 + .../scripts/openshell-vm-sandbox-init.sh | 172 +++++++ crates/openshell-driver-vm/src/driver.rs | 430 +++++++++++++++++- crates/openshell-driver-vm/src/ffi.rs | 25 + crates/openshell-driver-vm/src/lib.rs | 10 +- crates/openshell-driver-vm/src/main.rs | 59 ++- crates/openshell-driver-vm/src/oci/cache.rs | 288 ++++++++++++ crates/openshell-driver-vm/src/oci/client.rs | 241 ++++++++++ crates/openshell-driver-vm/src/oci/compat.rs | 197 ++++++++ crates/openshell-driver-vm/src/oci/flatten.rs | 332 ++++++++++++++ .../openshell-driver-vm/src/oci/fs_image.rs | 156 +++++++ .../openshell-driver-vm/src/oci/metadata.rs | 336 ++++++++++++++ crates/openshell-driver-vm/src/oci/mod.rs | 21 + .../openshell-driver-vm/src/oci/pipeline.rs | 147 ++++++ crates/openshell-driver-vm/src/runtime.rs | 84 ++++ crates/openshell-driver-vm/src/state_disk.rs | 260 +++++++++++ .../tests/oci_pipeline_integration.rs | 145 ++++++ crates/openshell-sandbox/src/container_env.rs | 248 ++++++++++ crates/openshell-sandbox/src/lib.rs | 1 + crates/openshell-sandbox/src/process.rs | 21 +- crates/openshell-server/src/cli.rs | 8 + crates/openshell-server/src/compute/vm.rs | 214 ++++++++- 24 files changed, 3769 insertions(+), 45 deletions(-) create mode 100644 architecture/vm-driver.md create mode 100644 crates/openshell-driver-vm/src/oci/cache.rs create mode 100644 crates/openshell-driver-vm/src/oci/client.rs 
create mode 100644 crates/openshell-driver-vm/src/oci/compat.rs create mode 100644 crates/openshell-driver-vm/src/oci/flatten.rs create mode 100644 crates/openshell-driver-vm/src/oci/fs_image.rs create mode 100644 crates/openshell-driver-vm/src/oci/metadata.rs create mode 100644 crates/openshell-driver-vm/src/oci/mod.rs create mode 100644 crates/openshell-driver-vm/src/oci/pipeline.rs create mode 100644 crates/openshell-driver-vm/src/state_disk.rs create mode 100644 crates/openshell-driver-vm/tests/oci_pipeline_integration.rs create mode 100644 crates/openshell-sandbox/src/container_env.rs diff --git a/Cargo.lock b/Cargo.lock index 4b29a0c7f..b67ffe5eb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -402,6 +402,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd307490d624467aa6f74b0eabb77633d1f758a7b25f12bceb0b22e08d9726f6" +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.21.7" @@ -808,6 +814,27 @@ version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c" +[[package]] +name = "const_format" +version = "0.2.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4481a617ad9a412be3b97c5d403fef8ed023103368908b9c50af598ff467cc1e" +dependencies = [ + "const_format_proc_macros", + "konst", +] + +[[package]] +name = "const_format_proc_macros" +version = "0.2.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d57c2eccfb16dbac1f4e61e206105db5820c9d26c3c472bc17c774259ef7744" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + [[package]] name = "constant_time_eq" version = "0.4.2" @@ -1166,6 +1193,37 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = 
"derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn 2.0.117", +] + [[package]] name = "dialoguer" version = "0.11.0" @@ -1633,6 +1691,18 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "getset" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cf0fc11e47561d47397154977bc219f4cf809b2974facc3ccb3b89e2436f912" +dependencies = [ + "proc-macro-error2", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "ghash" version = "0.5.1" @@ -1837,6 +1907,15 @@ dependencies = [ "itoa", ] +[[package]] +name = "http-auth" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "150fa4a9462ef926824cf4519c84ed652ca8f4fbae34cb8af045b5cbcaf98822" +dependencies = [ + "memchr", +] + [[package]] name = "http-body" version = "1.0.1" @@ -2349,6 +2428,21 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "jwt" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6204285f77fe7d9784db3fdc449ecce1a0114927a51d5a41c4c7a292011c015f" +dependencies = [ + "base64 0.13.1", + "crypto-common 0.1.7", + "digest 0.10.7", + "hmac", + "serde", + "serde_json", + "sha2 0.10.9", +] + [[package]] name = "k8s-openapi" version 
= "0.21.1" @@ -2362,6 +2456,21 @@ dependencies = [ "serde_json", ] +[[package]] +name = "konst" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "128133ed7824fcd73d6e7b17957c5eb7bacb885649bd8c69708b2331a10bcefb" +dependencies = [ + "konst_macro_rules", +] + +[[package]] +name = "konst_macro_rules" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4933f3f57a8e9d9da04db23fb153356ecaf00cbd14aee46279c33dc80925c37" + [[package]] name = "kube" version = "0.90.0" @@ -2966,6 +3075,60 @@ dependencies = [ "memchr", ] +[[package]] +name = "oci-client" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b74df13319e08bc386d333d3dc289c774c88cc543cae31f5347db07b5ec2172" +dependencies = [ + "bytes", + "chrono", + "futures-util", + "http", + "http-auth", + "jwt", + "lazy_static", + "oci-spec", + "olpc-cjson", + "regex", + "reqwest", + "serde", + "serde_json", + "sha2 0.10.9", + "thiserror 2.0.18", + "tokio", + "tracing", + "unicase", +] + +[[package]] +name = "oci-spec" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc3da52b83ce3258fbf29f66ac784b279453c2ac3c22c5805371b921ede0d308" +dependencies = [ + "const_format", + "derive_builder", + "getset", + "regex", + "serde", + "serde_json", + "strum 0.27.2", + "strum_macros 0.27.2", + "thiserror 2.0.18", +] + +[[package]] +name = "olpc-cjson" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "696183c9b5fe81a7715d074fd632e8bd46f4ccc0231a3ed7fc580a80de5f7083" +dependencies = [ + "serde", + "serde_json", + "unicode-normalization", +] + [[package]] name = "once_cell" version = "1.21.4" @@ -3095,14 +3258,21 @@ name = "openshell-driver-vm" version = "0.0.0" dependencies = [ "clap", + "flate2", "futures", "libc", "libloading", "miette", "nix", + "oci-client", "openshell-core", "prost-types", + "serde", + 
"serde_json", + "sha2 0.10.9", "tar", + "tempfile", + "thiserror 2.0.18", "tokio", "tokio-stream", "tonic", @@ -3996,7 +4166,7 @@ dependencies = [ "lru", "paste", "stability", - "strum", + "strum 0.26.3", "unicode-segmentation", "unicode-truncate", "unicode-width 0.1.14", @@ -4112,12 +4282,14 @@ dependencies = [ "sync_wrapper", "tokio", "tokio-rustls", + "tokio-util", "tower 0.5.3", "tower-http 0.6.8", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", + "wasm-streams", "web-sys", "webpki-roots 1.0.6", ] @@ -5102,9 +5274,15 @@ version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" dependencies = [ - "strum_macros", + "strum_macros 0.26.4", ] +[[package]] +name = "strum" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" + [[package]] name = "strum_macros" version = "0.26.4" @@ -5118,6 +5296,18 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "strum_macros" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "subtle" version = "2.6.1" @@ -5764,6 +5954,12 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" +[[package]] +name = "unicase" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" + [[package]] name = "unicode-bidi" version = "0.3.18" @@ -6035,6 +6231,19 @@ dependencies = [ "wasmparser", ] +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wasmparser" version = "0.244.0" diff --git a/architecture/vm-driver.md b/architecture/vm-driver.md new file mode 100644 index 000000000..ea5baaf1b --- /dev/null +++ b/architecture/vm-driver.md @@ -0,0 +1,196 @@ +# VM Compute Driver + +> Status: Experimental. The VM compute driver is a second-generation +> compute backend for OpenShell sandboxes. Kubernetes remains the default. + +## Overview + +`openshell-driver-vm` is an in-process compute driver that runs each +sandbox as a libkrun microVM on the host. Unlike the Kubernetes driver, +it has no orchestrator dependency — the driver is a single binary that +exposes the `ComputeDriver` gRPC service and manages VMs directly. + +A sandbox spec can optionally include `template.image`, an OCI image +reference. When set, the driver treats the image as the **sandbox +payload** (the user's container filesystem), not the guest OS. The fixed +libkrun guest rootfs still boots the control plane (init script, +supervisor, SSH); the OCI image is mounted as an overlay and the +guest init script `pivot_root`s into it before exec'ing the supervisor, +which then launches the image entrypoint. + +## OCI container execution model + +``` +┌───────────────────────────── Host ──────────────────────────────┐ +│ │ +│ openshell-driver-vm │ +│ └─ OCI manager │ +│ ├─ oci-client: pull manifest, config, layers │ +│ ├─ flatten layers (apply whiteouts) │ +│ ├─ inject sandbox user, /sandbox, /tmp, placeholder etc. 
│ +│ ├─ build squashfs via mksquashfs (zstd) │ +│ └─ cache under /oci-cache/ │ +│ blobs/, fs/..squashfs, meta/*.json │ +│ │ +│ Per-sandbox state dir │ +│ ├─ sandbox-state.raw (sparse ext4 upper + workdir) │ +│ └─ rootfs-console.log │ +│ │ +│ ▼ krun_add_disk3 × 2 + set_exec env│ +├─────────────────────────── Guest VM ────────────────────────────┤ +│ │ +│ /dev/vda = RO base squashfs ──mount ro──▶ /base │ +│ /dev/vdb = sandbox-state.raw ──mkfs.ext4─▶ /state │ +│ │ +│ overlay (lowerdir=/base, upperdir=/state/upper, │ +│ workdir=/state/work) ──▶ /state/merged │ +│ /state/workspace ──bind──▶ /state/merged/sandbox │ +│ │ +│ pivot_root /state/merged ──▶ supervisor sees overlay as `/` │ +│ │ +│ openshell-sandbox --workdir -- │ +│ └─ policy, Landlock, seccomp, SSH, OCSF logging as usual │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Host pipeline + +`crates/openshell-driver-vm/src/oci/` owns the host pipeline. The +top-level entrypoint is `oci::prepare(puller, cache, build_opts, +image_ref, env_overrides)`: + +| Module | Responsibility | +|---|---| +| `client.rs` | Anonymous pull via `oci-client` with a platform resolver pinned to `linux/amd64` or `linux/arm64`. Normalizes the OCI image config into `ImageConfig`. | +| `flatten.rs` | Applies OCI layer tars in order with whiteout handling (`.wh.*`, `.wh..wh..opq`). Rejects absolute/parent-traversal paths. Dispatches on media type (`tar`, `tar+gzip`). | +| `compat.rs` | Injects `sandbox:10001:10001` into `/etc/passwd` + `/etc/group`, ensures `/sandbox` (0755) and `/tmp` (1777) exist, writes placeholder `/etc/hosts` and `/etc/resolv.conf`. Idempotent. Picks best shell (`/bin/sh` → `/sbin/nologin` → `/bin/false`). | +| `fs_image.rs` | Shells out to `mksquashfs` with explicit binary path (no `$PATH` reliance), zstd by default. | +| `cache.rs` | Content-addressed layout `blobs/ + fs/..squashfs + meta/..json + tmp/`. Atomic writes; idempotent `lookup()` + `install_fs_image()`. 
| +| `metadata.rs` | `LaunchMetadata::build` — argv = `Entrypoint + Cmd` (precedence), workdir fallback `/sandbox`, env merge `OCI < template < spec`. `to_guest_env_vars()` packs into `OPENSHELL_OCI_ARGC/ARGV_/ENV_COUNT/ENV_/WORKDIR`. | +| `pipeline.rs` | End-to-end orchestrator. On cache hit, zero network I/O. On miss: pull → flatten → inject → build → install. | + +Cache is keyed by `(manifest digest, platform)`. Repeated launches of +the same image skip pull and rebuild entirely — the driver just attaches +the cached squashfs to the VM. + +### Guest init and pivot + +`crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh` is the +guest's PID 1. OCI mode is gated on `OPENSHELL_OCI_ARGC` being set in +the guest environ (delivered via libkrun `set_exec`). When set, +`oci_launch_supervisor`: + +1. Resolves the RO base device by its virtio-blk serial `oci-base` + (read from `/sys/block/vd*/serial`; overridable via + `OPENSHELL_VM_OCI_BASE_DEVICE`, falling back to `/dev/vda` when no + serial matches) and mounts it read-only at `/base`. +2. Resolves the state device by serial `sandbox-state` (overridable via + `OPENSHELL_VM_STATE_DEVICE`, falling back to `/dev/vdb`), formats it + with ext4 if it has no filesystem yet (first boot), and mounts it at + `/state`. +3. Creates `/state/upper`, `/state/work`, `/state/merged`, and + `/state/workspace`. +4. Mounts overlay + `lowerdir=/base,upperdir=/state/upper,workdir=/state/work` at + `/state/merged`. +5. Bind-mounts `/state/workspace` over the image's `/sandbox` so the + workdir is writable on the state disk. +6. Synthesizes `/etc/resolv.conf` if the image didn't ship one. +7. Copies the gateway-issued TLS CA (if `$OPENSHELL_TLS_CA` is set) + into `/opt/openshell/tls/ca.crt` inside the overlay so post-pivot + SSL trust paths stay valid. +8. Copies the supervisor binary into the upper layer (reaches the state + disk, not the RO base). +9. Bind-mounts `/proc`, `/sys`, `/dev` into the overlay. +10. Bind-mounts `/state/merged` onto itself, `pivot_root`s into it, and + lazy-unmounts the old root. +11. Translates `OPENSHELL_OCI_ENV_` → `OPENSHELL_CONTAINER_ENV_`, + sets `OPENSHELL_CONTAINER_MODE=1`, and unsets the OCI source vars. +12. 
Reconstructs argv from `OPENSHELL_OCI_ARGV_` and execs + `openshell-sandbox --workdir "$OCI_WORKDIR" -- `. + +### Supervisor clean-env mode + +`crates/openshell-sandbox/src/container_env.rs` gates on +`OPENSHELL_CONTAINER_MODE=1`. When active, the supervisor calls +`Command::env_clear()` on the child and applies only the documented +allowlist: + +- `HOME=/sandbox`, `PATH=`, `TERM=xterm` +- Container env from `OPENSHELL_CONTAINER_ENV_` (OCI + template/spec + merge) +- `OPENSHELL_SANDBOX=1` (applied last — images cannot override the + marker) +- Provider env, proxy env, TLS env from policy (layered on top by the + existing spawn path) + +Control-plane vars (`OPENSHELL_SSH_HANDSHAKE_SECRET`, driver internals, +etc.) never reach the child process. When `OPENSHELL_CONTAINER_MODE` is +unset, the supervisor keeps its historical env-inheritance behavior. + +## Storage: shared RO base + per-sandbox CoW + +The overlay design replaces an earlier "unpack fresh tar per sandbox" +model that's still described in the initial plan: + +- **Base**: one squashfs per `(manifest digest, platform)`, shared + across every sandbox that uses the image. Never deleted by the + per-sandbox delete path. +- **Upper + workdir**: per-sandbox ext4 on `sandbox-state.raw`. Sparse + 16 GiB default, grows on first write. Deleted with the sandbox state + dir on `DeleteSandbox`. +- **Workspace**: `/state/workspace` bind-mounted over the image's + `/sandbox`. Persists alongside the state disk. + +Cold start for a repeat launch of the same image is near-instant: a +block attach and two mounts; no registry round-trip, no layer +flattening, no squashfs build. + +GC of the RO base cache is out of scope for v1. Operators must manage +`<state-dir>/oci-cache/fs/*` and `<state-dir>/oci-cache/blobs/**` +manually if they need to reclaim space (`<state-dir>` is the driver +state root, `OPENSHELL_VM_DRIVER_STATE_DIR`). + +## Driver configuration + +| Flag / env var | Meaning | +|---|---| +| `--default-image` / `OPENSHELL_VM_DRIVER_DEFAULT_IMAGE` | Image used when a sandbox spec omits `template.image`. 
Advertised via `GetCapabilities.default_image`. Empty string disables defaulting — sandboxes without an image fall through to the legacy (non-OCI) guest-rootfs supervisor. | +| `--mksquashfs-bin` / `OPENSHELL_VM_MKSQUASHFS` | Path to the `mksquashfs` binary. Required for OCI sandboxes. Unset → OCI requests are rejected with `FailedPrecondition`. | +| `OPENSHELL_VM_DRIVER_STATE_DIR` | Root for per-sandbox state and `oci-cache/`. | + +`GetCapabilities` now reports: + +```json +{ + "driver_name": "openshell-driver-vm", + "driver_version": "", + "default_image": "", + "supports_gpu": false +} +``` + +## v1 scope and assumptions + +- Public OCI registries only. No authentication. +- Linux images only. `linux/amd64` or `linux/arm64` matching the host. +- One image per sandbox. No init containers or sidecars. +- The entrypoint always runs as `sandbox:sandbox` (UID/GID 10001). The + OCI `User` field is ignored in v1. +- `template.agent_socket_path`, `template.platform_config`, and + `template.resources` are still rejected by the VM driver. +- Sandbox lifetime is the entrypoint lifetime: when the OCI entrypoint + exits, the sandbox transitions to exited/error. +- GPU is unsupported. +- Squashfs is the fs-image format. erofs is a candidate for later. +- No automatic cache GC. + +## Related files + +- `crates/openshell-driver-vm/src/driver.rs` — gRPC surface + + sandbox lifecycle. +- `crates/openshell-driver-vm/src/runtime.rs` — libkrun launch, disk + + vsock wiring. +- `crates/openshell-driver-vm/src/ffi.rs` — `libkrun` symbol loader. +- `crates/openshell-driver-vm/src/state_disk.rs` — sparse state disk + create/grow + secure import socket dir. +- `crates/openshell-driver-vm/src/oci/` — OCI pipeline. +- `crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh` — + guest init + `oci_launch_supervisor`. +- `crates/openshell-sandbox/src/container_env.rs` — supervisor + clean-env baseline for container mode. 
diff --git a/crates/openshell-driver-vm/Cargo.toml b/crates/openshell-driver-vm/Cargo.toml index 368716ef9..8e90d8607 100644 --- a/crates/openshell-driver-vm/Cargo.toml +++ b/crates/openshell-driver-vm/Cargo.toml @@ -36,6 +36,16 @@ libc = "0.2" libloading = "0.8" tar = "0.4" zstd = "0.13" +serde = { workspace = true } +serde_json = { workspace = true } +thiserror = { workspace = true } +sha2 = "0.10" +flate2 = "1" +tempfile = "3" +oci-client = { version = "0.15", default-features = false, features = ["rustls-tls"] } + +[dev-dependencies] +tempfile = "3" [lints] workspace = true diff --git a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh index 70dda5acb..ed6f433b2 100644 --- a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh +++ b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh @@ -72,6 +72,170 @@ tcp_probe() { fi } +resolve_block_device_by_serial() { + # libkrun's `krun_add_disk3` exposes the caller-supplied block_id as the + # virtio-blk serial, which Linux surfaces at /sys/block//serial. + # Walk virtio-blk devices (vd*) and return the /dev path whose serial + # matches $1. This makes the guest tolerant to attach-order changes. + local target_serial="$1" + local block + for block in /sys/block/vd*; do + [ -d "$block" ] || continue + local serial_file="$block/serial" + [ -r "$serial_file" ] || continue + local serial + serial=$(cat "$serial_file" 2>/dev/null || true) + if [ "$serial" = "$target_serial" ]; then + printf '/dev/%s\n' "$(basename "$block")" + return 0 + fi + done + return 1 +} + +oci_launch_supervisor() { + # Enter OCI overlay mode: mount the shared read-only squashfs base plus a + # per-sandbox ext4 upper, overlay them, pivot_root into the merged view, + # then exec the supervisor post-pivot so container paths like /sandbox and + # /tmp are the real paths from the supervisor's POV. 
+ + # Prefer block-ID resolution so the mount points don't silently break if + # libkrun ever changes virtio-blk attach order. Env var overrides are kept + # for operator escape hatches and test harnesses. + local base_dev="${OPENSHELL_VM_OCI_BASE_DEVICE:-}" + local state_dev="${OPENSHELL_VM_STATE_DEVICE:-}" + + if [ -z "$base_dev" ]; then + base_dev=$(resolve_block_device_by_serial "oci-base" || true) + fi + if [ -z "$state_dev" ]; then + state_dev=$(resolve_block_device_by_serial "sandbox-state" || true) + fi + + # Fall back to attach-order defaults only when serial lookup returns nothing + # (older guest kernels or missing /sys/block//serial). + if [ -z "$base_dev" ]; then + ts "WARNING: could not resolve RO base by serial=oci-base; falling back to /dev/vda" + base_dev=/dev/vda + fi + if [ -z "$state_dev" ]; then + ts "WARNING: could not resolve state disk by serial=sandbox-state; falling back to /dev/vdb" + state_dev=/dev/vdb + fi + + if [ ! -b "$base_dev" ]; then + ts "ERROR: OCI base device $base_dev not found" + exit 1 + fi + if [ ! -b "$state_dev" ]; then + ts "ERROR: OCI state device $state_dev not found" + exit 1 + fi + + ts "OCI block devices resolved: base=$base_dev state=$state_dev" + + mkdir -p /base /state + if ! mount -o ro "$base_dev" /base 2>/dev/null; then + ts "ERROR: failed to mount read-only base $base_dev at /base" + exit 1 + fi + + if ! blkid "$state_dev" >/dev/null 2>&1; then + ts "formatting sandbox state disk $state_dev" + mkfs.ext4 -F -q -L openshell-sandbox-state "$state_dev" >/dev/null 2>&1 || { + ts "ERROR: mkfs.ext4 failed on $state_dev" + exit 1 + } + fi + if ! mount -o noatime "$state_dev" /state 2>/dev/null; then + ts "ERROR: failed to mount state disk $state_dev at /state" + exit 1 + fi + + mkdir -p /state/upper /state/work /state/merged /state/workspace + if ! 
mount -t overlay overlay \ + -o "lowerdir=/base,upperdir=/state/upper,workdir=/state/work" \ + /state/merged 2>/dev/null; then + ts "ERROR: failed to mount overlay at /state/merged" + exit 1 + fi + + # The image's /sandbox is RO (it lives in the base); bind the writable + # workspace over it so the container process can write to /sandbox. + mkdir -p /state/merged/sandbox + mount --bind /state/workspace /state/merged/sandbox + + # Synthesize /etc/resolv.conf inside the image if the image does not + # provide one; reuse the guest's DHCP-populated one. + if [ ! -s /state/merged/etc/resolv.conf ] && [ -s /etc/resolv.conf ]; then + mkdir -p /state/merged/etc + cp /etc/resolv.conf /state/merged/etc/resolv.conf 2>/dev/null || true + fi + + # Mirror TLS CA bundle into the merged view so SSL trust survives the pivot. + if [ -n "${OPENSHELL_TLS_CA:-}" ] && [ -f "$OPENSHELL_TLS_CA" ]; then + mkdir -p /state/merged/opt/openshell/tls + cp "$OPENSHELL_TLS_CA" /state/merged/opt/openshell/tls/ca.crt 2>/dev/null || true + fi + + # Supervisor binary must be reachable post-pivot. Copy it into the upper + # layer (writes land on the state disk, not the RO base). + mkdir -p /state/merged/opt/openshell/bin + if [ ! -x /state/merged/opt/openshell/bin/openshell-sandbox ]; then + cp /opt/openshell/bin/openshell-sandbox \ + /state/merged/opt/openshell/bin/openshell-sandbox + chmod 0755 /state/merged/opt/openshell/bin/openshell-sandbox + fi + + # Ensure the kernel pseudo-filesystems are available after pivot. + mkdir -p /state/merged/proc /state/merged/sys /state/merged/dev + mount --bind /proc /state/merged/proc 2>/dev/null || true + mount --bind /sys /state/merged/sys 2>/dev/null || true + mount --bind /dev /state/merged/dev 2>/dev/null || true + + # pivot_root requires the new root to be a mount point distinct from the + # current root, so bind-mount /state/merged onto itself. + mount --bind /state/merged /state/merged + mkdir -p /state/merged/.old_root + cd /state/merged + pivot_root . 
.old_root + cd / + umount -l /.old_root 2>/dev/null || true + rmdir /.old_root 2>/dev/null || true + + # Translate OCI metadata env into the supervisor's container-mode contract. + local env_count="${OPENSHELL_OCI_ENV_COUNT:-0}" + export OPENSHELL_CONTAINER_ENV_COUNT="$env_count" + local idx=0 + while [ "$idx" -lt "$env_count" ]; do + local src_var="OPENSHELL_OCI_ENV_$idx" + export "OPENSHELL_CONTAINER_ENV_$idx=${!src_var:-}" + unset "$src_var" + idx=$((idx + 1)) + done + export OPENSHELL_CONTAINER_MODE=1 + + local argc="${OPENSHELL_OCI_ARGC:-0}" + if [ "$argc" -lt 1 ]; then + ts "ERROR: OCI image has no runnable command (argc=0)" + exit 1 + fi + local -a argv=() + idx=0 + while [ "$idx" -lt "$argc" ]; do + local src_var="OPENSHELL_OCI_ARGV_$idx" + argv+=("${!src_var:-}") + unset "$src_var" + idx=$((idx + 1)) + done + + local workdir="${OPENSHELL_OCI_WORKDIR:-/sandbox}" + unset OPENSHELL_OCI_ARGC OPENSHELL_OCI_ENV_COUNT OPENSHELL_OCI_WORKDIR + + ts "OCI overlay ready; exec'ing supervisor (argc=$argc workdir=$workdir)" + exec /opt/openshell/bin/openshell-sandbox --workdir "$workdir" -- "${argv[@]}" +} + rewrite_openshell_endpoint_if_needed() { local endpoint="${OPENSHELL_ENDPOINT:-}" [ -n "$endpoint" ] || return 0 @@ -184,5 +348,13 @@ export USER=sandbox rewrite_openshell_endpoint_if_needed +# OCI image mode: if the driver staged an OCI payload via krun set_exec env, +# prepare the overlay rootfs, pivot_root, and exec the supervisor post-pivot. +# Otherwise fall through to the default guest rootfs supervisor boot. 
+if [ -n "${OPENSHELL_OCI_ARGC:-}" ]; then + ts "OCI image mode: OPENSHELL_OCI_ARGC=${OPENSHELL_OCI_ARGC}" + oci_launch_supervisor +fi + ts "starting openshell-sandbox supervisor" exec /opt/openshell/bin/openshell-sandbox --workdir /sandbox diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index 3d3fbf4b6..14634da2d 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -65,6 +65,13 @@ pub struct VmDriverConfig { pub guest_tls_ca: Option, pub guest_tls_cert: Option, pub guest_tls_key: Option, + /// Default OCI image used when the sandbox spec omits `template.image`. + /// Empty string means "no default" — sandboxes without an image will + /// fall back to the historical non-OCI guest rootfs supervisor. + pub default_image: String, + /// Path to the `mksquashfs` binary. When unset, OCI-image sandboxes + /// are rejected with `FailedPrecondition`. + pub mksquashfs_bin: Option, } impl Default for VmDriverConfig { @@ -82,6 +89,8 @@ impl Default for VmDriverConfig { guest_tls_ca: None, guest_tls_cert: None, guest_tls_key: None, + default_image: String::new(), + mksquashfs_bin: None, } } } @@ -173,12 +182,22 @@ struct SandboxRecord { process: Arc>, } -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct VmDriver { config: VmDriverConfig, launcher_bin: PathBuf, registry: Arc>>, events: broadcast::Sender, + /// Shared OCI cache and puller for this driver process. + /// Populated once per platform; `None` when the host arch is unsupported. + oci: Option>, +} + +/// Lazily-initialized OCI state attached to the driver. 
+pub struct VmOci { + pub puller: crate::oci::OciPuller, + pub cache: crate::oci::CacheLayout, + pub platform: crate::oci::Platform, } impl VmDriver { @@ -207,11 +226,25 @@ impl VmDriver { }; let (events, _) = broadcast::channel(WATCH_BUFFER); + + let oci = crate::oci::Platform::host().map(|platform| { + let cache = crate::oci::CacheLayout::new(config.state_dir.join("oci-cache")); + // Errors here are surfaced lazily at first sandbox-create; the + // driver still starts so non-OCI sandboxes continue to work. + let _ = cache.ensure_dirs(); + Arc::new(VmOci { + puller: crate::oci::OciPuller::new(platform), + cache, + platform, + }) + }); + Ok(Self { config, launcher_bin, registry: Arc::new(Mutex::new(HashMap::new())), events, + oci, }) } @@ -220,7 +253,7 @@ impl VmDriver { GetCapabilitiesResponse { driver_name: DRIVER_NAME.to_string(), driver_version: openshell_core::VERSION.to_string(), - default_image: String::new(), + default_image: self.config.default_image.clone(), supports_gpu: false, } } @@ -261,6 +294,9 @@ impl VmDriver { })?; } + let oci_launch = self.resolve_oci_launch(sandbox, &state_dir).await?; + let is_oci = oci_launch.is_some(); + let console_output = state_dir.join("rootfs-console.log"); let mut command = Command::new(&self.launcher_bin); command.kill_on_drop(true); @@ -282,7 +318,15 @@ impl VmDriver { command .arg("--vm-port") .arg(format!("{ssh_port}:{GUEST_SSH_PORT}")); - for env in build_guest_environment(sandbox, &self.config) { + if let Some(oci) = oci_launch.as_ref() { + command.arg("--vm-ro-base-disk").arg(&oci.base_disk_path); + command.arg("--vm-state-disk").arg(&oci.state_disk_path); + } + let mut guest_env = build_guest_environment(sandbox, &self.config, is_oci); + if let Some(oci) = oci_launch.as_ref() { + guest_env.extend(oci.guest_env_vars.iter().cloned()); + } + for env in guest_env { command.arg("--vm-env").arg(env); } @@ -433,6 +477,76 @@ impl VmDriver { snapshots } + /// Run the OCI pipeline for this sandbox if `template.image` (or 
the + /// driver's default image) is set, and materialize the per-sandbox state + /// disk. Returns `None` for legacy non-OCI sandboxes. + async fn resolve_oci_launch( + &self, + sandbox: &Sandbox, + state_dir: &Path, + ) -> Result, Status> { + let image_ref = effective_image_ref(sandbox, &self.config.default_image); + if image_ref.is_empty() { + return Ok(None); + } + + let oci = self.oci.clone().ok_or_else(|| { + Status::failed_precondition( + "OCI image support is not available: the host platform is not linux/amd64 or linux/arm64", + ) + })?; + let mksquashfs = self.config.mksquashfs_bin.clone().ok_or_else(|| { + Status::failed_precondition( + "OCI image support is not configured: set OPENSHELL_VM_MKSQUASHFS to the path of mksquashfs", + ) + })?; + + let env_overrides = crate::oci::EnvOverrides { + template: sandbox + .spec + .as_ref() + .and_then(|spec| spec.template.as_ref()) + .map(|template| template.environment.clone().into_iter().collect()) + .unwrap_or_default(), + spec: sandbox + .spec + .as_ref() + .map(|spec| spec.environment.clone().into_iter().collect()) + .unwrap_or_default(), + }; + + let build_opts = crate::oci::fs_image::BuildOptions::with_binary(mksquashfs); + let cached = crate::oci::prepare( + &oci.puller, + &oci.cache, + &build_opts, + &image_ref, + &env_overrides, + ) + .await + .map_err(|err| Status::internal(format!("OCI prepare failed: {err}")))?; + + let state_paths = crate::state_disk::SandboxStatePaths::for_state_dir(state_dir); + crate::state_disk::ensure_state_disk( + &state_paths.state_disk, + crate::state_disk::DEFAULT_STATE_DISK_SIZE_BYTES, + ) + .map_err(|err| Status::internal(format!("create sandbox state disk: {err}")))?; + + let guest_env_vars: Vec = cached + .metadata + .to_guest_env_vars() + .into_iter() + .map(|(k, v)| format!("{k}={v}")) + .collect(); + + Ok(Some(OciLaunch { + base_disk_path: cached.fs_image, + state_disk_path: state_paths.state_disk, + guest_env_vars, + })) + } + async fn monitor_sandbox(&self, 
sandbox_id: String) { let mut ready_emitted = false; @@ -713,6 +827,32 @@ impl ComputeDriver for VmDriver { } } +/// Per-sandbox OCI launch artifacts: cached RO base fs image, per-sandbox +/// writable state disk, and the launch-metadata env vars that get packed +/// into the guest init's environ. +#[derive(Debug)] +struct OciLaunch { + base_disk_path: PathBuf, + state_disk_path: PathBuf, + guest_env_vars: Vec, +} + +/// Return the OCI image reference to use for this sandbox, or `""` if the +/// sandbox is a legacy non-OCI VM sandbox. Spec overrides the driver default. +fn effective_image_ref(sandbox: &Sandbox, default_image: &str) -> String { + let requested = sandbox + .spec + .as_ref() + .and_then(|spec| spec.template.as_ref()) + .map(|template| template.image.as_str()) + .unwrap_or(""); + if !requested.is_empty() { + requested.to_string() + } else { + default_image.to_string() + } +} + fn validate_vm_sandbox(sandbox: &Sandbox) -> Result<(), Status> { let spec = sandbox .spec @@ -725,9 +865,9 @@ fn validate_vm_sandbox(sandbox: &Sandbox) -> Result<(), Status> { } if let Some(template) = spec.template.as_ref() { if !template.image.is_empty() { - return Err(Status::failed_precondition( - "vm sandboxes do not support template.image", - )); + crate::oci::validate_reference(&template.image).map_err(|err| { + Status::failed_precondition(format!("invalid template.image: {err}")) + })?; } if !template.agent_socket_path.is_empty() { return Err(Status::failed_precondition( @@ -779,7 +919,11 @@ fn guest_visible_openshell_endpoint(endpoint: &str) -> String { endpoint.to_string() } -fn build_guest_environment(sandbox: &Sandbox, config: &VmDriverConfig) -> Vec { +fn build_guest_environment( + sandbox: &Sandbox, + config: &VmDriverConfig, + is_oci: bool, +) -> Vec { let mut environment = HashMap::from([ ("HOME".to_string(), "/root".to_string()), ( @@ -805,15 +949,23 @@ fn build_guest_environment(sandbox: &Sandbox, config: &VmDriverConfig) -> Vec`, so this fallback is dead 
weight there — and + // passing it would muddy the contract (the supervisor's env-parsing path + // uses whitespace splitting, which would corrupt argv boundaries if a + // code path ever fell through to it). Only set it for non-OCI sandboxes. + if !is_oci { + environment.insert( + "OPENSHELL_SANDBOX_COMMAND".to_string(), + "tail -f /dev/null".to_string(), + ); + } if config.requires_tls_materials() { environment.extend(HashMap::from([ ( @@ -1097,13 +1249,42 @@ mod tests { ..Default::default() }; - let env = build_guest_environment(&sandbox, &config); + let env = build_guest_environment(&sandbox, &config, false); assert!(env.contains(&"HOME=/root".to_string())); assert!(env.contains(&"OPENSHELL_ENDPOINT=http://192.168.127.1:8080/".to_string())); assert!(env.contains(&"OPENSHELL_SANDBOX_ID=sandbox-123".to_string())); assert!(env.contains(&format!( "OPENSHELL_SSH_LISTEN_ADDR=0.0.0.0:{GUEST_SSH_PORT}" ))); + assert!( + env.iter() + .any(|e| e.starts_with("OPENSHELL_SANDBOX_COMMAND=")), + "non-OCI sandboxes should receive the fallback command" + ); + } + + #[test] + fn build_guest_environment_omits_sandbox_command_for_oci_sandboxes() { + let config = VmDriverConfig { + openshell_endpoint: "http://127.0.0.1:8080".to_string(), + ssh_handshake_secret: "secret".to_string(), + ..Default::default() + }; + let sandbox = Sandbox { + id: "sandbox-oci".to_string(), + name: "sandbox-oci".to_string(), + spec: Some(SandboxSpec::default()), + ..Default::default() + }; + + let env = build_guest_environment(&sandbox, &config, true); + assert!( + !env.iter() + .any(|e| e.starts_with("OPENSHELL_SANDBOX_COMMAND=")), + "OCI sandboxes should not get the legacy supervisor command fallback: {env:?}" + ); + // sanity: other guest-env bits should still be present + assert!(env.contains(&"OPENSHELL_SANDBOX_ID=sandbox-oci".to_string())); } #[test] @@ -1135,7 +1316,7 @@ mod tests { ..Default::default() }; - let env = build_guest_environment(&sandbox, &config); + let env = 
build_guest_environment(&sandbox, &config, false); assert!(env.contains(&format!("OPENSHELL_TLS_CA={GUEST_TLS_CA_PATH}"))); assert!(env.contains(&format!("OPENSHELL_TLS_CERT={GUEST_TLS_CERT_PATH}"))); assert!(env.contains(&format!("OPENSHELL_TLS_KEY={GUEST_TLS_KEY_PATH}"))); @@ -1161,6 +1342,7 @@ mod tests { launcher_bin: PathBuf::from("openshell-driver-vm"), registry: Arc::new(Mutex::new(HashMap::new())), events, + oci: None, }; let base = unique_temp_dir(); @@ -1205,6 +1387,95 @@ mod tests { let _ = std::fs::remove_dir_all(base); } + #[test] + fn effective_image_ref_prefers_spec_over_default() { + let sandbox = Sandbox { + spec: Some(SandboxSpec { + template: Some(SandboxTemplate { + image: "docker.io/library/alpine:3.20".to_string(), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + assert_eq!( + effective_image_ref(&sandbox, "docker.io/library/busybox"), + "docker.io/library/alpine:3.20" + ); + } + + #[test] + fn effective_image_ref_falls_back_to_default_when_spec_empty() { + let sandbox = Sandbox { + spec: Some(SandboxSpec { + template: Some(SandboxTemplate::default()), + ..Default::default() + }), + ..Default::default() + }; + assert_eq!( + effective_image_ref(&sandbox, "docker.io/library/busybox"), + "docker.io/library/busybox" + ); + } + + #[test] + fn effective_image_ref_is_empty_when_neither_is_set() { + let sandbox = Sandbox::default(); + assert!(effective_image_ref(&sandbox, "").is_empty()); + } + + #[test] + fn validate_vm_sandbox_accepts_valid_template_image() { + let sandbox = Sandbox { + spec: Some(SandboxSpec { + template: Some(SandboxTemplate { + image: "docker.io/library/alpine:3.20".to_string(), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + validate_vm_sandbox(&sandbox).expect("valid image ref should pass validation"); + } + + #[test] + fn validate_vm_sandbox_rejects_malformed_template_image() { + let sandbox = Sandbox { + spec: Some(SandboxSpec { + template: 
Some(SandboxTemplate { + image: "::not a valid ref::".to_string(), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + let err = validate_vm_sandbox(&sandbox).expect_err("malformed ref should fail"); + assert_eq!(err.code(), Code::FailedPrecondition); + assert!(err.message().contains("template.image")); + } + + #[test] + fn capabilities_advertise_default_image() { + let config = VmDriverConfig { + default_image: "docker.io/library/busybox".to_string(), + ..Default::default() + }; + let driver = VmDriver { + config, + launcher_bin: PathBuf::from("openshell-driver-vm"), + registry: Arc::new(Mutex::new(HashMap::new())), + events: broadcast::channel(WATCH_BUFFER).0, + oci: None, + }; + let caps = driver.capabilities(); + assert_eq!(caps.default_image, "docker.io/library/busybox"); + assert_eq!(caps.driver_name, DRIVER_NAME); + } + #[test] fn validate_openshell_endpoint_accepts_loopback_hosts() { validate_openshell_endpoint("http://127.0.0.1:8080") @@ -1309,6 +1580,139 @@ mod tests { let _ = std::fs::remove_dir_all(base); } + #[tokio::test] + async fn resolve_oci_launch_skips_when_no_image_is_requested() { + // A sandbox without template.image and with no driver default must + // fall through to the legacy non-OCI boot path. The resolver should + // return Ok(None) without consulting the puller or cache. 
+ let state_dir = unique_temp_dir(); + std::fs::create_dir_all(&state_dir).unwrap(); + let driver = VmDriver { + config: VmDriverConfig::default(), + launcher_bin: PathBuf::from("openshell-driver-vm"), + registry: Arc::new(Mutex::new(HashMap::new())), + events: broadcast::channel(WATCH_BUFFER).0, + oci: None, // unsupported host platform + }; + + let sandbox = Sandbox { + id: "sb".to_string(), + name: "sb".to_string(), + spec: Some(SandboxSpec::default()), + ..Default::default() + }; + + let result = driver + .resolve_oci_launch(&sandbox, &state_dir) + .await + .expect("no image requested → Ok(None)"); + assert!(result.is_none()); + let _ = std::fs::remove_dir_all(state_dir); + } + + #[tokio::test] + async fn resolve_oci_launch_fails_cleanly_when_host_platform_is_unsupported() { + // When `Platform::host()` returned None at driver construction, + // `self.oci` is None. Requesting an OCI image in that state must + // produce FailedPrecondition, not a panic or a silent fallback. + let state_dir = unique_temp_dir(); + std::fs::create_dir_all(&state_dir).unwrap(); + let driver = VmDriver { + config: VmDriverConfig { + default_image: "docker.io/library/alpine:3.20".to_string(), + ..Default::default() + }, + launcher_bin: PathBuf::from("openshell-driver-vm"), + registry: Arc::new(Mutex::new(HashMap::new())), + events: broadcast::channel(WATCH_BUFFER).0, + oci: None, + }; + + let sandbox = Sandbox { + spec: Some(SandboxSpec::default()), + ..Default::default() + }; + + let err = driver + .resolve_oci_launch(&sandbox, &state_dir) + .await + .expect_err("unsupported host platform should reject OCI sandboxes"); + assert_eq!(err.code(), Code::FailedPrecondition); + assert!(err.message().contains("linux/amd64 or linux/arm64")); + let _ = std::fs::remove_dir_all(state_dir); + } + + #[tokio::test] + async fn resolve_oci_launch_fails_cleanly_when_mksquashfs_is_missing() { + // If the host platform is supported but `mksquashfs_bin` isn't + // configured, the driver must refuse with 
FailedPrecondition so the + // gateway surfaces a diagnosable error instead of hanging at pull time. + let Some(platform) = crate::oci::Platform::host() else { + eprintln!("skipping: unsupported host platform"); + return; + }; + + let state_dir = unique_temp_dir(); + std::fs::create_dir_all(&state_dir).unwrap(); + let cache_root = state_dir.join("oci-cache"); + let cache = crate::oci::CacheLayout::new(cache_root); + cache.ensure_dirs().unwrap(); + + let driver = VmDriver { + config: VmDriverConfig { + default_image: "docker.io/library/alpine:3.20".to_string(), + mksquashfs_bin: None, + ..Default::default() + }, + launcher_bin: PathBuf::from("openshell-driver-vm"), + registry: Arc::new(Mutex::new(HashMap::new())), + events: broadcast::channel(WATCH_BUFFER).0, + oci: Some(Arc::new(VmOci { + puller: crate::oci::OciPuller::new(platform), + cache, + platform, + })), + }; + + let sandbox = Sandbox { + spec: Some(SandboxSpec::default()), + ..Default::default() + }; + + let err = driver + .resolve_oci_launch(&sandbox, &state_dir) + .await + .expect_err("missing mksquashfs should reject OCI sandboxes"); + assert_eq!(err.code(), Code::FailedPrecondition); + assert!(err.message().contains("OPENSHELL_VM_MKSQUASHFS")); + let _ = std::fs::remove_dir_all(state_dir); + } + + #[test] + fn effective_image_ref_drops_whitespace_only_spec_overrides() { + // Whitespace-only image refs should not be accepted as an override of + // the driver's configured default. In today's implementation the spec + // field is compared against "" exactly, so " " slips through — but + // that only affects downstream validation, which then rejects the + // malformed ref. This test documents the contract and will need an + // update if we decide to trim here. 
+ let sandbox = Sandbox { + spec: Some(SandboxSpec { + template: Some(SandboxTemplate { + image: " ".to_string(), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + // Current behavior: non-empty string wins, even if whitespace. + assert_eq!( + effective_image_ref(&sandbox, "docker.io/library/busybox"), + " " + ); + } + fn unique_temp_dir() -> PathBuf { static COUNTER: AtomicU64 = AtomicU64::new(0); let nanos = SystemTime::now() diff --git a/crates/openshell-driver-vm/src/ffi.rs b/crates/openshell-driver-vm/src/ffi.rs index 750788ac1..6770dbc0c 100644 --- a/crates/openshell-driver-vm/src/ffi.rs +++ b/crates/openshell-driver-vm/src/ffi.rs @@ -23,6 +23,11 @@ pub const KRUN_LOG_LEVEL_DEBUG: u32 = 4; pub const KRUN_LOG_LEVEL_TRACE: u32 = 5; pub const KRUN_LOG_STYLE_AUTO: u32 = 0; pub const KRUN_LOG_OPTION_NO_ENV: u32 = 1; +pub const KRUN_DISK_FORMAT_RAW: u32 = 0; +#[allow(dead_code)] // Used only on macOS (cfg-gated in state_disk_sync_mode) +pub const KRUN_SYNC_RELAXED: u32 = 1; +#[allow(dead_code)] // Used only on Linux (cfg-gated in state_disk_sync_mode) +pub const KRUN_SYNC_FULL: u32 = 2; type KrunInitLog = unsafe extern "C" fn(target_fd: i32, level: u32, style: u32, options: u32) -> i32; @@ -39,6 +44,17 @@ type KrunSetExec = unsafe extern "C" fn( ) -> i32; type KrunSetPortMap = unsafe extern "C" fn(ctx_id: u32, port_map: *const *const c_char) -> i32; type KrunSetConsoleOutput = unsafe extern "C" fn(ctx_id: u32, filepath: *const c_char) -> i32; +type KrunAddDisk3 = unsafe extern "C" fn( + ctx_id: u32, + block_id: *const c_char, + disk_path: *const c_char, + disk_format: u32, + read_only: bool, + direct_io: bool, + sync_mode: u32, +) -> i32; +type KrunAddVsockPort2 = + unsafe extern "C" fn(ctx_id: u32, port: u32, c_filepath: *const c_char, listen: bool) -> i32; type KrunStartEnter = unsafe extern "C" fn(ctx_id: u32) -> i32; type KrunDisableImplicitVsock = unsafe extern "C" fn(ctx_id: u32) -> i32; type KrunAddVsock = unsafe extern 
"C" fn(ctx_id: u32, tsi_features: u32) -> i32; @@ -70,6 +86,8 @@ pub struct LibKrun { pub krun_set_exec: KrunSetExec, pub krun_set_port_map: KrunSetPortMap, pub krun_set_console_output: KrunSetConsoleOutput, + pub krun_add_disk3: Option, + pub krun_add_vsock_port2: KrunAddVsockPort2, pub krun_start_enter: KrunStartEnter, pub krun_disable_implicit_vsock: KrunDisableImplicitVsock, pub krun_add_vsock: KrunAddVsock, @@ -127,6 +145,8 @@ impl LibKrun { b"krun_set_console_output\0", &libkrun_path, )?, + krun_add_disk3: load_optional_symbol(library, b"krun_add_disk3\0"), + krun_add_vsock_port2: load_symbol(library, b"krun_add_vsock_port2\0", &libkrun_path)?, krun_start_enter: load_symbol(library, b"krun_start_enter\0", &libkrun_path)?, krun_disable_implicit_vsock: load_symbol( library, @@ -204,3 +224,8 @@ fn load_symbol(library: &'static Library, name: &[u8], path: &Path) -> }) } } + +fn load_optional_symbol(library: &'static Library, name: &[u8]) -> Option { + let symbol = unsafe { library.get::(name).ok()? 
}; + Some(*symbol) +} diff --git a/crates/openshell-driver-vm/src/lib.rs b/crates/openshell-driver-vm/src/lib.rs index 1c424deeb..b7c2f9ab6 100644 --- a/crates/openshell-driver-vm/src/lib.rs +++ b/crates/openshell-driver-vm/src/lib.rs @@ -4,10 +4,18 @@ pub mod driver; mod embedded_runtime; mod ffi; +pub mod oci; mod rootfs; mod runtime; +pub mod state_disk; pub const GUEST_SSH_PORT: u16 = 2222; pub use driver::{VmDriver, VmDriverConfig}; -pub use runtime::{VM_RUNTIME_DIR_ENV, VmLaunchConfig, configured_runtime_dir, run_vm}; +pub use runtime::{ + ImportVsock, StateDisk, VM_RUNTIME_DIR_ENV, VmLaunchConfig, configured_runtime_dir, run_vm, +}; +pub use state_disk::{ + DEFAULT_STATE_DISK_SIZE_BYTES, IMPORT_VSOCK_PORT, STATE_DISK_BLOCK_ID, SandboxStatePaths, + ensure_state_disk, prepare_import_socket_dir, verify_import_socket_path, +}; diff --git a/crates/openshell-driver-vm/src/main.rs b/crates/openshell-driver-vm/src/main.rs index 3a7976273..87a6f6742 100644 --- a/crates/openshell-driver-vm/src/main.rs +++ b/crates/openshell-driver-vm/src/main.rs @@ -6,7 +6,8 @@ use miette::{IntoDiagnostic, Result}; use openshell_core::VERSION; use openshell_core::proto::compute::v1::compute_driver_server::ComputeDriverServer; use openshell_driver_vm::{ - VM_RUNTIME_DIR_ENV, VmDriver, VmDriverConfig, VmLaunchConfig, configured_runtime_dir, run_vm, + ImportVsock, STATE_DISK_BLOCK_ID, StateDisk, VM_RUNTIME_DIR_ENV, VmDriver, VmDriverConfig, + VmLaunchConfig, configured_runtime_dir, run_vm, }; use std::net::SocketAddr; use std::path::PathBuf; @@ -49,6 +50,26 @@ struct Args { #[arg(long, hide = true, default_value_t = 1)] vm_krun_log_level: u32, + #[arg(long, hide = true)] + vm_state_disk: Option, + + #[arg(long, hide = true, default_value = STATE_DISK_BLOCK_ID)] + vm_state_disk_block_id: String, + + /// Optional path to a read-only base disk (e.g. cached squashfs) used as + /// the overlay lower layer inside the guest. OCI sandboxes only. 
+ #[arg(long, hide = true)] + vm_ro_base_disk: Option, + + #[arg(long, hide = true, default_value = "oci-base")] + vm_ro_base_disk_block_id: String, + + #[arg(long, hide = true)] + vm_import_socket: Option, + + #[arg(long, hide = true)] + vm_import_vsock_port: Option, + #[arg( long, env = "OPENSHELL_COMPUTE_DRIVER_BIND", @@ -95,6 +116,16 @@ struct Args { #[arg(long, env = "OPENSHELL_VM_DRIVER_MEM_MIB", default_value_t = 2048)] mem_mib: u32, + + /// Default OCI image used when a sandbox spec omits `template.image`. + /// Advertised via `GetCapabilities.default_image`. + #[arg(long, env = "OPENSHELL_VM_DRIVER_DEFAULT_IMAGE", default_value = "")] + default_image: String, + + /// Path to the `mksquashfs` binary used to build RO base fs images. + /// Required for OCI-image sandboxes; unset → legacy-only driver. + #[arg(long, env = "OPENSHELL_VM_MKSQUASHFS")] + mksquashfs_bin: Option, } #[tokio::main] @@ -128,6 +159,8 @@ async fn main() -> Result<()> { guest_tls_ca: args.guest_tls_ca, guest_tls_cert: args.guest_tls_cert, guest_tls_key: args.guest_tls_key, + default_image: args.default_image, + mksquashfs_bin: args.mksquashfs_bin, }) .await .map_err(|err| miette::miette!("{err}"))?; @@ -175,6 +208,27 @@ fn build_vm_launch_config(args: &Args) -> std::result::Result Some(ImportVsock { port, socket_path }), + (None, None) => None, + _ => { + return Err( + "--vm-import-socket and --vm-import-vsock-port must be set together".to_string(), + ); + } + }; + Ok(VmLaunchConfig { rootfs, vcpus: args.vm_vcpus, @@ -186,6 +240,9 @@ fn build_vm_launch_config(args: &Args) -> std::result::Result/ +/// blobs/sha256/ raw manifest/config/layer bytes +/// fs/..squashfs RO base image +/// meta/..json launch metadata +/// tmp/ atomic-write staging +/// ``` +#[derive(Debug, Clone)] +pub struct CacheLayout { + root: PathBuf, +} + +impl CacheLayout { + #[must_use] + pub fn new(root: PathBuf) -> Self { + Self { root } + } + + #[must_use] + pub fn root(&self) -> &Path { + &self.root + } + + 
#[must_use] + pub fn blob_path(&self, digest: &str) -> PathBuf { + let (algo, hex) = split_digest(digest); + self.root.join("blobs").join(algo).join(hex) + } + + #[must_use] + pub fn fs_image_path(&self, manifest_digest: &str, platform: Platform) -> PathBuf { + let hex = strip_algo(manifest_digest); + self.root + .join("fs") + .join(format!("{hex}.{}.squashfs", platform.cache_tag())) + } + + #[must_use] + pub fn metadata_path(&self, manifest_digest: &str, platform: Platform) -> PathBuf { + let hex = strip_algo(manifest_digest); + self.root + .join("meta") + .join(format!("{hex}.{}.json", platform.cache_tag())) + } + + #[must_use] + pub fn tmp_dir(&self) -> PathBuf { + self.root.join("tmp") + } + + /// Create all cache subdirectories. Idempotent. + pub fn ensure_dirs(&self) -> io::Result<()> { + fs::create_dir_all(self.root.join("blobs/sha256"))?; + fs::create_dir_all(self.root.join("fs"))?; + fs::create_dir_all(self.root.join("meta"))?; + fs::create_dir_all(self.tmp_dir())?; + Ok(()) + } + + /// Check whether a cached fs image + metadata pair is present for this image. + #[must_use] + pub fn lookup(&self, manifest_digest: &str, platform: Platform) -> Option { + let fs_path = self.fs_image_path(manifest_digest, platform); + let meta_path = self.metadata_path(manifest_digest, platform); + if !fs_path.is_file() || !meta_path.is_file() { + return None; + } + let metadata_json = fs::read_to_string(&meta_path).ok()?; + let metadata: CachedMetadata = serde_json::from_str(&metadata_json).ok()?; + Some(CachedImage { + fs_image: fs_path, + metadata: metadata.launch, + }) + } + + /// Atomically write launch metadata for a built image. 
+ pub fn write_metadata( + &self, + manifest_digest: &str, + platform: Platform, + metadata: &LaunchMetadata, + ) -> io::Result<()> { + self.ensure_dirs()?; + let target = self.metadata_path(manifest_digest, platform); + let payload = serde_json::to_vec_pretty(&CachedMetadata { + schema: METADATA_SCHEMA_V1, + launch: metadata.clone(), + }) + .map_err(io::Error::other)?; + atomic_write(&self.tmp_dir(), &target, &payload) + } + + /// Atomically move a built fs image into its cache slot. The source path + /// must live on the same filesystem as the cache root (callers typically + /// build inside [`Self::tmp_dir`]). + pub fn install_fs_image( + &self, + manifest_digest: &str, + platform: Platform, + built_image: &Path, + ) -> io::Result { + self.ensure_dirs()?; + let dest = self.fs_image_path(manifest_digest, platform); + if dest.exists() { + fs::remove_file(&dest)?; + } + fs::rename(built_image, &dest)?; + Ok(dest) + } +} + +/// A cache hit with both the RO fs image path and its launch metadata. +#[derive(Debug, Clone)] +pub struct CachedImage { + pub fs_image: PathBuf, + pub metadata: LaunchMetadata, +} + +const METADATA_SCHEMA_V1: u32 = 1; + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct CachedMetadata { + schema: u32, + launch: LaunchMetadata, +} + +fn split_digest(digest: &str) -> (&str, &str) { + match digest.split_once(':') { + Some((algo, hex)) => (algo, hex), + None => ("sha256", digest), + } +} + +fn strip_algo(digest: &str) -> &str { + split_digest(digest).1 +} + +/// Write `bytes` to `target` via a rename inside `tmp_dir`, ensuring readers +/// never see a partial file. 
+fn atomic_write(tmp_dir: &Path, target: &Path, bytes: &[u8]) -> io::Result<()> { + fs::create_dir_all(tmp_dir)?; + if let Some(parent) = target.parent() { + fs::create_dir_all(parent)?; + } + let file_name = target + .file_name() + .ok_or_else(|| io::Error::other("cache target has no file name"))?; + let staging = tmp_dir.join(format!( + "{}.{}.tmp", + file_name.to_string_lossy(), + std::process::id() + )); + fs::write(&staging, bytes)?; + fs::rename(&staging, target)?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::BTreeMap; + + fn sample_metadata() -> LaunchMetadata { + LaunchMetadata { + argv: vec!["/bin/sh".to_string(), "-c".to_string(), "true".to_string()], + env: vec![("A".to_string(), "1".to_string())], + workdir: "/sandbox".to_string(), + labels: BTreeMap::new(), + stop_signal: String::new(), + } + } + + #[test] + fn digest_with_algo_splits_into_blob_path() { + let layout = CacheLayout::new(PathBuf::from("/cache")); + let path = layout.blob_path("sha256:abc123"); + assert_eq!(path, PathBuf::from("/cache/blobs/sha256/abc123")); + } + + #[test] + fn digest_without_algo_defaults_to_sha256() { + let layout = CacheLayout::new(PathBuf::from("/cache")); + let path = layout.blob_path("abc123"); + assert_eq!(path, PathBuf::from("/cache/blobs/sha256/abc123")); + } + + #[test] + fn fs_and_metadata_paths_include_platform_tag() { + let layout = CacheLayout::new(PathBuf::from("/cache")); + assert_eq!( + layout.fs_image_path("sha256:deadbeef", Platform::LinuxAmd64), + PathBuf::from("/cache/fs/deadbeef.amd64.squashfs") + ); + assert_eq!( + layout.metadata_path("sha256:deadbeef", Platform::LinuxArm64), + PathBuf::from("/cache/meta/deadbeef.arm64.json") + ); + } + + #[test] + fn lookup_returns_none_when_either_file_is_missing() { + let tmp = tempfile::tempdir().unwrap(); + let layout = CacheLayout::new(tmp.path().to_path_buf()); + layout.ensure_dirs().unwrap(); + assert!(layout.lookup("sha256:abc", Platform::LinuxAmd64).is_none()); + + // 
write metadata but no fs image + layout + .write_metadata("sha256:abc", Platform::LinuxAmd64, &sample_metadata()) + .unwrap(); + assert!(layout.lookup("sha256:abc", Platform::LinuxAmd64).is_none()); + } + + #[test] + fn lookup_returns_paired_fs_image_and_metadata() { + let tmp = tempfile::tempdir().unwrap(); + let layout = CacheLayout::new(tmp.path().to_path_buf()); + layout.ensure_dirs().unwrap(); + + // Seed the fs image slot with a placeholder file. + let fs_slot = layout.fs_image_path("sha256:abc", Platform::LinuxAmd64); + fs::create_dir_all(fs_slot.parent().unwrap()).unwrap(); + fs::write(&fs_slot, b"stub").unwrap(); + + layout + .write_metadata("sha256:abc", Platform::LinuxAmd64, &sample_metadata()) + .unwrap(); + + let hit = layout + .lookup("sha256:abc", Platform::LinuxAmd64) + .expect("expected cache hit"); + assert_eq!(hit.fs_image, fs_slot); + assert_eq!(hit.metadata.argv, sample_metadata().argv); + } + + #[test] + fn write_metadata_is_atomic_under_repeat_writes() { + let tmp = tempfile::tempdir().unwrap(); + let layout = CacheLayout::new(tmp.path().to_path_buf()); + layout + .write_metadata("sha256:abc", Platform::LinuxAmd64, &sample_metadata()) + .unwrap(); + + let mut updated = sample_metadata(); + updated.argv.push("extra".to_string()); + layout + .write_metadata("sha256:abc", Platform::LinuxAmd64, &updated) + .unwrap(); + + let hit = layout.lookup("sha256:abc", Platform::LinuxAmd64); + // no fs image, so lookup returns None; re-read the metadata directly. 
+ assert!(hit.is_none()); + let raw = + fs::read_to_string(layout.metadata_path("sha256:abc", Platform::LinuxAmd64)).unwrap(); + assert!(raw.contains("extra")); + } + + #[test] + fn install_fs_image_moves_built_image_into_slot() { + let tmp = tempfile::tempdir().unwrap(); + let layout = CacheLayout::new(tmp.path().to_path_buf()); + layout.ensure_dirs().unwrap(); + let built = layout.tmp_dir().join("built.squashfs"); + fs::write(&built, b"squashed").unwrap(); + + let slot = layout + .install_fs_image("sha256:xyz", Platform::LinuxAmd64, &built) + .unwrap(); + assert!(slot.is_file()); + assert!(!built.exists(), "source should be renamed, not copied"); + assert_eq!(fs::read(&slot).unwrap(), b"squashed"); + } +} diff --git a/crates/openshell-driver-vm/src/oci/client.rs b/crates/openshell-driver-vm/src/oci/client.rs new file mode 100644 index 000000000..32586d0f1 --- /dev/null +++ b/crates/openshell-driver-vm/src/oci/client.rs @@ -0,0 +1,241 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Thin wrapper around [`oci_client::Client`] that pulls a public image for a +//! specific guest platform and normalizes the response into data our pipeline +//! can consume. + +use std::collections::BTreeMap; +use std::str::FromStr; + +use oci_client::client::{ClientConfig, ImageLayer}; +use oci_client::manifest::{ + IMAGE_CONFIG_MEDIA_TYPE, IMAGE_DOCKER_CONFIG_MEDIA_TYPE, IMAGE_DOCKER_LAYER_GZIP_MEDIA_TYPE, + IMAGE_LAYER_GZIP_MEDIA_TYPE, IMAGE_LAYER_MEDIA_TYPE, IMAGE_MANIFEST_LIST_MEDIA_TYPE, + IMAGE_MANIFEST_MEDIA_TYPE, ImageIndexEntry, OCI_IMAGE_INDEX_MEDIA_TYPE, OCI_IMAGE_MEDIA_TYPE, +}; +use oci_client::secrets::RegistryAuth; +use oci_client::{Client, Reference}; + +use super::metadata::{ImageConfig, Platform}; + +/// Image pulled from a registry, with the normalized subset our pipeline needs. 
+#[derive(Debug)] +pub struct PulledImage { + /// Manifest digest (`sha256:...`), used as the cache key. + pub manifest_digest: String, + /// Layers in application order (lower → upper), already filtered for + /// supported media types. + pub layers: Vec, + /// Normalized OCI image config. + pub image_config: ImageConfig, +} + +/// Pulls public OCI images for a fixed guest platform. +pub struct OciPuller { + client: Client, + platform: Platform, +} + +impl OciPuller { + #[must_use] + pub fn new(platform: Platform) -> Self { + let config = ClientConfig { + platform_resolver: Some(Box::new(move |entries: &[ImageIndexEntry]| { + pick_platform(entries, platform) + })), + ..Default::default() + }; + Self { + client: Client::new(config), + platform, + } + } + + /// Pull `image_ref` (e.g. `docker.io/library/alpine:3.20`) anonymously. + /// + /// Returns the manifest digest + layer bytes + normalized config. Any + /// error from the registry or the config decoder is surfaced verbatim. + pub async fn pull(&self, image_ref: &str) -> Result { + let reference = Reference::from_str(image_ref) + .map_err(|err| PullError::InvalidReference(err.to_string()))?; + + let accepted = vec![ + IMAGE_LAYER_MEDIA_TYPE, + IMAGE_LAYER_GZIP_MEDIA_TYPE, + IMAGE_DOCKER_LAYER_GZIP_MEDIA_TYPE, + IMAGE_MANIFEST_MEDIA_TYPE, + OCI_IMAGE_MEDIA_TYPE, + IMAGE_MANIFEST_LIST_MEDIA_TYPE, + OCI_IMAGE_INDEX_MEDIA_TYPE, + IMAGE_CONFIG_MEDIA_TYPE, + IMAGE_DOCKER_CONFIG_MEDIA_TYPE, + ]; + + let image = self + .client + .pull(&reference, &RegistryAuth::Anonymous, accepted) + .await + .map_err(|err| PullError::Registry(err.to_string()))?; + + let manifest_digest = image.digest.ok_or_else(|| { + PullError::Registry("registry did not return a manifest digest".into()) + })?; + + let image_config = parse_image_config(&image.config.data)?; + + Ok(PulledImage { + manifest_digest, + layers: image.layers, + image_config, + }) + } + + #[must_use] + pub fn platform(&self) -> Platform { + self.platform + } +} + +/// Pick 
the first index entry matching the requested platform. +fn pick_platform(entries: &[ImageIndexEntry], platform: Platform) -> Option { + entries + .iter() + .find(|entry| { + entry + .platform + .as_ref() + .is_some_and(|p| p.os == platform.os() && p.architecture == platform.arch()) + }) + .map(|entry| entry.digest.clone()) +} + +/// Deserialize the OCI image config JSON into our minimal view. +fn parse_image_config(config_bytes: &[u8]) -> Result { + #[derive(serde::Deserialize)] + struct RawConfig { + config: Option, + } + #[derive(serde::Deserialize, Default)] + #[serde(default)] + struct InnerConfig { + #[serde(rename = "Entrypoint")] + entrypoint: Option>, + #[serde(rename = "Cmd")] + cmd: Option>, + #[serde(rename = "Env")] + env: Option>, + #[serde(rename = "WorkingDir")] + working_dir: Option, + #[serde(rename = "Labels")] + labels: Option>, + #[serde(rename = "StopSignal")] + stop_signal: Option, + } + + let raw: RawConfig = serde_json::from_slice(config_bytes) + .map_err(|err| PullError::MalformedConfig(err.to_string()))?; + let inner = raw.config.unwrap_or_default(); + Ok(ImageConfig { + entrypoint: inner.entrypoint.unwrap_or_default(), + cmd: inner.cmd.unwrap_or_default(), + env: inner.env.unwrap_or_default(), + working_dir: inner.working_dir.unwrap_or_default(), + labels: inner.labels.unwrap_or_default(), + stop_signal: inner.stop_signal.unwrap_or_default(), + }) +} + +/// Errors raised during image pull or normalization. 
+#[derive(Debug, thiserror::Error)] +pub enum PullError { + #[error("invalid image reference: {0}")] + InvalidReference(String), + #[error("registry error: {0}")] + Registry(String), + #[error("malformed OCI image config: {0}")] + MalformedConfig(String), +} + +#[cfg(test)] +mod tests { + use super::*; + use oci_client::manifest::Platform as SpecPlatform; + + fn entry(os: &str, arch: &str, digest: &str) -> ImageIndexEntry { + ImageIndexEntry { + media_type: OCI_IMAGE_MEDIA_TYPE.to_string(), + digest: digest.to_string(), + size: 0, + platform: Some(SpecPlatform { + architecture: arch.to_string(), + os: os.to_string(), + os_version: None, + os_features: None, + variant: None, + features: None, + }), + annotations: None, + } + } + + #[test] + fn pick_platform_selects_matching_entry() { + let entries = vec![ + entry("linux", "amd64", "sha256:amd"), + entry("linux", "arm64", "sha256:arm"), + ]; + assert_eq!( + pick_platform(&entries, Platform::LinuxAmd64), + Some("sha256:amd".to_string()) + ); + assert_eq!( + pick_platform(&entries, Platform::LinuxArm64), + Some("sha256:arm".to_string()) + ); + } + + #[test] + fn pick_platform_returns_none_when_unsupported() { + let entries = vec![entry("windows", "amd64", "sha256:win")]; + assert!(pick_platform(&entries, Platform::LinuxAmd64).is_none()); + } + + #[test] + fn parse_image_config_handles_entrypoint_and_cmd_fields() { + let json = br#"{ + "architecture": "amd64", + "os": "linux", + "config": { + "Entrypoint": ["/bin/sh", "-c"], + "Cmd": ["echo hello"], + "Env": ["PATH=/usr/bin"], + "WorkingDir": "/app", + "Labels": {"k": "v"}, + "StopSignal": "SIGTERM" + } + }"#; + let cfg = parse_image_config(json).unwrap(); + assert_eq!(cfg.entrypoint, vec!["/bin/sh", "-c"]); + assert_eq!(cfg.cmd, vec!["echo hello"]); + assert_eq!(cfg.env, vec!["PATH=/usr/bin"]); + assert_eq!(cfg.working_dir, "/app"); + assert_eq!(cfg.labels.get("k"), Some(&"v".to_string())); + assert_eq!(cfg.stop_signal, "SIGTERM"); + } + + #[test] + fn 
parse_image_config_tolerates_missing_config_block() { + let json = br#"{"architecture":"amd64","os":"linux"}"#; + let cfg = parse_image_config(json).unwrap(); + assert!(cfg.entrypoint.is_empty()); + assert!(cfg.cmd.is_empty()); + assert_eq!(cfg.working_dir, ""); + } + + #[test] + fn parse_image_config_rejects_malformed_json() { + let err = parse_image_config(b"not json").expect_err("should fail"); + assert!(matches!(err, PullError::MalformedConfig(_))); + } +} diff --git a/crates/openshell-driver-vm/src/oci/compat.rs b/crates/openshell-driver-vm/src/oci/compat.rs new file mode 100644 index 000000000..c9f4c400a --- /dev/null +++ b/crates/openshell-driver-vm/src/oci/compat.rs @@ -0,0 +1,197 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Inject OpenShell compatibility files into a flattened OCI rootfs tree. +//! +//! Runs after [`crate::oci::flatten`] and before the squashfs build, so the +//! sandbox user and its expected directories are baked into the RO base image. + +use std::fs; +use std::io; +use std::os::unix::fs::PermissionsExt as _; +use std::path::Path; + +/// Canonical sandbox user/group. Must match `openshell-sandbox`'s expectations. +pub const SANDBOX_UID: u32 = 10001; +pub const SANDBOX_GID: u32 = 10001; +pub const SANDBOX_USER: &str = "sandbox"; + +/// Apply all compat injections into `root`. Idempotent. 
+pub fn inject(root: &Path) -> io::Result<()> { + ensure_passwd_entry(root)?; + ensure_group_entry(root)?; + ensure_dir(&root.join("sandbox"), 0o755)?; + ensure_dir(&root.join("tmp"), 0o1777)?; + ensure_empty_file(&root.join("etc/hosts"), 0o644)?; + ensure_empty_file(&root.join("etc/resolv.conf"), 0o644)?; + Ok(()) +} + +fn ensure_passwd_entry(root: &Path) -> io::Result<()> { + let path = root.join("etc/passwd"); + let shell = pick_shell(root); + let entry = format!( + "{SANDBOX_USER}:x:{SANDBOX_UID}:{SANDBOX_GID}:OpenShell Sandbox:/sandbox:{shell}\n" + ); + append_user_db_entry(&path, SANDBOX_USER, &entry) +} + +fn ensure_group_entry(root: &Path) -> io::Result<()> { + let path = root.join("etc/group"); + let entry = format!("{SANDBOX_USER}:x:{SANDBOX_GID}:\n"); + append_user_db_entry(&path, SANDBOX_USER, &entry) +} + +/// Append `entry` to the colon-delimited user DB at `path` unless a line +/// already starts with `key:`. Creates `etc/` and the file if needed. +fn append_user_db_entry(path: &Path, key: &str, entry: &str) -> io::Result<()> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + + let existing = match fs::read_to_string(path) { + Ok(contents) => contents, + Err(err) if err.kind() == io::ErrorKind::NotFound => String::new(), + Err(err) => return Err(err), + }; + + let prefix = format!("{key}:"); + if existing.lines().any(|line| line.starts_with(&prefix)) { + return Ok(()); + } + + let mut combined = existing; + if !combined.is_empty() && !combined.ends_with('\n') { + combined.push('\n'); + } + combined.push_str(entry); + fs::write(path, combined)?; + fs::set_permissions(path, fs::Permissions::from_mode(0o644))?; + Ok(()) +} + +/// Pick the best shell path for the sandbox user. +/// +/// Prefers `/bin/sh` if present; falls back to `/sbin/nologin`, then +/// `/bin/false`. This guarantees a valid shell field in `/etc/passwd` +/// even for minimal images. 
///
/// Candidates are probed with `lstat`, so a symlinked shell counts as present
/// even when its target only resolves inside the image (for example an
/// absolute `/bin/sh -> /bin/busybox` link). Returns `/sbin/nologin` when no
/// candidate exists.
fn pick_shell(root: &Path) -> String {
    for candidate in ["bin/sh", "sbin/nologin", "usr/sbin/nologin", "bin/false"] {
        // `exists()` would follow the link against the host filesystem.
        if root.join(candidate).symlink_metadata().is_ok() {
            return format!("/{candidate}");
        }
    }
    "/sbin/nologin".to_string()
}

/// Ensure `path` is a real directory with permission bits `mode`.
///
/// A missing path is created. A symlink or other non-directory is replaced by
/// a fresh directory, so the chmod below never follows an image-controlled
/// link onto the host.
fn ensure_dir(path: &Path, mode: u32) -> io::Result<()> {
    match fs::symlink_metadata(path) {
        Ok(meta) if meta.file_type().is_dir() => {}
        Ok(_) => {
            fs::remove_file(path)?;
            fs::create_dir_all(path)?;
        }
        Err(err) if err.kind() == io::ErrorKind::NotFound => fs::create_dir_all(path)?,
        Err(err) => return Err(err),
    }
    fs::set_permissions(path, fs::Permissions::from_mode(mode))
}

/// Ensure `path` exists with permission bits `mode`, creating it empty if needed.
///
/// Existing non-symlink entries keep their contents. A symlink (dangling or
/// not) is replaced by an empty regular file, so neither the write nor the
/// chmod passes through a link that would resolve on the host.
fn ensure_empty_file(path: &Path, mode: u32) -> io::Result<()> {
    if let Some(parent) = path.parent() {
        fs::create_dir_all(parent)?;
    }
    match fs::symlink_metadata(path) {
        Ok(meta) if meta.file_type().is_symlink() => {
            fs::remove_file(path)?;
            fs::write(path, "")?;
        }
        Ok(_) => {}
        Err(err) if err.kind() == io::ErrorKind::NotFound => fs::write(path, "")?,
        Err(err) => return Err(err),
    }
    fs::set_permissions(path, fs::Permissions::from_mode(mode))
}

#[cfg(test)]
mod tests {
    use super::*;

    fn fresh_root() -> tempfile::TempDir {
        tempfile::tempdir().unwrap()
    }

    #[test]
    fn inject_populates_passwd_group_and_dirs_on_empty_root() {
        let tmp = fresh_root();
        inject(tmp.path()).unwrap();

        let passwd = fs::read_to_string(tmp.path().join("etc/passwd")).unwrap();
        assert!(passwd.contains(&format!("{SANDBOX_USER}:x:{SANDBOX_UID}:{SANDBOX_GID}:")));

        let group = fs::read_to_string(tmp.path().join("etc/group")).unwrap();
        assert!(group.contains(&format!("{SANDBOX_USER}:x:{SANDBOX_GID}:")));

        let sandbox_meta = fs::metadata(tmp.path().join("sandbox")).unwrap();
        assert!(sandbox_meta.is_dir());
        assert_eq!(sandbox_meta.permissions().mode() & 0o777, 0o755);

        let tmp_meta = fs::metadata(tmp.path().join("tmp")).unwrap();
        assert_eq!(tmp_meta.permissions().mode() & 0o7777, 0o1777);

        assert!(tmp.path().join("etc/hosts").exists());
        assert!(tmp.path().join("etc/resolv.conf").exists());
    }

    #[test]
    fn inject_is_idempotent_and_does_not_duplicate_entries() {
        let tmp = fresh_root();
        inject(tmp.path()).unwrap();
        inject(tmp.path()).unwrap();

        let passwd = fs::read_to_string(tmp.path().join("etc/passwd")).unwrap();
        let sandbox_lines = passwd
            .lines()
            .filter(|line| line.starts_with(&format!("{SANDBOX_USER}:")))
.count(); + assert_eq!(sandbox_lines, 1, "sandbox user should appear exactly once"); + } + + #[test] + fn inject_preserves_existing_passwd_entries() { + let tmp = fresh_root(); + fs::create_dir_all(tmp.path().join("etc")).unwrap(); + fs::write( + tmp.path().join("etc/passwd"), + "root:x:0:0:root:/root:/bin/sh\n", + ) + .unwrap(); + + inject(tmp.path()).unwrap(); + + let passwd = fs::read_to_string(tmp.path().join("etc/passwd")).unwrap(); + assert!(passwd.contains("root:x:0:0:")); + assert!(passwd.contains(&format!("{SANDBOX_USER}:x:{SANDBOX_UID}:"))); + } + + #[test] + fn inject_uses_nologin_when_no_shell_present() { + let tmp = fresh_root(); + inject(tmp.path()).unwrap(); + let passwd = fs::read_to_string(tmp.path().join("etc/passwd")).unwrap(); + assert!( + passwd.contains(":/sbin/nologin"), + "expected nologin fallback, got: {passwd}" + ); + } + + #[test] + fn inject_uses_bin_sh_when_available() { + let tmp = fresh_root(); + fs::create_dir_all(tmp.path().join("bin")).unwrap(); + fs::write(tmp.path().join("bin/sh"), "").unwrap(); + fs::set_permissions(tmp.path().join("bin/sh"), fs::Permissions::from_mode(0o755)).unwrap(); + + inject(tmp.path()).unwrap(); + let passwd = fs::read_to_string(tmp.path().join("etc/passwd")).unwrap(); + assert!(passwd.contains(":/bin/sh")); + } + + #[test] + fn inject_does_not_truncate_existing_etc_hosts() { + let tmp = fresh_root(); + fs::create_dir_all(tmp.path().join("etc")).unwrap(); + fs::write(tmp.path().join("etc/hosts"), "127.0.0.1 localhost\n").unwrap(); + inject(tmp.path()).unwrap(); + let hosts = fs::read_to_string(tmp.path().join("etc/hosts")).unwrap(); + assert!(hosts.contains("127.0.0.1")); + } +} diff --git a/crates/openshell-driver-vm/src/oci/flatten.rs b/crates/openshell-driver-vm/src/oci/flatten.rs new file mode 100644 index 000000000..7d0d6dd75 --- /dev/null +++ b/crates/openshell-driver-vm/src/oci/flatten.rs @@ -0,0 +1,332 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Apply OCI image layers in order into a flat rootfs tree, honoring whiteouts. +//! +//! OCI whiteout convention (see image-spec): +//! - A file named `.wh.` in a layer means "delete " from the tree. +//! - A file named `.wh..wh..opq` in a directory means "opaque directory" — +//! delete all existing children of that directory before applying this +//! layer's additions. + +use std::fs; +use std::io::{self, Read}; +use std::path::{Component, Path, PathBuf}; + +const OPAQUE_MARKER: &str = ".wh..wh..opq"; +const WHITEOUT_PREFIX: &str = ".wh."; + +/// Apply a single gzip-compressed OCI layer tar stream into `dest`. +/// +/// Whiteouts are honored against the existing contents of `dest`; the +/// markers themselves are never materialized. +pub fn apply_layer(dest: &Path, layer_reader: R) -> io::Result<()> { + let gz = flate2::read::GzDecoder::new(layer_reader); + apply_tar_stream(dest, gz) +} + +/// Apply a layer whose bytes are in memory, dispatching on OCI media type. +/// +/// Supports `tar` (uncompressed) and `tar+gzip`. Other encodings +/// (`tar+zstd`, `tar+bzip2`) are rejected — OCI v1.1 allows them but common +/// registries still use gzip. +pub fn apply_layer_bytes(dest: &Path, media_type: &str, bytes: &[u8]) -> io::Result<()> { + let base = media_type.split(';').next().unwrap_or(media_type).trim(); + if base.ends_with("+gzip") || base.ends_with(".gzip") || base.ends_with(".tar.gzip") { + apply_layer(dest, bytes) + } else if base.ends_with(".tar") || base.ends_with("+tar") || base == "application/x-tar" { + apply_tar_stream(dest, bytes) + } else if base.ends_with("+zstd") { + Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("unsupported layer media type (zstd not supported in v1): {media_type}"), + )) + } else { + Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("unknown layer media type: {media_type}"), + )) + } +} + +/// Apply an uncompressed tar stream. 
Exposed for tests that build synthetic +/// layers in memory. +pub fn apply_tar_stream(dest: &Path, tar_reader: R) -> io::Result<()> { + let mut archive = tar::Archive::new(tar_reader); + archive.set_preserve_permissions(true); + archive.set_preserve_mtime(true); + archive.set_unpack_xattrs(false); + + for entry in archive.entries()? { + let mut entry = entry?; + let entry_path = entry.path()?.into_owned(); + let Some(rel) = sanitize_relative(&entry_path) else { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("layer tar contains unsafe path: {}", entry_path.display()), + )); + }; + + let Some(file_name) = rel.file_name().and_then(|n| n.to_str()) else { + // skip entries we cannot reason about (e.g. `.` top-level) + continue; + }; + + if file_name == OPAQUE_MARKER { + let parent = rel.parent().unwrap_or(Path::new("")); + clear_directory(&dest.join(parent))?; + continue; + } + + if let Some(target_name) = file_name.strip_prefix(WHITEOUT_PREFIX) { + let parent = rel.parent().unwrap_or(Path::new("")); + let target = dest.join(parent).join(target_name); + remove_any(&target)?; + continue; + } + + let dest_path = dest.join(&rel); + if let Some(parent) = dest_path.parent() { + fs::create_dir_all(parent)?; + } + entry.unpack(&dest_path)?; + } + + Ok(()) +} + +/// Reject absolute, parent-escaping, or root-component paths in layer tars. +fn sanitize_relative(path: &Path) -> Option { + let mut out = PathBuf::new(); + for component in path.components() { + match component { + Component::Normal(part) => out.push(part), + Component::CurDir => {} + Component::RootDir | Component::Prefix(_) | Component::ParentDir => return None, + } + } + if out.as_os_str().is_empty() { + return None; + } + Some(out) +} + +fn clear_directory(path: &Path) -> io::Result<()> { + if !path.exists() { + return Ok(()); + } + for entry in fs::read_dir(path)? 
{ + let entry = entry?; + remove_any(&entry.path())?; + } + Ok(()) +} + +fn remove_any(path: &Path) -> io::Result<()> { + match path.symlink_metadata() { + Ok(meta) => { + if meta.file_type().is_dir() { + fs::remove_dir_all(path) + } else { + fs::remove_file(path) + } + } + Err(err) if err.kind() == io::ErrorKind::NotFound => Ok(()), + Err(err) => Err(err), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + + /// Build an in-memory tar stream from a list of (path, contents) pairs. + /// Directories are created implicitly when their children have paths. + fn build_tar(entries: &[(&str, &[u8])]) -> Vec { + let mut buf = Vec::new(); + { + let mut builder = tar::Builder::new(&mut buf); + for (path, contents) in entries { + if path.ends_with('/') { + let mut header = tar::Header::new_gnu(); + header.set_path(path).unwrap(); + header.set_size(0); + header.set_mode(0o755); + header.set_entry_type(tar::EntryType::Directory); + header.set_cksum(); + builder.append(&header, io::empty()).unwrap(); + } else { + let mut header = tar::Header::new_gnu(); + header.set_path(path).unwrap(); + header.set_size(contents.len() as u64); + header.set_mode(0o644); + header.set_entry_type(tar::EntryType::Regular); + header.set_cksum(); + builder.append(&header, *contents).unwrap(); + } + } + builder.finish().unwrap(); + } + buf + } + + #[test] + fn whiteout_removes_file_from_lower_layer() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let lower = build_tar(&[("app/", b""), ("app/a.txt", b"a"), ("app/b.txt", b"b")]); + apply_tar_stream(root, lower.as_slice()).unwrap(); + assert!(root.join("app/a.txt").exists()); + + let upper = build_tar(&[("app/.wh.a.txt", b"")]); + apply_tar_stream(root, upper.as_slice()).unwrap(); + + assert!(!root.join("app/a.txt").exists(), "whiteout should remove a"); + assert!(root.join("app/b.txt").exists(), "b should still exist"); + assert!( + !root.join("app/.wh.a.txt").exists(), + "marker should not be 
materialized" + ); + } + + #[test] + fn opaque_whiteout_clears_directory_before_additions() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let lower = build_tar(&[ + ("data/", b""), + ("data/keep.txt", b"lower"), + ("data/gone.txt", b"lower"), + ]); + apply_tar_stream(root, lower.as_slice()).unwrap(); + + let upper = build_tar(&[ + ("data/", b""), + ("data/.wh..wh..opq", b""), + ("data/new.txt", b"upper"), + ]); + apply_tar_stream(root, upper.as_slice()).unwrap(); + + assert!(!root.join("data/keep.txt").exists()); + assert!(!root.join("data/gone.txt").exists()); + assert!(root.join("data/new.txt").exists()); + assert_eq!( + fs::read_to_string(root.join("data/new.txt")).unwrap(), + "upper" + ); + } + + #[test] + fn whiteout_removes_directory_recursively() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let lower = build_tar(&[ + ("dir/", b""), + ("dir/a.txt", b"a"), + ("dir/sub/", b""), + ("dir/sub/b.txt", b"b"), + ]); + apply_tar_stream(root, lower.as_slice()).unwrap(); + + let upper = build_tar(&[(".wh.dir", b"")]); + apply_tar_stream(root, upper.as_slice()).unwrap(); + + assert!(!root.join("dir").exists()); + } + + #[test] + fn layers_apply_in_order_with_later_overwriting_earlier() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + apply_tar_stream(root, build_tar(&[("x.txt", b"v1")]).as_slice()).unwrap(); + apply_tar_stream(root, build_tar(&[("x.txt", b"v2")]).as_slice()).unwrap(); + assert_eq!(fs::read_to_string(root.join("x.txt")).unwrap(), "v2"); + } + + #[test] + fn sanitize_relative_rejects_absolute_paths() { + assert!(sanitize_relative(Path::new("/etc/passwd")).is_none()); + } + + #[test] + fn sanitize_relative_rejects_parent_traversal() { + assert!(sanitize_relative(Path::new("../escape.txt")).is_none()); + assert!(sanitize_relative(Path::new("a/../../etc/passwd")).is_none()); + } + + #[test] + fn sanitize_relative_strips_curdir_and_keeps_clean_paths() { + assert_eq!( + 
sanitize_relative(Path::new("./etc/hosts")).unwrap(), + PathBuf::from("etc/hosts") + ); + assert_eq!( + sanitize_relative(Path::new("app/bin/sh")).unwrap(), + PathBuf::from("app/bin/sh") + ); + } + + #[test] + fn sanitize_relative_rejects_empty_and_root_only_paths() { + assert!(sanitize_relative(Path::new("")).is_none()); + assert!(sanitize_relative(Path::new("/")).is_none()); + } + + #[test] + fn apply_layer_bytes_dispatches_on_media_type() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let tarball = build_tar(&[("plain.txt", b"v")]); + apply_layer_bytes(root, "application/vnd.oci.image.layer.v1.tar", &tarball).unwrap(); + assert!(root.join("plain.txt").exists()); + + let mut gz = Vec::new(); + { + let mut enc = flate2::write::GzEncoder::new(&mut gz, flate2::Compression::fast()); + enc.write_all(&build_tar(&[("gz.txt", b"v")])).unwrap(); + enc.finish().unwrap(); + } + apply_layer_bytes(root, "application/vnd.oci.image.layer.v1.tar+gzip", &gz).unwrap(); + assert!(root.join("gz.txt").exists()); + } + + #[test] + fn apply_layer_bytes_rejects_zstd_in_v1() { + let tmp = tempfile::tempdir().unwrap(); + let err = apply_layer_bytes( + tmp.path(), + "application/vnd.oci.image.layer.v1.tar+zstd", + b"", + ) + .expect_err("zstd should be rejected"); + assert!(err.to_string().contains("zstd")); + } + + #[test] + fn apply_layer_bytes_rejects_unknown_media_type() { + let tmp = tempfile::tempdir().unwrap(); + let err = apply_layer_bytes(tmp.path(), "application/bogus", b"") + .expect_err("unknown media type should fail"); + assert!(err.to_string().contains("unknown")); + } + + #[test] + fn apply_layer_handles_gzip_streams() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + let tarball = build_tar(&[("hello.txt", b"world")]); + let mut gz = Vec::new(); + { + let mut enc = flate2::write::GzEncoder::new(&mut gz, flate2::Compression::fast()); + enc.write_all(&tarball).unwrap(); + enc.finish().unwrap(); + } + apply_layer(root, 
gz.as_slice()).unwrap(); + assert_eq!(fs::read_to_string(root.join("hello.txt")).unwrap(), "world"); + } +} diff --git a/crates/openshell-driver-vm/src/oci/fs_image.rs b/crates/openshell-driver-vm/src/oci/fs_image.rs new file mode 100644 index 000000000..97849e1c5 --- /dev/null +++ b/crates/openshell-driver-vm/src/oci/fs_image.rs @@ -0,0 +1,156 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Build a read-only squashfs image from a flattened rootfs tree. +//! +//! Shell out to `mksquashfs`. The binary is expected to ship with the VM +//! runtime bundle under `/mksquashfs`; callers pass an explicit +//! path so the build is reproducible and does not depend on `$PATH`. + +use std::io; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; + +/// Options for building a squashfs image. +#[derive(Debug, Clone)] +pub struct BuildOptions { + /// Path to the `mksquashfs` binary. + pub mksquashfs: PathBuf, + /// Compression algorithm passed via `-comp`. + pub compression: Compression, + /// Optional extra flags forwarded verbatim (e.g. `-no-xattrs`). + pub extra_args: Vec, +} + +impl BuildOptions { + #[must_use] + pub fn with_binary(mksquashfs: PathBuf) -> Self { + Self { + mksquashfs, + compression: Compression::Zstd, + extra_args: Vec::new(), + } + } +} + +/// Compression algorithm for squashfs builds. `zstd` is the default; it has +/// the best decompression-speed-vs-ratio tradeoff for cold-start latency. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Compression { + Zstd, + Gzip, + Xz, +} + +impl Compression { + #[must_use] + pub const fn as_str(self) -> &'static str { + match self { + Self::Zstd => "zstd", + Self::Gzip => "gzip", + Self::Xz => "xz", + } + } +} + +/// Build a squashfs image from `source_dir` into `dest` using `options`. +/// +/// Returns an `io::Error` if the `mksquashfs` binary is missing or exits +/// non-zero. 
Callers are responsible for placing the result in the cache +/// via [`super::cache::CacheLayout::install_fs_image`]. +pub fn build(source_dir: &Path, dest: &Path, options: &BuildOptions) -> io::Result<()> { + if !options.mksquashfs.is_file() { + return Err(io::Error::new( + io::ErrorKind::NotFound, + format!( + "mksquashfs binary not found at {}", + options.mksquashfs.display() + ), + )); + } + if !source_dir.is_dir() { + return Err(io::Error::new( + io::ErrorKind::NotFound, + format!("source tree not found at {}", source_dir.display()), + )); + } + + if dest.exists() { + std::fs::remove_file(dest)?; + } + if let Some(parent) = dest.parent() { + std::fs::create_dir_all(parent)?; + } + + let mut cmd = Command::new(&options.mksquashfs); + cmd.arg(source_dir) + .arg(dest) + .arg("-noappend") + .arg("-quiet") + .arg("-comp") + .arg(options.compression.as_str()); + for arg in &options.extra_args { + cmd.arg(arg); + } + cmd.stdin(Stdio::null()); + + let output = cmd.output().map_err(|err| { + io::Error::other(format!( + "spawn mksquashfs {}: {err}", + options.mksquashfs.display() + )) + })?; + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(io::Error::other(format!( + "mksquashfs failed (status {}): {}", + output.status, + stderr.trim() + ))); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn build_fails_when_mksquashfs_is_missing() { + let tmp = tempfile::tempdir().unwrap(); + let source = tmp.path().join("src"); + std::fs::create_dir_all(&source).unwrap(); + let dest = tmp.path().join("out.squashfs"); + + let options = BuildOptions::with_binary(tmp.path().join("missing-mksquashfs")); + let err = build(&source, &dest, &options).expect_err("missing binary should fail"); + assert_eq!(err.kind(), io::ErrorKind::NotFound); + } + + #[test] + fn build_fails_when_source_tree_is_missing() { + let tmp = tempfile::tempdir().unwrap(); + let fake_bin = tmp.path().join("mksquashfs"); + 
std::fs::write(&fake_bin, "").unwrap(); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + std::fs::set_permissions(&fake_bin, std::fs::Permissions::from_mode(0o755)).unwrap(); + } + let options = BuildOptions::with_binary(fake_bin); + let err = build( + &tmp.path().join("missing-src"), + &tmp.path().join("out.squashfs"), + &options, + ) + .expect_err("missing source should fail"); + assert_eq!(err.kind(), io::ErrorKind::NotFound); + } + + #[test] + fn compression_tag_matches_mksquashfs_flag_values() { + assert_eq!(Compression::Zstd.as_str(), "zstd"); + assert_eq!(Compression::Gzip.as_str(), "gzip"); + assert_eq!(Compression::Xz.as_str(), "xz"); + } +} diff --git a/crates/openshell-driver-vm/src/oci/metadata.rs b/crates/openshell-driver-vm/src/oci/metadata.rs new file mode 100644 index 000000000..eecafc743 --- /dev/null +++ b/crates/openshell-driver-vm/src/oci/metadata.rs @@ -0,0 +1,336 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Normalized launch metadata derived from the OCI image config + sandbox spec. + +use std::collections::BTreeMap; +use std::fmt; + +use serde::{Deserialize, Serialize}; + +/// Guest platform an OCI manifest must match. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum Platform { + /// `linux/amd64` + LinuxAmd64, + /// `linux/arm64` + LinuxArm64, +} + +impl Platform { + /// Host build target. Returns `None` on unsupported host arches. + #[must_use] + pub fn host() -> Option { + match std::env::consts::ARCH { + "x86_64" => Some(Self::LinuxAmd64), + "aarch64" | "arm64" => Some(Self::LinuxArm64), + _ => None, + } + } + + /// OCI `os` component. + #[must_use] + pub const fn os(self) -> &'static str { + "linux" + } + + /// OCI `architecture` component. 
+ #[must_use] + pub const fn arch(self) -> &'static str { + match self { + Self::LinuxAmd64 => "amd64", + Self::LinuxArm64 => "arm64", + } + } + + /// Short string used in cache keys (`amd64`, `arm64`). + #[must_use] + pub const fn cache_tag(self) -> &'static str { + self.arch() + } +} + +impl fmt::Display for Platform { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}/{}", self.os(), self.arch()) + } +} + +/// Normalized command + environment the guest init will hand to the supervisor. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct LaunchMetadata { + /// Exact argv boundaries preserved (no shell split). + pub argv: Vec, + /// Ordered env, OCI config < template < sandbox spec. + pub env: Vec<(String, String)>, + /// Working directory inside the container rootfs. + pub workdir: String, + /// Labels copied from the OCI config (advisory; carried for introspection). + pub labels: BTreeMap, + /// Stop signal name from the OCI config (e.g. `SIGTERM`). Empty → default. + pub stop_signal: String, +} + +impl LaunchMetadata { + /// Normalize an OCI image config plus caller-supplied overrides into a + /// launch descriptor. + /// + /// Precedence for env: OCI config < template env < sandbox spec env. + /// Argv = OCI `Entrypoint` + `Cmd` per OCI spec precedence. + /// Workdir = OCI `WorkingDir` if absolute and non-empty, else `/sandbox`. + pub fn build( + image_config: ImageConfig, + template_env: &BTreeMap, + spec_env: &BTreeMap, + ) -> Result { + let argv = resolve_argv(&image_config.entrypoint, &image_config.cmd)?; + let workdir = resolve_workdir(&image_config.working_dir); + let env = merge_env(&image_config.env, template_env, spec_env)?; + + Ok(Self { + argv, + env, + workdir, + labels: image_config.labels, + stop_signal: image_config.stop_signal, + }) + } + + /// Render this metadata into env vars the guest init can consume. + /// + /// - `OPENSHELL_OCI_ARGC=`, `OPENSHELL_OCI_ARGV_=` for each i in 0..n. 
+ /// - `OPENSHELL_OCI_ENV_COUNT=`, `OPENSHELL_OCI_ENV_==` for each i. + /// - `OPENSHELL_OCI_WORKDIR=`. + /// + /// A single env channel keeps this delivery in-band with the krun + /// `set_exec` call, avoiding any on-disk metadata file or vsock transfer. + #[must_use] + pub fn to_guest_env_vars(&self) -> Vec<(String, String)> { + let mut out = Vec::with_capacity(self.argv.len() + self.env.len() + 3); + out.push(( + "OPENSHELL_OCI_ARGC".to_string(), + self.argv.len().to_string(), + )); + for (i, arg) in self.argv.iter().enumerate() { + out.push((format!("OPENSHELL_OCI_ARGV_{i}"), arg.clone())); + } + out.push(( + "OPENSHELL_OCI_ENV_COUNT".to_string(), + self.env.len().to_string(), + )); + for (i, (key, value)) in self.env.iter().enumerate() { + out.push((format!("OPENSHELL_OCI_ENV_{i}"), format!("{key}={value}"))); + } + out.push(("OPENSHELL_OCI_WORKDIR".to_string(), self.workdir.clone())); + out + } +} + +/// Minimal view of the OCI image config we care about. +#[derive(Debug, Clone, Default)] +pub struct ImageConfig { + pub entrypoint: Vec, + pub cmd: Vec, + pub env: Vec, + pub working_dir: String, + pub labels: BTreeMap, + pub stop_signal: String, +} + +/// Errors raised when the image config is missing required data. 
#[derive(Debug, thiserror::Error)]
pub enum BuildError {
    #[error("image config has no runnable command (Entrypoint and Cmd are both empty)")]
    EmptyCommand,
    #[error("image env entry is not KEY=VALUE: {0}")]
    MalformedEnv(String),
    #[error("template env entry has empty key")]
    EmptyTemplateEnvKey,
}

/// Concatenate `entrypoint` and `cmd` into the final argv.
///
/// Returns [`BuildError::EmptyCommand`] when both are empty.
fn resolve_argv(entrypoint: &[String], cmd: &[String]) -> Result<Vec<String>, BuildError> {
    let mut argv = Vec::with_capacity(entrypoint.len() + cmd.len());
    argv.extend(entrypoint.iter().cloned());
    argv.extend(cmd.iter().cloned());
    if argv.is_empty() {
        return Err(BuildError::EmptyCommand);
    }
    Ok(argv)
}

/// Use the OCI working directory when it is absolute; otherwise `/sandbox`.
fn resolve_workdir(oci_workdir: &str) -> String {
    // A string starting with '/' is necessarily non-empty.
    if oci_workdir.starts_with('/') {
        oci_workdir.to_string()
    } else {
        "/sandbox".to_string()
    }
}

/// Merge env from the OCI config, the template, and the sandbox spec.
///
/// Later sources override earlier ones: OCI config < template < spec. The
/// result is sorted by key, because the merge goes through a `BTreeMap`.
///
/// # Errors
///
/// - [`BuildError::MalformedEnv`] if an OCI entry contains no `=`.
/// - [`BuildError::EmptyTemplateEnvKey`] if a template key is empty.
fn merge_env(
    oci_env: &[String],
    template: &BTreeMap<String, String>,
    spec: &BTreeMap<String, String>,
) -> Result<Vec<(String, String)>, BuildError> {
    let mut merged: BTreeMap<String, String> = BTreeMap::new();
    for entry in oci_env {
        // Split on the first `=` only, so values may themselves contain `=`.
        let Some((key, value)) = entry.split_once('=') else {
            return Err(BuildError::MalformedEnv(entry.clone()));
        };
        merged.insert(key.to_string(), value.to_string());
    }
    for (key, value) in template {
        if key.is_empty() {
            return Err(BuildError::EmptyTemplateEnvKey);
        }
        merged.insert(key.clone(), value.clone());
    }
    for (key, value) in spec {
        merged.insert(key.clone(), value.clone());
    }
    Ok(merged.into_iter().collect())
}

#[cfg(test)]
mod tests {
    use super::*;

    fn config(entrypoint: &[&str], cmd: &[&str], env: &[&str], workdir: &str) -> ImageConfig {
        ImageConfig {
            entrypoint: entrypoint.iter().map(|s| (*s).to_string()).collect(),
            cmd: cmd.iter().map(|s| (*s).to_string()).collect(),
            env: env.iter().map(|s| (*s).to_string()).collect(),
            working_dir: workdir.to_string(),
            ..Default::default()
        }
    }

    #[test]
    fn argv_is_entrypoint_then_cmd() {
        let meta = LaunchMetadata::build(
            config(&["/bin/sh", "-c"], &["echo hi"], &[], "/app"),
            &BTreeMap::new(),
&BTreeMap::new(), + ) + .unwrap(); + assert_eq!(meta.argv, vec!["/bin/sh", "-c", "echo hi"]); + assert_eq!(meta.workdir, "/app"); + } + + #[test] + fn argv_falls_back_to_cmd_only() { + let meta = LaunchMetadata::build( + config(&[], &["/bin/busybox", "sh"], &[], ""), + &BTreeMap::new(), + &BTreeMap::new(), + ) + .unwrap(); + assert_eq!(meta.argv, vec!["/bin/busybox", "sh"]); + } + + #[test] + fn empty_command_is_rejected() { + let err = LaunchMetadata::build( + config(&[], &[], &[], ""), + &BTreeMap::new(), + &BTreeMap::new(), + ) + .expect_err("empty command must be rejected"); + assert!(matches!(err, BuildError::EmptyCommand)); + } + + #[test] + fn workdir_falls_back_to_sandbox() { + let meta = LaunchMetadata::build( + config(&["/bin/sh"], &[], &[], ""), + &BTreeMap::new(), + &BTreeMap::new(), + ) + .unwrap(); + assert_eq!(meta.workdir, "/sandbox"); + + let meta = LaunchMetadata::build( + config(&["/bin/sh"], &[], &[], "relative/path"), + &BTreeMap::new(), + &BTreeMap::new(), + ) + .unwrap(); + assert_eq!(meta.workdir, "/sandbox"); + } + + #[test] + fn env_precedence_is_oci_then_template_then_spec() { + let template: BTreeMap = [("A", "template"), ("B", "template")] + .into_iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + let spec: BTreeMap = [("B", "spec"), ("C", "spec")] + .into_iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + + let meta = LaunchMetadata::build( + config(&["/bin/sh"], &[], &["A=oci", "B=oci", "D=oci"], "/app"), + &template, + &spec, + ) + .unwrap(); + + let env: BTreeMap = meta.env.into_iter().collect(); + assert_eq!(env.get("A"), Some(&"template".to_string())); + assert_eq!(env.get("B"), Some(&"spec".to_string())); + assert_eq!(env.get("C"), Some(&"spec".to_string())); + assert_eq!(env.get("D"), Some(&"oci".to_string())); + } + + #[test] + fn malformed_oci_env_entry_is_rejected() { + let err = LaunchMetadata::build( + config(&["/bin/sh"], &[], &["BROKEN"], "/app"), + &BTreeMap::new(), + 
&BTreeMap::new(), + ) + .expect_err("missing '=' should fail"); + assert!(matches!(err, BuildError::MalformedEnv(_))); + } + + #[test] + fn to_guest_env_vars_round_trips_argv_with_spaces() { + let meta = LaunchMetadata::build( + config( + &["/bin/sh", "-c"], + &["echo 'hello world'"], + &["A=1"], + "/app", + ), + &BTreeMap::new(), + &BTreeMap::new(), + ) + .unwrap(); + + let env: BTreeMap = meta.to_guest_env_vars().into_iter().collect(); + assert_eq!(env.get("OPENSHELL_OCI_ARGC"), Some(&"3".to_string())); + assert_eq!( + env.get("OPENSHELL_OCI_ARGV_0"), + Some(&"/bin/sh".to_string()) + ); + assert_eq!(env.get("OPENSHELL_OCI_ARGV_1"), Some(&"-c".to_string())); + assert_eq!( + env.get("OPENSHELL_OCI_ARGV_2"), + Some(&"echo 'hello world'".to_string()) + ); + assert_eq!(env.get("OPENSHELL_OCI_ENV_COUNT"), Some(&"1".to_string())); + assert_eq!(env.get("OPENSHELL_OCI_ENV_0"), Some(&"A=1".to_string())); + assert_eq!(env.get("OPENSHELL_OCI_WORKDIR"), Some(&"/app".to_string())); + } + + #[test] + fn host_platform_is_recognized_on_supported_arches() { + let platform = Platform::host(); + // On CI/dev machines this should always be amd64 or arm64. + assert!(matches!( + platform, + Some(Platform::LinuxAmd64) | Some(Platform::LinuxArm64) + )); + } +} diff --git a/crates/openshell-driver-vm/src/oci/mod.rs b/crates/openshell-driver-vm/src/oci/mod.rs new file mode 100644 index 000000000..77cd266f7 --- /dev/null +++ b/crates/openshell-driver-vm/src/oci/mod.rs @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Host-side OCI image pipeline for the VM driver. +//! +//! Responsible for resolving a public OCI image reference to a cached, +//! read-only squashfs filesystem image and a launch metadata descriptor +//! that the guest uses to overlay + exec the container entrypoint. 
+
+pub mod cache;
+pub mod client;
+pub mod compat;
+pub mod flatten;
+pub mod fs_image;
+pub mod metadata;
+pub mod pipeline;
+
+pub use cache::{CacheLayout, CachedImage};
+pub use client::{OciPuller, PullError, PulledImage};
+pub use metadata::{LaunchMetadata, Platform};
+pub use pipeline::{EnvOverrides, PipelineError, prepare, validate_reference};
diff --git a/crates/openshell-driver-vm/src/oci/pipeline.rs b/crates/openshell-driver-vm/src/oci/pipeline.rs
new file mode 100644
index 000000000..34ec89163
--- /dev/null
+++ b/crates/openshell-driver-vm/src/oci/pipeline.rs
@@ -0,0 +1,147 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! End-to-end orchestrator: image ref → cached squashfs + launch metadata.
+//!
+//! The image is always resolved through the puller first, because the manifest
+//! digest is the cache key. On a cache hit the flatten + squashfs build is skipped;
+//! on a miss the layers are flattened, compat files injected, and the squashfs cached.
+
+use std::collections::BTreeMap;
+
+use tracing::{debug, info};
+
+use super::cache::{CacheLayout, CachedImage};
+use super::client::{OciPuller, PullError};
+use super::compat;
+use super::flatten;
+use super::fs_image::{self, BuildOptions};
+use super::metadata::{BuildError, LaunchMetadata};
+
+/// Sandbox- and template-level env overrides that the pipeline merges into
+/// the final launch metadata.
+#[derive(Debug, Default, Clone)]
+pub struct EnvOverrides {
+    pub template: BTreeMap<String, String>,
+    pub spec: BTreeMap<String, String>,
+}
+
+/// Prepare an OCI image into a cache-backed [`CachedImage`] descriptor.
+///
+/// Always pulls through `puller` to learn the manifest digest; when that digest +
+/// platform is already cached, the flatten and squashfs build steps are skipped.
+pub async fn prepare(
+    puller: &OciPuller,
+    cache: &CacheLayout,
+    build_opts: &BuildOptions,
+    image_ref: &str,
+    env_overrides: &EnvOverrides,
+) -> Result<CachedImage, PipelineError> {
+    cache.ensure_dirs().map_err(PipelineError::Cache)?;
+
+    let platform = puller.platform();
+
+    debug!(image = image_ref, %platform, "resolving OCI image");
+    let pulled = puller.pull(image_ref).await.map_err(PipelineError::Pull)?;
+    let manifest_digest = pulled.manifest_digest.clone();
+
+    // Launch metadata depends on this caller's env overrides, while the cache
+    // is keyed only by (digest, platform). Build it for every call so a cache
+    // hit never hands back the env merged for whichever sandbox built the
+    // image first. Doing it before the flatten also fails fast on images with
+    // no runnable command.
+    let metadata = LaunchMetadata::build(
+        pulled.image_config,
+        &env_overrides.template,
+        &env_overrides.spec,
+    )
+    .map_err(PipelineError::Metadata)?;
+
+    if let Some(hit) = cache.lookup(&manifest_digest, platform) {
+        info!(digest = %manifest_digest, %platform, "OCI cache hit, skipping build");
+        return Ok(CachedImage {
+            fs_image: hit.fs_image,
+            metadata,
+        });
+    }
+
+    debug!(digest = %manifest_digest, "flattening OCI layers");
+    let staging = cache
+        .tmp_dir()
+        .join(format!("stage-{}", strip_prefix(&manifest_digest)));
+    if staging.exists() {
+        std::fs::remove_dir_all(&staging).map_err(PipelineError::Cache)?;
+    }
+    std::fs::create_dir_all(&staging).map_err(PipelineError::Cache)?;
+
+    let built = cache
+        .tmp_dir()
+        .join(format!("build-{}.squashfs", strip_prefix(&manifest_digest)));
+
+    // Run the fallible build steps as one unit so the staging tree can be
+    // removed on every exit path, not only on success.
+    let layers = &pulled.layers;
+    let build_result = (|| -> Result<(), PipelineError> {
+        for layer in layers {
+            flatten::apply_layer_bytes(&staging, &layer.media_type, &layer.data)
+                .map_err(PipelineError::Flatten)?;
+        }
+
+        debug!("injecting OpenShell compatibility files");
+        compat::inject(&staging).map_err(PipelineError::Compat)?;
+
+        debug!(output = %built.display(), "building squashfs");
+        fs_image::build(&staging, &built, build_opts).map_err(PipelineError::Build)?;
+        Ok(())
+    })();
+
+    // Staging tree is no longer needed once the build has finished or failed.
+    let _ = std::fs::remove_dir_all(&staging);
+    if let Err(err) = build_result {
+        // Best effort: do not leave a partial image behind in the tmp dir.
+        let _ = std::fs::remove_file(&built);
+        return Err(err);
+    }
+
+    let installed = cache
+        .install_fs_image(&manifest_digest, platform, &built)
+        .map_err(PipelineError::Cache)?;
+    // NOTE(review): the metadata persisted here still contains this caller's
+    // env overrides. The value returned on later hits no longer depends on
+    // it, but consider storing override-free metadata in the shared cache.
+    cache
+        .write_metadata(&manifest_digest, platform, &metadata)
+        .map_err(PipelineError::Cache)?;
+
+    info!(digest = %manifest_digest, %platform, path = %installed.display(), "OCI image prepared");
+    Ok(CachedImage {
+        fs_image: installed,
+        metadata,
+    })
+}
+
+/// Validate that an image reference is structurally OK before we bother the
+/// registry. Useful for `validate_sandbox_create`.
+///
+/// Only parses the reference; a parse failure is reported as
+/// `PipelineError::Pull(PullError::InvalidReference(..))`.
+pub fn validate_reference(image_ref: &str) -> Result<(), PipelineError> {
+    use std::str::FromStr;
+    oci_client::Reference::from_str(image_ref)
+        .map(|_| ())
+        .map_err(|err| PipelineError::Pull(PullError::InvalidReference(err.to_string())))
+}
+
+/// Failure of one pipeline stage; each variant wraps that stage's error.
+#[allow(clippy::module_name_repetitions)]
+#[derive(Debug, thiserror::Error)]
+pub enum PipelineError {
+    #[error("cache I/O: {0}")]
+    Cache(#[source] std::io::Error),
+    #[error(transparent)]
+    Pull(PullError),
+    #[error("flatten layer: {0}")]
+    Flatten(#[source] std::io::Error),
+    #[error("inject compat files: {0}")]
+    Compat(#[source] std::io::Error),
+    #[error(transparent)]
+    Metadata(BuildError),
+    #[error("build fs image: {0}")]
+    Build(#[source] std::io::Error),
+}
+
+/// Return the part of `digest` after the first `:` (e.g. the hex of
+/// `sha256:<hex>`), or the whole string when there is no `:`.
+fn strip_prefix(digest: &str) -> &str {
+    digest.split_once(':').map_or(digest, |(_, hex)| hex)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn validate_reference_accepts_canonical_image_ref() {
+        validate_reference("docker.io/library/alpine:3.20").expect("valid");
+        validate_reference(
+            "ghcr.io/org/image@sha256:0000000000000000000000000000000000000000000000000000000000000000",
+        )
+        .expect("digest ref");
+    }
+
+    #[test]
+    fn validate_reference_rejects_empty_string() {
+        validate_reference("").expect_err("empty ref should fail");
+    }
+}
diff --git a/crates/openshell-driver-vm/src/runtime.rs b/crates/openshell-driver-vm/src/runtime.rs
index 9888feb18..1f4669bc5 100644
--- 
a/crates/openshell-driver-vm/src/runtime.rs
+++ b/crates/openshell-driver-vm/src/runtime.rs
@@ -29,6 +29,34 @@ pub struct VmLaunchConfig {
     pub port_map: Vec,
     pub log_level: u32,
     pub console_output: PathBuf,
+    /// Optional host-backed raw block image for mutable guest state.
+    /// Required when booting an imported OCI rootfs.
+    pub state_disk: Option<StateDisk>,
+    /// Optional host-backed read-only base disk (e.g. a cached squashfs
+    /// image) used as the lower layer of an overlay mount in the guest.
+    /// Only set for OCI-image sandboxes.
+    pub ro_base_disk: Option<StateDisk>,
+    /// Optional host Unix socket bridged into the guest as a vsock port.
+    /// Used by the OCI payload import channel.
+    pub import_vsock: Option<ImportVsock>,
+}
+
+/// Block device exposed to the guest.
+///
+/// The name is historical; both writable state disks and read-only base
+/// images (e.g. squashfs) use this type. `read_only` distinguishes them.
+#[derive(Debug, Clone)]
+pub struct StateDisk {
+    pub path: PathBuf,
+    pub block_id: String,
+    pub read_only: bool,
+}
+
+/// Host-side endpoint bridged to a guest vsock port.
+#[derive(Debug, Clone)]
+pub struct ImportVsock {
+    pub port: u32,
+    pub socket_path: PathBuf,
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -64,6 +92,18 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> {
     vm.set_root(&config.rootfs)?;
     vm.set_workdir(&config.workdir)?;
 
+    if let Some(base_disk) = &config.ro_base_disk {
+        vm.add_state_disk(base_disk)?;
+    }
+    if let Some(rw_disk) = &config.state_disk {
+        vm.add_state_disk(rw_disk)?;
+    }
+    if let Some(import) = &config.import_vsock {
+        crate::state_disk::prepare_import_socket_dir(&import.socket_path)
+            .map_err(|e| format!("prepare import socket dir: {e}"))?;
+        vm.add_vsock_port(import)?;
+    }
+
     let mut forwarded_port_map = config.port_map.clone();
     let mut gvproxy_guard = None;
     let mut gvproxy_api_sock = None;
@@ -273,6 +313,17 @@ fn raise_nofile_limit() {
     }
 }
 
+fn state_disk_sync_mode() -> u32 {
+    #[cfg(not(target_os = "macos"))]
+    {
+        ffi::KRUN_SYNC_FULL
+    }
+    #[cfg(target_os = "macos")]
+    {
+        ffi::KRUN_SYNC_RELAXED
+    }
+}
+
 fn clamp_log_level(level: u32) -> u32 {
     match level {
         0 => ffi::KRUN_LOG_LEVEL_OFF,
@@ -330,6 +381,39 @@ impl VmContext {
         )
     }
 
+    fn add_state_disk(&self, disk: &StateDisk) -> Result<(), String> {
+        let Some(krun_add_disk3) = self.krun.krun_add_disk3 else {
+            return Err("libkrun runtime does not expose krun_add_disk3; rebuild the VM runtime with block support".to_string());
+        };
+        let c_block_id =
+            CString::new(disk.block_id.as_str()).map_err(|e| format!("invalid block id: {e}"))?;
+        let c_disk_path = path_to_cstring(&disk.path)?;
+        check(
+            unsafe {
+                krun_add_disk3(
+                    self.ctx_id,
+                    c_block_id.as_ptr(),
+                    c_disk_path.as_ptr(),
+                    ffi::KRUN_DISK_FORMAT_RAW,
+                    disk.read_only,
+                    false,
+                    state_disk_sync_mode(),
+                )
+            },
+            "krun_add_disk3",
+        )
+    }
+
+    fn add_vsock_port(&self, vsock: &ImportVsock) -> Result<(), String> {
+        let c_socket = path_to_cstring(&vsock.socket_path)?;
+        check(
+            unsafe {
+                (self.krun.krun_add_vsock_port2)(self.ctx_id, vsock.port, c_socket.as_ptr(), true)
+            },
+            "krun_add_vsock_port2",
+        )
+    }
+
     fn set_workdir(&self, workdir: &str) -> Result<(), String> {
         let workdir_c = CString::new(workdir).map_err(|e| format!("invalid workdir: {e}"))?;
         check(
diff --git a/crates/openshell-driver-vm/src/state_disk.rs b/crates/openshell-driver-vm/src/state_disk.rs
new file mode 100644
index 000000000..d7ad700a3
--- /dev/null
+++ b/crates/openshell-driver-vm/src/state_disk.rs
@@ -0,0 +1,260 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Per-sandbox state-disk and host-to-guest import-socket helpers for the
+//! OCI container execution path.
+
+#![allow(unsafe_code)]
+
+use std::fs;
+use std::io;
+use std::os::unix::fs::PermissionsExt as _;
+use std::path::{Path, PathBuf};
+
+/// Size used for the raw state disk when the driver supplies no override
+/// (16 GiB). The image is sparse, so only written blocks occupy host space.
+pub const DEFAULT_STATE_DISK_SIZE_BYTES: u64 = 16 * 1024 * 1024 * 1024;
+
+/// Block ID under which the state disk is attached; the guest init script
+/// looks the disk up by this ID.
+pub const STATE_DISK_BLOCK_ID: &str = "sandbox-state";
+
+/// Guest vsock port reserved for the one-shot OCI payload import.
+pub const IMPORT_VSOCK_PORT: u32 = 10778;
+
+/// Where a sandbox keeps its state disk and its import socket.
+#[derive(Debug, Clone)]
+pub struct SandboxStatePaths {
+    /// Sparse raw disk image that gets attached to the VM.
+    pub state_disk: PathBuf,
+    /// Host Unix socket bridged to the guest on [`IMPORT_VSOCK_PORT`].
+    pub import_socket: PathBuf,
+}
+
+impl SandboxStatePaths {
+    /// Derive both paths from the sandbox's state directory.
+    #[must_use]
+    pub fn for_state_dir(state_dir: &Path) -> Self {
+        let state_disk = state_dir.join("sandbox-state.raw");
+        let import_socket = state_dir.join("oci-import.sock");
+        Self {
+            state_disk,
+            import_socket,
+        }
+    }
+}
+
+/// Create (or grow to size) the sparse raw state disk image.
+pub fn ensure_state_disk(path: &Path, size_bytes: u64) -> io::Result<()> {
+    if let Some(parent) = path.parent() {
+        fs::create_dir_all(parent)?;
+    }
+
+    // `truncate(false)`: an existing image keeps its contents.
+    let file = fs::OpenOptions::new()
+        .read(true)
+        .write(true)
+        .create(true)
+        .truncate(false)
+        .open(path)?;
+
+    // Only ever grow; an image that is already larger is left untouched.
+    let current = file.metadata()?.len();
+    if current < size_bytes {
+        file.set_len(size_bytes)?;
+    }
+    Ok(())
+}
+
+/// Prepare the import-socket parent directory and remove any stale socket file.
+///
+/// The parent directory is created with `0700`. If it already exists, it must
+/// not be a symlink and must be owned by the current uid, otherwise we refuse
+/// to use it — a tampered path would let an unprivileged user substitute the
+/// socket before the VM connects to it.
+///
+/// The directory is inspected with `symlink_metadata` (no symlink following),
+/// so a dangling symlink is reported as a symlink instead of being treated as
+/// "absent". A missing directory is created with mode `0700` from the start
+/// rather than created with default permissions and tightened afterwards.
+pub fn prepare_import_socket_dir(socket_path: &Path) -> io::Result<()> {
+    use std::os::unix::fs::DirBuilderExt as _;
+
+    let parent = socket_path
+        .parent()
+        .ok_or_else(|| io::Error::other("import socket path has no parent directory"))?;
+
+    match parent.symlink_metadata() {
+        Ok(meta) => validate_existing_socket_dir(parent, &meta)?,
+        Err(err) if err.kind() == io::ErrorKind::NotFound => {
+            // Ancestors keep default permissions; only the socket directory
+            // itself has to be private.
+            if let Some(ancestors) = parent.parent() {
+                fs::create_dir_all(ancestors)?;
+            }
+            match fs::DirBuilder::new().mode(0o700).create(parent) {
+                // The umask can only clear bits; set the mode explicitly so
+                // the directory ends up exactly `0700`.
+                Ok(()) => fs::set_permissions(parent, fs::Permissions::from_mode(0o700))?,
+                // Someone created it between the check and the create: treat
+                // it like any other pre-existing directory and vet it.
+                Err(err) if err.kind() == io::ErrorKind::AlreadyExists => {
+                    let meta = parent.symlink_metadata()?;
+                    validate_existing_socket_dir(parent, &meta)?;
+                }
+                Err(err) => return Err(err),
+            }
+        }
+        Err(err) => return Err(err),
+    }
+
+    match fs::remove_file(socket_path) {
+        Ok(()) => Ok(()),
+        Err(err) if err.kind() == io::ErrorKind::NotFound => Ok(()),
+        Err(err) => Err(err),
+    }
+}
+
+/// Reject a pre-existing import-socket directory that is a symlink, is not
+/// owned by the current uid, or grants any group/other permission bits.
+fn validate_existing_socket_dir(dir: &Path, meta: &fs::Metadata) -> io::Result<()> {
+    if meta.file_type().is_symlink() {
+        return Err(io::Error::other(format!(
+            "import socket directory {} is a symlink; refusing to use it",
+            dir.display()
+        )));
+    }
+    check_owner_and_mode(dir, meta)
+}
+
+/// Verify that `path` is not a symlink and is owned by the current uid, and
+/// that its parent directory is likewise a non-symlink owned by the current
+/// uid with no group/other permission bits (`0o700` or stricter). Returns an
+/// error if any check fails.
+pub fn verify_import_socket_path(path: &Path) -> io::Result<()> {
+    // Same refusal for the socket and for its directory; only the noun in the
+    // message differs.
+    let refuse_symlink = |what: &str, target: &Path| {
+        io::Error::other(format!(
+            "import socket {what} {} is a symlink; refusing to use it",
+            target.display()
+        ))
+    };
+
+    let socket_meta = path.symlink_metadata()?;
+    if socket_meta.file_type().is_symlink() {
+        return Err(refuse_symlink("path", path));
+    }
+    check_owner(path, &socket_meta)?;
+
+    let Some(dir) = path.parent() else {
+        return Ok(());
+    };
+    let dir_meta = dir.symlink_metadata()?;
+    if dir_meta.file_type().is_symlink() {
+        return Err(refuse_symlink("directory", dir));
+    }
+    check_owner_and_mode(dir, &dir_meta)
+}
+
+/// Owner check plus a permission check: any group/other bit is an error.
+#[cfg(unix)]
+fn check_owner_and_mode(path: &Path, meta: &fs::Metadata) -> io::Result<()> {
+    check_owner(path, meta)?;
+    let mode = meta.permissions().mode() & 0o777;
+    if mode & 0o077 == 0 {
+        return Ok(());
+    }
+    Err(io::Error::other(format!(
+        "import socket directory {} has permissions {mode:o}; expected 0700",
+        path.display()
+    )))
+}
+
+/// No-op on non-unix targets.
+#[cfg(not(unix))]
+fn check_owner_and_mode(_path: &Path, _meta: &fs::Metadata) -> io::Result<()> {
+    Ok(())
+}
+
+/// Fail unless `meta` reports the calling process's uid as owner.
+#[cfg(unix)]
+fn check_owner(path: &Path, meta: &fs::Metadata) -> io::Result<()> {
+    use std::os::unix::fs::MetadataExt as _;
+    let uid = unsafe { libc::getuid() };
+    let owner = meta.uid();
+    if owner == uid {
+        return Ok(());
+    }
+    Err(io::Error::other(format!(
+        "{} is owned by uid {owner} but we are uid {uid}",
+        path.display()
+    )))
+}
+
+/// No-op on non-unix targets.
+#[cfg(not(unix))]
+fn check_owner(_path: &Path, _meta: &fs::Metadata) -> io::Result<()> {
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::atomic::{AtomicU64, Ordering};
+    use std::time::{SystemTime, UNIX_EPOCH};
+
+    /// Fresh, not-yet-created path under the system temp dir.
+    fn unique_temp_dir() -> PathBuf {
+        static COUNTER: AtomicU64 = AtomicU64::new(0);
+        let nanos = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .unwrap()
+            .as_nanos();
+        let suffix = COUNTER.fetch_add(1, Ordering::Relaxed);
+        let pid = std::process::id();
+        std::env::temp_dir().join(format!("openshell-state-disk-test-{pid}-{nanos}-{suffix}"))
+    }
+
+    #[test]
+    fn sandbox_state_paths_places_files_inside_state_dir() {
+        let layout = SandboxStatePaths::for_state_dir(Path::new("/srv/state/abc"));
+        assert_eq!(
+            layout.state_disk,
+            Path::new("/srv/state/abc/sandbox-state.raw")
+        );
+        assert_eq!(
+            layout.import_socket,
+            Path::new("/srv/state/abc/oci-import.sock")
+        );
+    }
+
+    #[test]
+    fn ensure_state_disk_creates_sparse_file_of_requested_size() {
+        const ONE_MIB: u64 = 1024 * 1024;
+        let root = unique_temp_dir();
+        let image = root.join("state.raw");
+        ensure_state_disk(&image, ONE_MIB).expect("create disk");
+        let len = fs::metadata(&image).expect("stat disk").len();
+        assert_eq!(len, ONE_MIB);
+        let _ = fs::remove_dir_all(&root);
+    }
+
+    #[test]
+    fn ensure_state_disk_grows_but_does_not_shrink() {
+        let root = unique_temp_dir();
+        let image = root.join("state.raw");
+        let len_of = |p: &Path| fs::metadata(p).unwrap().len();
+        ensure_state_disk(&image, 4096).expect("initial");
+        ensure_state_disk(&image, 8192).expect("grow");
+        assert_eq!(len_of(&image), 8192);
+        ensure_state_disk(&image, 2048).expect("shrink noop");
+        assert_eq!(len_of(&image), 8192);
+        let _ = fs::remove_dir_all(&root);
+    }
+
+    #[test]
+    fn prepare_import_socket_dir_creates_0700_dir_when_absent() {
+        let dir = unique_temp_dir();
+        let socket = dir.join("oci-import.sock");
+        prepare_import_socket_dir(&socket).expect("prepare");
+        let mode = fs::metadata(&dir).unwrap().permissions().mode();
+        assert_eq!(mode & 0o777, 0o700);
+        let _ = fs::remove_dir_all(&dir);
+    }
+
+    #[test]
+    fn prepare_import_socket_dir_removes_stale_socket_file() {
+        let dir = unique_temp_dir();
+        fs::create_dir_all(&dir).unwrap();
+        fs::set_permissions(&dir, fs::Permissions::from_mode(0o700)).unwrap();
+        let socket = dir.join("oci-import.sock");
+        fs::write(&socket, b"stale").unwrap();
+
+        prepare_import_socket_dir(&socket).expect("prepare");
+        assert!(!socket.exists(), "stale socket should be removed");
+        let _ = fs::remove_dir_all(&dir);
+    }
+
+    #[test]
+    fn prepare_import_socket_dir_rejects_world_writable_dir() {
+        let dir = unique_temp_dir();
+        fs::create_dir_all(&dir).unwrap();
+        fs::set_permissions(&dir, fs::Permissions::from_mode(0o755)).unwrap();
+        let socket = dir.join("oci-import.sock");
+        let failure = prepare_import_socket_dir(&socket).expect_err("loose dir should be rejected");
+        assert!(failure.to_string().contains("permissions"));
+        let _ = fs::remove_dir_all(&dir);
+    }
+
+    #[test]
+    fn verify_import_socket_path_rejects_symlink() {
+        let dir = unique_temp_dir();
+        fs::create_dir_all(&dir).unwrap();
+        let real = dir.join("real.sock");
+        fs::write(&real, b"").unwrap();
+        let alias = dir.join("oci-import.sock");
+        std::os::unix::fs::symlink(&real, &alias).unwrap();
+        let failure =
+            verify_import_socket_path(&alias).expect_err("symlinked socket should be rejected");
+        assert!(failure.to_string().contains("symlink"));
+        let _ = fs::remove_dir_all(&dir);
+    }
+}
diff --git a/crates/openshell-driver-vm/tests/oci_pipeline_integration.rs b/crates/openshell-driver-vm/tests/oci_pipeline_integration.rs
new file mode 100644
index 000000000..30a4e2aa9
--- /dev/null
+++ b/crates/openshell-driver-vm/tests/oci_pipeline_integration.rs
@@ -0,0 +1,145 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Integration test for the OCI pipeline minus the network step.
+//!
+//! Builds a synthetic rootfs using the `flatten` module, injects compat files,
+//! runs `mksquashfs` to produce a real RO base image, installs it in the
+//! cache, and verifies the resulting fs image is non-empty and the cache
+//! lookup round-trips.
+//!
+//! Gated on `mksquashfs` being present in `$PATH`. Run with:
+//! 
cargo test -p openshell-driver-vm --tests -- --ignored
+
+use std::collections::BTreeMap;
+use std::fs;
+use std::io::Write;
+use std::path::PathBuf;
+
+use openshell_driver_vm::oci::{
+    CacheLayout, LaunchMetadata, Platform, compat,
+    flatten::apply_tar_stream,
+    fs_image::{BuildOptions, build},
+    metadata::ImageConfig,
+};
+
+/// First regular file named `bin` found in a `$PATH` directory, if any.
+fn which(bin: &str) -> Option<PathBuf> {
+    let search_path = std::env::var_os("PATH")?;
+    std::env::split_paths(&search_path)
+        .map(|dir| dir.join(bin))
+        .find(|candidate| candidate.is_file())
+}
+
+/// Append one GNU-format entry (directory or regular file) to the archive.
+fn append_entry<W: Write>(
+    archive: &mut tar::Builder<W>,
+    path: &str,
+    mode: u32,
+    kind: tar::EntryType,
+    data: &[u8],
+) {
+    let mut header = tar::Header::new_gnu();
+    header.set_path(path).unwrap();
+    header.set_size(data.len() as u64);
+    header.set_mode(mode);
+    header.set_entry_type(kind);
+    header.set_cksum();
+    archive.append(&header, data).unwrap();
+}
+
+/// In-memory tar with a few directories, `bin/sh`, and `etc/passwd`.
+fn build_minimal_tar() -> Vec<u8> {
+    let mut bytes = Vec::new();
+    {
+        let mut archive = tar::Builder::new(&mut bytes);
+
+        for dir in ["bin/", "etc/", "usr/", "usr/bin/"] {
+            append_entry(&mut archive, dir, 0o755, tar::EntryType::Directory, &[]);
+        }
+
+        append_entry(
+            &mut archive,
+            "bin/sh",
+            0o755,
+            tar::EntryType::Regular,
+            b"#!/bin/sh\n:\n",
+        );
+        append_entry(
+            &mut archive,
+            "etc/passwd",
+            0o644,
+            tar::EntryType::Regular,
+            b"root:x:0:0:root:/root:/bin/sh\n",
+        );
+
+        archive.finish().unwrap();
+    }
+    bytes
+}
+
+#[test]
+#[ignore = "requires mksquashfs in $PATH; run with `cargo test -- --ignored`"]
+fn full_pipeline_without_network_produces_cached_image() {
+    let mksquashfs = match which("mksquashfs") {
+        Some(path) => path,
+        None => {
+            eprintln!("mksquashfs not found on PATH; skipping");
+            return;
+        }
+    };
+
+    let scratch = tempfile::tempdir().unwrap();
+
+    // 1. Flatten a synthetic "image" layer into a staging tree.
+    let staging = scratch.path().join("stage");
+    fs::create_dir_all(&staging).unwrap();
+    let layer = build_minimal_tar();
+    apply_tar_stream(&staging, layer.as_slice()).unwrap();
+
+    // 2. Inject OpenShell compat files.
+    compat::inject(&staging).unwrap();
+    assert!(staging.join("sandbox").is_dir());
+    assert!(staging.join("tmp").is_dir());
+    let passwd = fs::read_to_string(staging.join("etc/passwd")).unwrap();
+    assert!(passwd.contains("sandbox:x:10001:10001:"));
+
+    // 3. Build squashfs.
+    let cache_root = scratch.path().join("cache");
+    let cache = CacheLayout::new(cache_root.clone());
+    cache.ensure_dirs().unwrap();
+    let built = cache.tmp_dir().join("build.squashfs");
+    let build_opts = BuildOptions::with_binary(mksquashfs);
+    build(&staging, &built, &build_opts).expect("mksquashfs build");
+    assert!(built.is_file());
+    let image_len = fs::metadata(&built).unwrap().len();
+    assert!(image_len > 0, "squashfs image should be non-empty");
+
+    // 4. Install + write metadata, then round-trip the lookup.
+    let digest = "sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+    let platform = Platform::host().expect("host platform must be supported");
+
+    let image_config = ImageConfig {
+        entrypoint: vec!["/bin/sh".to_string()],
+        cmd: vec!["-c".to_string(), "true".to_string()],
+        env: vec!["PATH=/bin".to_string()],
+        working_dir: "/sandbox".to_string(),
+        labels: BTreeMap::new(),
+        stop_signal: String::new(),
+    };
+    let no_overrides = BTreeMap::new();
+    let metadata = LaunchMetadata::build(image_config, &no_overrides, &no_overrides).unwrap();
+
+    let installed = cache.install_fs_image(digest, platform, &built).unwrap();
+    cache.write_metadata(digest, platform, &metadata).unwrap();
+    assert!(installed.is_file());
+    assert!(!built.exists(), "built image should be moved, not copied");
+
+    let hit = cache
+        .lookup(digest, platform)
+        .expect("cache lookup should hit after install");
+    assert_eq!(hit.fs_image, installed);
+    assert_eq!(hit.metadata.argv, metadata.argv);
+
+    // 5. A second install is idempotent (removes + re-moves into the same slot).
+    let rebuilt = cache.tmp_dir().join("rebuild.squashfs");
+    let installed_bytes = fs::read(&installed).unwrap();
+    {
+        let mut copy = fs::File::create(&rebuilt).unwrap();
+        copy.write_all(&installed_bytes).unwrap();
+    }
+    let reinstalled = cache.install_fs_image(digest, platform, &rebuilt).unwrap();
+    assert_eq!(reinstalled, installed);
+}
diff --git a/crates/openshell-sandbox/src/container_env.rs b/crates/openshell-sandbox/src/container_env.rs
new file mode 100644
index 000000000..f4d1c35e4
--- /dev/null
+++ b/crates/openshell-sandbox/src/container_env.rs
@@ -0,0 +1,248 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Clean-env mode for supervised container processes.
+//!
+//! When the sandbox is launched as an OCI container (VM driver with a
+//! `template.image`), the guest init strips the OCI metadata vars it received
+//! 
from the driver and repackages the final merged container env into +//! `OPENSHELL_CONTAINER_ENV_<N>` vars before exec'ing the supervisor. It also +//! sets `OPENSHELL_CONTAINER_MODE=1`. +//! +//! In that mode the supervisor does **not** let its own environ leak to the +//! child process. It starts the child with an empty baseline and applies only +//! a documented allowlist: the container env, provider/proxy/TLS env from +//! policy, `OPENSHELL_SANDBOX=1`, and minimal shell defaults (`HOME`, `PATH`, +//! `TERM`). + +use std::collections::HashMap; +use tokio::process::Command; + +/// Env var that gates clean-env behavior. Set by the guest init when the +/// supervisor is launching an OCI image. +pub(crate) const CONTAINER_MODE_ENV: &str = "OPENSHELL_CONTAINER_MODE"; +/// `OPENSHELL_CONTAINER_ENV_COUNT` — number of container env entries. +pub(crate) const CONTAINER_ENV_COUNT: &str = "OPENSHELL_CONTAINER_ENV_COUNT"; +/// Prefix for `OPENSHELL_CONTAINER_ENV_<N>=KEY=VALUE` entries. +pub(crate) const CONTAINER_ENV_PREFIX: &str = "OPENSHELL_CONTAINER_ENV_"; + +/// Default search PATH for the child when none was supplied by the image. +const DEFAULT_CONTAINER_PATH: &str = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"; + +/// Returns `true` when `OPENSHELL_CONTAINER_MODE=1` is set in the supervisor's +/// own environ. +pub(crate) fn is_container_mode() -> bool { + std::env::var(CONTAINER_MODE_ENV).is_ok_and(|v| v == "1") +} + +/// Read container env entries packed as `OPENSHELL_CONTAINER_ENV_<N>=KEY=VAL` +/// and return them as an ordered `(key, value)` list. Later entries win if the +/// same key is repeated, matching the merge order produced by the host. +/// +/// Unparseable entries are skipped; they should have been validated upstream. 
+pub(crate) fn read_container_env() -> Vec<(String, String)> { + let count: usize = std::env::var(CONTAINER_ENV_COUNT) + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(0); + + let mut out = Vec::with_capacity(count); + for i in 0..count { + let Ok(raw) = std::env::var(format!("{CONTAINER_ENV_PREFIX}{i}")) else { + continue; + }; + let Some((key, value)) = raw.split_once('=') else { + continue; + }; + if !key.is_empty() { + out.push((key.to_string(), value.to_string())); + } + } + out +} + +/// Clear the command's inherited environ and apply a clean baseline suitable +/// for container-mode execution. +/// +/// Adds (in this order so later values win on conflict): +/// 1. Minimal shell defaults (`HOME=/sandbox`, `PATH=`, `TERM=xterm`). +/// 2. Entries from [`read_container_env`] (the OCI image env + template/spec +/// overrides). +/// 3. `OPENSHELL_SANDBOX=1` marker (always set, even if the image tried to +/// override it). +/// +/// Callers layer provider env, proxy env, and TLS env *after* this call; that +/// order matches the pre-existing non-container flow. +pub(crate) fn apply_clean_container_baseline(cmd: &mut Command) { + cmd.env_clear(); + cmd.env("HOME", "/sandbox"); + cmd.env("PATH", DEFAULT_CONTAINER_PATH); + cmd.env("TERM", "xterm"); + for (key, value) in read_container_env() { + cmd.env(key, value); + } + // OPENSHELL_SANDBOX is a documented marker for programs inside the + // sandbox. Apply after container env so images cannot disable it. + cmd.env("OPENSHELL_SANDBOX", "1"); +} + +/// Parse a `KEY=VALUE` string, or `None` if it is missing an `=`. +#[cfg(test)] +pub(crate) fn parse_kv(raw: &str) -> Option<(String, String)> { + let (key, value) = raw.split_once('=')?; + if key.is_empty() { + return None; + } + Some((key.to_string(), value.to_string())) +} + +/// Build a `HashMap` of the env vars currently set on `cmd`, for testing. 
+#[cfg(test)] +pub(crate) fn command_env_snapshot(cmd: &Command) -> HashMap { + cmd.as_std() + .get_envs() + .filter_map(|(k, v)| { + let key = k.to_str()?.to_string(); + let value = v?.to_str()?.to_string(); + Some((key, value)) + }) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::Mutex; + + // Tests touch process-wide env vars; serialize them to avoid races. + static ENV_LOCK: Mutex<()> = Mutex::new(()); + + struct EnvGuard { + keys: Vec, + } + + impl EnvGuard { + fn new() -> Self { + Self { keys: Vec::new() } + } + + fn set(&mut self, key: &str, value: &str) { + self.keys.push(key.to_string()); + // SAFETY: guarded by ENV_LOCK. + #[allow(unsafe_code)] + unsafe { + std::env::set_var(key, value); + } + } + } + + impl Drop for EnvGuard { + fn drop(&mut self) { + #[allow(unsafe_code)] + unsafe { + for key in &self.keys { + std::env::remove_var(key); + } + } + } + } + + #[test] + fn is_container_mode_matches_only_when_env_is_one() { + let _lock = ENV_LOCK.lock().unwrap(); + let mut guard = EnvGuard::new(); + assert!(!is_container_mode(), "default should be off"); + guard.set(CONTAINER_MODE_ENV, "0"); + assert!(!is_container_mode()); + guard.set(CONTAINER_MODE_ENV, "1"); + assert!(is_container_mode()); + } + + #[test] + fn read_container_env_decodes_ordered_pairs() { + let _lock = ENV_LOCK.lock().unwrap(); + let mut guard = EnvGuard::new(); + guard.set(CONTAINER_ENV_COUNT, "3"); + guard.set(&format!("{CONTAINER_ENV_PREFIX}0"), "A=1"); + guard.set(&format!("{CONTAINER_ENV_PREFIX}1"), "B=2"); + guard.set(&format!("{CONTAINER_ENV_PREFIX}2"), "PATH=/custom/bin"); + + let entries = read_container_env(); + assert_eq!( + entries, + vec![ + ("A".to_string(), "1".to_string()), + ("B".to_string(), "2".to_string()), + ("PATH".to_string(), "/custom/bin".to_string()), + ] + ); + } + + #[test] + fn read_container_env_skips_malformed_or_missing_entries() { + let _lock = ENV_LOCK.lock().unwrap(); + let mut guard = EnvGuard::new(); + 
guard.set(CONTAINER_ENV_COUNT, "3"); + guard.set(&format!("{CONTAINER_ENV_PREFIX}0"), "A=1"); + // index 1 is missing + guard.set(&format!("{CONTAINER_ENV_PREFIX}2"), "no-equals-sign"); + + let entries = read_container_env(); + assert_eq!(entries, vec![("A".to_string(), "1".to_string())]); + } + + #[tokio::test] + async fn apply_clean_baseline_clears_existing_env_and_seeds_defaults() { + let _lock = ENV_LOCK.lock().unwrap(); + let mut guard = EnvGuard::new(); + guard.set(CONTAINER_ENV_COUNT, "1"); + guard.set(&format!("{CONTAINER_ENV_PREFIX}0"), "FROM_IMAGE=yes"); + + let mut cmd = Command::new("/usr/bin/true"); + cmd.env("LEAKED_FROM_PARENT", "should-be-cleared"); + cmd.env("OPENSHELL_CONTROL_SECRET", "must-not-leak"); + apply_clean_container_baseline(&mut cmd); + + let env = command_env_snapshot(&cmd); + assert_eq!(env.get("HOME"), Some(&"/sandbox".to_string())); + assert_eq!(env.get("TERM"), Some(&"xterm".to_string())); + assert_eq!(env.get("FROM_IMAGE"), Some(&"yes".to_string())); + assert_eq!(env.get("OPENSHELL_SANDBOX"), Some(&"1".to_string())); + assert!( + !env.contains_key("LEAKED_FROM_PARENT"), + "pre-existing env must be cleared before baseline" + ); + assert!( + !env.contains_key("OPENSHELL_CONTROL_SECRET"), + "control-plane env must not leak" + ); + } + + #[tokio::test] + async fn container_env_cannot_override_openshell_sandbox_marker() { + let _lock = ENV_LOCK.lock().unwrap(); + let mut guard = EnvGuard::new(); + guard.set(CONTAINER_ENV_COUNT, "1"); + guard.set( + &format!("{CONTAINER_ENV_PREFIX}0"), + "OPENSHELL_SANDBOX=hijacked", + ); + + let mut cmd = Command::new("/usr/bin/true"); + apply_clean_container_baseline(&mut cmd); + + let env = command_env_snapshot(&cmd); + assert_eq!(env.get("OPENSHELL_SANDBOX"), Some(&"1".to_string())); + } + + #[test] + fn parse_kv_splits_on_first_equals() { + assert_eq!( + parse_kv("A=hello=world"), + Some(("A".to_string(), "hello=world".to_string())) + ); + assert_eq!(parse_kv("A="), Some(("A".to_string(), 
String::new()))); + assert!(parse_kv("no-equals").is_none()); + assert!(parse_kv("=value").is_none()); + } +} diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 1fbbe90d4..d6493d632 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -7,6 +7,7 @@ pub mod bypass_monitor; mod child_env; +mod container_env; pub mod denial_aggregator; mod grpc_client; mod identity; diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index 85a57b4e7..1e98a4055 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -4,6 +4,7 @@ //! Process management and signal handling. use crate::child_env; +use crate::container_env; use crate::policy::{NetworkMode, SandboxPolicy}; use crate::sandbox; #[cfg(target_os = "linux")] @@ -158,10 +159,14 @@ impl ProcessHandle { .stdin(Stdio::inherit()) .stdout(Stdio::inherit()) .stderr(Stdio::inherit()) - .kill_on_drop(true) - .env("OPENSHELL_SANDBOX", "1"); + .kill_on_drop(true); - scrub_sensitive_env(&mut cmd); + if container_env::is_container_mode() { + container_env::apply_clean_container_baseline(&mut cmd); + } else { + cmd.env("OPENSHELL_SANDBOX", "1"); + scrub_sensitive_env(&mut cmd); + } inject_provider_env(&mut cmd, provider_env); if let Some(dir) = workdir { @@ -285,10 +290,14 @@ impl ProcessHandle { .stdin(Stdio::inherit()) .stdout(Stdio::inherit()) .stderr(Stdio::inherit()) - .kill_on_drop(true) - .env("OPENSHELL_SANDBOX", "1"); + .kill_on_drop(true); - scrub_sensitive_env(&mut cmd); + if container_env::is_container_mode() { + container_env::apply_clean_container_baseline(&mut cmd); + } else { + cmd.env("OPENSHELL_SANDBOX", "1"); + scrub_sensitive_env(&mut cmd); + } inject_provider_env(&mut cmd, provider_env); if let Some(dir) = workdir { diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index ba9425036..9970eea6d 100644 --- 
a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -161,6 +161,13 @@ struct Args { #[arg(long, env = "OPENSHELL_VM_TLS_KEY")] vm_tls_key: Option, + /// Path to the `mksquashfs` binary used by the VM driver's OCI pipeline. + /// Required for OCI-image sandboxes on the VM driver. When unset, the + /// gateway does not pass `--mksquashfs-bin` and the driver falls back to + /// the `OPENSHELL_VM_MKSQUASHFS` env var inherited from this process. + #[arg(long, env = "OPENSHELL_VM_MKSQUASHFS")] + vm_mksquashfs_bin: Option, + /// Disable TLS entirely — listen on plaintext HTTP. /// Use this when the gateway sits behind a reverse proxy or tunnel /// (e.g. Cloudflare Tunnel) that terminates TLS at the edge. @@ -269,6 +276,7 @@ async fn run_from_args(args: Args) -> Result<()> { guest_tls_ca: args.vm_tls_ca, guest_tls_cert: args.vm_tls_cert, guest_tls_key: args.vm_tls_key, + mksquashfs_bin: args.vm_mksquashfs_bin, }; if args.disable_tls { diff --git a/crates/openshell-server/src/compute/vm.rs b/crates/openshell-server/src/compute/vm.rs index d0f397b01..24d624ad6 100644 --- a/crates/openshell-server/src/compute/vm.rs +++ b/crates/openshell-server/src/compute/vm.rs @@ -78,6 +78,12 @@ pub struct VmComputeConfig { /// Host-side private key for the guest's mTLS client bundle. pub guest_tls_key: Option, + + /// Optional path to the `mksquashfs` binary used by the VM driver's OCI + /// pipeline to build read-only base images. When `None`, the gateway does + /// not pass `--mksquashfs-bin`; the driver falls back to the + /// `OPENSHELL_VM_MKSQUASHFS` env var inherited from the gateway process. 
+ pub mksquashfs_bin: Option, } impl VmComputeConfig { @@ -117,6 +123,7 @@ impl Default for VmComputeConfig { guest_tls_ca: None, guest_tls_cert: None, guest_tls_key: None, + mksquashfs_bin: None, } } } @@ -209,6 +216,73 @@ pub(crate) fn compute_driver_guest_tls_paths( Ok(Some(VmGuestTlsPaths { ca, cert, key })) } +/// Build the argv the gateway passes to the `openshell-driver-vm` subprocess. +/// +/// Factored out of [`spawn`] so it can be unit-tested without actually +/// launching the driver. `socket_path` is the UDS the driver will listen on; +/// `guest_tls_paths` is the resolved output of [`compute_driver_guest_tls_paths`]. +/// +/// The returned vector excludes `argv[0]` — callers append it to a `Command` +/// that was already constructed with the driver binary path. +#[cfg(unix)] +pub(crate) fn build_driver_argv( + config: &Config, + vm_config: &VmComputeConfig, + socket_path: &std::path::Path, + guest_tls_paths: Option<&VmGuestTlsPaths>, +) -> Vec { + use std::ffi::OsString; + fn push_pair(argv: &mut Vec, flag: &str, value: &str) { + argv.push(OsString::from(flag)); + argv.push(OsString::from(value)); + } + + let mut argv: Vec = Vec::new(); + argv.push(OsString::from("--bind-socket")); + argv.push(socket_path.as_os_str().to_os_string()); + push_pair(&mut argv, "--log-level", &config.log_level); + push_pair(&mut argv, "--openshell-endpoint", &config.grpc_endpoint); + argv.push(OsString::from("--state-dir")); + argv.push(vm_config.state_dir.as_os_str().to_os_string()); + push_pair( + &mut argv, + "--ssh-handshake-secret", + &config.ssh_handshake_secret, + ); + push_pair( + &mut argv, + "--ssh-handshake-skew-secs", + &config.ssh_handshake_skew_secs.to_string(), + ); + push_pair( + &mut argv, + "--krun-log-level", + &vm_config.krun_log_level.to_string(), + ); + push_pair(&mut argv, "--vcpus", &vm_config.vcpus.to_string()); + push_pair(&mut argv, "--mem-mib", &vm_config.mem_mib.to_string()); + if let Some(tls) = guest_tls_paths { + 
argv.push(OsString::from("--guest-tls-ca")); + argv.push(tls.ca.as_os_str().to_os_string()); + argv.push(OsString::from("--guest-tls-cert")); + argv.push(tls.cert.as_os_str().to_os_string()); + argv.push(OsString::from("--guest-tls-key")); + argv.push(tls.key.as_os_str().to_os_string()); + } + // Plumb the gateway-configured sandbox image through to the VM driver so + // `GetCapabilities.default_image` matches the gateway's configuration. + // Empty string is a valid value meaning "no default"; the flag always has + // a default of "" on the driver side so we pass it unconditionally. + push_pair(&mut argv, "--default-image", &config.sandbox_image); + // Pass an explicit mksquashfs path when the operator configured one so + // OCI sandboxes work without relying on env inheritance. + if let Some(mksquashfs) = vm_config.mksquashfs_bin.as_ref() { + argv.push(OsString::from("--mksquashfs-bin")); + argv.push(mksquashfs.as_os_str().to_os_string()); + } + argv +} + /// Launch the VM compute-driver subprocess, wait for its UDS to come up, /// and return a gRPC `Channel` connected to it plus a process handle that /// kills the subprocess and removes the socket on drop. 
@@ -250,27 +324,8 @@ pub(crate) async fn spawn( command.stdin(Stdio::null()); command.stdout(Stdio::inherit()); command.stderr(Stdio::inherit()); - command.arg("--bind-socket").arg(&socket_path); - command.arg("--log-level").arg(&config.log_level); - command - .arg("--openshell-endpoint") - .arg(&config.grpc_endpoint); - command.arg("--state-dir").arg(&vm_config.state_dir); - command - .arg("--ssh-handshake-secret") - .arg(&config.ssh_handshake_secret); - command - .arg("--ssh-handshake-skew-secs") - .arg(config.ssh_handshake_skew_secs.to_string()); - command - .arg("--krun-log-level") - .arg(vm_config.krun_log_level.to_string()); - command.arg("--vcpus").arg(vm_config.vcpus.to_string()); - command.arg("--mem-mib").arg(vm_config.mem_mib.to_string()); - if let Some(tls) = guest_tls_paths { - command.arg("--guest-tls-ca").arg(tls.ca); - command.arg("--guest-tls-cert").arg(tls.cert); - command.arg("--guest-tls-key").arg(tls.key); + for arg in build_driver_argv(config, vm_config, &socket_path, guest_tls_paths.as_ref()) { + command.arg(arg); } let mut child = command.spawn().map_err(|e| { @@ -353,10 +408,23 @@ async fn connect_compute_driver(socket_path: &std::path::Path) -> Result bool { + argv.windows(2) + .any(|pair| pair[0] == OsString::from(flag) && pair[1] == OsString::from(value)) + } + + fn argv_contains_flag(argv: &[OsString], flag: &str) -> bool { + argv.iter().any(|arg| arg == &OsString::from(flag)) + } + #[test] fn vm_compute_driver_tls_requires_explicit_guest_bundle() { let dir = tempdir().unwrap(); @@ -426,4 +494,106 @@ mod tests { assert_ne!(guest_paths.cert, server_cert); assert_ne!(guest_paths.key, server_key); } + + #[test] + fn build_driver_argv_passes_configured_sandbox_image_as_default_image() { + let config = Config::new(None) + .with_grpc_endpoint("http://127.0.0.1:8080") + .with_sandbox_image("docker.io/library/alpine:3.20"); + let vm_config = VmComputeConfig::default(); + let socket = PathBuf::from("/tmp/drv.sock"); + + let argv = 
build_driver_argv(&config, &vm_config, &socket, None); + + assert!( + argv_contains_pair(&argv, "--default-image", "docker.io/library/alpine:3.20"), + "expected --default-image to be plumbed from sandbox_image: {argv:?}" + ); + } + + #[test] + fn build_driver_argv_passes_empty_default_image_when_gateway_has_no_sandbox_image() { + // sandbox_image defaults to "" — the driver treats that as "no default" + // and falls back to the legacy non-OCI supervisor boot. We still want + // the flag present so the driver's value cannot diverge from the + // gateway's intent silently. + let config = Config::new(None).with_grpc_endpoint("http://127.0.0.1:8080"); + let vm_config = VmComputeConfig::default(); + let socket = PathBuf::from("/tmp/drv.sock"); + + let argv = build_driver_argv(&config, &vm_config, &socket, None); + + assert!( + argv_contains_pair(&argv, "--default-image", ""), + "expected --default-image '' to be passed explicitly: {argv:?}" + ); + } + + #[test] + fn build_driver_argv_passes_mksquashfs_bin_when_configured() { + let config = Config::new(None).with_grpc_endpoint("http://127.0.0.1:8080"); + let vm_config = VmComputeConfig { + mksquashfs_bin: Some(PathBuf::from("/usr/local/bin/mksquashfs")), + ..Default::default() + }; + let socket = PathBuf::from("/tmp/drv.sock"); + + let argv = build_driver_argv(&config, &vm_config, &socket, None); + + assert!( + argv_contains_pair(&argv, "--mksquashfs-bin", "/usr/local/bin/mksquashfs"), + "expected --mksquashfs-bin flag: {argv:?}" + ); + } + + #[test] + fn build_driver_argv_omits_mksquashfs_bin_when_unconfigured() { + let config = Config::new(None).with_grpc_endpoint("http://127.0.0.1:8080"); + let vm_config = VmComputeConfig::default(); + let socket = PathBuf::from("/tmp/drv.sock"); + + let argv = build_driver_argv(&config, &vm_config, &socket, None); + + assert!( + !argv_contains_flag(&argv, "--mksquashfs-bin"), + "--mksquashfs-bin should be absent when vm_config.mksquashfs_bin is None: {argv:?}" + ); + } + + #[test] + 
fn build_driver_argv_passes_guest_tls_triplet_when_present() { + let config = Config::new(None).with_grpc_endpoint("https://gateway.internal:8443"); + let vm_config = VmComputeConfig::default(); + let tls = VmGuestTlsPaths { + ca: PathBuf::from("/tls/ca.crt"), + cert: PathBuf::from("/tls/tls.crt"), + key: PathBuf::from("/tls/tls.key"), + }; + let socket = PathBuf::from("/tmp/drv.sock"); + + let argv = build_driver_argv(&config, &vm_config, &socket, Some(&tls)); + + assert!(argv_contains_pair(&argv, "--guest-tls-ca", "/tls/ca.crt")); + assert!(argv_contains_pair( + &argv, + "--guest-tls-cert", + "/tls/tls.crt" + )); + assert!(argv_contains_pair(&argv, "--guest-tls-key", "/tls/tls.key")); + } + + #[test] + fn build_driver_argv_socket_flag_points_at_provided_path() { + let config = Config::new(None).with_grpc_endpoint("http://127.0.0.1:8080"); + let vm_config = VmComputeConfig::default(); + let socket = Path::new("/var/run/openshell/driver.sock"); + + let argv = build_driver_argv(&config, &vm_config, socket, None); + + assert!(argv_contains_pair( + &argv, + "--bind-socket", + "/var/run/openshell/driver.sock" + )); + } } From afdf0574e8324b19026498315b98b93225e34a6c Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sun, 19 Apr 2026 21:51:53 -0700 Subject: [PATCH 2/2] docs(vm): convert vm-driver architecture diagrams to mermaid Replace the ASCII-art overview with a mermaid flowchart that renders in GitHub's UI, and add three supporting diagrams: - Host pipeline flow: cache hit vs miss (pull → flatten → compat → squashfs → install → attach). - Guest init decision tree: probe `OPENSHELL_OCI_ARGC`, resolve disks by serial, build overlay, pivot_root, exec supervisor. - Storage layering: shared RO base, per-sandbox ext4 upper/work, and workspace bind-mount composing the sandbox runtime view. The numbered `oci_launch_supervisor` step list is retained alongside the flowchart because the precise ordering (e.g. 
bind-mount /proc /sys /dev before pivot_root) matters for anyone editing the init script. --- architecture/vm-driver.md | 173 +++++++++++++++++++++++++++----------- 1 file changed, 124 insertions(+), 49 deletions(-) diff --git a/architecture/vm-driver.md b/architecture/vm-driver.md index ea5baaf1b..be3a6c6ec 100644 --- a/architecture/vm-driver.md +++ b/architecture/vm-driver.md @@ -19,38 +19,47 @@ supervisor `pivot_root`s into it before launching the image entrypoint. ## OCI container execution model -``` -┌───────────────────────────── Host ──────────────────────────────┐ -│ │ -│ openshell-driver-vm │ -│ └─ OCI manager │ -│ ├─ oci-client: pull manifest, config, layers │ -│ ├─ flatten layers (apply whiteouts) │ -│ ├─ inject sandbox user, /sandbox, /tmp, placeholder etc. │ -│ ├─ build squashfs via mksquashfs (zstd) │ -│ └─ cache under /oci-cache/ │ -│ blobs/, fs/..squashfs, meta/*.json │ -│ │ -│ Per-sandbox state dir │ -│ ├─ sandbox-state.raw (sparse ext4 upper + workdir) │ -│ └─ rootfs-console.log │ -│ │ -│ ▼ krun_add_disk3 × 2 + set_exec env│ -├─────────────────────────── Guest VM ────────────────────────────┤ -│ │ -│ /dev/vda = RO base squashfs ──mount ro──▶ /base │ -│ /dev/vdb = sandbox-state.raw ──mkfs.ext4─▶ /state │ -│ │ -│ overlay (lowerdir=/base, upperdir=/state/upper, │ -│ workdir=/state/work) ──▶ /state/merged │ -│ /state/workspace ──bind──▶ /state/merged/sandbox │ -│ │ -│ pivot_root /state/merged ──▶ supervisor sees overlay as `/` │ -│ │ -│ openshell-sandbox --workdir -- │ -│ └─ policy, Landlock, seccomp, SSH, OCSF logging as usual │ -│ │ -└─────────────────────────────────────────────────────────────────┘ +```mermaid +flowchart TB + subgraph host["Host"] + driver["openshell-driver-vm"] + subgraph oci["OCI manager"] + pull["oci-client: pull manifest, config, layers"] + flatten["flatten layers (apply whiteouts)"] + inject["inject sandbox user, /sandbox, /tmp, /etc stubs"] + build["build squashfs via mksquashfs (zstd)"] + pull --> flatten --> inject --> 
build + end + cache[("<state>/oci-cache/<br/>blobs/, fs/<digest>.<arch>.squashfs,<br/>meta/*.json")] + statedir[("Per-sandbox state dir<br/>sandbox-state.raw (ext4 upper + workdir)<br/>rootfs-console.log")] + driver --> oci --> cache + driver --> statedir + end + + driver -- "krun_add_disk3 × 2 + set_exec env" --> guest + + subgraph guest["Guest VM"] + direction TB + vda["/dev/vda = RO base squashfs"] + vdb["/dev/vdb = sandbox-state.raw"] + base["/base (ro)"] + st["/state (ext4)"] + vda -- "mount ro" --> base + vdb -- "mkfs.ext4 + mount" --> st + + overlay["/state/merged<br/>overlay(lower=/base, upper=/state/upper,<br/>work=/state/work)"] + workspace["/state/workspace<br/>bind-mounted over /sandbox"] + base --> overlay + st --> overlay + st --> workspace + workspace --> overlay + + pivot["pivot_root /state/merged<br/>supervisor sees overlay as /"] + overlay --> pivot + + supervisor["openshell-sandbox --workdir <OCI workdir> -- <OCI argv><br/>policy, Landlock, seccomp, SSH, OCSF logging"] + pivot --> supervisor + end ``` ### Host pipeline @@ -73,36 +82,79 @@ Cache is keyed by `(manifest digest, platform)`. Repeated launches of the same image skip pull and rebuild entirely — the driver just attaches the cached squashfs to the VM. +```mermaid +flowchart LR + req["CreateSandbox<br/>template.image=<ref>"] --> resolve[effective_image_ref] + resolve --> pull["oci-client pull<br/>manifest digest"] + pull --> lookup{"cache.lookup(digest, platform)"} + lookup -- hit --> attach[attach cached squashfs<br/>+ per-sandbox state disk] + lookup -- miss --> layers[fetch layers] + layers --> flat[flatten + whiteout] + flat --> compat[compat inject] + compat --> mksquash[mksquashfs zstd] + mksquash --> install[atomic install_fs_image<br/>write metadata] + install --> attach + attach --> launch[launch microVM] +``` + ### Guest init and pivot `crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh` is the guest's PID 1. OCI mode is gated on `OPENSHELL_OCI_ARGC` being set in -the guest environ (delivered via libkrun `set_exec`). When set, -`oci_launch_supervisor`: - -1. Mounts the RO base device (default `/dev/vda`, overridable via - `OPENSHELL_VM_OCI_BASE_DEVICE`) at `/base`. -2. Formats the state device (`/dev/vdb`) with ext4 on first boot, - mounts at `/state`. -3. Creates `/state/upper`, `/state/work`, `/state/merged`, and +the guest environ (delivered via libkrun `set_exec`). + +```mermaid +flowchart TD + boot([PID 1: init boots]) --> mountfs[Mount /proc, /sys, /dev, /tmp, /run] + mountfs --> net[Bring up eth0 + DHCP] + net --> gate{OPENSHELL_OCI_ARGC set?} + gate -- No --> legacy[exec openshell-sandbox --workdir /sandbox<br/>legacy guest-rootfs boot] + gate -- Yes --> resolve[Resolve base + state disks<br/>by /sys/block/vd*/serial] + resolve --> mntbase[mount -o ro base → /base] + mntbase --> fmt{state disk<br/>formatted?} + fmt -- No --> mkfs[mkfs.ext4 state disk] + fmt -- Yes --> mntstate + mkfs --> mntstate[mount state → /state] + mntstate --> mkdirs[mkdir /state/upper /state/work<br/>/state/merged /state/workspace] + mkdirs --> overlay[mount -t overlay overlay<br/>lowerdir=/base,upperdir=/state/upper,<br/>workdir=/state/work /state/merged] + overlay --> bindws[bind-mount /state/workspace<br/>over /state/merged/sandbox] + bindws --> resolv[Synthesize /etc/resolv.conf<br/>if image lacks one] + resolv --> tls[Copy $OPENSHELL_TLS_CA →<br/>/state/merged/opt/openshell/tls/ca.crt] + tls --> copysup[Copy supervisor binary into<br/>/state/merged/opt/openshell/bin/] + copysup --> bindps[bind-mount /proc /sys /dev<br/>into /state/merged] + bindps --> pivot[pivot_root /state/merged<br/>umount -l /.old_root] + pivot --> translate[Translate OPENSHELL_OCI_ENV_* →<br/>OPENSHELL_CONTAINER_ENV_*<br/>set OPENSHELL_CONTAINER_MODE=1] + translate --> execsup[exec openshell-sandbox<br/>--workdir $OCI_WORKDIR -- $OCI_ARGV] +``` + +`oci_launch_supervisor` steps: + +1. Resolves the RO base device (`block_id=oci-base`) and state device + (`block_id=sandbox-state`) by walking `/sys/block/vd*/serial`. Falls + back to `/dev/vda` / `/dev/vdb` when serial lookup is unavailable; + `OPENSHELL_VM_OCI_BASE_DEVICE` / `OPENSHELL_VM_STATE_DEVICE` short- + circuit the lookup for tests and operator debugging. +2. Mounts the RO base at `/base`. +3. Formats the state device with ext4 on first boot, mounts at `/state`. +4. Creates `/state/upper`, `/state/work`, `/state/merged`, and `/state/workspace`. -4. Mounts overlay +5. Mounts overlay `lowerdir=/base,upperdir=/state/upper,workdir=/state/work` at `/state/merged`. -5. Bind-mounts `/state/workspace` over the image's `/sandbox` so the +6. Bind-mounts `/state/workspace` over the image's `/sandbox` so the workdir is writable on the state disk. -6. Synthesizes `/etc/resolv.conf` if the image didn't ship one. -7. Copies the gateway-issued TLS CA (if `$OPENSHELL_TLS_CA` is set) +7. Synthesizes `/etc/resolv.conf` if the image didn't ship one. +8. Copies the gateway-issued TLS CA (if `$OPENSHELL_TLS_CA` is set) into `/opt/openshell/tls/ca.crt` inside the overlay so post-pivot SSL trust paths stay valid. -8. Copies the supervisor binary into the upper layer (reaches the state +9. Copies the supervisor binary into the upper layer (reaches the state disk, not the RO base). -9. Bind-mounts `/proc`, `/sys`, `/dev` into the overlay. -10. Bind-mounts `/state/merged` onto itself, `pivot_root`s into it, and +10. Bind-mounts `/proc`, `/sys`, `/dev` into the overlay. +11. Bind-mounts `/state/merged` onto itself, `pivot_root`s into it, and lazy-unmounts the old root. -11. Translates `OPENSHELL_OCI_ENV_<N>` → `OPENSHELL_CONTAINER_ENV_<N>`, +12. Translates `OPENSHELL_OCI_ENV_<N>` → `OPENSHELL_CONTAINER_ENV_<N>`, sets `OPENSHELL_CONTAINER_MODE=1`, and unsets the OCI source vars. -12. Reconstructs argv from `OPENSHELL_OCI_ARGV_<N>` and execs +13. 
Reconstructs argv from `OPENSHELL_OCI_ARGV_<N>` and execs `openshell-sandbox --workdir "$OCI_WORKDIR" -- <OCI argv>`. ### Supervisor clean-env mode @@ -129,6 +181,29 @@ unset, the supervisor keeps its historical env-inheritance behavior. The overlay design replaces an earlier "unpack fresh tar per sandbox" model that's still described in the initial plan: +```mermaid +flowchart TB + subgraph shared["Shared (host, per-image)"] + base[("<state>/oci-cache/fs/<br/><digest>.<plat>.squashfs<br/>(read-only, never GC'd per sandbox)")] + end + subgraph persandbox["Per-sandbox state dir"] + raw[("sandbox-state.raw<br/>sparse 16 GiB ext4")] + upper["/state/upper<br/>overlay upper"] + work["/state/work<br/>overlay workdir"] + ws["/state/workspace<br/>bind-mounted over /sandbox"] + raw --> upper + raw --> work + raw --> ws + end + subgraph view["Sandbox runtime view"] + merged["/ (post pivot_root)"] + end + base -- lowerdir --> merged + upper -- upperdir --> merged + work -- workdir --> merged + ws -- bind /sandbox --> merged +``` + - **Base**: one squashfs per `(manifest digest, platform)`, shared across every sandbox that uses the image. Never deleted by the per-sandbox delete path.