diff --git a/Cargo.lock b/Cargo.lock index 4b29a0c7f..b67ffe5eb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -402,6 +402,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd307490d624467aa6f74b0eabb77633d1f758a7b25f12bceb0b22e08d9726f6" +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.21.7" @@ -808,6 +814,27 @@ version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c" +[[package]] +name = "const_format" +version = "0.2.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4481a617ad9a412be3b97c5d403fef8ed023103368908b9c50af598ff467cc1e" +dependencies = [ + "const_format_proc_macros", + "konst", +] + +[[package]] +name = "const_format_proc_macros" +version = "0.2.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d57c2eccfb16dbac1f4e61e206105db5820c9d26c3c472bc17c774259ef7744" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + [[package]] name = "constant_time_eq" version = "0.4.2" @@ -1166,6 +1193,37 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn 2.0.117", +] + [[package]] name = "dialoguer" version = "0.11.0" @@ -1633,6 +1691,18 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "getset" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cf0fc11e47561d47397154977bc219f4cf809b2974facc3ccb3b89e2436f912" +dependencies = [ + "proc-macro-error2", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "ghash" version = "0.5.1" @@ -1837,6 +1907,15 @@ dependencies = [ "itoa", ] +[[package]] +name = "http-auth" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "150fa4a9462ef926824cf4519c84ed652ca8f4fbae34cb8af045b5cbcaf98822" +dependencies = [ + "memchr", +] + [[package]] name = "http-body" version = "1.0.1" @@ -2349,6 +2428,21 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "jwt" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6204285f77fe7d9784db3fdc449ecce1a0114927a51d5a41c4c7a292011c015f" +dependencies = [ + "base64 0.13.1", + "crypto-common 0.1.7", + "digest 0.10.7", + "hmac", + "serde", + "serde_json", + "sha2 0.10.9", +] + [[package]] name = "k8s-openapi" version = "0.21.1" @@ -2362,6 +2456,21 @@ dependencies = [ "serde_json", ] +[[package]] +name = "konst" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "128133ed7824fcd73d6e7b17957c5eb7bacb885649bd8c69708b2331a10bcefb" +dependencies = [ + "konst_macro_rules", +] + +[[package]] +name = "konst_macro_rules" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4933f3f57a8e9d9da04db23fb153356ecaf00cbd14aee46279c33dc80925c37" + [[package]] name = "kube" version = "0.90.0" @@ -2966,6 +3075,60 @@ 
dependencies = [ "memchr", ] +[[package]] +name = "oci-client" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b74df13319e08bc386d333d3dc289c774c88cc543cae31f5347db07b5ec2172" +dependencies = [ + "bytes", + "chrono", + "futures-util", + "http", + "http-auth", + "jwt", + "lazy_static", + "oci-spec", + "olpc-cjson", + "regex", + "reqwest", + "serde", + "serde_json", + "sha2 0.10.9", + "thiserror 2.0.18", + "tokio", + "tracing", + "unicase", +] + +[[package]] +name = "oci-spec" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc3da52b83ce3258fbf29f66ac784b279453c2ac3c22c5805371b921ede0d308" +dependencies = [ + "const_format", + "derive_builder", + "getset", + "regex", + "serde", + "serde_json", + "strum 0.27.2", + "strum_macros 0.27.2", + "thiserror 2.0.18", +] + +[[package]] +name = "olpc-cjson" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "696183c9b5fe81a7715d074fd632e8bd46f4ccc0231a3ed7fc580a80de5f7083" +dependencies = [ + "serde", + "serde_json", + "unicode-normalization", +] + [[package]] name = "once_cell" version = "1.21.4" @@ -3095,14 +3258,21 @@ name = "openshell-driver-vm" version = "0.0.0" dependencies = [ "clap", + "flate2", "futures", "libc", "libloading", "miette", "nix", + "oci-client", "openshell-core", "prost-types", + "serde", + "serde_json", + "sha2 0.10.9", "tar", + "tempfile", + "thiserror 2.0.18", "tokio", "tokio-stream", "tonic", @@ -3996,7 +4166,7 @@ dependencies = [ "lru", "paste", "stability", - "strum", + "strum 0.26.3", "unicode-segmentation", "unicode-truncate", "unicode-width 0.1.14", @@ -4112,12 +4282,14 @@ dependencies = [ "sync_wrapper", "tokio", "tokio-rustls", + "tokio-util", "tower 0.5.3", "tower-http 0.6.8", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", + "wasm-streams", "web-sys", "webpki-roots 1.0.6", ] @@ -5102,9 +5274,15 @@ version = "0.26.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" dependencies = [ - "strum_macros", + "strum_macros 0.26.4", ] +[[package]] +name = "strum" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" + [[package]] name = "strum_macros" version = "0.26.4" @@ -5118,6 +5296,18 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "strum_macros" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "subtle" version = "2.6.1" @@ -5764,6 +5954,12 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" +[[package]] +name = "unicase" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" + [[package]] name = "unicode-bidi" version = "0.3.18" @@ -6035,6 +6231,19 @@ dependencies = [ "wasmparser", ] +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wasmparser" version = "0.244.0" diff --git a/architecture/vm-driver.md b/architecture/vm-driver.md new file mode 100644 index 000000000..be3a6c6ec --- /dev/null +++ b/architecture/vm-driver.md @@ -0,0 +1,271 @@ +# VM Compute Driver + +> Status: Experimental. The VM compute driver is a second-generation +> compute backend for OpenShell sandboxes. 
Kubernetes remains the default. + +## Overview + +`openshell-driver-vm` is an in-process compute driver that runs each +sandbox as a libkrun microVM on the host. Unlike the Kubernetes driver, +it has no orchestrator dependency — the driver is a single binary that +exposes the `ComputeDriver` gRPC service and manages VMs directly. + +A sandbox spec can optionally include `template.image`, an OCI image +reference. When set, the driver treats the image as the **sandbox +payload** (the user's container filesystem), not the guest OS. The fixed +libkrun guest rootfs still boots the control plane (init script, +supervisor, SSH); the OCI image is mounted as an overlay and the +supervisor `pivot_root`s into it before launching the image entrypoint. + +## OCI container execution model + +```mermaid +flowchart TB + subgraph host["Host"] + driver["openshell-driver-vm"] + subgraph oci["OCI manager"] + pull["oci-client: pull manifest, config, layers"] + flatten["flatten layers (apply whiteouts)"] + inject["inject sandbox user, /sandbox, /tmp, /etc stubs"] + build["build squashfs via mksquashfs (zstd)"] + pull --> flatten --> inject --> build + end + cache[("<state>/oci-cache/
blobs/, fs/<digest>.<arch>.squashfs,
meta/*.json")] + statedir[("Per-sandbox state dir
sandbox-state.raw (ext4 upper + workdir)
rootfs-console.log")] + driver --> oci --> cache + driver --> statedir + end + + driver -- "krun_add_disk3 × 2 + set_exec env" --> guest + + subgraph guest["Guest VM"] + direction TB + vda["/dev/vda = RO base squashfs"] + vdb["/dev/vdb = sandbox-state.raw"] + base["/base (ro)"] + st["/state (ext4)"] + vda -- "mount ro" --> base + vdb -- "mkfs.ext4 + mount" --> st + + overlay["/state/merged
overlay(lower=/base, upper=/state/upper,
work=/state/work)"] + workspace["/state/workspace
bind-mounted over /sandbox"] + base --> overlay + st --> overlay + st --> workspace + workspace --> overlay + + pivot["pivot_root /state/merged
supervisor sees overlay as /"] + overlay --> pivot + + supervisor["openshell-sandbox --workdir <OCI workdir> -- <OCI argv>
policy, Landlock, seccomp, SSH, OCSF logging"] + pivot --> supervisor + end +``` + +### Host pipeline + +`crates/openshell-driver-vm/src/oci/` owns the host pipeline. The +top-level entrypoint is `oci::prepare(puller, cache, build_opts, +image_ref, env_overrides)`: + +| Module | Responsibility | +|---|---| +| `client.rs` | Anonymous pull via `oci-client` with a platform resolver pinned to `linux/amd64` or `linux/arm64`. Normalizes the OCI image config into `ImageConfig`. | +| `flatten.rs` | Applies OCI layer tars in order with whiteout handling (`.wh.*`, `.wh..wh..opq`). Rejects absolute/parent-traversal paths. Dispatches on media type (`tar`, `tar+gzip`). | +| `compat.rs` | Injects `sandbox:10001:10001` into `/etc/passwd` + `/etc/group`, ensures `/sandbox` (0755) and `/tmp` (1777) exist, writes placeholder `/etc/hosts` and `/etc/resolv.conf`. Idempotent. Picks best shell (`/bin/sh` → `/sbin/nologin` → `/bin/false`). | +| `fs_image.rs` | Shells out to `mksquashfs` with explicit binary path (no `$PATH` reliance), zstd by default. | +| `cache.rs` | Content-addressed layout `blobs/ + fs/<digest>.<platform>.squashfs + meta/<digest>.<platform>.json + tmp/`. Atomic writes; idempotent `lookup()` + `install_fs_image()`. | +| `metadata.rs` | `LaunchMetadata::build` — argv = `Entrypoint + Cmd` (precedence), workdir fallback `/sandbox`, env merge `OCI < template < spec`. `to_guest_env_vars()` packs into `OPENSHELL_OCI_ARGC/ARGV_<i>/ENV_COUNT/ENV_<i>/WORKDIR`. | +| `pipeline.rs` | End-to-end orchestrator. On cache hit, zero network I/O. On miss: pull → flatten → inject → build → install. | + +Cache is keyed by `(manifest digest, platform)`. Repeated launches of +the same image skip pull and rebuild entirely — the driver just attaches +the cached squashfs to the VM. + +```mermaid +flowchart LR + req["CreateSandbox
template.image=<ref>"] --> resolve[effective_image_ref] + resolve --> pull["oci-client pull
manifest digest"] + pull --> lookup{"cache.lookup(digest, platform)"} + lookup -- hit --> attach[attach cached squashfs
+ per-sandbox state disk] + lookup -- miss --> layers[fetch layers] + layers --> flat[flatten + whiteout] + flat --> compat[compat inject] + compat --> mksquash[mksquashfs zstd] + mksquash --> install[atomic install_fs_image
write metadata] + install --> attach + attach --> launch[launch microVM] +``` + +### Guest init and pivot + +`crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh` is the +guest's PID 1. OCI mode is gated on `OPENSHELL_OCI_ARGC` being set in +the guest environ (delivered via libkrun `set_exec`). + +```mermaid +flowchart TD + boot([PID 1: init boots]) --> mountfs[Mount /proc, /sys, /dev, /tmp, /run] + mountfs --> net[Bring up eth0 + DHCP] + net --> gate{OPENSHELL_OCI_ARGC set?} + gate -- No --> legacy[exec openshell-sandbox --workdir /sandbox
legacy guest-rootfs boot] + gate -- Yes --> resolve[Resolve base + state disks
by /sys/block/vd*/serial] + resolve --> mntbase[mount -o ro base → /base] + mntbase --> fmt{state disk
formatted?} + fmt -- No --> mkfs[mkfs.ext4 state disk] + fmt -- Yes --> mntstate + mkfs --> mntstate[mount state → /state] + mntstate --> mkdirs[mkdir /state/upper /state/work
/state/merged /state/workspace] + mkdirs --> overlay[mount -t overlay overlay
lowerdir=/base,upperdir=/state/upper,
workdir=/state/work /state/merged] + overlay --> bindws[bind-mount /state/workspace
over /state/merged/sandbox] + bindws --> resolv[Synthesize /etc/resolv.conf
if image lacks one] + resolv --> tls[Copy $OPENSHELL_TLS_CA →
/state/merged/opt/openshell/tls/ca.crt] + tls --> copysup[Copy supervisor binary into
/state/merged/opt/openshell/bin/] + copysup --> bindps[bind-mount /proc /sys /dev
into /state/merged] + bindps --> pivot[pivot_root /state/merged
umount -l /.old_root] + pivot --> translate[Translate OPENSHELL_OCI_ENV_* →
OPENSHELL_CONTAINER_ENV_*
set OPENSHELL_CONTAINER_MODE=1] + translate --> execsup[exec openshell-sandbox
--workdir $OCI_WORKDIR -- $OCI_ARGV] +``` + +`oci_launch_supervisor` steps: + +1. Resolves the RO base device (`block_id=oci-base`) and state device + (`block_id=sandbox-state`) by walking `/sys/block/vd*/serial`. Falls + back to `/dev/vda` / `/dev/vdb` when serial lookup is unavailable; + `OPENSHELL_VM_OCI_BASE_DEVICE` / `OPENSHELL_VM_STATE_DEVICE` short- + circuit the lookup for tests and operator debugging. +2. Mounts the RO base at `/base`. +3. Formats the state device with ext4 on first boot, mounts at `/state`. +4. Creates `/state/upper`, `/state/work`, `/state/merged`, and + `/state/workspace`. +5. Mounts overlay + `lowerdir=/base,upperdir=/state/upper,workdir=/state/work` at + `/state/merged`. +6. Bind-mounts `/state/workspace` over the image's `/sandbox` so the + workdir is writable on the state disk. +7. Synthesizes `/etc/resolv.conf` if the image didn't ship one. +8. Copies the gateway-issued TLS CA (if `$OPENSHELL_TLS_CA` is set) + into `/opt/openshell/tls/ca.crt` inside the overlay so post-pivot + SSL trust paths stay valid. +9. Copies the supervisor binary into the upper layer (reaches the state + disk, not the RO base). +10. Bind-mounts `/proc`, `/sys`, `/dev` into the overlay. +11. Bind-mounts `/state/merged` onto itself, `pivot_root`s into it, and + lazy-unmounts the old root. +12. Translates `OPENSHELL_OCI_ENV_<i>` → `OPENSHELL_CONTAINER_ENV_<i>`, + sets `OPENSHELL_CONTAINER_MODE=1`, and unsets the OCI source vars. +13. Reconstructs argv from `OPENSHELL_OCI_ARGV_<i>` and execs + `openshell-sandbox --workdir "$OCI_WORKDIR" -- <argv>`. + +### Supervisor clean-env mode + +`crates/openshell-sandbox/src/container_env.rs` gates on +`OPENSHELL_CONTAINER_MODE=1`. 
When active, the supervisor calls +`Command::env_clear()` on the child and applies only the documented +allowlist: + +- `HOME=/sandbox`, `PATH=`, `TERM=xterm` +- Container env from `OPENSHELL_CONTAINER_ENV_<i>` (OCI + template/spec + merge) +- `OPENSHELL_SANDBOX=1` (applied last — images cannot override the + marker) +- Provider env, proxy env, TLS env from policy (layered on top by the + existing spawn path) + +Control-plane vars (`OPENSHELL_SSH_HANDSHAKE_SECRET`, driver internals, +etc.) never reach the child process. When `OPENSHELL_CONTAINER_MODE` is +unset, the supervisor keeps its historical env-inheritance behavior. + +## Storage: shared RO base + per-sandbox CoW + +The overlay design replaces an earlier "unpack fresh tar per sandbox" +model that's still described in the initial plan: + +```mermaid +flowchart TB + subgraph shared["Shared (host, per-image)"] + base[("<state>/oci-cache/fs/
<digest>.<plat>.squashfs
(read-only, never GC'd per sandbox)")] + end + subgraph persandbox["Per-sandbox state dir"] + raw[("sandbox-state.raw
sparse 16 GiB ext4")] + upper["/state/upper
overlay upper"] + work["/state/work
overlay workdir"] + ws["/state/workspace
bind-mounted over /sandbox"] + raw --> upper + raw --> work + raw --> ws + end + subgraph view["Sandbox runtime view"] + merged["/ (post pivot_root)"] + end + base -- lowerdir --> merged + upper -- upperdir --> merged + work -- workdir --> merged + ws -- bind /sandbox --> merged +``` + +- **Base**: one squashfs per `(manifest digest, platform)`, shared + across every sandbox that uses the image. Never deleted by the + per-sandbox delete path. +- **Upper + workdir**: per-sandbox ext4 on `sandbox-state.raw`. Sparse + 16 GiB default, grows on first write. Deleted with the sandbox state + dir on `DeleteSandbox`. +- **Workspace**: `/state/workspace` bind-mounted over the image's + `/sandbox`. Persists alongside the state disk. + +Cold start for a repeat launch of the same image is near-instant: a +block attach and two mounts; no registry round-trip, no layer +flattening, no squashfs build. + +GC of the RO base cache is out of scope for v1. Operators must manage +`<state>/oci-cache/fs/*` and `<state>/oci-cache/blobs/**` manually if +they need to reclaim space. + +## Driver configuration + +| Flag / env var | Meaning | +|---|---| +| `--default-image` / `OPENSHELL_VM_DRIVER_DEFAULT_IMAGE` | Image used when a sandbox spec omits `template.image`. Advertised via `GetCapabilities.default_image`. Empty string disables defaulting — sandboxes without an image fall through to the legacy (non-OCI) guest-rootfs supervisor. | +| `--mksquashfs-bin` / `OPENSHELL_VM_MKSQUASHFS` | Path to the `mksquashfs` binary. Required for OCI sandboxes. Unset → OCI requests are rejected with `FailedPrecondition`. | +| `OPENSHELL_VM_DRIVER_STATE_DIR` | Root for per-sandbox state and `oci-cache/`. | + +`GetCapabilities` now reports: + +```json +{ + "driver_name": "openshell-driver-vm", + "driver_version": "<version>", + "default_image": "<default image>", + "supports_gpu": false +} +``` + +## v1 scope and assumptions + +- Public OCI registries only. No authentication. +- Linux images only. `linux/amd64` or `linux/arm64` matching the host. 
+- One image per sandbox. No init containers or sidecars. +- The entrypoint always runs as `sandbox:sandbox` (UID/GID 10001). The + OCI `User` field is ignored in v1. +- `template.agent_socket_path`, `template.platform_config`, and + `template.resources` are still rejected by the VM driver. +- Sandbox lifetime is the entrypoint lifetime: when the OCI entrypoint + exits, the sandbox transitions to exited/error. +- GPU is unsupported. +- Squashfs is the fs-image format. erofs is a candidate for later. +- No automatic cache GC. + +## Related files + +- `crates/openshell-driver-vm/src/driver.rs` — gRPC surface + + sandbox lifecycle. +- `crates/openshell-driver-vm/src/runtime.rs` — libkrun launch, disk + + vsock wiring. +- `crates/openshell-driver-vm/src/ffi.rs` — `libkrun` symbol loader. +- `crates/openshell-driver-vm/src/state_disk.rs` — sparse state disk + create/grow + secure import socket dir. +- `crates/openshell-driver-vm/src/oci/` — OCI pipeline. +- `crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh` — + guest init + `oci_launch_supervisor`. +- `crates/openshell-sandbox/src/container_env.rs` — supervisor + clean-env baseline for container mode. 
diff --git a/crates/openshell-driver-vm/Cargo.toml b/crates/openshell-driver-vm/Cargo.toml index 368716ef9..8e90d8607 100644 --- a/crates/openshell-driver-vm/Cargo.toml +++ b/crates/openshell-driver-vm/Cargo.toml @@ -36,6 +36,16 @@ libc = "0.2" libloading = "0.8" tar = "0.4" zstd = "0.13" +serde = { workspace = true } +serde_json = { workspace = true } +thiserror = { workspace = true } +sha2 = "0.10" +flate2 = "1" +tempfile = "3" +oci-client = { version = "0.15", default-features = false, features = ["rustls-tls"] } + +[dev-dependencies] +tempfile = "3" [lints] workspace = true diff --git a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh index 70dda5acb..ed6f433b2 100644 --- a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh +++ b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh @@ -72,6 +72,170 @@ tcp_probe() { fi } +resolve_block_device_by_serial() { + # libkrun's `krun_add_disk3` exposes the caller-supplied block_id as the + # virtio-blk serial, which Linux surfaces at /sys/block//serial. + # Walk virtio-blk devices (vd*) and return the /dev path whose serial + # matches $1. This makes the guest tolerant to attach-order changes. + local target_serial="$1" + local block + for block in /sys/block/vd*; do + [ -d "$block" ] || continue + local serial_file="$block/serial" + [ -r "$serial_file" ] || continue + local serial + serial=$(cat "$serial_file" 2>/dev/null || true) + if [ "$serial" = "$target_serial" ]; then + printf '/dev/%s\n' "$(basename "$block")" + return 0 + fi + done + return 1 +} + +oci_launch_supervisor() { + # Enter OCI overlay mode: mount the shared read-only squashfs base plus a + # per-sandbox ext4 upper, overlay them, pivot_root into the merged view, + # then exec the supervisor post-pivot so container paths like /sandbox and + # /tmp are the real paths from the supervisor's POV. 
+ + # Prefer block-ID resolution so the mount points don't silently break if + # libkrun ever changes virtio-blk attach order. Env var overrides are kept + # for operator escape hatches and test harnesses. + local base_dev="${OPENSHELL_VM_OCI_BASE_DEVICE:-}" + local state_dev="${OPENSHELL_VM_STATE_DEVICE:-}" + + if [ -z "$base_dev" ]; then + base_dev=$(resolve_block_device_by_serial "oci-base" || true) + fi + if [ -z "$state_dev" ]; then + state_dev=$(resolve_block_device_by_serial "sandbox-state" || true) + fi + + # Fall back to attach-order defaults only when serial lookup returns nothing + # (older guest kernels or missing /sys/block//serial). + if [ -z "$base_dev" ]; then + ts "WARNING: could not resolve RO base by serial=oci-base; falling back to /dev/vda" + base_dev=/dev/vda + fi + if [ -z "$state_dev" ]; then + ts "WARNING: could not resolve state disk by serial=sandbox-state; falling back to /dev/vdb" + state_dev=/dev/vdb + fi + + if [ ! -b "$base_dev" ]; then + ts "ERROR: OCI base device $base_dev not found" + exit 1 + fi + if [ ! -b "$state_dev" ]; then + ts "ERROR: OCI state device $state_dev not found" + exit 1 + fi + + ts "OCI block devices resolved: base=$base_dev state=$state_dev" + + mkdir -p /base /state + if ! mount -o ro "$base_dev" /base 2>/dev/null; then + ts "ERROR: failed to mount read-only base $base_dev at /base" + exit 1 + fi + + if ! blkid "$state_dev" >/dev/null 2>&1; then + ts "formatting sandbox state disk $state_dev" + mkfs.ext4 -F -q -L openshell-sandbox-state "$state_dev" >/dev/null 2>&1 || { + ts "ERROR: mkfs.ext4 failed on $state_dev" + exit 1 + } + fi + if ! mount -o noatime "$state_dev" /state 2>/dev/null; then + ts "ERROR: failed to mount state disk $state_dev at /state" + exit 1 + fi + + mkdir -p /state/upper /state/work /state/merged /state/workspace + if ! 
mount -t overlay overlay \ + -o "lowerdir=/base,upperdir=/state/upper,workdir=/state/work" \ + /state/merged 2>/dev/null; then + ts "ERROR: failed to mount overlay at /state/merged" + exit 1 + fi + + # The image's /sandbox is RO (it lives in the base); bind the writable + # workspace over it so the container process can write to /sandbox. + mkdir -p /state/merged/sandbox + mount --bind /state/workspace /state/merged/sandbox + + # Synthesize /etc/resolv.conf inside the image if the image does not + # provide one; reuse the guest's DHCP-populated one. + if [ ! -s /state/merged/etc/resolv.conf ] && [ -s /etc/resolv.conf ]; then + mkdir -p /state/merged/etc + cp /etc/resolv.conf /state/merged/etc/resolv.conf 2>/dev/null || true + fi + + # Mirror TLS CA bundle into the merged view so SSL trust survives the pivot. + if [ -n "${OPENSHELL_TLS_CA:-}" ] && [ -f "$OPENSHELL_TLS_CA" ]; then + mkdir -p /state/merged/opt/openshell/tls + cp "$OPENSHELL_TLS_CA" /state/merged/opt/openshell/tls/ca.crt 2>/dev/null || true + fi + + # Supervisor binary must be reachable post-pivot. Copy it into the upper + # layer (writes land on the state disk, not the RO base). + mkdir -p /state/merged/opt/openshell/bin + if [ ! -x /state/merged/opt/openshell/bin/openshell-sandbox ]; then + cp /opt/openshell/bin/openshell-sandbox \ + /state/merged/opt/openshell/bin/openshell-sandbox + chmod 0755 /state/merged/opt/openshell/bin/openshell-sandbox + fi + + # Ensure the kernel pseudo-filesystems are available after pivot. + mkdir -p /state/merged/proc /state/merged/sys /state/merged/dev + mount --bind /proc /state/merged/proc 2>/dev/null || true + mount --bind /sys /state/merged/sys 2>/dev/null || true + mount --bind /dev /state/merged/dev 2>/dev/null || true + + # pivot_root requires the new root to be a mount point distinct from the + # current root, so bind-mount /state/merged onto itself. + mount --bind /state/merged /state/merged + mkdir -p /state/merged/.old_root + cd /state/merged + pivot_root . 
.old_root + cd / + umount -l /.old_root 2>/dev/null || true + rmdir /.old_root 2>/dev/null || true + + # Translate OCI metadata env into the supervisor's container-mode contract. + local env_count="${OPENSHELL_OCI_ENV_COUNT:-0}" + export OPENSHELL_CONTAINER_ENV_COUNT="$env_count" + local idx=0 + while [ "$idx" -lt "$env_count" ]; do + local src_var="OPENSHELL_OCI_ENV_$idx" + export "OPENSHELL_CONTAINER_ENV_$idx=${!src_var:-}" + unset "$src_var" + idx=$((idx + 1)) + done + export OPENSHELL_CONTAINER_MODE=1 + + local argc="${OPENSHELL_OCI_ARGC:-0}" + if [ "$argc" -lt 1 ]; then + ts "ERROR: OCI image has no runnable command (argc=0)" + exit 1 + fi + local -a argv=() + idx=0 + while [ "$idx" -lt "$argc" ]; do + local src_var="OPENSHELL_OCI_ARGV_$idx" + argv+=("${!src_var:-}") + unset "$src_var" + idx=$((idx + 1)) + done + + local workdir="${OPENSHELL_OCI_WORKDIR:-/sandbox}" + unset OPENSHELL_OCI_ARGC OPENSHELL_OCI_ENV_COUNT OPENSHELL_OCI_WORKDIR + + ts "OCI overlay ready; exec'ing supervisor (argc=$argc workdir=$workdir)" + exec /opt/openshell/bin/openshell-sandbox --workdir "$workdir" -- "${argv[@]}" +} + rewrite_openshell_endpoint_if_needed() { local endpoint="${OPENSHELL_ENDPOINT:-}" [ -n "$endpoint" ] || return 0 @@ -184,5 +348,13 @@ export USER=sandbox rewrite_openshell_endpoint_if_needed +# OCI image mode: if the driver staged an OCI payload via krun set_exec env, +# prepare the overlay rootfs, pivot_root, and exec the supervisor post-pivot. +# Otherwise fall through to the default guest rootfs supervisor boot. 
+if [ -n "${OPENSHELL_OCI_ARGC:-}" ]; then + ts "OCI image mode: OPENSHELL_OCI_ARGC=${OPENSHELL_OCI_ARGC}" + oci_launch_supervisor +fi + ts "starting openshell-sandbox supervisor" exec /opt/openshell/bin/openshell-sandbox --workdir /sandbox diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index 3d3fbf4b6..14634da2d 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -65,6 +65,13 @@ pub struct VmDriverConfig { pub guest_tls_ca: Option, pub guest_tls_cert: Option, pub guest_tls_key: Option, + /// Default OCI image used when the sandbox spec omits `template.image`. + /// Empty string means "no default" — sandboxes without an image will + /// fall back to the historical non-OCI guest rootfs supervisor. + pub default_image: String, + /// Path to the `mksquashfs` binary. When unset, OCI-image sandboxes + /// are rejected with `FailedPrecondition`. + pub mksquashfs_bin: Option, } impl Default for VmDriverConfig { @@ -82,6 +89,8 @@ impl Default for VmDriverConfig { guest_tls_ca: None, guest_tls_cert: None, guest_tls_key: None, + default_image: String::new(), + mksquashfs_bin: None, } } } @@ -173,12 +182,22 @@ struct SandboxRecord { process: Arc>, } -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct VmDriver { config: VmDriverConfig, launcher_bin: PathBuf, registry: Arc>>, events: broadcast::Sender, + /// Shared OCI cache and puller for this driver process. + /// Populated once per platform; `None` when the host arch is unsupported. + oci: Option>, +} + +/// Lazily-initialized OCI state attached to the driver. 
+pub struct VmOci { + pub puller: crate::oci::OciPuller, + pub cache: crate::oci::CacheLayout, + pub platform: crate::oci::Platform, } impl VmDriver { @@ -207,11 +226,25 @@ impl VmDriver { }; let (events, _) = broadcast::channel(WATCH_BUFFER); + + let oci = crate::oci::Platform::host().map(|platform| { + let cache = crate::oci::CacheLayout::new(config.state_dir.join("oci-cache")); + // Errors here are surfaced lazily at first sandbox-create; the + // driver still starts so non-OCI sandboxes continue to work. + let _ = cache.ensure_dirs(); + Arc::new(VmOci { + puller: crate::oci::OciPuller::new(platform), + cache, + platform, + }) + }); + Ok(Self { config, launcher_bin, registry: Arc::new(Mutex::new(HashMap::new())), events, + oci, }) } @@ -220,7 +253,7 @@ impl VmDriver { GetCapabilitiesResponse { driver_name: DRIVER_NAME.to_string(), driver_version: openshell_core::VERSION.to_string(), - default_image: String::new(), + default_image: self.config.default_image.clone(), supports_gpu: false, } } @@ -261,6 +294,9 @@ impl VmDriver { })?; } + let oci_launch = self.resolve_oci_launch(sandbox, &state_dir).await?; + let is_oci = oci_launch.is_some(); + let console_output = state_dir.join("rootfs-console.log"); let mut command = Command::new(&self.launcher_bin); command.kill_on_drop(true); @@ -282,7 +318,15 @@ impl VmDriver { command .arg("--vm-port") .arg(format!("{ssh_port}:{GUEST_SSH_PORT}")); - for env in build_guest_environment(sandbox, &self.config) { + if let Some(oci) = oci_launch.as_ref() { + command.arg("--vm-ro-base-disk").arg(&oci.base_disk_path); + command.arg("--vm-state-disk").arg(&oci.state_disk_path); + } + let mut guest_env = build_guest_environment(sandbox, &self.config, is_oci); + if let Some(oci) = oci_launch.as_ref() { + guest_env.extend(oci.guest_env_vars.iter().cloned()); + } + for env in guest_env { command.arg("--vm-env").arg(env); } @@ -433,6 +477,76 @@ impl VmDriver { snapshots } + /// Run the OCI pipeline for this sandbox if `template.image` (or 
the + /// driver's default image) is set, and materialize the per-sandbox state + /// disk. Returns `None` for legacy non-OCI sandboxes. + async fn resolve_oci_launch( + &self, + sandbox: &Sandbox, + state_dir: &Path, + ) -> Result, Status> { + let image_ref = effective_image_ref(sandbox, &self.config.default_image); + if image_ref.is_empty() { + return Ok(None); + } + + let oci = self.oci.clone().ok_or_else(|| { + Status::failed_precondition( + "OCI image support is not available: the host platform is not linux/amd64 or linux/arm64", + ) + })?; + let mksquashfs = self.config.mksquashfs_bin.clone().ok_or_else(|| { + Status::failed_precondition( + "OCI image support is not configured: set OPENSHELL_VM_MKSQUASHFS to the path of mksquashfs", + ) + })?; + + let env_overrides = crate::oci::EnvOverrides { + template: sandbox + .spec + .as_ref() + .and_then(|spec| spec.template.as_ref()) + .map(|template| template.environment.clone().into_iter().collect()) + .unwrap_or_default(), + spec: sandbox + .spec + .as_ref() + .map(|spec| spec.environment.clone().into_iter().collect()) + .unwrap_or_default(), + }; + + let build_opts = crate::oci::fs_image::BuildOptions::with_binary(mksquashfs); + let cached = crate::oci::prepare( + &oci.puller, + &oci.cache, + &build_opts, + &image_ref, + &env_overrides, + ) + .await + .map_err(|err| Status::internal(format!("OCI prepare failed: {err}")))?; + + let state_paths = crate::state_disk::SandboxStatePaths::for_state_dir(state_dir); + crate::state_disk::ensure_state_disk( + &state_paths.state_disk, + crate::state_disk::DEFAULT_STATE_DISK_SIZE_BYTES, + ) + .map_err(|err| Status::internal(format!("create sandbox state disk: {err}")))?; + + let guest_env_vars: Vec = cached + .metadata + .to_guest_env_vars() + .into_iter() + .map(|(k, v)| format!("{k}={v}")) + .collect(); + + Ok(Some(OciLaunch { + base_disk_path: cached.fs_image, + state_disk_path: state_paths.state_disk, + guest_env_vars, + })) + } + async fn monitor_sandbox(&self, 
sandbox_id: String) { let mut ready_emitted = false; @@ -713,6 +827,32 @@ impl ComputeDriver for VmDriver { } } +/// Per-sandbox OCI launch artifacts: cached RO base fs image, per-sandbox +/// writable state disk, and the launch-metadata env vars that get packed +/// into the guest init's environ. +#[derive(Debug)] +struct OciLaunch { + base_disk_path: PathBuf, + state_disk_path: PathBuf, + guest_env_vars: Vec, +} + +/// Return the OCI image reference to use for this sandbox, or `""` if the +/// sandbox is a legacy non-OCI VM sandbox. Spec overrides the driver default. +fn effective_image_ref(sandbox: &Sandbox, default_image: &str) -> String { + let requested = sandbox + .spec + .as_ref() + .and_then(|spec| spec.template.as_ref()) + .map(|template| template.image.as_str()) + .unwrap_or(""); + if !requested.is_empty() { + requested.to_string() + } else { + default_image.to_string() + } +} + fn validate_vm_sandbox(sandbox: &Sandbox) -> Result<(), Status> { let spec = sandbox .spec @@ -725,9 +865,9 @@ fn validate_vm_sandbox(sandbox: &Sandbox) -> Result<(), Status> { } if let Some(template) = spec.template.as_ref() { if !template.image.is_empty() { - return Err(Status::failed_precondition( - "vm sandboxes do not support template.image", - )); + crate::oci::validate_reference(&template.image).map_err(|err| { + Status::failed_precondition(format!("invalid template.image: {err}")) + })?; } if !template.agent_socket_path.is_empty() { return Err(Status::failed_precondition( @@ -779,7 +919,11 @@ fn guest_visible_openshell_endpoint(endpoint: &str) -> String { endpoint.to_string() } -fn build_guest_environment(sandbox: &Sandbox, config: &VmDriverConfig) -> Vec { +fn build_guest_environment( + sandbox: &Sandbox, + config: &VmDriverConfig, + is_oci: bool, +) -> Vec { let mut environment = HashMap::from([ ("HOME".to_string(), "/root".to_string()), ( @@ -805,15 +949,23 @@ fn build_guest_environment(sandbox: &Sandbox, config: &VmDriverConfig) -> Vec`, so this fallback is dead 
weight there — and + // passing it would muddy the contract (the supervisor's env-parsing path + // uses whitespace splitting, which would corrupt argv boundaries if a + // code path ever fell through to it). Only set it for non-OCI sandboxes. + if !is_oci { + environment.insert( + "OPENSHELL_SANDBOX_COMMAND".to_string(), + "tail -f /dev/null".to_string(), + ); + } if config.requires_tls_materials() { environment.extend(HashMap::from([ ( @@ -1097,13 +1249,42 @@ mod tests { ..Default::default() }; - let env = build_guest_environment(&sandbox, &config); + let env = build_guest_environment(&sandbox, &config, false); assert!(env.contains(&"HOME=/root".to_string())); assert!(env.contains(&"OPENSHELL_ENDPOINT=http://192.168.127.1:8080/".to_string())); assert!(env.contains(&"OPENSHELL_SANDBOX_ID=sandbox-123".to_string())); assert!(env.contains(&format!( "OPENSHELL_SSH_LISTEN_ADDR=0.0.0.0:{GUEST_SSH_PORT}" ))); + assert!( + env.iter() + .any(|e| e.starts_with("OPENSHELL_SANDBOX_COMMAND=")), + "non-OCI sandboxes should receive the fallback command" + ); + } + + #[test] + fn build_guest_environment_omits_sandbox_command_for_oci_sandboxes() { + let config = VmDriverConfig { + openshell_endpoint: "http://127.0.0.1:8080".to_string(), + ssh_handshake_secret: "secret".to_string(), + ..Default::default() + }; + let sandbox = Sandbox { + id: "sandbox-oci".to_string(), + name: "sandbox-oci".to_string(), + spec: Some(SandboxSpec::default()), + ..Default::default() + }; + + let env = build_guest_environment(&sandbox, &config, true); + assert!( + !env.iter() + .any(|e| e.starts_with("OPENSHELL_SANDBOX_COMMAND=")), + "OCI sandboxes should not get the legacy supervisor command fallback: {env:?}" + ); + // sanity: other guest-env bits should still be present + assert!(env.contains(&"OPENSHELL_SANDBOX_ID=sandbox-oci".to_string())); } #[test] @@ -1135,7 +1316,7 @@ mod tests { ..Default::default() }; - let env = build_guest_environment(&sandbox, &config); + let env = 
build_guest_environment(&sandbox, &config, false); assert!(env.contains(&format!("OPENSHELL_TLS_CA={GUEST_TLS_CA_PATH}"))); assert!(env.contains(&format!("OPENSHELL_TLS_CERT={GUEST_TLS_CERT_PATH}"))); assert!(env.contains(&format!("OPENSHELL_TLS_KEY={GUEST_TLS_KEY_PATH}"))); @@ -1161,6 +1342,7 @@ mod tests { launcher_bin: PathBuf::from("openshell-driver-vm"), registry: Arc::new(Mutex::new(HashMap::new())), events, + oci: None, }; let base = unique_temp_dir(); @@ -1205,6 +1387,95 @@ mod tests { let _ = std::fs::remove_dir_all(base); } + #[test] + fn effective_image_ref_prefers_spec_over_default() { + let sandbox = Sandbox { + spec: Some(SandboxSpec { + template: Some(SandboxTemplate { + image: "docker.io/library/alpine:3.20".to_string(), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + assert_eq!( + effective_image_ref(&sandbox, "docker.io/library/busybox"), + "docker.io/library/alpine:3.20" + ); + } + + #[test] + fn effective_image_ref_falls_back_to_default_when_spec_empty() { + let sandbox = Sandbox { + spec: Some(SandboxSpec { + template: Some(SandboxTemplate::default()), + ..Default::default() + }), + ..Default::default() + }; + assert_eq!( + effective_image_ref(&sandbox, "docker.io/library/busybox"), + "docker.io/library/busybox" + ); + } + + #[test] + fn effective_image_ref_is_empty_when_neither_is_set() { + let sandbox = Sandbox::default(); + assert!(effective_image_ref(&sandbox, "").is_empty()); + } + + #[test] + fn validate_vm_sandbox_accepts_valid_template_image() { + let sandbox = Sandbox { + spec: Some(SandboxSpec { + template: Some(SandboxTemplate { + image: "docker.io/library/alpine:3.20".to_string(), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + validate_vm_sandbox(&sandbox).expect("valid image ref should pass validation"); + } + + #[test] + fn validate_vm_sandbox_rejects_malformed_template_image() { + let sandbox = Sandbox { + spec: Some(SandboxSpec { + template: 
Some(SandboxTemplate { + image: "::not a valid ref::".to_string(), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + let err = validate_vm_sandbox(&sandbox).expect_err("malformed ref should fail"); + assert_eq!(err.code(), Code::FailedPrecondition); + assert!(err.message().contains("template.image")); + } + + #[test] + fn capabilities_advertise_default_image() { + let config = VmDriverConfig { + default_image: "docker.io/library/busybox".to_string(), + ..Default::default() + }; + let driver = VmDriver { + config, + launcher_bin: PathBuf::from("openshell-driver-vm"), + registry: Arc::new(Mutex::new(HashMap::new())), + events: broadcast::channel(WATCH_BUFFER).0, + oci: None, + }; + let caps = driver.capabilities(); + assert_eq!(caps.default_image, "docker.io/library/busybox"); + assert_eq!(caps.driver_name, DRIVER_NAME); + } + #[test] fn validate_openshell_endpoint_accepts_loopback_hosts() { validate_openshell_endpoint("http://127.0.0.1:8080") @@ -1309,6 +1580,139 @@ mod tests { let _ = std::fs::remove_dir_all(base); } + #[tokio::test] + async fn resolve_oci_launch_skips_when_no_image_is_requested() { + // A sandbox without template.image and with no driver default must + // fall through to the legacy non-OCI boot path. The resolver should + // return Ok(None) without consulting the puller or cache. 
+ let state_dir = unique_temp_dir(); + std::fs::create_dir_all(&state_dir).unwrap(); + let driver = VmDriver { + config: VmDriverConfig::default(), + launcher_bin: PathBuf::from("openshell-driver-vm"), + registry: Arc::new(Mutex::new(HashMap::new())), + events: broadcast::channel(WATCH_BUFFER).0, + oci: None, // unsupported host platform + }; + + let sandbox = Sandbox { + id: "sb".to_string(), + name: "sb".to_string(), + spec: Some(SandboxSpec::default()), + ..Default::default() + }; + + let result = driver + .resolve_oci_launch(&sandbox, &state_dir) + .await + .expect("no image requested → Ok(None)"); + assert!(result.is_none()); + let _ = std::fs::remove_dir_all(state_dir); + } + + #[tokio::test] + async fn resolve_oci_launch_fails_cleanly_when_host_platform_is_unsupported() { + // When `Platform::host()` returned None at driver construction, + // `self.oci` is None. Requesting an OCI image in that state must + // produce FailedPrecondition, not a panic or a silent fallback. + let state_dir = unique_temp_dir(); + std::fs::create_dir_all(&state_dir).unwrap(); + let driver = VmDriver { + config: VmDriverConfig { + default_image: "docker.io/library/alpine:3.20".to_string(), + ..Default::default() + }, + launcher_bin: PathBuf::from("openshell-driver-vm"), + registry: Arc::new(Mutex::new(HashMap::new())), + events: broadcast::channel(WATCH_BUFFER).0, + oci: None, + }; + + let sandbox = Sandbox { + spec: Some(SandboxSpec::default()), + ..Default::default() + }; + + let err = driver + .resolve_oci_launch(&sandbox, &state_dir) + .await + .expect_err("unsupported host platform should reject OCI sandboxes"); + assert_eq!(err.code(), Code::FailedPrecondition); + assert!(err.message().contains("linux/amd64 or linux/arm64")); + let _ = std::fs::remove_dir_all(state_dir); + } + + #[tokio::test] + async fn resolve_oci_launch_fails_cleanly_when_mksquashfs_is_missing() { + // If the host platform is supported but `mksquashfs_bin` isn't + // configured, the driver must refuse with 
FailedPrecondition so the + // gateway surfaces a diagnosable error instead of hanging at pull time. + let Some(platform) = crate::oci::Platform::host() else { + eprintln!("skipping: unsupported host platform"); + return; + }; + + let state_dir = unique_temp_dir(); + std::fs::create_dir_all(&state_dir).unwrap(); + let cache_root = state_dir.join("oci-cache"); + let cache = crate::oci::CacheLayout::new(cache_root); + cache.ensure_dirs().unwrap(); + + let driver = VmDriver { + config: VmDriverConfig { + default_image: "docker.io/library/alpine:3.20".to_string(), + mksquashfs_bin: None, + ..Default::default() + }, + launcher_bin: PathBuf::from("openshell-driver-vm"), + registry: Arc::new(Mutex::new(HashMap::new())), + events: broadcast::channel(WATCH_BUFFER).0, + oci: Some(Arc::new(VmOci { + puller: crate::oci::OciPuller::new(platform), + cache, + platform, + })), + }; + + let sandbox = Sandbox { + spec: Some(SandboxSpec::default()), + ..Default::default() + }; + + let err = driver + .resolve_oci_launch(&sandbox, &state_dir) + .await + .expect_err("missing mksquashfs should reject OCI sandboxes"); + assert_eq!(err.code(), Code::FailedPrecondition); + assert!(err.message().contains("OPENSHELL_VM_MKSQUASHFS")); + let _ = std::fs::remove_dir_all(state_dir); + } + + #[test] + fn effective_image_ref_drops_whitespace_only_spec_overrides() { + // Whitespace-only image refs should not be accepted as an override of + // the driver's configured default. In today's implementation the spec + // field is compared against "" exactly, so " " slips through — but + // that only affects downstream validation, which then rejects the + // malformed ref. This test documents the contract and will need an + // update if we decide to trim here. 
+ let sandbox = Sandbox { + spec: Some(SandboxSpec { + template: Some(SandboxTemplate { + image: " ".to_string(), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + // Current behavior: non-empty string wins, even if whitespace. + assert_eq!( + effective_image_ref(&sandbox, "docker.io/library/busybox"), + " " + ); + } + fn unique_temp_dir() -> PathBuf { static COUNTER: AtomicU64 = AtomicU64::new(0); let nanos = SystemTime::now() diff --git a/crates/openshell-driver-vm/src/ffi.rs b/crates/openshell-driver-vm/src/ffi.rs index 750788ac1..6770dbc0c 100644 --- a/crates/openshell-driver-vm/src/ffi.rs +++ b/crates/openshell-driver-vm/src/ffi.rs @@ -23,6 +23,11 @@ pub const KRUN_LOG_LEVEL_DEBUG: u32 = 4; pub const KRUN_LOG_LEVEL_TRACE: u32 = 5; pub const KRUN_LOG_STYLE_AUTO: u32 = 0; pub const KRUN_LOG_OPTION_NO_ENV: u32 = 1; +pub const KRUN_DISK_FORMAT_RAW: u32 = 0; +#[allow(dead_code)] // Used only on macOS (cfg-gated in state_disk_sync_mode) +pub const KRUN_SYNC_RELAXED: u32 = 1; +#[allow(dead_code)] // Used only on Linux (cfg-gated in state_disk_sync_mode) +pub const KRUN_SYNC_FULL: u32 = 2; type KrunInitLog = unsafe extern "C" fn(target_fd: i32, level: u32, style: u32, options: u32) -> i32; @@ -39,6 +44,17 @@ type KrunSetExec = unsafe extern "C" fn( ) -> i32; type KrunSetPortMap = unsafe extern "C" fn(ctx_id: u32, port_map: *const *const c_char) -> i32; type KrunSetConsoleOutput = unsafe extern "C" fn(ctx_id: u32, filepath: *const c_char) -> i32; +type KrunAddDisk3 = unsafe extern "C" fn( + ctx_id: u32, + block_id: *const c_char, + disk_path: *const c_char, + disk_format: u32, + read_only: bool, + direct_io: bool, + sync_mode: u32, +) -> i32; +type KrunAddVsockPort2 = + unsafe extern "C" fn(ctx_id: u32, port: u32, c_filepath: *const c_char, listen: bool) -> i32; type KrunStartEnter = unsafe extern "C" fn(ctx_id: u32) -> i32; type KrunDisableImplicitVsock = unsafe extern "C" fn(ctx_id: u32) -> i32; type KrunAddVsock = unsafe extern 
"C" fn(ctx_id: u32, tsi_features: u32) -> i32; @@ -70,6 +86,8 @@ pub struct LibKrun { pub krun_set_exec: KrunSetExec, pub krun_set_port_map: KrunSetPortMap, pub krun_set_console_output: KrunSetConsoleOutput, + pub krun_add_disk3: Option, + pub krun_add_vsock_port2: KrunAddVsockPort2, pub krun_start_enter: KrunStartEnter, pub krun_disable_implicit_vsock: KrunDisableImplicitVsock, pub krun_add_vsock: KrunAddVsock, @@ -127,6 +145,8 @@ impl LibKrun { b"krun_set_console_output\0", &libkrun_path, )?, + krun_add_disk3: load_optional_symbol(library, b"krun_add_disk3\0"), + krun_add_vsock_port2: load_symbol(library, b"krun_add_vsock_port2\0", &libkrun_path)?, krun_start_enter: load_symbol(library, b"krun_start_enter\0", &libkrun_path)?, krun_disable_implicit_vsock: load_symbol( library, @@ -204,3 +224,8 @@ fn load_symbol(library: &'static Library, name: &[u8], path: &Path) -> }) } } + +fn load_optional_symbol(library: &'static Library, name: &[u8]) -> Option { + let symbol = unsafe { library.get::(name).ok()? 
}; + Some(*symbol) +} diff --git a/crates/openshell-driver-vm/src/lib.rs b/crates/openshell-driver-vm/src/lib.rs index 1c424deeb..b7c2f9ab6 100644 --- a/crates/openshell-driver-vm/src/lib.rs +++ b/crates/openshell-driver-vm/src/lib.rs @@ -4,10 +4,18 @@ pub mod driver; mod embedded_runtime; mod ffi; +pub mod oci; mod rootfs; mod runtime; +pub mod state_disk; pub const GUEST_SSH_PORT: u16 = 2222; pub use driver::{VmDriver, VmDriverConfig}; -pub use runtime::{VM_RUNTIME_DIR_ENV, VmLaunchConfig, configured_runtime_dir, run_vm}; +pub use runtime::{ + ImportVsock, StateDisk, VM_RUNTIME_DIR_ENV, VmLaunchConfig, configured_runtime_dir, run_vm, +}; +pub use state_disk::{ + DEFAULT_STATE_DISK_SIZE_BYTES, IMPORT_VSOCK_PORT, STATE_DISK_BLOCK_ID, SandboxStatePaths, + ensure_state_disk, prepare_import_socket_dir, verify_import_socket_path, +}; diff --git a/crates/openshell-driver-vm/src/main.rs b/crates/openshell-driver-vm/src/main.rs index 3a7976273..87a6f6742 100644 --- a/crates/openshell-driver-vm/src/main.rs +++ b/crates/openshell-driver-vm/src/main.rs @@ -6,7 +6,8 @@ use miette::{IntoDiagnostic, Result}; use openshell_core::VERSION; use openshell_core::proto::compute::v1::compute_driver_server::ComputeDriverServer; use openshell_driver_vm::{ - VM_RUNTIME_DIR_ENV, VmDriver, VmDriverConfig, VmLaunchConfig, configured_runtime_dir, run_vm, + ImportVsock, STATE_DISK_BLOCK_ID, StateDisk, VM_RUNTIME_DIR_ENV, VmDriver, VmDriverConfig, + VmLaunchConfig, configured_runtime_dir, run_vm, }; use std::net::SocketAddr; use std::path::PathBuf; @@ -49,6 +50,26 @@ struct Args { #[arg(long, hide = true, default_value_t = 1)] vm_krun_log_level: u32, + #[arg(long, hide = true)] + vm_state_disk: Option, + + #[arg(long, hide = true, default_value = STATE_DISK_BLOCK_ID)] + vm_state_disk_block_id: String, + + /// Optional path to a read-only base disk (e.g. cached squashfs) used as + /// the overlay lower layer inside the guest. OCI sandboxes only. 
+ #[arg(long, hide = true)] + vm_ro_base_disk: Option, + + #[arg(long, hide = true, default_value = "oci-base")] + vm_ro_base_disk_block_id: String, + + #[arg(long, hide = true)] + vm_import_socket: Option, + + #[arg(long, hide = true)] + vm_import_vsock_port: Option, + #[arg( long, env = "OPENSHELL_COMPUTE_DRIVER_BIND", @@ -95,6 +116,16 @@ struct Args { #[arg(long, env = "OPENSHELL_VM_DRIVER_MEM_MIB", default_value_t = 2048)] mem_mib: u32, + + /// Default OCI image used when a sandbox spec omits `template.image`. + /// Advertised via `GetCapabilities.default_image`. + #[arg(long, env = "OPENSHELL_VM_DRIVER_DEFAULT_IMAGE", default_value = "")] + default_image: String, + + /// Path to the `mksquashfs` binary used to build RO base fs images. + /// Required for OCI-image sandboxes; unset → legacy-only driver. + #[arg(long, env = "OPENSHELL_VM_MKSQUASHFS")] + mksquashfs_bin: Option, } #[tokio::main] @@ -128,6 +159,8 @@ async fn main() -> Result<()> { guest_tls_ca: args.guest_tls_ca, guest_tls_cert: args.guest_tls_cert, guest_tls_key: args.guest_tls_key, + default_image: args.default_image, + mksquashfs_bin: args.mksquashfs_bin, }) .await .map_err(|err| miette::miette!("{err}"))?; @@ -175,6 +208,27 @@ fn build_vm_launch_config(args: &Args) -> std::result::Result Some(ImportVsock { port, socket_path }), + (None, None) => None, + _ => { + return Err( + "--vm-import-socket and --vm-import-vsock-port must be set together".to_string(), + ); + } + }; + Ok(VmLaunchConfig { rootfs, vcpus: args.vm_vcpus, @@ -186,6 +240,9 @@ fn build_vm_launch_config(args: &Args) -> std::result::Result/ +/// blobs/sha256/ raw manifest/config/layer bytes +/// fs/..squashfs RO base image +/// meta/..json launch metadata +/// tmp/ atomic-write staging +/// ``` +#[derive(Debug, Clone)] +pub struct CacheLayout { + root: PathBuf, +} + +impl CacheLayout { + #[must_use] + pub fn new(root: PathBuf) -> Self { + Self { root } + } + + #[must_use] + pub fn root(&self) -> &Path { + &self.root + } + + 
#[must_use] + pub fn blob_path(&self, digest: &str) -> PathBuf { + let (algo, hex) = split_digest(digest); + self.root.join("blobs").join(algo).join(hex) + } + + #[must_use] + pub fn fs_image_path(&self, manifest_digest: &str, platform: Platform) -> PathBuf { + let hex = strip_algo(manifest_digest); + self.root + .join("fs") + .join(format!("{hex}.{}.squashfs", platform.cache_tag())) + } + + #[must_use] + pub fn metadata_path(&self, manifest_digest: &str, platform: Platform) -> PathBuf { + let hex = strip_algo(manifest_digest); + self.root + .join("meta") + .join(format!("{hex}.{}.json", platform.cache_tag())) + } + + #[must_use] + pub fn tmp_dir(&self) -> PathBuf { + self.root.join("tmp") + } + + /// Create all cache subdirectories. Idempotent. + pub fn ensure_dirs(&self) -> io::Result<()> { + fs::create_dir_all(self.root.join("blobs/sha256"))?; + fs::create_dir_all(self.root.join("fs"))?; + fs::create_dir_all(self.root.join("meta"))?; + fs::create_dir_all(self.tmp_dir())?; + Ok(()) + } + + /// Check whether a cached fs image + metadata pair is present for this image. + #[must_use] + pub fn lookup(&self, manifest_digest: &str, platform: Platform) -> Option { + let fs_path = self.fs_image_path(manifest_digest, platform); + let meta_path = self.metadata_path(manifest_digest, platform); + if !fs_path.is_file() || !meta_path.is_file() { + return None; + } + let metadata_json = fs::read_to_string(&meta_path).ok()?; + let metadata: CachedMetadata = serde_json::from_str(&metadata_json).ok()?; + Some(CachedImage { + fs_image: fs_path, + metadata: metadata.launch, + }) + } + + /// Atomically write launch metadata for a built image. 
+ pub fn write_metadata( + &self, + manifest_digest: &str, + platform: Platform, + metadata: &LaunchMetadata, + ) -> io::Result<()> { + self.ensure_dirs()?; + let target = self.metadata_path(manifest_digest, platform); + let payload = serde_json::to_vec_pretty(&CachedMetadata { + schema: METADATA_SCHEMA_V1, + launch: metadata.clone(), + }) + .map_err(io::Error::other)?; + atomic_write(&self.tmp_dir(), &target, &payload) + } + + /// Atomically move a built fs image into its cache slot. The source path + /// must live on the same filesystem as the cache root (callers typically + /// build inside [`Self::tmp_dir`]). + pub fn install_fs_image( + &self, + manifest_digest: &str, + platform: Platform, + built_image: &Path, + ) -> io::Result { + self.ensure_dirs()?; + let dest = self.fs_image_path(manifest_digest, platform); + if dest.exists() { + fs::remove_file(&dest)?; + } + fs::rename(built_image, &dest)?; + Ok(dest) + } +} + +/// A cache hit with both the RO fs image path and its launch metadata. +#[derive(Debug, Clone)] +pub struct CachedImage { + pub fs_image: PathBuf, + pub metadata: LaunchMetadata, +} + +const METADATA_SCHEMA_V1: u32 = 1; + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct CachedMetadata { + schema: u32, + launch: LaunchMetadata, +} + +fn split_digest(digest: &str) -> (&str, &str) { + match digest.split_once(':') { + Some((algo, hex)) => (algo, hex), + None => ("sha256", digest), + } +} + +fn strip_algo(digest: &str) -> &str { + split_digest(digest).1 +} + +/// Write `bytes` to `target` via a rename inside `tmp_dir`, ensuring readers +/// never see a partial file. 
+fn atomic_write(tmp_dir: &Path, target: &Path, bytes: &[u8]) -> io::Result<()> { + fs::create_dir_all(tmp_dir)?; + if let Some(parent) = target.parent() { + fs::create_dir_all(parent)?; + } + let file_name = target + .file_name() + .ok_or_else(|| io::Error::other("cache target has no file name"))?; + let staging = tmp_dir.join(format!( + "{}.{}.tmp", + file_name.to_string_lossy(), + std::process::id() + )); + fs::write(&staging, bytes)?; + fs::rename(&staging, target)?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::BTreeMap; + + fn sample_metadata() -> LaunchMetadata { + LaunchMetadata { + argv: vec!["/bin/sh".to_string(), "-c".to_string(), "true".to_string()], + env: vec![("A".to_string(), "1".to_string())], + workdir: "/sandbox".to_string(), + labels: BTreeMap::new(), + stop_signal: String::new(), + } + } + + #[test] + fn digest_with_algo_splits_into_blob_path() { + let layout = CacheLayout::new(PathBuf::from("/cache")); + let path = layout.blob_path("sha256:abc123"); + assert_eq!(path, PathBuf::from("/cache/blobs/sha256/abc123")); + } + + #[test] + fn digest_without_algo_defaults_to_sha256() { + let layout = CacheLayout::new(PathBuf::from("/cache")); + let path = layout.blob_path("abc123"); + assert_eq!(path, PathBuf::from("/cache/blobs/sha256/abc123")); + } + + #[test] + fn fs_and_metadata_paths_include_platform_tag() { + let layout = CacheLayout::new(PathBuf::from("/cache")); + assert_eq!( + layout.fs_image_path("sha256:deadbeef", Platform::LinuxAmd64), + PathBuf::from("/cache/fs/deadbeef.amd64.squashfs") + ); + assert_eq!( + layout.metadata_path("sha256:deadbeef", Platform::LinuxArm64), + PathBuf::from("/cache/meta/deadbeef.arm64.json") + ); + } + + #[test] + fn lookup_returns_none_when_either_file_is_missing() { + let tmp = tempfile::tempdir().unwrap(); + let layout = CacheLayout::new(tmp.path().to_path_buf()); + layout.ensure_dirs().unwrap(); + assert!(layout.lookup("sha256:abc", Platform::LinuxAmd64).is_none()); + + // 
write metadata but no fs image + layout + .write_metadata("sha256:abc", Platform::LinuxAmd64, &sample_metadata()) + .unwrap(); + assert!(layout.lookup("sha256:abc", Platform::LinuxAmd64).is_none()); + } + + #[test] + fn lookup_returns_paired_fs_image_and_metadata() { + let tmp = tempfile::tempdir().unwrap(); + let layout = CacheLayout::new(tmp.path().to_path_buf()); + layout.ensure_dirs().unwrap(); + + // Seed the fs image slot with a placeholder file. + let fs_slot = layout.fs_image_path("sha256:abc", Platform::LinuxAmd64); + fs::create_dir_all(fs_slot.parent().unwrap()).unwrap(); + fs::write(&fs_slot, b"stub").unwrap(); + + layout + .write_metadata("sha256:abc", Platform::LinuxAmd64, &sample_metadata()) + .unwrap(); + + let hit = layout + .lookup("sha256:abc", Platform::LinuxAmd64) + .expect("expected cache hit"); + assert_eq!(hit.fs_image, fs_slot); + assert_eq!(hit.metadata.argv, sample_metadata().argv); + } + + #[test] + fn write_metadata_is_atomic_under_repeat_writes() { + let tmp = tempfile::tempdir().unwrap(); + let layout = CacheLayout::new(tmp.path().to_path_buf()); + layout + .write_metadata("sha256:abc", Platform::LinuxAmd64, &sample_metadata()) + .unwrap(); + + let mut updated = sample_metadata(); + updated.argv.push("extra".to_string()); + layout + .write_metadata("sha256:abc", Platform::LinuxAmd64, &updated) + .unwrap(); + + let hit = layout.lookup("sha256:abc", Platform::LinuxAmd64); + // no fs image, so lookup returns None; re-read the metadata directly. 
+ assert!(hit.is_none()); + let raw = + fs::read_to_string(layout.metadata_path("sha256:abc", Platform::LinuxAmd64)).unwrap(); + assert!(raw.contains("extra")); + } + + #[test] + fn install_fs_image_moves_built_image_into_slot() { + let tmp = tempfile::tempdir().unwrap(); + let layout = CacheLayout::new(tmp.path().to_path_buf()); + layout.ensure_dirs().unwrap(); + let built = layout.tmp_dir().join("built.squashfs"); + fs::write(&built, b"squashed").unwrap(); + + let slot = layout + .install_fs_image("sha256:xyz", Platform::LinuxAmd64, &built) + .unwrap(); + assert!(slot.is_file()); + assert!(!built.exists(), "source should be renamed, not copied"); + assert_eq!(fs::read(&slot).unwrap(), b"squashed"); + } +} diff --git a/crates/openshell-driver-vm/src/oci/client.rs b/crates/openshell-driver-vm/src/oci/client.rs new file mode 100644 index 000000000..32586d0f1 --- /dev/null +++ b/crates/openshell-driver-vm/src/oci/client.rs @@ -0,0 +1,241 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Thin wrapper around [`oci_client::Client`] that pulls a public image for a +//! specific guest platform and normalizes the response into data our pipeline +//! can consume. + +use std::collections::BTreeMap; +use std::str::FromStr; + +use oci_client::client::{ClientConfig, ImageLayer}; +use oci_client::manifest::{ + IMAGE_CONFIG_MEDIA_TYPE, IMAGE_DOCKER_CONFIG_MEDIA_TYPE, IMAGE_DOCKER_LAYER_GZIP_MEDIA_TYPE, + IMAGE_LAYER_GZIP_MEDIA_TYPE, IMAGE_LAYER_MEDIA_TYPE, IMAGE_MANIFEST_LIST_MEDIA_TYPE, + IMAGE_MANIFEST_MEDIA_TYPE, ImageIndexEntry, OCI_IMAGE_INDEX_MEDIA_TYPE, OCI_IMAGE_MEDIA_TYPE, +}; +use oci_client::secrets::RegistryAuth; +use oci_client::{Client, Reference}; + +use super::metadata::{ImageConfig, Platform}; + +/// Image pulled from a registry, with the normalized subset our pipeline needs. 
+#[derive(Debug)] +pub struct PulledImage { + /// Manifest digest (`sha256:...`), used as the cache key. + pub manifest_digest: String, + /// Layers in application order (lower → upper), already filtered for + /// supported media types. + pub layers: Vec, + /// Normalized OCI image config. + pub image_config: ImageConfig, +} + +/// Pulls public OCI images for a fixed guest platform. +pub struct OciPuller { + client: Client, + platform: Platform, +} + +impl OciPuller { + #[must_use] + pub fn new(platform: Platform) -> Self { + let config = ClientConfig { + platform_resolver: Some(Box::new(move |entries: &[ImageIndexEntry]| { + pick_platform(entries, platform) + })), + ..Default::default() + }; + Self { + client: Client::new(config), + platform, + } + } + + /// Pull `image_ref` (e.g. `docker.io/library/alpine:3.20`) anonymously. + /// + /// Returns the manifest digest + layer bytes + normalized config. Any + /// error from the registry or the config decoder is surfaced verbatim. + pub async fn pull(&self, image_ref: &str) -> Result { + let reference = Reference::from_str(image_ref) + .map_err(|err| PullError::InvalidReference(err.to_string()))?; + + let accepted = vec![ + IMAGE_LAYER_MEDIA_TYPE, + IMAGE_LAYER_GZIP_MEDIA_TYPE, + IMAGE_DOCKER_LAYER_GZIP_MEDIA_TYPE, + IMAGE_MANIFEST_MEDIA_TYPE, + OCI_IMAGE_MEDIA_TYPE, + IMAGE_MANIFEST_LIST_MEDIA_TYPE, + OCI_IMAGE_INDEX_MEDIA_TYPE, + IMAGE_CONFIG_MEDIA_TYPE, + IMAGE_DOCKER_CONFIG_MEDIA_TYPE, + ]; + + let image = self + .client + .pull(&reference, &RegistryAuth::Anonymous, accepted) + .await + .map_err(|err| PullError::Registry(err.to_string()))?; + + let manifest_digest = image.digest.ok_or_else(|| { + PullError::Registry("registry did not return a manifest digest".into()) + })?; + + let image_config = parse_image_config(&image.config.data)?; + + Ok(PulledImage { + manifest_digest, + layers: image.layers, + image_config, + }) + } + + #[must_use] + pub fn platform(&self) -> Platform { + self.platform + } +} + +/// Pick 
the first index entry matching the requested platform. +fn pick_platform(entries: &[ImageIndexEntry], platform: Platform) -> Option { + entries + .iter() + .find(|entry| { + entry + .platform + .as_ref() + .is_some_and(|p| p.os == platform.os() && p.architecture == platform.arch()) + }) + .map(|entry| entry.digest.clone()) +} + +/// Deserialize the OCI image config JSON into our minimal view. +fn parse_image_config(config_bytes: &[u8]) -> Result { + #[derive(serde::Deserialize)] + struct RawConfig { + config: Option, + } + #[derive(serde::Deserialize, Default)] + #[serde(default)] + struct InnerConfig { + #[serde(rename = "Entrypoint")] + entrypoint: Option>, + #[serde(rename = "Cmd")] + cmd: Option>, + #[serde(rename = "Env")] + env: Option>, + #[serde(rename = "WorkingDir")] + working_dir: Option, + #[serde(rename = "Labels")] + labels: Option>, + #[serde(rename = "StopSignal")] + stop_signal: Option, + } + + let raw: RawConfig = serde_json::from_slice(config_bytes) + .map_err(|err| PullError::MalformedConfig(err.to_string()))?; + let inner = raw.config.unwrap_or_default(); + Ok(ImageConfig { + entrypoint: inner.entrypoint.unwrap_or_default(), + cmd: inner.cmd.unwrap_or_default(), + env: inner.env.unwrap_or_default(), + working_dir: inner.working_dir.unwrap_or_default(), + labels: inner.labels.unwrap_or_default(), + stop_signal: inner.stop_signal.unwrap_or_default(), + }) +} + +/// Errors raised during image pull or normalization. 
+#[derive(Debug, thiserror::Error)] +pub enum PullError { + #[error("invalid image reference: {0}")] + InvalidReference(String), + #[error("registry error: {0}")] + Registry(String), + #[error("malformed OCI image config: {0}")] + MalformedConfig(String), +} + +#[cfg(test)] +mod tests { + use super::*; + use oci_client::manifest::Platform as SpecPlatform; + + fn entry(os: &str, arch: &str, digest: &str) -> ImageIndexEntry { + ImageIndexEntry { + media_type: OCI_IMAGE_MEDIA_TYPE.to_string(), + digest: digest.to_string(), + size: 0, + platform: Some(SpecPlatform { + architecture: arch.to_string(), + os: os.to_string(), + os_version: None, + os_features: None, + variant: None, + features: None, + }), + annotations: None, + } + } + + #[test] + fn pick_platform_selects_matching_entry() { + let entries = vec![ + entry("linux", "amd64", "sha256:amd"), + entry("linux", "arm64", "sha256:arm"), + ]; + assert_eq!( + pick_platform(&entries, Platform::LinuxAmd64), + Some("sha256:amd".to_string()) + ); + assert_eq!( + pick_platform(&entries, Platform::LinuxArm64), + Some("sha256:arm".to_string()) + ); + } + + #[test] + fn pick_platform_returns_none_when_unsupported() { + let entries = vec![entry("windows", "amd64", "sha256:win")]; + assert!(pick_platform(&entries, Platform::LinuxAmd64).is_none()); + } + + #[test] + fn parse_image_config_handles_entrypoint_and_cmd_fields() { + let json = br#"{ + "architecture": "amd64", + "os": "linux", + "config": { + "Entrypoint": ["/bin/sh", "-c"], + "Cmd": ["echo hello"], + "Env": ["PATH=/usr/bin"], + "WorkingDir": "/app", + "Labels": {"k": "v"}, + "StopSignal": "SIGTERM" + } + }"#; + let cfg = parse_image_config(json).unwrap(); + assert_eq!(cfg.entrypoint, vec!["/bin/sh", "-c"]); + assert_eq!(cfg.cmd, vec!["echo hello"]); + assert_eq!(cfg.env, vec!["PATH=/usr/bin"]); + assert_eq!(cfg.working_dir, "/app"); + assert_eq!(cfg.labels.get("k"), Some(&"v".to_string())); + assert_eq!(cfg.stop_signal, "SIGTERM"); + } + + #[test] + fn 
parse_image_config_tolerates_missing_config_block() { + let json = br#"{"architecture":"amd64","os":"linux"}"#; + let cfg = parse_image_config(json).unwrap(); + assert!(cfg.entrypoint.is_empty()); + assert!(cfg.cmd.is_empty()); + assert_eq!(cfg.working_dir, ""); + } + + #[test] + fn parse_image_config_rejects_malformed_json() { + let err = parse_image_config(b"not json").expect_err("should fail"); + assert!(matches!(err, PullError::MalformedConfig(_))); + } +} diff --git a/crates/openshell-driver-vm/src/oci/compat.rs b/crates/openshell-driver-vm/src/oci/compat.rs new file mode 100644 index 000000000..c9f4c400a --- /dev/null +++ b/crates/openshell-driver-vm/src/oci/compat.rs @@ -0,0 +1,197 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Inject OpenShell compatibility files into a flattened OCI rootfs tree. +//! +//! Runs after [`crate::oci::flatten`] and before the squashfs build, so the +//! sandbox user and its expected directories are baked into the RO base image. + +use std::fs; +use std::io; +use std::os::unix::fs::PermissionsExt as _; +use std::path::Path; + +/// Canonical sandbox user/group. Must match `openshell-sandbox`'s expectations. +pub const SANDBOX_UID: u32 = 10001; +pub const SANDBOX_GID: u32 = 10001; +pub const SANDBOX_USER: &str = "sandbox"; + +/// Apply all compat injections into `root`. Idempotent. 
+pub fn inject(root: &Path) -> io::Result<()> { + ensure_passwd_entry(root)?; + ensure_group_entry(root)?; + ensure_dir(&root.join("sandbox"), 0o755)?; + ensure_dir(&root.join("tmp"), 0o1777)?; + ensure_empty_file(&root.join("etc/hosts"), 0o644)?; + ensure_empty_file(&root.join("etc/resolv.conf"), 0o644)?; + Ok(()) +} + +fn ensure_passwd_entry(root: &Path) -> io::Result<()> { + let path = root.join("etc/passwd"); + let shell = pick_shell(root); + let entry = format!( + "{SANDBOX_USER}:x:{SANDBOX_UID}:{SANDBOX_GID}:OpenShell Sandbox:/sandbox:{shell}\n" + ); + append_user_db_entry(&path, SANDBOX_USER, &entry) +} + +fn ensure_group_entry(root: &Path) -> io::Result<()> { + let path = root.join("etc/group"); + let entry = format!("{SANDBOX_USER}:x:{SANDBOX_GID}:\n"); + append_user_db_entry(&path, SANDBOX_USER, &entry) +} + +/// Append `entry` to the colon-delimited user DB at `path` unless a line +/// already starts with `key:`. Creates `etc/` and the file if needed. +fn append_user_db_entry(path: &Path, key: &str, entry: &str) -> io::Result<()> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + + let existing = match fs::read_to_string(path) { + Ok(contents) => contents, + Err(err) if err.kind() == io::ErrorKind::NotFound => String::new(), + Err(err) => return Err(err), + }; + + let prefix = format!("{key}:"); + if existing.lines().any(|line| line.starts_with(&prefix)) { + return Ok(()); + } + + let mut combined = existing; + if !combined.is_empty() && !combined.ends_with('\n') { + combined.push('\n'); + } + combined.push_str(entry); + fs::write(path, combined)?; + fs::set_permissions(path, fs::Permissions::from_mode(0o644))?; + Ok(()) +} + +/// Pick the best shell path for the sandbox user. +/// +/// Prefers `/bin/sh` if present; falls back to `/sbin/nologin`, then +/// `/bin/false`. This guarantees a valid shell field in `/etc/passwd` +/// even for minimal images. 
fn pick_shell(root: &Path) -> String {
    for candidate in ["bin/sh", "sbin/nologin", "usr/sbin/nologin", "bin/false"] {
        // Test for the directory entry itself rather than following it.
        // Images commonly ship the shell as a symlink with an absolute target
        // (e.g. `/bin/sh -> /bin/busybox`); following it would resolve the
        // target on the host and wrongly report the shell as missing.
        if fs::symlink_metadata(root.join(candidate)).is_ok() {
            return format!("/{candidate}");
        }
    }
    "/sbin/nologin".to_string()
}

/// Fail with `InvalidData` when `path` itself is a symlink.
///
/// The rootfs is unpacked from an untrusted image, so a link at one of the
/// compat paths could redirect a write or chmod to an arbitrary location,
/// including the host when its target is absolute. A missing path is fine.
///
/// NOTE(review): only the final component is inspected; symlinked parent
/// directories are still followed. Replacing the link with a fresh file or
/// directory may be preferable to failing — confirm the desired policy.
fn refuse_symlink(path: &Path) -> io::Result<()> {
    match fs::symlink_metadata(path) {
        Ok(meta) if meta.file_type().is_symlink() => Err(io::Error::new(
            io::ErrorKind::InvalidData,
            format!(
                "refusing to follow symlink in image rootfs: {}",
                path.display()
            ),
        )),
        Ok(_) => Ok(()),
        Err(err) if err.kind() == io::ErrorKind::NotFound => Ok(()),
        Err(err) => Err(err),
    }
}

/// Create `path` (and any missing parents) if absent, then force its mode.
fn ensure_dir(path: &Path, mode: u32) -> io::Result<()> {
    refuse_symlink(path)?;
    if !path.exists() {
        fs::create_dir_all(path)?;
    }
    fs::set_permissions(path, fs::Permissions::from_mode(mode))
}

/// Create an empty file at `path` if absent, then force its mode.
///
/// Existing contents are never truncated.
fn ensure_empty_file(path: &Path, mode: u32) -> io::Result<()> {
    if let Some(parent) = path.parent() {
        fs::create_dir_all(parent)?;
    }
    refuse_symlink(path)?;
    if !path.exists() {
        fs::write(path, "")?;
    }
    fs::set_permissions(path, fs::Permissions::from_mode(mode))
}

#[cfg(test)]
mod tests {
    use super::*;

    fn fresh_root() -> tempfile::TempDir {
        tempfile::tempdir().unwrap()
    }

    #[test]
    fn pick_shell_detects_symlinked_shell_with_absolute_target() {
        let tmp = fresh_root();
        fs::create_dir_all(tmp.path().join("bin")).unwrap();
        // The target only makes sense inside the image; it must not be
        // resolved against the host filesystem.
        std::os::unix::fs::symlink(
            "/openshell-test/no-such-busybox",
            tmp.path().join("bin/sh"),
        )
        .unwrap();
        assert_eq!(pick_shell(tmp.path()), "/bin/sh");
    }

    #[test]
    fn ensure_dir_refuses_symlinked_target() {
        let tmp = fresh_root();
        let outside = fresh_root();
        std::os::unix::fs::symlink(outside.path(), tmp.path().join("tmp")).unwrap();
        let err = ensure_dir(&tmp.path().join("tmp"), 0o1777)
            .expect_err("symlinked directory must be rejected");
        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
    }

    #[test]
    fn inject_populates_passwd_group_and_dirs_on_empty_root() {
        let tmp = fresh_root();
        inject(tmp.path()).unwrap();

        let passwd = fs::read_to_string(tmp.path().join("etc/passwd")).unwrap();
        assert!(passwd.contains(&format!("{SANDBOX_USER}:x:{SANDBOX_UID}:{SANDBOX_GID}:")));

        let group = fs::read_to_string(tmp.path().join("etc/group")).unwrap();
        assert!(group.contains(&format!("{SANDBOX_USER}:x:{SANDBOX_GID}:")));

        let sandbox_meta = fs::metadata(tmp.path().join("sandbox")).unwrap();
        assert!(sandbox_meta.is_dir());
        assert_eq!(sandbox_meta.permissions().mode() & 0o777, 0o755);

        let tmp_meta = fs::metadata(tmp.path().join("tmp")).unwrap();
        assert_eq!(tmp_meta.permissions().mode() & 0o7777, 0o1777);

        assert!(tmp.path().join("etc/hosts").exists());
        assert!(tmp.path().join("etc/resolv.conf").exists());
    }

    #[test]
    fn inject_is_idempotent_and_does_not_duplicate_entries() {
        let tmp = fresh_root();
        inject(tmp.path()).unwrap();
        inject(tmp.path()).unwrap();

        let passwd = fs::read_to_string(tmp.path().join("etc/passwd")).unwrap();
        let sandbox_lines = passwd
            .lines()
            .filter(|line| line.starts_with(&format!("{SANDBOX_USER}:")))
.count(); + assert_eq!(sandbox_lines, 1, "sandbox user should appear exactly once"); + } + + #[test] + fn inject_preserves_existing_passwd_entries() { + let tmp = fresh_root(); + fs::create_dir_all(tmp.path().join("etc")).unwrap(); + fs::write( + tmp.path().join("etc/passwd"), + "root:x:0:0:root:/root:/bin/sh\n", + ) + .unwrap(); + + inject(tmp.path()).unwrap(); + + let passwd = fs::read_to_string(tmp.path().join("etc/passwd")).unwrap(); + assert!(passwd.contains("root:x:0:0:")); + assert!(passwd.contains(&format!("{SANDBOX_USER}:x:{SANDBOX_UID}:"))); + } + + #[test] + fn inject_uses_nologin_when_no_shell_present() { + let tmp = fresh_root(); + inject(tmp.path()).unwrap(); + let passwd = fs::read_to_string(tmp.path().join("etc/passwd")).unwrap(); + assert!( + passwd.contains(":/sbin/nologin"), + "expected nologin fallback, got: {passwd}" + ); + } + + #[test] + fn inject_uses_bin_sh_when_available() { + let tmp = fresh_root(); + fs::create_dir_all(tmp.path().join("bin")).unwrap(); + fs::write(tmp.path().join("bin/sh"), "").unwrap(); + fs::set_permissions(tmp.path().join("bin/sh"), fs::Permissions::from_mode(0o755)).unwrap(); + + inject(tmp.path()).unwrap(); + let passwd = fs::read_to_string(tmp.path().join("etc/passwd")).unwrap(); + assert!(passwd.contains(":/bin/sh")); + } + + #[test] + fn inject_does_not_truncate_existing_etc_hosts() { + let tmp = fresh_root(); + fs::create_dir_all(tmp.path().join("etc")).unwrap(); + fs::write(tmp.path().join("etc/hosts"), "127.0.0.1 localhost\n").unwrap(); + inject(tmp.path()).unwrap(); + let hosts = fs::read_to_string(tmp.path().join("etc/hosts")).unwrap(); + assert!(hosts.contains("127.0.0.1")); + } +} diff --git a/crates/openshell-driver-vm/src/oci/flatten.rs b/crates/openshell-driver-vm/src/oci/flatten.rs new file mode 100644 index 000000000..7d0d6dd75 --- /dev/null +++ b/crates/openshell-driver-vm/src/oci/flatten.rs @@ -0,0 +1,332 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Apply OCI image layers in order into a flat rootfs tree, honoring whiteouts. +//! +//! OCI whiteout convention (see image-spec): +//! - A file named `.wh.` in a layer means "delete " from the tree. +//! - A file named `.wh..wh..opq` in a directory means "opaque directory" — +//! delete all existing children of that directory before applying this +//! layer's additions. + +use std::fs; +use std::io::{self, Read}; +use std::path::{Component, Path, PathBuf}; + +const OPAQUE_MARKER: &str = ".wh..wh..opq"; +const WHITEOUT_PREFIX: &str = ".wh."; + +/// Apply a single gzip-compressed OCI layer tar stream into `dest`. +/// +/// Whiteouts are honored against the existing contents of `dest`; the +/// markers themselves are never materialized. +pub fn apply_layer(dest: &Path, layer_reader: R) -> io::Result<()> { + let gz = flate2::read::GzDecoder::new(layer_reader); + apply_tar_stream(dest, gz) +} + +/// Apply a layer whose bytes are in memory, dispatching on OCI media type. +/// +/// Supports `tar` (uncompressed) and `tar+gzip`. Other encodings +/// (`tar+zstd`, `tar+bzip2`) are rejected — OCI v1.1 allows them but common +/// registries still use gzip. +pub fn apply_layer_bytes(dest: &Path, media_type: &str, bytes: &[u8]) -> io::Result<()> { + let base = media_type.split(';').next().unwrap_or(media_type).trim(); + if base.ends_with("+gzip") || base.ends_with(".gzip") || base.ends_with(".tar.gzip") { + apply_layer(dest, bytes) + } else if base.ends_with(".tar") || base.ends_with("+tar") || base == "application/x-tar" { + apply_tar_stream(dest, bytes) + } else if base.ends_with("+zstd") { + Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("unsupported layer media type (zstd not supported in v1): {media_type}"), + )) + } else { + Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("unknown layer media type: {media_type}"), + )) + } +} + +/// Apply an uncompressed tar stream. 
Exposed for tests that build synthetic +/// layers in memory. +pub fn apply_tar_stream(dest: &Path, tar_reader: R) -> io::Result<()> { + let mut archive = tar::Archive::new(tar_reader); + archive.set_preserve_permissions(true); + archive.set_preserve_mtime(true); + archive.set_unpack_xattrs(false); + + for entry in archive.entries()? { + let mut entry = entry?; + let entry_path = entry.path()?.into_owned(); + let Some(rel) = sanitize_relative(&entry_path) else { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("layer tar contains unsafe path: {}", entry_path.display()), + )); + }; + + let Some(file_name) = rel.file_name().and_then(|n| n.to_str()) else { + // skip entries we cannot reason about (e.g. `.` top-level) + continue; + }; + + if file_name == OPAQUE_MARKER { + let parent = rel.parent().unwrap_or(Path::new("")); + clear_directory(&dest.join(parent))?; + continue; + } + + if let Some(target_name) = file_name.strip_prefix(WHITEOUT_PREFIX) { + let parent = rel.parent().unwrap_or(Path::new("")); + let target = dest.join(parent).join(target_name); + remove_any(&target)?; + continue; + } + + let dest_path = dest.join(&rel); + if let Some(parent) = dest_path.parent() { + fs::create_dir_all(parent)?; + } + entry.unpack(&dest_path)?; + } + + Ok(()) +} + +/// Reject absolute, parent-escaping, or root-component paths in layer tars. +fn sanitize_relative(path: &Path) -> Option { + let mut out = PathBuf::new(); + for component in path.components() { + match component { + Component::Normal(part) => out.push(part), + Component::CurDir => {} + Component::RootDir | Component::Prefix(_) | Component::ParentDir => return None, + } + } + if out.as_os_str().is_empty() { + return None; + } + Some(out) +} + +fn clear_directory(path: &Path) -> io::Result<()> { + if !path.exists() { + return Ok(()); + } + for entry in fs::read_dir(path)? 
{ + let entry = entry?; + remove_any(&entry.path())?; + } + Ok(()) +} + +fn remove_any(path: &Path) -> io::Result<()> { + match path.symlink_metadata() { + Ok(meta) => { + if meta.file_type().is_dir() { + fs::remove_dir_all(path) + } else { + fs::remove_file(path) + } + } + Err(err) if err.kind() == io::ErrorKind::NotFound => Ok(()), + Err(err) => Err(err), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + + /// Build an in-memory tar stream from a list of (path, contents) pairs. + /// Directories are created implicitly when their children have paths. + fn build_tar(entries: &[(&str, &[u8])]) -> Vec { + let mut buf = Vec::new(); + { + let mut builder = tar::Builder::new(&mut buf); + for (path, contents) in entries { + if path.ends_with('/') { + let mut header = tar::Header::new_gnu(); + header.set_path(path).unwrap(); + header.set_size(0); + header.set_mode(0o755); + header.set_entry_type(tar::EntryType::Directory); + header.set_cksum(); + builder.append(&header, io::empty()).unwrap(); + } else { + let mut header = tar::Header::new_gnu(); + header.set_path(path).unwrap(); + header.set_size(contents.len() as u64); + header.set_mode(0o644); + header.set_entry_type(tar::EntryType::Regular); + header.set_cksum(); + builder.append(&header, *contents).unwrap(); + } + } + builder.finish().unwrap(); + } + buf + } + + #[test] + fn whiteout_removes_file_from_lower_layer() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let lower = build_tar(&[("app/", b""), ("app/a.txt", b"a"), ("app/b.txt", b"b")]); + apply_tar_stream(root, lower.as_slice()).unwrap(); + assert!(root.join("app/a.txt").exists()); + + let upper = build_tar(&[("app/.wh.a.txt", b"")]); + apply_tar_stream(root, upper.as_slice()).unwrap(); + + assert!(!root.join("app/a.txt").exists(), "whiteout should remove a"); + assert!(root.join("app/b.txt").exists(), "b should still exist"); + assert!( + !root.join("app/.wh.a.txt").exists(), + "marker should not be 
materialized" + ); + } + + #[test] + fn opaque_whiteout_clears_directory_before_additions() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let lower = build_tar(&[ + ("data/", b""), + ("data/keep.txt", b"lower"), + ("data/gone.txt", b"lower"), + ]); + apply_tar_stream(root, lower.as_slice()).unwrap(); + + let upper = build_tar(&[ + ("data/", b""), + ("data/.wh..wh..opq", b""), + ("data/new.txt", b"upper"), + ]); + apply_tar_stream(root, upper.as_slice()).unwrap(); + + assert!(!root.join("data/keep.txt").exists()); + assert!(!root.join("data/gone.txt").exists()); + assert!(root.join("data/new.txt").exists()); + assert_eq!( + fs::read_to_string(root.join("data/new.txt")).unwrap(), + "upper" + ); + } + + #[test] + fn whiteout_removes_directory_recursively() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let lower = build_tar(&[ + ("dir/", b""), + ("dir/a.txt", b"a"), + ("dir/sub/", b""), + ("dir/sub/b.txt", b"b"), + ]); + apply_tar_stream(root, lower.as_slice()).unwrap(); + + let upper = build_tar(&[(".wh.dir", b"")]); + apply_tar_stream(root, upper.as_slice()).unwrap(); + + assert!(!root.join("dir").exists()); + } + + #[test] + fn layers_apply_in_order_with_later_overwriting_earlier() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + apply_tar_stream(root, build_tar(&[("x.txt", b"v1")]).as_slice()).unwrap(); + apply_tar_stream(root, build_tar(&[("x.txt", b"v2")]).as_slice()).unwrap(); + assert_eq!(fs::read_to_string(root.join("x.txt")).unwrap(), "v2"); + } + + #[test] + fn sanitize_relative_rejects_absolute_paths() { + assert!(sanitize_relative(Path::new("/etc/passwd")).is_none()); + } + + #[test] + fn sanitize_relative_rejects_parent_traversal() { + assert!(sanitize_relative(Path::new("../escape.txt")).is_none()); + assert!(sanitize_relative(Path::new("a/../../etc/passwd")).is_none()); + } + + #[test] + fn sanitize_relative_strips_curdir_and_keeps_clean_paths() { + assert_eq!( + 
sanitize_relative(Path::new("./etc/hosts")).unwrap(), + PathBuf::from("etc/hosts") + ); + assert_eq!( + sanitize_relative(Path::new("app/bin/sh")).unwrap(), + PathBuf::from("app/bin/sh") + ); + } + + #[test] + fn sanitize_relative_rejects_empty_and_root_only_paths() { + assert!(sanitize_relative(Path::new("")).is_none()); + assert!(sanitize_relative(Path::new("/")).is_none()); + } + + #[test] + fn apply_layer_bytes_dispatches_on_media_type() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + let tarball = build_tar(&[("plain.txt", b"v")]); + apply_layer_bytes(root, "application/vnd.oci.image.layer.v1.tar", &tarball).unwrap(); + assert!(root.join("plain.txt").exists()); + + let mut gz = Vec::new(); + { + let mut enc = flate2::write::GzEncoder::new(&mut gz, flate2::Compression::fast()); + enc.write_all(&build_tar(&[("gz.txt", b"v")])).unwrap(); + enc.finish().unwrap(); + } + apply_layer_bytes(root, "application/vnd.oci.image.layer.v1.tar+gzip", &gz).unwrap(); + assert!(root.join("gz.txt").exists()); + } + + #[test] + fn apply_layer_bytes_rejects_zstd_in_v1() { + let tmp = tempfile::tempdir().unwrap(); + let err = apply_layer_bytes( + tmp.path(), + "application/vnd.oci.image.layer.v1.tar+zstd", + b"", + ) + .expect_err("zstd should be rejected"); + assert!(err.to_string().contains("zstd")); + } + + #[test] + fn apply_layer_bytes_rejects_unknown_media_type() { + let tmp = tempfile::tempdir().unwrap(); + let err = apply_layer_bytes(tmp.path(), "application/bogus", b"") + .expect_err("unknown media type should fail"); + assert!(err.to_string().contains("unknown")); + } + + #[test] + fn apply_layer_handles_gzip_streams() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + let tarball = build_tar(&[("hello.txt", b"world")]); + let mut gz = Vec::new(); + { + let mut enc = flate2::write::GzEncoder::new(&mut gz, flate2::Compression::fast()); + enc.write_all(&tarball).unwrap(); + enc.finish().unwrap(); + } + apply_layer(root, 
gz.as_slice()).unwrap(); + assert_eq!(fs::read_to_string(root.join("hello.txt")).unwrap(), "world"); + } +} diff --git a/crates/openshell-driver-vm/src/oci/fs_image.rs b/crates/openshell-driver-vm/src/oci/fs_image.rs new file mode 100644 index 000000000..97849e1c5 --- /dev/null +++ b/crates/openshell-driver-vm/src/oci/fs_image.rs @@ -0,0 +1,156 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Build a read-only squashfs image from a flattened rootfs tree. +//! +//! Shell out to `mksquashfs`. The binary is expected to ship with the VM +//! runtime bundle under `/mksquashfs`; callers pass an explicit +//! path so the build is reproducible and does not depend on `$PATH`. + +use std::io; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; + +/// Options for building a squashfs image. +#[derive(Debug, Clone)] +pub struct BuildOptions { + /// Path to the `mksquashfs` binary. + pub mksquashfs: PathBuf, + /// Compression algorithm passed via `-comp`. + pub compression: Compression, + /// Optional extra flags forwarded verbatim (e.g. `-no-xattrs`). + pub extra_args: Vec, +} + +impl BuildOptions { + #[must_use] + pub fn with_binary(mksquashfs: PathBuf) -> Self { + Self { + mksquashfs, + compression: Compression::Zstd, + extra_args: Vec::new(), + } + } +} + +/// Compression algorithm for squashfs builds. `zstd` is the default; it has +/// the best decompression-speed-vs-ratio tradeoff for cold-start latency. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Compression { + Zstd, + Gzip, + Xz, +} + +impl Compression { + #[must_use] + pub const fn as_str(self) -> &'static str { + match self { + Self::Zstd => "zstd", + Self::Gzip => "gzip", + Self::Xz => "xz", + } + } +} + +/// Build a squashfs image from `source_dir` into `dest` using `options`. +/// +/// Returns an `io::Error` if the `mksquashfs` binary is missing or exits +/// non-zero. 
Callers are responsible for placing the result in the cache +/// via [`super::cache::CacheLayout::install_fs_image`]. +pub fn build(source_dir: &Path, dest: &Path, options: &BuildOptions) -> io::Result<()> { + if !options.mksquashfs.is_file() { + return Err(io::Error::new( + io::ErrorKind::NotFound, + format!( + "mksquashfs binary not found at {}", + options.mksquashfs.display() + ), + )); + } + if !source_dir.is_dir() { + return Err(io::Error::new( + io::ErrorKind::NotFound, + format!("source tree not found at {}", source_dir.display()), + )); + } + + if dest.exists() { + std::fs::remove_file(dest)?; + } + if let Some(parent) = dest.parent() { + std::fs::create_dir_all(parent)?; + } + + let mut cmd = Command::new(&options.mksquashfs); + cmd.arg(source_dir) + .arg(dest) + .arg("-noappend") + .arg("-quiet") + .arg("-comp") + .arg(options.compression.as_str()); + for arg in &options.extra_args { + cmd.arg(arg); + } + cmd.stdin(Stdio::null()); + + let output = cmd.output().map_err(|err| { + io::Error::other(format!( + "spawn mksquashfs {}: {err}", + options.mksquashfs.display() + )) + })?; + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(io::Error::other(format!( + "mksquashfs failed (status {}): {}", + output.status, + stderr.trim() + ))); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn build_fails_when_mksquashfs_is_missing() { + let tmp = tempfile::tempdir().unwrap(); + let source = tmp.path().join("src"); + std::fs::create_dir_all(&source).unwrap(); + let dest = tmp.path().join("out.squashfs"); + + let options = BuildOptions::with_binary(tmp.path().join("missing-mksquashfs")); + let err = build(&source, &dest, &options).expect_err("missing binary should fail"); + assert_eq!(err.kind(), io::ErrorKind::NotFound); + } + + #[test] + fn build_fails_when_source_tree_is_missing() { + let tmp = tempfile::tempdir().unwrap(); + let fake_bin = tmp.path().join("mksquashfs"); + 
std::fs::write(&fake_bin, "").unwrap(); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + std::fs::set_permissions(&fake_bin, std::fs::Permissions::from_mode(0o755)).unwrap(); + } + let options = BuildOptions::with_binary(fake_bin); + let err = build( + &tmp.path().join("missing-src"), + &tmp.path().join("out.squashfs"), + &options, + ) + .expect_err("missing source should fail"); + assert_eq!(err.kind(), io::ErrorKind::NotFound); + } + + #[test] + fn compression_tag_matches_mksquashfs_flag_values() { + assert_eq!(Compression::Zstd.as_str(), "zstd"); + assert_eq!(Compression::Gzip.as_str(), "gzip"); + assert_eq!(Compression::Xz.as_str(), "xz"); + } +} diff --git a/crates/openshell-driver-vm/src/oci/metadata.rs b/crates/openshell-driver-vm/src/oci/metadata.rs new file mode 100644 index 000000000..eecafc743 --- /dev/null +++ b/crates/openshell-driver-vm/src/oci/metadata.rs @@ -0,0 +1,336 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Normalized launch metadata derived from the OCI image config + sandbox spec. + +use std::collections::BTreeMap; +use std::fmt; + +use serde::{Deserialize, Serialize}; + +/// Guest platform an OCI manifest must match. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum Platform { + /// `linux/amd64` + LinuxAmd64, + /// `linux/arm64` + LinuxArm64, +} + +impl Platform { + /// Host build target. Returns `None` on unsupported host arches. + #[must_use] + pub fn host() -> Option { + match std::env::consts::ARCH { + "x86_64" => Some(Self::LinuxAmd64), + "aarch64" | "arm64" => Some(Self::LinuxArm64), + _ => None, + } + } + + /// OCI `os` component. + #[must_use] + pub const fn os(self) -> &'static str { + "linux" + } + + /// OCI `architecture` component. 
+ #[must_use] + pub const fn arch(self) -> &'static str { + match self { + Self::LinuxAmd64 => "amd64", + Self::LinuxArm64 => "arm64", + } + } + + /// Short string used in cache keys (`amd64`, `arm64`). + #[must_use] + pub const fn cache_tag(self) -> &'static str { + self.arch() + } +} + +impl fmt::Display for Platform { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}/{}", self.os(), self.arch()) + } +} + +/// Normalized command + environment the guest init will hand to the supervisor. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct LaunchMetadata { + /// Exact argv boundaries preserved (no shell split). + pub argv: Vec, + /// Ordered env, OCI config < template < sandbox spec. + pub env: Vec<(String, String)>, + /// Working directory inside the container rootfs. + pub workdir: String, + /// Labels copied from the OCI config (advisory; carried for introspection). + pub labels: BTreeMap, + /// Stop signal name from the OCI config (e.g. `SIGTERM`). Empty → default. + pub stop_signal: String, +} + +impl LaunchMetadata { + /// Normalize an OCI image config plus caller-supplied overrides into a + /// launch descriptor. + /// + /// Precedence for env: OCI config < template env < sandbox spec env. + /// Argv = OCI `Entrypoint` + `Cmd` per OCI spec precedence. + /// Workdir = OCI `WorkingDir` if absolute and non-empty, else `/sandbox`. + pub fn build( + image_config: ImageConfig, + template_env: &BTreeMap, + spec_env: &BTreeMap, + ) -> Result { + let argv = resolve_argv(&image_config.entrypoint, &image_config.cmd)?; + let workdir = resolve_workdir(&image_config.working_dir); + let env = merge_env(&image_config.env, template_env, spec_env)?; + + Ok(Self { + argv, + env, + workdir, + labels: image_config.labels, + stop_signal: image_config.stop_signal, + }) + } + + /// Render this metadata into env vars the guest init can consume. + /// + /// - `OPENSHELL_OCI_ARGC=`, `OPENSHELL_OCI_ARGV_=` for each i in 0..n. 
+ /// - `OPENSHELL_OCI_ENV_COUNT=`, `OPENSHELL_OCI_ENV_==` for each i. + /// - `OPENSHELL_OCI_WORKDIR=`. + /// + /// A single env channel keeps this delivery in-band with the krun + /// `set_exec` call, avoiding any on-disk metadata file or vsock transfer. + #[must_use] + pub fn to_guest_env_vars(&self) -> Vec<(String, String)> { + let mut out = Vec::with_capacity(self.argv.len() + self.env.len() + 3); + out.push(( + "OPENSHELL_OCI_ARGC".to_string(), + self.argv.len().to_string(), + )); + for (i, arg) in self.argv.iter().enumerate() { + out.push((format!("OPENSHELL_OCI_ARGV_{i}"), arg.clone())); + } + out.push(( + "OPENSHELL_OCI_ENV_COUNT".to_string(), + self.env.len().to_string(), + )); + for (i, (key, value)) in self.env.iter().enumerate() { + out.push((format!("OPENSHELL_OCI_ENV_{i}"), format!("{key}={value}"))); + } + out.push(("OPENSHELL_OCI_WORKDIR".to_string(), self.workdir.clone())); + out + } +} + +/// Minimal view of the OCI image config we care about. +#[derive(Debug, Clone, Default)] +pub struct ImageConfig { + pub entrypoint: Vec, + pub cmd: Vec, + pub env: Vec, + pub working_dir: String, + pub labels: BTreeMap, + pub stop_signal: String, +} + +/// Errors raised when the image config is missing required data. 
+#[derive(Debug, thiserror::Error)] +pub enum BuildError { + #[error("image config has no runnable command (Entrypoint and Cmd are both empty)")] + EmptyCommand, + #[error("image env entry is not KEY=VALUE: {0}")] + MalformedEnv(String), + #[error("template env entry has empty key")] + EmptyTemplateEnvKey, +} + +fn resolve_argv(entrypoint: &[String], cmd: &[String]) -> Result, BuildError> { + let mut argv = Vec::with_capacity(entrypoint.len() + cmd.len()); + argv.extend(entrypoint.iter().cloned()); + argv.extend(cmd.iter().cloned()); + if argv.is_empty() { + return Err(BuildError::EmptyCommand); + } + Ok(argv) +} + +fn resolve_workdir(oci_workdir: &str) -> String { + if oci_workdir.starts_with('/') && !oci_workdir.is_empty() { + oci_workdir.to_string() + } else { + "/sandbox".to_string() + } +} + +fn merge_env( + oci_env: &[String], + template: &BTreeMap, + spec: &BTreeMap, +) -> Result, BuildError> { + let mut merged: BTreeMap = BTreeMap::new(); + for entry in oci_env { + let Some((key, value)) = entry.split_once('=') else { + return Err(BuildError::MalformedEnv(entry.clone())); + }; + merged.insert(key.to_string(), value.to_string()); + } + for (key, value) in template { + if key.is_empty() { + return Err(BuildError::EmptyTemplateEnvKey); + } + merged.insert(key.clone(), value.clone()); + } + for (key, value) in spec { + merged.insert(key.clone(), value.clone()); + } + Ok(merged.into_iter().collect()) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn config(entrypoint: &[&str], cmd: &[&str], env: &[&str], workdir: &str) -> ImageConfig { + ImageConfig { + entrypoint: entrypoint.iter().map(|s| (*s).to_string()).collect(), + cmd: cmd.iter().map(|s| (*s).to_string()).collect(), + env: env.iter().map(|s| (*s).to_string()).collect(), + working_dir: workdir.to_string(), + ..Default::default() + } + } + + #[test] + fn argv_is_entrypoint_then_cmd() { + let meta = LaunchMetadata::build( + config(&["/bin/sh", "-c"], &["echo hi"], &[], "/app"), + &BTreeMap::new(), + 
&BTreeMap::new(), + ) + .unwrap(); + assert_eq!(meta.argv, vec!["/bin/sh", "-c", "echo hi"]); + assert_eq!(meta.workdir, "/app"); + } + + #[test] + fn argv_falls_back_to_cmd_only() { + let meta = LaunchMetadata::build( + config(&[], &["/bin/busybox", "sh"], &[], ""), + &BTreeMap::new(), + &BTreeMap::new(), + ) + .unwrap(); + assert_eq!(meta.argv, vec!["/bin/busybox", "sh"]); + } + + #[test] + fn empty_command_is_rejected() { + let err = LaunchMetadata::build( + config(&[], &[], &[], ""), + &BTreeMap::new(), + &BTreeMap::new(), + ) + .expect_err("empty command must be rejected"); + assert!(matches!(err, BuildError::EmptyCommand)); + } + + #[test] + fn workdir_falls_back_to_sandbox() { + let meta = LaunchMetadata::build( + config(&["/bin/sh"], &[], &[], ""), + &BTreeMap::new(), + &BTreeMap::new(), + ) + .unwrap(); + assert_eq!(meta.workdir, "/sandbox"); + + let meta = LaunchMetadata::build( + config(&["/bin/sh"], &[], &[], "relative/path"), + &BTreeMap::new(), + &BTreeMap::new(), + ) + .unwrap(); + assert_eq!(meta.workdir, "/sandbox"); + } + + #[test] + fn env_precedence_is_oci_then_template_then_spec() { + let template: BTreeMap = [("A", "template"), ("B", "template")] + .into_iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + let spec: BTreeMap = [("B", "spec"), ("C", "spec")] + .into_iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + + let meta = LaunchMetadata::build( + config(&["/bin/sh"], &[], &["A=oci", "B=oci", "D=oci"], "/app"), + &template, + &spec, + ) + .unwrap(); + + let env: BTreeMap = meta.env.into_iter().collect(); + assert_eq!(env.get("A"), Some(&"template".to_string())); + assert_eq!(env.get("B"), Some(&"spec".to_string())); + assert_eq!(env.get("C"), Some(&"spec".to_string())); + assert_eq!(env.get("D"), Some(&"oci".to_string())); + } + + #[test] + fn malformed_oci_env_entry_is_rejected() { + let err = LaunchMetadata::build( + config(&["/bin/sh"], &[], &["BROKEN"], "/app"), + &BTreeMap::new(), + 
&BTreeMap::new(), + ) + .expect_err("missing '=' should fail"); + assert!(matches!(err, BuildError::MalformedEnv(_))); + } + + #[test] + fn to_guest_env_vars_round_trips_argv_with_spaces() { + let meta = LaunchMetadata::build( + config( + &["/bin/sh", "-c"], + &["echo 'hello world'"], + &["A=1"], + "/app", + ), + &BTreeMap::new(), + &BTreeMap::new(), + ) + .unwrap(); + + let env: BTreeMap = meta.to_guest_env_vars().into_iter().collect(); + assert_eq!(env.get("OPENSHELL_OCI_ARGC"), Some(&"3".to_string())); + assert_eq!( + env.get("OPENSHELL_OCI_ARGV_0"), + Some(&"/bin/sh".to_string()) + ); + assert_eq!(env.get("OPENSHELL_OCI_ARGV_1"), Some(&"-c".to_string())); + assert_eq!( + env.get("OPENSHELL_OCI_ARGV_2"), + Some(&"echo 'hello world'".to_string()) + ); + assert_eq!(env.get("OPENSHELL_OCI_ENV_COUNT"), Some(&"1".to_string())); + assert_eq!(env.get("OPENSHELL_OCI_ENV_0"), Some(&"A=1".to_string())); + assert_eq!(env.get("OPENSHELL_OCI_WORKDIR"), Some(&"/app".to_string())); + } + + #[test] + fn host_platform_is_recognized_on_supported_arches() { + let platform = Platform::host(); + // On CI/dev machines this should always be amd64 or arm64. + assert!(matches!( + platform, + Some(Platform::LinuxAmd64) | Some(Platform::LinuxArm64) + )); + } +} diff --git a/crates/openshell-driver-vm/src/oci/mod.rs b/crates/openshell-driver-vm/src/oci/mod.rs new file mode 100644 index 000000000..77cd266f7 --- /dev/null +++ b/crates/openshell-driver-vm/src/oci/mod.rs @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Host-side OCI image pipeline for the VM driver. +//! +//! Responsible for resolving a public OCI image reference to a cached, +//! read-only squashfs filesystem image and a launch metadata descriptor +//! that the guest uses to overlay + exec the container entrypoint. 
+ +pub mod cache; +pub mod client; +pub mod compat; +pub mod flatten; +pub mod fs_image; +pub mod metadata; +pub mod pipeline; + +pub use cache::{CacheLayout, CachedImage}; +pub use client::{OciPuller, PullError, PulledImage}; +pub use metadata::{LaunchMetadata, Platform}; +pub use pipeline::{EnvOverrides, PipelineError, prepare, validate_reference}; diff --git a/crates/openshell-driver-vm/src/oci/pipeline.rs b/crates/openshell-driver-vm/src/oci/pipeline.rs new file mode 100644 index 000000000..34ec89163 --- /dev/null +++ b/crates/openshell-driver-vm/src/oci/pipeline.rs @@ -0,0 +1,147 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! End-to-end orchestrator: image ref → cached squashfs + launch metadata. +//! +//! The image is always pulled first, since the cache is keyed by its manifest +//! digest. On a hit the cached descriptor is returned; on a miss the layers are +//! flattened, compat files injected, and a squashfs built and installed. + +use std::collections::BTreeMap; + +use tracing::{debug, info}; + +use super::cache::{CacheLayout, CachedImage}; +use super::client::{OciPuller, PullError}; +use super::compat; +use super::flatten; +use super::fs_image::{self, BuildOptions}; +use super::metadata::{BuildError, LaunchMetadata}; + +/// Sandbox- and template-level env overrides that the pipeline merges into +/// the final launch metadata. +#[derive(Debug, Default, Clone)] +pub struct EnvOverrides { + pub template: BTreeMap, + pub spec: BTreeMap, +} + +/// Prepare an OCI image into a cache-backed [`CachedImage`] descriptor. +/// +/// Idempotent: the image is always pulled to resolve its manifest digest, but if +/// that digest + platform is already cached, flattening and the squashfs build are skipped.
+pub async fn prepare( + puller: &OciPuller, + cache: &CacheLayout, + build_opts: &BuildOptions, + image_ref: &str, + env_overrides: &EnvOverrides, +) -> Result { + cache.ensure_dirs().map_err(PipelineError::Cache)?; + + let platform = puller.platform(); + + debug!(image = image_ref, %platform, "resolving OCI image"); + let pulled = puller.pull(image_ref).await.map_err(PipelineError::Pull)?; + let manifest_digest = pulled.manifest_digest.clone(); + + if let Some(hit) = cache.lookup(&manifest_digest, platform) { + info!(digest = %manifest_digest, %platform, "OCI cache hit, skipping build"); + return Ok(hit); + } + + debug!(digest = %manifest_digest, "flattening OCI layers"); + let staging = cache + .tmp_dir() + .join(format!("stage-{}", strip_prefix(&manifest_digest))); + if staging.exists() { + std::fs::remove_dir_all(&staging).map_err(PipelineError::Cache)?; + } + std::fs::create_dir_all(&staging).map_err(PipelineError::Cache)?; + + for layer in &pulled.layers { + flatten::apply_layer_bytes(&staging, &layer.media_type, &layer.data) + .map_err(PipelineError::Flatten)?; + } + + debug!("injecting OpenShell compatibility files"); + compat::inject(&staging).map_err(PipelineError::Compat)?; + + let metadata = LaunchMetadata::build( + pulled.image_config, + &env_overrides.template, + &env_overrides.spec, + ) + .map_err(PipelineError::Metadata)?; + + let built = cache + .tmp_dir() + .join(format!("build-{}.squashfs", strip_prefix(&manifest_digest))); + debug!(output = %built.display(), "building squashfs"); + fs_image::build(&staging, &built, build_opts).map_err(PipelineError::Build)?; + + // Staging tree is no longer needed once the fs image is built. 
+ let _ = std::fs::remove_dir_all(&staging); + + let installed = cache + .install_fs_image(&manifest_digest, platform, &built) + .map_err(PipelineError::Cache)?; + cache + .write_metadata(&manifest_digest, platform, &metadata) + .map_err(PipelineError::Cache)?; + + info!(digest = %manifest_digest, %platform, path = %installed.display(), "OCI image prepared"); + Ok(CachedImage { + fs_image: installed, + metadata, + }) +} + +/// Validate that an image reference is structurally OK before we bother the +/// registry. Useful for `validate_sandbox_create`. +pub fn validate_reference(image_ref: &str) -> Result<(), PipelineError> { + use std::str::FromStr; + oci_client::Reference::from_str(image_ref) + .map(|_| ()) + .map_err(|err| PipelineError::Pull(PullError::InvalidReference(err.to_string()))) +} + +#[allow(clippy::module_name_repetitions)] +#[derive(Debug, thiserror::Error)] +pub enum PipelineError { + #[error("cache I/O: {0}")] + Cache(#[source] std::io::Error), + #[error(transparent)] + Pull(PullError), + #[error("flatten layer: {0}")] + Flatten(#[source] std::io::Error), + #[error("inject compat files: {0}")] + Compat(#[source] std::io::Error), + #[error(transparent)] + Metadata(BuildError), + #[error("build fs image: {0}")] + Build(#[source] std::io::Error), +} + +fn strip_prefix(digest: &str) -> &str { + digest.split_once(':').map_or(digest, |(_, hex)| hex) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn validate_reference_accepts_canonical_image_ref() { + validate_reference("docker.io/library/alpine:3.20").expect("valid"); + validate_reference( + "ghcr.io/org/image@sha256:0000000000000000000000000000000000000000000000000000000000000000", + ) + .expect("digest ref"); + } + + #[test] + fn validate_reference_rejects_empty_string() { + validate_reference("").expect_err("empty ref should fail"); + } +} diff --git a/crates/openshell-driver-vm/src/runtime.rs b/crates/openshell-driver-vm/src/runtime.rs index 9888feb18..1f4669bc5 100644 --- 
a/crates/openshell-driver-vm/src/runtime.rs +++ b/crates/openshell-driver-vm/src/runtime.rs @@ -29,6 +29,34 @@ pub struct VmLaunchConfig { pub port_map: Vec, pub log_level: u32, pub console_output: PathBuf, + /// Optional host-backed raw block image for mutable guest state. + /// Required when booting an imported OCI rootfs. + pub state_disk: Option, + /// Optional host-backed read-only base disk (e.g. a cached squashfs + /// image) used as the lower layer of an overlay mount in the guest. + /// Only set for OCI-image sandboxes. + pub ro_base_disk: Option, + /// Optional host Unix socket bridged into the guest as a vsock port. + /// Used by the OCI payload import channel. + pub import_vsock: Option, +} + +/// Block device exposed to the guest. +/// +/// The name is historical; both writable state disks and read-only base +/// images (e.g. squashfs) use this type. `read_only` distinguishes them. +#[derive(Debug, Clone)] +pub struct StateDisk { + pub path: PathBuf, + pub block_id: String, + pub read_only: bool, +} + +/// Host-side endpoint bridged to a guest vsock port. 
+#[derive(Debug, Clone)] +pub struct ImportVsock { + pub port: u32, + pub socket_path: PathBuf, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -64,6 +92,18 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { vm.set_root(&config.rootfs)?; vm.set_workdir(&config.workdir)?; + if let Some(disk) = config.ro_base_disk.as_ref() { + vm.add_state_disk(disk)?; + } + if let Some(disk) = config.state_disk.as_ref() { + vm.add_state_disk(disk)?; + } + if let Some(vsock) = config.import_vsock.as_ref() { + crate::state_disk::prepare_import_socket_dir(&vsock.socket_path) + .map_err(|err| format!("prepare import socket dir: {err}"))?; + vm.add_vsock_port(vsock)?; + } + let mut forwarded_port_map = config.port_map.clone(); let mut gvproxy_guard = None; let mut gvproxy_api_sock = None; @@ -273,6 +313,17 @@ fn raise_nofile_limit() { } } +fn state_disk_sync_mode() -> u32 { + #[cfg(target_os = "macos")] + { + ffi::KRUN_SYNC_RELAXED + } + #[cfg(not(target_os = "macos"))] + { + ffi::KRUN_SYNC_FULL + } +} + fn clamp_log_level(level: u32) -> u32 { match level { 0 => ffi::KRUN_LOG_LEVEL_OFF, @@ -330,6 +381,39 @@ impl VmContext { ) } + fn add_state_disk(&self, disk: &StateDisk) -> Result<(), String> { + let add_disk3 = self.krun.krun_add_disk3.ok_or_else(|| { + "libkrun runtime does not expose krun_add_disk3; rebuild the VM runtime with block support".to_string() + })?; + let block_id_c = + CString::new(disk.block_id.as_str()).map_err(|e| format!("invalid block id: {e}"))?; + let disk_path_c = path_to_cstring(&disk.path)?; + check( + unsafe { + add_disk3( + self.ctx_id, + block_id_c.as_ptr(), + disk_path_c.as_ptr(), + ffi::KRUN_DISK_FORMAT_RAW, + disk.read_only, + false, + state_disk_sync_mode(), + ) + }, + "krun_add_disk3", + ) + } + + fn add_vsock_port(&self, vsock: &ImportVsock) -> Result<(), String> { + let socket_c = path_to_cstring(&vsock.socket_path)?; + check( + unsafe { + (self.krun.krun_add_vsock_port2)(self.ctx_id, vsock.port, socket_c.as_ptr(), true) + }, + 
"krun_add_vsock_port2", + ) + } + fn set_workdir(&self, workdir: &str) -> Result<(), String> { let workdir_c = CString::new(workdir).map_err(|e| format!("invalid workdir: {e}"))?; check( diff --git a/crates/openshell-driver-vm/src/state_disk.rs b/crates/openshell-driver-vm/src/state_disk.rs new file mode 100644 index 000000000..d7ad700a3 --- /dev/null +++ b/crates/openshell-driver-vm/src/state_disk.rs @@ -0,0 +1,260 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Helpers for per-sandbox state disks and host-to-guest import sockets +//! used by the OCI container execution path. + +#![allow(unsafe_code)] + +use std::fs; +use std::io; +use std::os::unix::fs::PermissionsExt as _; +use std::path::{Path, PathBuf}; + +/// Default raw state disk size when the driver has not been given an override. +/// Sparse-allocated; only actual writes consume space. +pub const DEFAULT_STATE_DISK_SIZE_BYTES: u64 = 16 * 1024 * 1024 * 1024; + +/// libkrun block ID the guest init script uses to locate the state disk. +pub const STATE_DISK_BLOCK_ID: &str = "sandbox-state"; + +/// vsock port used for one-shot OCI payload import. +pub const IMPORT_VSOCK_PORT: u32 = 10778; + +/// Layout of per-sandbox state-disk and import-socket paths. +#[derive(Debug, Clone)] +pub struct SandboxStatePaths { + /// Raw sparse disk image attached to the VM. + pub state_disk: PathBuf, + /// Host Unix socket bridged to the guest on [`IMPORT_VSOCK_PORT`]. + pub import_socket: PathBuf, +} + +impl SandboxStatePaths { + #[must_use] + pub fn for_state_dir(state_dir: &Path) -> Self { + Self { + state_disk: state_dir.join("sandbox-state.raw"), + import_socket: state_dir.join("oci-import.sock"), + } + } +} + +/// Create (or grow to size) the sparse raw state disk image. 
+pub fn ensure_state_disk(path: &Path, size_bytes: u64) -> io::Result<()> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + + let file = fs::OpenOptions::new() + .read(true) + .write(true) + .create(true) + .truncate(false) + .open(path)?; + + let current = file.metadata()?.len(); + if current < size_bytes { + file.set_len(size_bytes)?; + } + Ok(()) +} + +/// Prepare the import-socket parent directory and remove any stale socket file. +/// +/// The parent directory is created with `0700`. If it already exists, it must +/// not be a symlink and must be owned by the current uid, otherwise we refuse +/// to use it — a tampered path would let an unprivileged user substitute the +/// socket before the VM connects to it. +pub fn prepare_import_socket_dir(socket_path: &Path) -> io::Result<()> { + let parent = socket_path + .parent() + .ok_or_else(|| io::Error::other("import socket path has no parent directory"))?; + + if parent.exists() { + let meta = parent.symlink_metadata()?; + if meta.file_type().is_symlink() { + return Err(io::Error::other(format!( + "import socket directory {} is a symlink; refusing to use it", + parent.display() + ))); + } + check_owner_and_mode(parent, &meta)?; + } else { + fs::create_dir_all(parent)?; + fs::set_permissions(parent, fs::Permissions::from_mode(0o700))?; + } + + match fs::remove_file(socket_path) { + Ok(()) => Ok(()), + Err(err) if err.kind() == io::ErrorKind::NotFound => Ok(()), + Err(err) => Err(err), + } +} + +/// Verify that `path` and its parent directory are not symlinks and are owned by +/// the current uid, and that the parent directory grants no group/other access.
+pub fn verify_import_socket_path(path: &Path) -> io::Result<()> { + let meta = path.symlink_metadata()?; + if meta.file_type().is_symlink() { + return Err(io::Error::other(format!( + "import socket path {} is a symlink; refusing to use it", + path.display() + ))); + } + check_owner(path, &meta)?; + + if let Some(parent) = path.parent() { + let parent_meta = parent.symlink_metadata()?; + if parent_meta.file_type().is_symlink() { + return Err(io::Error::other(format!( + "import socket directory {} is a symlink; refusing to use it", + parent.display() + ))); + } + check_owner_and_mode(parent, &parent_meta)?; + } + Ok(()) +} + +#[cfg(unix)] +fn check_owner_and_mode(path: &Path, meta: &fs::Metadata) -> io::Result<()> { + check_owner(path, meta)?; + let mode = meta.permissions().mode() & 0o777; + if mode & 0o077 != 0 { + return Err(io::Error::other(format!( + "import socket directory {} has permissions {:o}; expected 0700", + path.display(), + mode + ))); + } + Ok(()) +} + +#[cfg(not(unix))] +fn check_owner_and_mode(_path: &Path, _meta: &fs::Metadata) -> io::Result<()> { + Ok(()) +} + +#[cfg(unix)] +fn check_owner(path: &Path, meta: &fs::Metadata) -> io::Result<()> { + use std::os::unix::fs::MetadataExt as _; + let uid = unsafe { libc::getuid() }; + if meta.uid() != uid { + return Err(io::Error::other(format!( + "{} is owned by uid {} but we are uid {}", + path.display(), + meta.uid(), + uid + ))); + } + Ok(()) +} + +#[cfg(not(unix))] +fn check_owner(_path: &Path, _meta: &fs::Metadata) -> io::Result<()> { + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::atomic::{AtomicU64, Ordering}; + use std::time::{SystemTime, UNIX_EPOCH}; + + fn unique_temp_dir() -> PathBuf { + static COUNTER: AtomicU64 = AtomicU64::new(0); + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + let suffix = COUNTER.fetch_add(1, Ordering::Relaxed); + std::env::temp_dir().join(format!( + "openshell-state-disk-test-{}-{nanos}-{suffix}", + 
std::process::id() + )) + } + + #[test] + fn sandbox_state_paths_places_files_inside_state_dir() { + let paths = SandboxStatePaths::for_state_dir(Path::new("/srv/state/abc")); + assert_eq!( + paths.state_disk, + Path::new("/srv/state/abc/sandbox-state.raw") + ); + assert_eq!( + paths.import_socket, + Path::new("/srv/state/abc/oci-import.sock") + ); + } + + #[test] + fn ensure_state_disk_creates_sparse_file_of_requested_size() { + let dir = unique_temp_dir(); + let disk = dir.join("state.raw"); + ensure_state_disk(&disk, 1024 * 1024).expect("create disk"); + let meta = fs::metadata(&disk).expect("stat disk"); + assert_eq!(meta.len(), 1024 * 1024); + let _ = fs::remove_dir_all(&dir); + } + + #[test] + fn ensure_state_disk_grows_but_does_not_shrink() { + let dir = unique_temp_dir(); + let disk = dir.join("state.raw"); + ensure_state_disk(&disk, 4096).expect("initial"); + ensure_state_disk(&disk, 8192).expect("grow"); + assert_eq!(fs::metadata(&disk).unwrap().len(), 8192); + ensure_state_disk(&disk, 2048).expect("shrink noop"); + assert_eq!(fs::metadata(&disk).unwrap().len(), 8192); + let _ = fs::remove_dir_all(&dir); + } + + #[test] + fn prepare_import_socket_dir_creates_0700_dir_when_absent() { + let base = unique_temp_dir(); + let sock = base.join("oci-import.sock"); + prepare_import_socket_dir(&sock).expect("prepare"); + let meta = fs::metadata(&base).unwrap(); + assert_eq!(meta.permissions().mode() & 0o777, 0o700); + let _ = fs::remove_dir_all(&base); + } + + #[test] + fn prepare_import_socket_dir_removes_stale_socket_file() { + let base = unique_temp_dir(); + fs::create_dir_all(&base).unwrap(); + fs::set_permissions(&base, fs::Permissions::from_mode(0o700)).unwrap(); + let sock = base.join("oci-import.sock"); + fs::write(&sock, b"stale").unwrap(); + + prepare_import_socket_dir(&sock).expect("prepare"); + assert!(!sock.exists(), "stale socket should be removed"); + let _ = fs::remove_dir_all(&base); + } + + #[test] + fn 
prepare_import_socket_dir_rejects_world_writable_dir() { + let base = unique_temp_dir(); + fs::create_dir_all(&base).unwrap(); + fs::set_permissions(&base, fs::Permissions::from_mode(0o755)).unwrap(); + let sock = base.join("oci-import.sock"); + let err = prepare_import_socket_dir(&sock).expect_err("loose dir should be rejected"); + assert!(err.to_string().contains("permissions")); + let _ = fs::remove_dir_all(&base); + } + + #[test] + fn verify_import_socket_path_rejects_symlink() { + let base = unique_temp_dir(); + fs::create_dir_all(&base).unwrap(); + let target = base.join("real.sock"); + fs::write(&target, b"").unwrap(); + let link = base.join("oci-import.sock"); + std::os::unix::fs::symlink(&target, &link).unwrap(); + let err = + verify_import_socket_path(&link).expect_err("symlinked socket should be rejected"); + assert!(err.to_string().contains("symlink")); + let _ = fs::remove_dir_all(&base); + } +} diff --git a/crates/openshell-driver-vm/tests/oci_pipeline_integration.rs b/crates/openshell-driver-vm/tests/oci_pipeline_integration.rs new file mode 100644 index 000000000..30a4e2aa9 --- /dev/null +++ b/crates/openshell-driver-vm/tests/oci_pipeline_integration.rs @@ -0,0 +1,145 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Integration test for the OCI pipeline minus the network step. +//! +//! Builds a synthetic rootfs using the `flatten` module, injects compat files, +//! runs `mksquashfs` to produce a real RO base image, installs it in the +//! cache, and verifies the resulting fs image is non-empty and the cache +//! lookup round-trips. +//! +//! Gated on `mksquashfs` being present in `$PATH`. Run with: +//! 
cargo test -p openshell-driver-vm --tests -- --ignored + +use std::collections::BTreeMap; +use std::fs; +use std::io::Write; +use std::path::PathBuf; + +use openshell_driver_vm::oci::{ + CacheLayout, LaunchMetadata, Platform, compat, + flatten::apply_tar_stream, + fs_image::{BuildOptions, build}, + metadata::ImageConfig, +}; + +fn which(bin: &str) -> Option { + let paths = std::env::var_os("PATH")?; + for dir in std::env::split_paths(&paths) { + let candidate = dir.join(bin); + if candidate.is_file() { + return Some(candidate); + } + } + None +} + +fn build_minimal_tar() -> Vec { + let mut buf = Vec::new(); + { + let mut builder = tar::Builder::new(&mut buf); + + for dir in ["bin/", "etc/", "usr/", "usr/bin/"] { + let mut header = tar::Header::new_gnu(); + header.set_path(dir).unwrap(); + header.set_size(0); + header.set_mode(0o755); + header.set_entry_type(tar::EntryType::Directory); + header.set_cksum(); + builder.append(&header, std::io::empty()).unwrap(); + } + + let mut header = tar::Header::new_gnu(); + header.set_path("bin/sh").unwrap(); + let payload = b"#!/bin/sh\n:\n"; + header.set_size(payload.len() as u64); + header.set_mode(0o755); + header.set_entry_type(tar::EntryType::Regular); + header.set_cksum(); + builder.append(&header, &payload[..]).unwrap(); + + let passwd = b"root:x:0:0:root:/root:/bin/sh\n"; + let mut header = tar::Header::new_gnu(); + header.set_path("etc/passwd").unwrap(); + header.set_size(passwd.len() as u64); + header.set_mode(0o644); + header.set_entry_type(tar::EntryType::Regular); + header.set_cksum(); + builder.append(&header, &passwd[..]).unwrap(); + + builder.finish().unwrap(); + } + buf +} + +#[test] +#[ignore = "requires mksquashfs in $PATH; run with `cargo test -- --ignored`"] +fn full_pipeline_without_network_produces_cached_image() { + let Some(mksquashfs) = which("mksquashfs") else { + eprintln!("mksquashfs not found on PATH; skipping"); + return; + }; + + let work = tempfile::tempdir().unwrap(); + + // 1. 
Flatten a synthetic "image" layer into a staging tree. + let staging = work.path().join("stage"); + fs::create_dir_all(&staging).unwrap(); + let tar_bytes = build_minimal_tar(); + apply_tar_stream(&staging, tar_bytes.as_slice()).unwrap(); + + // 2. Inject OpenShell compat files. + compat::inject(&staging).unwrap(); + assert!(staging.join("sandbox").is_dir()); + assert!(staging.join("tmp").is_dir()); + let passwd = fs::read_to_string(staging.join("etc/passwd")).unwrap(); + assert!(passwd.contains("sandbox:x:10001:10001:")); + + // 3. Build squashfs. + let cache_root = work.path().join("cache"); + let layout = CacheLayout::new(cache_root.clone()); + layout.ensure_dirs().unwrap(); + let built = layout.tmp_dir().join("build.squashfs"); + let opts = BuildOptions::with_binary(mksquashfs); + build(&staging, &built, &opts).expect("mksquashfs build"); + assert!(built.is_file()); + let size = fs::metadata(&built).unwrap().len(); + assert!(size > 0, "squashfs image should be non-empty"); + + // 4. Install + write metadata, then round-trip the lookup. 
+ let digest = "sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + let platform = Platform::host().expect("host platform must be supported"); + + let metadata = LaunchMetadata::build( + ImageConfig { + entrypoint: vec!["/bin/sh".to_string()], + cmd: vec!["-c".to_string(), "true".to_string()], + env: vec!["PATH=/bin".to_string()], + working_dir: "/sandbox".to_string(), + labels: BTreeMap::new(), + stop_signal: String::new(), + }, + &BTreeMap::new(), + &BTreeMap::new(), + ) + .unwrap(); + + let installed = layout.install_fs_image(digest, platform, &built).unwrap(); + layout.write_metadata(digest, platform, &metadata).unwrap(); + assert!(installed.is_file()); + assert!(!built.exists(), "built image should be moved, not copied"); + + let hit = layout + .lookup(digest, platform) + .expect("cache lookup should hit after install"); + assert_eq!(hit.fs_image, installed); + assert_eq!(hit.metadata.argv, metadata.argv); + + // 5. A second install is idempotent (removes + re-moves into the same slot). + let rebuilt = layout.tmp_dir().join("rebuild.squashfs"); + let mut f = fs::File::create(&rebuilt).unwrap(); + f.write_all(&fs::read(&installed).unwrap()).unwrap(); + drop(f); + let reinstalled = layout.install_fs_image(digest, platform, &rebuilt).unwrap(); + assert_eq!(reinstalled, installed); +} diff --git a/crates/openshell-sandbox/src/container_env.rs b/crates/openshell-sandbox/src/container_env.rs new file mode 100644 index 000000000..f4d1c35e4 --- /dev/null +++ b/crates/openshell-sandbox/src/container_env.rs @@ -0,0 +1,248 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Clean-env mode for supervised container processes. +//! +//! When the sandbox is launched as an OCI container (VM driver with a +//! `template.image`), the guest init strips the OCI metadata vars it received +//! 
from the driver and repackages the final merged container env into +//! `OPENSHELL_CONTAINER_ENV_<N>` vars before exec'ing the supervisor. It also +//! sets `OPENSHELL_CONTAINER_MODE=1`. +//! +//! In that mode the supervisor does **not** let its own environ leak to the +//! child process. It starts the child with an empty baseline and applies only +//! a documented allowlist: the container env, provider/proxy/TLS env from +//! policy, `OPENSHELL_SANDBOX=1`, and minimal shell defaults (`HOME`, `PATH`, +//! `TERM`). + +use std::collections::HashMap; +use tokio::process::Command; + +/// Env var that gates clean-env behavior. Set by the guest init when the +/// supervisor is launching an OCI image. +pub(crate) const CONTAINER_MODE_ENV: &str = "OPENSHELL_CONTAINER_MODE"; +/// `OPENSHELL_CONTAINER_ENV_COUNT` — number of container env entries. +pub(crate) const CONTAINER_ENV_COUNT: &str = "OPENSHELL_CONTAINER_ENV_COUNT"; +/// Prefix for `OPENSHELL_CONTAINER_ENV_<N>=KEY=VALUE` entries. +pub(crate) const CONTAINER_ENV_PREFIX: &str = "OPENSHELL_CONTAINER_ENV_"; + +/// Default search PATH for the child when none was supplied by the image. +const DEFAULT_CONTAINER_PATH: &str = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"; + +/// Returns `true` when `OPENSHELL_CONTAINER_MODE=1` is set in the supervisor's +/// own environ. +pub(crate) fn is_container_mode() -> bool { + std::env::var(CONTAINER_MODE_ENV).is_ok_and(|v| v == "1") +} + +/// Read container env entries packed as `OPENSHELL_CONTAINER_ENV_<N>=KEY=VAL` +/// and return them as an ordered `(key, value)` list. Later entries win if the +/// same key is repeated, matching the merge order produced by the host. +/// +/// Unparseable entries are skipped; they should have been validated upstream.
+pub(crate) fn read_container_env() -> Vec<(String, String)> { + let count: usize = std::env::var(CONTAINER_ENV_COUNT) + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(0); + + let mut out = Vec::with_capacity(count); + for i in 0..count { + let Ok(raw) = std::env::var(format!("{CONTAINER_ENV_PREFIX}{i}")) else { + continue; + }; + let Some((key, value)) = raw.split_once('=') else { + continue; + }; + if !key.is_empty() { + out.push((key.to_string(), value.to_string())); + } + } + out +} + +/// Clear the command's inherited environ and apply a clean baseline suitable +/// for container-mode execution. +/// +/// Adds (in this order so later values win on conflict): +/// 1. Minimal shell defaults (`HOME=/sandbox`, `PATH=`, `TERM=xterm`). +/// 2. Entries from [`read_container_env`] (the OCI image env + template/spec +/// overrides). +/// 3. `OPENSHELL_SANDBOX=1` marker (always set, even if the image tried to +/// override it). +/// +/// Callers layer provider env, proxy env, and TLS env *after* this call; that +/// order matches the pre-existing non-container flow. +pub(crate) fn apply_clean_container_baseline(cmd: &mut Command) { + cmd.env_clear(); + cmd.env("HOME", "/sandbox"); + cmd.env("PATH", DEFAULT_CONTAINER_PATH); + cmd.env("TERM", "xterm"); + for (key, value) in read_container_env() { + cmd.env(key, value); + } + // OPENSHELL_SANDBOX is a documented marker for programs inside the + // sandbox. Apply after container env so images cannot disable it. + cmd.env("OPENSHELL_SANDBOX", "1"); +} + +/// Parse a `KEY=VALUE` string, or `None` if it is missing an `=`. +#[cfg(test)] +pub(crate) fn parse_kv(raw: &str) -> Option<(String, String)> { + let (key, value) = raw.split_once('=')?; + if key.is_empty() { + return None; + } + Some((key.to_string(), value.to_string())) +} + +/// Build a `HashMap` of the env vars currently set on `cmd`, for testing. 
+#[cfg(test)] +pub(crate) fn command_env_snapshot(cmd: &Command) -> HashMap { + cmd.as_std() + .get_envs() + .filter_map(|(k, v)| { + let key = k.to_str()?.to_string(); + let value = v?.to_str()?.to_string(); + Some((key, value)) + }) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::Mutex; + + // Tests touch process-wide env vars; serialize them to avoid races. + static ENV_LOCK: Mutex<()> = Mutex::new(()); + + struct EnvGuard { + keys: Vec, + } + + impl EnvGuard { + fn new() -> Self { + Self { keys: Vec::new() } + } + + fn set(&mut self, key: &str, value: &str) { + self.keys.push(key.to_string()); + // SAFETY: guarded by ENV_LOCK. + #[allow(unsafe_code)] + unsafe { + std::env::set_var(key, value); + } + } + } + + impl Drop for EnvGuard { + fn drop(&mut self) { + #[allow(unsafe_code)] + unsafe { + for key in &self.keys { + std::env::remove_var(key); + } + } + } + } + + #[test] + fn is_container_mode_matches_only_when_env_is_one() { + let _lock = ENV_LOCK.lock().unwrap(); + let mut guard = EnvGuard::new(); + assert!(!is_container_mode(), "default should be off"); + guard.set(CONTAINER_MODE_ENV, "0"); + assert!(!is_container_mode()); + guard.set(CONTAINER_MODE_ENV, "1"); + assert!(is_container_mode()); + } + + #[test] + fn read_container_env_decodes_ordered_pairs() { + let _lock = ENV_LOCK.lock().unwrap(); + let mut guard = EnvGuard::new(); + guard.set(CONTAINER_ENV_COUNT, "3"); + guard.set(&format!("{CONTAINER_ENV_PREFIX}0"), "A=1"); + guard.set(&format!("{CONTAINER_ENV_PREFIX}1"), "B=2"); + guard.set(&format!("{CONTAINER_ENV_PREFIX}2"), "PATH=/custom/bin"); + + let entries = read_container_env(); + assert_eq!( + entries, + vec![ + ("A".to_string(), "1".to_string()), + ("B".to_string(), "2".to_string()), + ("PATH".to_string(), "/custom/bin".to_string()), + ] + ); + } + + #[test] + fn read_container_env_skips_malformed_or_missing_entries() { + let _lock = ENV_LOCK.lock().unwrap(); + let mut guard = EnvGuard::new(); + 
guard.set(CONTAINER_ENV_COUNT, "3"); + guard.set(&format!("{CONTAINER_ENV_PREFIX}0"), "A=1"); + // index 1 is missing + guard.set(&format!("{CONTAINER_ENV_PREFIX}2"), "no-equals-sign"); + + let entries = read_container_env(); + assert_eq!(entries, vec![("A".to_string(), "1".to_string())]); + } + + #[tokio::test] + async fn apply_clean_baseline_clears_existing_env_and_seeds_defaults() { + let _lock = ENV_LOCK.lock().unwrap(); + let mut guard = EnvGuard::new(); + guard.set(CONTAINER_ENV_COUNT, "1"); + guard.set(&format!("{CONTAINER_ENV_PREFIX}0"), "FROM_IMAGE=yes"); + + let mut cmd = Command::new("/usr/bin/true"); + cmd.env("LEAKED_FROM_PARENT", "should-be-cleared"); + cmd.env("OPENSHELL_CONTROL_SECRET", "must-not-leak"); + apply_clean_container_baseline(&mut cmd); + + let env = command_env_snapshot(&cmd); + assert_eq!(env.get("HOME"), Some(&"/sandbox".to_string())); + assert_eq!(env.get("TERM"), Some(&"xterm".to_string())); + assert_eq!(env.get("FROM_IMAGE"), Some(&"yes".to_string())); + assert_eq!(env.get("OPENSHELL_SANDBOX"), Some(&"1".to_string())); + assert!( + !env.contains_key("LEAKED_FROM_PARENT"), + "pre-existing env must be cleared before baseline" + ); + assert!( + !env.contains_key("OPENSHELL_CONTROL_SECRET"), + "control-plane env must not leak" + ); + } + + #[tokio::test] + async fn container_env_cannot_override_openshell_sandbox_marker() { + let _lock = ENV_LOCK.lock().unwrap(); + let mut guard = EnvGuard::new(); + guard.set(CONTAINER_ENV_COUNT, "1"); + guard.set( + &format!("{CONTAINER_ENV_PREFIX}0"), + "OPENSHELL_SANDBOX=hijacked", + ); + + let mut cmd = Command::new("/usr/bin/true"); + apply_clean_container_baseline(&mut cmd); + + let env = command_env_snapshot(&cmd); + assert_eq!(env.get("OPENSHELL_SANDBOX"), Some(&"1".to_string())); + } + + #[test] + fn parse_kv_splits_on_first_equals() { + assert_eq!( + parse_kv("A=hello=world"), + Some(("A".to_string(), "hello=world".to_string())) + ); + assert_eq!(parse_kv("A="), Some(("A".to_string(), 
String::new()))); + assert!(parse_kv("no-equals").is_none()); + assert!(parse_kv("=value").is_none()); + } +} diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 1fbbe90d4..d6493d632 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -7,6 +7,7 @@ pub mod bypass_monitor; mod child_env; +mod container_env; pub mod denial_aggregator; mod grpc_client; mod identity; diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index 85a57b4e7..1e98a4055 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -4,6 +4,7 @@ //! Process management and signal handling. use crate::child_env; +use crate::container_env; use crate::policy::{NetworkMode, SandboxPolicy}; use crate::sandbox; #[cfg(target_os = "linux")] @@ -158,10 +159,14 @@ impl ProcessHandle { .stdin(Stdio::inherit()) .stdout(Stdio::inherit()) .stderr(Stdio::inherit()) - .kill_on_drop(true) - .env("OPENSHELL_SANDBOX", "1"); + .kill_on_drop(true); - scrub_sensitive_env(&mut cmd); + if container_env::is_container_mode() { + container_env::apply_clean_container_baseline(&mut cmd); + } else { + cmd.env("OPENSHELL_SANDBOX", "1"); + scrub_sensitive_env(&mut cmd); + } inject_provider_env(&mut cmd, provider_env); if let Some(dir) = workdir { @@ -285,10 +290,14 @@ impl ProcessHandle { .stdin(Stdio::inherit()) .stdout(Stdio::inherit()) .stderr(Stdio::inherit()) - .kill_on_drop(true) - .env("OPENSHELL_SANDBOX", "1"); + .kill_on_drop(true); - scrub_sensitive_env(&mut cmd); + if container_env::is_container_mode() { + container_env::apply_clean_container_baseline(&mut cmd); + } else { + cmd.env("OPENSHELL_SANDBOX", "1"); + scrub_sensitive_env(&mut cmd); + } inject_provider_env(&mut cmd, provider_env); if let Some(dir) = workdir { diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index ba9425036..9970eea6d 100644 --- 
a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -161,6 +161,13 @@ struct Args { #[arg(long, env = "OPENSHELL_VM_TLS_KEY")] vm_tls_key: Option, + /// Path to the `mksquashfs` binary used by the VM driver's OCI pipeline. + /// Required for OCI-image sandboxes on the VM driver. When unset, the + /// gateway does not pass `--mksquashfs-bin` and the driver falls back to + /// the `OPENSHELL_VM_MKSQUASHFS` env var inherited from this process. + #[arg(long, env = "OPENSHELL_VM_MKSQUASHFS")] + vm_mksquashfs_bin: Option, + /// Disable TLS entirely — listen on plaintext HTTP. /// Use this when the gateway sits behind a reverse proxy or tunnel /// (e.g. Cloudflare Tunnel) that terminates TLS at the edge. @@ -269,6 +276,7 @@ async fn run_from_args(args: Args) -> Result<()> { guest_tls_ca: args.vm_tls_ca, guest_tls_cert: args.vm_tls_cert, guest_tls_key: args.vm_tls_key, + mksquashfs_bin: args.vm_mksquashfs_bin, }; if args.disable_tls { diff --git a/crates/openshell-server/src/compute/vm.rs b/crates/openshell-server/src/compute/vm.rs index d0f397b01..24d624ad6 100644 --- a/crates/openshell-server/src/compute/vm.rs +++ b/crates/openshell-server/src/compute/vm.rs @@ -78,6 +78,12 @@ pub struct VmComputeConfig { /// Host-side private key for the guest's mTLS client bundle. pub guest_tls_key: Option, + + /// Optional path to the `mksquashfs` binary used by the VM driver's OCI + /// pipeline to build read-only base images. When `None`, the gateway does + /// not pass `--mksquashfs-bin`; the driver falls back to the + /// `OPENSHELL_VM_MKSQUASHFS` env var inherited from the gateway process. 
+ pub mksquashfs_bin: Option, } impl VmComputeConfig { @@ -117,6 +123,7 @@ impl Default for VmComputeConfig { guest_tls_ca: None, guest_tls_cert: None, guest_tls_key: None, + mksquashfs_bin: None, } } } @@ -209,6 +216,73 @@ pub(crate) fn compute_driver_guest_tls_paths( Ok(Some(VmGuestTlsPaths { ca, cert, key })) } +/// Build the argv the gateway passes to the `openshell-driver-vm` subprocess. +/// +/// Factored out of [`spawn`] so it can be unit-tested without actually +/// launching the driver. `socket_path` is the UDS the driver will listen on; +/// `guest_tls_paths` is the resolved output of [`compute_driver_guest_tls_paths`]. +/// +/// The returned vector excludes `argv[0]` — callers append it to a `Command` +/// that was already constructed with the driver binary path. +#[cfg(unix)] +pub(crate) fn build_driver_argv( + config: &Config, + vm_config: &VmComputeConfig, + socket_path: &std::path::Path, + guest_tls_paths: Option<&VmGuestTlsPaths>, +) -> Vec { + use std::ffi::OsString; + fn push_pair(argv: &mut Vec, flag: &str, value: &str) { + argv.push(OsString::from(flag)); + argv.push(OsString::from(value)); + } + + let mut argv: Vec = Vec::new(); + argv.push(OsString::from("--bind-socket")); + argv.push(socket_path.as_os_str().to_os_string()); + push_pair(&mut argv, "--log-level", &config.log_level); + push_pair(&mut argv, "--openshell-endpoint", &config.grpc_endpoint); + argv.push(OsString::from("--state-dir")); + argv.push(vm_config.state_dir.as_os_str().to_os_string()); + push_pair( + &mut argv, + "--ssh-handshake-secret", + &config.ssh_handshake_secret, + ); + push_pair( + &mut argv, + "--ssh-handshake-skew-secs", + &config.ssh_handshake_skew_secs.to_string(), + ); + push_pair( + &mut argv, + "--krun-log-level", + &vm_config.krun_log_level.to_string(), + ); + push_pair(&mut argv, "--vcpus", &vm_config.vcpus.to_string()); + push_pair(&mut argv, "--mem-mib", &vm_config.mem_mib.to_string()); + if let Some(tls) = guest_tls_paths { + 
argv.push(OsString::from("--guest-tls-ca")); + argv.push(tls.ca.as_os_str().to_os_string()); + argv.push(OsString::from("--guest-tls-cert")); + argv.push(tls.cert.as_os_str().to_os_string()); + argv.push(OsString::from("--guest-tls-key")); + argv.push(tls.key.as_os_str().to_os_string()); + } + // Plumb the gateway-configured sandbox image through to the VM driver so + // `GetCapabilities.default_image` matches the gateway's configuration. + // Empty string is a valid value meaning "no default"; the flag always has + // a default of "" on the driver side so we pass it unconditionally. + push_pair(&mut argv, "--default-image", &config.sandbox_image); + // Pass an explicit mksquashfs path when the operator configured one so + // OCI sandboxes work without relying on env inheritance. + if let Some(mksquashfs) = vm_config.mksquashfs_bin.as_ref() { + argv.push(OsString::from("--mksquashfs-bin")); + argv.push(mksquashfs.as_os_str().to_os_string()); + } + argv +} + /// Launch the VM compute-driver subprocess, wait for its UDS to come up, /// and return a gRPC `Channel` connected to it plus a process handle that /// kills the subprocess and removes the socket on drop. 
@@ -250,27 +324,8 @@ pub(crate) async fn spawn( command.stdin(Stdio::null()); command.stdout(Stdio::inherit()); command.stderr(Stdio::inherit()); - command.arg("--bind-socket").arg(&socket_path); - command.arg("--log-level").arg(&config.log_level); - command - .arg("--openshell-endpoint") - .arg(&config.grpc_endpoint); - command.arg("--state-dir").arg(&vm_config.state_dir); - command - .arg("--ssh-handshake-secret") - .arg(&config.ssh_handshake_secret); - command - .arg("--ssh-handshake-skew-secs") - .arg(config.ssh_handshake_skew_secs.to_string()); - command - .arg("--krun-log-level") - .arg(vm_config.krun_log_level.to_string()); - command.arg("--vcpus").arg(vm_config.vcpus.to_string()); - command.arg("--mem-mib").arg(vm_config.mem_mib.to_string()); - if let Some(tls) = guest_tls_paths { - command.arg("--guest-tls-ca").arg(tls.ca); - command.arg("--guest-tls-cert").arg(tls.cert); - command.arg("--guest-tls-key").arg(tls.key); + for arg in build_driver_argv(config, vm_config, &socket_path, guest_tls_paths.as_ref()) { + command.arg(arg); } let mut child = command.spawn().map_err(|e| { @@ -353,10 +408,23 @@ async fn connect_compute_driver(socket_path: &std::path::Path) -> Result bool { + argv.windows(2) + .any(|pair| pair[0] == OsString::from(flag) && pair[1] == OsString::from(value)) + } + + fn argv_contains_flag(argv: &[OsString], flag: &str) -> bool { + argv.iter().any(|arg| arg == &OsString::from(flag)) + } + #[test] fn vm_compute_driver_tls_requires_explicit_guest_bundle() { let dir = tempdir().unwrap(); @@ -426,4 +494,106 @@ mod tests { assert_ne!(guest_paths.cert, server_cert); assert_ne!(guest_paths.key, server_key); } + + #[test] + fn build_driver_argv_passes_configured_sandbox_image_as_default_image() { + let config = Config::new(None) + .with_grpc_endpoint("http://127.0.0.1:8080") + .with_sandbox_image("docker.io/library/alpine:3.20"); + let vm_config = VmComputeConfig::default(); + let socket = PathBuf::from("/tmp/drv.sock"); + + let argv = 
build_driver_argv(&config, &vm_config, &socket, None); + + assert!( + argv_contains_pair(&argv, "--default-image", "docker.io/library/alpine:3.20"), + "expected --default-image to be plumbed from sandbox_image: {argv:?}" + ); + } + + #[test] + fn build_driver_argv_passes_empty_default_image_when_gateway_has_no_sandbox_image() { + // sandbox_image defaults to "" — the driver treats that as "no default" + // and falls back to the legacy non-OCI supervisor boot. We still want + // the flag present so the driver's value cannot diverge from the + // gateway's intent silently. + let config = Config::new(None).with_grpc_endpoint("http://127.0.0.1:8080"); + let vm_config = VmComputeConfig::default(); + let socket = PathBuf::from("/tmp/drv.sock"); + + let argv = build_driver_argv(&config, &vm_config, &socket, None); + + assert!( + argv_contains_pair(&argv, "--default-image", ""), + "expected --default-image '' to be passed explicitly: {argv:?}" + ); + } + + #[test] + fn build_driver_argv_passes_mksquashfs_bin_when_configured() { + let config = Config::new(None).with_grpc_endpoint("http://127.0.0.1:8080"); + let vm_config = VmComputeConfig { + mksquashfs_bin: Some(PathBuf::from("/usr/local/bin/mksquashfs")), + ..Default::default() + }; + let socket = PathBuf::from("/tmp/drv.sock"); + + let argv = build_driver_argv(&config, &vm_config, &socket, None); + + assert!( + argv_contains_pair(&argv, "--mksquashfs-bin", "/usr/local/bin/mksquashfs"), + "expected --mksquashfs-bin flag: {argv:?}" + ); + } + + #[test] + fn build_driver_argv_omits_mksquashfs_bin_when_unconfigured() { + let config = Config::new(None).with_grpc_endpoint("http://127.0.0.1:8080"); + let vm_config = VmComputeConfig::default(); + let socket = PathBuf::from("/tmp/drv.sock"); + + let argv = build_driver_argv(&config, &vm_config, &socket, None); + + assert!( + !argv_contains_flag(&argv, "--mksquashfs-bin"), + "--mksquashfs-bin should be absent when vm_config.mksquashfs_bin is None: {argv:?}" + ); + } + + #[test] + 
fn build_driver_argv_passes_guest_tls_triplet_when_present() { + let config = Config::new(None).with_grpc_endpoint("https://gateway.internal:8443"); + let vm_config = VmComputeConfig::default(); + let tls = VmGuestTlsPaths { + ca: PathBuf::from("/tls/ca.crt"), + cert: PathBuf::from("/tls/tls.crt"), + key: PathBuf::from("/tls/tls.key"), + }; + let socket = PathBuf::from("/tmp/drv.sock"); + + let argv = build_driver_argv(&config, &vm_config, &socket, Some(&tls)); + + assert!(argv_contains_pair(&argv, "--guest-tls-ca", "/tls/ca.crt")); + assert!(argv_contains_pair( + &argv, + "--guest-tls-cert", + "/tls/tls.crt" + )); + assert!(argv_contains_pair(&argv, "--guest-tls-key", "/tls/tls.key")); + } + + #[test] + fn build_driver_argv_socket_flag_points_at_provided_path() { + let config = Config::new(None).with_grpc_endpoint("http://127.0.0.1:8080"); + let vm_config = VmComputeConfig::default(); + let socket = Path::new("/var/run/openshell/driver.sock"); + + let argv = build_driver_argv(&config, &vm_config, socket, None); + + assert!(argv_contains_pair( + &argv, + "--bind-socket", + "/var/run/openshell/driver.sock" + )); + } }