From d39bc2b3c5a4ceab0ee649910414f3c8cfe5fde2 Mon Sep 17 00:00:00 2001
From: bilby91 <mfernandez@crunchloop.io>
Date: Tue, 23 Jun 2026 14:02:46 -0300
Subject: [PATCH 1/3] compose: orchestrator-driven health probing on Podman
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Podman runs a container's HEALTHCHECK as root and fires the first probe
immediately at start (it ignores start_period for the initial run, and
exposes no per-restore flag to defer it — confirmed on 5.7.0). For
privilege-dropping images this breaks the service: rabbitmq's
`rabbitmq-diagnostics` probe, run as root, creates a root-owned
/var/lib/rabbitmq/.erlang.cookie that the gosu-dropped uid-999 server
then can't read (eacces), so rabbitmq exits 1. Docker is unaffected
because it defers the first probe past the interval.

Have the compose orchestrator probe health itself on backends that opt
in via the new selfHealthProber interface (Podman returns true; Docker
and Apple unchanged). When set: omit RunSpec.HealthCheck so the backend
runs no native healthcheck, and evaluate service_healthy gates by
exec'ing the compose test command in waitFor — which defers the first
probe until after the service has initialized, matching Docker.

See design/compose-native-health.md (incl. the two-path divergence and
the out-of-scope rootless-gosu issue).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 compose/orchestrator.go             | 168 ++++++++++++++++++++++------
 compose/orchestrator_health_test.go | 117 +++++++++++++++++++
 compose/orchestrator_test.go        |   4 +
 design/compose-native-health.md     | 166 +++++++++++++++++++++++++++
 runtime/podman/podman.go            |  12 ++
 5 files changed, 434 insertions(+), 33 deletions(-)
 create mode 100644 compose/orchestrator_health_test.go
 create mode 100644 design/compose-native-health.md

diff --git a/compose/orchestrator.go b/compose/orchestrator.go
index 94c82ab..f9b5916 100644
--- a/compose/orchestrator.go
+++ b/compose/orchestrator.go
@@ -31,6 +31,7 @@ import (
 	"errors"
 	"fmt"
 	"sort"
+	"strings"
 	"sync"
 	"time"
 
@@ -82,15 +83,36 @@ type Orchestrator struct {
 	// PollInterval is the cadence for health polling. Tests
 	// override; production default below.
 	PollInterval time.Duration
+
+	// selfProbe is true when the backend asks the orchestrator to run
+	// healthcheck probes itself (via Exec) instead of configuring the
+	// backend's native HEALTHCHECK — see selfHealthProber and
+	// design/compose-native-health.md. Computed once at construction.
+	selfProbe bool
+}
+
+// selfHealthProber is an optional runtime capability: a backend that
+// implements it and returns true asks the orchestrator to drive
+// healthcheck probes itself rather than relying on the backend's native
+// HEALTHCHECK. Podman implements it — its native healthcheck runs eagerly
+// as root and races privilege-dropping images. Docker and Apple do not,
+// keeping their existing behavior.
+type selfHealthProber interface {
+	PreferSelfProbedHealth() bool
 }
 
 // NewOrchestrator constructs an Orchestrator with sane defaults.
 func NewOrchestrator(rt runtime.Runtime, backendName string) *Orchestrator {
+	selfProbe := false
+	if p, ok := rt.(selfHealthProber); ok {
+		selfProbe = p.PreferSelfProbedHealth()
+	}
 	return &Orchestrator{
 		rt:            rt,
 		BackendName:   backendName,
 		HealthTimeout: DefaultHealthTimeout,
 		PollInterval:  500 * time.Millisecond,
+		selfProbe:     selfProbe,
 	}
 }
 
@@ -481,6 +503,12 @@ func (o *Orchestrator) ensureService(
 	}
 
 	spec := serviceToRunSpec(plan, svc, projectLabels, hash, imageDigest)
+	if o.selfProbe {
+		// The orchestrator probes health itself (see waitFor); never
+		// configure the backend's native HEALTHCHECK, whose eager
+		// root probe breaks privilege-dropping images on Podman.
+		spec.HealthCheck = nil
+	}
 	c, err := o.rt.RunContainer(ctx, spec)
 	if err != nil {
 		// Compose's `up -d` pulls missing images implicitly. Mirror
@@ -582,7 +610,11 @@ func (o *Orchestrator) gateLevel(
 		if cid == "" {
 			continue
 		}
-		if err := o.waitFor(ctx, svcName, cid, req.condition, deadline); err != nil {
+		var hc *runtime.HealthCheckSpec
+		if svc, ok := plan.Project.Services[svcName]; ok {
+			hc = healthCheckOf(svc.HealthCheck)
+		}
+		if err := o.waitFor(ctx, svcName, cid, req.condition, hc, deadline); err != nil {
 			if req.optional {
 				// Per compose spec: a non-required dependency that
 				// fails to satisfy its condition does not block the
@@ -596,46 +628,63 @@ func (o *Orchestrator) gateLevel(
 	return nil
 }
 
-// waitFor polls a service's condition until satisfied or deadline.
+// waitFor polls a service's condition until satisfied or deadline. When
+// o.selfProbe is set, a service_healthy gate is evaluated by executing the
+// service's healthcheck command (hc) via Exec rather than reading the
+// backend's native health status — see selfHealthProber and
+// design/compose-native-health.md. The completion condition always reads
+// container state (no native healthcheck required either way).
 func (o *Orchestrator) waitFor(
-	ctx context.Context, svc, id, cond string, deadline time.Time,
+	ctx context.Context, svc, id, cond string, hc *runtime.HealthCheckSpec, deadline time.Time,
 ) error {
+	// In self-probe mode, honor start_period as a grace delay before the
+	// first probe so an eager probe can't run before the service inits.
+	probeNotBefore := time.Now()
+	if o.selfProbe && hc != nil {
+		probeNotBefore = probeNotBefore.Add(hc.StartPeriod)
+	}
 	for {
 		if err := ctx.Err(); err != nil {
 			return err
 		}
-		details, err := o.rt.InspectContainer(ctx, id)
-		if err == nil && details != nil {
-			switch cond {
-			case "service_healthy":
-				// Treat HealthNone as satisfied: a container with
-				// no HEALTHCHECK directive can still be a
-				// service_healthy gate target (compose's behavior),
-				// so falling back to State=Running keeps that case
-				// working. For services that DO declare a
-				// healthcheck, require Healthy explicitly.
-				switch details.Health {
-				case runtime.HealthHealthy:
-					return nil
-				case runtime.HealthNone:
-					if details.State == runtime.StateRunning {
+		if o.selfProbe && cond == "service_healthy" {
+			if o.probeHealthy(ctx, id, hc, probeNotBefore) {
+				return nil
+			}
+		} else {
+			details, err := o.rt.InspectContainer(ctx, id)
+			if err == nil && details != nil {
+				switch cond {
+				case "service_healthy":
+					// Treat HealthNone as satisfied: a container with
+					// no HEALTHCHECK directive can still be a
+					// service_healthy gate target (compose's behavior),
+					// so falling back to State=Running keeps that case
+					// working. For services that DO declare a
+					// healthcheck, require Healthy explicitly.
+					switch details.Health {
+					case runtime.HealthHealthy:
 						return nil
+					case runtime.HealthNone:
+						if details.State == runtime.StateRunning {
+							return nil
+						}
+					case runtime.HealthUnhealthy:
+						return fmt.Errorf(
+							"compose: service %q reported unhealthy while waiting on service_healthy",
+							svc,
+						)
+					}
+				case "service_completed_successfully":
+					if details.State == runtime.StateExited && details.ExitCode == 0 {
+						return nil
+					}
+					if details.State == runtime.StateExited && details.ExitCode != 0 {
+						return fmt.Errorf(
+							"compose: %s exited with code %d while waiting for completion",
+							svc, details.ExitCode,
+						)
 					}
-				case runtime.HealthUnhealthy:
-					return fmt.Errorf(
-						"compose: service %q reported unhealthy while waiting on service_healthy",
-						svc,
-					)
-				}
-			case "service_completed_successfully":
-				if details.State == runtime.StateExited && details.ExitCode == 0 {
-					return nil
-				}
-				if details.State == runtime.StateExited && details.ExitCode != 0 {
-					return fmt.Errorf(
-						"compose: %s exited with code %d while waiting for completion",
-						svc, details.ExitCode,
-					)
 				}
 			}
 		}
@@ -654,6 +703,59 @@ func (o *Orchestrator) waitFor(
 	}
 }
 
+// probeHealthy runs the service's compose healthcheck once via Exec and
+// reports whether it currently passes. A nil/disabled/NONE/empty
+// healthcheck falls back to "is the container running?", mirroring the
+// native path's HealthNone behavior. During the start_period grace
+// (before notBefore) it reports not-healthy without probing, so the first
+// probe is deferred until the service has had time to initialize.
+func (o *Orchestrator) probeHealthy(
+	ctx context.Context, id string, hc *runtime.HealthCheckSpec, notBefore time.Time,
+) bool {
+	cmd := healthProbeCmd(hc)
+	if cmd == nil {
+		details, err := o.rt.InspectContainer(ctx, id)
+		return err == nil && details != nil && details.State == runtime.StateRunning
+	}
+	if time.Now().Before(notBefore) {
+		return false
+	}
+	probeCtx := ctx
+	if hc.Timeout > 0 {
+		var cancel context.CancelFunc
+		probeCtx, cancel = context.WithTimeout(ctx, hc.Timeout)
+		defer cancel()
+	}
+	res, err := o.rt.ExecContainer(probeCtx, id, runtime.ExecOptions{Cmd: cmd})
+	return err == nil && res.ExitCode == 0
+}
+
+// healthProbeCmd converts a compose-normalized healthcheck Test into an
+// Exec command, or nil for NONE / disabled / empty. Compose normalizes
+// Test to a CMD / CMD-SHELL / NONE leading token; the default branch
+// shell-runs a bare-string form defensively.
+func healthProbeCmd(hc *runtime.HealthCheckSpec) []string {
+	if hc == nil || hc.Disable || len(hc.Test) == 0 {
+		return nil
+	}
+	switch hc.Test[0] {
+	case "NONE":
+		return nil
+	case "CMD":
+		if len(hc.Test) < 2 {
+			return nil
+		}
+		return append([]string(nil), hc.Test[1:]...)
+	case "CMD-SHELL":
+		if len(hc.Test) < 2 {
+			return nil
+		}
+		return []string{"/bin/sh", "-c", hc.Test[1]}
+	default:
+		return []string{"/bin/sh", "-c", strings.Join(hc.Test, " ")}
+	}
+}
+
 // serviceToRunSpec is the in-memory translation from compose's
 // ServiceConfig to runtime.RunSpec. This is intentionally minimal
 // for C6 — env / labels / mounts / command / entrypoint / user /
diff --git a/compose/orchestrator_health_test.go b/compose/orchestrator_health_test.go
new file mode 100644
index 0000000..f70e84c
--- /dev/null
+++ b/compose/orchestrator_health_test.go
@@ -0,0 +1,117 @@
+package compose
+
+import (
+	"context"
+	"errors"
+	"reflect"
+	"testing"
+	"time"
+
+	"github.com/crunchloop/devcontainer/runtime"
+)
+
+func TestHealthProbeCmd(t *testing.T) {
+	tests := []struct {
+		name string
+		hc   *runtime.HealthCheckSpec
+		want []string
+	}{
+		{"nil", nil, nil},
+		{"disabled", &runtime.HealthCheckSpec{Test: []string{"CMD", "true"}, Disable: true}, nil},
+		{"empty", &runtime.HealthCheckSpec{Test: nil}, nil},
+		{"none", &runtime.HealthCheckSpec{Test: []string{"NONE"}}, nil},
+		{"cmd", &runtime.HealthCheckSpec{Test: []string{"CMD", "rabbitmq-diagnostics", "-q", "ping"}}, []string{"rabbitmq-diagnostics", "-q", "ping"}},
+		{"cmd-no-args", &runtime.HealthCheckSpec{Test: []string{"CMD"}}, nil},
+		{"cmd-shell", &runtime.HealthCheckSpec{Test: []string{"CMD-SHELL", "pg_isready -U postgres"}}, []string{"/bin/sh", "-c", "pg_isready -U postgres"}},
+		{"bare-string-defensive", &runtime.HealthCheckSpec{Test: []string{"curl", "-f", "localhost"}}, []string{"/bin/sh", "-c", "curl -f localhost"}},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := healthProbeCmd(tt.hc)
+			if !reflect.DeepEqual(got, tt.want) {
+				t.Fatalf("healthProbeCmd(%v) = %v, want %v", tt.hc, got, tt.want)
+			}
+		})
+	}
+}
+
+// selfProbingMock is a mockRuntime that opts into orchestrator-driven
+// health probing, exercising the selfHealthProber detection.
+type selfProbingMock struct{ *mockRuntime }
+
+func (selfProbingMock) PreferSelfProbedHealth() bool { return true }
+
+func TestNewOrchestratorDetectsSelfProbe(t *testing.T) {
+	if NewOrchestrator(newMockRuntime(), "docker").selfProbe {
+		t.Error("docker mock (no selfHealthProber) should not self-probe")
+	}
+	if !NewOrchestrator(selfProbingMock{newMockRuntime()}, "podman").selfProbe {
+		t.Error("podman-like mock should self-probe")
+	}
+}
+
+func TestWaitForSelfProbeHealthy(t *testing.T) {
+	rt := newMockRuntime()
+	var execed int
+	rt.OnExec = func(id string, opts runtime.ExecOptions) (runtime.ExecResult, error) {
+		execed++
+		// Succeed only on the 2nd probe — proves we poll the command,
+		// not the native health status.
+		if execed >= 2 {
+			return runtime.ExecResult{ExitCode: 0}, nil
+		}
+		return runtime.ExecResult{ExitCode: 1}, nil
+	}
+	orch := NewOrchestrator(selfProbingMock{rt}, "podman")
+	orch.PollInterval = time.Millisecond
+
+	hc := &runtime.HealthCheckSpec{Test: []string{"CMD", "rabbitmq-diagnostics", "-q", "ping"}}
+	err := orch.waitFor(context.Background(), "rabbitmq", "c1", "service_healthy", hc, time.Now().Add(2*time.Second))
+	if err != nil {
+		t.Fatalf("waitFor = %v, want nil (probe should pass on 2nd try)", err)
+	}
+	if execed < 2 {
+		t.Fatalf("expected >= 2 probe execs, got %d", execed)
+	}
+}
+
+func TestWaitForSelfProbeTimeout(t *testing.T) {
+	rt := newMockRuntime()
+	rt.OnExec = func(id string, opts runtime.ExecOptions) (runtime.ExecResult, error) {
+		return runtime.ExecResult{ExitCode: 1}, nil // never healthy
+	}
+	orch := NewOrchestrator(selfProbingMock{rt}, "podman")
+	orch.PollInterval = time.Millisecond
+	orch.HealthTimeout = 30 * time.Millisecond
+
+	hc := &runtime.HealthCheckSpec{Test: []string{"CMD", "false"}}
+	err := orch.waitFor(context.Background(), "svc", "c1", "service_healthy", hc, time.Now().Add(30*time.Millisecond))
+	var hte *HealthTimeoutError
+	if !errors.As(err, &hte) {
+		t.Fatalf("waitFor = %v, want *HealthTimeoutError", err)
+	}
+}
+
+// A service with no/NONE healthcheck, self-probe mode: gate is satisfied
+// by the container merely running (mirrors native HealthNone behavior).
+func TestWaitForSelfProbeNoHealthcheckRunning(t *testing.T) {
+	rt := newMockRuntime()
+	c, err := rt.RunContainer(context.Background(), runtime.RunSpec{Name: "svc"})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := rt.StartContainer(context.Background(), c.ID); err != nil {
+		t.Fatal(err)
+	}
+	rt.OnExec = func(id string, opts runtime.ExecOptions) (runtime.ExecResult, error) {
+		t.Fatal("should not exec a probe when there is no healthcheck")
+		return runtime.ExecResult{}, nil
+	}
+	orch := NewOrchestrator(selfProbingMock{rt}, "podman")
+	orch.PollInterval = time.Millisecond
+
+	err = orch.waitFor(context.Background(), "svc", c.ID, "service_healthy", nil, time.Now().Add(time.Second))
+	if err != nil {
+		t.Fatalf("waitFor = %v, want nil (running container, no healthcheck)", err)
+	}
+}
diff --git a/compose/orchestrator_test.go b/compose/orchestrator_test.go
index 1593c66..4e1321e 100644
--- a/compose/orchestrator_test.go
+++ b/compose/orchestrator_test.go
@@ -45,6 +45,7 @@ type mockRuntime struct {
 	// Hooks (set to override default behavior).
 	OnRunContainer func(spec runtime.RunSpec) (*runtime.Container, error)
 	OnInspect      func(id string, base *runtime.ContainerDetails) *runtime.ContainerDetails
+	OnExec         func(id string, opts runtime.ExecOptions) (runtime.ExecResult, error)
 
 	// imageDigest, when non-empty, is the digest InspectImage
 	// returns for every reference. Tests that exercise digest-drift
@@ -139,6 +140,9 @@ func (m *mockRuntime) RemoveContainer(ctx context.Context, id string, opts runti
 }
 
 func (m *mockRuntime) ExecContainer(ctx context.Context, id string, opts runtime.ExecOptions) (runtime.ExecResult, error) {
+	if m.OnExec != nil {
+		return m.OnExec(id, opts)
+	}
 	return runtime.ExecResult{}, nil
 }
 
diff --git a/design/compose-native-health.md b/design/compose-native-health.md
new file mode 100644
index 0000000..0f272a1
--- /dev/null
+++ b/design/compose-native-health.md
@@ -0,0 +1,166 @@
+# Design — Orchestrator-driven health probing (Podman)
+
+**Status:** Draft for review
+**Date:** 2026-06-23
+**Scope:** Make the native compose orchestrator (`compose/orchestrator.go`)
+run healthcheck probes itself via `Exec` on backends whose native
+HEALTHCHECK is unsafe, instead of configuring the backend's native
+healthcheck. Fixes RabbitMQ (and any privilege-dropping image) failing to
+boot under the Podman backend.
+
+Companion to `design/compose-native.md` (the orchestrator) and
+`design/podman-backend.md` (the Podman runtime).
+
+---
+
+## 1. Problem
+
+A `rabbitmq` service in a multi-service compose devcontainer exits 1 under
+the Podman backend:
+
+```
+Error when reading /var/lib/rabbitmq/.erlang.cookie: eacces
+```
+
+Root cause (reproduced deterministically, incl. bare `podman run
+--health-cmd ...` — i.e. a Podman behavior, not a bug in our translation):
+
+1. The rabbitmq image runs as root; its entrypoint `gosu`-drops
+   `rabbitmq-server` to uid 999.
+2. **Podman runs the container HEALTHCHECK as root and fires the first
+   probe ~1-2s after start** (it does *not* defer the first probe for
+   `start_period`).
+3. rabbitmq's healthcheck `rabbitmq-diagnostics -q ping` is an Erlang
+   escript. Run as root with `HOME=/var/lib/rabbitmq` and no cookie yet,
+   it **creates `/var/lib/rabbitmq/.erlang.cookie` owned root:root,
+   mode 400**.
+4. The uid-999 server then cannot read root's cookie → `eacces` → exit 1.
+
+Docker is unaffected because it defers the first probe past the interval,
+so the 999 server writes the `999:999` cookie first; a later root probe
+just reads it (root bypasses DAC). Only Erlang-cookie-style images are hit
+— `pg_isready` / `curl` probes don't create root-owned files the main
+process must read.
+
+Evidence: no healthcheck → works; `CMD true` healthcheck → works; real
+healthcheck (even with `start_period: 120s`) → fails; first probe deferred
+until after server init → works.
+
+## 2. Decision
+
+On backends that opt in, the orchestrator **owns health probing**:
+
+- **Do not** set `RunSpec.HealthCheck` (the backend never runs a native
+  HEALTHCHECK, so no eager root probe).
+- For `depends_on.<svc>.condition: service_healthy` gates, the orchestrator
+  runs the service's compose `test` command itself via `ExecContainer`,
+  polling until it exits 0 (healthy) or `HealthTimeout` (→
+  `HealthTimeoutError`).
+
+Because probing is driven by the gate loop — which runs only *after* the
+level's containers have reached running — the first probe lands after the
+service has begun initializing, mirroring Docker's deferred-first-probe
+behavior. `start_period` is honored as a grace delay before the first
+probe.
+
+Services that declare a healthcheck but are **not** gated by any dependent
+(e.g. rabbitmq, when the dependent depends on it with the list/`service_started`
+form) are simply never probed — they run without a health status, which is
+correct: nothing consumes it, and the eager-root-probe crash is gone.
+
+### 2.1 Backend opt-in
+
+A backend signals the preference through an optional, consumer-defined
+interface (Go idiom — the orchestrator declares what it needs):
+
+```go
+// in compose/
+type selfHealthProber interface { PreferSelfProbedHealth() bool }
+```
+
+`runtime/podman.Runtime` implements it returning `true`. Docker and
+Apple do not implement it → unchanged behavior (Docker keeps native
+healthchecks + `InspectContainer().Health` gating; Apple keeps its
+`Healthchecks: false` fallback).
+
+Rejected alternatives:
+- A `runtime.Capabilities` bit — `Capabilities` models compose *feature*
+  support; "native healthcheck runs eagerly as root" is a backend quirk,
+  not a feature. Keeping it out of `Capabilities` avoids conflating the two.
+- Type-asserting `*podman.Runtime` in the engine — couples the engine to a
+  concrete backend and risks an import cycle.
+
+### 2.2 Probe semantics
+
+- `test` is compose-normalized to `["CMD", arg...]`, `["CMD-SHELL", str]`,
+  or `["NONE"]`. CMD → `ExecOptions.Cmd = test[1:]`; CMD-SHELL →
+  `["/bin/sh", "-c", test[1]]`; NONE / empty / `Disable` → treated as "no
+  healthcheck" → fall back to `State == Running` (same as today's
+  `HealthNone`).
+- Exec runs as the container's default user (`ExecOptions.User=""`). Safe
+  because probing is deferred: by the time we probe, the main process has
+  initialized. (Residual edge: a *gated* Erlang-cookie service with no
+  `start_period` could still be probed before its server writes the cookie;
+  `start_period` closes that. The motivating project does not hit this —
+  its rabbitmq is not gated.)
+- Exit 0 → healthy → gate satisfied. Non-zero → keep polling until deadline.
+
+## 3. Scope of change
+
+- `runtime/podman/podman.go`: add `PreferSelfProbedHealth() bool`.
+- `compose/orchestrator.go`: detect opt-in; omit `HealthCheck` from the
+  RunSpec when opted in; in `waitFor`, self-probe `service_healthy` gates
+  via Exec using the service's healthcheck spec.
+- Tests: unit (test→exec translation; self-probe gate satisfied/timeout via
+  a fake runtime). Validated manually end-to-end against a real
+  multi-service compose devcontainer on Podman (OrbStack) — rabbitmq boots
+  clean; a Podman integration test is follow-up work.
+
+No change to the Docker or Apple paths.
+
+## 3.1 Two coexisting health-check paths (the divergence)
+
+This change introduces a **second** health mechanism rather than replacing
+the first, so the orchestrator now has two, selected per-backend by
+`selfProbe`:
+
+| | Native path (Docker, Apple) | Self-probe path (Podman) |
+|---|---|---|
+| Healthcheck on container | `RunSpec.HealthCheck` set; runtime runs it | omitted |
+| Who probes | the runtime, as the container's user | the orchestrator, via `ExecContainer` |
+| `service_healthy` gate reads | `InspectContainer().Health` | exit code of the exec'd `test` |
+| Selected when | `selfProbe == false` | `selfProbe == true` |
+
+**Behavioral divergence to be aware of**, beyond two code paths to
+maintain: on the native path *every* service with a `healthcheck:` gets a
+persistent health status surfaced by the runtime (`podman ps` / `docker ps`
+HEALTH column). On the self-probe path the native healthcheck is omitted
+entirely, so (a) only services actually gated by a `service_healthy`
+dependency are ever probed, and (b) the probe result is transient — used to
+satisfy the gate, never surfaced as the container's reported health. For
+the motivating project this is invisible (nothing consumes rabbitmq's
+health; a `service → postgres` `service_healthy` gate still works via the
+self-probe), but the HEALTH column differs
+between backends.
+
+This fork is deliberate — Podman's eager-root healthcheck forced it — not
+accidental. Convergence options, to revisit when the `compose-native`
+rollout flips the default backend (`design/compose-native.md` §10):
+1. Unify on self-probe for all backends — one path, Docker parity, but lose
+   the runtime's native HEALTH surface.
+2. Keep the fork but also set a native healthcheck for non-gated services on
+   the self-probe path where safe — narrows the divergence, more complex.
+3. Leave as-is, documented (current choice).
+
+## 4. Out of scope — rootless Podman + gosu
+
+A *distinct* failure exists for the same image class under **rootless**
+Podman: `gosu`'s setuid/setgid privilege drop is denied by the user
+namespace (containers/podman#6816), so `rabbitmq-server` can't drop to uid
+999 at all. That is a different mechanism from the healthcheck race fixed
+here and this change does **not** address it. Our validation is on
+**rootful** Podman (the C/R target — root socket), where gosu works; if a
+rootless deployment is ever targeted, rabbitmq needs a separate remedy
+(e.g. `user:` pinning + uid-mapped volumes). Confirmed via #20893 that
+`--health-start-period` is doc-only and never defers the first probe, so it
+is not a viable alternative.
diff --git a/runtime/podman/podman.go b/runtime/podman/podman.go
index 004b7d1..c6d9433 100644
--- a/runtime/podman/podman.go
+++ b/runtime/podman/podman.go
@@ -119,3 +119,15 @@ func (r *Runtime) Capabilities() runtime.Capabilities {
 		Checkpoint:       r.checkpointOK,
 	}
 }
+
+// PreferSelfProbedHealth tells the compose orchestrator to run healthcheck
+// probes itself (via Exec) rather than configuring Podman's native
+// HEALTHCHECK. Podman runs the native healthcheck as root and fires the
+// first probe immediately at container start — before the main process
+// initializes — which breaks privilege-dropping images: e.g. rabbitmq's
+// `rabbitmq-diagnostics` probe, run as root, creates a root-owned
+// /var/lib/rabbitmq/.erlang.cookie that the gosu-dropped (uid 999) server
+// then cannot read (eacces). Letting the orchestrator probe defers the
+// first check until after the service is up, matching Docker's behavior.
+// See design/compose-native-health.md.
+func (r *Runtime) PreferSelfProbedHealth() bool { return true }

From 0c62dbe23f5461d046f3d5fb9e7537ec90d4d6b8 Mon Sep 17 00:00:00 2001
From: bilby91 <mfernandez@crunchloop.io>
Date: Tue, 23 Jun 2026 14:02:46 -0300
Subject: [PATCH 2/3] runtime/checkpoint: make multi-service project restore
 work
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Validating CheckpointProject/RestoreProject against a real multi-service
compose devcontainer (8 services) surfaced restore failures the
single-container alpine spike never hit. Fixes:

- RestoreProject recreates the <project>_default network before
  restoring containers. A cross-node restore lands on a fresh store with
  no network, and libpod restore fails with "network not found". Mirrors
  how compose.Orchestrator.Up names + labels it; idempotent, so same-node
  is unaffected. (Custom/extra networks aren't recorded yet — follow-up.)

- RestoreProject clears a stale same-named container before restore.
  CheckpointProject(StopAfter) stops but does not remove containers, so a
  same-node restore-in-place collides ("ID already in use"). Inspect the
  deterministic <project>-<service>-1 name and remove it only when NOT
  running (state is in the archive; RemoveVolumes stays false so the data
  volume survives); refuse to clobber a running container.

- Add IgnoreVolumes to ProjectRestoreOptions/RestoreOptions ->
  runtime.RestoreSpec -> Podman restore ignoreVolumes=true, so a
  same-node restore-in-place reuses existing volumes instead of colliding
  ("volume already exists"). Cross-node leaves it false (content comes
  from the archive).

Folded the design decisions + two node prerequisites (CRIU
skip-file-rwx-check; base images on the destination) into
design/checkpoint-restore.md §3.3/§7. Verified end-to-end incl. true
cross-node migration with state preserved.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 checkpoint.go                |  7 +++++
 checkpoint_project.go        | 46 +++++++++++++++++++++++++++
 checkpoint_project_test.go   |  7 +++++
 design/checkpoint-restore.md | 60 +++++++++++++++++++++++++++++++-----
 runtime/podman/checkpoint.go |  3 ++
 runtime/runtime.go           |  8 +++++
 6 files changed, 124 insertions(+), 7 deletions(-)

diff --git a/checkpoint.go b/checkpoint.go
index 3f8f407..27f74b9 100644
--- a/checkpoint.go
+++ b/checkpoint.go
@@ -38,6 +38,12 @@ type RestoreOptions struct {
 	// established connections.
 	TCPEstablished bool
 
+	// IgnoreVolumes skips restoring volume content from the archive,
+	// reusing an existing volume. Leave false for cross-node restore;
+	// set true for same-node restore-in-place. See
+	// ProjectRestoreOptions.IgnoreVolumes.
+	IgnoreVolumes bool
+
 	// LocalEnv overrides os.Environ() for the reattached workspace's
 	// substituter localEnv pass. Nil means use the current process
 	// environment — matches AttachOptions.LocalEnv. On a cross-node
@@ -117,6 +123,7 @@ func (e *Engine) Restore(ctx context.Context, opts RestoreOptions) (*Workspace,
 		ArchivePath:    opts.ArchivePath,
 		Name:           opts.Name,
 		TCPEstablished: opts.TCPEstablished,
+		IgnoreVolumes:  opts.IgnoreVolumes,
 	})
 	if err != nil {
 		return nil, fmt.Errorf("restore: %w", err)
diff --git a/checkpoint_project.go b/checkpoint_project.go
index edf951f..bfb6624 100644
--- a/checkpoint_project.go
+++ b/checkpoint_project.go
@@ -63,6 +63,14 @@ type ProjectRestoreOptions struct {
 	// established connections.
 	TCPEstablished bool
 
+	// IgnoreVolumes skips restoring volume content from the archives,
+	// reusing whatever volumes already exist. Leave false for a
+	// cross-node restore (the destination has no volumes, so content
+	// must come from the archive). Set true for a same-node
+	// restore-in-place, where the volumes still exist with current data
+	// and re-extracting them would collide ("volume already exists").
+	IgnoreVolumes bool
+
 	// LocalEnv overrides os.Environ() for the reattached primary
 	// workspace's substituter (parity with RestoreOptions.LocalEnv).
 	LocalEnv map[string]string
@@ -199,11 +207,49 @@ func (e *Engine) RestoreProject(ctx context.Context, opts ProjectRestoreOptions)
 		return nil, fmt.Errorf("RestoreProject: %w", err)
 	}
 
+	// A cross-node restore lands on a fresh store with no project network.
+	// The checkpointed containers were attached to <project>_default, and
+	// libpod restore fails ("network not found") unless it exists first.
+	// Recreate it before restoring any container, mirroring how
+	// compose.Orchestrator.Up names + labels it (CreateNetwork is
+	// idempotent — a label-matching network is reused, so same-node
+	// restore is unaffected). Custom/extra compose networks aren't yet
+	// recorded in the manifest — see design/checkpoint-restore-fixes.md.
+	if _, err := e.runtime.CreateNetwork(ctx, runtime.NetworkSpec{
+		Name: manifest.Project + "_default",
+		Labels: map[string]string{
+			compose.LabelComposeProject: manifest.Project,
+			compose.LabelEngine:         compose.EngineDisplayName,
+		},
+	}); err != nil {
+		return nil, fmt.Errorf("RestoreProject: recreate project network: %w", err)
+	}
+
 	out := &ProjectRestore{Project: manifest.Project, Services: map[string]*runtime.Container{}}
 	for _, svc := range manifest.Services {
+		// Restore (--import) re-creates the container under its archived,
+		// deterministic compose name. On a fresh/cross-node store nothing
+		// pre-exists. On a same-node restore the checkpoint left the source
+		// container *stopped* under this name (StopAfter stops, it does not
+		// remove), which collides with re-create ("that ID is already in
+		// use"). Clear a non-running leftover — its full state is in the
+		// archive — but refuse to clobber a *running* container: that would
+		// be destroying a live service, not restoring it. RemoveVolumes
+		// stays false so the service's data volume survives for reuse.
+		name := manifest.Project + "-" + svc.Service + "-1"
+		if d, ierr := e.runtime.InspectContainer(ctx, name); ierr == nil && d != nil {
+			if d.State == runtime.StateRunning {
+				return nil, fmt.Errorf("RestoreProject: service %q: a running container %q already exists — stop it before restoring", svc.Service, name)
+			}
+			if rerr := e.runtime.RemoveContainer(ctx, name, runtime.RemoveOptions{Force: true}); rerr != nil {
+				return nil, fmt.Errorf("RestoreProject: service %q: clearing stale container %q: %w", svc.Service, name, rerr)
+			}
+		}
+
 		c, err := cr.Restore(ctx, runtime.RestoreSpec{
 			ArchivePath:    filepath.Join(opts.ArchiveDir, svc.Archive),
 			TCPEstablished: opts.TCPEstablished,
+			IgnoreVolumes:  opts.IgnoreVolumes,
 		})
 		if err != nil {
 			return nil, fmt.Errorf("RestoreProject: service %q: %w", svc.Service, err)
diff --git a/checkpoint_project_test.go b/checkpoint_project_test.go
index 1fa0f21..ceb105c 100644
--- a/checkpoint_project_test.go
+++ b/checkpoint_project_test.go
@@ -39,6 +39,13 @@ func (f *fakeProjectRuntime) Capabilities() runtime.Capabilities {
 	return c
 }
 
+// CreateNetwork succeeds: RestoreProject recreates the project network
+// before restoring containers (cross-node fresh-store path), so the
+// checkpoint-capable fake must support it.
+func (f *fakeProjectRuntime) CreateNetwork(ctx context.Context, spec runtime.NetworkSpec) (string, error) {
+	return "net-" + spec.Name, nil
+}
+
 func (f *fakeProjectRuntime) ListContainers(ctx context.Context, filter runtime.LabelFilter) ([]runtime.Container, error) {
 	f.fakeRuntime.mu.Lock()
 	defer f.fakeRuntime.mu.Unlock()
diff --git a/design/checkpoint-restore.md b/design/checkpoint-restore.md
index 3820554..6c82287 100644
--- a/design/checkpoint-restore.md
+++ b/design/checkpoint-restore.md
@@ -242,13 +242,46 @@ Model notes (validated by the Phase-0 spike, §7):
 - **Network:** the shared network re-forms as containers come back
   (`restore --import` re-attaches networking, and Podman restores the
   original container name, so service-name DNS resolves again). The network
-  must still exist on the target; recreating it cross-node is the
-  orchestrator's caller's job (or a follow-up).
+  must exist on the target *before* restore, or libpod fails with `network
+  not found`. **`RestoreProject` now recreates `<project>_default` itself**
+  (idempotent, mirroring `compose.Orchestrator.Up`'s name + labels) so a
+  cross-node restore onto a fresh store works. Only the default network is
+  recreated; custom/extra compose networks aren't recorded in the manifest
+  yet — a follow-up records the network set at checkpoint time.
+- **Replace, don't duplicate:** `restore --import` always creates a *new*
+  container under the archived (deterministic `<project>-<service>-1`)
+  name. `CheckpointProject(StopAfter)` stops but does not remove, so a
+  same-node restore-in-place collides on that name (`ID already in use`).
+  `RestoreProject` clears it first, but only when **not running** (its
+  state is in the archive; `RemoveVolumes` stays false so the data volume
+  survives) — a *running* same-named container is a genuine conflict and
+  restore errors rather than killing a live service. No-op cross-node.
+- **Volumes:** restore re-extracts volume content from the archive by
+  default (correct cross-node, where the destination has none). For a
+  same-node restore-in-place the volumes still exist and re-extraction
+  collides (`volume already exists`); `RestoreSpec.IgnoreVolumes`
+  (surfaced as `ProjectRestoreOptions`/`RestoreOptions.IgnoreVolumes` →
+  Podman `ignoreVolumes=true`) reuses the existing volume instead.
 - **Completeness:** the manifest is written last, so its presence implies a
   complete set; a partial checkpoint leaves no manifest and `RestoreProject`
   fails cleanly.
 - **Scale:** one container per service in v1 (no compose `scale`).
 
+Two **node prerequisites** the multi-service validation surfaced (both
+node-provisioning, not library code — the engine talks to a remote socket):
+
+- **CRIU `skip-file-rwx-check`** in `/etc/criu/default.conf` on every node
+  that restores. Some images set their data dir to `0700` (Postgres
+  requires it) and CRIU records the process cwd mode at checkpoint; the
+  restored volume mountpoint is `0755`, so CRIU's file-mode consistency
+  check rejects it. Podman exposes no per-restore flag for it (5.7.0), and
+  the content is correct — only the mountpoint mode differs — so the CRIU
+  config switch is the fix, alongside installing CRIU itself.
+- **Base images present on the destination.** A project restore is *not*
+  fully self-contained: Podman pulls each service's base image if absent.
+  A cross-node destination needs the images cached or registry credentials
+  (in practice it already runs the same workloads).
+
 ## 4. Capability gate
 
 Add one field to the `Capabilities` struct
@@ -362,13 +395,26 @@ What's **proven** (2026-06-19, bench pod, real image):
   peer-IP change on restore is the residual edge (matches the
   "agents reconnect" assumption).
 
+- ✅ **Real multi-service devcontainer, full project round trip + true
+  cross-node migration (2026-06-23).** An 8-service compose devcontainer
+  (Postgres, RabbitMQ, MinIO, an OAuth mock, an app sidecar, …) checkpointed
+  on node A and `RestoreProject`'d on node B — a separate, never-ran-this
+  Podman store. All 8 services resumed and in-memory/disk state survived the
+  move (a file written in the app container and a row written in Postgres
+  pre-checkpoint were present post-restore on B). Exercised the §3.3
+  decisions (network recreation, replace-guard, volume handling) and both
+  node prerequisites above. Timings: checkpoint ~9–17s, restore ~6–7s
+  cross-node / ~5s same-node-reuse for a ~580 MB archive set; storage speed
+  (local vs shared mount) and whether volume content is re-extracted are the
+  dominant variables.
+
 What's **still open** (minor):
 
-- **Working-set timing.** Idle container was fast (~0.5s same-pod, ~3.5s
-  cross-pod incl. rootfs unpack); a busy agent's memory footprint sets the
-  real checkpoint time vs the eviction window. Measure on a real workload.
-- **Forced cross-node** placement (anti-affinity) — belt-and-suspenders;
-  the archive is already proven node-independent.
+- **Working-set timing on a busy agent.** The validations above were on
+  freshly-started services; a saturated agent's memory footprint still sets
+  the real checkpoint time vs the eviction window — measure under load.
+- **Forced cross-node** placement (anti-affinity) — now exercised across two
+  isolated Podman stores (2026-06-23); the archive is proven node-independent.
 
 ## 8. Constraints, risks & mitigations
 
diff --git a/runtime/podman/checkpoint.go b/runtime/podman/checkpoint.go
index 63120d2..ecbb638 100644
--- a/runtime/podman/checkpoint.go
+++ b/runtime/podman/checkpoint.go
@@ -75,6 +75,9 @@ func (r *Runtime) Restore(ctx context.Context, spec runtime.RestoreSpec) (*runti
 	q := url.Values{}
 	q.Set("import", "true")
 	q.Set("tcpestablished", strconv.FormatBool(spec.TCPEstablished))
+	if spec.IgnoreVolumes {
+		q.Set("ignoreVolumes", "true")
+	}
 	if spec.Name != "" {
 		q.Set("name", spec.Name)
 	}
diff --git a/runtime/runtime.go b/runtime/runtime.go
index 0e47673..7a69fdf 100644
--- a/runtime/runtime.go
+++ b/runtime/runtime.go
@@ -148,6 +148,14 @@ type RestoreSpec struct {
 	// TCPEstablished must match the checkpoint when the archive captured
 	// established connections.
 	TCPEstablished bool
+
+	// IgnoreVolumes asks the backend to skip restoring volume content
+	// from the archive, reusing whatever volume already exists. For
+	// Podman this maps to restore's ignore-volumes; cross-node restore
+	// leaves it false (content must come from the archive), same-node
+	// restore-in-place sets it true to avoid a "volume already exists"
+	// collision.
+	IgnoreVolumes bool
 }
 
 // CheckpointRef describes a written checkpoint archive.

From 49eca6d318cf519ef2dc1d8f705eebb7923a2797 Mon Sep 17 00:00:00 2001
From: bilby91 <mfernandez@crunchloop.io>
Date: Tue, 23 Jun 2026 14:37:33 -0300
Subject: [PATCH 3/3] address review: fully disable native HC + surface inspect
 errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two fixes from PR review:

- compose: in self-probe mode set RunSpec.HealthCheck to {Disable:true}
  rather than nil. nil means "inherit from image" (toHealthcheck), so an
  image-baked HEALTHCHECK would still run natively on Podman and could
  reintroduce the eager-root-probe failure. Disable emits the NONE
  sentinel that actually suppresses it. Adds a regression test.

- checkpoint: RestoreProject now distinguishes a not-found container
  (normal fresh/cross-node case → proceed) from any other InspectContainer
  error (daemon/API/permission), which is surfaced instead of masked
  behind a downstream restore error.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 checkpoint_project.go               | 13 ++++++++++++-
 compose/orchestrator.go             | 10 ++++++----
 compose/orchestrator_health_test.go | 22 ++++++++++++++++++++++
 3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/checkpoint_project.go b/checkpoint_project.go
index bfb6624..b273d96 100644
--- a/checkpoint_project.go
+++ b/checkpoint_project.go
@@ -3,6 +3,7 @@ package devcontainer
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"os"
 	"path/filepath"
@@ -237,7 +238,17 @@ func (e *Engine) RestoreProject(ctx context.Context, opts ProjectRestoreOptions)
 		// be destroying a live service, not restoring it. RemoveVolumes
 		// stays false so the service's data volume survives for reuse.
 		name := manifest.Project + "-" + svc.Service + "-1"
-		if d, ierr := e.runtime.InspectContainer(ctx, name); ierr == nil && d != nil {
+		d, ierr := e.runtime.InspectContainer(ctx, name)
+		switch {
+		case ierr != nil:
+			// Absent is the normal fresh/cross-node case — proceed. Any
+			// other inspect failure (daemon/API/permission) is surfaced
+			// rather than masked behind a downstream restore error.
+			var notFound *runtime.ContainerNotFoundError
+			if !errors.As(ierr, &notFound) {
+				return nil, fmt.Errorf("RestoreProject: service %q: inspect existing container %q: %w", svc.Service, name, ierr)
+			}
+		case d != nil:
 			if d.State == runtime.StateRunning {
 				return nil, fmt.Errorf("RestoreProject: service %q: a running container %q already exists — stop it before restoring", svc.Service, name)
 			}
diff --git a/compose/orchestrator.go b/compose/orchestrator.go
index f9b5916..b1994d4 100644
--- a/compose/orchestrator.go
+++ b/compose/orchestrator.go
@@ -504,10 +504,12 @@ func (o *Orchestrator) ensureService(
 
 	spec := serviceToRunSpec(plan, svc, projectLabels, hash, imageDigest)
 	if o.selfProbe {
-		// The orchestrator probes health itself (see waitFor); never
-		// configure the backend's native HEALTHCHECK, whose eager
-		// root probe breaks privilege-dropping images on Podman.
-		spec.HealthCheck = nil
+		// The orchestrator probes health itself (see waitFor); explicitly
+		// DISABLE the backend's native HEALTHCHECK. Nil would mean "inherit
+		// from image" (toHealthcheck), so an image-baked HEALTHCHECK would
+		// still run natively — reintroducing Podman's eager root probe that
+		// breaks privilege-dropping images. Disable emits the NONE sentinel.
+		spec.HealthCheck = &runtime.HealthCheckSpec{Disable: true}
 	}
 	c, err := o.rt.RunContainer(ctx, spec)
 	if err != nil {
diff --git a/compose/orchestrator_health_test.go b/compose/orchestrator_health_test.go
index f70e84c..9c896f9 100644
--- a/compose/orchestrator_health_test.go
+++ b/compose/orchestrator_health_test.go
@@ -50,6 +50,28 @@ func TestNewOrchestratorDetectsSelfProbe(t *testing.T) {
 	}
 }
 
+// In self-probe mode the RunSpec must DISABLE the native healthcheck, not
+// leave it nil — nil means "inherit the image's HEALTHCHECK", which would
+// let Podman run an image-baked probe (as root, eagerly) and reintroduce
+// the very failure self-probing exists to avoid.
+func TestUp_SelfProbeDisablesNativeHealthcheck(t *testing.T) {
+	rt := newMockRuntime()
+	orch := NewOrchestrator(selfProbingMock{rt}, "podman")
+	proj := newProject(t, map[string][]string{"app": nil})
+
+	var got *runtime.HealthCheckSpec
+	rt.OnRunContainer = func(spec runtime.RunSpec) (*runtime.Container, error) {
+		got = spec.HealthCheck
+		return nil, nil
+	}
+	if _, err := orch.Up(context.Background(), &Plan{Project: proj, ProjectName: "dc-x"}); err != nil {
+		t.Fatalf("Up: %v", err)
+	}
+	if got == nil || !got.Disable {
+		t.Fatalf("self-probe must set HealthCheck.Disable=true (got %+v); nil would inherit the image healthcheck", got)
+	}
+}
+
 func TestWaitForSelfProbeHealthy(t *testing.T) {
 	rt := newMockRuntime()
 	var execed int