Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,19 +86,19 @@ Choose a supervisor rollout profile based on where you are deploying:
```python
import pyisolate as iso

# default: fast local iteration
dev = iso.Supervisor(rollout_mode="dev")

# experimental fail-closed gate; requires pyisolate-doctor --mode hardened to pass
# production default: fail closed if the BPF toolchain, verifier, load, or attach fails
hardened = iso.Supervisor(rollout_mode="hardened")

# compatibility testing only; it deliberately skips stricter filters and is not enforcement
# explicitly acknowledge weaker enforcement for local iteration
dev = iso.Supervisor(rollout_mode="dev")

# explicitly acknowledge reduced enforcement for ecosystem validation
compat = iso.Supervisor(rollout_mode="compatibility")
```

* `dev`: lightweight, low-friction development mode. BPF/cgroup setup failures are reported through per-sandbox `quota_enforcement` status and logs, but sandbox creation continues so local development remains unblocked. CPU/RSS quota tests should be treated as best-effort unless the status reports the relevant controller as enforced.
* `hardened`: production fail-closed mode. BPF compile/load failures and cgroup controller failures raise during supervisor start or sandbox spawn; CPU/RSS quotas must be enforced by cgroups/eBPF and watchdog breach events terminate or quarantine the sandbox. Python `tracemalloc` values are exposed only as debugging telemetry.
* `compatibility`: ecosystem validation mode. Baseline BPF loading is attempted while stricter filters/guards may be skipped; cgroup quota status is still surfaced, and missing controllers degrade to explicit status/logs rather than silent `None`. Use this mode to find package compatibility issues, not as the authoritative security boundary.
* `hardened`: documented production default with kernel LSM/cgroup enforcement; any eBPF compile/load/attach failure raises.
* `dev`: caller-acknowledged local development mode; tooling failures are logged and kernel enforcement can be absent.
* `compatibility`: caller-acknowledged reduced enforcement to maximize third-party compatibility; strict filters are skipped.

### Hello World

Expand Down
72 changes: 59 additions & 13 deletions pyisolate/bpf/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,16 @@ def __init__(self):
self._obj = Path(__file__).with_name("dummy.bpf.o")
self._skel = Path(__file__).with_name("dummy.skel.h")
self.skeleton = ""
self._compiled_skeleton = False
self._filter_src = Path(__file__).with_name("syscall_filter.bpf.c")
self._filter_obj = Path(__file__).with_name("syscall_filter.bpf.o")
self._guard_src = Path(__file__).with_name("resource_guard.bpf.c")
self._guard_obj = Path(__file__).with_name("resource_guard.bpf.o")
self._skel_cache: dict[Path, str] = {}
self._bpffs_root = Path("/sys/fs/bpf/pyisolate")
self._dummy_pin = Path("/sys/fs/bpf/dummy")
self._filter_pin_dir = self._bpffs_root / "syscall_filter"
self._guard_pin_dir = self._bpffs_root / "resource_guard"
self._skel_cache = self._SKEL_CACHE

# internal helper
def _run(self, cmd: list[str], *, raise_on_error: bool = False) -> bool:
Expand Down Expand Up @@ -81,9 +86,12 @@ def load(
Rollout modes:

* ``dev``: low-friction mode; tolerate missing tooling and keep running.
* ``hardened``: strict mode; any failure raises a ``RuntimeError``.
* ``compatibility``: looser enforcement for ecosystem testing. Loads the
baseline program but skips stricter filter/guard attachments.
Use only for local development because BPF enforcement can be absent.
* ``hardened``: production default; any failure raises a ``RuntimeError``
and leaves the manager unloaded so callers fail closed.
* ``compatibility``: caller-acknowledged reduced enforcement for ecosystem
testing. Loads the baseline program but skips stricter filter/guard
attachments.

The legacy ``strict`` argument is still honored. When provided it
overrides ``mode``.
Expand Down Expand Up @@ -127,7 +135,9 @@ def load(
]
ok = True
compile_cmd = dummy_compile
if self._src not in self._skel_cache:
if self._src not in self._skel_cache or (
self._skel_cache.get(self._src) == "" and not self._compiled_skeleton
):
ok &= self._run(compile_cmd, raise_on_error=strict_mode)
skel_cmd = [
"sh",
Expand All @@ -140,19 +150,21 @@ def load(
self._skel_cache[self._src] = self._skel.read_text()
except OSError:
self._skel_cache[self._src] = ""
else:
# Cache a placeholder so repeated loads in tool-less test
# environments do not repeat compile/skeleton steps.
elif ok:
# Cache a placeholder when the build path was exercised but no
# skeleton was emitted (for example under a mocked bpftool).
self._skel_cache.setdefault(self._src, "")
self.skeleton = self._skel_cache.get(self._src, "")
if ok:
self._compiled_skeleton = True
else:
self.skeleton = self._skel_cache[self._src]

ok &= self._run(
["llvm-objdump", "-d", str(self._obj)], raise_on_error=strict_mode
)
ok &= self._run(
["bpftool", "prog", "load", str(self._obj), "/sys/fs/bpf/dummy"],
["bpftool", "prog", "load", str(self._obj), str(self._dummy_pin)],
raise_on_error=strict_mode,
)
if mode != "compatibility":
Expand All @@ -170,26 +182,60 @@ def load(
[
"bpftool",
"prog",
"load",
"loadall",
str(self._filter_obj),
"/sys/fs/bpf/syscall_filter",
str(self._filter_pin_dir),
"type",
"lsm",
"pinmaps",
str(self._bpffs_root),
"autoattach",
],
raise_on_error=strict_mode,
)
ok &= self._run(
[
"bpftool",
"prog",
"load",
"loadall",
str(self._guard_obj),
"/sys/fs/bpf/resource_guard",
str(self._guard_pin_dir),
"pinmaps",
str(self._bpffs_root),
"autoattach",
],
raise_on_error=strict_mode,
)
ok &= self._attach_loaded_programs(raise_on_error=strict_mode)
self.loaded = ok
if strict_mode and not ok:
raise RuntimeError("BPF load failed; see logs for details")

def _attach_loaded_programs(self, *, raise_on_error: bool = False) -> bool:
    """Attach pinned programs that still need an explicit attach point.

    ``bpftool prog loadall ... autoattach`` creates BPF links for LSM and
    tracepoint programs on modern kernels; the cgroup-skb egress program is
    attached here explicitly for kernels/tools that require a concrete
    cgroup attach point.

    Args:
        raise_on_error: Forwarded to ``self._run``; controls whether a
            failed ``bpftool`` invocation raises instead of returning
            ``False``.

    Returns:
        ``True`` when the attach command succeeded.
    """
    pinned_egress = self._guard_pin_dir / "account_cgroup_egress"
    attach_cmd = [
        "bpftool",
        "cgroup",
        "attach",
        str(Path("/sys/fs/cgroup")),
        "egress",
        "pinned",
        str(pinned_egress),
    ]
    return self._run(attach_cmd, raise_on_error=raise_on_error)

def hot_reload(self, policy_path: str) -> None:
"""Refresh maps based on a policy JSON file."""
if not self.loaded:
Expand Down
190 changes: 181 additions & 9 deletions pyisolate/bpf/resource_guard.bpf.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,76 @@
#define SEC(NAME) __attribute__((section(NAME), used))

/* Per-cgroup resource accounting and quota breach events for PyIsolate. */

typedef unsigned int __u32;
typedef unsigned long long __u64;

#define BPF_MAP_TYPE_HASH 1
#define BPF_MAP_TYPE_PERCPU_HASH 5
#define BPF_MAP_TYPE_RINGBUF 27

#define PYI_RESOURCE_CPU 1U
#define PYI_RESOURCE_RSS 2U
#define PYI_RESOURCE_NET 3U

#define __uint(name, val) int (*name)[val]
#define __type(name, val) val *name

struct resource_account {
__u64 cpu_time_ns;
__u64 rss_bytes;
__u64 net_bytes;
__u64 last_seen_ns;
};

struct resource_quota {
__u64 cpu_time_ns;
__u64 rss_bytes;
__u64 net_bytes;
};

struct resource_event {
__u64 cgroup_id;
__u64 pid_tgid;
__u64 observed;
__u64 quota;
__u32 resource;
__u32 breached;
};

struct sched_switch_args {
unsigned long long pad;
char prev_comm[16];
int prev_pid;
int prev_prio;
long long prev_state;
char next_comm[16];
int next_pid;
int next_prio;
};

struct page_fault_args {
unsigned long long pad;
unsigned long address;
unsigned long ip;
int error_code;
};

struct __sk_buff {
__u32 len;
};

struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 1 << 22);
} resource_events SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
__uint(max_entries, 16384);
__type(key, __u64);
__type(value, struct resource_account);
} cgroup_accounting SEC(".maps");
/* Resource guard event consumed by pyisolate.watchdog.ResourceWatchdog.
* The supervisor resolves cgroup_id/name to a SandboxThread and performs the
* userspace kill/quarantine path immediately; Python tracemalloc accounting is
Expand Down Expand Up @@ -41,9 +112,92 @@ struct {
} usage SEC(".maps");

struct {
int dummy;
} events SEC(".maps");
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 16384);
__type(key, __u64);
__type(value, struct resource_quota);
} cgroup_quotas SEC(".maps");

/* Last recorded switch timestamp (ns), keyed by pid_tgid. */
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 65536);
__type(key, __u64);
__type(value, __u64);
} task_cpu_start SEC(".maps");

/* BPF helper stubs: the helper id cast to a function pointer, the same
 * shape libbpf's bpf_helper_defs.h generates. */
static void *(*bpf_map_lookup_elem)(void *map, const void *key) = (void *)1;
static long (*bpf_map_update_elem)(void *map, const void *key, const void *value, __u64 flags) = (void *)2;
static __u64 (*bpf_ktime_get_ns)(void) = (void *)5;
static __u64 (*bpf_get_current_pid_tgid)(void) = (void *)14;
static __u64 (*bpf_get_current_cgroup_id)(void) = (void *)80;
static long (*bpf_ringbuf_output)(void *ringbuf, void *data, __u64 size, __u64 flags) = (void *)130;

/* Compare one cgroup's running totals against its configured quotas and
 * publish a resource_event on the ring buffer for every limit currently
 * exceeded. A missing quota entry means nothing to enforce. */
static void emit_if_breached(__u64 cg, struct resource_account *account)
{
	struct resource_quota *limits;
	struct resource_event ev = {};

	limits = bpf_map_lookup_elem(&cgroup_quotas, &cg);
	if (!limits)
		return;

	/* Common fields; per-resource fields are filled in each branch. */
	ev.cgroup_id = cg;
	ev.pid_tgid = bpf_get_current_pid_tgid();
	ev.breached = 1;

	if (limits->cpu_time_ns && account->cpu_time_ns > limits->cpu_time_ns) {
		ev.resource = PYI_RESOURCE_CPU;
		ev.observed = account->cpu_time_ns;
		ev.quota = limits->cpu_time_ns;
		bpf_ringbuf_output(&resource_events, &ev, sizeof(ev), 0);
	}
	if (limits->rss_bytes && account->rss_bytes > limits->rss_bytes) {
		ev.resource = PYI_RESOURCE_RSS;
		ev.observed = account->rss_bytes;
		ev.quota = limits->rss_bytes;
		bpf_ringbuf_output(&resource_events, &ev, sizeof(ev), 0);
	}
	if (limits->net_bytes && account->net_bytes > limits->net_bytes) {
		ev.resource = PYI_RESOURCE_NET;
		ev.observed = account->net_bytes;
		ev.quota = limits->net_bytes;
		bpf_ringbuf_output(&resource_events, &ev, sizeof(ev), 0);
	}
}

/* Return the accounting record for the current task's cgroup, creating a
 * zeroed entry on first sight, and store the cgroup id in *cg_out.
 *
 * NOTE(review): the post-insert lookup can still return NULL (e.g. when
 * the map is full and the update fails); all callers check for NULL. */
static struct resource_account *account_for_current_cgroup(__u64 *cg_out)
{
__u64 cg = bpf_get_current_cgroup_id();
struct resource_account zero = {};
struct resource_account *account;

/* Lookup-or-insert: seed a zeroed record, then re-lookup so the caller
 * gets a pointer into the map rather than to the stack copy. */
account = bpf_map_lookup_elem(&cgroup_accounting, &cg);
if (!account) {
zero.last_seen_ns = bpf_ktime_get_ns();
bpf_map_update_elem(&cgroup_accounting, &cg, &zero, 0);
account = bpf_map_lookup_elem(&cgroup_accounting, &cg);
}
*cg_out = cg;
return account;
}

/* sched_switch tracepoint: charge the elapsed interval since this task's
 * previous switch to its cgroup, then stamp a fresh start time.
 *
 * NOTE(review): at this tracepoint bpf_get_current_* presumably identify
 * the outgoing task; since the timestamp is both written and read at
 * switch-out, the charged interval includes any off-CPU time in between.
 * Keying the start time on ctx->next_pid would measure pure on-CPU time —
 * confirm the intended semantics. */
SEC("tracepoint/sched/sched_switch")
int account_sched_switch(struct sched_switch_args *ctx)
{
	__u64 cg;
	__u64 now = bpf_ktime_get_ns();
	__u64 pid_tgid = bpf_get_current_pid_tgid();
	__u64 *started = bpf_map_lookup_elem(&task_cpu_start, &pid_tgid);
	struct resource_account *account = account_for_current_cgroup(&cg);

	(void)ctx; /* context fields unused; kept for the tracepoint signature */

	/* now > *started guards against a non-positive (huge unsigned) delta. */
	if (account && started && now > *started) {
		account->cpu_time_ns += now - *started;
		account->last_seen_ns = now;
		emit_if_breached(cg, account);
	}

	bpf_map_update_elem(&task_cpu_start, &pid_tgid, &now, 0);
	return 0;
}
static __inline int emit_breach(unsigned long cgroup_id,
unsigned long cpu_time_ns,
unsigned long rss_bytes,
Expand All @@ -70,14 +224,32 @@ int on_cpu(void *ctx)
return emit_breach(0, 0, 0, BREACH_CPU);
}

/* User page-fault tracepoint: approximate RSS growth for the faulting
 * task's cgroup and raise a breach event when it exceeds quota.
 *
 * The pasted diff interleaved the removed perf_event `on_rss` stub with
 * this handler (two SEC lines and two signatures over one body); this is
 * the added implementation only.
 *
 * NOTE(review): every user fault is counted as one 4 KiB page; huge
 * pages, refaults, and reclaim are not reflected, so rss_bytes is a
 * monotonically growing estimate rather than exact RSS — confirm the
 * watchdog treats it that way. */
SEC("tracepoint/exceptions/page_fault_user")
int account_user_page_fault(struct page_fault_args *ctx)
{
	__u64 cg;
	struct resource_account *account = account_for_current_cgroup(&cg);

	(void)ctx; /* fault address/ip not needed for byte accounting */
	if (account) {
		account->rss_bytes += 4096;
		account->last_seen_ns = bpf_ktime_get_ns();
		emit_if_breached(cg, account);
	}
	return 0;
}

/* cgroup_skb egress hook: charge each transmitted packet's length to the
 * sending cgroup's net_bytes and raise a breach event when over quota. */
SEC("cgroup_skb/egress")
int account_cgroup_egress(struct __sk_buff *skb)
{
__u64 cg;
struct resource_account *account = account_for_current_cgroup(&cg);

if (account) {
account->net_bytes += skb->len;
account->last_seen_ns = bpf_ktime_get_ns();
emit_if_breached(cg, account);
}
/* Returning 1 allows the packet: accounting only, never drops traffic. */
return 1;
}

/* License declaration required to use GPL-only BPF helpers. */
char _license[] SEC("license") = "GPL";
Loading
Loading