aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlois Wohlschlager <alois1@gmx-topmail.de>2024-05-26 10:35:03 +0200
committerAlois Wohlschlager <alois1@gmx-topmail.de>2024-07-25 18:24:40 +0200
commit127ee1a101e3f5ebab39ad98cbe58fefcd52eca5 (patch)
treebe6a52c93f39712e9f01d8077280b2b6149386eb
parent233408f677f8cd310d9760ff6c1bd9c14282cc57 (diff)
libstore/build: use an allowlist approach to syscall filtering
Previously, system call filtering (to prevent builders from storing files with setuid/setgid permission bits or extended attributes) was performed using a blocklist. While this looks simple at first, it actually carries significant security and maintainability risks: after all, the kernel may add new syscalls to achieve the same functionality one is trying to block, and it can even be hard to actually add the syscall to the blocklist when building against a C library that doesn't know about it yet. For a recent demonstration of this happening in practice to Nix, see the introduction of fchmodat2 [0] [1]. The allowlist approach does not share the same drawback. While it does require a rather large list of harmless syscalls to be maintained in the codebase, failing to update this list (and roll out the update to all users) in time has rather benign effects; at worst, very recent programs that already rely on new syscalls will fail with an error the same way they would on a slightly older kernel that doesn't support them yet. Most importantly, no unintended new ways of performing dangerous operations will be silently allowed. Another possible drawback is reduced system call performance due to the larger filter created by the allowlist requiring more computation [2]. However, this issue has not convincingly been demonstrated yet in practice, for example in systemd or various browsers. To the contrary, it has been measured that the the actual filter constructed here has approximately the same overhead as a very simple filter blocking only one system call. This commit tries to keep the behavior as close to unchanged as possible. The system call list is in line with libseccomp 2.5.5 and glibc 2.39, which are the latest versions at the point of writing. Since libseccomp 2.5.5 is already a requirement and the distributions shipping this together with older versions of glibc are mostly not a thing any more, this should not lead to more build failures any more. [0] https://github.com/NixOS/nixpkgs/issues/300635 [1] https://github.com/NixOS/nix/issues/10424 [2] https://github.com/flatpak/flatpak/pull/4462#issuecomment-1061690607 Change-Id: I541be3ea9b249bcceddfed6a5a13ac10b11e16ad
-rw-r--r--flake.nix1
-rw-r--r--maintainers/check-syscalls.nix16
-rwxr-xr-xmaintainers/check-syscalls.sh7
-rw-r--r--package.nix2
-rw-r--r--src/libstore/build/local-derivation-goal.cc549
-rw-r--r--src/libstore/linux/fchmodat2-compat.hh35
-rw-r--r--tests/nixos/setuid/fchmodat2-suid.c5
7 files changed, 552 insertions, 63 deletions
diff --git a/flake.nix b/flake.nix
index 0609a8681..1fc02a590 100644
--- a/flake.nix
+++ b/flake.nix
@@ -164,6 +164,7 @@
nixUnstable = prev.nixUnstable;
check-headers = final.buildPackages.callPackage ./maintainers/check-headers.nix { };
+ check-syscalls = final.buildPackages.callPackage ./maintainers/check-syscalls.nix { };
default-busybox-sandbox-shell = final.busybox.override {
useMusl = true;
diff --git a/maintainers/check-syscalls.nix b/maintainers/check-syscalls.nix
new file mode 100644
index 000000000..1a3de5c6d
--- /dev/null
+++ b/maintainers/check-syscalls.nix
@@ -0,0 +1,16 @@
+{
+ runCommandNoCC,
+ lib,
+ libseccomp,
+ writeShellScriptBin,
+}:
+let
+ syscalls-csv = runCommandNoCC "syscalls.csv" { } ''
+ echo ${lib.escapeShellArg libseccomp.src}
+ tar -xf ${lib.escapeShellArg libseccomp.src} --strip-components=2 ${libseccomp.name}/src/syscalls.csv
+ mv syscalls.csv "$out"
+ '';
+in
+writeShellScriptBin "check-syscalls" ''
+ ${./check-syscalls.sh} ${syscalls-csv}
+''
diff --git a/maintainers/check-syscalls.sh b/maintainers/check-syscalls.sh
new file mode 100755
index 000000000..cd72ac23b
--- /dev/null
+++ b/maintainers/check-syscalls.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+set -e
+
+diff -u <(awk < src/libstore/build/local-derivation-goal.cc '/BEGIN extract-syscalls/ { extracting = 1; next }
+match($0, /allowSyscall\(ctx, SCMP_SYS\(([^)]*)\)\);|\/\/ skip ([^ ]*)/, result) { print result[1] result[2] }
+/END extract-syscalls/ { extracting = 0; next }') <(tail -n+2 "$1" | cut -d, -f 1)
diff --git a/package.nix b/package.nix
index 16477a6a2..c6b2e4451 100644
--- a/package.nix
+++ b/package.nix
@@ -403,6 +403,7 @@ stdenv.mkDerivation (finalAttrs: {
# Lix specific packages
pre-commit-checks,
contribNotice,
+ check-syscalls,
}:
let
glibcFix = lib.optionalAttrs (buildPlatform.isLinux && glibcLocales != null) {
@@ -453,6 +454,7 @@ stdenv.mkDerivation (finalAttrs: {
# `bash` from inside `nix develop`, say, because you are using it
# via direnv, you will by default get bash (unusable edition).
bashInteractive
+ check-syscalls
pythonEnv
# docker image tool
skopeo
diff --git a/src/libstore/build/local-derivation-goal.cc b/src/libstore/build/local-derivation-goal.cc
index 8e1f4e849..33cbfb29d 100644
--- a/src/libstore/build/local-derivation-goal.cc
+++ b/src/libstore/build/local-derivation-goal.cc
@@ -45,7 +45,6 @@
#include <sys/prctl.h>
#include <sys/syscall.h>
#if HAVE_SECCOMP
-#include "linux/fchmodat2-compat.hh"
#include <seccomp.h>
#endif
#define pivot_root(new_root, put_old) (syscall(SYS_pivot_root, new_root, put_old))
@@ -1363,6 +1362,20 @@ void LocalDerivationGoal::chownToBuilder(const Path & path)
throw SysError("cannot change ownership of '%1%'", path);
}
+#if HAVE_SECCOMP
+
+static void allowSyscall(scmp_filter_ctx ctx, int syscall) {
+ if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 0) != 0)
+ throw SysError("unable to add seccomp rule");
+}
+
+#define ALLOW_CHMOD_IF_SAFE(ctx, syscall, modePos) \
+ if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 1, SCMP_A##modePos(SCMP_CMP_MASKED_EQ, S_ISUID | S_ISGID, 0)) != 0 || \
+ seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), syscall, 1, SCMP_A##modePos(SCMP_CMP_MASKED_EQ, S_ISUID, S_ISUID)) != 0 || \
+ seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), syscall, 1, SCMP_A##modePos(SCMP_CMP_MASKED_EQ, S_ISGID, S_ISGID)) != 0) \
+ throw SysError("unable to add seccomp rule");
+
+#endif
void setupSeccomp()
{
@@ -1370,7 +1383,9 @@ void setupSeccomp()
#if HAVE_SECCOMP
scmp_filter_ctx ctx;
- if (!(ctx = seccomp_init(SCMP_ACT_ALLOW)))
+ // Pretend that syscalls we don't yet know about don't exist.
+ // This is the best option for compatibility: after all, they did in fact not exist not too long ago.
+ if (!(ctx = seccomp_init(SCMP_ACT_ERRNO(ENOSYS))))
throw SysError("unable to initialize seccomp mode 2");
Finally cleanup([&]() {
@@ -1405,28 +1420,514 @@ void setupSeccomp()
seccomp_arch_add(ctx, SCMP_ARCH_MIPSEL64N32) != 0)
printError("unable to add mips64el-*abin32 seccomp architecture");
- /* Prevent builders from creating setuid/setgid binaries. */
- for (int perm : { S_ISUID, S_ISGID }) {
- if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(chmod), 1,
- SCMP_A1(SCMP_CMP_MASKED_EQ, (scmp_datum_t) perm, (scmp_datum_t) perm)) != 0)
- throw SysError("unable to add seccomp rule");
-
- if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(fchmod), 1,
- SCMP_A1(SCMP_CMP_MASKED_EQ, (scmp_datum_t) perm, (scmp_datum_t) perm)) != 0)
- throw SysError("unable to add seccomp rule");
-
- if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(fchmodat), 1,
- SCMP_A2(SCMP_CMP_MASKED_EQ, (scmp_datum_t) perm, (scmp_datum_t) perm)) != 0)
- throw SysError("unable to add seccomp rule");
-
- if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), NIX_SYSCALL_FCHMODAT2, 1,
- SCMP_A2(SCMP_CMP_MASKED_EQ, (scmp_datum_t) perm, (scmp_datum_t) perm)) != 0)
- throw SysError("unable to add seccomp rule");
- }
-
- /* Prevent builders from creating EAs or ACLs. Not all filesystems
- support these, and they're not allowed in the Nix store because
- they're not representable in the NAR serialisation. */
+ // This list is intended for machine consumption.
+ // Please keep its format, order and BEGIN/END markers.
+ //
+ // Currently, it is up to date with libseccomp 2.5.5 and glibc 2.39.
+ // Run check-syscalls to determine which new syscalls should be added.
+ // New syscalls must be audited and handled in a way that blocks the following dangerous operations:
+ // * Creation of non-empty setuid/setgid files
+ // * Creation of extended attributes (including ACLs)
+ //
+ // BEGIN extract-syscalls
+ allowSyscall(ctx, SCMP_SYS(accept));
+ allowSyscall(ctx, SCMP_SYS(accept4));
+ allowSyscall(ctx, SCMP_SYS(access));
+ allowSyscall(ctx, SCMP_SYS(acct));
+ allowSyscall(ctx, SCMP_SYS(add_key));
+ allowSyscall(ctx, SCMP_SYS(adjtimex));
+ allowSyscall(ctx, SCMP_SYS(afs_syscall));
+ allowSyscall(ctx, SCMP_SYS(alarm));
+ allowSyscall(ctx, SCMP_SYS(arch_prctl));
+ allowSyscall(ctx, SCMP_SYS(arm_fadvise64_64));
+ allowSyscall(ctx, SCMP_SYS(arm_sync_file_range));
+ allowSyscall(ctx, SCMP_SYS(bdflush));
+ allowSyscall(ctx, SCMP_SYS(bind));
+ allowSyscall(ctx, SCMP_SYS(bpf));
+ allowSyscall(ctx, SCMP_SYS(break));
+ allowSyscall(ctx, SCMP_SYS(breakpoint));
+ allowSyscall(ctx, SCMP_SYS(brk));
+ allowSyscall(ctx, SCMP_SYS(cachectl));
+ allowSyscall(ctx, SCMP_SYS(cacheflush));
+ allowSyscall(ctx, SCMP_SYS(cachestat));
+ allowSyscall(ctx, SCMP_SYS(capget));
+ allowSyscall(ctx, SCMP_SYS(capset));
+ allowSyscall(ctx, SCMP_SYS(chdir));
+ // skip chmod (dangerous)
+ allowSyscall(ctx, SCMP_SYS(chown));
+ allowSyscall(ctx, SCMP_SYS(chown32));
+ allowSyscall(ctx, SCMP_SYS(chroot));
+ allowSyscall(ctx, SCMP_SYS(clock_adjtime));
+ allowSyscall(ctx, SCMP_SYS(clock_adjtime64));
+ allowSyscall(ctx, SCMP_SYS(clock_getres));
+ allowSyscall(ctx, SCMP_SYS(clock_getres_time64));
+ allowSyscall(ctx, SCMP_SYS(clock_gettime));
+ allowSyscall(ctx, SCMP_SYS(clock_gettime64));
+ allowSyscall(ctx, SCMP_SYS(clock_nanosleep));
+ allowSyscall(ctx, SCMP_SYS(clock_nanosleep_time64));
+ allowSyscall(ctx, SCMP_SYS(clock_settime));
+ allowSyscall(ctx, SCMP_SYS(clock_settime64));
+ allowSyscall(ctx, SCMP_SYS(clone));
+ allowSyscall(ctx, SCMP_SYS(clone3));
+ allowSyscall(ctx, SCMP_SYS(close));
+ allowSyscall(ctx, SCMP_SYS(close_range));
+ allowSyscall(ctx, SCMP_SYS(connect));
+ allowSyscall(ctx, SCMP_SYS(copy_file_range));
+ allowSyscall(ctx, SCMP_SYS(creat));
+ allowSyscall(ctx, SCMP_SYS(create_module));
+ allowSyscall(ctx, SCMP_SYS(delete_module));
+ allowSyscall(ctx, SCMP_SYS(dup));
+ allowSyscall(ctx, SCMP_SYS(dup2));
+ allowSyscall(ctx, SCMP_SYS(dup3));
+ allowSyscall(ctx, SCMP_SYS(epoll_create));
+ allowSyscall(ctx, SCMP_SYS(epoll_create1));
+ allowSyscall(ctx, SCMP_SYS(epoll_ctl));
+ allowSyscall(ctx, SCMP_SYS(epoll_ctl_old));
+ allowSyscall(ctx, SCMP_SYS(epoll_pwait));
+ allowSyscall(ctx, SCMP_SYS(epoll_pwait2));
+ allowSyscall(ctx, SCMP_SYS(epoll_wait));
+ allowSyscall(ctx, SCMP_SYS(epoll_wait_old));
+ allowSyscall(ctx, SCMP_SYS(eventfd));
+ allowSyscall(ctx, SCMP_SYS(eventfd2));
+ allowSyscall(ctx, SCMP_SYS(execve));
+ allowSyscall(ctx, SCMP_SYS(execveat));
+ allowSyscall(ctx, SCMP_SYS(exit));
+ allowSyscall(ctx, SCMP_SYS(exit_group));
+ allowSyscall(ctx, SCMP_SYS(faccessat));
+ allowSyscall(ctx, SCMP_SYS(faccessat2));
+ allowSyscall(ctx, SCMP_SYS(fadvise64));
+ allowSyscall(ctx, SCMP_SYS(fadvise64_64));
+ allowSyscall(ctx, SCMP_SYS(fallocate));
+ allowSyscall(ctx, SCMP_SYS(fanotify_init));
+ allowSyscall(ctx, SCMP_SYS(fanotify_mark));
+ allowSyscall(ctx, SCMP_SYS(fchdir));
+ // skip fchmod (dangerous)
+ // skip fchmodat (dangerous)
+ // skip fchmodat2 (dangerous)
+ allowSyscall(ctx, SCMP_SYS(fchown));
+ allowSyscall(ctx, SCMP_SYS(fchown32));
+ allowSyscall(ctx, SCMP_SYS(fchownat));
+ allowSyscall(ctx, SCMP_SYS(fcntl));
+ allowSyscall(ctx, SCMP_SYS(fcntl64));
+ allowSyscall(ctx, SCMP_SYS(fdatasync));
+ allowSyscall(ctx, SCMP_SYS(fgetxattr));
+ allowSyscall(ctx, SCMP_SYS(finit_module));
+ allowSyscall(ctx, SCMP_SYS(flistxattr));
+ allowSyscall(ctx, SCMP_SYS(flock));
+ allowSyscall(ctx, SCMP_SYS(fork));
+ allowSyscall(ctx, SCMP_SYS(fremovexattr));
+ allowSyscall(ctx, SCMP_SYS(fsconfig));
+ // skip fsetxattr (dangerous)
+ allowSyscall(ctx, SCMP_SYS(fsmount));
+ allowSyscall(ctx, SCMP_SYS(fsopen));
+ allowSyscall(ctx, SCMP_SYS(fspick));
+ allowSyscall(ctx, SCMP_SYS(fstat));
+ allowSyscall(ctx, SCMP_SYS(fstat64));
+ allowSyscall(ctx, SCMP_SYS(fstatat64));
+ allowSyscall(ctx, SCMP_SYS(fstatfs));
+ allowSyscall(ctx, SCMP_SYS(fstatfs64));
+ allowSyscall(ctx, SCMP_SYS(fsync));
+ allowSyscall(ctx, SCMP_SYS(ftime));
+ allowSyscall(ctx, SCMP_SYS(ftruncate));
+ allowSyscall(ctx, SCMP_SYS(ftruncate64));
+ allowSyscall(ctx, SCMP_SYS(futex));
+ allowSyscall(ctx, SCMP_SYS(futex_requeue));
+ allowSyscall(ctx, SCMP_SYS(futex_time64));
+ allowSyscall(ctx, SCMP_SYS(futex_wait));
+ allowSyscall(ctx, SCMP_SYS(futex_waitv));
+ allowSyscall(ctx, SCMP_SYS(futex_wake));
+ allowSyscall(ctx, SCMP_SYS(futimesat));
+ allowSyscall(ctx, SCMP_SYS(getcpu));
+ allowSyscall(ctx, SCMP_SYS(getcwd));
+ allowSyscall(ctx, SCMP_SYS(getdents));
+ allowSyscall(ctx, SCMP_SYS(getdents64));
+ allowSyscall(ctx, SCMP_SYS(getegid));
+ allowSyscall(ctx, SCMP_SYS(getegid32));
+ allowSyscall(ctx, SCMP_SYS(geteuid));
+ allowSyscall(ctx, SCMP_SYS(geteuid32));
+ allowSyscall(ctx, SCMP_SYS(getgid));
+ allowSyscall(ctx, SCMP_SYS(getgid32));
+ allowSyscall(ctx, SCMP_SYS(getgroups));
+ allowSyscall(ctx, SCMP_SYS(getgroups32));
+ allowSyscall(ctx, SCMP_SYS(getitimer));
+ allowSyscall(ctx, SCMP_SYS(get_kernel_syms));
+ allowSyscall(ctx, SCMP_SYS(get_mempolicy));
+ allowSyscall(ctx, SCMP_SYS(getpeername));
+ allowSyscall(ctx, SCMP_SYS(getpgid));
+ allowSyscall(ctx, SCMP_SYS(getpgrp));
+ allowSyscall(ctx, SCMP_SYS(getpid));
+ allowSyscall(ctx, SCMP_SYS(getpmsg));
+ allowSyscall(ctx, SCMP_SYS(getppid));
+ allowSyscall(ctx, SCMP_SYS(getpriority));
+ allowSyscall(ctx, SCMP_SYS(getrandom));
+ allowSyscall(ctx, SCMP_SYS(getresgid));
+ allowSyscall(ctx, SCMP_SYS(getresgid32));
+ allowSyscall(ctx, SCMP_SYS(getresuid));
+ allowSyscall(ctx, SCMP_SYS(getresuid32));
+ allowSyscall(ctx, SCMP_SYS(getrlimit));
+ allowSyscall(ctx, SCMP_SYS(get_robust_list));
+ allowSyscall(ctx, SCMP_SYS(getrusage));
+ allowSyscall(ctx, SCMP_SYS(getsid));
+ allowSyscall(ctx, SCMP_SYS(getsockname));
+ allowSyscall(ctx, SCMP_SYS(getsockopt));
+ allowSyscall(ctx, SCMP_SYS(get_thread_area));
+ allowSyscall(ctx, SCMP_SYS(gettid));
+ allowSyscall(ctx, SCMP_SYS(gettimeofday));
+ allowSyscall(ctx, SCMP_SYS(get_tls));
+ allowSyscall(ctx, SCMP_SYS(getuid));
+ allowSyscall(ctx, SCMP_SYS(getuid32));
+ allowSyscall(ctx, SCMP_SYS(getxattr));
+ allowSyscall(ctx, SCMP_SYS(gtty));
+ allowSyscall(ctx, SCMP_SYS(idle));
+ allowSyscall(ctx, SCMP_SYS(init_module));
+ allowSyscall(ctx, SCMP_SYS(inotify_add_watch));
+ allowSyscall(ctx, SCMP_SYS(inotify_init));
+ allowSyscall(ctx, SCMP_SYS(inotify_init1));
+ allowSyscall(ctx, SCMP_SYS(inotify_rm_watch));
+ allowSyscall(ctx, SCMP_SYS(io_cancel));
+ allowSyscall(ctx, SCMP_SYS(ioctl));
+ allowSyscall(ctx, SCMP_SYS(io_destroy));
+ allowSyscall(ctx, SCMP_SYS(io_getevents));
+ allowSyscall(ctx, SCMP_SYS(ioperm));
+ allowSyscall(ctx, SCMP_SYS(io_pgetevents));
+ allowSyscall(ctx, SCMP_SYS(io_pgetevents_time64));
+ allowSyscall(ctx, SCMP_SYS(iopl));
+ allowSyscall(ctx, SCMP_SYS(ioprio_get));
+ allowSyscall(ctx, SCMP_SYS(ioprio_set));
+ allowSyscall(ctx, SCMP_SYS(io_setup));
+ allowSyscall(ctx, SCMP_SYS(io_submit));
+ allowSyscall(ctx, SCMP_SYS(io_uring_enter));
+ allowSyscall(ctx, SCMP_SYS(io_uring_register));
+ allowSyscall(ctx, SCMP_SYS(io_uring_setup));
+ allowSyscall(ctx, SCMP_SYS(ipc));
+ allowSyscall(ctx, SCMP_SYS(kcmp));
+ allowSyscall(ctx, SCMP_SYS(kexec_file_load));
+ allowSyscall(ctx, SCMP_SYS(kexec_load));
+ allowSyscall(ctx, SCMP_SYS(keyctl));
+ allowSyscall(ctx, SCMP_SYS(kill));
+ allowSyscall(ctx, SCMP_SYS(landlock_add_rule));
+ allowSyscall(ctx, SCMP_SYS(landlock_create_ruleset));
+ allowSyscall(ctx, SCMP_SYS(landlock_restrict_self));
+ allowSyscall(ctx, SCMP_SYS(lchown));
+ allowSyscall(ctx, SCMP_SYS(lchown32));
+ allowSyscall(ctx, SCMP_SYS(lgetxattr));
+ allowSyscall(ctx, SCMP_SYS(link));
+ allowSyscall(ctx, SCMP_SYS(linkat));
+ allowSyscall(ctx, SCMP_SYS(listen));
+ allowSyscall(ctx, SCMP_SYS(listxattr));
+ allowSyscall(ctx, SCMP_SYS(llistxattr));
+ allowSyscall(ctx, SCMP_SYS(_llseek));
+ allowSyscall(ctx, SCMP_SYS(lock));
+ allowSyscall(ctx, SCMP_SYS(lookup_dcookie));
+ allowSyscall(ctx, SCMP_SYS(lremovexattr));
+ allowSyscall(ctx, SCMP_SYS(lseek));
+ // skip lsetxattr (dangerous)
+ allowSyscall(ctx, SCMP_SYS(lstat));
+ allowSyscall(ctx, SCMP_SYS(lstat64));
+ allowSyscall(ctx, SCMP_SYS(madvise));
+ allowSyscall(ctx, SCMP_SYS(map_shadow_stack));
+ allowSyscall(ctx, SCMP_SYS(mbind));
+ allowSyscall(ctx, SCMP_SYS(membarrier));
+ allowSyscall(ctx, SCMP_SYS(memfd_create));
+ allowSyscall(ctx, SCMP_SYS(memfd_secret));
+ allowSyscall(ctx, SCMP_SYS(migrate_pages));
+ allowSyscall(ctx, SCMP_SYS(mincore));
+ allowSyscall(ctx, SCMP_SYS(mkdir));
+ allowSyscall(ctx, SCMP_SYS(mkdirat));
+ allowSyscall(ctx, SCMP_SYS(mknod));
+ allowSyscall(ctx, SCMP_SYS(mknodat));
+ allowSyscall(ctx, SCMP_SYS(mlock));
+ allowSyscall(ctx, SCMP_SYS(mlock2));
+ allowSyscall(ctx, SCMP_SYS(mlockall));
+ allowSyscall(ctx, SCMP_SYS(mmap));
+ allowSyscall(ctx, SCMP_SYS(mmap2));
+ allowSyscall(ctx, SCMP_SYS(modify_ldt));
+ allowSyscall(ctx, SCMP_SYS(mount));
+ allowSyscall(ctx, SCMP_SYS(mount_setattr));
+ allowSyscall(ctx, SCMP_SYS(move_mount));
+ allowSyscall(ctx, SCMP_SYS(move_pages));
+ allowSyscall(ctx, SCMP_SYS(mprotect));
+ allowSyscall(ctx, SCMP_SYS(mpx));
+ allowSyscall(ctx, SCMP_SYS(mq_getsetattr));
+ allowSyscall(ctx, SCMP_SYS(mq_notify));
+ allowSyscall(ctx, SCMP_SYS(mq_open));
+ allowSyscall(ctx, SCMP_SYS(mq_timedreceive));
+ allowSyscall(ctx, SCMP_SYS(mq_timedreceive_time64));
+ allowSyscall(ctx, SCMP_SYS(mq_timedsend));
+ allowSyscall(ctx, SCMP_SYS(mq_timedsend_time64));
+ allowSyscall(ctx, SCMP_SYS(mq_unlink));
+ allowSyscall(ctx, SCMP_SYS(mremap));
+ allowSyscall(ctx, SCMP_SYS(msgctl));
+ allowSyscall(ctx, SCMP_SYS(msgget));
+ allowSyscall(ctx, SCMP_SYS(msgrcv));
+ allowSyscall(ctx, SCMP_SYS(msgsnd));
+ allowSyscall(ctx, SCMP_SYS(msync));
+ allowSyscall(ctx, SCMP_SYS(multiplexer));
+ allowSyscall(ctx, SCMP_SYS(munlock));
+ allowSyscall(ctx, SCMP_SYS(munlockall));
+ allowSyscall(ctx, SCMP_SYS(munmap));
+ allowSyscall(ctx, SCMP_SYS(name_to_handle_at));
+ allowSyscall(ctx, SCMP_SYS(nanosleep));
+ allowSyscall(ctx, SCMP_SYS(newfstatat));
+ allowSyscall(ctx, SCMP_SYS(_newselect));
+ allowSyscall(ctx, SCMP_SYS(nfsservctl));
+ allowSyscall(ctx, SCMP_SYS(nice));
+ allowSyscall(ctx, SCMP_SYS(oldfstat));
+ allowSyscall(ctx, SCMP_SYS(oldlstat));
+ allowSyscall(ctx, SCMP_SYS(oldolduname));
+ allowSyscall(ctx, SCMP_SYS(oldstat));
+ allowSyscall(ctx, SCMP_SYS(olduname));
+ allowSyscall(ctx, SCMP_SYS(open));
+ allowSyscall(ctx, SCMP_SYS(openat));
+ allowSyscall(ctx, SCMP_SYS(openat2));
+ allowSyscall(ctx, SCMP_SYS(open_by_handle_at));
+ allowSyscall(ctx, SCMP_SYS(open_tree));
+ allowSyscall(ctx, SCMP_SYS(pause));
+ allowSyscall(ctx, SCMP_SYS(pciconfig_iobase));
+ allowSyscall(ctx, SCMP_SYS(pciconfig_read));
+ allowSyscall(ctx, SCMP_SYS(pciconfig_write));
+ allowSyscall(ctx, SCMP_SYS(perf_event_open));
+ allowSyscall(ctx, SCMP_SYS(personality));
+ allowSyscall(ctx, SCMP_SYS(pidfd_getfd));
+ allowSyscall(ctx, SCMP_SYS(pidfd_open));
+ allowSyscall(ctx, SCMP_SYS(pidfd_send_signal));
+ allowSyscall(ctx, SCMP_SYS(pipe));
+ allowSyscall(ctx, SCMP_SYS(pipe2));
+ allowSyscall(ctx, SCMP_SYS(pivot_root));
+ allowSyscall(ctx, SCMP_SYS(pkey_alloc));
+ allowSyscall(ctx, SCMP_SYS(pkey_free));
+ allowSyscall(ctx, SCMP_SYS(pkey_mprotect));
+ allowSyscall(ctx, SCMP_SYS(poll));
+ allowSyscall(ctx, SCMP_SYS(ppoll));
+ allowSyscall(ctx, SCMP_SYS(ppoll_time64));
+ allowSyscall(ctx, SCMP_SYS(prctl));
+ allowSyscall(ctx, SCMP_SYS(pread64));
+ allowSyscall(ctx, SCMP_SYS(preadv));
+ allowSyscall(ctx, SCMP_SYS(preadv2));
+ allowSyscall(ctx, SCMP_SYS(prlimit64));
+ allowSyscall(ctx, SCMP_SYS(process_madvise));
+ allowSyscall(ctx, SCMP_SYS(process_mrelease));
+ allowSyscall(ctx, SCMP_SYS(process_vm_readv));
+ allowSyscall(ctx, SCMP_SYS(process_vm_writev));
+ allowSyscall(ctx, SCMP_SYS(prof));
+ allowSyscall(ctx, SCMP_SYS(profil));
+ allowSyscall(ctx, SCMP_SYS(pselect6));
+ allowSyscall(ctx, SCMP_SYS(pselect6_time64));
+ allowSyscall(ctx, SCMP_SYS(ptrace));
+ allowSyscall(ctx, SCMP_SYS(putpmsg));
+ allowSyscall(ctx, SCMP_SYS(pwrite64));
+ allowSyscall(ctx, SCMP_SYS(pwritev));
+ allowSyscall(ctx, SCMP_SYS(pwritev2));
+ allowSyscall(ctx, SCMP_SYS(query_module));
+ allowSyscall(ctx, SCMP_SYS(quotactl));
+ allowSyscall(ctx, SCMP_SYS(quotactl_fd));
+ allowSyscall(ctx, SCMP_SYS(read));
+ allowSyscall(ctx, SCMP_SYS(readahead));
+ allowSyscall(ctx, SCMP_SYS(readdir));
+ allowSyscall(ctx, SCMP_SYS(readlink));
+ allowSyscall(ctx, SCMP_SYS(readlinkat));
+ allowSyscall(ctx, SCMP_SYS(readv));
+ allowSyscall(ctx, SCMP_SYS(reboot));
+ allowSyscall(ctx, SCMP_SYS(recv));
+ allowSyscall(ctx, SCMP_SYS(recvfrom));
+ allowSyscall(ctx, SCMP_SYS(recvmmsg));
+ allowSyscall(ctx, SCMP_SYS(recvmmsg_time64));
+ allowSyscall(ctx, SCMP_SYS(recvmsg));
+ allowSyscall(ctx, SCMP_SYS(remap_file_pages));
+ allowSyscall(ctx, SCMP_SYS(removexattr));
+ allowSyscall(ctx, SCMP_SYS(rename));
+ allowSyscall(ctx, SCMP_SYS(renameat));
+ allowSyscall(ctx, SCMP_SYS(renameat2));
+ allowSyscall(ctx, SCMP_SYS(request_key));
+ allowSyscall(ctx, SCMP_SYS(restart_syscall));
+ allowSyscall(ctx, SCMP_SYS(riscv_flush_icache));
+ allowSyscall(ctx, SCMP_SYS(rmdir));
+ allowSyscall(ctx, SCMP_SYS(rseq));
+ allowSyscall(ctx, SCMP_SYS(rtas));
+ allowSyscall(ctx, SCMP_SYS(rt_sigaction));
+ allowSyscall(ctx, SCMP_SYS(rt_sigpending));
+ allowSyscall(ctx, SCMP_SYS(rt_sigprocmask));
+ allowSyscall(ctx, SCMP_SYS(rt_sigqueueinfo));
+ allowSyscall(ctx, SCMP_SYS(rt_sigreturn));
+ allowSyscall(ctx, SCMP_SYS(rt_sigsuspend));
+ allowSyscall(ctx, SCMP_SYS(rt_sigtimedwait));
+ allowSyscall(ctx, SCMP_SYS(rt_sigtimedwait_time64));
+ allowSyscall(ctx, SCMP_SYS(rt_tgsigqueueinfo));
+ allowSyscall(ctx, SCMP_SYS(s390_guarded_storage));
+ allowSyscall(ctx, SCMP_SYS(s390_pci_mmio_read));
+ allowSyscall(ctx, SCMP_SYS(s390_pci_mmio_write));
+ allowSyscall(ctx, SCMP_SYS(s390_runtime_instr));
+ allowSyscall(ctx, SCMP_SYS(s390_sthyi));
+ allowSyscall(ctx, SCMP_SYS(sched_getaffinity));
+ allowSyscall(ctx, SCMP_SYS(sched_getattr));
+ allowSyscall(ctx, SCMP_SYS(sched_getparam));
+ allowSyscall(ctx, SCMP_SYS(sched_get_priority_max));
+ allowSyscall(ctx, SCMP_SYS(sched_get_priority_min));
+ allowSyscall(ctx, SCMP_SYS(sched_getscheduler));
+ allowSyscall(ctx, SCMP_SYS(sched_rr_get_interval));
+ allowSyscall(ctx, SCMP_SYS(sched_rr_get_interval_time64));
+ allowSyscall(ctx, SCMP_SYS(sched_setaffinity));
+ allowSyscall(ctx, SCMP_SYS(sched_setattr));
+ allowSyscall(ctx, SCMP_SYS(sched_setparam));
+ allowSyscall(ctx, SCMP_SYS(sched_setscheduler));
+ allowSyscall(ctx, SCMP_SYS(sched_yield));
+ allowSyscall(ctx, SCMP_SYS(seccomp));
+ allowSyscall(ctx, SCMP_SYS(security));
+ allowSyscall(ctx, SCMP_SYS(select));
+ allowSyscall(ctx, SCMP_SYS(semctl));
+ allowSyscall(ctx, SCMP_SYS(semget));
+ allowSyscall(ctx, SCMP_SYS(semop));
+ allowSyscall(ctx, SCMP_SYS(semtimedop));
+ allowSyscall(ctx, SCMP_SYS(semtimedop_time64));
+ allowSyscall(ctx, SCMP_SYS(send));
+ allowSyscall(ctx, SCMP_SYS(sendfile));
+ allowSyscall(ctx, SCMP_SYS(sendfile64));
+ allowSyscall(ctx, SCMP_SYS(sendmmsg));
+ allowSyscall(ctx, SCMP_SYS(sendmsg));
+ allowSyscall(ctx, SCMP_SYS(sendto));
+ allowSyscall(ctx, SCMP_SYS(setdomainname));
+ allowSyscall(ctx, SCMP_SYS(setfsgid));
+ allowSyscall(ctx, SCMP_SYS(setfsgid32));
+ allowSyscall(ctx, SCMP_SYS(setfsuid));
+ allowSyscall(ctx, SCMP_SYS(setfsuid32));
+ allowSyscall(ctx, SCMP_SYS(setgid));
+ allowSyscall(ctx, SCMP_SYS(setgid32));
+ allowSyscall(ctx, SCMP_SYS(setgroups));
+ allowSyscall(ctx, SCMP_SYS(setgroups32));
+ allowSyscall(ctx, SCMP_SYS(sethostname));
+ allowSyscall(ctx, SCMP_SYS(setitimer));
+ allowSyscall(ctx, SCMP_SYS(set_mempolicy));
+ allowSyscall(ctx, SCMP_SYS(set_mempolicy_home_node));
+ allowSyscall(ctx, SCMP_SYS(setns));
+ allowSyscall(ctx, SCMP_SYS(setpgid));
+ allowSyscall(ctx, SCMP_SYS(setpriority));
+ allowSyscall(ctx, SCMP_SYS(setregid));
+ allowSyscall(ctx, SCMP_SYS(setregid32));
+ allowSyscall(ctx, SCMP_SYS(setresgid));
+ allowSyscall(ctx, SCMP_SYS(setresgid32));
+ allowSyscall(ctx, SCMP_SYS(setresuid));
+ allowSyscall(ctx, SCMP_SYS(setresuid32));
+ allowSyscall(ctx, SCMP_SYS(setreuid));
+ allowSyscall(ctx, SCMP_SYS(setreuid32));
+ allowSyscall(ctx, SCMP_SYS(setrlimit));
+ allowSyscall(ctx, SCMP_SYS(set_robust_list));
+ allowSyscall(ctx, SCMP_SYS(setsid));
+ allowSyscall(ctx, SCMP_SYS(setsockopt));
+ allowSyscall(ctx, SCMP_SYS(set_thread_area));
+ allowSyscall(ctx, SCMP_SYS(set_tid_address));
+ allowSyscall(ctx, SCMP_SYS(settimeofday));
+ allowSyscall(ctx, SCMP_SYS(set_tls));
+ allowSyscall(ctx, SCMP_SYS(setuid));
+ allowSyscall(ctx, SCMP_SYS(setuid32));
+ // skip setxattr (dangerous)
+ allowSyscall(ctx, SCMP_SYS(sgetmask));
+ allowSyscall(ctx, SCMP_SYS(shmat));
+ allowSyscall(ctx, SCMP_SYS(shmctl));
+ allowSyscall(ctx, SCMP_SYS(shmdt));
+ allowSyscall(ctx, SCMP_SYS(shmget));
+ allowSyscall(ctx, SCMP_SYS(shutdown));
+ allowSyscall(ctx, SCMP_SYS(sigaction));
+ allowSyscall(ctx, SCMP_SYS(sigaltstack));
+ allowSyscall(ctx, SCMP_SYS(signal));
+ allowSyscall(ctx, SCMP_SYS(signalfd));
+ allowSyscall(ctx, SCMP_SYS(signalfd4));
+ allowSyscall(ctx, SCMP_SYS(sigpending));
+ allowSyscall(ctx, SCMP_SYS(sigprocmask));
+ allowSyscall(ctx, SCMP_SYS(sigreturn));
+ allowSyscall(ctx, SCMP_SYS(sigsuspend));
+ allowSyscall(ctx, SCMP_SYS(socket));
+ allowSyscall(ctx, SCMP_SYS(socketcall));
+ allowSyscall(ctx, SCMP_SYS(socketpair));
+ allowSyscall(ctx, SCMP_SYS(splice));
+ allowSyscall(ctx, SCMP_SYS(spu_create));
+ allowSyscall(ctx, SCMP_SYS(spu_run));
+ allowSyscall(ctx, SCMP_SYS(ssetmask));
+ allowSyscall(ctx, SCMP_SYS(stat));
+ allowSyscall(ctx, SCMP_SYS(stat64));
+ allowSyscall(ctx, SCMP_SYS(statfs));
+ allowSyscall(ctx, SCMP_SYS(statfs64));
+ allowSyscall(ctx, SCMP_SYS(statx));
+ allowSyscall(ctx, SCMP_SYS(stime));
+ allowSyscall(ctx, SCMP_SYS(stty));
+ allowSyscall(ctx, SCMP_SYS(subpage_prot));
+ allowSyscall(ctx, SCMP_SYS(swapcontext));
+ allowSyscall(ctx, SCMP_SYS(swapoff));
+ allowSyscall(ctx, SCMP_SYS(swapon));
+ allowSyscall(ctx, SCMP_SYS(switch_endian));
+ allowSyscall(ctx, SCMP_SYS(symlink));
+ allowSyscall(ctx, SCMP_SYS(symlinkat));
+ allowSyscall(ctx, SCMP_SYS(sync));
+ allowSyscall(ctx, SCMP_SYS(sync_file_range));
+ allowSyscall(ctx, SCMP_SYS(sync_file_range2));
+ allowSyscall(ctx, SCMP_SYS(syncfs));
+ allowSyscall(ctx, SCMP_SYS(syscall));
+ allowSyscall(ctx, SCMP_SYS(_sysctl));
+ allowSyscall(ctx, SCMP_SYS(sys_debug_setcontext));
+ allowSyscall(ctx, SCMP_SYS(sysfs));
+ allowSyscall(ctx, SCMP_SYS(sysinfo));
+ allowSyscall(ctx, SCMP_SYS(syslog));
+ allowSyscall(ctx, SCMP_SYS(sysmips));
+ allowSyscall(ctx, SCMP_SYS(tee));
+ allowSyscall(ctx, SCMP_SYS(tgkill));
+ allowSyscall(ctx, SCMP_SYS(time));
+ allowSyscall(ctx, SCMP_SYS(timer_create));
+ allowSyscall(ctx, SCMP_SYS(timer_delete));
+ allowSyscall(ctx, SCMP_SYS(timerfd));
+ allowSyscall(ctx, SCMP_SYS(timerfd_create));
+ allowSyscall(ctx, SCMP_SYS(timerfd_gettime));
+ allowSyscall(ctx, SCMP_SYS(timerfd_gettime64));
+ allowSyscall(ctx, SCMP_SYS(timerfd_settime));
+ allowSyscall(ctx, SCMP_SYS(timerfd_settime64));
+ allowSyscall(ctx, SCMP_SYS(timer_getoverrun));
+ allowSyscall(ctx, SCMP_SYS(timer_gettime));
+ allowSyscall(ctx, SCMP_SYS(timer_gettime64));
+ allowSyscall(ctx, SCMP_SYS(timer_settime));
+ allowSyscall(ctx, SCMP_SYS(timer_settime64));
+ allowSyscall(ctx, SCMP_SYS(times));
+ allowSyscall(ctx, SCMP_SYS(tkill));
+ allowSyscall(ctx, SCMP_SYS(truncate));
+ allowSyscall(ctx, SCMP_SYS(truncate64));
+ allowSyscall(ctx, SCMP_SYS(tuxcall));
+ allowSyscall(ctx, SCMP_SYS(ugetrlimit));
+ allowSyscall(ctx, SCMP_SYS(ulimit));
+ allowSyscall(ctx, SCMP_SYS(umask));
+ allowSyscall(ctx, SCMP_SYS(umount));
+ allowSyscall(ctx, SCMP_SYS(umount2));
+ allowSyscall(ctx, SCMP_SYS(uname));
+ allowSyscall(ctx, SCMP_SYS(unlink));
+ allowSyscall(ctx, SCMP_SYS(unlinkat));
+ allowSyscall(ctx, SCMP_SYS(unshare));
+ allowSyscall(ctx, SCMP_SYS(uselib));
+ allowSyscall(ctx, SCMP_SYS(userfaultfd));
+ allowSyscall(ctx, SCMP_SYS(usr26));
+ allowSyscall(ctx, SCMP_SYS(usr32));
+ allowSyscall(ctx, SCMP_SYS(ustat));
+ allowSyscall(ctx, SCMP_SYS(utime));
+ allowSyscall(ctx, SCMP_SYS(utimensat));
+ allowSyscall(ctx, SCMP_SYS(utimensat_time64));
+ allowSyscall(ctx, SCMP_SYS(utimes));
+ allowSyscall(ctx, SCMP_SYS(vfork));
+ allowSyscall(ctx, SCMP_SYS(vhangup));
+ allowSyscall(ctx, SCMP_SYS(vm86));
+ allowSyscall(ctx, SCMP_SYS(vm86old));
+ allowSyscall(ctx, SCMP_SYS(vmsplice));
+ allowSyscall(ctx, SCMP_SYS(vserver));
+ allowSyscall(ctx, SCMP_SYS(wait4));
+ allowSyscall(ctx, SCMP_SYS(waitid));
+ allowSyscall(ctx, SCMP_SYS(waitpid));
+ allowSyscall(ctx, SCMP_SYS(write));
+ allowSyscall(ctx, SCMP_SYS(writev));
+ // END extract-syscalls
+
+ // chmod family: prevent adding setuid/setgid bits to existing files.
+ // The Nix store does not support setuid/setgid, and even their temporary creation can weaken the security of the sandbox.
+ ALLOW_CHMOD_IF_SAFE(ctx, SCMP_SYS(chmod), 1);
+ ALLOW_CHMOD_IF_SAFE(ctx, SCMP_SYS(fchmod), 1);
+ ALLOW_CHMOD_IF_SAFE(ctx, SCMP_SYS(fchmodat), 2);
+ ALLOW_CHMOD_IF_SAFE(ctx, SCMP_SYS(fchmodat2), 2);
+
+ // setxattr family: prevent creation of extended attributes or ACLs.
+ // Not all filesystems support them, and they're incompatible with the NAR format.
if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOTSUP), SCMP_SYS(setxattr), 0) != 0 ||
seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOTSUP), SCMP_SYS(lsetxattr), 0) != 0 ||
seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOTSUP), SCMP_SYS(fsetxattr), 0) != 0)
diff --git a/src/libstore/linux/fchmodat2-compat.hh b/src/libstore/linux/fchmodat2-compat.hh
deleted file mode 100644
index d5ef81e22..000000000
--- a/src/libstore/linux/fchmodat2-compat.hh
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Determine the syscall number for `fchmodat2`.
- *
- * On most platforms this is 452. Exceptions can be found on
- * a glibc git checkout via `rg --pcre2 'define __NR_fchmodat2 (?!452)'`.
- *
- * The problem is that glibc 2.39 and libseccomp 2.5.5 are needed to
- * get the syscall number. However, a Lix built against nixpkgs 23.11
- * (glibc 2.38) should still have the issue fixed without depending
- * on the build environment.
- *
- * To achieve that, the macros below try to determine the platform and
- * set the syscall number which is platform-specific, but
- * in most cases 452.
- *
- * TODO: remove this when 23.11 is EOL and the entire (supported) ecosystem
- * is on glibc 2.39.
- */
-
-#pragma once
-///@file
-
-#if defined(__alpha__)
-# define NIX_SYSCALL_FCHMODAT2 562
-#elif defined(__x86_64__) && SIZE_MAX == 0xFFFFFFFF // x32
-# define NIX_SYSCALL_FCHMODAT2 1073742276
-#elif defined(__mips__) && defined(__mips64) && defined(_ABIN64) // mips64/n64
-# define NIX_SYSCALL_FCHMODAT2 5452
-#elif defined(__mips__) && defined(__mips64) && defined(_ABIN32) // mips64/n32
-# define NIX_SYSCALL_FCHMODAT2 6452
-#elif defined(__mips__) && defined(_ABIO32) // mips32
-# define NIX_SYSCALL_FCHMODAT2 4452
-#else
-# define NIX_SYSCALL_FCHMODAT2 452
-#endif
diff --git a/tests/nixos/setuid/fchmodat2-suid.c b/tests/nixos/setuid/fchmodat2-suid.c
index 931489ad7..7280331d5 100644
--- a/tests/nixos/setuid/fchmodat2-suid.c
+++ b/tests/nixos/setuid/fchmodat2-suid.c
@@ -12,10 +12,7 @@ int main(void) {
fprintf(fd, "henlo :3");
fclose(fd);
- // FIXME use something nicer here that's less
- // platform-dependent as soon as we go to 24.05
- // and the glibc is new enough to support fchmodat2
- long rs = syscall(452, NULL, name, S_ISUID, 0);
+ long rs = syscall(SYS_fchmodat2, NULL, name, S_ISUID, 0);
assert(rs == -1);
assert(errno == EPERM);
}