diff options
-rw-r--r-- | doc/manual/rl-next/block-io-uring.md | 12 | ||||
-rw-r--r-- | flake.nix | 1 | ||||
-rw-r--r-- | maintainers/check-syscalls.nix | 16 | ||||
-rwxr-xr-x | maintainers/check-syscalls.sh | 7 | ||||
-rw-r--r-- | package.nix | 2 | ||||
-rw-r--r-- | src/libstore/build/local-derivation-goal.cc | 555 | ||||
-rw-r--r-- | src/libstore/linux/fchmodat2-compat.hh | 35 | ||||
-rw-r--r-- | tests/nixos/default.nix | 2 | ||||
-rw-r--r-- | tests/nixos/io_uring/default.nix | 7 | ||||
-rw-r--r-- | tests/nixos/io_uring/package.nix | 19 | ||||
-rw-r--r-- | tests/nixos/setuid/fchmodat2-suid.c | 5 |
11 files changed, 593 insertions, 68 deletions
diff --git a/doc/manual/rl-next/block-io-uring.md b/doc/manual/rl-next/block-io-uring.md new file mode 100644 index 000000000..6ebba9a20 --- /dev/null +++ b/doc/manual/rl-next/block-io-uring.md @@ -0,0 +1,12 @@ +--- +synopsis: "Block io_uring in the Linux sandbox" +cls: 1611 +credits: alois31 +category: Breaking Changes +--- + +The io\_uring API has the unfortunate property that it is not possible to selectively decide which operations should be allowed. +This, together with the fact that new operations are routinely added, makes it a hazard to the proper functioning of the sandbox. + +Therefore, io\_uring has been made entirely unavailable inside the sandbox. +As such, attempts to execute any system calls forming part of this API will fail with the error `ENOSYS`, as if io\_uring support had not been configured into the kernel. diff --git a/flake.nix b/flake.nix --- a/flake.nix +++ b/flake.nix @@ -164,6 +164,7 @@ nixUnstable = prev.nixUnstable; check-headers = final.buildPackages.callPackage ./maintainers/check-headers.nix { }; + check-syscalls = final.buildPackages.callPackage ./maintainers/check-syscalls.nix { }; default-busybox-sandbox-shell = final.busybox.override { useMusl = true; diff --git a/maintainers/check-syscalls.nix b/maintainers/check-syscalls.nix new file mode 100644 index 000000000..1a3de5c6d --- /dev/null +++ b/maintainers/check-syscalls.nix @@ -0,0 +1,16 @@ +{ + runCommandNoCC, + lib, + libseccomp, + writeShellScriptBin, +}: +let + syscalls-csv = runCommandNoCC "syscalls.csv" { } '' + echo ${lib.escapeShellArg libseccomp.src} + tar -xf ${lib.escapeShellArg libseccomp.src} --strip-components=2 ${libseccomp.name}/src/syscalls.csv + mv syscalls.csv "$out" + ''; +in +writeShellScriptBin "check-syscalls" '' + ${./check-syscalls.sh} ${syscalls-csv} +'' diff --git a/maintainers/check-syscalls.sh b/maintainers/check-syscalls.sh new file mode 100755 index 000000000..cd72ac23b --- /dev/null +++ b/maintainers/check-syscalls.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +set -e + +diff -u <(awk < 
src/libstore/build/local-derivation-goal.cc '/BEGIN extract-syscalls/ { extracting = 1; next } +match($0, /allowSyscall\(ctx, SCMP_SYS\(([^)]*)\)\);|\/\/ skip ([^ ]*)/, result) { print result[1] result[2] } +/END extract-syscalls/ { extracting = 0; next }') <(tail -n+2 "$1" | cut -d, -f 1) diff --git a/package.nix b/package.nix index bde48ae4b..e5e0033df 100644 --- a/package.nix +++ b/package.nix @@ -391,6 +391,7 @@ stdenv.mkDerivation (finalAttrs: { # Lix specific packages pre-commit-checks, contribNotice, + check-syscalls, }: let glibcFix = lib.optionalAttrs (buildPlatform.isLinux && glibcLocales != null) { @@ -441,6 +442,7 @@ stdenv.mkDerivation (finalAttrs: { # `bash` from inside `nix develop`, say, because you are using it # via direnv, you will by default get bash (unusable edition). bashInteractive + check-syscalls pythonEnv # docker image tool skopeo diff --git a/src/libstore/build/local-derivation-goal.cc b/src/libstore/build/local-derivation-goal.cc index da2433326..2f1f338c1 100644 --- a/src/libstore/build/local-derivation-goal.cc +++ b/src/libstore/build/local-derivation-goal.cc @@ -45,7 +45,6 @@ #include <sys/prctl.h> #include <sys/syscall.h> #if HAVE_SECCOMP -#include "linux/fchmodat2-compat.hh" #include <seccomp.h> #endif #define pivot_root(new_root, put_old) (syscall(SYS_pivot_root, new_root, put_old)) @@ -1363,6 +1362,20 @@ void LocalDerivationGoal::chownToBuilder(const Path & path) throw SysError("cannot change ownership of '%1%'", path); } +#if HAVE_SECCOMP + +static void allowSyscall(scmp_filter_ctx ctx, int syscall) { + if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 0) != 0) + throw SysError("unable to add seccomp rule"); +} + +#define ALLOW_CHMOD_IF_SAFE(ctx, syscall, modePos) \ + if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 1, SCMP_A##modePos(SCMP_CMP_MASKED_EQ, S_ISUID | S_ISGID, 0)) != 0 || \ + seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), syscall, 1, SCMP_A##modePos(SCMP_CMP_MASKED_EQ, S_ISUID, S_ISUID)) != 0 || \ + 
seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), syscall, 1, SCMP_A##modePos(SCMP_CMP_MASKED_EQ, S_ISGID, S_ISGID)) != 0) \ + throw SysError("unable to add seccomp rule"); + +#endif void setupSeccomp() { @@ -1370,7 +1383,9 @@ void setupSeccomp() #if HAVE_SECCOMP scmp_filter_ctx ctx; - if (!(ctx = seccomp_init(SCMP_ACT_ALLOW))) + // Pretend that syscalls we don't yet know about don't exist. + // This is the best option for compatibility: after all, they did in fact not exist not too long ago. + if (!(ctx = seccomp_init(SCMP_ACT_ERRNO(ENOSYS)))) throw SysError("unable to initialize seccomp mode 2"); Finally cleanup([&]() { @@ -1405,28 +1420,514 @@ void setupSeccomp() seccomp_arch_add(ctx, SCMP_ARCH_MIPSEL64N32) != 0) printError("unable to add mips64el-*abin32 seccomp architecture"); - /* Prevent builders from creating setuid/setgid binaries. */ - for (int perm : { S_ISUID, S_ISGID }) { - if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(chmod), 1, - SCMP_A1(SCMP_CMP_MASKED_EQ, (scmp_datum_t) perm, (scmp_datum_t) perm)) != 0) - throw SysError("unable to add seccomp rule"); - - if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(fchmod), 1, - SCMP_A1(SCMP_CMP_MASKED_EQ, (scmp_datum_t) perm, (scmp_datum_t) perm)) != 0) - throw SysError("unable to add seccomp rule"); - - if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(fchmodat), 1, - SCMP_A2(SCMP_CMP_MASKED_EQ, (scmp_datum_t) perm, (scmp_datum_t) perm)) != 0) - throw SysError("unable to add seccomp rule"); - - if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), NIX_SYSCALL_FCHMODAT2, 1, - SCMP_A2(SCMP_CMP_MASKED_EQ, (scmp_datum_t) perm, (scmp_datum_t) perm)) != 0) - throw SysError("unable to add seccomp rule"); - } - - /* Prevent builders from creating EAs or ACLs. Not all filesystems - support these, and they're not allowed in the Nix store because - they're not representable in the NAR serialisation. */ + // This list is intended for machine consumption. 
+ // Please keep its format, order and BEGIN/END markers. + // + // Currently, it is up to date with libseccomp 2.5.5 and glibc 2.39. + // Run check-syscalls to determine which new syscalls should be added. + // New syscalls must be audited and handled in a way that blocks the following dangerous operations: + // * Creation of non-empty setuid/setgid files + // * Creation of extended attributes (including ACLs) + // + // BEGIN extract-syscalls + allowSyscall(ctx, SCMP_SYS(accept)); + allowSyscall(ctx, SCMP_SYS(accept4)); + allowSyscall(ctx, SCMP_SYS(access)); + allowSyscall(ctx, SCMP_SYS(acct)); + allowSyscall(ctx, SCMP_SYS(add_key)); + allowSyscall(ctx, SCMP_SYS(adjtimex)); + allowSyscall(ctx, SCMP_SYS(afs_syscall)); + allowSyscall(ctx, SCMP_SYS(alarm)); + allowSyscall(ctx, SCMP_SYS(arch_prctl)); + allowSyscall(ctx, SCMP_SYS(arm_fadvise64_64)); + allowSyscall(ctx, SCMP_SYS(arm_sync_file_range)); + allowSyscall(ctx, SCMP_SYS(bdflush)); + allowSyscall(ctx, SCMP_SYS(bind)); + allowSyscall(ctx, SCMP_SYS(bpf)); + allowSyscall(ctx, SCMP_SYS(break)); + allowSyscall(ctx, SCMP_SYS(breakpoint)); + allowSyscall(ctx, SCMP_SYS(brk)); + allowSyscall(ctx, SCMP_SYS(cachectl)); + allowSyscall(ctx, SCMP_SYS(cacheflush)); + allowSyscall(ctx, SCMP_SYS(cachestat)); + allowSyscall(ctx, SCMP_SYS(capget)); + allowSyscall(ctx, SCMP_SYS(capset)); + allowSyscall(ctx, SCMP_SYS(chdir)); + // skip chmod (dangerous) + allowSyscall(ctx, SCMP_SYS(chown)); + allowSyscall(ctx, SCMP_SYS(chown32)); + allowSyscall(ctx, SCMP_SYS(chroot)); + allowSyscall(ctx, SCMP_SYS(clock_adjtime)); + allowSyscall(ctx, SCMP_SYS(clock_adjtime64)); + allowSyscall(ctx, SCMP_SYS(clock_getres)); + allowSyscall(ctx, SCMP_SYS(clock_getres_time64)); + allowSyscall(ctx, SCMP_SYS(clock_gettime)); + allowSyscall(ctx, SCMP_SYS(clock_gettime64)); + allowSyscall(ctx, SCMP_SYS(clock_nanosleep)); + allowSyscall(ctx, SCMP_SYS(clock_nanosleep_time64)); + allowSyscall(ctx, SCMP_SYS(clock_settime)); + allowSyscall(ctx, 
SCMP_SYS(clock_settime64)); + allowSyscall(ctx, SCMP_SYS(clone)); + allowSyscall(ctx, SCMP_SYS(clone3)); + allowSyscall(ctx, SCMP_SYS(close)); + allowSyscall(ctx, SCMP_SYS(close_range)); + allowSyscall(ctx, SCMP_SYS(connect)); + allowSyscall(ctx, SCMP_SYS(copy_file_range)); + allowSyscall(ctx, SCMP_SYS(creat)); + allowSyscall(ctx, SCMP_SYS(create_module)); + allowSyscall(ctx, SCMP_SYS(delete_module)); + allowSyscall(ctx, SCMP_SYS(dup)); + allowSyscall(ctx, SCMP_SYS(dup2)); + allowSyscall(ctx, SCMP_SYS(dup3)); + allowSyscall(ctx, SCMP_SYS(epoll_create)); + allowSyscall(ctx, SCMP_SYS(epoll_create1)); + allowSyscall(ctx, SCMP_SYS(epoll_ctl)); + allowSyscall(ctx, SCMP_SYS(epoll_ctl_old)); + allowSyscall(ctx, SCMP_SYS(epoll_pwait)); + allowSyscall(ctx, SCMP_SYS(epoll_pwait2)); + allowSyscall(ctx, SCMP_SYS(epoll_wait)); + allowSyscall(ctx, SCMP_SYS(epoll_wait_old)); + allowSyscall(ctx, SCMP_SYS(eventfd)); + allowSyscall(ctx, SCMP_SYS(eventfd2)); + allowSyscall(ctx, SCMP_SYS(execve)); + allowSyscall(ctx, SCMP_SYS(execveat)); + allowSyscall(ctx, SCMP_SYS(exit)); + allowSyscall(ctx, SCMP_SYS(exit_group)); + allowSyscall(ctx, SCMP_SYS(faccessat)); + allowSyscall(ctx, SCMP_SYS(faccessat2)); + allowSyscall(ctx, SCMP_SYS(fadvise64)); + allowSyscall(ctx, SCMP_SYS(fadvise64_64)); + allowSyscall(ctx, SCMP_SYS(fallocate)); + allowSyscall(ctx, SCMP_SYS(fanotify_init)); + allowSyscall(ctx, SCMP_SYS(fanotify_mark)); + allowSyscall(ctx, SCMP_SYS(fchdir)); + // skip fchmod (dangerous) + // skip fchmodat (dangerous) + // skip fchmodat2 (dangerous) + allowSyscall(ctx, SCMP_SYS(fchown)); + allowSyscall(ctx, SCMP_SYS(fchown32)); + allowSyscall(ctx, SCMP_SYS(fchownat)); + allowSyscall(ctx, SCMP_SYS(fcntl)); + allowSyscall(ctx, SCMP_SYS(fcntl64)); + allowSyscall(ctx, SCMP_SYS(fdatasync)); + allowSyscall(ctx, SCMP_SYS(fgetxattr)); + allowSyscall(ctx, SCMP_SYS(finit_module)); + allowSyscall(ctx, SCMP_SYS(flistxattr)); + allowSyscall(ctx, SCMP_SYS(flock)); + allowSyscall(ctx, SCMP_SYS(fork)); + 
allowSyscall(ctx, SCMP_SYS(fremovexattr)); + allowSyscall(ctx, SCMP_SYS(fsconfig)); + // skip fsetxattr (dangerous) + allowSyscall(ctx, SCMP_SYS(fsmount)); + allowSyscall(ctx, SCMP_SYS(fsopen)); + allowSyscall(ctx, SCMP_SYS(fspick)); + allowSyscall(ctx, SCMP_SYS(fstat)); + allowSyscall(ctx, SCMP_SYS(fstat64)); + allowSyscall(ctx, SCMP_SYS(fstatat64)); + allowSyscall(ctx, SCMP_SYS(fstatfs)); + allowSyscall(ctx, SCMP_SYS(fstatfs64)); + allowSyscall(ctx, SCMP_SYS(fsync)); + allowSyscall(ctx, SCMP_SYS(ftime)); + allowSyscall(ctx, SCMP_SYS(ftruncate)); + allowSyscall(ctx, SCMP_SYS(ftruncate64)); + allowSyscall(ctx, SCMP_SYS(futex)); + allowSyscall(ctx, SCMP_SYS(futex_requeue)); + allowSyscall(ctx, SCMP_SYS(futex_time64)); + allowSyscall(ctx, SCMP_SYS(futex_wait)); + allowSyscall(ctx, SCMP_SYS(futex_waitv)); + allowSyscall(ctx, SCMP_SYS(futex_wake)); + allowSyscall(ctx, SCMP_SYS(futimesat)); + allowSyscall(ctx, SCMP_SYS(getcpu)); + allowSyscall(ctx, SCMP_SYS(getcwd)); + allowSyscall(ctx, SCMP_SYS(getdents)); + allowSyscall(ctx, SCMP_SYS(getdents64)); + allowSyscall(ctx, SCMP_SYS(getegid)); + allowSyscall(ctx, SCMP_SYS(getegid32)); + allowSyscall(ctx, SCMP_SYS(geteuid)); + allowSyscall(ctx, SCMP_SYS(geteuid32)); + allowSyscall(ctx, SCMP_SYS(getgid)); + allowSyscall(ctx, SCMP_SYS(getgid32)); + allowSyscall(ctx, SCMP_SYS(getgroups)); + allowSyscall(ctx, SCMP_SYS(getgroups32)); + allowSyscall(ctx, SCMP_SYS(getitimer)); + allowSyscall(ctx, SCMP_SYS(get_kernel_syms)); + allowSyscall(ctx, SCMP_SYS(get_mempolicy)); + allowSyscall(ctx, SCMP_SYS(getpeername)); + allowSyscall(ctx, SCMP_SYS(getpgid)); + allowSyscall(ctx, SCMP_SYS(getpgrp)); + allowSyscall(ctx, SCMP_SYS(getpid)); + allowSyscall(ctx, SCMP_SYS(getpmsg)); + allowSyscall(ctx, SCMP_SYS(getppid)); + allowSyscall(ctx, SCMP_SYS(getpriority)); + allowSyscall(ctx, SCMP_SYS(getrandom)); + allowSyscall(ctx, SCMP_SYS(getresgid)); + allowSyscall(ctx, SCMP_SYS(getresgid32)); + allowSyscall(ctx, SCMP_SYS(getresuid)); + 
allowSyscall(ctx, SCMP_SYS(getresuid32)); + allowSyscall(ctx, SCMP_SYS(getrlimit)); + allowSyscall(ctx, SCMP_SYS(get_robust_list)); + allowSyscall(ctx, SCMP_SYS(getrusage)); + allowSyscall(ctx, SCMP_SYS(getsid)); + allowSyscall(ctx, SCMP_SYS(getsockname)); + allowSyscall(ctx, SCMP_SYS(getsockopt)); + allowSyscall(ctx, SCMP_SYS(get_thread_area)); + allowSyscall(ctx, SCMP_SYS(gettid)); + allowSyscall(ctx, SCMP_SYS(gettimeofday)); + allowSyscall(ctx, SCMP_SYS(get_tls)); + allowSyscall(ctx, SCMP_SYS(getuid)); + allowSyscall(ctx, SCMP_SYS(getuid32)); + allowSyscall(ctx, SCMP_SYS(getxattr)); + allowSyscall(ctx, SCMP_SYS(gtty)); + allowSyscall(ctx, SCMP_SYS(idle)); + allowSyscall(ctx, SCMP_SYS(init_module)); + allowSyscall(ctx, SCMP_SYS(inotify_add_watch)); + allowSyscall(ctx, SCMP_SYS(inotify_init)); + allowSyscall(ctx, SCMP_SYS(inotify_init1)); + allowSyscall(ctx, SCMP_SYS(inotify_rm_watch)); + allowSyscall(ctx, SCMP_SYS(io_cancel)); + allowSyscall(ctx, SCMP_SYS(ioctl)); + allowSyscall(ctx, SCMP_SYS(io_destroy)); + allowSyscall(ctx, SCMP_SYS(io_getevents)); + allowSyscall(ctx, SCMP_SYS(ioperm)); + allowSyscall(ctx, SCMP_SYS(io_pgetevents)); + allowSyscall(ctx, SCMP_SYS(io_pgetevents_time64)); + allowSyscall(ctx, SCMP_SYS(iopl)); + allowSyscall(ctx, SCMP_SYS(ioprio_get)); + allowSyscall(ctx, SCMP_SYS(ioprio_set)); + allowSyscall(ctx, SCMP_SYS(io_setup)); + allowSyscall(ctx, SCMP_SYS(io_submit)); + // skip io_uring_enter (may become dangerous) + // skip io_uring_register (may become dangerous) + // skip io_uring_setup (may become dangerous) + allowSyscall(ctx, SCMP_SYS(ipc)); + allowSyscall(ctx, SCMP_SYS(kcmp)); + allowSyscall(ctx, SCMP_SYS(kexec_file_load)); + allowSyscall(ctx, SCMP_SYS(kexec_load)); + allowSyscall(ctx, SCMP_SYS(keyctl)); + allowSyscall(ctx, SCMP_SYS(kill)); + allowSyscall(ctx, SCMP_SYS(landlock_add_rule)); + allowSyscall(ctx, SCMP_SYS(landlock_create_ruleset)); + allowSyscall(ctx, SCMP_SYS(landlock_restrict_self)); + allowSyscall(ctx, SCMP_SYS(lchown)); 
+ allowSyscall(ctx, SCMP_SYS(lchown32)); + allowSyscall(ctx, SCMP_SYS(lgetxattr)); + allowSyscall(ctx, SCMP_SYS(link)); + allowSyscall(ctx, SCMP_SYS(linkat)); + allowSyscall(ctx, SCMP_SYS(listen)); + allowSyscall(ctx, SCMP_SYS(listxattr)); + allowSyscall(ctx, SCMP_SYS(llistxattr)); + allowSyscall(ctx, SCMP_SYS(_llseek)); + allowSyscall(ctx, SCMP_SYS(lock)); + allowSyscall(ctx, SCMP_SYS(lookup_dcookie)); + allowSyscall(ctx, SCMP_SYS(lremovexattr)); + allowSyscall(ctx, SCMP_SYS(lseek)); + // skip lsetxattr (dangerous) + allowSyscall(ctx, SCMP_SYS(lstat)); + allowSyscall(ctx, SCMP_SYS(lstat64)); + allowSyscall(ctx, SCMP_SYS(madvise)); + allowSyscall(ctx, SCMP_SYS(map_shadow_stack)); + allowSyscall(ctx, SCMP_SYS(mbind)); + allowSyscall(ctx, SCMP_SYS(membarrier)); + allowSyscall(ctx, SCMP_SYS(memfd_create)); + allowSyscall(ctx, SCMP_SYS(memfd_secret)); + allowSyscall(ctx, SCMP_SYS(migrate_pages)); + allowSyscall(ctx, SCMP_SYS(mincore)); + allowSyscall(ctx, SCMP_SYS(mkdir)); + allowSyscall(ctx, SCMP_SYS(mkdirat)); + allowSyscall(ctx, SCMP_SYS(mknod)); + allowSyscall(ctx, SCMP_SYS(mknodat)); + allowSyscall(ctx, SCMP_SYS(mlock)); + allowSyscall(ctx, SCMP_SYS(mlock2)); + allowSyscall(ctx, SCMP_SYS(mlockall)); + allowSyscall(ctx, SCMP_SYS(mmap)); + allowSyscall(ctx, SCMP_SYS(mmap2)); + allowSyscall(ctx, SCMP_SYS(modify_ldt)); + allowSyscall(ctx, SCMP_SYS(mount)); + allowSyscall(ctx, SCMP_SYS(mount_setattr)); + allowSyscall(ctx, SCMP_SYS(move_mount)); + allowSyscall(ctx, SCMP_SYS(move_pages)); + allowSyscall(ctx, SCMP_SYS(mprotect)); + allowSyscall(ctx, SCMP_SYS(mpx)); + allowSyscall(ctx, SCMP_SYS(mq_getsetattr)); + allowSyscall(ctx, SCMP_SYS(mq_notify)); + allowSyscall(ctx, SCMP_SYS(mq_open)); + allowSyscall(ctx, SCMP_SYS(mq_timedreceive)); + allowSyscall(ctx, SCMP_SYS(mq_timedreceive_time64)); + allowSyscall(ctx, SCMP_SYS(mq_timedsend)); + allowSyscall(ctx, SCMP_SYS(mq_timedsend_time64)); + allowSyscall(ctx, SCMP_SYS(mq_unlink)); + allowSyscall(ctx, SCMP_SYS(mremap)); + 
allowSyscall(ctx, SCMP_SYS(msgctl)); + allowSyscall(ctx, SCMP_SYS(msgget)); + allowSyscall(ctx, SCMP_SYS(msgrcv)); + allowSyscall(ctx, SCMP_SYS(msgsnd)); + allowSyscall(ctx, SCMP_SYS(msync)); + allowSyscall(ctx, SCMP_SYS(multiplexer)); + allowSyscall(ctx, SCMP_SYS(munlock)); + allowSyscall(ctx, SCMP_SYS(munlockall)); + allowSyscall(ctx, SCMP_SYS(munmap)); + allowSyscall(ctx, SCMP_SYS(name_to_handle_at)); + allowSyscall(ctx, SCMP_SYS(nanosleep)); + allowSyscall(ctx, SCMP_SYS(newfstatat)); + allowSyscall(ctx, SCMP_SYS(_newselect)); + allowSyscall(ctx, SCMP_SYS(nfsservctl)); + allowSyscall(ctx, SCMP_SYS(nice)); + allowSyscall(ctx, SCMP_SYS(oldfstat)); + allowSyscall(ctx, SCMP_SYS(oldlstat)); + allowSyscall(ctx, SCMP_SYS(oldolduname)); + allowSyscall(ctx, SCMP_SYS(oldstat)); + allowSyscall(ctx, SCMP_SYS(olduname)); + allowSyscall(ctx, SCMP_SYS(open)); + allowSyscall(ctx, SCMP_SYS(openat)); + allowSyscall(ctx, SCMP_SYS(openat2)); + allowSyscall(ctx, SCMP_SYS(open_by_handle_at)); + allowSyscall(ctx, SCMP_SYS(open_tree)); + allowSyscall(ctx, SCMP_SYS(pause)); + allowSyscall(ctx, SCMP_SYS(pciconfig_iobase)); + allowSyscall(ctx, SCMP_SYS(pciconfig_read)); + allowSyscall(ctx, SCMP_SYS(pciconfig_write)); + allowSyscall(ctx, SCMP_SYS(perf_event_open)); + allowSyscall(ctx, SCMP_SYS(personality)); + allowSyscall(ctx, SCMP_SYS(pidfd_getfd)); + allowSyscall(ctx, SCMP_SYS(pidfd_open)); + allowSyscall(ctx, SCMP_SYS(pidfd_send_signal)); + allowSyscall(ctx, SCMP_SYS(pipe)); + allowSyscall(ctx, SCMP_SYS(pipe2)); + allowSyscall(ctx, SCMP_SYS(pivot_root)); + allowSyscall(ctx, SCMP_SYS(pkey_alloc)); + allowSyscall(ctx, SCMP_SYS(pkey_free)); + allowSyscall(ctx, SCMP_SYS(pkey_mprotect)); + allowSyscall(ctx, SCMP_SYS(poll)); + allowSyscall(ctx, SCMP_SYS(ppoll)); + allowSyscall(ctx, SCMP_SYS(ppoll_time64)); + allowSyscall(ctx, SCMP_SYS(prctl)); + allowSyscall(ctx, SCMP_SYS(pread64)); + allowSyscall(ctx, SCMP_SYS(preadv)); + allowSyscall(ctx, SCMP_SYS(preadv2)); + allowSyscall(ctx, 
SCMP_SYS(prlimit64)); + allowSyscall(ctx, SCMP_SYS(process_madvise)); + allowSyscall(ctx, SCMP_SYS(process_mrelease)); + allowSyscall(ctx, SCMP_SYS(process_vm_readv)); + allowSyscall(ctx, SCMP_SYS(process_vm_writev)); + allowSyscall(ctx, SCMP_SYS(prof)); + allowSyscall(ctx, SCMP_SYS(profil)); + allowSyscall(ctx, SCMP_SYS(pselect6)); + allowSyscall(ctx, SCMP_SYS(pselect6_time64)); + allowSyscall(ctx, SCMP_SYS(ptrace)); + allowSyscall(ctx, SCMP_SYS(putpmsg)); + allowSyscall(ctx, SCMP_SYS(pwrite64)); + allowSyscall(ctx, SCMP_SYS(pwritev)); + allowSyscall(ctx, SCMP_SYS(pwritev2)); + allowSyscall(ctx, SCMP_SYS(query_module)); + allowSyscall(ctx, SCMP_SYS(quotactl)); + allowSyscall(ctx, SCMP_SYS(quotactl_fd)); + allowSyscall(ctx, SCMP_SYS(read)); + allowSyscall(ctx, SCMP_SYS(readahead)); + allowSyscall(ctx, SCMP_SYS(readdir)); + allowSyscall(ctx, SCMP_SYS(readlink)); + allowSyscall(ctx, SCMP_SYS(readlinkat)); + allowSyscall(ctx, SCMP_SYS(readv)); + allowSyscall(ctx, SCMP_SYS(reboot)); + allowSyscall(ctx, SCMP_SYS(recv)); + allowSyscall(ctx, SCMP_SYS(recvfrom)); + allowSyscall(ctx, SCMP_SYS(recvmmsg)); + allowSyscall(ctx, SCMP_SYS(recvmmsg_time64)); + allowSyscall(ctx, SCMP_SYS(recvmsg)); + allowSyscall(ctx, SCMP_SYS(remap_file_pages)); + allowSyscall(ctx, SCMP_SYS(removexattr)); + allowSyscall(ctx, SCMP_SYS(rename)); + allowSyscall(ctx, SCMP_SYS(renameat)); + allowSyscall(ctx, SCMP_SYS(renameat2)); + allowSyscall(ctx, SCMP_SYS(request_key)); + allowSyscall(ctx, SCMP_SYS(restart_syscall)); + allowSyscall(ctx, SCMP_SYS(riscv_flush_icache)); + allowSyscall(ctx, SCMP_SYS(rmdir)); + allowSyscall(ctx, SCMP_SYS(rseq)); + allowSyscall(ctx, SCMP_SYS(rtas)); + allowSyscall(ctx, SCMP_SYS(rt_sigaction)); + allowSyscall(ctx, SCMP_SYS(rt_sigpending)); + allowSyscall(ctx, SCMP_SYS(rt_sigprocmask)); + allowSyscall(ctx, SCMP_SYS(rt_sigqueueinfo)); + allowSyscall(ctx, SCMP_SYS(rt_sigreturn)); + allowSyscall(ctx, SCMP_SYS(rt_sigsuspend)); + allowSyscall(ctx, SCMP_SYS(rt_sigtimedwait)); + 
allowSyscall(ctx, SCMP_SYS(rt_sigtimedwait_time64)); + allowSyscall(ctx, SCMP_SYS(rt_tgsigqueueinfo)); + allowSyscall(ctx, SCMP_SYS(s390_guarded_storage)); + allowSyscall(ctx, SCMP_SYS(s390_pci_mmio_read)); + allowSyscall(ctx, SCMP_SYS(s390_pci_mmio_write)); + allowSyscall(ctx, SCMP_SYS(s390_runtime_instr)); + allowSyscall(ctx, SCMP_SYS(s390_sthyi)); + allowSyscall(ctx, SCMP_SYS(sched_getaffinity)); + allowSyscall(ctx, SCMP_SYS(sched_getattr)); + allowSyscall(ctx, SCMP_SYS(sched_getparam)); + allowSyscall(ctx, SCMP_SYS(sched_get_priority_max)); + allowSyscall(ctx, SCMP_SYS(sched_get_priority_min)); + allowSyscall(ctx, SCMP_SYS(sched_getscheduler)); + allowSyscall(ctx, SCMP_SYS(sched_rr_get_interval)); + allowSyscall(ctx, SCMP_SYS(sched_rr_get_interval_time64)); + allowSyscall(ctx, SCMP_SYS(sched_setaffinity)); + allowSyscall(ctx, SCMP_SYS(sched_setattr)); + allowSyscall(ctx, SCMP_SYS(sched_setparam)); + allowSyscall(ctx, SCMP_SYS(sched_setscheduler)); + allowSyscall(ctx, SCMP_SYS(sched_yield)); + allowSyscall(ctx, SCMP_SYS(seccomp)); + allowSyscall(ctx, SCMP_SYS(security)); + allowSyscall(ctx, SCMP_SYS(select)); + allowSyscall(ctx, SCMP_SYS(semctl)); + allowSyscall(ctx, SCMP_SYS(semget)); + allowSyscall(ctx, SCMP_SYS(semop)); + allowSyscall(ctx, SCMP_SYS(semtimedop)); + allowSyscall(ctx, SCMP_SYS(semtimedop_time64)); + allowSyscall(ctx, SCMP_SYS(send)); + allowSyscall(ctx, SCMP_SYS(sendfile)); + allowSyscall(ctx, SCMP_SYS(sendfile64)); + allowSyscall(ctx, SCMP_SYS(sendmmsg)); + allowSyscall(ctx, SCMP_SYS(sendmsg)); + allowSyscall(ctx, SCMP_SYS(sendto)); + allowSyscall(ctx, SCMP_SYS(setdomainname)); + allowSyscall(ctx, SCMP_SYS(setfsgid)); + allowSyscall(ctx, SCMP_SYS(setfsgid32)); + allowSyscall(ctx, SCMP_SYS(setfsuid)); + allowSyscall(ctx, SCMP_SYS(setfsuid32)); + allowSyscall(ctx, SCMP_SYS(setgid)); + allowSyscall(ctx, SCMP_SYS(setgid32)); + allowSyscall(ctx, SCMP_SYS(setgroups)); + allowSyscall(ctx, SCMP_SYS(setgroups32)); + allowSyscall(ctx, 
SCMP_SYS(sethostname)); + allowSyscall(ctx, SCMP_SYS(setitimer)); + allowSyscall(ctx, SCMP_SYS(set_mempolicy)); + allowSyscall(ctx, SCMP_SYS(set_mempolicy_home_node)); + allowSyscall(ctx, SCMP_SYS(setns)); + allowSyscall(ctx, SCMP_SYS(setpgid)); + allowSyscall(ctx, SCMP_SYS(setpriority)); + allowSyscall(ctx, SCMP_SYS(setregid)); + allowSyscall(ctx, SCMP_SYS(setregid32)); + allowSyscall(ctx, SCMP_SYS(setresgid)); + allowSyscall(ctx, SCMP_SYS(setresgid32)); + allowSyscall(ctx, SCMP_SYS(setresuid)); + allowSyscall(ctx, SCMP_SYS(setresuid32)); + allowSyscall(ctx, SCMP_SYS(setreuid)); + allowSyscall(ctx, SCMP_SYS(setreuid32)); + allowSyscall(ctx, SCMP_SYS(setrlimit)); + allowSyscall(ctx, SCMP_SYS(set_robust_list)); + allowSyscall(ctx, SCMP_SYS(setsid)); + allowSyscall(ctx, SCMP_SYS(setsockopt)); + allowSyscall(ctx, SCMP_SYS(set_thread_area)); + allowSyscall(ctx, SCMP_SYS(set_tid_address)); + allowSyscall(ctx, SCMP_SYS(settimeofday)); + allowSyscall(ctx, SCMP_SYS(set_tls)); + allowSyscall(ctx, SCMP_SYS(setuid)); + allowSyscall(ctx, SCMP_SYS(setuid32)); + // skip setxattr (dangerous) + allowSyscall(ctx, SCMP_SYS(sgetmask)); + allowSyscall(ctx, SCMP_SYS(shmat)); + allowSyscall(ctx, SCMP_SYS(shmctl)); + allowSyscall(ctx, SCMP_SYS(shmdt)); + allowSyscall(ctx, SCMP_SYS(shmget)); + allowSyscall(ctx, SCMP_SYS(shutdown)); + allowSyscall(ctx, SCMP_SYS(sigaction)); + allowSyscall(ctx, SCMP_SYS(sigaltstack)); + allowSyscall(ctx, SCMP_SYS(signal)); + allowSyscall(ctx, SCMP_SYS(signalfd)); + allowSyscall(ctx, SCMP_SYS(signalfd4)); + allowSyscall(ctx, SCMP_SYS(sigpending)); + allowSyscall(ctx, SCMP_SYS(sigprocmask)); + allowSyscall(ctx, SCMP_SYS(sigreturn)); + allowSyscall(ctx, SCMP_SYS(sigsuspend)); + allowSyscall(ctx, SCMP_SYS(socket)); + allowSyscall(ctx, SCMP_SYS(socketcall)); + allowSyscall(ctx, SCMP_SYS(socketpair)); + allowSyscall(ctx, SCMP_SYS(splice)); + allowSyscall(ctx, SCMP_SYS(spu_create)); + allowSyscall(ctx, SCMP_SYS(spu_run)); + allowSyscall(ctx, SCMP_SYS(ssetmask)); + 
allowSyscall(ctx, SCMP_SYS(stat)); + allowSyscall(ctx, SCMP_SYS(stat64)); + allowSyscall(ctx, SCMP_SYS(statfs)); + allowSyscall(ctx, SCMP_SYS(statfs64)); + allowSyscall(ctx, SCMP_SYS(statx)); + allowSyscall(ctx, SCMP_SYS(stime)); + allowSyscall(ctx, SCMP_SYS(stty)); + allowSyscall(ctx, SCMP_SYS(subpage_prot)); + allowSyscall(ctx, SCMP_SYS(swapcontext)); + allowSyscall(ctx, SCMP_SYS(swapoff)); + allowSyscall(ctx, SCMP_SYS(swapon)); + allowSyscall(ctx, SCMP_SYS(switch_endian)); + allowSyscall(ctx, SCMP_SYS(symlink)); + allowSyscall(ctx, SCMP_SYS(symlinkat)); + allowSyscall(ctx, SCMP_SYS(sync)); + allowSyscall(ctx, SCMP_SYS(sync_file_range)); + allowSyscall(ctx, SCMP_SYS(sync_file_range2)); + allowSyscall(ctx, SCMP_SYS(syncfs)); + allowSyscall(ctx, SCMP_SYS(syscall)); + allowSyscall(ctx, SCMP_SYS(_sysctl)); + allowSyscall(ctx, SCMP_SYS(sys_debug_setcontext)); + allowSyscall(ctx, SCMP_SYS(sysfs)); + allowSyscall(ctx, SCMP_SYS(sysinfo)); + allowSyscall(ctx, SCMP_SYS(syslog)); + allowSyscall(ctx, SCMP_SYS(sysmips)); + allowSyscall(ctx, SCMP_SYS(tee)); + allowSyscall(ctx, SCMP_SYS(tgkill)); + allowSyscall(ctx, SCMP_SYS(time)); + allowSyscall(ctx, SCMP_SYS(timer_create)); + allowSyscall(ctx, SCMP_SYS(timer_delete)); + allowSyscall(ctx, SCMP_SYS(timerfd)); + allowSyscall(ctx, SCMP_SYS(timerfd_create)); + allowSyscall(ctx, SCMP_SYS(timerfd_gettime)); + allowSyscall(ctx, SCMP_SYS(timerfd_gettime64)); + allowSyscall(ctx, SCMP_SYS(timerfd_settime)); + allowSyscall(ctx, SCMP_SYS(timerfd_settime64)); + allowSyscall(ctx, SCMP_SYS(timer_getoverrun)); + allowSyscall(ctx, SCMP_SYS(timer_gettime)); + allowSyscall(ctx, SCMP_SYS(timer_gettime64)); + allowSyscall(ctx, SCMP_SYS(timer_settime)); + allowSyscall(ctx, SCMP_SYS(timer_settime64)); + allowSyscall(ctx, SCMP_SYS(times)); + allowSyscall(ctx, SCMP_SYS(tkill)); + allowSyscall(ctx, SCMP_SYS(truncate)); + allowSyscall(ctx, SCMP_SYS(truncate64)); + allowSyscall(ctx, SCMP_SYS(tuxcall)); + allowSyscall(ctx, SCMP_SYS(ugetrlimit)); + 
allowSyscall(ctx, SCMP_SYS(ulimit)); + allowSyscall(ctx, SCMP_SYS(umask)); + allowSyscall(ctx, SCMP_SYS(umount)); + allowSyscall(ctx, SCMP_SYS(umount2)); + allowSyscall(ctx, SCMP_SYS(uname)); + allowSyscall(ctx, SCMP_SYS(unlink)); + allowSyscall(ctx, SCMP_SYS(unlinkat)); + allowSyscall(ctx, SCMP_SYS(unshare)); + allowSyscall(ctx, SCMP_SYS(uselib)); + allowSyscall(ctx, SCMP_SYS(userfaultfd)); + allowSyscall(ctx, SCMP_SYS(usr26)); + allowSyscall(ctx, SCMP_SYS(usr32)); + allowSyscall(ctx, SCMP_SYS(ustat)); + allowSyscall(ctx, SCMP_SYS(utime)); + allowSyscall(ctx, SCMP_SYS(utimensat)); + allowSyscall(ctx, SCMP_SYS(utimensat_time64)); + allowSyscall(ctx, SCMP_SYS(utimes)); + allowSyscall(ctx, SCMP_SYS(vfork)); + allowSyscall(ctx, SCMP_SYS(vhangup)); + allowSyscall(ctx, SCMP_SYS(vm86)); + allowSyscall(ctx, SCMP_SYS(vm86old)); + allowSyscall(ctx, SCMP_SYS(vmsplice)); + allowSyscall(ctx, SCMP_SYS(vserver)); + allowSyscall(ctx, SCMP_SYS(wait4)); + allowSyscall(ctx, SCMP_SYS(waitid)); + allowSyscall(ctx, SCMP_SYS(waitpid)); + allowSyscall(ctx, SCMP_SYS(write)); + allowSyscall(ctx, SCMP_SYS(writev)); + // END extract-syscalls + + // chmod family: prevent adding setuid/setgid bits to existing files. + // The Nix store does not support setuid/setgid, and even their temporary creation can weaken the security of the sandbox. + ALLOW_CHMOD_IF_SAFE(ctx, SCMP_SYS(chmod), 1); + ALLOW_CHMOD_IF_SAFE(ctx, SCMP_SYS(fchmod), 1); + ALLOW_CHMOD_IF_SAFE(ctx, SCMP_SYS(fchmodat), 2); + ALLOW_CHMOD_IF_SAFE(ctx, SCMP_SYS(fchmodat2), 2); + + // setxattr family: prevent creation of extended attributes or ACLs. + // Not all filesystems support them, and they're incompatible with the NAR format. 
if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOTSUP), SCMP_SYS(setxattr), 0) != 0 || seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOTSUP), SCMP_SYS(lsetxattr), 0) != 0 || seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOTSUP), SCMP_SYS(fsetxattr), 0) != 0) @@ -1460,11 +1961,7 @@ void LocalDerivationGoal::runChild() commonChildInit(); - try { - setupSeccomp(); - } catch (...) { - if (buildUser) throw; - } + setupSeccomp(); bool setUser = true; diff --git a/src/libstore/linux/fchmodat2-compat.hh b/src/libstore/linux/fchmodat2-compat.hh deleted file mode 100644 index d5ef81e22..000000000 --- a/src/libstore/linux/fchmodat2-compat.hh +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Determine the syscall number for `fchmodat2`. - * - * On most platforms this is 452. Exceptions can be found on - * a glibc git checkout via `rg --pcre2 'define __NR_fchmodat2 (?!452)'`. - * - * The problem is that glibc 2.39 and libseccomp 2.5.5 are needed to - * get the syscall number. However, a Lix built against nixpkgs 23.11 - * (glibc 2.38) should still have the issue fixed without depending - * on the build environment. - * - * To achieve that, the macros below try to determine the platform and - * set the syscall number which is platform-specific, but - * in most cases 452. - * - * TODO: remove this when 23.11 is EOL and the entire (supported) ecosystem - * is on glibc 2.39. 
- */ - -#pragma once -///@file - -#if defined(__alpha__) -# define NIX_SYSCALL_FCHMODAT2 562 -#elif defined(__x86_64__) && SIZE_MAX == 0xFFFFFFFF // x32 -# define NIX_SYSCALL_FCHMODAT2 1073742276 -#elif defined(__mips__) && defined(__mips64) && defined(_ABIN64) // mips64/n64 -# define NIX_SYSCALL_FCHMODAT2 5452 -#elif defined(__mips__) && defined(__mips64) && defined(_ABIN32) // mips64/n32 -# define NIX_SYSCALL_FCHMODAT2 6452 -#elif defined(__mips__) && defined(_ABIO32) // mips32 -# define NIX_SYSCALL_FCHMODAT2 4452 -#else -# define NIX_SYSCALL_FCHMODAT2 452 -#endif diff --git a/tests/nixos/default.nix b/tests/nixos/default.nix index 301eede46..20e66f6c1 100644 --- a/tests/nixos/default.nix +++ b/tests/nixos/default.nix @@ -155,4 +155,6 @@ in broken-userns = runNixOSTestFor "x86_64-linux" ./broken-userns.nix; coredumps = runNixOSTestFor "x86_64-linux" ./coredumps; + + io_uring = runNixOSTestFor "x86_64-linux" ./io_uring; } diff --git a/tests/nixos/io_uring/default.nix b/tests/nixos/io_uring/default.nix new file mode 100644 index 000000000..9cd445d6a --- /dev/null +++ b/tests/nixos/io_uring/default.nix @@ -0,0 +1,7 @@ +let + inherit (import ../util.nix) mkNixBuildTest; +in +mkNixBuildTest { + name = "io_uring"; + expressionFile = ./package.nix; +} diff --git a/tests/nixos/io_uring/package.nix b/tests/nixos/io_uring/package.nix new file mode 100644 index 000000000..8f980183a --- /dev/null +++ b/tests/nixos/io_uring/package.nix @@ -0,0 +1,19 @@ +{ runCommandCC }: +runCommandCC "io_uring-is-blocked" { } '' + cat > test.c <<EOF + #include <errno.h> + #include <sys/syscall.h> + #include <unistd.h> + + int main() { + int res = syscall(SYS_io_uring_setup, 0, NULL); + return res == -1 && errno == ENOSYS ? 0 : 1; + } + EOF + "$CC" -o test test.c + if ! ./test; then + echo "Oh no! io_uring is available!" 
+ exit 1 + fi + touch "$out" +'' diff --git a/tests/nixos/setuid/fchmodat2-suid.c b/tests/nixos/setuid/fchmodat2-suid.c index 931489ad7..7280331d5 100644 --- a/tests/nixos/setuid/fchmodat2-suid.c +++ b/tests/nixos/setuid/fchmodat2-suid.c @@ -12,10 +12,7 @@ int main(void) { fprintf(fd, "henlo :3"); fclose(fd); - // FIXME use something nicer here that's less - // platform-dependent as soon as we go to 24.05 - // and the glibc is new enough to support fchmodat2 - long rs = syscall(452, NULL, name, S_ISUID, 0); + long rs = syscall(SYS_fchmodat2, NULL, name, S_ISUID, 0); assert(rs == -1); assert(errno == EPERM); }