@@ -6,10 +6,21 @@
#ifndef __MM_ID_H
#define __MM_ID_H
+#ifdef CONFIG_UML_SECCOMP
+#define STUB_MAX_FDS 4 /* number of slots in mm_id.syscall_fd_map */
+#else
+#define STUB_MAX_FDS 0 /* without CONFIG_UML_SECCOMP the FD map has no slots */
+#endif
+
struct mm_id {
int pid;
unsigned long stack;
int syscall_data_len;
+
+ /*
+ * Only used with SECCOMP mode:
+ *  sock           - kernel side of the socket pair shared with the stub;
+ *                   the FD map is sent over it as an SCM_RIGHTS message
+ *  syscall_fd_num - number of valid entries in syscall_fd_map
+ *  syscall_fd_map - host FDs needed by the queued stub syscalls; a queued
+ *                   mmap stores an index into this table as its mem.fd
+ */
+ int sock;
+ int syscall_fd_num;
+ int syscall_fd_map[STUB_MAX_FDS];
};
void __switch_mm(struct mm_id *mm_idp);
@@ -13,6 +13,7 @@
#include <as-layout.h>
#include <sysdep/tls.h>
#include <sysdep/stub-data.h>
+#include <mm_id.h>
#define FUTEX_IN_CHILD 0
#define FUTEX_IN_KERN 1
@@ -80,6 +80,9 @@ void destroy_context(struct mm_struct *mm)
mmu->id.pid = -1;
}
+ if (using_seccomp && mmu->id.sock)
+ os_close_file(mmu->id.sock);
+
free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES));
guard(spinlock_irqsave)(&mm_list_lock);
@@ -7,24 +7,54 @@
#ifdef CONFIG_UML_SECCOMP
#include <linux/futex.h>
+#include <sys/socket.h>
#include <errno.h>
#endif
-static __always_inline int syscall_handler(struct stub_data *d)
+/*
+ * Known security issues
+ *
+ * Userspace can jump to this address to execute *any* syscall that is
+ * permitted by the stub. As we will return afterwards, it can do
+ * whatever it likes, including:
+ * - Tricking the kernel into handing out the memory FD
+ * - Using this memory FD to read/write all physical memory
+ * - Running in parallel to the kernel processing a syscall
+ * (possibly creating data races?)
+ * - Blocking e.g. SIGALRM to avoid time based scheduling
+ *
+ * To avoid this, the permitted location for each syscall needs to be
+ * checked in the SECCOMP filter (which is reasonably simple). Also,
+ * more care will need to go into considering how the code might be
+ * tricked by using a prepared stack (or even modifying the stack from
+ * another thread in case SMP support is added).
+ *
+ * As for the SIGALRM, the best countermeasure will be to check in the
+ * kernel that the process is reporting back the SIGALRM in a timely
+ * fashion.
+ */
+static __always_inline int syscall_handler(int fd_map[STUB_MAX_FDS])
{
+ struct stub_data *d = get_stub_data();
int i;
unsigned long res;
+ int fd;
for (i = 0; i < d->syscall_data_len; i++) {
struct stub_syscall *sc = &d->syscall_data[i];
switch (sc->syscall) {
case STUB_SYSCALL_MMAP:
+ if (fd_map)
+ fd = fd_map[sc->mem.fd];
+ else
+ fd = sc->mem.fd;
+
res = stub_syscall6(STUB_MMAP_NR,
sc->mem.addr, sc->mem.length,
sc->mem.prot,
MAP_SHARED | MAP_FIXED,
- sc->mem.fd, sc->mem.offset);
+ fd, sc->mem.offset);
if (res != sc->mem.addr) {
d->err = res;
d->syscall_data_len = i;
@@ -56,19 +86,35 @@ static __always_inline int syscall_handler(struct stub_data *d)
void __section(".__syscall_stub")
stub_syscall_handler(void)
{
- struct stub_data *d = get_stub_data();
-
- syscall_handler(d);
+ syscall_handler(NULL); /* NULL: no FD map, mem.fd values are used as-is */
trap_myself();
}
#ifdef CONFIG_UML_SECCOMP
-void __attribute__ ((__section__ (".__syscall_stub")))
+void __section(".__syscall_stub")
stub_signal_interrupt(int sig, siginfo_t *info, void *p)
{
struct stub_data *d = get_stub_data();
+ char rcv_data;
+ union {
+ char data[CMSG_SPACE(sizeof(int) * STUB_MAX_FDS)];
+ struct cmsghdr align;
+ } ctrl = {};
+ struct iovec iov = {
+ .iov_base = &rcv_data,
+ .iov_len = 1,
+ };
+ struct msghdr msghdr = {
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ .msg_control = &ctrl,
+ .msg_controllen = sizeof(ctrl),
+ };
ucontext_t *uc = p;
+ struct cmsghdr *fd_msg;
+ int *fd_map;
+ int num_fds;
long res;
d->signal = sig;
@@ -81,19 +127,43 @@ stub_signal_interrupt(int sig, siginfo_t *info, void *p)
res = stub_syscall3(__NR_futex, (unsigned long)&d->futex,
FUTEX_WAKE, 1);
} while (res == -EINTR);
+
do {
res = stub_syscall4(__NR_futex, (unsigned long)&d->futex,
FUTEX_WAIT, FUTEX_IN_KERN, 0);
} while (res == -EINTR || d->futex == FUTEX_IN_KERN);
- if (res < 0 && res != -EAGAIN)
- stub_syscall2(__NR_kill, 0, SIGKILL);
+ if (d->syscall_data_len) {
+ /* Read passed FDs (if any) */
+ do {
+ res = stub_syscall3(__NR_recvmsg, 0, (unsigned long)&msghdr, 0);
+ } while (res == -EINTR);
+
+ /* We should never have a receive error (other than -EAGAIN) */
+ if (res < 0 && res != -EAGAIN)
+ stub_syscall1(__NR_exit_group, 1);
+
+ /* Receive the FDs */
+ num_fds = 0;
+ fd_msg = msghdr.msg_control;
+ fd_map = (void *)&CMSG_DATA(fd_msg);
+ if (res == iov.iov_len && msghdr.msg_controllen > sizeof(struct cmsghdr))
+ num_fds = (fd_msg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+
+ /* Try running queued syscalls. */
+ res = syscall_handler(fd_map);
+
+ while (num_fds)
+ stub_syscall2(__NR_close, fd_map[--num_fds], 0);
+ } else {
+ res = 0;
+ }
- /* Try running queued syscalls. */
- if (syscall_handler(d) < 0 || d->restart_wait) {
+ if (res < 0 || d->restart_wait) {
/* Report SIGSYS if we restart. */
d->signal = SIGSYS;
d->restart_wait = 0;
+
goto restart_wait;
}
@@ -1,5 +1,6 @@
#include <sys/ptrace.h>
#include <sys/prctl.h>
+#include <sys/fcntl.h>
#include <asm/unistd.h>
#include <sysdep/stub.h>
#include <stub-data.h>
@@ -45,7 +46,11 @@ noinline static void real_init(void)
if (res != sizeof(init_data))
stub_syscall1(__NR_exit, 10);
- stub_syscall1(__NR_close, 0);
+ /* In SECCOMP mode, FD 0 is a socket and is later used for FD passing */
+ if (!init_data.seccomp)
+ stub_syscall1(__NR_close, 0);
+ else
+ stub_syscall3(__NR_fcntl, 0, F_SETFL, O_NONBLOCK);
/* map stub code + data */
res = stub_syscall6(STUB_MMAP_NR,
@@ -63,6 +68,10 @@ noinline static void real_init(void)
if (res != init_data.stub_start + UM_KERN_PAGE_SIZE)
stub_syscall1(__NR_exit, 12);
+ /* In SECCOMP mode, we only need the signalling FD from now on */
+ if (init_data.seccomp)
+ stub_syscall3(__NR_close_range, 1, ~0U, 0);
+
/* setup signal stack inside stub data */
stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE;
stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0);
@@ -153,8 +162,12 @@ noinline static void real_init(void)
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
offsetof(struct seccomp_data, nr)),
- /* [10-14] Check against permitted syscalls */
+ /* [10-16] Check against permitted syscalls */
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex,
+ 7, 0),
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_recvmsg,
+ 6, 0),
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_close,
5, 0),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR,
4, 0),
@@ -170,10 +183,10 @@ noinline static void real_init(void)
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn,
1, 0),
- /* [15] Not one of the permitted syscalls */
+ /* [17] Not one of the permitted syscalls */
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
- /* [16] Permitted call for the stub */
+ /* [18] Permitted call for the stub */
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
};
struct sock_fprog prog = {
@@ -44,6 +44,16 @@ void syscall_stub_dump_error(struct mm_id *mm_idp)
print_hex_dump(UM_KERN_ERR, " syscall data: ", 0,
16, 4, sc, sizeof(*sc), 0);
+
+ if (using_seccomp) {
+ printk(UM_KERN_ERR "%s: FD map num: %d", __func__,
+ mm_idp->syscall_fd_num);
+ print_hex_dump(UM_KERN_ERR,
+ " FD map: ", 0, 16,
+ sizeof(mm_idp->syscall_fd_map[0]),
+ mm_idp->syscall_fd_map,
+ sizeof(mm_idp->syscall_fd_map), 0);
+ }
}
static inline unsigned long *check_init_stack(struct mm_id * mm_idp,
@@ -119,6 +129,9 @@ static inline long do_syscall_stub(struct mm_id *mm_idp)
mm_idp->syscall_data_len = 0;
}
+ if (using_seccomp)
+ mm_idp->syscall_fd_num = 0;
+
return mm_idp->syscall_data_len;
}
@@ -181,6 +194,44 @@ static struct stub_syscall *syscall_stub_get_previous(struct mm_id *mm_idp,
return NULL;
}
+static int get_stub_fd(struct mm_id *mm_idp, int fd)
+{
+ int i;
+
+ /*
+ * Translate a host FD into the value that is stored in a queued stub
+ * syscall. Without SECCOMP the FD is returned unchanged. With SECCOMP
+ * the return value is an index into mm_idp->syscall_fd_map: find an
+ * FD slot (or flush and use first).
+ */
+ if (!using_seccomp)
+ return fd;
+
+ /* Already crashed (negative syscall_data_len), value does not matter */
+ if (mm_idp->syscall_data_len < 0)
+ return 0;
+
+ /*
+ * Find existing FD in map if we can allocate another syscall; if it
+ * is not in the map yet, append it while a slot is still free.
+ */
+ if (mm_idp->syscall_data_len <
+ ARRAY_SIZE(((struct stub_data *)NULL)->syscall_data)) {
+ for (i = 0; i < mm_idp->syscall_fd_num; i++) {
+ if (mm_idp->syscall_fd_map[i] == fd)
+ return i;
+ }
+
+ if (mm_idp->syscall_fd_num < STUB_MAX_FDS) {
+ i = mm_idp->syscall_fd_num;
+ mm_idp->syscall_fd_map[i] = fd;
+
+ mm_idp->syscall_fd_num++;
+
+ return i;
+ }
+ }
+
+ /*
+ * FD map full or no syscall space available, continue after flush:
+ * run the queued syscalls via do_syscall_stub() and then start a new
+ * map with fd in slot 0.
+ */
+ do_syscall_stub(mm_idp);
+ mm_idp->syscall_fd_map[0] = fd;
+ mm_idp->syscall_fd_num = 1;
+
+ return 0;
+}
+
int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot,
int phys_fd, unsigned long long offset)
{
@@ -188,12 +239,21 @@ int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot,
/* Compress with previous syscall if that is possible */
sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MMAP, virt);
- if (sc && sc->mem.prot == prot && sc->mem.fd == phys_fd &&
+ if (sc && sc->mem.prot == prot &&
sc->mem.offset == MMAP_OFFSET(offset - sc->mem.length)) {
- sc->mem.length += len;
- return 0;
+ int prev_fd = sc->mem.fd;
+
+ if (using_seccomp)
+ prev_fd = mm_idp->syscall_fd_map[sc->mem.fd];
+
+ if (phys_fd == prev_fd) {
+ sc->mem.length += len;
+ return 0;
+ }
}
+ phys_fd = get_stub_fd(mm_idp, phys_fd);
+
sc = syscall_stub_alloc(mm_idp);
sc->syscall = STUB_SYSCALL_MMAP;
sc->mem.addr = virt;
@@ -16,6 +16,7 @@
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/stat.h>
+#include <sys/socket.h>
#include <asm/unistd.h>
#include <as-layout.h>
#include <init.h>
@@ -153,7 +154,39 @@ void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys)
int ret;
do {
+ const char byte = 0;
+ struct iovec iov = {
+ .iov_base = (void *)&byte,
+ .iov_len = sizeof(byte),
+ };
+ union {
+ char data[CMSG_SPACE(sizeof(mm_idp->syscall_fd_map))];
+ struct cmsghdr align;
+ } ctrl;
+ struct msghdr msgh = {
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ };
+
if (!running) {
+ if (mm_idp->syscall_fd_num) {
+ unsigned int fds_size =
+ sizeof(int) * mm_idp->syscall_fd_num;
+ struct cmsghdr *cmsg;
+
+ msgh.msg_control = ctrl.data;
+ msgh.msg_controllen = CMSG_SPACE(fds_size);
+ cmsg = CMSG_FIRSTHDR(&msgh);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(fds_size);
+ memcpy(CMSG_DATA(cmsg), mm_idp->syscall_fd_map,
+ fds_size);
+
+ CATCH_EINTR(syscall(__NR_sendmsg, mm_idp->sock,
+ &msgh, 0));
+ }
+
data->signal = 0;
data->futex = FUTEX_IN_CHILD;
CATCH_EINTR(syscall(__NR_futex, &data->futex,
@@ -189,7 +222,7 @@ void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys)
} while (wait_sigsys && data->signal == SIGALRM);
if (ret < 0 && errno != EAGAIN) {
- printk(UM_KERN_ERR "%s : waiting for child futex failed, errno = %d\n",
+ printk(UM_KERN_ERR "%s : waiting for child failed, errno = %d\n",
__func__, errno);
goto out_kill;
}
@@ -247,10 +280,16 @@ extern char __syscall_stub_start[];
static int stub_exe_fd;
-static int userspace_tramp(void *stack)
+struct tramp_data {
+ struct stub_data *stub_data; /* resolved to FD + offset via phys_mapping() */
+ /*
+ * Socket pair created by start_userspace(): 0 is inherited (the
+ * trampoline dup2()s it to FD 0), 1 is the kernel side (init_data is
+ * written to it; kept as mm_id->sock when using SECCOMP).
+ */
+ int sockpair[2];
+};
+
+static int userspace_tramp(void *data)
{
+ struct tramp_data *tramp_data = data;
char *const argv[] = { "uml-userspace", NULL };
- int pipe_fds[2];
unsigned long long offset;
struct stub_init_data init_data = {
.seccomp = using_seccomp,
@@ -277,27 +316,32 @@ static int userspace_tramp(void *stack)
&offset);
init_data.stub_code_offset = MMAP_OFFSET(offset);
- init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset);
+ init_data.stub_data_fd = phys_mapping(uml_to_phys(tramp_data->stub_data),
+ &offset);
init_data.stub_data_offset = MMAP_OFFSET(offset);
- /* Set CLOEXEC on all FDs and then unset on all memory related FDs */
- close_range(0, ~0U, CLOSE_RANGE_CLOEXEC);
+ /* dup2 signaling FD/socket to STDIN */
+ if (dup2(tramp_data->sockpair[0], 0) < 0)
+ exit(3);
+
+ /*
+ * Set CLOEXEC on all FDs except the signaling one and then unset for
+ * the main memory FD as well as IOMEM regions (if not in SECCOMP).
+ */
+ close_range(1, ~0U, CLOSE_RANGE_CLOEXEC);
fcntl(init_data.stub_data_fd, F_SETFD, 0);
- for (iomem = iomem_regions; iomem; iomem = iomem->next)
- fcntl(iomem->fd, F_SETFD, 0);
- /* Create a pipe for init_data (no CLOEXEC) and dup2 to STDIN */
- if (pipe(pipe_fds))
- exit(2);
+ if (!using_seccomp) {
+ for (iomem = iomem_regions; iomem; iomem = iomem->next)
+ fcntl(iomem->fd, F_SETFD, 0);
+ }
- if (dup2(pipe_fds[0], 0) < 0)
- exit(3);
- close(pipe_fds[0]);
+ close(tramp_data->sockpair[0]);
/* Write init_data and close write side */
- ret = write(pipe_fds[1], &init_data, sizeof(init_data));
- close(pipe_fds[1]);
+ ret = write(tramp_data->sockpair[1], &init_data, sizeof(init_data));
+ close(tramp_data->sockpair[1]);
if (ret != sizeof(init_data))
exit(4);
@@ -390,7 +434,7 @@ int userspace_pid[NR_CPUS];
/**
* start_userspace() - prepare a new userspace process
- * @stub_stack: pointer to the stub stack.
+ * @mm_id: The corresponding struct mm_id
*
* Setups a new temporary stack page that is used while userspace_tramp() runs
* Clones the kernel process into a new userspace process, with FDs only.
@@ -402,9 +446,12 @@ int userspace_pid[NR_CPUS];
int start_userspace(struct mm_id *mm_id)
{
struct stub_data *proc_data = (void *)mm_id->stack;
+ struct tramp_data tramp_data = {
+ .stub_data = proc_data,
+ };
void *stack;
unsigned long sp;
- int pid, status, n, err;
+ int status, n, err;
/* setup a temporary stack page */
stack = mmap(NULL, UM_KERN_PAGE_SIZE,
@@ -420,25 +467,32 @@ int start_userspace(struct mm_id *mm_id)
/* set stack pointer to the end of the stack page, so it can grow downwards */
sp = (unsigned long)stack + UM_KERN_PAGE_SIZE;
+ /* socket pair for init data and SECCOMP FD passing (no CLOEXEC here) */
+ if (socketpair(AF_UNIX, SOCK_STREAM, 0, tramp_data.sockpair)) {
+ err = -errno;
+ printk(UM_KERN_ERR "%s : socketpair failed, errno = %d\n",
+ __func__, errno);
+ return err;
+ }
+
if (using_seccomp)
proc_data->futex = FUTEX_IN_CHILD;
- /* clone into new userspace process */
- pid = clone(userspace_tramp, (void *) sp,
+ mm_id->pid = clone(userspace_tramp, (void *) sp,
CLONE_VFORK | CLONE_VM | SIGCHLD,
- (void *)mm_id->stack);
- if (pid < 0) {
+ (void *)&tramp_data);
+ if (mm_id->pid < 0) {
err = -errno;
printk(UM_KERN_ERR "%s : clone failed, errno = %d\n",
__func__, errno);
- return err;
+ goto out_close;
}
if (using_seccomp) {
wait_stub_done_seccomp(mm_id, 1, 1);
} else {
do {
- CATCH_EINTR(n = waitpid(pid, &status,
+ CATCH_EINTR(n = waitpid(mm_id->pid, &status,
WUNTRACED | __WALL));
if (n < 0) {
err = -errno;
@@ -455,7 +509,7 @@ int start_userspace(struct mm_id *mm_id)
goto out_kill;
}
- if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
+ if (ptrace(PTRACE_SETOPTIONS, mm_id->pid, NULL,
(void *) PTRACE_O_TRACESYSGOOD) < 0) {
err = -errno;
printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n",
@@ -471,12 +525,22 @@ int start_userspace(struct mm_id *mm_id)
goto out_kill;
}
- mm_id->pid = pid;
+ close(tramp_data.sockpair[0]);
+ if (using_seccomp)
+ mm_id->sock = tramp_data.sockpair[1];
+ else
+ close(tramp_data.sockpair[1]);
- return pid;
+ return 0;
+
+out_kill:
+ os_kill_ptraced_process(mm_id->pid, 1);
+out_close:
+ close(tramp_data.sockpair[0]);
+ close(tramp_data.sockpair[1]);
+
+ mm_id->pid = -1;
- out_kill:
- os_kill_ptraced_process(pid, 1);
return err;
}
@@ -537,17 +601,8 @@ void userspace(struct uml_pt_regs *regs)
/* Mark pending syscalls for flushing */
proc_data->syscall_data_len = mm_id->syscall_data_len;
- mm_id->syscall_data_len = 0;
- proc_data->signal = 0;
- proc_data->futex = FUTEX_IN_CHILD;
- CATCH_EINTR(syscall(__NR_futex, &proc_data->futex,
- FUTEX_WAKE, 1, NULL, NULL, 0));
- do {
- ret = syscall(__NR_futex, &proc_data->futex,
- FUTEX_WAIT, FUTEX_IN_CHILD, NULL, NULL, 0);
- } while ((ret == -1 && errno == EINTR) ||
- proc_data->futex == FUTEX_IN_CHILD);
+ wait_stub_done_seccomp(mm_id, 0, 0);
sig = proc_data->signal;
@@ -555,9 +610,13 @@ void userspace(struct uml_pt_regs *regs)
printk(UM_KERN_ERR "%s - Error flushing stub syscalls",
__func__);
syscall_stub_dump_error(mm_id);
+ mm_id->syscall_data_len = proc_data->err;
fatal_sigsegv();
}
+ mm_id->syscall_data_len = 0;
+ mm_id->syscall_fd_num = 0;
+
ret = get_stub_state(regs, proc_data, NULL);
if (ret) {
printk(UM_KERN_ERR "%s - failed to get regs: %d",