Message ID | 894d17f7924e7d31bfd5d6595ee84158f7411e47.1731290567.git.thehajime@gmail.com |
---|---|
State | RFC |
Headers | show |
Series | nommu UML | expand |
Hi, On Mon, 2024-11-11 at 15:27 +0900, Hajime Tazaki wrote: > As userspace on UML/!MMU also need to configure %fs register when it is > running to correctly access thread structure, host syscalls implemented > in os-Linux drivers may be puzzled when they are called. Thus it has to > configure %fs register via arch_prctl(SET_FS) on every host syscalls. > > [SNIP] > + > +/** > + * get_host_cpu_features() return true with X86_FEATURE_FSGSBASE even > + * if the kernel is older and disabled using fsgsbase instruction. > + * thus detection is based on whether SIGILL is raised or not. > + */ > +static jmp_buf jmpbuf; > +static void sigill(int sig, siginfo_t *si, void *ctx_void) > +{ > + siglongjmp(jmpbuf, 1); > +} > + > +void __init check_fsgsbase(void) > +{ > + unsigned long fsbase; > + struct sigaction sa; > + > + /* Probe FSGSBASE */ > + memset(&sa, 0, sizeof(sa)); > + sa.sa_sigaction = sigill; > + sa.sa_flags = SA_SIGINFO | SA_RESETHAND; > + sigemptyset(&sa.sa_mask); > + if (sigaction(SIGILL, &sa, 0)) > + os_warn("sigaction"); > + > + os_info("Checking FSGSBASE instructions..."); > + if (sigsetjmp(jmpbuf, 0) == 0) { > + asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); > + host_has_fsgsbase = 1; > + os_info("OK\n"); > + } else { > + host_has_fsgsbase = 0; > + os_info("disabled\n"); > + } > +} According to Documentation/arch/x86/x86_64/fsgs.rst it looks like this can also be checked using the HWCAP2_FSGSBASE bit in AT_HWCAP2. Maybe that is a bit simpler? > [SNIP] > > __visible void do_syscall_64(struct pt_regs *regs) > { > int syscall; > @@ -49,6 +76,9 @@ __visible void do_syscall_64(struct pt_regs *regs) > if (syscall == __NR_vfork) > stack_copy = vfork_save_stack(); > > + /* set fs register to the original host one */ > + os_x86_arch_prctl(0, ARCH_SET_FS, (void *)host_fs); > + > if (likely(syscall < NR_syscalls)) { > PT_REGS_SET_SYSCALL_RETURN(regs, > EXECUTE_SYSCALL(syscall, regs)); > @@ -63,6 +93,11 @@ __visible void do_syscall_64(struct pt_regs *regs) > set_thread_flag(TIF_SIGPENDING); > interrupt_end(); > > + /* restore back fs register to userspace configured one */ > + os_x86_arch_prctl(0, ARCH_SET_FS, > + (void *)(current->thread.regs.regs.gp[FS_BASE > + / sizeof(unsigned long)])); > + > /* execve succeeded */ > if (syscall == __NR_execve && regs->regs.gp[HOST_AX] == 0) > userspace(¤t->thread.regs.regs); > diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c > index edb17fc73e07..d56df936a2d7 100644 > --- a/arch/x86/um/syscalls_64.c > +++ b/arch/x86/um/syscalls_64.c > @@ -12,11 +12,26 @@ > #include <asm/prctl.h> /* XXX This should get the constants from libc */ > #include <registers.h> > #include <os.h> > +#include <asm/thread_info.h> > +#include <asm/mman.h> > + > +#ifndef CONFIG_MMU > +/* > + * The guest libc can change FS, which confuses the host libc. > + * In fact, changing FS directly is not supported (check > + * man arch_prctl). So, whenever we make a host syscall, > + * we should be changing FS to the original FS (not the > + * one set by the guest libc). This original FS is stored > + * in host_fs. > + */ > +long long host_fs = -1; Right, the libc already uses it for its own thread-local storage. That is a bit annoying, as UML doesn't need threading in that sense. Note that similar handling needs to happen for every userspace to kernel switch. I guess the only other location is the signal handler. Benjamin > +#endif > > long arch_prctl(struct task_struct *task, int option, > unsigned long __user *arg2) > { > long ret = -EINVAL; > +#ifdef CONFIG_MMU > > switch (option) { > case ARCH_SET_FS: > @@ -38,6 +53,48 @@ long arch_prctl(struct task_struct *task, int option, > } > > return ret; > +#else > + > + unsigned long *ptr = arg2, tmp; > + > + switch (option) { > + case ARCH_SET_FS: > + if (host_fs == -1) > + os_arch_prctl(0, ARCH_GET_FS, (void *)&host_fs); > + ret = 0; > + break; > + case ARCH_SET_GS: > + ret = 0; > + break; > + case ARCH_GET_FS: > + case ARCH_GET_GS: > + ptr = &tmp; > + break; > + } > + > + ret = os_arch_prctl(0, option, ptr); > + if (ret) > + return ret; > + > + switch (option) { > + case ARCH_SET_FS: > + current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)] = > + (unsigned long) arg2; > + break; > + case ARCH_SET_GS: > + current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)] = > + (unsigned long) arg2; > + break; > + case ARCH_GET_FS: > + ret = put_user(current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)], arg2); > + break; > + case ARCH_GET_GS: > + ret = put_user(current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)], arg2); > + break; > + } > + > + return ret; > +#endif > } > > SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
On Wed, 27 Nov 2024 19:00:11 +0900, Benjamin Berg wrote: > > + > > + os_info("Checking FSGSBASE instructions..."); > > + if (sigsetjmp(jmpbuf, 0) == 0) { > > + asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); > > + host_has_fsgsbase = 1; > > + os_info("OK\n"); > > + } else { > > + host_has_fsgsbase = 0; > > + os_info("disabled\n"); > > + } > > +} > > According to Documentation/arch/x86/x86_64/fsgs.rst it looks like this > can also be checked using the HWCAP2_FSGSBASE bit in AT_HWCAP2. > > Maybe that is a bit simpler? Ah, thanks. This should be much simpler: +#include <sys/auxv.h> +#include <asm/hwcap2.h> +void __init check_fsgsbase(void) +{ + unsigned long auxv = getauxval(AT_HWCAP2); + + os_info("Checking FSGSBASE instructions..."); + if (auxv & HWCAP2_FSGSBASE) { + host_has_fsgsbase = 1; + os_info("OK\n"); + } else { + host_has_fsgsbase = 0; + os_info("disabled\n"); + } +} > > > [SNIP] > > > > __visible void do_syscall_64(struct pt_regs *regs) > > { > > int syscall; > > @@ -49,6 +76,9 @@ __visible void do_syscall_64(struct pt_regs *regs) > > if (syscall == __NR_vfork) > > stack_copy = vfork_save_stack(); > > > > + /* set fs register to the original host one */ > > + os_x86_arch_prctl(0, ARCH_SET_FS, (void *)host_fs); > > + > > if (likely(syscall < NR_syscalls)) { > > PT_REGS_SET_SYSCALL_RETURN(regs, > > EXECUTE_SYSCALL(syscall, regs)); > > @@ -63,6 +93,11 @@ __visible void do_syscall_64(struct pt_regs *regs) > > set_thread_flag(TIF_SIGPENDING); > > interrupt_end(); > > > > + /* restore back fs register to userspace configured one */ > > + os_x86_arch_prctl(0, ARCH_SET_FS, > > + (void *)(current->thread.regs.regs.gp[FS_BASE > > + / sizeof(unsigned long)])); > > + > > /* execve succeeded */ > > if (syscall == __NR_execve && regs->regs.gp[HOST_AX] == 0) > > userspace(¤t->thread.regs.regs); > > diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c > > index edb17fc73e07..d56df936a2d7 100644 > > --- a/arch/x86/um/syscalls_64.c > > +++ b/arch/x86/um/syscalls_64.c > > @@ -12,11 +12,26 @@ > > #include <asm/prctl.h> /* XXX This should get the constants from libc */ > > #include <registers.h> > > #include <os.h> > > +#include <asm/thread_info.h> > > +#include <asm/mman.h> > > + > > +#ifndef CONFIG_MMU > > +/* > > + * The guest libc can change FS, which confuses the host libc. > > + * In fact, changing FS directly is not supported (check > > + * man arch_prctl). So, whenever we make a host syscall, > > + * we should be changing FS to the original FS (not the > > + * one set by the guest libc). This original FS is stored > > + * in host_fs. > > + */ > > +long long host_fs = -1; > > Right, the libc already uses it for its own thread-local storage. That > is a bit annoying, as UML doesn't need threading in that sense. > > Note that similar handling needs to happen for every userspace to > kernel switch. I guess the only other location is the signal handler. Thanks too. I guess the former (arch_switch_to) handles in my patch but the latter (signal handler) doesn't. Let me try to check. -- Hajime
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 5a6722f254d5..69a7854f5f87 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -136,6 +136,9 @@ static inline struct openflags of_cloexec(struct openflags flags) return flags; } +/* cpu.c */ +extern int host_has_fsgsbase; + /* file.c */ extern int os_stat_file(const char *file_name, struct uml_stat *buf); extern int os_stat_fd(const int fd, struct uml_stat *buf); @@ -221,6 +224,8 @@ extern int os_drop_memory(void *addr, int length); extern int can_drop_memory(void); extern int os_mincore(void *addr, unsigned long len); #ifndef CONFIG_MMU +extern long long host_fs; +extern int os_arch_prctl(int pid, int option, unsigned long *arg); extern int os_setup_seccomp(void); #endif diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index 20ff8d5971db..af7c5f4373bc 100644 --- a/arch/um/os-Linux/Makefile +++ b/arch/um/os-Linux/Makefile @@ -8,7 +8,7 @@ KCOV_INSTRUMENT := n obj-y = execvp.o file.o helper.o irq.o main.o mem.o process.o \ registers.o sigio.o signal.o start_up.o time.o tty.o \ - umid.o user_syms.o util.o drivers/ skas/ + umid.o user_syms.o util.o cpu.o drivers/ skas/ CFLAGS_signal.o += -Wframe-larger-than=4096 @@ -18,7 +18,7 @@ obj-$(CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA) += elf_aux.o USER_OBJS := $(user-objs-y) elf_aux.o execvp.o file.o helper.o irq.o \ main.o mem.o process.o registers.o sigio.o signal.o start_up.o time.o \ - tty.o umid.o util.o + tty.o umid.o util.o cpu.o include $(srctree)/arch/um/scripts/Makefile.rules CFLAGS_process.o=-g -O0 diff --git a/arch/um/os-Linux/cpu.c b/arch/um/os-Linux/cpu.c new file mode 100644 index 000000000000..49b6d8b4d65d --- /dev/null +++ b/arch/um/os-Linux/cpu.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <string.h> +#include <signal.h> +#include <setjmp.h> +#include <init.h> +#include "internal.h" + +int host_has_fsgsbase; +/* those definitions can be pulled from os.h but if we include this + * it shows conflicts of jmp_buf definitions in longjmp.h (UM) and + * host one. thus we declared here instead. + */ +void os_info(const char *fmt, ...); +void os_warn(const char *fmt, ...); + +/** + * get_host_cpu_features() return true with X86_FEATURE_FSGSBASE even + * if the kernel is older and disabled using fsgsbase instruction. + * thus detection is based on whether SIGILL is raised or not. + */ +static jmp_buf jmpbuf; +static void sigill(int sig, siginfo_t *si, void *ctx_void) +{ + siglongjmp(jmpbuf, 1); +} + +void __init check_fsgsbase(void) +{ + unsigned long fsbase; + struct sigaction sa; + + /* Probe FSGSBASE */ + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = sigill; + sa.sa_flags = SA_SIGINFO | SA_RESETHAND; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGILL, &sa, 0)) + os_warn("sigaction"); + + os_info("Checking FSGSBASE instructions..."); + if (sigsetjmp(jmpbuf, 0) == 0) { + asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); + host_has_fsgsbase = 1; + os_info("OK\n"); + } else { + host_has_fsgsbase = 0; + os_info("disabled\n"); + } +} diff --git a/arch/um/os-Linux/internal.h b/arch/um/os-Linux/internal.h index 317fca190c2b..60220b8b8843 100644 --- a/arch/um/os-Linux/internal.h +++ b/arch/um/os-Linux/internal.h @@ -2,6 +2,11 @@ #ifndef __UM_OS_LINUX_INTERNAL_H #define __UM_OS_LINUX_INTERNAL_H +/* + * cpu.c + */ +void check_fsgsbase(void); + /* * elf_aux.c */ diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c index 0afcdeb8995b..aecf63d3db79 100644 --- a/arch/um/os-Linux/main.c +++ b/arch/um/os-Linux/main.c @@ -17,6 +17,7 @@ #include <kern_util.h> #include <os.h> #include <um_malloc.h> +#include <asm/prctl.h> /* XXX This should get the constants from libc */ #include "internal.h" #define PGD_BOUND (4 * 1024 * 1024) @@ -158,6 +159,10 @@ int __init main(int argc, char **argv, char **envp) change_sig(SIGPIPE, 0); ret = linux_main(argc, argv, envp); +#ifndef CONFIG_MMU + os_arch_prctl(0, ARCH_SET_FS, (void *)host_fs); +#endif + /* * Disable SIGPROF - I have no idea why libc doesn't do this or turn * off the profiling time, but UML dies with a SIGPROF just before diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index 5acf6d41a4c2..5a3b09096f92 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -221,6 +221,14 @@ void os_set_pdeathsig(void) } #ifndef CONFIG_MMU +#include <unistd.h> +#include <sys/syscall.h> /* For SYS_xxx definitions */ + +int os_arch_prctl(int pid, int option, unsigned long *arg2) +{ + return syscall(SYS_arch_prctl, option, arg2); +} + int os_setup_seccomp(void) { int err; diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c index 93fc82c01aba..88164893cbec 100644 --- a/arch/um/os-Linux/start_up.c +++ b/arch/um/os-Linux/start_up.c @@ -293,6 +293,9 @@ void __init os_early_checks(void) */ check_tmpexec(); + /* probe fsgsbase instruction */ + check_fsgsbase(); + pid = start_ptraced_child(); if (init_pid_registers(pid)) fatal("Failed to initialize default registers"); diff --git a/arch/x86/um/do_syscall_64.c b/arch/x86/um/do_syscall_64.c index 203bacc4cb3c..75326acc931b 100644 --- a/arch/x86/um/do_syscall_64.c +++ b/arch/x86/um/do_syscall_64.c @@ -3,6 +3,8 @@ //#define DEBUG 1 #include <linux/kernel.h> #include <linux/ptrace.h> +#include <asm/fsgsbase.h> +#include <asm/prctl.h> #include <kern_util.h> #include <sysdep/syscalls.h> #include <os.h> @@ -34,6 +36,31 @@ static void vfork_restore_stack(void *stack_copy) stack_copy, 8); } +static int os_x86_arch_prctl(int pid, int option, unsigned long *arg2) +{ + if (host_has_fsgsbase) { + switch (option) { + case ARCH_SET_FS: + wrfsbase(*arg2); + break; + case ARCH_SET_GS: + wrgsbase(*arg2); + break; + case ARCH_GET_FS: + *arg2 = rdfsbase(); + break; + case ARCH_GET_GS: + *arg2 = rdgsbase(); + break; + } + return 0; + } else { + return os_arch_prctl(pid, option, arg2); + } + + return 0; +} + __visible void do_syscall_64(struct pt_regs *regs) { int syscall; @@ -49,6 +76,9 @@ __visible void do_syscall_64(struct pt_regs *regs) if (syscall == __NR_vfork) stack_copy = vfork_save_stack(); + /* set fs register to the original host one */ + os_x86_arch_prctl(0, ARCH_SET_FS, (void *)host_fs); + if (likely(syscall < NR_syscalls)) { PT_REGS_SET_SYSCALL_RETURN(regs, EXECUTE_SYSCALL(syscall, regs)); @@ -63,6 +93,11 @@ __visible void do_syscall_64(struct pt_regs *regs) set_thread_flag(TIF_SIGPENDING); interrupt_end(); + /* restore back fs register to userspace configured one */ + os_x86_arch_prctl(0, ARCH_SET_FS, + (void *)(current->thread.regs.regs.gp[FS_BASE + / sizeof(unsigned long)])); + /* execve succeeded */ if (syscall == __NR_execve && regs->regs.gp[HOST_AX] == 0) userspace(¤t->thread.regs.regs); diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c index edb17fc73e07..d56df936a2d7 100644 --- a/arch/x86/um/syscalls_64.c +++ b/arch/x86/um/syscalls_64.c @@ -12,11 +12,26 @@ #include <asm/prctl.h> /* XXX This should get the constants from libc */ #include <registers.h> #include <os.h> +#include <asm/thread_info.h> +#include <asm/mman.h> + +#ifndef CONFIG_MMU +/* + * The guest libc can change FS, which confuses the host libc. + * In fact, changing FS directly is not supported (check + * man arch_prctl). So, whenever we make a host syscall, + * we should be changing FS to the original FS (not the + * one set by the guest libc). This original FS is stored + * in host_fs. + */ +long long host_fs = -1; +#endif long arch_prctl(struct task_struct *task, int option, unsigned long __user *arg2) { long ret = -EINVAL; +#ifdef CONFIG_MMU switch (option) { case ARCH_SET_FS: @@ -38,6 +53,48 @@ long arch_prctl(struct task_struct *task, int option, } return ret; +#else + + unsigned long *ptr = arg2, tmp; + + switch (option) { + case ARCH_SET_FS: + if (host_fs == -1) + os_arch_prctl(0, ARCH_GET_FS, (void *)&host_fs); + ret = 0; + break; + case ARCH_SET_GS: + ret = 0; + break; + case ARCH_GET_FS: + case ARCH_GET_GS: + ptr = &tmp; + break; + } + + ret = os_arch_prctl(0, option, ptr); + if (ret) + return ret; + + switch (option) { + case ARCH_SET_FS: + current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)] = + (unsigned long) arg2; + break; + case ARCH_SET_GS: + current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)] = + (unsigned long) arg2; + break; + case ARCH_GET_FS: + ret = put_user(current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)], arg2); + break; + case ARCH_GET_GS: + ret = put_user(current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)], arg2); + break; + } + + return ret; +#endif } SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)