Message ID: 20230922105609.545573-1-anton.ivanov@cambridgegreys.com
State: Superseded
Series: [v6] um: Enable preemption in UML
On 22/09/2023 11:56, anton.ivanov@cambridgegreys.com wrote:
> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>
> 1. Preemption requires saving/restoring FPU state. This patch
> adds support for it using GCC intrinsics as well as appropriate
> storage space in the thread structure.
>
> 2. irq critical sections need preempt_disable()/preempt_enable().
>
> 3. TLB critical sections need preempt_disable()/preempt_enable().

New in this patch - 4 and 5.

> 4. UML TLB flush is also invoked during a fork. This happens
> with interrupts and preempt disabled which disagrees with the
> standard mm locking via rwsem. The mm lock for this code path
> had to be replaced with an rcu.
>
> 5. The FPU state area is statically allocated depending on
> the enabled PREEMPT options. PREEMPT_DYNAMIC and choosing the
> preemption model at start time is disabled for the UM arch.
>
> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> ---
>  arch/um/Kconfig                         |  2 +-
>  arch/um/include/asm/fpu/api.h           |  9 ++-
>  arch/um/include/asm/processor-generic.h |  4 ++
>  arch/um/kernel/Makefile                 |  4 ++
>  arch/um/kernel/fpu.c                    | 75 +++++++++++++++++++++++++
>  arch/um/kernel/irq.c                    |  2 +
>  arch/um/kernel/tlb.c                    | 21 ++++++-
>  7 files changed, 111 insertions(+), 6 deletions(-)
>  create mode 100644 arch/um/kernel/fpu.c

[... full patch quoted; the complete diff appears at the end of this page ...]
On 22/09/2023 11:56, anton.ivanov@cambridgegreys.com wrote:
>
> 4. UML TLB flush is also invoked during a fork. This happens
> with interrupts and preempt disabled which disagrees with the
> standard mm locking via rwsem. The mm lock for this code path
> had to be replaced with an rcu.

For the record, even if I figured out this gets rid of the complaints,
I'm not entirely happy with this - yeah it's safe now, but it still
feels entirely wrong.

But I guess we can also do this and then remove it entirely like the
patch I just posted. Order probably doesn't matter much.

(Note my patch removes the locking completely since it's now invoked by
the kernel even under write mmap lock.)

> diff --git a/arch/um/kernel/fpu.c b/arch/um/kernel/fpu.c
> new file mode 100644
> index 000000000000..4817276b2a26
> --- /dev/null
> +++ b/arch/um/kernel/fpu.c

> +#ifdef CONFIG_64BIT
> +	if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVEOPT)))
> +		__builtin_ia32_xsaveopt64(&current->thread.fpu, KNOWN_387_FEATURES);
> +	else {
> +		if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE)))
> +			__builtin_ia32_xsave64(&current->thread.fpu, KNOWN_387_FEATURES);
> +		else
> +			__builtin_ia32_fxsave64(&current->thread.fpu);

:)

OK, I'll stop mentioning it ;-)

> +EXPORT_SYMBOL_GPL(kernel_fpu_end);
> +

git am complained here about blank line at EOF

> @@ -597,8 +609,13 @@ void force_flush_all(void)
>  	struct vm_area_struct *vma;
>  	VMA_ITERATOR(vmi, mm, 0);
>
> -	mmap_read_lock(mm);
> +	/* We use a RCU lock instead of a mm lock, because
> +	 * this can be invoked out of critical/atomic sections
> +	 * and that does not agree with the sleepable semantics
> +	 * of the standard semaphore based mm lock.
> +	 */
> +	rcu_read_lock();

Yeah I guess ... Seems like a very mechanical description of what's
going on, rather than a description of why this is correct (which
assumes no preempt and no SMP)?

I'd have preferred that, but with the patch I just posted we'll just
kill this entirely so it doesn't matter in the end.

johannes
On 22/09/2023 12:22, Johannes Berg wrote:
>>
>> 4. UML TLB flush is also invoked during a fork. This happens
>> with interrupts and preempt disabled which disagrees with the
>> standard mm locking via rwsem. The mm lock for this code path
>> had to be replaced with an rcu.
>
> For the record, even if I figured out this gets rid of the complaints,
> I'm not entirely happy with this - yeah it's safe now, but it still
> feels entirely wrong.
>
> But I guess we can also do this and then remove it entirely like the
> patch I just posted. Order probably doesn't matter much.

Indeed. We opened the can of worms. It will be a while until they are
all in bigger and more comfortable new quarters.

>
> (Note my patch removes the locking completely since it's now invoked by
> the kernel even under write mmap lock.)

Ack.

>
>> diff --git a/arch/um/kernel/fpu.c b/arch/um/kernel/fpu.c
>> new file mode 100644
>> index 000000000000..4817276b2a26
>> --- /dev/null
>> +++ b/arch/um/kernel/fpu.c
>
>> +#ifdef CONFIG_64BIT
>> +	if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVEOPT)))
>> +		__builtin_ia32_xsaveopt64(&current->thread.fpu, KNOWN_387_FEATURES);
>> +	else {
>> +		if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE)))
>> +			__builtin_ia32_xsave64(&current->thread.fpu, KNOWN_387_FEATURES);
>> +		else
>> +			__builtin_ia32_fxsave64(&current->thread.fpu);
>
> :)
>
> OK, I'll stop mentioning it ;-)
>
>> +EXPORT_SYMBOL_GPL(kernel_fpu_end);
>> +
>
> git am complained here about blank line at EOF
>
>> @@ -597,8 +609,13 @@ void force_flush_all(void)
>>  	struct vm_area_struct *vma;
>>  	VMA_ITERATOR(vmi, mm, 0);
>>
>> -	mmap_read_lock(mm);
>> +	/* We use a RCU lock instead of a mm lock, because
>> +	 * this can be invoked out of critical/atomic sections
>> +	 * and that does not agree with the sleepable semantics
>> +	 * of the standard semaphore based mm lock.
>> +	 */
>> +	rcu_read_lock();
>
> Yeah I guess ... Seems like a very mechanical description of what's
> going on, rather than a description of why this is correct (which
> assumes no preempt and no SMP)?
>
> I'd have preferred that, but with the patch I just posted we'll just
> kill this entirely so it doesn't matter in the end.
>
> johannes
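To make the constraint in point 4 concrete: mmap_read_lock() takes a
sleeping rwsem, which is not allowed in the atomic context the UML fork
path runs in, whereas rcu_read_lock() never sleeps. A minimal sketch of
the two variants follows -- the function and its caller are
hypothetical, not code from the patch:

#include <linux/mm.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>

/* Hypothetical illustration only -- not code from the patch. */
static void walk_vmas_in_atomic_context(struct mm_struct *mm)
{
	preempt_disable();	/* the UML fork path arrives here like this */

	/*
	 * Variant 1: rwsem. down_read() may sleep, so with preemption
	 * disabled DEBUG_ATOMIC_SLEEP emits a "sleeping function called
	 * from invalid context" splat.
	 */
	mmap_read_lock(mm);
	mmap_read_unlock(mm);

	/*
	 * Variant 2: RCU. Never sleeps, so no splat -- but it only
	 * excludes writers that publish via RCU, which is what the
	 * reviewers below are questioning.
	 */
	rcu_read_lock();
	rcu_read_unlock();

	preempt_enable();
}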
On Fri, Sept 22, 2023 at 06:56, anton.ivanov@cambridgegreys.com wrote:
>
> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>
> 1. Preemption requires saving/restoring FPU state. This patch
> adds support for it using GCC intrinsics as well as appropriate
> storage space in the thread structure.
>
> 2. irq critical sections need preempt_disable()/preempt_enable().
>
> 3. TLB critical sections need preempt_disable()/preempt_enable().
>
> 4. UML TLB flush is also invoked during a fork. This happens
> with interrupts and preempt disabled which disagrees with the
> standard mm locking via rwsem. The mm lock for this code path
> had to be replaced with an rcu.

"Had to be"? Why? It's a very unconventional locking method and should
have some justification.

> 5. The FPU state area is statically allocated depending on
> the enabled PREEMPT options. PREEMPT_DYNAMIC and choosing the
> preemption model at start time is disabled for the UM arch.
>
> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> ---

[...]

> diff --git a/arch/um/kernel/fpu.c b/arch/um/kernel/fpu.c
> new file mode 100644
> index 000000000000..4817276b2a26
> --- /dev/null
> +++ b/arch/um/kernel/fpu.c

[...]

> +void kernel_fpu_begin(void)
> +{
> +	preempt_disable();
> +
> +	WARN_ON(this_cpu_read(in_kernel_fpu));
> +
> +	this_cpu_write(in_kernel_fpu, true);
> +
> +#ifdef CONFIG_64BIT
> +	if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVEOPT)))
> +		__builtin_ia32_xsaveopt64(&current->thread.fpu, KNOWN_387_FEATURES);
> +	else {
> +		if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE)))
> +			__builtin_ia32_xsave64(&current->thread.fpu, KNOWN_387_FEATURES);
> +		else
> +			__builtin_ia32_fxsave64(&current->thread.fpu);
> +	}
> +#else
> +	if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVEOPT)))
> +		__builtin_ia32_xsaveopt(&current->thread.fpu, KNOWN_387_FEATURES);
> +	else {
> +		if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE)))
> +			__builtin_ia32_xsave(&current->thread.fpu, KNOWN_387_FEATURES);
> +		else
> +			__builtin_ia32_fxsave(&current->thread.fpu);
> +	}
> +#endif
> +}
> +EXPORT_SYMBOL_GPL(kernel_fpu_begin);
> +

Johannes is right about how flattening the else branch makes it a bit
more readable, imo.

> +void kernel_fpu_end(void)
> +{
> +	WARN_ON(!this_cpu_read(in_kernel_fpu));
> +
> +#ifdef CONFIG_64BIT
> +	if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE)))
> +		__builtin_ia32_xrstor64(&current->thread.fpu, KNOWN_387_FEATURES);
> +	else
> +		__builtin_ia32_fxrstor64(&current->thread.fpu);
> +#else
> +	if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE)))
> +		__builtin_ia32_xrstor(&current->thread.fpu, KNOWN_387_FEATURES);
> +	else
> +		__builtin_ia32_fxrstor(&current->thread.fpu);
> +#endif
> +	this_cpu_write(in_kernel_fpu, false);
> +
> +	preempt_enable();
> +}
> +EXPORT_SYMBOL_GPL(kernel_fpu_end);

--->8---

> @@ -597,8 +609,13 @@ void force_flush_all(void)
>  	struct vm_area_struct *vma;
>  	VMA_ITERATOR(vmi, mm, 0);
>
> -	mmap_read_lock(mm);
> +	/* We use a RCU lock instead of a mm lock, because
> +	 * this can be invoked out of critical/atomic sections
> +	 * and that does not agree with the sleepable semantics
> +	 * of the standard semaphore based mm lock.
> +	 */

As Johannes said, this comment doesn't explain why it's needed, just
that an RCU lock is used.

> +	rcu_read_lock();
>  	for_each_vma(vmi, vma)
>  		fix_range(mm, vma->vm_start, vma->vm_end, 1);
> -	mmap_read_unlock(mm);
> +	rcu_read_unlock();
>  }
> --
> 2.30.2

I didn't audit the assortment of preempt_{enable,disable}() pairs, but
I did test this patch with defconfig + debug.config + CONFIG_PREEMPT=y
for 15 minutes and I didn't see any warnings that used to show up.

I think that we should fix what currently exists, then rebase this
PREEMPT patch on top of the results. Plus that'll give me time to look
into the avx + !xsave edge case :)

Cheers,
Peter
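For reference, the flattened shape Johannes and Peter are asking for
would look something like this -- a sketch of the 64-bit save path
only, not the code as merged:

/* Sketch: the same feature checks as kernel_fpu_begin(), with the
 * nested else { if ... } collapsed into an else-if chain.
 */
if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVEOPT)))
	__builtin_ia32_xsaveopt64(&current->thread.fpu, KNOWN_387_FEATURES);
else if (cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE))
	__builtin_ia32_xsave64(&current->thread.fpu, KNOWN_387_FEATURES);
else
	__builtin_ia32_fxsave64(&current->thread.fpu);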
----- Original Message -----
> From: "Johannes Berg" <johannes@sipsolutions.net>
>> -	mmap_read_lock(mm);
>> +	/* We use a RCU lock instead of a mm lock, because
>> +	 * this can be invoked out of critical/atomic sections
>> +	 * and that does not agree with the sleepable semantics
>> +	 * of the standard semaphore based mm lock.
>> +	 */
>> +	rcu_read_lock();
>
> Yeah I guess ... Seems like a very mechanical description of what's
> going on, rather than a description of why this is correct (which
> assumes no preempt and no SMP)?

Is the VMA list actually RCU aware? On UML we might get away with that,
but I agree with Johannes, this needs a more detailed explanation.

Thanks,
//richard
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index b5e179360534..19176fde82f3 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -11,7 +11,7 @@ config UML
 	select ARCH_HAS_KCOV
 	select ARCH_HAS_STRNCPY_FROM_USER
 	select ARCH_HAS_STRNLEN_USER
-	select ARCH_NO_PREEMPT
+	select ARCH_NO_PREEMPT_DYNAMIC
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_KASAN if X86_64
 	select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN
diff --git a/arch/um/include/asm/fpu/api.h b/arch/um/include/asm/fpu/api.h
index 71bfd9ef3938..9e7680bf48f0 100644
--- a/arch/um/include/asm/fpu/api.h
+++ b/arch/um/include/asm/fpu/api.h
@@ -4,12 +4,15 @@
 
 /* Copyright (c) 2020 Cambridge Greys Ltd
  * Copyright (c) 2020 Red Hat Inc.
- * A set of "dummy" defines to allow the direct inclusion
- * of x86 optimized copy, xor, etc routines into the
- * UML code tree. */
+ */
 
+#if defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
+extern void kernel_fpu_begin(void);
+extern void kernel_fpu_end(void);
+#else
 #define kernel_fpu_begin() (void)0
 #define kernel_fpu_end() (void)0
+#endif
 
 static inline bool irq_fpu_usable(void)
 {
diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h
index 7414154b8e9a..9970e70be1e4 100644
--- a/arch/um/include/asm/processor-generic.h
+++ b/arch/um/include/asm/processor-generic.h
@@ -44,6 +44,10 @@ struct thread_struct {
 			} cb;
 		} u;
 	} request;
+#if defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
+/* Intel docs require xsave/xrestore area to be aligned to 64 bytes */
+	u8 fpu[2048] __aligned(64);
+#endif
 };
 
 #define INIT_THREAD \
diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile
index 811188be954c..c616e884a488 100644
--- a/arch/um/kernel/Makefile
+++ b/arch/um/kernel/Makefile
@@ -26,9 +26,13 @@ obj-$(CONFIG_OF) += dtb.o
 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-$(CONFIG_GENERIC_PCI_IOMAP) += ioport.o
+obj-$(CONFIG_PREEMPT) += fpu.o
+obj-$(CONFIG_PREEMPT_VOLUNTARY) += fpu.o
 
 USER_OBJS := config.o
 
+CFLAGS_fpu.o += -mxsave -mxsaveopt
+
 include $(srctree)/arch/um/scripts/Makefile.rules
 
 targets := config.c config.tmp capflags.c
diff --git a/arch/um/kernel/fpu.c b/arch/um/kernel/fpu.c
new file mode 100644
index 000000000000..4817276b2a26
--- /dev/null
+++ b/arch/um/kernel/fpu.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 Cambridge Greys Ltd
+ * Copyright (C) 2023 Red Hat Inc
+ */
+
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <asm/fpu/api.h>
+#include <asm/cpufeature.h>
+
+/*
+ * The critical section between kernel_fpu_begin() and kernel_fpu_end()
+ * is non-reentrant. It is the caller's responsibility to avoid reentrance.
+ */
+
+static DEFINE_PER_CPU(bool, in_kernel_fpu);
+
+/* UML and driver code it pulls out of the x86 tree knows about 387 features
+ * up to and including AVX512. TILE, etc are not yet supported.
+ */
+
+#define KNOWN_387_FEATURES 0xFF
+
+void kernel_fpu_begin(void)
+{
+	preempt_disable();
+
+	WARN_ON(this_cpu_read(in_kernel_fpu));
+
+	this_cpu_write(in_kernel_fpu, true);
+
+#ifdef CONFIG_64BIT
+	if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVEOPT)))
+		__builtin_ia32_xsaveopt64(&current->thread.fpu, KNOWN_387_FEATURES);
+	else {
+		if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE)))
+			__builtin_ia32_xsave64(&current->thread.fpu, KNOWN_387_FEATURES);
+		else
+			__builtin_ia32_fxsave64(&current->thread.fpu);
+	}
+#else
+	if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVEOPT)))
+		__builtin_ia32_xsaveopt(&current->thread.fpu, KNOWN_387_FEATURES);
+	else {
+		if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE)))
+			__builtin_ia32_xsave(&current->thread.fpu, KNOWN_387_FEATURES);
+		else
+			__builtin_ia32_fxsave(&current->thread.fpu);
+	}
+#endif
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+
+void kernel_fpu_end(void)
+{
+	WARN_ON(!this_cpu_read(in_kernel_fpu));
+
+#ifdef CONFIG_64BIT
+	if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE)))
+		__builtin_ia32_xrstor64(&current->thread.fpu, KNOWN_387_FEATURES);
+	else
+		__builtin_ia32_fxrstor64(&current->thread.fpu);
+#else
+	if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE)))
+		__builtin_ia32_xrstor(&current->thread.fpu, KNOWN_387_FEATURES);
+	else
+		__builtin_ia32_fxrstor(&current->thread.fpu);
+#endif
+	this_cpu_write(in_kernel_fpu, false);
+
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_end);
+
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 635d44606bfe..c02525da45df 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -195,7 +195,9 @@ static void _sigio_handler(struct uml_pt_regs *regs,
 
 void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
 {
+	preempt_disable();
 	_sigio_handler(regs, irqs_suspended);
+	preempt_enable();
 }
 
 static struct irq_entry *get_irq_entry_by_fd(int fd)
diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index 7d050ab0f78a..00b1870c2d62 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -322,6 +322,8 @@ static void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
 	unsigned long addr = start_addr, next;
 	int ret = 0, userspace = 1;
 
+	preempt_disable();
+
 	hvc = INIT_HVC(mm, force, userspace);
 	pgd = pgd_offset(mm, addr);
 	do {
@@ -346,6 +348,7 @@ static void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
 		       "process: %d\n", task_tgid_vnr(current));
 		mm_idp->kill = 1;
 	}
+	preempt_enable();
 }
 
 static int flush_tlb_kernel_range_common(unsigned long start, unsigned long end)
@@ -362,6 +365,9 @@ static int flush_tlb_kernel_range_common(unsigned long start, unsigned long end)
 
 	mm = &init_mm;
 	hvc = INIT_HVC(mm, force, userspace);
+
+	preempt_disable();
+
 	for (addr = start; addr < end;) {
 		pgd = pgd_offset(mm, addr);
 		if (!pgd_present(*pgd)) {
@@ -449,6 +455,9 @@ static int flush_tlb_kernel_range_common(unsigned long start, unsigned long end)
 
 	if (err < 0)
 		panic("flush_tlb_kernel failed, errno = %d\n", err);
+
+	preempt_enable();
+
 	return updated;
 }
 
@@ -466,6 +475,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long address)
 
 	address &= PAGE_MASK;
 
+	preempt_disable();
+
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
 		goto kill;
@@ -520,6 +531,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long address)
 
 	*pte = pte_mkuptodate(*pte);
 
+	preempt_enable();
 	return;
 
 kill:
@@ -597,8 +609,13 @@ void force_flush_all(void)
 	struct vm_area_struct *vma;
 	VMA_ITERATOR(vmi, mm, 0);
 
-	mmap_read_lock(mm);
+	/* We use a RCU lock instead of a mm lock, because
+	 * this can be invoked out of critical/atomic sections
+	 * and that does not agree with the sleepable semantics
+	 * of the standard semaphore based mm lock.
+	 */
+	rcu_read_lock();
 	for_each_vma(vmi, vma)
 		fix_range(mm, vma->vm_start, vma->vm_end, 1);
-	mmap_read_unlock(mm);
+	rcu_read_unlock();
 }
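For context, this is the usage pattern the new kernel_fpu_begin()/
kernel_fpu_end() pair enables for the x86-derived helper code UML pulls
in. The caller below is hypothetical, invented purely for illustration:

#include <asm/fpu/api.h>

/* Hypothetical caller, for illustration only: kernel code that
 * touches SSE/AVX registers brackets the use like this.
 */
static void example_simd_user(void)
{
	/* Saves the task's extended FPU state into current->thread.fpu
	 * (xsaveopt/xsave/fxsave, whichever the CPU supports) and
	 * disables preemption. */
	kernel_fpu_begin();

	/* ... SSE/AVX work goes here; per the comment in fpu.c this
	 * window is non-reentrant and must not sleep ... */

	/* Restores the saved state and re-enables preemption. */
	kernel_fpu_end();
}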