Message ID | 20201015150159.28933-3-cmr@codefail.de (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | Improve signal performance on PPC64 with KUAP | expand |
Context | Check | Description |
---|---|---|
snowpatch_ozlabs/apply_patch | warning | Failed to apply on branch powerpc/merge (118be7377c97e35c33819bcb3bbbae5a42a4ac43) |
snowpatch_ozlabs/apply_patch | warning | Failed to apply on branch powerpc/next (a2d0230b91f7e23ceb5d8fb6a9799f30517ec33a) |
snowpatch_ozlabs/apply_patch | warning | Failed to apply on branch linus/master (3e4fb4346c781068610d03c12b16c0cfb0fd24a3) |
snowpatch_ozlabs/apply_patch | warning | Failed to apply on branch powerpc/fixes (0460534b532e5518c657c7d6492b9337d975eaa3) |
snowpatch_ozlabs/apply_patch | warning | Failed to apply on branch linux-next (03d992bd2de6c7f2c9bbd4260ff104c0926ce3ff) |
snowpatch_ozlabs/apply_patch | fail | Failed to apply to any branch |
Le 15/10/2020 à 17:01, Christopher M. Riedl a écrit : > Reuse the "safe" implementation from signal.c except for calling > unsafe_copy_from_user() to copy into a local buffer. Unlike the > unsafe_copy_{vsx,fpr}_to_user() functions the "copy from" functions > cannot use unsafe_get_user() directly to bypass the local buffer since > doing so significantly reduces signal handling performance. Why can't the functions use unsafe_get_user(), why does it significantly reduces signal handling performance ? How much significant ? I would expect that not going through an intermediate memory area would be more efficient Christophe > > Signed-off-by: Christopher M. Riedl <cmr@codefail.de> > --- > arch/powerpc/kernel/signal.h | 33 +++++++++++++++++++++++++++++++++ > 1 file changed, 33 insertions(+) > > diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h > index 2559a681536e..e9aaeac0da37 100644 > --- a/arch/powerpc/kernel/signal.h > +++ b/arch/powerpc/kernel/signal.h > @@ -53,6 +53,33 @@ unsigned long copy_ckfpr_from_user(struct task_struct *task, void __user *from); > &buf[i], label);\ > } while (0) > > +#define unsafe_copy_fpr_from_user(task, from, label) do { \ > + struct task_struct *__t = task; \ > + u64 __user *__f = (u64 __user *)from; \ > + u64 buf[ELF_NFPREG]; \ > + int i; \ > + \ > + unsafe_copy_from_user(buf, __f, ELF_NFPREG * sizeof(double), \ > + label); \ > + for (i = 0; i < ELF_NFPREG - 1; i++) \ > + __t->thread.TS_FPR(i) = buf[i]; \ > + __t->thread.fp_state.fpscr = buf[i]; \ > +} while (0) > + > +#define unsafe_copy_vsx_from_user(task, from, label) do { \ > + struct task_struct *__t = task; \ > + u64 __user *__f = (u64 __user *)from; \ > + u64 buf[ELF_NVSRHALFREG]; \ > + int i; \ > + \ > + unsafe_copy_from_user(buf, __f, \ > + ELF_NVSRHALFREG * sizeof(double), \ > + label); \ > + for (i = 0; i < ELF_NVSRHALFREG ; i++) \ > + __t->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; \ > +} while (0) > + > + > #ifdef CONFIG_PPC_TRANSACTIONAL_MEM > 
#define unsafe_copy_ckfpr_to_user(to, task, label) do { \ > struct task_struct *__t = task; \ > @@ -80,6 +107,10 @@ unsigned long copy_ckfpr_from_user(struct task_struct *task, void __user *from); > unsafe_copy_to_user(to, (task)->thread.fp_state.fpr, \ > ELF_NFPREG * sizeof(double), label) > > +#define unsafe_copy_fpr_from_user(task, from, label) \ > + unsafe_copy_from_user((task)->thread.fp_state.fpr, from \ > + ELF_NFPREG * sizeof(double), label) > + > static inline unsigned long > copy_fpr_to_user(void __user *to, struct task_struct *task) > { > @@ -115,6 +146,8 @@ copy_ckfpr_from_user(struct task_struct *task, void __user *from) > #else > #define unsafe_copy_fpr_to_user(to, task, label) do { } while (0) > > +#define unsafe_copy_fpr_from_user(task, from, label) do { } while (0) > + > static inline unsigned long > copy_fpr_to_user(void __user *to, struct task_struct *task) > { >
On Fri Oct 16, 2020 at 10:48 AM CDT, Christophe Leroy wrote: > > > Le 15/10/2020 à 17:01, Christopher M. Riedl a écrit : > > Reuse the "safe" implementation from signal.c except for calling > > unsafe_copy_from_user() to copy into a local buffer. Unlike the > > unsafe_copy_{vsx,fpr}_to_user() functions the "copy from" functions > > cannot use unsafe_get_user() directly to bypass the local buffer since > > doing so significantly reduces signal handling performance. > > Why can't the functions use unsafe_get_user(), why does it significantly > reduces signal handling > performance ? How much significant ? I would expect that not going > through an intermediate memory > area would be more efficient > Here is a comparison, 'unsafe-signal64-regs' avoids the intermediate buffer: | | hash | radix | | -------------------- | ------ | ------ | | linuxppc/next | 289014 | 158408 | | unsafe-signal64 | 298506 | 253053 | | unsafe-signal64-regs | 254898 | 220831 | I have not figured out the 'why' yet. As you mentioned in your series, technically calling __copy_tofrom_user() is overkill for these operations. The only obvious difference between unsafe_put_user() and unsafe_get_user() is that we don't have asm-goto for the 'get' variant. Instead we wrap with unsafe_op_wrap() which inserts a conditional and then goto to the label. 
Implemenations: #define unsafe_copy_fpr_from_user(task, from, label) do { \ struct task_struct *__t = task; \ u64 __user *buf = (u64 __user *)from; \ int i; \ \ for (i = 0; i < ELF_NFPREG - 1; i++) \ unsafe_get_user(__t->thread.TS_FPR(i), &buf[i], label); \ unsafe_get_user(__t->thread.fp_state.fpscr, &buf[i], label); \ } while (0) #define unsafe_copy_vsx_from_user(task, from, label) do { \ struct task_struct *__t = task; \ u64 __user *buf = (u64 __user *)from; \ int i; \ \ for (i = 0; i < ELF_NVSRHALFREG ; i++) \ unsafe_get_user(__t->thread.fp_state.fpr[i][TS_VSRLOWOFFSET], \ &buf[i], label); \ } while (0) > Christophe > > > > > > Signed-off-by: Christopher M. Riedl <cmr@codefail.de> > > --- > > arch/powerpc/kernel/signal.h | 33 +++++++++++++++++++++++++++++++++ > > 1 file changed, 33 insertions(+) > > > > diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h > > index 2559a681536e..e9aaeac0da37 100644 > > --- a/arch/powerpc/kernel/signal.h > > +++ b/arch/powerpc/kernel/signal.h > > @@ -53,6 +53,33 @@ unsigned long copy_ckfpr_from_user(struct task_struct *task, void __user *from); > > &buf[i], label);\ > > } while (0) > > > > +#define unsafe_copy_fpr_from_user(task, from, label) do { \ > > + struct task_struct *__t = task; \ > > + u64 __user *__f = (u64 __user *)from; \ > > + u64 buf[ELF_NFPREG]; \ > > + int i; \ > > + \ > > + unsafe_copy_from_user(buf, __f, ELF_NFPREG * sizeof(double), \ > > + label); \ > > + for (i = 0; i < ELF_NFPREG - 1; i++) \ > > + __t->thread.TS_FPR(i) = buf[i]; \ > > + __t->thread.fp_state.fpscr = buf[i]; \ > > +} while (0) > > + > > +#define unsafe_copy_vsx_from_user(task, from, label) do { \ > > + struct task_struct *__t = task; \ > > + u64 __user *__f = (u64 __user *)from; \ > > + u64 buf[ELF_NVSRHALFREG]; \ > > + int i; \ > > + \ > > + unsafe_copy_from_user(buf, __f, \ > > + ELF_NVSRHALFREG * sizeof(double), \ > > + label); \ > > + for (i = 0; i < ELF_NVSRHALFREG ; i++) \ > > + 
__t->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; \ > > +} while (0) > > + > > + > > #ifdef CONFIG_PPC_TRANSACTIONAL_MEM > > #define unsafe_copy_ckfpr_to_user(to, task, label) do { \ > > struct task_struct *__t = task; \ > > @@ -80,6 +107,10 @@ unsigned long copy_ckfpr_from_user(struct task_struct *task, void __user *from); > > unsafe_copy_to_user(to, (task)->thread.fp_state.fpr, \ > > ELF_NFPREG * sizeof(double), label) > > > > +#define unsafe_copy_fpr_from_user(task, from, label) \ > > + unsafe_copy_from_user((task)->thread.fp_state.fpr, from \ > > + ELF_NFPREG * sizeof(double), label) > > + > > static inline unsigned long > > copy_fpr_to_user(void __user *to, struct task_struct *task) > > { > > @@ -115,6 +146,8 @@ copy_ckfpr_from_user(struct task_struct *task, void __user *from) > > #else > > #define unsafe_copy_fpr_to_user(to, task, label) do { } while (0) > > > > +#define unsafe_copy_fpr_from_user(task, from, label) do { } while (0) > > + > > static inline unsigned long > > copy_fpr_to_user(void __user *to, struct task_struct *task) > > { > >
Le 20/10/2020 à 04:01, Christopher M. Riedl a écrit : > On Fri Oct 16, 2020 at 10:48 AM CDT, Christophe Leroy wrote: >> >> >> Le 15/10/2020 à 17:01, Christopher M. Riedl a écrit : >>> Reuse the "safe" implementation from signal.c except for calling >>> unsafe_copy_from_user() to copy into a local buffer. Unlike the >>> unsafe_copy_{vsx,fpr}_to_user() functions the "copy from" functions >>> cannot use unsafe_get_user() directly to bypass the local buffer since >>> doing so significantly reduces signal handling performance. >> >> Why can't the functions use unsafe_get_user(), why does it significantly >> reduces signal handling >> performance ? How much significant ? I would expect that not going >> through an intermediate memory >> area would be more efficient >> > > Here is a comparison, 'unsafe-signal64-regs' avoids the intermediate buffer: > > | | hash | radix | > | -------------------- | ------ | ------ | > | linuxppc/next | 289014 | 158408 | > | unsafe-signal64 | 298506 | 253053 | > | unsafe-signal64-regs | 254898 | 220831 | > > I have not figured out the 'why' yet. As you mentioned in your series, > technically calling __copy_tofrom_user() is overkill for these > operations. The only obvious difference between unsafe_put_user() and > unsafe_get_user() is that we don't have asm-goto for the 'get' variant. > Instead we wrap with unsafe_op_wrap() which inserts a conditional and > then goto to the label. 
> > Implementations: > > #define unsafe_copy_fpr_from_user(task, from, label) do { \ > struct task_struct *__t = task; \ > u64 __user *buf = (u64 __user *)from; \ > int i; \ > \ > for (i = 0; i < ELF_NFPREG - 1; i++) \ > unsafe_get_user(__t->thread.TS_FPR(i), &buf[i], label); \ > unsafe_get_user(__t->thread.fp_state.fpscr, &buf[i], label); \ > } while (0) > > #define unsafe_copy_vsx_from_user(task, from, label) do { \ > struct task_struct *__t = task; \ > u64 __user *buf = (u64 __user *)from; \ > int i; \ > \ > for (i = 0; i < ELF_NVSRHALFREG ; i++) \ > unsafe_get_user(__t->thread.fp_state.fpr[i][TS_VSRLOWOFFSET], \ > &buf[i], label); \ > } while (0) > Do you have CONFIG_PROVE_LOCKING or CONFIG_DEBUG_ATOMIC_SLEEP enabled in your config ? If yes, could you try together with the patch from Alexey https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210204121612.32721-1-aik@ozlabs.ru/ ? Thanks Christophe
On Sat Feb 6, 2021 at 10:32 AM CST, Christophe Leroy wrote: > > > Le 20/10/2020 à 04:01, Christopher M. Riedl a écrit : > > On Fri Oct 16, 2020 at 10:48 AM CDT, Christophe Leroy wrote: > >> > >> > >> Le 15/10/2020 à 17:01, Christopher M. Riedl a écrit : > >>> Reuse the "safe" implementation from signal.c except for calling > >>> unsafe_copy_from_user() to copy into a local buffer. Unlike the > >>> unsafe_copy_{vsx,fpr}_to_user() functions the "copy from" functions > >>> cannot use unsafe_get_user() directly to bypass the local buffer since > >>> doing so significantly reduces signal handling performance. > >> > >> Why can't the functions use unsafe_get_user(), why does it significantly > >> reduces signal handling > >> performance ? How much significant ? I would expect that not going > >> through an intermediate memory > >> area would be more efficient > >> > > > > Here is a comparison, 'unsafe-signal64-regs' avoids the intermediate buffer: > > > > | | hash | radix | > > | -------------------- | ------ | ------ | > > | linuxppc/next | 289014 | 158408 | > > | unsafe-signal64 | 298506 | 253053 | > > | unsafe-signal64-regs | 254898 | 220831 | > > > > I have not figured out the 'why' yet. As you mentioned in your series, > > technically calling __copy_tofrom_user() is overkill for these > > operations. The only obvious difference between unsafe_put_user() and > > unsafe_get_user() is that we don't have asm-goto for the 'get' variant. > > Instead we wrap with unsafe_op_wrap() which inserts a conditional and > > then goto to the label. 
> > > > Implemenations: > > > > #define unsafe_copy_fpr_from_user(task, from, label) do { \ > > struct task_struct *__t = task; \ > > u64 __user *buf = (u64 __user *)from; \ > > int i; \ > > \ > > for (i = 0; i < ELF_NFPREG - 1; i++) \ > > unsafe_get_user(__t->thread.TS_FPR(i), &buf[i], label); \ > > unsafe_get_user(__t->thread.fp_state.fpscr, &buf[i], label); \ > > } while (0) > > > > #define unsafe_copy_vsx_from_user(task, from, label) do { \ > > struct task_struct *__t = task; \ > > u64 __user *buf = (u64 __user *)from; \ > > int i; \ > > \ > > for (i = 0; i < ELF_NVSRHALFREG ; i++) \ > > unsafe_get_user(__t->thread.fp_state.fpr[i][TS_VSRLOWOFFSET], \ > > &buf[i], label); \ > > } while (0) > > > > Do you have CONFIG_PROVE_LOCKING or CONFIG_DEBUG_ATOMIC_SLEEP enabled in > your config ? I don't have these set in my config (ppc64le_defconfig). I think I figured this out - the reason for the lower signal throughput is the barrier_nospec() in __get_user_nocheck(). When looping we incur that cost on every iteration. Commenting it out results in signal performance of ~316K w/ hash on the unsafe-signal64-regs branch. Obviously the barrier is there for a reason but it is quite costly. This also explains why the copy_{fpr,vsx}_to_user() direction does not suffer from the slowdown because there is no need for barrier_nospec(). > > If yes, could you try together with the patch from Alexey > https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210204121612.32721-1-aik@ozlabs.ru/ > ? > > Thanks > Christophe
Le 06/02/2021 à 18:39, Christopher M. Riedl a écrit : > On Sat Feb 6, 2021 at 10:32 AM CST, Christophe Leroy wrote: >> >> >> Le 20/10/2020 à 04:01, Christopher M. Riedl a écrit : >>> On Fri Oct 16, 2020 at 10:48 AM CDT, Christophe Leroy wrote: >>>> >>>> >>>> Le 15/10/2020 à 17:01, Christopher M. Riedl a écrit : >>>>> Reuse the "safe" implementation from signal.c except for calling >>>>> unsafe_copy_from_user() to copy into a local buffer. Unlike the >>>>> unsafe_copy_{vsx,fpr}_to_user() functions the "copy from" functions >>>>> cannot use unsafe_get_user() directly to bypass the local buffer since >>>>> doing so significantly reduces signal handling performance. >>>> >>>> Why can't the functions use unsafe_get_user(), why does it significantly >>>> reduces signal handling >>>> performance ? How much significant ? I would expect that not going >>>> through an intermediate memory >>>> area would be more efficient >>>> >>> >>> Here is a comparison, 'unsafe-signal64-regs' avoids the intermediate buffer: >>> >>> | | hash | radix | >>> | -------------------- | ------ | ------ | >>> | linuxppc/next | 289014 | 158408 | >>> | unsafe-signal64 | 298506 | 253053 | >>> | unsafe-signal64-regs | 254898 | 220831 | >>> >>> I have not figured out the 'why' yet. As you mentioned in your series, >>> technically calling __copy_tofrom_user() is overkill for these >>> operations. The only obvious difference between unsafe_put_user() and >>> unsafe_get_user() is that we don't have asm-goto for the 'get' variant. >>> Instead we wrap with unsafe_op_wrap() which inserts a conditional and >>> then goto to the label. 
>>> >>> Implementations: >>> >>> #define unsafe_copy_fpr_from_user(task, from, label) do { \ >>> struct task_struct *__t = task; \ >>> u64 __user *buf = (u64 __user *)from; \ >>> int i; \ >>> \ >>> for (i = 0; i < ELF_NFPREG - 1; i++) \ >>> unsafe_get_user(__t->thread.TS_FPR(i), &buf[i], label); \ >>> unsafe_get_user(__t->thread.fp_state.fpscr, &buf[i], label); \ >>> } while (0) >>> >>> #define unsafe_copy_vsx_from_user(task, from, label) do { \ >>> struct task_struct *__t = task; \ >>> u64 __user *buf = (u64 __user *)from; \ >>> int i; \ >>> \ >>> for (i = 0; i < ELF_NVSRHALFREG ; i++) \ >>> unsafe_get_user(__t->thread.fp_state.fpr[i][TS_VSRLOWOFFSET], \ >>> &buf[i], label); \ >>> } while (0) >>> >> >> Do you have CONFIG_PROVE_LOCKING or CONFIG_DEBUG_ATOMIC_SLEEP enabled in >> your config ? > > I don't have these set in my config (ppc64le_defconfig). I think I > figured this out - the reason for the lower signal throughput is the > barrier_nospec() in __get_user_nocheck(). When looping we incur that > cost on every iteration. Commenting it out results in signal performance > of ~316K w/ hash on the unsafe-signal64-regs branch. Obviously the > barrier is there for a reason but it is quite costly. Interesting. Can you try with the patch I just sent out https://patchwork.ozlabs.org/project/linuxppc-dev/patch/c72f014730823b413528e90ab6c4d3bcb79f8497.1612692067.git.christophe.leroy@csgroup.eu/ Thanks Christophe
On Sun Feb 7, 2021 at 4:12 AM CST, Christophe Leroy wrote: > > > Le 06/02/2021 à 18:39, Christopher M. Riedl a écrit : > > On Sat Feb 6, 2021 at 10:32 AM CST, Christophe Leroy wrote: > >> > >> > >> Le 20/10/2020 à 04:01, Christopher M. Riedl a écrit : > >>> On Fri Oct 16, 2020 at 10:48 AM CDT, Christophe Leroy wrote: > >>>> > >>>> > >>>> Le 15/10/2020 à 17:01, Christopher M. Riedl a écrit : > >>>>> Reuse the "safe" implementation from signal.c except for calling > >>>>> unsafe_copy_from_user() to copy into a local buffer. Unlike the > >>>>> unsafe_copy_{vsx,fpr}_to_user() functions the "copy from" functions > >>>>> cannot use unsafe_get_user() directly to bypass the local buffer since > >>>>> doing so significantly reduces signal handling performance. > >>>> > >>>> Why can't the functions use unsafe_get_user(), why does it significantly > >>>> reduces signal handling > >>>> performance ? How much significant ? I would expect that not going > >>>> through an intermediate memory > >>>> area would be more efficient > >>>> > >>> > >>> Here is a comparison, 'unsafe-signal64-regs' avoids the intermediate buffer: > >>> > >>> | | hash | radix | > >>> | -------------------- | ------ | ------ | > >>> | linuxppc/next | 289014 | 158408 | > >>> | unsafe-signal64 | 298506 | 253053 | > >>> | unsafe-signal64-regs | 254898 | 220831 | > >>> > >>> I have not figured out the 'why' yet. As you mentioned in your series, > >>> technically calling __copy_tofrom_user() is overkill for these > >>> operations. The only obvious difference between unsafe_put_user() and > >>> unsafe_get_user() is that we don't have asm-goto for the 'get' variant. > >>> Instead we wrap with unsafe_op_wrap() which inserts a conditional and > >>> then goto to the label. 
> >>> > >>> Implemenations: > >>> > >>> #define unsafe_copy_fpr_from_user(task, from, label) do { \ > >>> struct task_struct *__t = task; \ > >>> u64 __user *buf = (u64 __user *)from; \ > >>> int i; \ > >>> \ > >>> for (i = 0; i < ELF_NFPREG - 1; i++) \ > >>> unsafe_get_user(__t->thread.TS_FPR(i), &buf[i], label); \ > >>> unsafe_get_user(__t->thread.fp_state.fpscr, &buf[i], label); \ > >>> } while (0) > >>> > >>> #define unsafe_copy_vsx_from_user(task, from, label) do { \ > >>> struct task_struct *__t = task; \ > >>> u64 __user *buf = (u64 __user *)from; \ > >>> int i; \ > >>> \ > >>> for (i = 0; i < ELF_NVSRHALFREG ; i++) \ > >>> unsafe_get_user(__t->thread.fp_state.fpr[i][TS_VSRLOWOFFSET], \ > >>> &buf[i], label); \ > >>> } while (0) > >>> > >> > >> Do you have CONFIG_PROVE_LOCKING or CONFIG_DEBUG_ATOMIC_SLEEP enabled in > >> your config ? > > > > I don't have these set in my config (ppc64le_defconfig). I think I > > figured this out - the reason for the lower signal throughput is the > > barrier_nospec() in __get_user_nocheck(). When looping we incur that > > cost on every iteration. Commenting it out results in signal performance > > of ~316K w/ hash on the unsafe-signal64-regs branch. Obviously the > > barrier is there for a reason but it is quite costly. > > Interesting. > > Can you try with the patch I just sent out > https://patchwork.ozlabs.org/project/linuxppc-dev/patch/c72f014730823b413528e90ab6c4d3bcb79f8497.1612692067.git.christophe.leroy@csgroup.eu/ Yeah that patch solves the problem. Using unsafe_get_user() in a loop is actually faster on radix than using the intermediary buffer step. 
A summary of results below (unsafe-signal64-v6 uses unsafe_get_user() and avoids the local buffer): | | hash | radix | | -------------------------------- | ------ | ------ | | unsafe-signal64-v5 | 194533 | 230089 | | unsafe-signal64-v6 | 176739 | 202840 | | unsafe-signal64-v5+barrier patch | 203037 | 234936 | | unsafe-signal64-v6+barrier patch | 205484 | 241030 | I am still expecting some comments/feedback on my v5 before sending out v6. Should I include your patch in my series as well? > > Thanks > Christophe
Le 08/02/2021 à 18:14, Christopher M. Riedl a écrit : > On Sun Feb 7, 2021 at 4:12 AM CST, Christophe Leroy wrote: >> >> >> Le 06/02/2021 à 18:39, Christopher M. Riedl a écrit : >>> On Sat Feb 6, 2021 at 10:32 AM CST, Christophe Leroy wrote: >>>> >>>> >>>> Le 20/10/2020 à 04:01, Christopher M. Riedl a écrit : >>>>> On Fri Oct 16, 2020 at 10:48 AM CDT, Christophe Leroy wrote: >>>>>> >>>>>> >>>>>> Le 15/10/2020 à 17:01, Christopher M. Riedl a écrit : >>>>>>> Reuse the "safe" implementation from signal.c except for calling >>>>>>> unsafe_copy_from_user() to copy into a local buffer. Unlike the >>>>>>> unsafe_copy_{vsx,fpr}_to_user() functions the "copy from" functions >>>>>>> cannot use unsafe_get_user() directly to bypass the local buffer since >>>>>>> doing so significantly reduces signal handling performance. >>>>>> >>>>>> Why can't the functions use unsafe_get_user(), why does it significantly >>>>>> reduces signal handling >>>>>> performance ? How much significant ? I would expect that not going >>>>>> through an intermediate memory >>>>>> area would be more efficient >>>>>> >>>>> >>>>> Here is a comparison, 'unsafe-signal64-regs' avoids the intermediate buffer: >>>>> >>>>> | | hash | radix | >>>>> | -------------------- | ------ | ------ | >>>>> | linuxppc/next | 289014 | 158408 | >>>>> | unsafe-signal64 | 298506 | 253053 | >>>>> | unsafe-signal64-regs | 254898 | 220831 | >>>>> >>>>> I have not figured out the 'why' yet. As you mentioned in your series, >>>>> technically calling __copy_tofrom_user() is overkill for these >>>>> operations. The only obvious difference between unsafe_put_user() and >>>>> unsafe_get_user() is that we don't have asm-goto for the 'get' variant. >>>>> Instead we wrap with unsafe_op_wrap() which inserts a conditional and >>>>> then goto to the label. 
>>>>> >>>>> Implemenations: >>>>> >>>>> #define unsafe_copy_fpr_from_user(task, from, label) do { \ >>>>> struct task_struct *__t = task; \ >>>>> u64 __user *buf = (u64 __user *)from; \ >>>>> int i; \ >>>>> \ >>>>> for (i = 0; i < ELF_NFPREG - 1; i++) \ >>>>> unsafe_get_user(__t->thread.TS_FPR(i), &buf[i], label); \ >>>>> unsafe_get_user(__t->thread.fp_state.fpscr, &buf[i], label); \ >>>>> } while (0) >>>>> >>>>> #define unsafe_copy_vsx_from_user(task, from, label) do { \ >>>>> struct task_struct *__t = task; \ >>>>> u64 __user *buf = (u64 __user *)from; \ >>>>> int i; \ >>>>> \ >>>>> for (i = 0; i < ELF_NVSRHALFREG ; i++) \ >>>>> unsafe_get_user(__t->thread.fp_state.fpr[i][TS_VSRLOWOFFSET], \ >>>>> &buf[i], label); \ >>>>> } while (0) >>>>> >>>> >>>> Do you have CONFIG_PROVE_LOCKING or CONFIG_DEBUG_ATOMIC_SLEEP enabled in >>>> your config ? >>> >>> I don't have these set in my config (ppc64le_defconfig). I think I >>> figured this out - the reason for the lower signal throughput is the >>> barrier_nospec() in __get_user_nocheck(). When looping we incur that >>> cost on every iteration. Commenting it out results in signal performance >>> of ~316K w/ hash on the unsafe-signal64-regs branch. Obviously the >>> barrier is there for a reason but it is quite costly. >> >> Interesting. >> >> Can you try with the patch I just sent out >> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/c72f014730823b413528e90ab6c4d3bcb79f8497.1612692067.git.christophe.leroy@csgroup.eu/ > > Yeah that patch solves the problem. Using unsafe_get_user() in a loop is > actually faster on radix than using the intermediary buffer step. 
A > summary of results below (unsafe-signal64-v6 uses unsafe_get_user() and > avoids the local buffer): > > | | hash | radix | > | -------------------------------- | ------ | ------ | > | unsafe-signal64-v5 | 194533 | 230089 | > | unsafe-signal64-v6 | 176739 | 202840 | > | unsafe-signal64-v5+barrier patch | 203037 | 234936 | > | unsafe-signal64-v6+barrier patch | 205484 | 241030 | > > I am still expecting some comments/feedback on my v5 before sending out > v6. Should I include your patch in my series as well? > My patch is now flagged "under review" in patchwork so I suppose Michael picked it already. Christophe
diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h index 2559a681536e..e9aaeac0da37 100644 --- a/arch/powerpc/kernel/signal.h +++ b/arch/powerpc/kernel/signal.h @@ -53,6 +53,33 @@ unsigned long copy_ckfpr_from_user(struct task_struct *task, void __user *from); &buf[i], label);\ } while (0) +#define unsafe_copy_fpr_from_user(task, from, label) do { \ + struct task_struct *__t = task; \ + u64 __user *__f = (u64 __user *)from; \ + u64 buf[ELF_NFPREG]; \ + int i; \ + \ + unsafe_copy_from_user(buf, __f, ELF_NFPREG * sizeof(double), \ + label); \ + for (i = 0; i < ELF_NFPREG - 1; i++) \ + __t->thread.TS_FPR(i) = buf[i]; \ + __t->thread.fp_state.fpscr = buf[i]; \ +} while (0) + +#define unsafe_copy_vsx_from_user(task, from, label) do { \ + struct task_struct *__t = task; \ + u64 __user *__f = (u64 __user *)from; \ + u64 buf[ELF_NVSRHALFREG]; \ + int i; \ + \ + unsafe_copy_from_user(buf, __f, \ + ELF_NVSRHALFREG * sizeof(double), \ + label); \ + for (i = 0; i < ELF_NVSRHALFREG ; i++) \ + __t->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; \ +} while (0) + + #ifdef CONFIG_PPC_TRANSACTIONAL_MEM #define unsafe_copy_ckfpr_to_user(to, task, label) do { \ struct task_struct *__t = task; \ @@ -80,6 +107,10 @@ unsigned long copy_ckfpr_from_user(struct task_struct *task, void __user *from); unsafe_copy_to_user(to, (task)->thread.fp_state.fpr, \ ELF_NFPREG * sizeof(double), label) +#define unsafe_copy_fpr_from_user(task, from, label) \ + unsafe_copy_from_user((task)->thread.fp_state.fpr, from \ + ELF_NFPREG * sizeof(double), label) + static inline unsigned long copy_fpr_to_user(void __user *to, struct task_struct *task) { @@ -115,6 +146,8 @@ copy_ckfpr_from_user(struct task_struct *task, void __user *from) #else #define unsafe_copy_fpr_to_user(to, task, label) do { } while (0) +#define unsafe_copy_fpr_from_user(task, from, label) do { } while (0) + static inline unsigned long copy_fpr_to_user(void __user *to, struct task_struct *task) {
Reuse the "safe" implementation from signal.c except for calling unsafe_copy_from_user() to copy into a local buffer. Unlike the unsafe_copy_{vsx,fpr}_to_user() functions the "copy from" functions cannot use unsafe_get_user() directly to bypass the local buffer since doing so significantly reduces signal handling performance. Signed-off-by: Christopher M. Riedl <cmr@codefail.de> --- arch/powerpc/kernel/signal.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+)