
[v1,1/2] x86: Implement sched_yield syscall for x86 only.

Message ID 20230608090050.2056824-1-goldstein.w.n@gmail.com
State New
Series [v1,1/2] x86: Implement sched_yield syscall for x86 only.

Commit Message

Noah Goldstein June 8, 2023, 9 a.m. UTC
We slightly optimize it by using `vzeroall` before the actual syscall.
This returns the SSE, AVX, and ZMM_HI256 xsave/xrstor states to the
init-state which allows the imminent context switch to skip
saving/restoring those states.
---
 .../unix/sysv/linux/x86_64/sched-yield-impl.h | 29 ++++++++++
 sysdeps/unix/sysv/linux/x86_64/sched_yield.c  | 56 +++++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 sysdeps/unix/sysv/linux/x86_64/sched-yield-impl.h
 create mode 100644 sysdeps/unix/sysv/linux/x86_64/sched_yield.c

Comments

Gabriel Ravier June 8, 2023, 10:13 a.m. UTC | #1
On 6/8/23 11:00, Noah Goldstein via Libc-alpha wrote:
> We slightly optimize it by using `vzeroall` before the actual syscall.
> This returns the SSE, AVX, and ZMM_HI256 xsave/xrstor states to the
> init-state which allows the imminent context switch to skip
> saving/restoring those states.
Could this potentially be explained in a bit more detail ? I've been 
searching around for almost half an hour now and I've seen nothing that 
indicates how this optimization actually works - not that I don't 
believe you, but I'm just a bit confused as to what this actually 
accomplishes.
Florian Weimer June 8, 2023, 11:43 a.m. UTC | #2
* Noah Goldstein via Libc-alpha:

> We slightly optimize it by using `vzeroall` before the actual syscall.
> This returns the SSE, AVX, and ZMM_HI256 xsave/xrstor states to the
> init-state which allows the imminent context switch to skip
> saving/restoring those states.

Surely there is a better way to implement this, enabling something
similar for all system calls issued by libc on the kernel side?  It
changes userspace ABI, so it has to be opt-in.  Maybe it could be an
additional flag in the system call number, indicating that it is safe
to zap the vector state if it is beneficial.
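
Purely as an illustration of that idea (no such flag exists in today's
syscall ABI; the bit and macro name below are invented for the example,
and a real kernel would simply reject the unknown number with -ENOSYS):

#include <asm/unistd.h>

/* Hypothetical opt-in bit OR'ed into the syscall number, telling the
   kernel that the caller's vector state is dead and may be zapped.
   Invented for illustration only.  */
#define SYSCALL_VECTOR_STATE_DEAD (1UL << 62)

static long
sched_yield_with_hint (void)
{
  unsigned long nr = __NR_sched_yield | SYSCALL_VECTOR_STATE_DEAD;
  long ret;
  asm volatile ("syscall"
                : "=a" (ret)
                : "0" (nr)
                : "rcx", "r11", "memory");
  return ret;
}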
Adhemerval Zanella Netto June 8, 2023, 12:08 p.m. UTC | #3
On 08/06/23 08:43, Florian Weimer wrote:
> * Noah Goldstein via Libc-alpha:
> 
>> We slightly optimize it by using `vzeroall` before the actual syscall.
>> This returns the SSE, AVX, and ZMM_HI256 xsave/xrstor states to the
>> init-state which allows the imminent context switch to skip
>> saving/restoring those states.
> 
> Surely there is a better way to implement this, enabling something
> similar for all system calls issued by libc on the kernel side?  It
> changes userspace ABI, so it has to be opt-in.  Maybe it could be an
> additional flag in the system call number, indicating that it is safe
> to zap the vector state if it is beneficial.

Agreed, trying to implement this in userland seems really hacky.  It means
potentially overriding and/or adding an ifunc variant for any syscall that
can potentially trigger a context switch, besides adding an arch-specific
implementation for something the kernel already has the information about
(so it could rewrite the syscall entry point depending on the ISA).
Noah Goldstein June 8, 2023, 5:39 p.m. UTC | #4
On Thu, Jun 8, 2023 at 7:08 AM Adhemerval Zanella Netto
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 08/06/23 08:43, Florian Weimer wrote:
> > * Noah Goldstein via Libc-alpha:
> >
> >> We slightly optimize it by using `vzeroall` before the actual syscall.
> >> This returns the SSE, AVX, and ZMM_HI256 xsave/xrstor states to the
> >> init-state which allows the imminent context switch to skip
> >> saving/restoring those states.
> >
> > Surely there is a better way to implement this, enabling something
> > similar for all system calls issued by libc on the kernel side?  It
> > changes userspace ABI, so it has to be opt-in.  Maybe it could be an
> > additional flag in the system call number, indicating that it is safe
> > to zap the vector state if it is beneficial.
It seems like a much bigger change than is needed.
>
> Agreed, trying to implement this in userland seems really hacky.  It means
> potentially overriding and/or adding an ifunc variant for any syscall that
> can potentially trigger a context switch, besides adding an arch-specific
> implementation for something the kernel already has the information about
> (so it could rewrite the syscall entry point depending on the ISA).

I don't think we need/want this for every syscall. Only the syscalls
where there is a high probability of a proper ctx switch and the calling
process going back to the schedule loop.
Otherwise the kernel generally just takes care to not touch vector registers
and doesn't bother with the save/restore.
Noah Goldstein June 8, 2023, 5:43 p.m. UTC | #5
On Thu, Jun 8, 2023 at 5:13 AM Gabriel Ravier <gabravier@gmail.com> wrote:
>
> On 6/8/23 11:00, Noah Goldstein via Libc-alpha wrote:
> > We slightly optimize it by using `vzeroall` before the actual syscall.
> > This returns the SSE, AVX, and ZMM_HI256 xsave/xrstor states to the
> > init-state which allows the imminent context switch to skip
> > saving/restoring those states.
> Could this potentially be explained in a bit more detail ? I've been
> searching around for almost half an hour now and I've seen nothing that
> indicates how this optimization actually works - not that I don't
> believe you, but I'm just a bit confused as to what this actually
> accomplishes.

On a context switch there is an "init optimization": register classes that
are known to be in their initial state aren't actually written/read by
xsave/xrstor:
https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf#page=324
In this case, `vzeroall` returns the SSE, AVX, and ZMM_HI256 state to the
init state:
https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf#page=309
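
Stripped of the glibc plumbing, the mechanism reduces to roughly this
sketch (using the public syscall wrapper instead of INLINE_SYSCALL_CALL,
and assuming an AVX-enabled build, e.g. -mavx):

#include <immintrin.h>
#include <unistd.h>
#include <sys/syscall.h>

static int
yield_with_init_state (void)
{
  /* Return the SSE/AVX/ZMM_HI256 components to their xsave init state so
     that, if this syscall does end up context switching, the kernel's
     init optimization can skip saving/restoring them.  This is only legal
     because the x86-64 call ABI already treats every vector register as
     call-clobbered.  */
  _mm256_zeroall ();
  return (int) syscall (SYS_sched_yield);
}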
Zack Weinberg June 8, 2023, 6:26 p.m. UTC | #6
On Thu, Jun 8, 2023, at 10:39 AM, Noah Goldstein via Libc-alpha wrote:
> I don't think we need/want this for every syscall. Only the syscalls
> where there is a high probability of a proper ctx switch and the calling
> process going back to the schedule loop.

Yeah, but that includes every syscall that performs I/O, which is most of them. Isn't it?

If these registers are all call-clobbered then maybe it makes sense to do this unconditionally in the syscall entry path, kernel side. That way only context switches triggered by actual preemption would have to pay the extra register save costs.

zw
Florian Weimer June 8, 2023, 7:41 p.m. UTC | #7
* Zack Weinberg via Libc-alpha:

> If these registers are all call-clobbered then maybe it makes sense
> to do this unconditionally in the syscall entry path, kernel
> side.

This is not a backwards-compatible change and probably breaks glibc
itself because the asm constraints clearly indicate that vector
registers are NOT clobbered.  This really looks like an oversight in
the syscall ABI specification, but it's very much too late to change
it by default.

The other factor is that if the system call is non-blocking, the
syscall enter/exit paths and (usually) the kernel code in between do
not clobber the vector state, so it's not saved and restored.  As far
as I understand it, after the syscall ABI change, saving the vector
state is only needed if the scheduler preempts the code in userspace,
not when the task voluntarily de-schedules itself during a syscall.
Likewise in the other direction.
Noah Goldstein June 8, 2023, 7:53 p.m. UTC | #8
On Thu, Jun 8, 2023 at 2:41 PM Florian Weimer <fw@deneb.enyo.de> wrote:
>
> * Zack Weinberg via Libc-alpha:
>
> > If these registers are all call-clobbered then maybe it makes sense
> > to do this unconditionally in the syscall entry path, kernel
> > side.
>
> This is not a backwards-compatible change and probably breaks glibc
> itself because the asm constraints clearly indicate that vector
> registers are NOT clobbered.  This really looks like an oversight in
> the syscall ABI specification, but it's very much too late to change
> it by default.
>
> The other factor is that if the system call is non-blocking, the
> syscall enter/exit paths and (usually) the kernel code in between do
> not clobber the vector state, so it's not saved and restored.  As far
> as I understand it, after the syscall ABI change, saving the vector
> state is only needed if the scheduler preempts the code in userspace,
> not when the task voluntarily de-schedules itself during a syscall.
> Likewise in the other direction.

I think that's right, hence we only need a few select functions.
Zack Weinberg June 8, 2023, 8:22 p.m. UTC | #9
On Thu, Jun 8, 2023, at 12:53 PM, Noah Goldstein via Libc-alpha wrote:
> On Thu, Jun 8, 2023 at 2:41 PM Florian Weimer <fw@deneb.enyo.de> wrote:
>>
>> * Zack Weinberg via Libc-alpha:
>>
>> > If these registers are all call-clobbered then maybe it makes sense
>> > to do this unconditionally in the syscall entry path, kernel
>> > side.
>>
>> This is not a backwards-compatible change and probably breaks glibc
>> itself because the asm constraints clearly indicate that vector
>> registers are NOT clobbered. 
>
> we only need a few select functions.

If the vector regs aren't call clobbered (and I really mean *call* clobbered here, not syscall clobbered) then this isn't a safe change *at all*, ne?

I see why compatibility precludes doing this kernel-side, but then it seems to me the proper place is in the syscall stub macros.  

zw
Noah Goldstein June 8, 2023, 8:38 p.m. UTC | #10
On Thu, Jun 8, 2023 at 3:23 PM Zack Weinberg via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Jun 8, 2023, at 12:53 PM, Noah Goldstein via Libc-alpha wrote:
> > On Thu, Jun 8, 2023 at 2:41 PM Florian Weimer <fw@deneb.enyo.de> wrote:
> >>
> >> * Zack Weinberg via Libc-alpha:
> >>
> >> > If these registers are all call-clobbered then maybe it makes sense
> >> > to do this unconditionally in the syscall entry path, kernel
> >> > side.
> >>
> >> This is not a backwards-compatible change and probably breaks glibc
> >> itself because the asm constraints clearly indicate that vector
> >> registers are NOT clobbered.
> >
> > we only need a few select functions.
>
> If the vector regs aren't call clobbered (and I really mean *call* clobbered here, not syscall clobbered) then this isn't a safe change *at all*, ne?
>
> I see why compatibility precludes doing this kernel-side, but then it seems to me the proper place is in the syscall stub macros.
>
We are taking advantage of the fact that the function call ABI clobbers all
vector registers.  The macro doesn't imply any clobbers.

> zw
Zack Weinberg June 8, 2023, 8:44 p.m. UTC | #11
On Thu, Jun 8, 2023, at 1:38 PM, Noah Goldstein via Libc-alpha wrote:
> On Thu, Jun 8, 2023 at 3:23 PM Zack Weinberg via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> On Thu, Jun 8, 2023, at 12:53 PM, Noah Goldstein via Libc-alpha wrote:
>> > On Thu, Jun 8, 2023 at 2:41 PM Florian Weimer <fw@deneb.enyo.de> wrote:
>> >>
>> >> * Zack Weinberg via Libc-alpha:
>> >>
>> >> > If these registers are all call-clobbered then maybe it makes sense
>> >> > to do this unconditionally in the syscall entry path, kernel
>> >> > side.
>> >>
>> >> This is not a backwards-compatible change and probably breaks glibc
>> >> itself because the asm constraints clearly indicate that vector
>> >> registers are NOT clobbered.
>> >
>> > we only need a few select functions.
>>
>> If the vector regs aren't call clobbered (and I really mean *call* clobbered here, not syscall clobbered) then this isn't a safe change *at all*, ne?
>>
>> I see why compatibility precludes doing this kernel-side, but then it seems to me the proper place is in the syscall stub macros.
>>
> We are taking advantage of the fact that the function call ABI clobbers all
> vector registers.  The macro doesn't imply any clobbers.

OK, so then why *not* alter the syscall stub macros to do this uniformly for all syscalls, or for all but a handful of things which are unlikely to cause a context switch and where the extra cost of the clear instruction itself is significant (e.g. get*id, sigprocmask)?

zw
Noah Goldstein June 8, 2023, 9:06 p.m. UTC | #12
On Thu, Jun 8, 2023 at 3:44 PM Zack Weinberg via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
>
>
> On Thu, Jun 8, 2023, at 1:38 PM, Noah Goldstein via Libc-alpha wrote:
> > On Thu, Jun 8, 2023 at 3:23 PM Zack Weinberg via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> >>
> >> On Thu, Jun 8, 2023, at 12:53 PM, Noah Goldstein via Libc-alpha wrote:
> >> > On Thu, Jun 8, 2023 at 2:41 PM Florian Weimer <fw@deneb.enyo.de> wrote:
> >> >>
> >> >> * Zack Weinberg via Libc-alpha:
> >> >>
> >> >> > If these registers are all call-clobbered then maybe it makes sense
> >> >> > to do this unconditionally in the syscall entry path, kernel
> >> >> > side.
> >> >>
> >> >> This is not a backwards-compatible change and probably breaks glibc
> >> >> itself because the asm constraints clearly indicate that vector
> >> >> registers are NOT clobbered.
> >> >
> >> > we only need a few select functions.
> >>
> >> If the vector regs aren't call clobbered (and I really mean *call* clobbered here, not syscall clobbered) then this isn't a safe change *at all*, ne?
> >>
> >> I see why compatibility precludes doing this kernel-side, but then it seems to me the proper place is in the syscall stub macros.
> >>
> > We are taking advantage of the fact that the function call ABI clobbers all
> > vector registers.  The macro doesn't imply any clobbers.
>
> OK, so then why *not* alter the syscall stub macros to do this uniformly for all syscalls, or for all but a handful of things which are unlikely to cause a context switch and where the extra cost of the clear instruction itself is significant (e.g. get*id, sigprocmask)?
Maybe I'm missing something, but it can only be done in functions.  We
could put it in `syscall(long int sys_num, ...)` but not in something
like INTERNAL_SYSCALL.
>
> zw
Florian Weimer June 8, 2023, 9:25 p.m. UTC | #13
* Noah Goldstein via Libc-alpha:

> Maybe I'm missing something, but it can only be done in functions.  We
> could put it in `syscall(long int sys_num, ...)` but not in something
> like INTERNAL_SYSCALL.

You can add vector register clobbers.

The problem is that it's not beneficial in general and might impact
small packet receive performance with an event loop (where the
previous poll ensures that the subsequent recvmsg etc. is pretty much
always non-blocking).  But in other cases, receive operations are
blocking, and would benefit from that VZEROALL.

Only the kernel knows if the VZEROALL equivalent is beneficial during
that particular execution of the system call.  But glibc still needs
to help the kernel and communicate that discarding the vector state is
safe in this particular context.
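
For concreteness, "adding vector register clobbers" to a stub would look
something like the sketch below (GCC/Clang inline asm on x86-64, assuming
the CPU supports AVX; this is not glibc's actual INTERNAL_SYSCALL macro,
which deliberately declares no vector clobbers):

#include <asm/unistd.h>

static long
sched_yield_zap_vectors (void)
{
  long ret;
  /* Zero the vector state and tell the compiler the xmm/ymm registers are
     dead, so nothing is kept live in them across the syscall.  Clobbering
     xmmN is enough for the compiler because xmmN/ymmN name the same hard
     register.  */
  asm volatile ("vzeroall\n\t"
                "syscall"
                : "=a" (ret)
                : "0" ((long) __NR_sched_yield)
                : "rcx", "r11", "memory",
                  "xmm0", "xmm1", "xmm2", "xmm3",
                  "xmm4", "xmm5", "xmm6", "xmm7",
                  "xmm8", "xmm9", "xmm10", "xmm11",
                  "xmm12", "xmm13", "xmm14", "xmm15");
  return ret;
}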
Zack Weinberg June 9, 2023, 5:59 a.m. UTC | #14
On Thu, Jun 8, 2023, at 5:25 PM, Florian Weimer wrote:
> The problem is that it's not beneficial in general and might impact
> small packet receive performance with an event loop (where the
> previous poll ensures that the subsequent recvmsg etc. is pretty much
> always non-blocking).  But in other cases, receive operations are
> blocking, and would benefit from that VZEROALL.
>
> Only the kernel knows if the VZEROALL equivalent is beneficial during
> that particular execution of the system call.  But glibc still needs
> to help the kernel and communicate that discarding the vector state is
> safe in this particular context.

The negative effect on non-blocking syscalls would be due to the cost of
the VZEROALL itself, right?

I'm not having any luck thinking of a good way to communicate this
context information to the kernel.  If we could put flags in the high
bits of syscall numbers that would be very efficient, but it would break
compatibility with old kernels, old strace binaries, and lots of other
stuff.  But any other place we could put it would involve either
stomping on another register (and IIRC there are no call-clobbered
integer registers _left_ to stomp on) or making the kernel do an extra
memory load in the syscall entry path.  Have you got any ideas?

zw
Noah Goldstein June 10, 2023, 1:11 a.m. UTC | #15
On Fri, Jun 9, 2023 at 12:59 AM Zack Weinberg via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Jun 8, 2023, at 5:25 PM, Florian Weimer wrote:
> > The problem is that it's not beneficial in general and might impact
> > small packet receive performance with an event loop (where the
> > previous poll ensures that the subsequent recvmsg etc. is pretty much
> > always non-blocking).  But in other cases, receive operations are
> > blocking, and would benefit from that VZEROALL.
> >
> > Only the kernel knows if the VZEROALL equivalent is beneficial during
> > that particular execution of the system call.  But glibc still needs
> > to help the kernel and communicate that discarding the vector state is
> > safe in this particular context.
>
> The negative effect on non-blocking syscalls would be due to the cost of
> the VZEROALL itself, right?
>
> I'm not having any luck thinking of a good way to communicate this
> context information to the kernel.  If we could put flags in the high
> bits of syscall numbers that would be very efficient, but it would break
> compatibility with old kernels, old strace binaries, and lots of other
> stuff.  But any other place we could put it would involve either
> stomping on another register (and IIRC there are no call-clobbered
> integer registers _left_ to stomp on) or making the kernel do an extra
> memory load in the syscall entry path.  Have you got any ideas?
>
There are some output-only registers for syscalls, on x86_64 at least:
rcx/r11.  Those get clobbered by the syscall instruction anyway, so writing
to rcx beforehand would probably not break anything.
> zw
Gabriel Ravier June 10, 2023, 2:07 a.m. UTC | #16
On 6/10/23 03:11, Noah Goldstein via Libc-alpha wrote:
> On Fri, Jun 9, 2023 at 12:59 AM Zack Weinberg via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>> On Thu, Jun 8, 2023, at 5:25 PM, Florian Weimer wrote:
>>> The problem is that it's not beneficial in general and might impact
>>> small packet receive performance with an event loop (where the
>>> previous poll ensures that the subsequent recvmsg etc. is pretty much
>>> always non-blocking).  But in other cases, receive operations are
>>> blocking, and would benefit from that VZEROALL.
>>>
>>> Only the kernel knows if the VZEROALL equivalent is beneficial during
>>> that particular execution of the system call.  But glibc still needs
>>> to help the kernel and communicate that discarding the vector state is
>>> safe in this particular context.
>> The negative effect on non-blocking syscalls would be due to the cost of
>> the VZEROALL itself, right?
>>
>> I'm not having any luck thinking of a good way to communicate this
>> context information to the kernel.  If we could put flags in the high
>> bits of syscall numbers that would be very efficient, but it would break
>> compatibility with old kernels, old strace binaries, and lots of other
>> stuff.  But any other place we could put it would involve either
>> stomping on another register (and IIRC there are no call-clobbered
>> integer registers _left_ to stomp on) or making the kernel do an extra
>> memory load in the syscall entry path.  Have you got any ideas?
>>
> There are some output-only registers for syscalls, on x86_64 at least:
> rcx/r11.  Those get clobbered by the syscall instruction anyway, so writing
> to rcx beforehand would probably not break anything.
The syscall instruction itself overwrites these with rip and rflags, so
how is the kernel supposed to determine what value they had beforehand?
>> zw
Noah Goldstein June 10, 2023, 4:59 a.m. UTC | #17
On Fri, Jun 9, 2023 at 9:07 PM Gabriel Ravier <gabravier@gmail.com> wrote:
>
> On 6/10/23 03:11, Noah Goldstein via Libc-alpha wrote:
> > On Fri, Jun 9, 2023 at 12:59 AM Zack Weinberg via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> >> On Thu, Jun 8, 2023, at 5:25 PM, Florian Weimer wrote:
> >>> The problem is that it's not beneficial in general and might impact
> >>> small packet receive performance with an event loop (where the
> >>> previous poll ensures that the subsequent recvmsg etc. is pretty much
> >>> always non-blocking).  But in other cases, receive operations are
> >>> blocking, and would benefit from that VZEROALL.
> >>>
> >>> Only the kernel knows if the VZEROALL equivalent is beneficial during
> >>> that particular execution of the system call.  But glibc still needs
> >>> to help the kernel and communicate that discarding the vector state is
> >>> safe in this particular context.
> >> The negative effect on non-blocking syscalls would be due to the cost of
> >> the VZEROALL itself, right?
> >>
> >> I'm not having any luck thinking of a good way to communicate this
> >> context information to the kernel.  If we could put flags in the high
> >> bits of syscall numbers that would be very efficient, but it would break
> >> compatibility with old kernels, old strace binaries, and lots of other
> >> stuff.  But any other place we could put it would involve either
> >> stomping on another register (and IIRC there are no call-clobbered
> >> integer registers _left_ to stomp on) or making the kernel do an extra
> >> memory load in the syscall entry path.  Have you got any ideas?
> >>
> > There are some output-only registers for syscalls, on x86_64 at least:
> > rcx/r11.  Those get clobbered by the syscall instruction anyway, so writing
> > to rcx beforehand would probably not break anything.
> The syscall instruction itself overwrites these with rip and rflags, so
> how is the kernel supposed to determine what value they had beforehand?

Oh, I thought that happened on the return to userspace, not on the
transition into the kernel.  Never mind.
> >> zw
>
>

Patch

diff --git a/sysdeps/unix/sysv/linux/x86_64/sched-yield-impl.h b/sysdeps/unix/sysv/linux/x86_64/sched-yield-impl.h
new file mode 100644
index 0000000000..03622ccea4
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/sched-yield-impl.h
@@ -0,0 +1,29 @@ 
+/* Yield current process.  Linux specific syscall.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+static int TARGET
+SCHED_YIELD (void)
+{
+  PREPARE_CONTEXT_SWITCH ();
+  return INLINE_SYSCALL_CALL (sched_yield);
+}
+#undef TARGET
+#undef SCHED_YIELD
+#undef PREPARE_CONTEXT_SWITCH
diff --git a/sysdeps/unix/sysv/linux/x86_64/sched_yield.c b/sysdeps/unix/sysv/linux/x86_64/sched_yield.c
new file mode 100644
index 0000000000..e87acf124b
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/sched_yield.c
@@ -0,0 +1,56 @@ 
+/* sched_yield for x86_64.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* The only difference is that, if we have AVX, we use vzeroall to clear the
+   in-use bits of the SSE, AVX, and ZMM_HI256 xsave/xrstor state.  This enables
+   the init-state optimization, saving overhead on context switches.  */
+
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD(4)
+# include <immintrin.h>
+# define TARGET __attribute__ ((target ("avx")))
+# define PREPARE_CONTEXT_SWITCH() _mm256_zeroall ()
+# define SCHED_YIELD __sched_yield_avx
+# include "sched-yield-impl.h"
+#endif
+#if ISA_SHOULD_BUILD(2)
+# define TARGET
+# define PREPARE_CONTEXT_SWITCH()
+# define SCHED_YIELD __sched_yield_generic
+# include "sched-yield-impl.h"
+#endif
+
+#include <init-arch.h>
+#include <ifunc-init.h>
+
+static inline void *
+__sched_yield_ifunc_selector (void)
+{
+#if MINIMUM_X86_ISA_LEVEL >= 3
+  return __sched_yield_avx;
+#else
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
+    return __sched_yield_avx;
+  return __sched_yield_generic;
+#endif
+}
+
+libc_ifunc (__sched_yield, __sched_yield_ifunc_selector ());
+libc_hidden_def (__sched_yield);
+weak_alias (__sched_yield, sched_yield);
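
For anyone who wants to see the effect locally, a rough micro-benchmark
sketch (not part of the patch; build with -mavx, and results depend heavily
on kernel version and CPU):

#include <immintrin.h>
#include <sched.h>
#include <stdio.h>
#include <time.h>

/* Time a burst of sched_yield() calls with the AVX state left dirty vs.
   returned to the init state via vzeroall first.  */
static double
bench_ns (int zero_first)
{
  struct timespec t0, t1;
  volatile __m256 dirty = _mm256_set1_ps (1.0f);  /* mark AVX state in use */
  (void) dirty;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (int i = 0; i < 100000; i++)
    {
      if (zero_first)
        _mm256_zeroall ();
      sched_yield ();
    }
  clock_gettime (CLOCK_MONOTONIC, &t1);
  return (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
}

int
main (void)
{
  printf ("dirty AVX state: %.0f ns total\n", bench_ns (0));
  printf ("after vzeroall:  %.0f ns total\n", bench_ns (1));
  return 0;
}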