Message ID | 20220627211858.2807553-2-hjl.tools@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v2,1/2] x86: Move CPU_FEATURE{S}_{USABLE|ARCH}_P to isa-level.h | expand |
On Mon, Jun 27, 2022 at 2:19 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > When glibc is built with x86-64 ISA level v3, SSE run-time resolvers > aren't used. For x86-64 ISA level v4 build, both SSE and AVX resolvers > are unused. Check the minimum x86-64 ISA level to exclude the unused > run-time resolvers. > --- > sysdeps/x86/isa-level.h | 2 ++ > sysdeps/x86_64/dl-machine.h | 12 ++++--- > sysdeps/x86_64/dl-trampoline.S | 59 ++++++++++++++++++---------------- > 3 files changed, 42 insertions(+), 31 deletions(-) > > diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h > index c6156e7f7a..f293aea906 100644 > --- a/sysdeps/x86/isa-level.h > +++ b/sysdeps/x86/isa-level.h > @@ -68,10 +68,12 @@ > compile-time constant.. */ > > /* ISA level >= 4 guaranteed includes. */ > +#define AVX512F_X86_ISA_LEVEL 4 > #define AVX512VL_X86_ISA_LEVEL 4 > #define AVX512BW_X86_ISA_LEVEL 4 > > /* ISA level >= 3 guaranteed includes. */ > +#define AVX_X86_ISA_LEVEL 3 > #define AVX2_X86_ISA_LEVEL 3 > #define BMI2_X86_ISA_LEVEL 3 > > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h > index 34766325ae..005d089501 100644 > --- a/sysdeps/x86_64/dl-machine.h > +++ b/sysdeps/x86_64/dl-machine.h > @@ -28,6 +28,7 @@ > #include <dl-tlsdesc.h> > #include <dl-static-tls.h> > #include <dl-machine-rel.h> > +#include <isa-level.h> > > /* Return nonzero iff ELF header is compatible with the running host. */ > static inline int __attribute__ ((unused)) > @@ -86,6 +87,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], > /* Identify this shared object. */ > *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l; > > + const struct cpu_features* cpu_features = __get_cpu_features (); > + > /* The got[2] entry contains the address of a function which gets > called to get the address of a so far unresolved function and > jump to it. The profiling extension of the dynamic linker allows > @@ -94,9 +97,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], > end in this function. */ > if (__glibc_unlikely (profile)) > { > - if (CPU_FEATURE_USABLE (AVX512F)) > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F)) > *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512; > - else if (CPU_FEATURE_USABLE (AVX)) > + else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX)) > *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx; > else > *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse; > @@ -112,9 +115,10 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], > /* This function will get called to fix up the GOT entry > indicated by the offset on the stack, and then jump to > the resolved address. */ > - if (GLRO(dl_x86_cpu_features).xsave_state_size != 0) > + if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL > + || GLRO(dl_x86_cpu_features).xsave_state_size != 0) > *(ElfW(Addr) *) (got + 2) > - = (CPU_FEATURE_USABLE (XSAVEC) > + = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC) > ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec > : (ElfW(Addr)) &_dl_runtime_resolve_xsave); > else > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S > index 831a654713..f669805ac5 100644 > --- a/sysdeps/x86_64/dl-trampoline.S > +++ b/sysdeps/x86_64/dl-trampoline.S > @@ -20,6 +20,7 @@ > #include <sysdep.h> > #include <cpu-features-offsets.h> > #include <link-defines.h> > +#include <isa-level.h> > > #ifndef DL_STACK_ALIGNMENT > /* Due to GCC bug: > @@ -62,35 +63,39 @@ > #undef VMOVA > #undef VEC_SIZE > > -#define VEC_SIZE 32 > -#define VMOVA vmovdqa > -#define VEC(i) ymm##i > -#define _dl_runtime_profile _dl_runtime_profile_avx > -#include "dl-trampoline.h" > -#undef _dl_runtime_profile > -#undef VEC > -#undef VMOVA > -#undef VEC_SIZE > +#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL > +# define VEC_SIZE 32 > +# define VMOVA vmovdqa > +# define VEC(i) ymm##i > +# define _dl_runtime_profile _dl_runtime_profile_avx > +# include "dl-trampoline.h" > +# undef _dl_runtime_profile > +# undef VEC > +# undef VMOVA > +# undef VEC_SIZE > +#endif > > +#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL > /* movaps/movups is 1-byte shorter. */ > -#define VEC_SIZE 16 > -#define VMOVA movaps > -#define VEC(i) xmm##i > -#define _dl_runtime_profile _dl_runtime_profile_sse > -#undef RESTORE_AVX > -#include "dl-trampoline.h" > -#undef _dl_runtime_profile > -#undef VEC > -#undef VMOVA > -#undef VEC_SIZE > - > -#define USE_FXSAVE > -#define STATE_SAVE_ALIGNMENT 16 > -#define _dl_runtime_resolve _dl_runtime_resolve_fxsave > -#include "dl-trampoline.h" > -#undef _dl_runtime_resolve > -#undef USE_FXSAVE > -#undef STATE_SAVE_ALIGNMENT > +# define VEC_SIZE 16 > +# define VMOVA movaps > +# define VEC(i) xmm##i > +# define _dl_runtime_profile _dl_runtime_profile_sse > +# undef RESTORE_AVX > +# include "dl-trampoline.h" > +# undef _dl_runtime_profile > +# undef VEC > +# undef VMOVA > +# undef VEC_SIZE > + > +# define USE_FXSAVE > +# define STATE_SAVE_ALIGNMENT 16 > +# define _dl_runtime_resolve _dl_runtime_resolve_fxsave > +# include "dl-trampoline.h" > +# undef _dl_runtime_resolve > +# undef USE_FXSAVE > +# undef STATE_SAVE_ALIGNMENT > +#endif > > #define USE_XSAVE > #define STATE_SAVE_ALIGNMENT 64 > -- > 2.36.1 > Sorry missed this earlier. But can you update the text section in dl-trampoline for the corresponding implementations? I also have a patch to do some misc cleanup in that file so I can do it if you don't mind splitting the patch.
On Mon, Jun 27, 2022 at 2:47 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Mon, Jun 27, 2022 at 2:19 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > When glibc is built with x86-64 ISA level v3, SSE run-time resolvers > > aren't used. For x86-64 ISA level v4 build, both SSE and AVX resolvers > > are unused. Check the minimum x86-64 ISA level to exclude the unused > > run-time resolvers. > > --- > > sysdeps/x86/isa-level.h | 2 ++ > > sysdeps/x86_64/dl-machine.h | 12 ++++--- > > sysdeps/x86_64/dl-trampoline.S | 59 ++++++++++++++++++---------------- > > 3 files changed, 42 insertions(+), 31 deletions(-) > > > > diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h > > index c6156e7f7a..f293aea906 100644 > > --- a/sysdeps/x86/isa-level.h > > +++ b/sysdeps/x86/isa-level.h > > @@ -68,10 +68,12 @@ > > compile-time constant.. */ > > > > /* ISA level >= 4 guaranteed includes. */ > > +#define AVX512F_X86_ISA_LEVEL 4 > > #define AVX512VL_X86_ISA_LEVEL 4 > > #define AVX512BW_X86_ISA_LEVEL 4 > > > > /* ISA level >= 3 guaranteed includes. */ > > +#define AVX_X86_ISA_LEVEL 3 > > #define AVX2_X86_ISA_LEVEL 3 > > #define BMI2_X86_ISA_LEVEL 3 > > > > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h > > index 34766325ae..005d089501 100644 > > --- a/sysdeps/x86_64/dl-machine.h > > +++ b/sysdeps/x86_64/dl-machine.h > > @@ -28,6 +28,7 @@ > > #include <dl-tlsdesc.h> > > #include <dl-static-tls.h> > > #include <dl-machine-rel.h> > > +#include <isa-level.h> > > > > /* Return nonzero iff ELF header is compatible with the running host. */ > > static inline int __attribute__ ((unused)) > > @@ -86,6 +87,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], > > /* Identify this shared object. */ > > *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l; > > > > + const struct cpu_features* cpu_features = __get_cpu_features (); > > + > > /* The got[2] entry contains the address of a function which gets > > called to get the address of a so far unresolved function and > > jump to it. The profiling extension of the dynamic linker allows > > @@ -94,9 +97,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], > > end in this function. */ > > if (__glibc_unlikely (profile)) > > { > > - if (CPU_FEATURE_USABLE (AVX512F)) > > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F)) > > *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512; > > - else if (CPU_FEATURE_USABLE (AVX)) > > + else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX)) > > *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx; > > else > > *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse; > > @@ -112,9 +115,10 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], > > /* This function will get called to fix up the GOT entry > > indicated by the offset on the stack, and then jump to > > the resolved address. */ > > - if (GLRO(dl_x86_cpu_features).xsave_state_size != 0) > > + if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL > > + || GLRO(dl_x86_cpu_features).xsave_state_size != 0) > > *(ElfW(Addr) *) (got + 2) > > - = (CPU_FEATURE_USABLE (XSAVEC) > > + = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC) > > ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec > > : (ElfW(Addr)) &_dl_runtime_resolve_xsave); > > else > > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S > > index 831a654713..f669805ac5 100644 > > --- a/sysdeps/x86_64/dl-trampoline.S > > +++ b/sysdeps/x86_64/dl-trampoline.S > > @@ -20,6 +20,7 @@ > > #include <sysdep.h> > > #include <cpu-features-offsets.h> > > #include <link-defines.h> > > +#include <isa-level.h> > > > > #ifndef DL_STACK_ALIGNMENT > > /* Due to GCC bug: > > @@ -62,35 +63,39 @@ > > #undef VMOVA > > #undef VEC_SIZE > > > > -#define VEC_SIZE 32 > > -#define VMOVA vmovdqa > > -#define VEC(i) ymm##i > > -#define _dl_runtime_profile _dl_runtime_profile_avx > > -#include "dl-trampoline.h" > > -#undef _dl_runtime_profile > > -#undef VEC > > -#undef VMOVA > > -#undef VEC_SIZE > > +#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL > > +# define VEC_SIZE 32 > > +# define VMOVA vmovdqa > > +# define VEC(i) ymm##i > > +# define _dl_runtime_profile _dl_runtime_profile_avx > > +# include "dl-trampoline.h" > > +# undef _dl_runtime_profile > > +# undef VEC > > +# undef VMOVA > > +# undef VEC_SIZE > > +#endif > > > > +#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL > > /* movaps/movups is 1-byte shorter. */ > > -#define VEC_SIZE 16 > > -#define VMOVA movaps > > -#define VEC(i) xmm##i > > -#define _dl_runtime_profile _dl_runtime_profile_sse > > -#undef RESTORE_AVX > > -#include "dl-trampoline.h" > > -#undef _dl_runtime_profile > > -#undef VEC > > -#undef VMOVA > > -#undef VEC_SIZE > > - > > -#define USE_FXSAVE > > -#define STATE_SAVE_ALIGNMENT 16 > > -#define _dl_runtime_resolve _dl_runtime_resolve_fxsave > > -#include "dl-trampoline.h" > > -#undef _dl_runtime_resolve > > -#undef USE_FXSAVE > > -#undef STATE_SAVE_ALIGNMENT > > +# define VEC_SIZE 16 > > +# define VMOVA movaps > > +# define VEC(i) xmm##i > > +# define _dl_runtime_profile _dl_runtime_profile_sse > > +# undef RESTORE_AVX > > +# include "dl-trampoline.h" > > +# undef _dl_runtime_profile > > +# undef VEC > > +# undef VMOVA > > +# undef VEC_SIZE > > + > > +# define USE_FXSAVE > > +# define STATE_SAVE_ALIGNMENT 16 > > +# define _dl_runtime_resolve _dl_runtime_resolve_fxsave > > +# include "dl-trampoline.h" > > +# undef _dl_runtime_resolve > > +# undef USE_FXSAVE > > +# undef STATE_SAVE_ALIGNMENT > > +#endif > > > > #define USE_XSAVE > > #define STATE_SAVE_ALIGNMENT 64 > > -- > > 2.36.1 > > > > Sorry missed this earlier. But can you update the text section in > dl-trampoline for the corresponding > implementations? Did you mean a separate patch to change the text section name? > I also have a patch to do some misc cleanup in that file so I can do > it if you don't mind > splitting the patch.
diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h index c6156e7f7a..f293aea906 100644 --- a/sysdeps/x86/isa-level.h +++ b/sysdeps/x86/isa-level.h @@ -68,10 +68,12 @@ compile-time constant.. */ /* ISA level >= 4 guaranteed includes. */ +#define AVX512F_X86_ISA_LEVEL 4 #define AVX512VL_X86_ISA_LEVEL 4 #define AVX512BW_X86_ISA_LEVEL 4 /* ISA level >= 3 guaranteed includes. */ +#define AVX_X86_ISA_LEVEL 3 #define AVX2_X86_ISA_LEVEL 3 #define BMI2_X86_ISA_LEVEL 3 diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h index 34766325ae..005d089501 100644 --- a/sysdeps/x86_64/dl-machine.h +++ b/sysdeps/x86_64/dl-machine.h @@ -28,6 +28,7 @@ #include <dl-tlsdesc.h> #include <dl-static-tls.h> #include <dl-machine-rel.h> +#include <isa-level.h> /* Return nonzero iff ELF header is compatible with the running host. */ static inline int __attribute__ ((unused)) @@ -86,6 +87,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], /* Identify this shared object. */ *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l; + const struct cpu_features* cpu_features = __get_cpu_features (); + /* The got[2] entry contains the address of a function which gets called to get the address of a so far unresolved function and jump to it. The profiling extension of the dynamic linker allows @@ -94,9 +97,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], end in this function. */ if (__glibc_unlikely (profile)) { - if (CPU_FEATURE_USABLE (AVX512F)) + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F)) *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512; - else if (CPU_FEATURE_USABLE (AVX)) + else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX)) *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx; else *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse; @@ -112,9 +115,10 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], /* This function will get called to fix up the GOT entry indicated by the offset on the stack, and then jump to the resolved address. */ - if (GLRO(dl_x86_cpu_features).xsave_state_size != 0) + if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL + || GLRO(dl_x86_cpu_features).xsave_state_size != 0) *(ElfW(Addr) *) (got + 2) - = (CPU_FEATURE_USABLE (XSAVEC) + = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC) ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec : (ElfW(Addr)) &_dl_runtime_resolve_xsave); else diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index 831a654713..f669805ac5 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -20,6 +20,7 @@ #include <sysdep.h> #include <cpu-features-offsets.h> #include <link-defines.h> +#include <isa-level.h> #ifndef DL_STACK_ALIGNMENT /* Due to GCC bug: @@ -62,35 +63,39 @@ #undef VMOVA #undef VEC_SIZE -#define VEC_SIZE 32 -#define VMOVA vmovdqa -#define VEC(i) ymm##i -#define _dl_runtime_profile _dl_runtime_profile_avx -#include "dl-trampoline.h" -#undef _dl_runtime_profile -#undef VEC -#undef VMOVA -#undef VEC_SIZE +#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL +# define VEC_SIZE 32 +# define VMOVA vmovdqa +# define VEC(i) ymm##i +# define _dl_runtime_profile _dl_runtime_profile_avx +# include "dl-trampoline.h" +# undef _dl_runtime_profile +# undef VEC +# undef VMOVA +# undef VEC_SIZE +#endif +#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL /* movaps/movups is 1-byte shorter. */ -#define VEC_SIZE 16 -#define VMOVA movaps -#define VEC(i) xmm##i -#define _dl_runtime_profile _dl_runtime_profile_sse -#undef RESTORE_AVX -#include "dl-trampoline.h" -#undef _dl_runtime_profile -#undef VEC -#undef VMOVA -#undef VEC_SIZE - -#define USE_FXSAVE -#define STATE_SAVE_ALIGNMENT 16 -#define _dl_runtime_resolve _dl_runtime_resolve_fxsave -#include "dl-trampoline.h" -#undef _dl_runtime_resolve -#undef USE_FXSAVE -#undef STATE_SAVE_ALIGNMENT +# define VEC_SIZE 16 +# define VMOVA movaps +# define VEC(i) xmm##i +# define _dl_runtime_profile _dl_runtime_profile_sse +# undef RESTORE_AVX +# include "dl-trampoline.h" +# undef _dl_runtime_profile +# undef VEC +# undef VMOVA +# undef VEC_SIZE + +# define USE_FXSAVE +# define STATE_SAVE_ALIGNMENT 16 +# define _dl_runtime_resolve _dl_runtime_resolve_fxsave +# include "dl-trampoline.h" +# undef _dl_runtime_resolve +# undef USE_FXSAVE +# undef STATE_SAVE_ALIGNMENT +#endif #define USE_XSAVE #define STATE_SAVE_ALIGNMENT 64