| Message ID | 20240317125541.799962-1-hjl.tools@gmail.com |
|---|---|
| State | New |
| Headers | show |
| Series | [v4] x86-64: Allocate state buffer space for RDI, RSI and RBX | expand |
On Sun, Mar 17, 2024 at 5:55 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning stack. > After realigning stack, it saves RCX, RDX, R8, R9, R10 and R11. Define > TLSDESC_CALL_REGISTER_SAVE_AREA to allocate space for RDI, RSI and RBX > to avoid clobbering saved RDI, RSI and RBX values on stack by xsave to > STATE_SAVE_OFFSET(%rsp). > > +==================+<- stack frame start aligned at 8 or 16 bytes > | |<- RDI > | |<- RSI > | |<- RBX > | |<- paddings from stack realignment of 64 bytes > |------------------|<- xsave buffer end aligned at 64 bytes > | |<- > | |<- > | |<- > |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp) > | |<- 8-byte padding > | |<- 8-byte padding > | |<- R11 > | |<- R10 > | |<- R9 > | |<- R8 > | |<- RDX > | |<- RCX > +==================+<- State buffer start aligned at 64 bytes > > This fixes BZ #31501. > --- > sysdeps/x86/cpu-features.c | 11 ++-- > sysdeps/x86/sysdep.h | 29 ++++++++++ > sysdeps/x86_64/tst-gnu2-tls2mod1.S | 87 ++++++++++++++++++++++++++++++ > 3 files changed, 123 insertions(+), 4 deletions(-) > create mode 100644 sysdeps/x86_64/tst-gnu2-tls2mod1.S > > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c > index 4ea373dffa..3d7c2819d7 100644 > --- a/sysdeps/x86/cpu-features.c > +++ b/sysdeps/x86/cpu-features.c > @@ -311,7 +311,7 @@ update_active (struct cpu_features *cpu_features) > /* NB: On AMX capable processors, ebx always includes AMX > states. 
*/ > unsigned int xsave_state_full_size > - = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64); > + = ALIGN_UP (ebx + TLSDESC_CALL_REGISTER_SAVE_AREA, 64); > > cpu_features->xsave_state_size > = xsave_state_full_size; > @@ -401,8 +401,10 @@ update_active (struct cpu_features *cpu_features) > unsigned int amx_size > = (xstate_amx_comp_offsets[31] > + xstate_amx_comp_sizes[31]); > - amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET, > - 64); > + amx_size > + = ALIGN_UP ((amx_size > + + TLSDESC_CALL_REGISTER_SAVE_AREA), > + 64); > /* Set xsave_state_full_size to the compact AMX > state size for XSAVEC. NB: xsave_state_full_size > is only used in _dl_tlsdesc_dynamic_xsave and > @@ -410,7 +412,8 @@ update_active (struct cpu_features *cpu_features) > cpu_features->xsave_state_full_size = amx_size; > #endif > cpu_features->xsave_state_size > - = ALIGN_UP (size + STATE_SAVE_OFFSET, 64); > + = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, > + 64); > CPU_FEATURE_SET (cpu_features, XSAVEC); > } > } > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h > index db8e576e91..46fcd27345 100644 > --- a/sysdeps/x86/sysdep.h > +++ b/sysdeps/x86/sysdep.h > @@ -46,6 +46,34 @@ > red-zone into account. */ > # define STATE_SAVE_OFFSET (8 * 7 + 8) > > +/* _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning > + stack. After realigning stack, it saves RCX, RDX, R8, R9, R10 and > + R11. Allocate space for RDI, RSI and RBX to avoid clobbering saved > + RDI, RSI and RBX values on stack by xsave. 
> + > + +==================+<- stack frame start aligned at 8 or 16 bytes > + | |<- RDI > + | |<- RSI > + | |<- RBX > + | |<- paddings from stack realignment of 64 bytes > + |------------------|<- xsave buffer end aligned at 64 bytes > + | |<- > + | |<- > + | |<- > + |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp) > + | |<- 8-byte padding > + | |<- 8-byte padding > + | |<- R11 > + | |<- R10 > + | |<- R9 > + | |<- R8 > + | |<- RDX > + | |<- RCX > + +==================+<- State buffer start aligned at 64 bytes > + > +*/ > +# define TLSDESC_CALL_REGISTER_SAVE_AREA (STATE_SAVE_OFFSET + 24) > + > /* Save SSE, AVX, AVX512, mask, bound and APX registers. Bound and APX > registers are mutually exclusive. */ > # define STATE_SAVE_MASK \ > @@ -68,6 +96,7 @@ > /* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic. Since i386 > doesn't have red-zone, use 0 here. */ > # define STATE_SAVE_OFFSET 0 > +# define TLSDESC_CALL_REGISTER_SAVE_AREA 0 > > /* Save SSE, AVX, AXV512, mask and bound registers. */ > # define STATE_SAVE_MASK \ > diff --git a/sysdeps/x86_64/tst-gnu2-tls2mod1.S b/sysdeps/x86_64/tst-gnu2-tls2mod1.S > new file mode 100644 > index 0000000000..449ddd5c9d > --- /dev/null > +++ b/sysdeps/x86_64/tst-gnu2-tls2mod1.S > @@ -0,0 +1,87 @@ > +/* Check if TLSDESC relocation preserves %rdi, %rsi and %rbx. > + Copyright (C) 2024 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. 
> + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > + > +/* On AVX512 machines, OFFSET == 104 caused _dl_tlsdesc_dynamic_xsavec > + to clobber %rdi, %rsi and %rbx. On Intel AVX CPUs, the state size > + is 960 bytes and this test didn't fail. It may be due to the unused > + last 128 bytes. On AMD AVX CPUs, the state size is 832 bytes and > + this test might fail without the fix. */ > +#ifndef OFFSET > +# define OFFSET 104 > +#endif > + > + .text > + .p2align 4 > + .globl apply_tls > + .type apply_tls, @function > +apply_tls: > + cfi_startproc > + _CET_ENDBR > + pushq %rbp > + cfi_def_cfa_offset (16) > + cfi_offset (6, -16) > + movdqu (%RDI_LP), %xmm0 > + lea tls_var1@TLSDESC(%rip), %RAX_LP > + mov %RSP_LP, %RBP_LP > + cfi_def_cfa_register (6) > + /* Align stack to 64 bytes. */ > + and $-64, %RSP_LP > + sub $OFFSET, %RSP_LP > + pushq %rbx > + /* Set %ebx to 0xbadbeef. */ > + movl $0xbadbeef, %ebx > + movl $0xbadbeef, %esi > + movq %rdi, saved_rdi(%rip) > + movq %rsi, saved_rsi(%rip) > + call *tls_var1@TLSCALL(%RAX_LP) > + /* Check if _dl_tlsdesc_dynamic preserves %rdi, %rsi and %rbx. 
*/ > + cmpq saved_rdi(%rip), %rdi > + jne L(hlt) > + cmpq saved_rsi(%rip), %rsi > + jne L(hlt) > + cmpl $0xbadbeef, %ebx > + jne L(hlt) > + add %fs:0, %RAX_LP > + movups %xmm0, 32(%RAX_LP) > + movdqu 16(%RDI_LP), %xmm1 > + mov %RAX_LP, %RBX_LP > + movups %xmm1, 48(%RAX_LP) > + lea 32(%RBX_LP), %RAX_LP > + pop %rbx > + leave > + cfi_def_cfa (7, 8) > + ret > +L(hlt): > + hlt > + cfi_endproc > + .size apply_tls, .-apply_tls > + .hidden tls_var1 > + .globl tls_var1 > + .section .tbss,"awT",@nobits > + .align 16 > + .type tls_var1, @object > + .size tls_var1, 3200 > +tls_var1: > + .zero 3200 > + .local saved_rdi > + .comm saved_rdi,8,8 > + .local saved_rsi > + .comm saved_rsi,8,8 > + .section .note.GNU-stack,"",@progbits > -- > 2.44.0 > I need to adjust assembly codes.
On Sun, Mar 17, 2024 at 6:02 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Sun, Mar 17, 2024 at 5:55 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning stack. > > After realigning stack, it saves RCX, RDX, R8, R9, R10 and R11. Define > > TLSDESC_CALL_REGISTER_SAVE_AREA to allocate space for RDI, RSI and RBX > > to avoid clobbering saved RDI, RSI and RBX values on stack by xsave to > > STATE_SAVE_OFFSET(%rsp). > > > > +==================+<- stack frame start aligned at 8 or 16 bytes > > | |<- RDI > > | |<- RSI > > | |<- RBX > > | |<- paddings from stack realignment of 64 bytes > > |------------------|<- xsave buffer end aligned at 64 bytes > > | |<- > > | |<- > > | |<- > > |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp) > > | |<- 8-byte padding > > | |<- 8-byte padding > > | |<- R11 > > | |<- R10 > > | |<- R9 > > | |<- R8 > > | |<- RDX > > | |<- RCX > > +==================+<- State buffer start aligned at 64 bytes > > > > This fixes BZ #31501. > > --- > > sysdeps/x86/cpu-features.c | 11 ++-- > > sysdeps/x86/sysdep.h | 29 ++++++++++ > > sysdeps/x86_64/tst-gnu2-tls2mod1.S | 87 ++++++++++++++++++++++++++++++ > > 3 files changed, 123 insertions(+), 4 deletions(-) > > create mode 100644 sysdeps/x86_64/tst-gnu2-tls2mod1.S > > > > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c > > index 4ea373dffa..3d7c2819d7 100644 > > --- a/sysdeps/x86/cpu-features.c > > +++ b/sysdeps/x86/cpu-features.c > > @@ -311,7 +311,7 @@ update_active (struct cpu_features *cpu_features) > > /* NB: On AMX capable processors, ebx always includes AMX > > states. 
*/ > > unsigned int xsave_state_full_size > > - = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64); > > + = ALIGN_UP (ebx + TLSDESC_CALL_REGISTER_SAVE_AREA, 64); > > > > cpu_features->xsave_state_size > > = xsave_state_full_size; > > @@ -401,8 +401,10 @@ update_active (struct cpu_features *cpu_features) > > unsigned int amx_size > > = (xstate_amx_comp_offsets[31] > > + xstate_amx_comp_sizes[31]); > > - amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET, > > - 64); > > + amx_size > > + = ALIGN_UP ((amx_size > > + + TLSDESC_CALL_REGISTER_SAVE_AREA), > > + 64); > > /* Set xsave_state_full_size to the compact AMX > > state size for XSAVEC. NB: xsave_state_full_size > > is only used in _dl_tlsdesc_dynamic_xsave and > > @@ -410,7 +412,8 @@ update_active (struct cpu_features *cpu_features) > > cpu_features->xsave_state_full_size = amx_size; > > #endif > > cpu_features->xsave_state_size > > - = ALIGN_UP (size + STATE_SAVE_OFFSET, 64); > > + = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, > > + 64); > > CPU_FEATURE_SET (cpu_features, XSAVEC); > > } > > } > > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h > > index db8e576e91..46fcd27345 100644 > > --- a/sysdeps/x86/sysdep.h > > +++ b/sysdeps/x86/sysdep.h > > @@ -46,6 +46,34 @@ > > red-zone into account. */ > > # define STATE_SAVE_OFFSET (8 * 7 + 8) > > > > +/* _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning > > + stack. After realigning stack, it saves RCX, RDX, R8, R9, R10 and > > + R11. Allocate space for RDI, RSI and RBX to avoid clobbering saved > > + RDI, RSI and RBX values on stack by xsave. 
> > + > > + +==================+<- stack frame start aligned at 8 or 16 bytes > > + | |<- RDI > > + | |<- RSI > > + | |<- RBX > > + | |<- paddings from stack realignment of 64 bytes > > + |------------------|<- xsave buffer end aligned at 64 bytes > > + | |<- > > + | |<- > > + | |<- > > + |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp) > > + | |<- 8-byte padding > > + | |<- 8-byte padding > > + | |<- R11 > > + | |<- R10 > > + | |<- R9 > > + | |<- R8 > > + | |<- RDX > > + | |<- RCX > > + +==================+<- State buffer start aligned at 64 bytes > > + > > +*/ > > +# define TLSDESC_CALL_REGISTER_SAVE_AREA (STATE_SAVE_OFFSET + 24) > > + > > /* Save SSE, AVX, AVX512, mask, bound and APX registers. Bound and APX > > registers are mutually exclusive. */ > > # define STATE_SAVE_MASK \ > > @@ -68,6 +96,7 @@ > > /* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic. Since i386 > > doesn't have red-zone, use 0 here. */ > > # define STATE_SAVE_OFFSET 0 > > +# define TLSDESC_CALL_REGISTER_SAVE_AREA 0 > > > > /* Save SSE, AVX, AXV512, mask and bound registers. */ > > # define STATE_SAVE_MASK \ > > diff --git a/sysdeps/x86_64/tst-gnu2-tls2mod1.S b/sysdeps/x86_64/tst-gnu2-tls2mod1.S > > new file mode 100644 > > index 0000000000..449ddd5c9d > > --- /dev/null > > +++ b/sysdeps/x86_64/tst-gnu2-tls2mod1.S > > @@ -0,0 +1,87 @@ > > +/* Check if TLSDESC relocation preserves %rdi, %rsi and %rbx. > > + Copyright (C) 2024 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. 
> > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <http://www.gnu.org/licenses/>. */ > > + > > +#include <sysdep.h> > > + > > +/* On AVX512 machines, OFFSET == 104 caused _dl_tlsdesc_dynamic_xsavec > > + to clobber %rdi, %rsi and %rbx. On Intel AVX CPUs, the state size > > + is 960 bytes and this test didn't fail. It may be due to the unused > > + last 128 bytes. On AMD AVX CPUs, the state size is 832 bytes and > > + this test might fail without the fix. */ > > +#ifndef OFFSET > > +# define OFFSET 104 > > +#endif > > + > > + .text > > + .p2align 4 > > + .globl apply_tls > > + .type apply_tls, @function > > +apply_tls: > > + cfi_startproc > > + _CET_ENDBR > > + pushq %rbp > > + cfi_def_cfa_offset (16) > > + cfi_offset (6, -16) > > + movdqu (%RDI_LP), %xmm0 > > + lea tls_var1@TLSDESC(%rip), %RAX_LP > > + mov %RSP_LP, %RBP_LP > > + cfi_def_cfa_register (6) > > + /* Align stack to 64 bytes. */ > > + and $-64, %RSP_LP > > + sub $OFFSET, %RSP_LP > > + pushq %rbx > > + /* Set %ebx to 0xbadbeef. */ > > + movl $0xbadbeef, %ebx > > + movl $0xbadbeef, %esi > > + movq %rdi, saved_rdi(%rip) > > + movq %rsi, saved_rsi(%rip) > > + call *tls_var1@TLSCALL(%RAX_LP) > > + /* Check if _dl_tlsdesc_dynamic preserves %rdi, %rsi and %rbx. 
*/ > > + cmpq saved_rdi(%rip), %rdi > > + jne L(hlt) > > + cmpq saved_rsi(%rip), %rsi > > + jne L(hlt) > > + cmpl $0xbadbeef, %ebx > > + jne L(hlt) > > + add %fs:0, %RAX_LP > > + movups %xmm0, 32(%RAX_LP) > > + movdqu 16(%RDI_LP), %xmm1 > > + mov %RAX_LP, %RBX_LP > > + movups %xmm1, 48(%RAX_LP) > > + lea 32(%RBX_LP), %RAX_LP > > + pop %rbx > > + leave > > + cfi_def_cfa (7, 8) > > + ret > > +L(hlt): > > + hlt > > + cfi_endproc > > + .size apply_tls, .-apply_tls > > + .hidden tls_var1 > > + .globl tls_var1 > > + .section .tbss,"awT",@nobits > > + .align 16 > > + .type tls_var1, @object > > + .size tls_var1, 3200 > > +tls_var1: > > + .zero 3200 > > + .local saved_rdi > > + .comm saved_rdi,8,8 > > + .local saved_rsi > > + .comm saved_rsi,8,8 > > + .section .note.GNU-stack,"",@progbits > > -- > > 2.44.0 > > > > I need to adjust assembly codes. > Never mind. Not needed.
* H. J. Lu: > _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning stack. > After realigning stack, it saves RCX, RDX, R8, R9, R10 and R11. Define > TLSDESC_CALL_REGISTER_SAVE_AREA to allocate space for RDI, RSI and RBX > to avoid clobbering saved RDI, RSI and RBX values on stack by xsave to > STATE_SAVE_OFFSET(%rsp). > > +==================+<- stack frame start aligned at 8 or 16 bytes It's 8-byte aligned only? If the caller uses the psABI convention, we have %rsp ≡ 8 (mod 16). > | |<- RDI > | |<- RSI > | |<- RBX I would add something like “originally in the red zone” here. > | |<- paddings from stack realignment of 64 bytes > |------------------|<- xsave buffer end aligned at 64 bytes > | |<- > | |<- > | |<- > |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp) > | |<- 8-byte padding Maybe add “to achieve 64-bit alignment”. > | |<- 8-byte padding > | |<- R11 > | |<- R10 > | |<- R9 > | |<- R8 > | |<- RDX > | |<- RCX > +==================+<- State buffer start aligned at 64 bytes Isn't this “%rsp aligned at 64 bytes”? Likewise for the comment in the sources below. > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h > index db8e576e91..46fcd27345 100644 > --- a/sysdeps/x86/sysdep.h > +++ b/sysdeps/x86/sysdep.h > @@ -46,6 +46,34 @@ > red-zone into account. */ > # define STATE_SAVE_OFFSET (8 * 7 + 8) The comment on STATE_SAVE_OFFSET needs updating, too. I would like to see comments from Noah or Sunil. You'll have to maintain this, too. 8-) I find the macro consts rather confusing, but maybe that's just me. Thanks, Florian
On Mon, Mar 18, 2024 at 3:22 AM Florian Weimer <fweimer@redhat.com> wrote: > > * H. J. Lu: > > > _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning stack. > > After realigning stack, it saves RCX, RDX, R8, R9, R10 and R11. Define > > TLSDESC_CALL_REGISTER_SAVE_AREA to allocate space for RDI, RSI and RBX > > to avoid clobbering saved RDI, RSI and RBX values on stack by xsave to > > STATE_SAVE_OFFSET(%rsp). > > > > +==================+<- stack frame start aligned at 8 or 16 bytes > > It's 8-byte aligned only? If the caller uses the psABI convention, we > have %rsp ≡ 8 (mod 16). It is due to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066 Do we need to support older compilers? > > > | |<- RDI > > | |<- RSI > > | |<- RBX > > I would add something like “originally in the red zone” here. Fixed. > > | |<- paddings from stack realignment of 64 bytes > > |------------------|<- xsave buffer end aligned at 64 bytes > > > > | |<- > > | |<- > > | |<- > > |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp) > > | |<- 8-byte padding > > Maybe add “to achieve 64-bit alignment”. Fixed. > > | |<- 8-byte padding > > | |<- R11 > > | |<- R10 > > | |<- R9 > > | |<- R8 > > | |<- RDX > > | |<- RCX > > +==================+<- State buffer start aligned at 64 bytes > > Isn't this “%rsp aligned at 64 bytes”? Fixed. > Likewise for the comment in the sources below. Fixed. > > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h > > index db8e576e91..46fcd27345 100644 > > --- a/sysdeps/x86/sysdep.h > > +++ b/sysdeps/x86/sysdep.h > > @@ -46,6 +46,34 @@ > > red-zone into account. */ > > # define STATE_SAVE_OFFSET (8 * 7 + 8) > > The comment on STATE_SAVE_OFFSET needs updating, too. Fixed. > I would like to see comments from Noah or Sunil. You'll have to > maintain this, too. 8-) CCed Noah and Sunil. > I find the macro consts rather confusing, but maybe that's just me. Can you suggest a different name? 
The v5 patch is at https://patchwork.sourceware.org/project/glibc/list/?series=31992 Thanks.
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index 4ea373dffa..3d7c2819d7 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -311,7 +311,7 @@ update_active (struct cpu_features *cpu_features) /* NB: On AMX capable processors, ebx always includes AMX states. */ unsigned int xsave_state_full_size - = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64); + = ALIGN_UP (ebx + TLSDESC_CALL_REGISTER_SAVE_AREA, 64); cpu_features->xsave_state_size = xsave_state_full_size; @@ -401,8 +401,10 @@ update_active (struct cpu_features *cpu_features) unsigned int amx_size = (xstate_amx_comp_offsets[31] + xstate_amx_comp_sizes[31]); - amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET, - 64); + amx_size + = ALIGN_UP ((amx_size + + TLSDESC_CALL_REGISTER_SAVE_AREA), + 64); /* Set xsave_state_full_size to the compact AMX state size for XSAVEC. NB: xsave_state_full_size is only used in _dl_tlsdesc_dynamic_xsave and @@ -410,7 +412,8 @@ update_active (struct cpu_features *cpu_features) cpu_features->xsave_state_full_size = amx_size; #endif cpu_features->xsave_state_size - = ALIGN_UP (size + STATE_SAVE_OFFSET, 64); + = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, + 64); CPU_FEATURE_SET (cpu_features, XSAVEC); } } diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h index db8e576e91..46fcd27345 100644 --- a/sysdeps/x86/sysdep.h +++ b/sysdeps/x86/sysdep.h @@ -46,6 +46,34 @@ red-zone into account. */ # define STATE_SAVE_OFFSET (8 * 7 + 8) +/* _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning + stack. After realigning stack, it saves RCX, RDX, R8, R9, R10 and + R11. Allocate space for RDI, RSI and RBX to avoid clobbering saved + RDI, RSI and RBX values on stack by xsave. 
+ + +==================+<- stack frame start aligned at 8 or 16 bytes + | |<- RDI + | |<- RSI + | |<- RBX + | |<- paddings from stack realignment of 64 bytes + |------------------|<- xsave buffer end aligned at 64 bytes + | |<- + | |<- + | |<- + |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp) + | |<- 8-byte padding + | |<- 8-byte padding + | |<- R11 + | |<- R10 + | |<- R9 + | |<- R8 + | |<- RDX + | |<- RCX + +==================+<- State buffer start aligned at 64 bytes + +*/ +# define TLSDESC_CALL_REGISTER_SAVE_AREA (STATE_SAVE_OFFSET + 24) + /* Save SSE, AVX, AVX512, mask, bound and APX registers. Bound and APX registers are mutually exclusive. */ # define STATE_SAVE_MASK \ @@ -68,6 +96,7 @@ /* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic. Since i386 doesn't have red-zone, use 0 here. */ # define STATE_SAVE_OFFSET 0 +# define TLSDESC_CALL_REGISTER_SAVE_AREA 0 /* Save SSE, AVX, AXV512, mask and bound registers. */ # define STATE_SAVE_MASK \ diff --git a/sysdeps/x86_64/tst-gnu2-tls2mod1.S b/sysdeps/x86_64/tst-gnu2-tls2mod1.S new file mode 100644 index 0000000000..449ddd5c9d --- /dev/null +++ b/sysdeps/x86_64/tst-gnu2-tls2mod1.S @@ -0,0 +1,87 @@ +/* Check if TLSDESC relocation preserves %rdi, %rsi and %rbx. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* On AVX512 machines, OFFSET == 104 caused _dl_tlsdesc_dynamic_xsavec + to clobber %rdi, %rsi and %rbx. On Intel AVX CPUs, the state size + is 960 bytes and this test didn't fail. It may be due to the unused + last 128 bytes. On AMD AVX CPUs, the state size is 832 bytes and + this test might fail without the fix. */ +#ifndef OFFSET +# define OFFSET 104 +#endif + + .text + .p2align 4 + .globl apply_tls + .type apply_tls, @function +apply_tls: + cfi_startproc + _CET_ENDBR + pushq %rbp + cfi_def_cfa_offset (16) + cfi_offset (6, -16) + movdqu (%RDI_LP), %xmm0 + lea tls_var1@TLSDESC(%rip), %RAX_LP + mov %RSP_LP, %RBP_LP + cfi_def_cfa_register (6) + /* Align stack to 64 bytes. */ + and $-64, %RSP_LP + sub $OFFSET, %RSP_LP + pushq %rbx + /* Set %ebx to 0xbadbeef. */ + movl $0xbadbeef, %ebx + movl $0xbadbeef, %esi + movq %rdi, saved_rdi(%rip) + movq %rsi, saved_rsi(%rip) + call *tls_var1@TLSCALL(%RAX_LP) + /* Check if _dl_tlsdesc_dynamic preserves %rdi, %rsi and %rbx. */ + cmpq saved_rdi(%rip), %rdi + jne L(hlt) + cmpq saved_rsi(%rip), %rsi + jne L(hlt) + cmpl $0xbadbeef, %ebx + jne L(hlt) + add %fs:0, %RAX_LP + movups %xmm0, 32(%RAX_LP) + movdqu 16(%RDI_LP), %xmm1 + mov %RAX_LP, %RBX_LP + movups %xmm1, 48(%RAX_LP) + lea 32(%RBX_LP), %RAX_LP + pop %rbx + leave + cfi_def_cfa (7, 8) + ret +L(hlt): + hlt + cfi_endproc + .size apply_tls, .-apply_tls + .hidden tls_var1 + .globl tls_var1 + .section .tbss,"awT",@nobits + .align 16 + .type tls_var1, @object + .size tls_var1, 3200 +tls_var1: + .zero 3200 + .local saved_rdi + .comm saved_rdi,8,8 + .local saved_rsi + .comm saved_rsi,8,8 + .section .note.GNU-stack,"",@progbits