Message ID | 20221014164008.1325863-2-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v1,1/3] x86: Update evex256/512 vec macros | expand |
On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > This is to make it easier to do things like: > ``` > vpcmpb %VEC(0), %VEC(1), %k0 > kmov{d|q} %k0, %{eax|rax} > test %{eax|rax} > ``` Since all these register macros are based on VEC_SIZE which is either 32 bytes or 64 bytes, only 32-bit or 64-bit integer and mask register macros are needed. 8-bit and 16-bit macros aren't needed. > It adds macro s.t any GPR can get the proper width with: > `V{upper_case_GPR_name}` > > and any mask insn can get the proper width with: > `{mask_insn_without_postfix}V` All macros should be in upper case. > This commit does not change libc.so > > Tested build on x86-64 > --- > sysdeps/x86_64/multiarch/reg-macros.h | 337 ++++++++++++++++++ > .../multiarch/scripts/gen-reg-map-macros.py | 97 +++++ > 2 files changed, 434 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h > create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h > new file mode 100644 > index 0000000000..c4d7f57b66 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/reg-macros.h vreg-macros.h to indicate macros based on vector size. Please add comments to indicate that register macros are expanded based on vector size. > @@ -0,0 +1,337 @@ > +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py. > + > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version.
> + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _REG_MAP_MACROS_H > +#define _REG_MAP_MACROS_H 1 > + > +#define rax_8 al > +#define eax_8 al > +#define ax_8 al > +#define al_8 al > +#define rax_16 ax > +#define eax_16 ax > +#define ax_16 ax > +#define al_16 ax > +#define rax_32 eax > +#define eax_32 eax > +#define ax_32 eax > +#define al_32 eax > +#define rax_64 rax > +#define eax_64 rax > +#define ax_64 rax > +#define al_64 rax Only rax_32 and rax_64 are needed. > +#define rbx_8 bl > +#define ebx_8 bl > +#define bx_8 bl > +#define bl_8 bl > +#define rbx_16 bx > +#define ebx_16 bx > +#define bx_16 bx > +#define bl_16 bx > +#define rbx_32 ebx > +#define ebx_32 ebx > +#define bx_32 ebx > +#define bl_32 ebx > +#define rbx_64 rbx > +#define ebx_64 rbx > +#define bx_64 rbx > +#define bl_64 rbx > +#define rcx_8 cl > +#define ecx_8 cl > +#define cx_8 cl > +#define cl_8 cl > +#define rcx_16 cx > +#define ecx_16 cx > +#define cx_16 cx > +#define cl_16 cx > +#define rcx_32 ecx > +#define ecx_32 ecx > +#define cx_32 ecx > +#define cl_32 ecx > +#define rcx_64 rcx > +#define ecx_64 rcx > +#define cx_64 rcx > +#define cl_64 rcx > +#define rdx_8 dl > +#define edx_8 dl > +#define dx_8 dl > +#define dl_8 dl > +#define rdx_16 dx > +#define edx_16 dx > +#define dx_16 dx > +#define dl_16 dx > +#define rdx_32 edx > +#define edx_32 edx > +#define dx_32 edx > +#define dl_32 edx > +#define rdx_64 rdx > +#define edx_64 rdx > +#define dx_64 rdx > +#define dl_64 rdx > +#define rbp_8 bpl > +#define ebp_8 bpl > +#define bp_8 bpl > +#define bpl_8 bpl > +#define rbp_16 bp > +#define ebp_16 bp > +#define 
bp_16 bp > +#define bpl_16 bp > +#define rbp_32 ebp > +#define ebp_32 ebp > +#define bp_32 ebp > +#define bpl_32 ebp > +#define rbp_64 rbp > +#define ebp_64 rbp > +#define bp_64 rbp > +#define bpl_64 rbp > +#define rsp_8 spl > +#define esp_8 spl > +#define sp_8 spl > +#define spl_8 spl > +#define rsp_16 sp > +#define esp_16 sp > +#define sp_16 sp > +#define spl_16 sp > +#define rsp_32 esp > +#define esp_32 esp > +#define sp_32 esp > +#define spl_32 esp > +#define rsp_64 rsp > +#define esp_64 rsp > +#define sp_64 rsp > +#define spl_64 rsp > +#define rsi_8 sil > +#define esi_8 sil > +#define si_8 sil > +#define sil_8 sil > +#define rsi_16 si > +#define esi_16 si > +#define si_16 si > +#define sil_16 si > +#define rsi_32 esi > +#define esi_32 esi > +#define si_32 esi > +#define sil_32 esi > +#define rsi_64 rsi > +#define esi_64 rsi > +#define si_64 rsi > +#define sil_64 rsi > +#define rdi_8 dil > +#define edi_8 dil > +#define di_8 dil > +#define dil_8 dil > +#define rdi_16 di > +#define edi_16 di > +#define di_16 di > +#define dil_16 di > +#define rdi_32 edi > +#define edi_32 edi > +#define di_32 edi > +#define dil_32 edi > +#define rdi_64 rdi > +#define edi_64 rdi > +#define di_64 rdi > +#define dil_64 rdi > +#define r8_8 r8b > +#define r8d_8 r8b > +#define r8w_8 r8b > +#define r8b_8 r8b > +#define r8_16 r8w > +#define r8d_16 r8w > +#define r8w_16 r8w > +#define r8b_16 r8w > +#define r8_32 r8d > +#define r8d_32 r8d > +#define r8w_32 r8d > +#define r8b_32 r8d > +#define r8_64 r8 > +#define r8d_64 r8 > +#define r8w_64 r8 > +#define r8b_64 r8 > +#define r9_8 r9b > +#define r9d_8 r9b > +#define r9w_8 r9b > +#define r9b_8 r9b > +#define r9_16 r9w > +#define r9d_16 r9w > +#define r9w_16 r9w > +#define r9b_16 r9w > +#define r9_32 r9d > +#define r9d_32 r9d > +#define r9w_32 r9d > +#define r9b_32 r9d > +#define r9_64 r9 > +#define r9d_64 r9 > +#define r9w_64 r9 > +#define r9b_64 r9 > +#define r10_8 r10b > +#define r10d_8 r10b > +#define r10w_8 r10b > +#define r10b_8 r10b > 
+#define r10_16 r10w > +#define r10d_16 r10w > +#define r10w_16 r10w > +#define r10b_16 r10w > +#define r10_32 r10d > +#define r10d_32 r10d > +#define r10w_32 r10d > +#define r10b_32 r10d > +#define r10_64 r10 > +#define r10d_64 r10 > +#define r10w_64 r10 > +#define r10b_64 r10 > +#define r11_8 r11b > +#define r11d_8 r11b > +#define r11w_8 r11b > +#define r11b_8 r11b > +#define r11_16 r11w > +#define r11d_16 r11w > +#define r11w_16 r11w > +#define r11b_16 r11w > +#define r11_32 r11d > +#define r11d_32 r11d > +#define r11w_32 r11d > +#define r11b_32 r11d > +#define r11_64 r11 > +#define r11d_64 r11 > +#define r11w_64 r11 > +#define r11b_64 r11 > +#define r12_8 r12b > +#define r12d_8 r12b > +#define r12w_8 r12b > +#define r12b_8 r12b > +#define r12_16 r12w > +#define r12d_16 r12w > +#define r12w_16 r12w > +#define r12b_16 r12w > +#define r12_32 r12d > +#define r12d_32 r12d > +#define r12w_32 r12d > +#define r12b_32 r12d > +#define r12_64 r12 > +#define r12d_64 r12 > +#define r12w_64 r12 > +#define r12b_64 r12 > +#define r13_8 r13b > +#define r13d_8 r13b > +#define r13w_8 r13b > +#define r13b_8 r13b > +#define r13_16 r13w > +#define r13d_16 r13w > +#define r13w_16 r13w > +#define r13b_16 r13w > +#define r13_32 r13d > +#define r13d_32 r13d > +#define r13w_32 r13d > +#define r13b_32 r13d > +#define r13_64 r13 > +#define r13d_64 r13 > +#define r13w_64 r13 > +#define r13b_64 r13 > +#define r14_8 r14b > +#define r14d_8 r14b > +#define r14w_8 r14b > +#define r14b_8 r14b > +#define r14_16 r14w > +#define r14d_16 r14w > +#define r14w_16 r14w > +#define r14b_16 r14w > +#define r14_32 r14d > +#define r14d_32 r14d > +#define r14w_32 r14d > +#define r14b_32 r14d > +#define r14_64 r14 > +#define r14d_64 r14 > +#define r14w_64 r14 > +#define r14b_64 r14 > +#define r15_8 r15b > +#define r15d_8 r15b > +#define r15w_8 r15b > +#define r15b_8 r15b > +#define r15_16 r15w > +#define r15d_16 r15w > +#define r15w_16 r15w > +#define r15b_16 r15w > +#define r15_32 r15d > +#define r15d_32 r15d 
> +#define r15w_32 r15d > +#define r15b_32 r15d > +#define r15_64 r15 > +#define r15d_64 r15 > +#define r15w_64 r15 > +#define r15b_64 r15 > + > +#define VRAX VGPR(rax) > +#define VRBX VGPR(rbx) > +#define VRCX VGPR(rcx) > +#define VRDX VGPR(rdx) > +#define VRBP VGPR(rbp) > +#define VRSP VGPR(rsp) > +#define VRSI VGPR(rsi) > +#define VRDI VGPR(rdi) > +#define VR8 VGPR(r8) > +#define VR9 VGPR(r9) > +#define VR10 VGPR(r10) > +#define VR11 VGPR(r11) > +#define VR12 VGPR(r12) > +#define VR13 VGPR(r13) > +#define VR14 VGPR(r14) > +#define VR15 VGPR(r15) > + > +#define kmov_8 kmovb > +#define kmov_16 kmovw > +#define kmov_32 kmovd > +#define kmov_64 kmovq Only 32 and 64 are needed. > +#define kortest_8 kortestb > +#define kortest_16 kortestw > +#define kortest_32 kortestd > +#define kortest_64 kortestq > +#define kor_8 korb > +#define kor_16 korw > +#define kor_32 kord > +#define kor_64 korq > +#define ktest_8 ktestb > +#define ktest_16 ktestw > +#define ktest_32 ktestd > +#define ktest_64 ktestq > +#define kand_8 kandb > +#define kand_16 kandw > +#define kand_32 kandd > +#define kand_64 kandq > +#define kxor_8 kxorb > +#define kxor_16 kxorw > +#define kxor_32 kxord > +#define kxor_64 kxorq > + > +#define kmovV VKINSN_SZ(kmov, REG_WIDTH) > +#define kortestV VKINSN_SZ(kortest, REG_WIDTH) > +#define korV VKINSN_SZ(kor, REG_WIDTH) > +#define ktestV VKINSN_SZ(ktest, REG_WIDTH) > +#define kandV VKINSN_SZ(kand, REG_WIDTH) > +#define kxorV VKINSN_SZ(kxor, REG_WIDTH) #define VKINSN(op) VKINSN_SZ(op, REG_WIDTH) > + > +#ifndef REG_WIDTH > +#define REG_WIDTH VEC_SIZE Since REG_WIDTH must be the same as VEC_SIZE, REG_WIDTH can be dropped. > +#endif > +#define PRIM_VGPR_SZ(reg_name, reg_size) reg_name##_##reg_size This is used for both register and instruction. 
How about #define VPASTER(x,y) x##_##y > +#define VGPR_SZ(reg_name, reg_size) PRIM_VGPR_SZ(reg_name, reg_size) > +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH) > +#define VKINSN_SZ(insn, reg_size) PRIM_VGPR_SZ(insn, reg_size) No need for both VGPR_SZ and VKINSN_SZ. How about #define VEVALUATOR(x,y) VPASTER(x,y) > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py > new file mode 100644 > index 0000000000..5b04e89ecb > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py > @@ -0,0 +1,97 @@ > +#!/usr/bin/python3 > +# Copyright (C) 2022 Free Software Foundation, Inc. > +# This file is part of the GNU C Library. > +# > +# The GNU C Library is free software; you can redistribute it and/or > +# modify it under the terms of the GNU Lesser General Public > +# License as published by the Free Software Foundation; either > +# version 2.1 of the License, or (at your option) any later version. > +# > +# The GNU C Library is distributed in the hope that it will be useful, > +# but WITHOUT ANY WARRANTY; without even the implied warranty of > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +# Lesser General Public License for more details. > +# > +# You should have received a copy of the GNU Lesser General Public > +# License along with the GNU C Library; if not, see > +# <https://www.gnu.org/licenses/>. 
> +"""Generate macros for getting GPR name of a certain size > + > +Inputs: None > +Output: Prints header fill to stdout > + > +API: > + VGPR(reg_name) > + - Get register name VEC_SIZE component of `reg_name` > + VGPR_SZ(reg_name, reg_size) > + - Get register name `reg_size` component of `reg_name` > +""" > + > +import sys > +from datetime import datetime > + > +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"], > + ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"], > + ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"], > + ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"], > + ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"], > + ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"], > + ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"], > + ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]] > + > +mask_insns = ["kmov", "kortest", "kor", "ktest", "kand", "kxor"] > +mask_insns_ext = ["b", "w", "d", "q"] > + > +cr = """ > + Copyright (C) {} Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > +""" > + > +print("/* This file was generated by: {}.".format(sys.argv[0])) > +print(cr.format(datetime.today().year)) > + > +print("#ifndef _REG_MAP_MACROS_H") > +print("#define _REG_MAP_MACROS_H\t1\n") > +for reg in registers: > + for i in range(0, 4): > + for j in range(0, 4): > + print("#define {}_{}\t{}".format(reg[j], 8 << i, reg[3 - i])) > + > +print("") > +for reg in registers: > + print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0])) > + > +print("") > +for mask_insn in mask_insns: > + for i in range(0, 4): > + print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn, > + mask_insns_ext[i])) > + > +print("") > +for mask_insn in mask_insns: > + print("#define {}V \tVKINSN_SZ({}, REG_WIDTH)".format(mask_insn, mask_insn)) > +print("") > + > +print("#ifndef REG_WIDTH") > +print("#define REG_WIDTH VEC_SIZE") > +print("#endif") > +print("#define PRIM_VGPR_SZ(reg_name, reg_size)\treg_name##_##reg_size") > +print("#define VGPR_SZ(reg_name, reg_size)\tPRIM_VGPR_SZ(reg_name, reg_size)") > +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)") > +print("#define VKINSN_SZ(insn, reg_size)\tPRIM_VGPR_SZ(insn, reg_size)") > + > +print("\n#endif") > -- > 2.34.1 >
On Fri, Oct 14, 2022 at 1:02 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > This is to make it easier to do think like: > > ``` > > vpcmpb %VEC(0), %VEC(1), %k0 > > kmov{d|q} %k0, %{eax|rax} > > test %{eax|rax} > > ``` > > Since all these register macros are based on VEC_SIZE which is either 32 > bytes or 64 bytes, only 32-bit or 64-bit integer and mask register macros are > needed. 8-bit and 16-bit macros aren't needed. > > > It adds macro s.t any GPR can get the proper width with: > > `V{upper_case_GPR_name}` > > > > and any mask insn can get the proper width with: > > `{mask_insn_without_postfix}V` > > All macros should be in upper cases. > > > This commit does not change libc.so > > > > Tested build on x86-64 > > --- > > sysdeps/x86_64/multiarch/reg-macros.h | 337 ++++++++++++++++++ > > .../multiarch/scripts/gen-reg-map-macros.py | 97 +++++ > > 2 files changed, 434 insertions(+) > > create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h > > create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py > > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h > > new file mode 100644 > > index 0000000000..c4d7f57b66 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h > > vreg-macros.h to indicate macros based on vector size. Please > add comments to indicate that register macros are expanded based > on vector size. > > > @@ -0,0 +1,337 @@ > > +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py. > > + > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. 
> > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef _REG_MAP_MACROS_H > > +#define _REG_MAP_MACROS_H 1 > > + > > +#define rax_8 al > > +#define eax_8 al > > +#define ax_8 al > > +#define al_8 al > > +#define rax_16 ax > > +#define eax_16 ax > > +#define ax_16 ax > > +#define al_16 ax > > +#define rax_32 eax > > +#define eax_32 eax > > +#define ax_32 eax > > +#define al_32 eax > > +#define rax_64 rax > > +#define eax_64 rax > > +#define ax_64 rax > > +#define al_64 rax > > Only rax_32 and rax_64 are needed. 
> > > +#define rbx_8 bl > > +#define ebx_8 bl > > +#define bx_8 bl > > +#define bl_8 bl > > +#define rbx_16 bx > > +#define ebx_16 bx > > +#define bx_16 bx > > +#define bl_16 bx > > +#define rbx_32 ebx > > +#define ebx_32 ebx > > +#define bx_32 ebx > > +#define bl_32 ebx > > +#define rbx_64 rbx > > +#define ebx_64 rbx > > +#define bx_64 rbx > > +#define bl_64 rbx > > +#define rcx_8 cl > > +#define ecx_8 cl > > +#define cx_8 cl > > +#define cl_8 cl > > +#define rcx_16 cx > > +#define ecx_16 cx > > +#define cx_16 cx > > +#define cl_16 cx > > +#define rcx_32 ecx > > +#define ecx_32 ecx > > +#define cx_32 ecx > > +#define cl_32 ecx > > +#define rcx_64 rcx > > +#define ecx_64 rcx > > +#define cx_64 rcx > > +#define cl_64 rcx > > +#define rdx_8 dl > > +#define edx_8 dl > > +#define dx_8 dl > > +#define dl_8 dl > > +#define rdx_16 dx > > +#define edx_16 dx > > +#define dx_16 dx > > +#define dl_16 dx > > +#define rdx_32 edx > > +#define edx_32 edx > > +#define dx_32 edx > > +#define dl_32 edx > > +#define rdx_64 rdx > > +#define edx_64 rdx > > +#define dx_64 rdx > > +#define dl_64 rdx > > +#define rbp_8 bpl > > +#define ebp_8 bpl > > +#define bp_8 bpl > > +#define bpl_8 bpl > > +#define rbp_16 bp > > +#define ebp_16 bp > > +#define bp_16 bp > > +#define bpl_16 bp > > +#define rbp_32 ebp > > +#define ebp_32 ebp > > +#define bp_32 ebp > > +#define bpl_32 ebp > > +#define rbp_64 rbp > > +#define ebp_64 rbp > > +#define bp_64 rbp > > +#define bpl_64 rbp > > +#define rsp_8 spl > > +#define esp_8 spl > > +#define sp_8 spl > > +#define spl_8 spl > > +#define rsp_16 sp > > +#define esp_16 sp > > +#define sp_16 sp > > +#define spl_16 sp > > +#define rsp_32 esp > > +#define esp_32 esp > > +#define sp_32 esp > > +#define spl_32 esp > > +#define rsp_64 rsp > > +#define esp_64 rsp > > +#define sp_64 rsp > > +#define spl_64 rsp > > +#define rsi_8 sil > > +#define esi_8 sil > > +#define si_8 sil > > +#define sil_8 sil > > +#define rsi_16 si > > +#define esi_16 si > > +#define si_16 si > 
> +#define sil_16 si > > +#define rsi_32 esi > > +#define esi_32 esi > > +#define si_32 esi > > +#define sil_32 esi > > +#define rsi_64 rsi > > +#define esi_64 rsi > > +#define si_64 rsi > > +#define sil_64 rsi > > +#define rdi_8 dil > > +#define edi_8 dil > > +#define di_8 dil > > +#define dil_8 dil > > +#define rdi_16 di > > +#define edi_16 di > > +#define di_16 di > > +#define dil_16 di > > +#define rdi_32 edi > > +#define edi_32 edi > > +#define di_32 edi > > +#define dil_32 edi > > +#define rdi_64 rdi > > +#define edi_64 rdi > > +#define di_64 rdi > > +#define dil_64 rdi > > +#define r8_8 r8b > > +#define r8d_8 r8b > > +#define r8w_8 r8b > > +#define r8b_8 r8b > > +#define r8_16 r8w > > +#define r8d_16 r8w > > +#define r8w_16 r8w > > +#define r8b_16 r8w > > +#define r8_32 r8d > > +#define r8d_32 r8d > > +#define r8w_32 r8d > > +#define r8b_32 r8d > > +#define r8_64 r8 > > +#define r8d_64 r8 > > +#define r8w_64 r8 > > +#define r8b_64 r8 > > +#define r9_8 r9b > > +#define r9d_8 r9b > > +#define r9w_8 r9b > > +#define r9b_8 r9b > > +#define r9_16 r9w > > +#define r9d_16 r9w > > +#define r9w_16 r9w > > +#define r9b_16 r9w > > +#define r9_32 r9d > > +#define r9d_32 r9d > > +#define r9w_32 r9d > > +#define r9b_32 r9d > > +#define r9_64 r9 > > +#define r9d_64 r9 > > +#define r9w_64 r9 > > +#define r9b_64 r9 > > +#define r10_8 r10b > > +#define r10d_8 r10b > > +#define r10w_8 r10b > > +#define r10b_8 r10b > > +#define r10_16 r10w > > +#define r10d_16 r10w > > +#define r10w_16 r10w > > +#define r10b_16 r10w > > +#define r10_32 r10d > > +#define r10d_32 r10d > > +#define r10w_32 r10d > > +#define r10b_32 r10d > > +#define r10_64 r10 > > +#define r10d_64 r10 > > +#define r10w_64 r10 > > +#define r10b_64 r10 > > +#define r11_8 r11b > > +#define r11d_8 r11b > > +#define r11w_8 r11b > > +#define r11b_8 r11b > > +#define r11_16 r11w > > +#define r11d_16 r11w > > +#define r11w_16 r11w > > +#define r11b_16 r11w > > +#define r11_32 r11d > > +#define r11d_32 r11d > > +#define 
r11w_32 r11d > > +#define r11b_32 r11d > > +#define r11_64 r11 > > +#define r11d_64 r11 > > +#define r11w_64 r11 > > +#define r11b_64 r11 > > +#define r12_8 r12b > > +#define r12d_8 r12b > > +#define r12w_8 r12b > > +#define r12b_8 r12b > > +#define r12_16 r12w > > +#define r12d_16 r12w > > +#define r12w_16 r12w > > +#define r12b_16 r12w > > +#define r12_32 r12d > > +#define r12d_32 r12d > > +#define r12w_32 r12d > > +#define r12b_32 r12d > > +#define r12_64 r12 > > +#define r12d_64 r12 > > +#define r12w_64 r12 > > +#define r12b_64 r12 > > +#define r13_8 r13b > > +#define r13d_8 r13b > > +#define r13w_8 r13b > > +#define r13b_8 r13b > > +#define r13_16 r13w > > +#define r13d_16 r13w > > +#define r13w_16 r13w > > +#define r13b_16 r13w > > +#define r13_32 r13d > > +#define r13d_32 r13d > > +#define r13w_32 r13d > > +#define r13b_32 r13d > > +#define r13_64 r13 > > +#define r13d_64 r13 > > +#define r13w_64 r13 > > +#define r13b_64 r13 > > +#define r14_8 r14b > > +#define r14d_8 r14b > > +#define r14w_8 r14b > > +#define r14b_8 r14b > > +#define r14_16 r14w > > +#define r14d_16 r14w > > +#define r14w_16 r14w > > +#define r14b_16 r14w > > +#define r14_32 r14d > > +#define r14d_32 r14d > > +#define r14w_32 r14d > > +#define r14b_32 r14d > > +#define r14_64 r14 > > +#define r14d_64 r14 > > +#define r14w_64 r14 > > +#define r14b_64 r14 > > +#define r15_8 r15b > > +#define r15d_8 r15b > > +#define r15w_8 r15b > > +#define r15b_8 r15b > > +#define r15_16 r15w > > +#define r15d_16 r15w > > +#define r15w_16 r15w > > +#define r15b_16 r15w > > +#define r15_32 r15d > > +#define r15d_32 r15d > > +#define r15w_32 r15d > > +#define r15b_32 r15d > > +#define r15_64 r15 > > +#define r15d_64 r15 > > +#define r15w_64 r15 > > +#define r15b_64 r15 > > + > > +#define VRAX VGPR(rax) > > +#define VRBX VGPR(rbx) > > +#define VRCX VGPR(rcx) > > +#define VRDX VGPR(rdx) > > +#define VRBP VGPR(rbp) > > +#define VRSP VGPR(rsp) > > +#define VRSI VGPR(rsi) > > +#define VRDI VGPR(rdi) > > +#define 
VR8 VGPR(r8) > > +#define VR9 VGPR(r9) > > +#define VR10 VGPR(r10) > > +#define VR11 VGPR(r11) > > +#define VR12 VGPR(r12) > > +#define VR13 VGPR(r13) > > +#define VR14 VGPR(r14) > > +#define VR15 VGPR(r15) > > + > > +#define kmov_8 kmovb > > +#define kmov_16 kmovw > > +#define kmov_32 kmovd > > +#define kmov_64 kmovq > > Only 32 and 64 are needed. That's not entirely true for the wide-char impls. > > > +#define kortest_8 kortestb > > +#define kortest_16 kortestw > > +#define kortest_32 kortestd > > +#define kortest_64 kortestq > > +#define kor_8 korb > > +#define kor_16 korw > > +#define kor_32 kord > > +#define kor_64 korq > > +#define ktest_8 ktestb > > +#define ktest_16 ktestw > > +#define ktest_32 ktestd > > +#define ktest_64 ktestq > > +#define kand_8 kandb > > +#define kand_16 kandw > > +#define kand_32 kandd > > +#define kand_64 kandq > > +#define kxor_8 kxorb > > +#define kxor_16 kxorw > > +#define kxor_32 kxord > > +#define kxor_64 kxorq > > + > > +#define kmovV VKINSN_SZ(kmov, REG_WIDTH) > > +#define kortestV VKINSN_SZ(kortest, REG_WIDTH) > > +#define korV VKINSN_SZ(kor, REG_WIDTH) > > +#define ktestV VKINSN_SZ(ktest, REG_WIDTH) > > +#define kandV VKINSN_SZ(kand, REG_WIDTH) > > +#define kxorV VKINSN_SZ(kxor, REG_WIDTH) > > #define VKINSN(op) VKINSN_SZ(op, REG_WIDTH) Will fix for V5. > > > + > > +#ifndef REG_WIDTH > > +#define REG_WIDTH VEC_SIZE > > Since REG_WIDTH must be the same as VEC_SIZE, REG_WIDTH > can be dropped. That's not quite true. For wide-char impls REG_WIDTH != VEC_SIZE. > > > +#endif > > +#define PRIM_VGPR_SZ(reg_name, reg_size) reg_name##_##reg_size > > This is used for both register and instruction. How about > > #define VPASTER(x,y) x##_##y Will fix for V5. > > > > +#define VGPR_SZ(reg_name, reg_size) PRIM_VGPR_SZ(reg_name, reg_size) > > +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH) > > +#define VKINSN_SZ(insn, reg_size) PRIM_VGPR_SZ(insn, reg_size) > > No need for both VGPR_SZ and VKINSN_SZ.
How about > > #define VEVALUATOR(x,y) VPASTER(x,y) Will change for V5. > > > + > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py > > new file mode 100644 > > index 0000000000..5b04e89ecb > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py > > @@ -0,0 +1,97 @@ > > +#!/usr/bin/python3 > > +# Copyright (C) 2022 Free Software Foundation, Inc. > > +# This file is part of the GNU C Library. > > +# > > +# The GNU C Library is free software; you can redistribute it and/or > > +# modify it under the terms of the GNU Lesser General Public > > +# License as published by the Free Software Foundation; either > > +# version 2.1 of the License, or (at your option) any later version. > > +# > > +# The GNU C Library is distributed in the hope that it will be useful, > > +# but WITHOUT ANY WARRANTY; without even the implied warranty of > > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > +# Lesser General Public License for more details. > > +# > > +# You should have received a copy of the GNU Lesser General Public > > +# License along with the GNU C Library; if not, see > > +# <https://www.gnu.org/licenses/>. 
> > +"""Generate macros for getting GPR name of a certain size > > + > > +Inputs: None > > +Output: Prints header fill to stdout > > + > > +API: > > + VGPR(reg_name) > > + - Get register name VEC_SIZE component of `reg_name` > > + VGPR_SZ(reg_name, reg_size) > > + - Get register name `reg_size` component of `reg_name` > > +""" > > + > > +import sys > > +from datetime import datetime > > + > > +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"], > > + ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"], > > + ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"], > > + ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"], > > + ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"], > > + ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"], > > + ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"], > > + ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]] > > + > > +mask_insns = ["kmov", "kortest", "kor", "ktest", "kand", "kxor"] > > +mask_insns_ext = ["b", "w", "d", "q"] > > + > > +cr = """ > > + Copyright (C) {} Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. 
*/ > > +""" > > + > > +print("/* This file was generated by: {}.".format(sys.argv[0])) > > +print(cr.format(datetime.today().year)) > > + > > +print("#ifndef _REG_MAP_MACROS_H") > > +print("#define _REG_MAP_MACROS_H\t1\n") > > +for reg in registers: > > + for i in range(0, 4): > > + for j in range(0, 4): > > + print("#define {}_{}\t{}".format(reg[j], 8 << i, reg[3 - i])) > > + > > +print("") > > +for reg in registers: > > + print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0])) > > + > > +print("") > > +for mask_insn in mask_insns: > > + for i in range(0, 4): > > + print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn, > > + mask_insns_ext[i])) > > + > > +print("") > > +for mask_insn in mask_insns: > > + print("#define {}V \tVKINSN_SZ({}, REG_WIDTH)".format(mask_insn, mask_insn)) > > +print("") > > + > > +print("#ifndef REG_WIDTH") > > +print("#define REG_WIDTH VEC_SIZE") > > +print("#endif") > > +print("#define PRIM_VGPR_SZ(reg_name, reg_size)\treg_name##_##reg_size") > > +print("#define VGPR_SZ(reg_name, reg_size)\tPRIM_VGPR_SZ(reg_name, reg_size)") > > +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)") > > +print("#define VKINSN_SZ(insn, reg_size)\tPRIM_VGPR_SZ(insn, reg_size)") > > + > > +print("\n#endif") > > -- > > 2.34.1 > > > > > -- > H.J.
On Fri, Oct 14, 2022 at 11:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Fri, Oct 14, 2022 at 1:02 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > This is to make it easier to do think like: > > > ``` > > > vpcmpb %VEC(0), %VEC(1), %k0 > > > kmov{d|q} %k0, %{eax|rax} > > > test %{eax|rax} > > > ``` > > > > Since all these register macros are based on VEC_SIZE which is either 32 > > bytes or 64 bytes, only 32-bit or 64-bit integer and mask register macros are > > needed. 8-bit and 16-bit macros aren't needed. > > > > > It adds macro s.t any GPR can get the proper width with: > > > `V{upper_case_GPR_name}` > > > > > > and any mask insn can get the proper width with: > > > `{mask_insn_without_postfix}V` > > > > All macros should be in upper cases. > > > > > This commit does not change libc.so > > > > > > Tested build on x86-64 > > > --- > > > sysdeps/x86_64/multiarch/reg-macros.h | 337 ++++++++++++++++++ > > > .../multiarch/scripts/gen-reg-map-macros.py | 97 +++++ > > > 2 files changed, 434 insertions(+) > > > create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h > > > create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py > > > > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h > > > new file mode 100644 > > > index 0000000000..c4d7f57b66 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h > > > > vreg-macros.h to indicate macros based on vector size. Please > > add comments to indicate that register macros are expanded based > > on vector size. > > > > > @@ -0,0 +1,337 @@ > > > +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py. > > > + > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > + This file is part of the GNU C Library. 
> > > + > > > + The GNU C Library is free software; you can redistribute it and/or > > > + modify it under the terms of the GNU Lesser General Public > > > + License as published by the Free Software Foundation; either > > > + version 2.1 of the License, or (at your option) any later version. > > > + > > > + The GNU C Library is distributed in the hope that it will be useful, > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > + Lesser General Public License for more details. > > > + > > > + You should have received a copy of the GNU Lesser General Public > > > + License along with the GNU C Library; if not, see > > > + <https://www.gnu.org/licenses/>. */ > > > + > > > +#ifndef _REG_MAP_MACROS_H > > > +#define _REG_MAP_MACROS_H 1 > > > + > > > +#define rax_8 al > > > +#define eax_8 al > > > +#define ax_8 al > > > +#define al_8 al > > > +#define rax_16 ax > > > +#define eax_16 ax > > > +#define ax_16 ax > > > +#define al_16 ax > > > +#define rax_32 eax > > > +#define eax_32 eax > > > +#define ax_32 eax > > > +#define al_32 eax > > > +#define rax_64 rax > > > +#define eax_64 rax > > > +#define ax_64 rax > > > +#define al_64 rax > > > > Only rax_32 and rax_64 are needed. 
> > > > > +#define rbx_8 bl > > > +#define ebx_8 bl > > > +#define bx_8 bl > > > +#define bl_8 bl > > > +#define rbx_16 bx > > > +#define ebx_16 bx > > > +#define bx_16 bx > > > +#define bl_16 bx > > > +#define rbx_32 ebx > > > +#define ebx_32 ebx > > > +#define bx_32 ebx > > > +#define bl_32 ebx > > > +#define rbx_64 rbx > > > +#define ebx_64 rbx > > > +#define bx_64 rbx > > > +#define bl_64 rbx > > > +#define rcx_8 cl > > > +#define ecx_8 cl > > > +#define cx_8 cl > > > +#define cl_8 cl > > > +#define rcx_16 cx > > > +#define ecx_16 cx > > > +#define cx_16 cx > > > +#define cl_16 cx > > > +#define rcx_32 ecx > > > +#define ecx_32 ecx > > > +#define cx_32 ecx > > > +#define cl_32 ecx > > > +#define rcx_64 rcx > > > +#define ecx_64 rcx > > > +#define cx_64 rcx > > > +#define cl_64 rcx > > > +#define rdx_8 dl > > > +#define edx_8 dl > > > +#define dx_8 dl > > > +#define dl_8 dl > > > +#define rdx_16 dx > > > +#define edx_16 dx > > > +#define dx_16 dx > > > +#define dl_16 dx > > > +#define rdx_32 edx > > > +#define edx_32 edx > > > +#define dx_32 edx > > > +#define dl_32 edx > > > +#define rdx_64 rdx > > > +#define edx_64 rdx > > > +#define dx_64 rdx > > > +#define dl_64 rdx > > > +#define rbp_8 bpl > > > +#define ebp_8 bpl > > > +#define bp_8 bpl > > > +#define bpl_8 bpl > > > +#define rbp_16 bp > > > +#define ebp_16 bp > > > +#define bp_16 bp > > > +#define bpl_16 bp > > > +#define rbp_32 ebp > > > +#define ebp_32 ebp > > > +#define bp_32 ebp > > > +#define bpl_32 ebp > > > +#define rbp_64 rbp > > > +#define ebp_64 rbp > > > +#define bp_64 rbp > > > +#define bpl_64 rbp > > > +#define rsp_8 spl > > > +#define esp_8 spl > > > +#define sp_8 spl > > > +#define spl_8 spl > > > +#define rsp_16 sp > > > +#define esp_16 sp > > > +#define sp_16 sp > > > +#define spl_16 sp > > > +#define rsp_32 esp > > > +#define esp_32 esp > > > +#define sp_32 esp > > > +#define spl_32 esp > > > +#define rsp_64 rsp > > > +#define esp_64 rsp > > > +#define sp_64 rsp > > > +#define spl_64 rsp 
> > > +#define rsi_8 sil > > > +#define esi_8 sil > > > +#define si_8 sil > > > +#define sil_8 sil > > > +#define rsi_16 si > > > +#define esi_16 si > > > +#define si_16 si > > > +#define sil_16 si > > > +#define rsi_32 esi > > > +#define esi_32 esi > > > +#define si_32 esi > > > +#define sil_32 esi > > > +#define rsi_64 rsi > > > +#define esi_64 rsi > > > +#define si_64 rsi > > > +#define sil_64 rsi > > > +#define rdi_8 dil > > > +#define edi_8 dil > > > +#define di_8 dil > > > +#define dil_8 dil > > > +#define rdi_16 di > > > +#define edi_16 di > > > +#define di_16 di > > > +#define dil_16 di > > > +#define rdi_32 edi > > > +#define edi_32 edi > > > +#define di_32 edi > > > +#define dil_32 edi > > > +#define rdi_64 rdi > > > +#define edi_64 rdi > > > +#define di_64 rdi > > > +#define dil_64 rdi > > > +#define r8_8 r8b > > > +#define r8d_8 r8b > > > +#define r8w_8 r8b > > > +#define r8b_8 r8b > > > +#define r8_16 r8w > > > +#define r8d_16 r8w > > > +#define r8w_16 r8w > > > +#define r8b_16 r8w > > > +#define r8_32 r8d > > > +#define r8d_32 r8d > > > +#define r8w_32 r8d > > > +#define r8b_32 r8d > > > +#define r8_64 r8 > > > +#define r8d_64 r8 > > > +#define r8w_64 r8 > > > +#define r8b_64 r8 > > > +#define r9_8 r9b > > > +#define r9d_8 r9b > > > +#define r9w_8 r9b > > > +#define r9b_8 r9b > > > +#define r9_16 r9w > > > +#define r9d_16 r9w > > > +#define r9w_16 r9w > > > +#define r9b_16 r9w > > > +#define r9_32 r9d > > > +#define r9d_32 r9d > > > +#define r9w_32 r9d > > > +#define r9b_32 r9d > > > +#define r9_64 r9 > > > +#define r9d_64 r9 > > > +#define r9w_64 r9 > > > +#define r9b_64 r9 > > > +#define r10_8 r10b > > > +#define r10d_8 r10b > > > +#define r10w_8 r10b > > > +#define r10b_8 r10b > > > +#define r10_16 r10w > > > +#define r10d_16 r10w > > > +#define r10w_16 r10w > > > +#define r10b_16 r10w > > > +#define r10_32 r10d > > > +#define r10d_32 r10d > > > +#define r10w_32 r10d > > > +#define r10b_32 r10d > > > +#define r10_64 r10 > > > +#define r10d_64 r10 > 
> > +#define r10w_64 r10 > > > +#define r10b_64 r10 > > > +#define r11_8 r11b > > > +#define r11d_8 r11b > > > +#define r11w_8 r11b > > > +#define r11b_8 r11b > > > +#define r11_16 r11w > > > +#define r11d_16 r11w > > > +#define r11w_16 r11w > > > +#define r11b_16 r11w > > > +#define r11_32 r11d > > > +#define r11d_32 r11d > > > +#define r11w_32 r11d > > > +#define r11b_32 r11d > > > +#define r11_64 r11 > > > +#define r11d_64 r11 > > > +#define r11w_64 r11 > > > +#define r11b_64 r11 > > > +#define r12_8 r12b > > > +#define r12d_8 r12b > > > +#define r12w_8 r12b > > > +#define r12b_8 r12b > > > +#define r12_16 r12w > > > +#define r12d_16 r12w > > > +#define r12w_16 r12w > > > +#define r12b_16 r12w > > > +#define r12_32 r12d > > > +#define r12d_32 r12d > > > +#define r12w_32 r12d > > > +#define r12b_32 r12d > > > +#define r12_64 r12 > > > +#define r12d_64 r12 > > > +#define r12w_64 r12 > > > +#define r12b_64 r12 > > > +#define r13_8 r13b > > > +#define r13d_8 r13b > > > +#define r13w_8 r13b > > > +#define r13b_8 r13b > > > +#define r13_16 r13w > > > +#define r13d_16 r13w > > > +#define r13w_16 r13w > > > +#define r13b_16 r13w > > > +#define r13_32 r13d > > > +#define r13d_32 r13d > > > +#define r13w_32 r13d > > > +#define r13b_32 r13d > > > +#define r13_64 r13 > > > +#define r13d_64 r13 > > > +#define r13w_64 r13 > > > +#define r13b_64 r13 > > > +#define r14_8 r14b > > > +#define r14d_8 r14b > > > +#define r14w_8 r14b > > > +#define r14b_8 r14b > > > +#define r14_16 r14w > > > +#define r14d_16 r14w > > > +#define r14w_16 r14w > > > +#define r14b_16 r14w > > > +#define r14_32 r14d > > > +#define r14d_32 r14d > > > +#define r14w_32 r14d > > > +#define r14b_32 r14d > > > +#define r14_64 r14 > > > +#define r14d_64 r14 > > > +#define r14w_64 r14 > > > +#define r14b_64 r14 > > > +#define r15_8 r15b > > > +#define r15d_8 r15b > > > +#define r15w_8 r15b > > > +#define r15b_8 r15b > > > +#define r15_16 r15w > > > +#define r15d_16 r15w > > > +#define r15w_16 r15w > > > 
+#define r15b_16 r15w > > > +#define r15_32 r15d > > > +#define r15d_32 r15d > > > +#define r15w_32 r15d > > > +#define r15b_32 r15d > > > +#define r15_64 r15 > > > +#define r15d_64 r15 > > > +#define r15w_64 r15 > > > +#define r15b_64 r15 > > > + > > > +#define VRAX VGPR(rax) > > > +#define VRBX VGPR(rbx) > > > +#define VRCX VGPR(rcx) > > > +#define VRDX VGPR(rdx) > > > +#define VRBP VGPR(rbp) > > > +#define VRSP VGPR(rsp) > > > +#define VRSI VGPR(rsi) > > > +#define VRDI VGPR(rdi) > > > +#define VR8 VGPR(r8) > > > +#define VR9 VGPR(r9) > > > +#define VR10 VGPR(r10) > > > +#define VR11 VGPR(r11) > > > +#define VR12 VGPR(r12) > > > +#define VR13 VGPR(r13) > > > +#define VR14 VGPR(r14) > > > +#define VR15 VGPR(r15) > > > + > > > +#define kmov_8 kmovb > > > +#define kmov_16 kmovw > > > +#define kmov_32 kmovd > > > +#define kmov_64 kmovq > > > > Only 32 and 64 are needed. > > Thats not entirely true for the wide-char impls. > > > > > +#define kortest_8 kortestb > > > +#define kortest_16 kortestw > > > +#define kortest_32 kortestd > > > +#define kortest_64 kortestq > > > +#define kor_8 korb > > > +#define kor_16 korw > > > +#define kor_32 kord > > > +#define kor_64 korq > > > +#define ktest_8 ktestb > > > +#define ktest_16 ktestw > > > +#define ktest_32 ktestd > > > +#define ktest_64 ktestq > > > +#define kand_8 kandb > > > +#define kand_16 kandw > > > +#define kand_32 kandd > > > +#define kand_64 kandq > > > +#define kxor_8 kxorb > > > +#define kxor_16 kxorw > > > +#define kxor_32 kxord > > > +#define kxor_64 kxorq > > > + > > > +#define kmovV VKINSN_SZ(kmov, REG_WIDTH) > > > +#define kortestV VKINSN_SZ(kortest, REG_WIDTH) > > > +#define korV VKINSN_SZ(kor, REG_WIDTH) > > > +#define ktestV VKINSN_SZ(ktest, REG_WIDTH) > > > +#define kandV VKINSN_SZ(kand, REG_WIDTH) > > > +#define kxorV VKINSN_SZ(kxor, REG_WIDTH) > > > > #define VKINSN(op) VKINSN_SZ(op, REG_WIDTH) > > Will fix for V5. 
> > > > > + > > > +#ifndef REG_WIDTH > > > +#define REG_WIDTH VEC_SIZE > > > > Since REG_WIDTH must be the same as VEC_SIZE, REG_WIDTH > > can be dropped. > > That's not quite true. > > For wide-char impls REG_WIDTH != VEC_SIZE. These register macros are used to operate on vectors. Do you have an example of REG_WIDTH != VEC_SIZE? > > > > > +#endif > > > +#define PRIM_VGPR_SZ(reg_name, reg_size) reg_name##_##reg_size > > > > This is used for both register and instruction. How about > > > > #define VPASTER(x,y) x##_##y > > Will fix for V5. > > > > > > > +#define VGPR_SZ(reg_name, reg_size) PRIM_VGPR_SZ(reg_name, reg_size) > > > +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH) > > > +#define VKINSN_SZ(insn, reg_size) PRIM_VGPR_SZ(insn, reg_size) > > > > No need for both VGPR_SZ and VKINSN_SZ. How about > > > > #define VEVALUATOR(x,y) VPASTER(x,y) > > Will change for V5. > > > > > + > > > +#endif > > > diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py > > > new file mode 100644 > > > index 0000000000..5b04e89ecb > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py > > > @@ -0,0 +1,97 @@ > > > +#!/usr/bin/python3 > > > +# Copyright (C) 2022 Free Software Foundation, Inc. > > > +# This file is part of the GNU C Library. > > > +# > > > +# The GNU C Library is free software; you can redistribute it and/or > > > +# modify it under the terms of the GNU Lesser General Public > > > +# License as published by the Free Software Foundation; either > > > +# version 2.1 of the License, or (at your option) any later version. > > > +# > > > +# The GNU C Library is distributed in the hope that it will be useful, > > > +# but WITHOUT ANY WARRANTY; without even the implied warranty of > > > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > +# Lesser General Public License for more details.
> > > +# > > > +# You should have received a copy of the GNU Lesser General Public > > > +# License along with the GNU C Library; if not, see > > > +# <https://www.gnu.org/licenses/>. > > > +"""Generate macros for getting GPR name of a certain size > > > + > > > +Inputs: None > > > +Output: Prints header fill to stdout > > > + > > > +API: > > > + VGPR(reg_name) > > > + - Get register name VEC_SIZE component of `reg_name` > > > + VGPR_SZ(reg_name, reg_size) > > > + - Get register name `reg_size` component of `reg_name` > > > +""" > > > + > > > +import sys > > > +from datetime import datetime > > > + > > > +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"], > > > + ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"], > > > + ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"], > > > + ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"], > > > + ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"], > > > + ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"], > > > + ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"], > > > + ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]] > > > + > > > +mask_insns = ["kmov", "kortest", "kor", "ktest", "kand", "kxor"] > > > +mask_insns_ext = ["b", "w", "d", "q"] > > > + > > > +cr = """ > > > + Copyright (C) {} Free Software Foundation, Inc. > > > + This file is part of the GNU C Library. > > > + > > > + The GNU C Library is free software; you can redistribute it and/or > > > + modify it under the terms of the GNU Lesser General Public > > > + License as published by the Free Software Foundation; either > > > + version 2.1 of the License, or (at your option) any later version. > > > + > > > + The GNU C Library is distributed in the hope that it will be useful, > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > + Lesser General Public License for more details. 
> > > + > > > + You should have received a copy of the GNU Lesser General Public > > > + License along with the GNU C Library; if not, see > > > + <https://www.gnu.org/licenses/>. */ > > > +""" > > > + > > > +print("/* This file was generated by: {}.".format(sys.argv[0])) > > > +print(cr.format(datetime.today().year)) > > > + > > > +print("#ifndef _REG_MAP_MACROS_H") > > > +print("#define _REG_MAP_MACROS_H\t1\n") > > > +for reg in registers: > > > + for i in range(0, 4): > > > + for j in range(0, 4): > > > + print("#define {}_{}\t{}".format(reg[j], 8 << i, reg[3 - i])) > > > + > > > +print("") > > > +for reg in registers: > > > + print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0])) > > > + > > > +print("") > > > +for mask_insn in mask_insns: > > > + for i in range(0, 4): > > > + print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn, > > > + mask_insns_ext[i])) > > > + > > > +print("") > > > +for mask_insn in mask_insns: > > > + print("#define {}V \tVKINSN_SZ({}, REG_WIDTH)".format(mask_insn, mask_insn)) > > > +print("") > > > + > > > +print("#ifndef REG_WIDTH") > > > +print("#define REG_WIDTH VEC_SIZE") > > > +print("#endif") > > > +print("#define PRIM_VGPR_SZ(reg_name, reg_size)\treg_name##_##reg_size") > > > +print("#define VGPR_SZ(reg_name, reg_size)\tPRIM_VGPR_SZ(reg_name, reg_size)") > > > +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)") > > > +print("#define VKINSN_SZ(insn, reg_size)\tPRIM_VGPR_SZ(insn, reg_size)") > > > + > > > +print("\n#endif") > > > -- > > > 2.34.1 > > > > > > > > > -- > > H.J.
On Fri, Oct 14, 2022 at 1:35 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Fri, Oct 14, 2022 at 11:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Fri, Oct 14, 2022 at 1:02 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > This is to make it easier to do think like: > > > > ``` > > > > vpcmpb %VEC(0), %VEC(1), %k0 > > > > kmov{d|q} %k0, %{eax|rax} > > > > test %{eax|rax} > > > > ``` > > > > > > Since all these register macros are based on VEC_SIZE which is either 32 > > > bytes or 64 bytes, only 32-bit or 64-bit integer and mask register macros are > > > needed. 8-bit and 16-bit macros aren't needed. > > > > > > > It adds macro s.t any GPR can get the proper width with: > > > > `V{upper_case_GPR_name}` > > > > > > > > and any mask insn can get the proper width with: > > > > `{mask_insn_without_postfix}V` > > > > > > All macros should be in upper cases. > > > > > > > This commit does not change libc.so > > > > > > > > Tested build on x86-64 > > > > --- > > > > sysdeps/x86_64/multiarch/reg-macros.h | 337 ++++++++++++++++++ > > > > .../multiarch/scripts/gen-reg-map-macros.py | 97 +++++ > > > > 2 files changed, 434 insertions(+) > > > > create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h > > > > create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h > > > > new file mode 100644 > > > > index 0000000000..c4d7f57b66 > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h > > > > > > vreg-macros.h to indicate macros based on vector size. Please > > > add comments to indicate that register macros are expanded based > > > on vector size. > > > > > > > @@ -0,0 +1,337 @@ > > > > +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py. 
> > > > + > > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > > + This file is part of the GNU C Library. > > > > + > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > + modify it under the terms of the GNU Lesser General Public > > > > + License as published by the Free Software Foundation; either > > > > + version 2.1 of the License, or (at your option) any later version. > > > > + > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > + Lesser General Public License for more details. > > > > + > > > > + You should have received a copy of the GNU Lesser General Public > > > > + License along with the GNU C Library; if not, see > > > > + <https://www.gnu.org/licenses/>. */ > > > > + > > > > +#ifndef _REG_MAP_MACROS_H > > > > +#define _REG_MAP_MACROS_H 1 > > > > + > > > > +#define rax_8 al > > > > +#define eax_8 al > > > > +#define ax_8 al > > > > +#define al_8 al > > > > +#define rax_16 ax > > > > +#define eax_16 ax > > > > +#define ax_16 ax > > > > +#define al_16 ax > > > > +#define rax_32 eax > > > > +#define eax_32 eax > > > > +#define ax_32 eax > > > > +#define al_32 eax > > > > +#define rax_64 rax > > > > +#define eax_64 rax > > > > +#define ax_64 rax > > > > +#define al_64 rax > > > > > > Only rax_32 and rax_64 are needed. 
> > > > > > > +#define rbx_8 bl > > > > +#define ebx_8 bl > > > > +#define bx_8 bl > > > > +#define bl_8 bl > > > > +#define rbx_16 bx > > > > +#define ebx_16 bx > > > > +#define bx_16 bx > > > > +#define bl_16 bx > > > > +#define rbx_32 ebx > > > > +#define ebx_32 ebx > > > > +#define bx_32 ebx > > > > +#define bl_32 ebx > > > > +#define rbx_64 rbx > > > > +#define ebx_64 rbx > > > > +#define bx_64 rbx > > > > +#define bl_64 rbx > > > > +#define rcx_8 cl > > > > +#define ecx_8 cl > > > > +#define cx_8 cl > > > > +#define cl_8 cl > > > > +#define rcx_16 cx > > > > +#define ecx_16 cx > > > > +#define cx_16 cx > > > > +#define cl_16 cx > > > > +#define rcx_32 ecx > > > > +#define ecx_32 ecx > > > > +#define cx_32 ecx > > > > +#define cl_32 ecx > > > > +#define rcx_64 rcx > > > > +#define ecx_64 rcx > > > > +#define cx_64 rcx > > > > +#define cl_64 rcx > > > > +#define rdx_8 dl > > > > +#define edx_8 dl > > > > +#define dx_8 dl > > > > +#define dl_8 dl > > > > +#define rdx_16 dx > > > > +#define edx_16 dx > > > > +#define dx_16 dx > > > > +#define dl_16 dx > > > > +#define rdx_32 edx > > > > +#define edx_32 edx > > > > +#define dx_32 edx > > > > +#define dl_32 edx > > > > +#define rdx_64 rdx > > > > +#define edx_64 rdx > > > > +#define dx_64 rdx > > > > +#define dl_64 rdx > > > > +#define rbp_8 bpl > > > > +#define ebp_8 bpl > > > > +#define bp_8 bpl > > > > +#define bpl_8 bpl > > > > +#define rbp_16 bp > > > > +#define ebp_16 bp > > > > +#define bp_16 bp > > > > +#define bpl_16 bp > > > > +#define rbp_32 ebp > > > > +#define ebp_32 ebp > > > > +#define bp_32 ebp > > > > +#define bpl_32 ebp > > > > +#define rbp_64 rbp > > > > +#define ebp_64 rbp > > > > +#define bp_64 rbp > > > > +#define bpl_64 rbp > > > > +#define rsp_8 spl > > > > +#define esp_8 spl > > > > +#define sp_8 spl > > > > +#define spl_8 spl > > > > +#define rsp_16 sp > > > > +#define esp_16 sp > > > > +#define sp_16 sp > > > > +#define spl_16 sp > > > > +#define rsp_32 esp > > > > +#define esp_32 esp > > 
> > +#define sp_32 esp > > > > +#define spl_32 esp > > > > +#define rsp_64 rsp > > > > +#define esp_64 rsp > > > > +#define sp_64 rsp > > > > +#define spl_64 rsp > > > > +#define rsi_8 sil > > > > +#define esi_8 sil > > > > +#define si_8 sil > > > > +#define sil_8 sil > > > > +#define rsi_16 si > > > > +#define esi_16 si > > > > +#define si_16 si > > > > +#define sil_16 si > > > > +#define rsi_32 esi > > > > +#define esi_32 esi > > > > +#define si_32 esi > > > > +#define sil_32 esi > > > > +#define rsi_64 rsi > > > > +#define esi_64 rsi > > > > +#define si_64 rsi > > > > +#define sil_64 rsi > > > > +#define rdi_8 dil > > > > +#define edi_8 dil > > > > +#define di_8 dil > > > > +#define dil_8 dil > > > > +#define rdi_16 di > > > > +#define edi_16 di > > > > +#define di_16 di > > > > +#define dil_16 di > > > > +#define rdi_32 edi > > > > +#define edi_32 edi > > > > +#define di_32 edi > > > > +#define dil_32 edi > > > > +#define rdi_64 rdi > > > > +#define edi_64 rdi > > > > +#define di_64 rdi > > > > +#define dil_64 rdi > > > > +#define r8_8 r8b > > > > +#define r8d_8 r8b > > > > +#define r8w_8 r8b > > > > +#define r8b_8 r8b > > > > +#define r8_16 r8w > > > > +#define r8d_16 r8w > > > > +#define r8w_16 r8w > > > > +#define r8b_16 r8w > > > > +#define r8_32 r8d > > > > +#define r8d_32 r8d > > > > +#define r8w_32 r8d > > > > +#define r8b_32 r8d > > > > +#define r8_64 r8 > > > > +#define r8d_64 r8 > > > > +#define r8w_64 r8 > > > > +#define r8b_64 r8 > > > > +#define r9_8 r9b > > > > +#define r9d_8 r9b > > > > +#define r9w_8 r9b > > > > +#define r9b_8 r9b > > > > +#define r9_16 r9w > > > > +#define r9d_16 r9w > > > > +#define r9w_16 r9w > > > > +#define r9b_16 r9w > > > > +#define r9_32 r9d > > > > +#define r9d_32 r9d > > > > +#define r9w_32 r9d > > > > +#define r9b_32 r9d > > > > +#define r9_64 r9 > > > > +#define r9d_64 r9 > > > > +#define r9w_64 r9 > > > > +#define r9b_64 r9 > > > > +#define r10_8 r10b > > > > +#define r10d_8 r10b > > > > +#define r10w_8 r10b > > > > 
+#define r10b_8 r10b > > > > +#define r10_16 r10w > > > > +#define r10d_16 r10w > > > > +#define r10w_16 r10w > > > > +#define r10b_16 r10w > > > > +#define r10_32 r10d > > > > +#define r10d_32 r10d > > > > +#define r10w_32 r10d > > > > +#define r10b_32 r10d > > > > +#define r10_64 r10 > > > > +#define r10d_64 r10 > > > > +#define r10w_64 r10 > > > > +#define r10b_64 r10 > > > > +#define r11_8 r11b > > > > +#define r11d_8 r11b > > > > +#define r11w_8 r11b > > > > +#define r11b_8 r11b > > > > +#define r11_16 r11w > > > > +#define r11d_16 r11w > > > > +#define r11w_16 r11w > > > > +#define r11b_16 r11w > > > > +#define r11_32 r11d > > > > +#define r11d_32 r11d > > > > +#define r11w_32 r11d > > > > +#define r11b_32 r11d > > > > +#define r11_64 r11 > > > > +#define r11d_64 r11 > > > > +#define r11w_64 r11 > > > > +#define r11b_64 r11 > > > > +#define r12_8 r12b > > > > +#define r12d_8 r12b > > > > +#define r12w_8 r12b > > > > +#define r12b_8 r12b > > > > +#define r12_16 r12w > > > > +#define r12d_16 r12w > > > > +#define r12w_16 r12w > > > > +#define r12b_16 r12w > > > > +#define r12_32 r12d > > > > +#define r12d_32 r12d > > > > +#define r12w_32 r12d > > > > +#define r12b_32 r12d > > > > +#define r12_64 r12 > > > > +#define r12d_64 r12 > > > > +#define r12w_64 r12 > > > > +#define r12b_64 r12 > > > > +#define r13_8 r13b > > > > +#define r13d_8 r13b > > > > +#define r13w_8 r13b > > > > +#define r13b_8 r13b > > > > +#define r13_16 r13w > > > > +#define r13d_16 r13w > > > > +#define r13w_16 r13w > > > > +#define r13b_16 r13w > > > > +#define r13_32 r13d > > > > +#define r13d_32 r13d > > > > +#define r13w_32 r13d > > > > +#define r13b_32 r13d > > > > +#define r13_64 r13 > > > > +#define r13d_64 r13 > > > > +#define r13w_64 r13 > > > > +#define r13b_64 r13 > > > > +#define r14_8 r14b > > > > +#define r14d_8 r14b > > > > +#define r14w_8 r14b > > > > +#define r14b_8 r14b > > > > +#define r14_16 r14w > > > > +#define r14d_16 r14w > > > > +#define r14w_16 r14w > > > > +#define 
r14b_16 r14w > > > > +#define r14_32 r14d > > > > +#define r14d_32 r14d > > > > +#define r14w_32 r14d > > > > +#define r14b_32 r14d > > > > +#define r14_64 r14 > > > > +#define r14d_64 r14 > > > > +#define r14w_64 r14 > > > > +#define r14b_64 r14 > > > > +#define r15_8 r15b > > > > +#define r15d_8 r15b > > > > +#define r15w_8 r15b > > > > +#define r15b_8 r15b > > > > +#define r15_16 r15w > > > > +#define r15d_16 r15w > > > > +#define r15w_16 r15w > > > > +#define r15b_16 r15w > > > > +#define r15_32 r15d > > > > +#define r15d_32 r15d > > > > +#define r15w_32 r15d > > > > +#define r15b_32 r15d > > > > +#define r15_64 r15 > > > > +#define r15d_64 r15 > > > > +#define r15w_64 r15 > > > > +#define r15b_64 r15 > > > > + > > > > +#define VRAX VGPR(rax) > > > > +#define VRBX VGPR(rbx) > > > > +#define VRCX VGPR(rcx) > > > > +#define VRDX VGPR(rdx) > > > > +#define VRBP VGPR(rbp) > > > > +#define VRSP VGPR(rsp) > > > > +#define VRSI VGPR(rsi) > > > > +#define VRDI VGPR(rdi) > > > > +#define VR8 VGPR(r8) > > > > +#define VR9 VGPR(r9) > > > > +#define VR10 VGPR(r10) > > > > +#define VR11 VGPR(r11) > > > > +#define VR12 VGPR(r12) > > > > +#define VR13 VGPR(r13) > > > > +#define VR14 VGPR(r14) > > > > +#define VR15 VGPR(r15) > > > > + > > > > +#define kmov_8 kmovb > > > > +#define kmov_16 kmovw > > > > +#define kmov_32 kmovd > > > > +#define kmov_64 kmovq > > > > > > Only 32 and 64 are needed. > > > > Thats not entirely true for the wide-char impls. 
> > > > > > > +#define kortest_8 kortestb > > > > +#define kortest_16 kortestw > > > > +#define kortest_32 kortestd > > > > +#define kortest_64 kortestq > > > > +#define kor_8 korb > > > > +#define kor_16 korw > > > > +#define kor_32 kord > > > > +#define kor_64 korq > > > > +#define ktest_8 ktestb > > > > +#define ktest_16 ktestw > > > > +#define ktest_32 ktestd > > > > +#define ktest_64 ktestq > > > > +#define kand_8 kandb > > > > +#define kand_16 kandw > > > > +#define kand_32 kandd > > > > +#define kand_64 kandq > > > > +#define kxor_8 kxorb > > > > +#define kxor_16 kxorw > > > > +#define kxor_32 kxord > > > > +#define kxor_64 kxorq > > > > + > > > > +#define kmovV VKINSN_SZ(kmov, REG_WIDTH) > > > > +#define kortestV VKINSN_SZ(kortest, REG_WIDTH) > > > > +#define korV VKINSN_SZ(kor, REG_WIDTH) > > > > +#define ktestV VKINSN_SZ(ktest, REG_WIDTH) > > > > +#define kandV VKINSN_SZ(kand, REG_WIDTH) > > > > +#define kxorV VKINSN_SZ(kxor, REG_WIDTH) > > > > > > #define VKINSN(op) VKINSN_SZ(op, REG_WIDTH) > > > > Will fix for V5. > > > > > > > + > > > > +#ifndef REG_WIDTH > > > > +#define REG_WIDTH VEC_SIZE > > > > > > Since REG_WIDTH must be the same as VEC_SIZE, REG_WIDTH > > > can be dropped. > > > > That's not quite true. > > > > For wide-char impls REG_WIDTH != VEC_SIZE. > > These register macros are used to operate on vectors. Do you have > an example of REG_WIDTH != VEC_SIZE? But since wide-char instructions use 32-bit comparisons, the resulting mask is narrower than 64 bits, i.e.: vpcmpd %zmm16, %zmm17, %k0 kmovd %k0, %eax will collect all the necessary bits and is preferred. The next version of Sunil's memchr-evex512 should have it. > > > > > > > > +#endif > > > > +#define PRIM_VGPR_SZ(reg_name, reg_size) reg_name##_##reg_size > > > > > > This is used for both register and instruction. How about > > > > > > #define VPASTER(x,y) x##_##y > > > > Will fix for V5.
> > > > > > > > > > +#define VGPR_SZ(reg_name, reg_size) PRIM_VGPR_SZ(reg_name, reg_size) > > > > +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH) > > > > +#define VKINSN_SZ(insn, reg_size) PRIM_VGPR_SZ(insn, reg_size) > > > > > > No need for both VGPR_SZ and VKINSN_SZ. How about > > > > > > #define VEVALUATOR(x,y) VPASTER(x,y) > > > > Will change for V5. > > > > > > > + > > > > +#endif > > > > diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py > > > > new file mode 100644 > > > > index 0000000000..5b04e89ecb > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py > > > > @@ -0,0 +1,97 @@ > > > > +#!/usr/bin/python3 > > > > +# Copyright (C) 2022 Free Software Foundation, Inc. > > > > +# This file is part of the GNU C Library. > > > > +# > > > > +# The GNU C Library is free software; you can redistribute it and/or > > > > +# modify it under the terms of the GNU Lesser General Public > > > > +# License as published by the Free Software Foundation; either > > > > +# version 2.1 of the License, or (at your option) any later version. > > > > +# > > > > +# The GNU C Library is distributed in the hope that it will be useful, > > > > +# but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > +# Lesser General Public License for more details. > > > > +# > > > > +# You should have received a copy of the GNU Lesser General Public > > > > +# License along with the GNU C Library; if not, see > > > > +# <https://www.gnu.org/licenses/>. 
> > > > +"""Generate macros for getting GPR name of a certain size > > > > + > > > > +Inputs: None > > > > +Output: Prints header fill to stdout > > > > + > > > > +API: > > > > + VGPR(reg_name) > > > > + - Get register name VEC_SIZE component of `reg_name` > > > > + VGPR_SZ(reg_name, reg_size) > > > > + - Get register name `reg_size` component of `reg_name` > > > > +""" > > > > + > > > > +import sys > > > > +from datetime import datetime > > > > + > > > > +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"], > > > > + ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"], > > > > + ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"], > > > > + ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"], > > > > + ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"], > > > > + ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"], > > > > + ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"], > > > > + ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]] > > > > + > > > > +mask_insns = ["kmov", "kortest", "kor", "ktest", "kand", "kxor"] > > > > +mask_insns_ext = ["b", "w", "d", "q"] > > > > + > > > > +cr = """ > > > > + Copyright (C) {} Free Software Foundation, Inc. > > > > + This file is part of the GNU C Library. > > > > + > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > + modify it under the terms of the GNU Lesser General Public > > > > + License as published by the Free Software Foundation; either > > > > + version 2.1 of the License, or (at your option) any later version. > > > > + > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > + Lesser General Public License for more details. 
> > > > + > > > > + You should have received a copy of the GNU Lesser General Public > > > > + License along with the GNU C Library; if not, see > > > > + <https://www.gnu.org/licenses/>. */ > > > > +""" > > > > + > > > > +print("/* This file was generated by: {}.".format(sys.argv[0])) > > > > +print(cr.format(datetime.today().year)) > > > > + > > > > +print("#ifndef _REG_MAP_MACROS_H") > > > > +print("#define _REG_MAP_MACROS_H\t1\n") > > > > +for reg in registers: > > > > + for i in range(0, 4): > > > > + for j in range(0, 4): > > > > + print("#define {}_{}\t{}".format(reg[j], 8 << i, reg[3 - i])) > > > > + > > > > +print("") > > > > +for reg in registers: > > > > + print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0])) > > > > + > > > > +print("") > > > > +for mask_insn in mask_insns: > > > > + for i in range(0, 4): > > > > + print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn, > > > > + mask_insns_ext[i])) > > > > + > > > > +print("") > > > > +for mask_insn in mask_insns: > > > > + print("#define {}V \tVKINSN_SZ({}, REG_WIDTH)".format(mask_insn, mask_insn)) > > > > +print("") > > > > + > > > > +print("#ifndef REG_WIDTH") > > > > +print("#define REG_WIDTH VEC_SIZE") > > > > +print("#endif") > > > > +print("#define PRIM_VGPR_SZ(reg_name, reg_size)\treg_name##_##reg_size") > > > > +print("#define VGPR_SZ(reg_name, reg_size)\tPRIM_VGPR_SZ(reg_name, reg_size)") > > > > +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)") > > > > +print("#define VKINSN_SZ(insn, reg_size)\tPRIM_VGPR_SZ(insn, reg_size)") > > > > + > > > > +print("\n#endif") > > > > -- > > > > 2.34.1 > > > > > > > > > > > > > -- > > > H.J. > > > > -- > H.J.
On Fri, Oct 14, 2022 at 11:38 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Fri, Oct 14, 2022 at 1:35 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Fri, Oct 14, 2022 at 11:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > On Fri, Oct 14, 2022 at 1:02 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > > > On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > > > This is to make it easier to do think like: > > > > > ``` > > > > > vpcmpb %VEC(0), %VEC(1), %k0 > > > > > kmov{d|q} %k0, %{eax|rax} > > > > > test %{eax|rax} > > > > > ``` > > > > > > > > Since all these register macros are based on VEC_SIZE which is either 32 > > > > bytes or 64 bytes, only 32-bit or 64-bit integer and mask register macros are > > > > needed. 8-bit and 16-bit macros aren't needed. > > > > > > > > > It adds macro s.t any GPR can get the proper width with: > > > > > `V{upper_case_GPR_name}` > > > > > > > > > > and any mask insn can get the proper width with: > > > > > `{mask_insn_without_postfix}V` > > > > > > > > All macros should be in upper cases. > > > > > > > > > This commit does not change libc.so > > > > > > > > > > Tested build on x86-64 > > > > > --- > > > > > sysdeps/x86_64/multiarch/reg-macros.h | 337 ++++++++++++++++++ > > > > > .../multiarch/scripts/gen-reg-map-macros.py | 97 +++++ > > > > > 2 files changed, 434 insertions(+) > > > > > create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h > > > > > create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h > > > > > new file mode 100644 > > > > > index 0000000000..c4d7f57b66 > > > > > --- /dev/null > > > > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h > > > > > > > > vreg-macros.h to indicate macros based on vector size. Please > > > > add comments to indicate that register macros are expanded based > > > > on vector size. 
> > > > > > > > > @@ -0,0 +1,337 @@ > > > > > +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py. > > > > > + > > > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > > > + This file is part of the GNU C Library. > > > > > + > > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > > + modify it under the terms of the GNU Lesser General Public > > > > > + License as published by the Free Software Foundation; either > > > > > + version 2.1 of the License, or (at your option) any later version. > > > > > + > > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > > + Lesser General Public License for more details. > > > > > + > > > > > + You should have received a copy of the GNU Lesser General Public > > > > > + License along with the GNU C Library; if not, see > > > > > + <https://www.gnu.org/licenses/>. */ > > > > > + > > > > > +#ifndef _REG_MAP_MACROS_H > > > > > +#define _REG_MAP_MACROS_H 1 > > > > > + > > > > > +#define rax_8 al > > > > > +#define eax_8 al > > > > > +#define ax_8 al > > > > > +#define al_8 al > > > > > +#define rax_16 ax > > > > > +#define eax_16 ax > > > > > +#define ax_16 ax > > > > > +#define al_16 ax > > > > > +#define rax_32 eax > > > > > +#define eax_32 eax > > > > > +#define ax_32 eax > > > > > +#define al_32 eax > > > > > +#define rax_64 rax > > > > > +#define eax_64 rax > > > > > +#define ax_64 rax > > > > > +#define al_64 rax > > > > > > > > Only rax_32 and rax_64 are needed. 
> > > > > > > > > +#define rbx_8 bl > > > > > +#define ebx_8 bl > > > > > +#define bx_8 bl > > > > > +#define bl_8 bl > > > > > +#define rbx_16 bx > > > > > +#define ebx_16 bx > > > > > +#define bx_16 bx > > > > > +#define bl_16 bx > > > > > +#define rbx_32 ebx > > > > > +#define ebx_32 ebx > > > > > +#define bx_32 ebx > > > > > +#define bl_32 ebx > > > > > +#define rbx_64 rbx > > > > > +#define ebx_64 rbx > > > > > +#define bx_64 rbx > > > > > +#define bl_64 rbx > > > > > +#define rcx_8 cl > > > > > +#define ecx_8 cl > > > > > +#define cx_8 cl > > > > > +#define cl_8 cl > > > > > +#define rcx_16 cx > > > > > +#define ecx_16 cx > > > > > +#define cx_16 cx > > > > > +#define cl_16 cx > > > > > +#define rcx_32 ecx > > > > > +#define ecx_32 ecx > > > > > +#define cx_32 ecx > > > > > +#define cl_32 ecx > > > > > +#define rcx_64 rcx > > > > > +#define ecx_64 rcx > > > > > +#define cx_64 rcx > > > > > +#define cl_64 rcx > > > > > +#define rdx_8 dl > > > > > +#define edx_8 dl > > > > > +#define dx_8 dl > > > > > +#define dl_8 dl > > > > > +#define rdx_16 dx > > > > > +#define edx_16 dx > > > > > +#define dx_16 dx > > > > > +#define dl_16 dx > > > > > +#define rdx_32 edx > > > > > +#define edx_32 edx > > > > > +#define dx_32 edx > > > > > +#define dl_32 edx > > > > > +#define rdx_64 rdx > > > > > +#define edx_64 rdx > > > > > +#define dx_64 rdx > > > > > +#define dl_64 rdx > > > > > +#define rbp_8 bpl > > > > > +#define ebp_8 bpl > > > > > +#define bp_8 bpl > > > > > +#define bpl_8 bpl > > > > > +#define rbp_16 bp > > > > > +#define ebp_16 bp > > > > > +#define bp_16 bp > > > > > +#define bpl_16 bp > > > > > +#define rbp_32 ebp > > > > > +#define ebp_32 ebp > > > > > +#define bp_32 ebp > > > > > +#define bpl_32 ebp > > > > > +#define rbp_64 rbp > > > > > +#define ebp_64 rbp > > > > > +#define bp_64 rbp > > > > > +#define bpl_64 rbp > > > > > +#define rsp_8 spl > > > > > +#define esp_8 spl > > > > > +#define sp_8 spl > > > > > +#define spl_8 spl > > > > > +#define rsp_16 sp 
> > > > > +#define esp_16 sp > > > > > +#define sp_16 sp > > > > > +#define spl_16 sp > > > > > +#define rsp_32 esp > > > > > +#define esp_32 esp > > > > > +#define sp_32 esp > > > > > +#define spl_32 esp > > > > > +#define rsp_64 rsp > > > > > +#define esp_64 rsp > > > > > +#define sp_64 rsp > > > > > +#define spl_64 rsp > > > > > +#define rsi_8 sil > > > > > +#define esi_8 sil > > > > > +#define si_8 sil > > > > > +#define sil_8 sil > > > > > +#define rsi_16 si > > > > > +#define esi_16 si > > > > > +#define si_16 si > > > > > +#define sil_16 si > > > > > +#define rsi_32 esi > > > > > +#define esi_32 esi > > > > > +#define si_32 esi > > > > > +#define sil_32 esi > > > > > +#define rsi_64 rsi > > > > > +#define esi_64 rsi > > > > > +#define si_64 rsi > > > > > +#define sil_64 rsi > > > > > +#define rdi_8 dil > > > > > +#define edi_8 dil > > > > > +#define di_8 dil > > > > > +#define dil_8 dil > > > > > +#define rdi_16 di > > > > > +#define edi_16 di > > > > > +#define di_16 di > > > > > +#define dil_16 di > > > > > +#define rdi_32 edi > > > > > +#define edi_32 edi > > > > > +#define di_32 edi > > > > > +#define dil_32 edi > > > > > +#define rdi_64 rdi > > > > > +#define edi_64 rdi > > > > > +#define di_64 rdi > > > > > +#define dil_64 rdi > > > > > +#define r8_8 r8b > > > > > +#define r8d_8 r8b > > > > > +#define r8w_8 r8b > > > > > +#define r8b_8 r8b > > > > > +#define r8_16 r8w > > > > > +#define r8d_16 r8w > > > > > +#define r8w_16 r8w > > > > > +#define r8b_16 r8w > > > > > +#define r8_32 r8d > > > > > +#define r8d_32 r8d > > > > > +#define r8w_32 r8d > > > > > +#define r8b_32 r8d > > > > > +#define r8_64 r8 > > > > > +#define r8d_64 r8 > > > > > +#define r8w_64 r8 > > > > > +#define r8b_64 r8 > > > > > +#define r9_8 r9b > > > > > +#define r9d_8 r9b > > > > > +#define r9w_8 r9b > > > > > +#define r9b_8 r9b > > > > > +#define r9_16 r9w > > > > > +#define r9d_16 r9w > > > > > +#define r9w_16 r9w > > > > > +#define r9b_16 r9w > > > > > +#define r9_32 r9d > > > > 
> +#define r9d_32 r9d > > > > > +#define r9w_32 r9d > > > > > +#define r9b_32 r9d > > > > > +#define r9_64 r9 > > > > > +#define r9d_64 r9 > > > > > +#define r9w_64 r9 > > > > > +#define r9b_64 r9 > > > > > +#define r10_8 r10b > > > > > +#define r10d_8 r10b > > > > > +#define r10w_8 r10b > > > > > +#define r10b_8 r10b > > > > > +#define r10_16 r10w > > > > > +#define r10d_16 r10w > > > > > +#define r10w_16 r10w > > > > > +#define r10b_16 r10w > > > > > +#define r10_32 r10d > > > > > +#define r10d_32 r10d > > > > > +#define r10w_32 r10d > > > > > +#define r10b_32 r10d > > > > > +#define r10_64 r10 > > > > > +#define r10d_64 r10 > > > > > +#define r10w_64 r10 > > > > > +#define r10b_64 r10 > > > > > +#define r11_8 r11b > > > > > +#define r11d_8 r11b > > > > > +#define r11w_8 r11b > > > > > +#define r11b_8 r11b > > > > > +#define r11_16 r11w > > > > > +#define r11d_16 r11w > > > > > +#define r11w_16 r11w > > > > > +#define r11b_16 r11w > > > > > +#define r11_32 r11d > > > > > +#define r11d_32 r11d > > > > > +#define r11w_32 r11d > > > > > +#define r11b_32 r11d > > > > > +#define r11_64 r11 > > > > > +#define r11d_64 r11 > > > > > +#define r11w_64 r11 > > > > > +#define r11b_64 r11 > > > > > +#define r12_8 r12b > > > > > +#define r12d_8 r12b > > > > > +#define r12w_8 r12b > > > > > +#define r12b_8 r12b > > > > > +#define r12_16 r12w > > > > > +#define r12d_16 r12w > > > > > +#define r12w_16 r12w > > > > > +#define r12b_16 r12w > > > > > +#define r12_32 r12d > > > > > +#define r12d_32 r12d > > > > > +#define r12w_32 r12d > > > > > +#define r12b_32 r12d > > > > > +#define r12_64 r12 > > > > > +#define r12d_64 r12 > > > > > +#define r12w_64 r12 > > > > > +#define r12b_64 r12 > > > > > +#define r13_8 r13b > > > > > +#define r13d_8 r13b > > > > > +#define r13w_8 r13b > > > > > +#define r13b_8 r13b > > > > > +#define r13_16 r13w > > > > > +#define r13d_16 r13w > > > > > +#define r13w_16 r13w > > > > > +#define r13b_16 r13w > > > > > +#define r13_32 r13d > > > > > +#define 
r13d_32 r13d > > > > > +#define r13w_32 r13d > > > > > +#define r13b_32 r13d > > > > > +#define r13_64 r13 > > > > > +#define r13d_64 r13 > > > > > +#define r13w_64 r13 > > > > > +#define r13b_64 r13 > > > > > +#define r14_8 r14b > > > > > +#define r14d_8 r14b > > > > > +#define r14w_8 r14b > > > > > +#define r14b_8 r14b > > > > > +#define r14_16 r14w > > > > > +#define r14d_16 r14w > > > > > +#define r14w_16 r14w > > > > > +#define r14b_16 r14w > > > > > +#define r14_32 r14d > > > > > +#define r14d_32 r14d > > > > > +#define r14w_32 r14d > > > > > +#define r14b_32 r14d > > > > > +#define r14_64 r14 > > > > > +#define r14d_64 r14 > > > > > +#define r14w_64 r14 > > > > > +#define r14b_64 r14 > > > > > +#define r15_8 r15b > > > > > +#define r15d_8 r15b > > > > > +#define r15w_8 r15b > > > > > +#define r15b_8 r15b > > > > > +#define r15_16 r15w > > > > > +#define r15d_16 r15w > > > > > +#define r15w_16 r15w > > > > > +#define r15b_16 r15w > > > > > +#define r15_32 r15d > > > > > +#define r15d_32 r15d > > > > > +#define r15w_32 r15d > > > > > +#define r15b_32 r15d > > > > > +#define r15_64 r15 > > > > > +#define r15d_64 r15 > > > > > +#define r15w_64 r15 > > > > > +#define r15b_64 r15 > > > > > + > > > > > +#define VRAX VGPR(rax) > > > > > +#define VRBX VGPR(rbx) > > > > > +#define VRCX VGPR(rcx) > > > > > +#define VRDX VGPR(rdx) > > > > > +#define VRBP VGPR(rbp) > > > > > +#define VRSP VGPR(rsp) > > > > > +#define VRSI VGPR(rsi) > > > > > +#define VRDI VGPR(rdi) > > > > > +#define VR8 VGPR(r8) > > > > > +#define VR9 VGPR(r9) > > > > > +#define VR10 VGPR(r10) > > > > > +#define VR11 VGPR(r11) > > > > > +#define VR12 VGPR(r12) > > > > > +#define VR13 VGPR(r13) > > > > > +#define VR14 VGPR(r14) > > > > > +#define VR15 VGPR(r15) > > > > > + > > > > > +#define kmov_8 kmovb > > > > > +#define kmov_16 kmovw > > > > > +#define kmov_32 kmovd > > > > > +#define kmov_64 kmovq > > > > > > > > Only 32 and 64 are needed. > > > > > > Thats not entirely true for the wide-char impls. 
> > > > > > > > > +#define kortest_8 kortestb > > > > > +#define kortest_16 kortestw > > > > > +#define kortest_32 kortestd > > > > > +#define kortest_64 kortestq > > > > > +#define kor_8 korb > > > > > +#define kor_16 korw > > > > > +#define kor_32 kord > > > > > +#define kor_64 korq > > > > > +#define ktest_8 ktestb > > > > > +#define ktest_16 ktestw > > > > > +#define ktest_32 ktestd > > > > > +#define ktest_64 ktestq > > > > > +#define kand_8 kandb > > > > > +#define kand_16 kandw > > > > > +#define kand_32 kandd > > > > > +#define kand_64 kandq > > > > > +#define kxor_8 kxorb > > > > > +#define kxor_16 kxorw > > > > > +#define kxor_32 kxord > > > > > +#define kxor_64 kxorq > > > > > + > > > > > +#define kmovV VKINSN_SZ(kmov, REG_WIDTH) > > > > > +#define kortestV VKINSN_SZ(kortest, REG_WIDTH) > > > > > +#define korV VKINSN_SZ(kor, REG_WIDTH) > > > > > +#define ktestV VKINSN_SZ(ktest, REG_WIDTH) > > > > > +#define kandV VKINSN_SZ(kand, REG_WIDTH) > > > > > +#define kxorV VKINSN_SZ(kxor, REG_WIDTH) > > > > > > > > #define VKINSN(op) VKINSN_SZ(op, REG_WIDTH) > > > > > > Will fix for V5. > > > > > > > > > + > > > > > +#ifndef REG_WIDTH > > > > > +#define REG_WIDTH VEC_SIZE > > > > > > > > Since REG_WIDTH must be the same as VEC_SIZE, REG_WIDTH > > > > can be dropped. > > > > > > Thats not quite true. > > > > > > For wide-char impls REG_WIDTH != VEC_SIZE. > > > > These register macros are used to operate vectors. Do you have > > an example of REG_WIDTH != VEC_SIZE? > > But since wide-char instructions use 32-bit comparison the resulting > mask is < 64-bit i.e: > > vpcmpd %zmm16, %zmm17, %k0 > kmovd %k0, %eax > will collect all the necessary bits and is prefered. > > Next version of Sunil's memchr-evex512 should have it. > So it is based on CHAR_PER_VEC. When will 8-bit and 16-bit registers be used? H.J.
On Fri, Oct 14, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Fri, Oct 14, 2022 at 11:38 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Fri, Oct 14, 2022 at 1:35 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > On Fri, Oct 14, 2022 at 11:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > On Fri, Oct 14, 2022 at 1:02 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > > > > > On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > > > > > This is to make it easier to do think like: > > > > > > ``` > > > > > > vpcmpb %VEC(0), %VEC(1), %k0 > > > > > > kmov{d|q} %k0, %{eax|rax} > > > > > > test %{eax|rax} > > > > > > ``` > > > > > > > > > > Since all these register macros are based on VEC_SIZE which is either 32 > > > > > bytes or 64 bytes, only 32-bit or 64-bit integer and mask register macros are > > > > > needed. 8-bit and 16-bit macros aren't needed. > > > > > > > > > > > It adds macro s.t any GPR can get the proper width with: > > > > > > `V{upper_case_GPR_name}` > > > > > > > > > > > > and any mask insn can get the proper width with: > > > > > > `{mask_insn_without_postfix}V` > > > > > > > > > > All macros should be in upper cases. 
> > > > > > > > > > > This commit does not change libc.so > > > > > > > > > > > > Tested build on x86-64 > > > > > > --- > > > > > > sysdeps/x86_64/multiarch/reg-macros.h | 337 ++++++++++++++++++ > > > > > > .../multiarch/scripts/gen-reg-map-macros.py | 97 +++++ > > > > > > 2 files changed, 434 insertions(+) > > > > > > create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h > > > > > > create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py > > > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h > > > > > > new file mode 100644 > > > > > > index 0000000000..c4d7f57b66 > > > > > > --- /dev/null > > > > > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h > > > > > > > > > > vreg-macros.h to indicate macros based on vector size. Please > > > > > add comments to indicate that register macros are expanded based > > > > > on vector size. > > > > > > > > > > > @@ -0,0 +1,337 @@ > > > > > > +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py. > > > > > > + > > > > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > > > > + This file is part of the GNU C Library. > > > > > > + > > > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > > > + modify it under the terms of the GNU Lesser General Public > > > > > > + License as published by the Free Software Foundation; either > > > > > > + version 2.1 of the License, or (at your option) any later version. > > > > > > + > > > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > > > + Lesser General Public License for more details. 
> > > > > > + > > > > > > + You should have received a copy of the GNU Lesser General Public > > > > > > + License along with the GNU C Library; if not, see > > > > > > + <https://www.gnu.org/licenses/>. */ > > > > > > + > > > > > > +#ifndef _REG_MAP_MACROS_H > > > > > > +#define _REG_MAP_MACROS_H 1 > > > > > > + > > > > > > +#define rax_8 al > > > > > > +#define eax_8 al > > > > > > +#define ax_8 al > > > > > > +#define al_8 al > > > > > > +#define rax_16 ax > > > > > > +#define eax_16 ax > > > > > > +#define ax_16 ax > > > > > > +#define al_16 ax > > > > > > +#define rax_32 eax > > > > > > +#define eax_32 eax > > > > > > +#define ax_32 eax > > > > > > +#define al_32 eax > > > > > > +#define rax_64 rax > > > > > > +#define eax_64 rax > > > > > > +#define ax_64 rax > > > > > > +#define al_64 rax > > > > > > > > > > Only rax_32 and rax_64 are needed. > > > > > > > > > > > +#define rbx_8 bl > > > > > > +#define ebx_8 bl > > > > > > +#define bx_8 bl > > > > > > +#define bl_8 bl > > > > > > +#define rbx_16 bx > > > > > > +#define ebx_16 bx > > > > > > +#define bx_16 bx > > > > > > +#define bl_16 bx > > > > > > +#define rbx_32 ebx > > > > > > +#define ebx_32 ebx > > > > > > +#define bx_32 ebx > > > > > > +#define bl_32 ebx > > > > > > +#define rbx_64 rbx > > > > > > +#define ebx_64 rbx > > > > > > +#define bx_64 rbx > > > > > > +#define bl_64 rbx > > > > > > +#define rcx_8 cl > > > > > > +#define ecx_8 cl > > > > > > +#define cx_8 cl > > > > > > +#define cl_8 cl > > > > > > +#define rcx_16 cx > > > > > > +#define ecx_16 cx > > > > > > +#define cx_16 cx > > > > > > +#define cl_16 cx > > > > > > +#define rcx_32 ecx > > > > > > +#define ecx_32 ecx > > > > > > +#define cx_32 ecx > > > > > > +#define cl_32 ecx > > > > > > +#define rcx_64 rcx > > > > > > +#define ecx_64 rcx > > > > > > +#define cx_64 rcx > > > > > > +#define cl_64 rcx > > > > > > +#define rdx_8 dl > > > > > > +#define edx_8 dl > > > > > > +#define dx_8 dl > > > > > > +#define dl_8 dl > > > > > > +#define 
rdx_16 dx > > > > > > +#define edx_16 dx > > > > > > +#define dx_16 dx > > > > > > +#define dl_16 dx > > > > > > +#define rdx_32 edx > > > > > > +#define edx_32 edx > > > > > > +#define dx_32 edx > > > > > > +#define dl_32 edx > > > > > > +#define rdx_64 rdx > > > > > > +#define edx_64 rdx > > > > > > +#define dx_64 rdx > > > > > > +#define dl_64 rdx > > > > > > +#define rbp_8 bpl > > > > > > +#define ebp_8 bpl > > > > > > +#define bp_8 bpl > > > > > > +#define bpl_8 bpl > > > > > > +#define rbp_16 bp > > > > > > +#define ebp_16 bp > > > > > > +#define bp_16 bp > > > > > > +#define bpl_16 bp > > > > > > +#define rbp_32 ebp > > > > > > +#define ebp_32 ebp > > > > > > +#define bp_32 ebp > > > > > > +#define bpl_32 ebp > > > > > > +#define rbp_64 rbp > > > > > > +#define ebp_64 rbp > > > > > > +#define bp_64 rbp > > > > > > +#define bpl_64 rbp > > > > > > +#define rsp_8 spl > > > > > > +#define esp_8 spl > > > > > > +#define sp_8 spl > > > > > > +#define spl_8 spl > > > > > > +#define rsp_16 sp > > > > > > +#define esp_16 sp > > > > > > +#define sp_16 sp > > > > > > +#define spl_16 sp > > > > > > +#define rsp_32 esp > > > > > > +#define esp_32 esp > > > > > > +#define sp_32 esp > > > > > > +#define spl_32 esp > > > > > > +#define rsp_64 rsp > > > > > > +#define esp_64 rsp > > > > > > +#define sp_64 rsp > > > > > > +#define spl_64 rsp > > > > > > +#define rsi_8 sil > > > > > > +#define esi_8 sil > > > > > > +#define si_8 sil > > > > > > +#define sil_8 sil > > > > > > +#define rsi_16 si > > > > > > +#define esi_16 si > > > > > > +#define si_16 si > > > > > > +#define sil_16 si > > > > > > +#define rsi_32 esi > > > > > > +#define esi_32 esi > > > > > > +#define si_32 esi > > > > > > +#define sil_32 esi > > > > > > +#define rsi_64 rsi > > > > > > +#define esi_64 rsi > > > > > > +#define si_64 rsi > > > > > > +#define sil_64 rsi > > > > > > +#define rdi_8 dil > > > > > > +#define edi_8 dil > > > > > > +#define di_8 dil > > > > > > +#define dil_8 dil > > > > > > +#define 
rdi_16 di > > > > > > +#define edi_16 di > > > > > > +#define di_16 di > > > > > > +#define dil_16 di > > > > > > +#define rdi_32 edi > > > > > > +#define edi_32 edi > > > > > > +#define di_32 edi > > > > > > +#define dil_32 edi > > > > > > +#define rdi_64 rdi > > > > > > +#define edi_64 rdi > > > > > > +#define di_64 rdi > > > > > > +#define dil_64 rdi > > > > > > +#define r8_8 r8b > > > > > > +#define r8d_8 r8b > > > > > > +#define r8w_8 r8b > > > > > > +#define r8b_8 r8b > > > > > > +#define r8_16 r8w > > > > > > +#define r8d_16 r8w > > > > > > +#define r8w_16 r8w > > > > > > +#define r8b_16 r8w > > > > > > +#define r8_32 r8d > > > > > > +#define r8d_32 r8d > > > > > > +#define r8w_32 r8d > > > > > > +#define r8b_32 r8d > > > > > > +#define r8_64 r8 > > > > > > +#define r8d_64 r8 > > > > > > +#define r8w_64 r8 > > > > > > +#define r8b_64 r8 > > > > > > +#define r9_8 r9b > > > > > > +#define r9d_8 r9b > > > > > > +#define r9w_8 r9b > > > > > > +#define r9b_8 r9b > > > > > > +#define r9_16 r9w > > > > > > +#define r9d_16 r9w > > > > > > +#define r9w_16 r9w > > > > > > +#define r9b_16 r9w > > > > > > +#define r9_32 r9d > > > > > > +#define r9d_32 r9d > > > > > > +#define r9w_32 r9d > > > > > > +#define r9b_32 r9d > > > > > > +#define r9_64 r9 > > > > > > +#define r9d_64 r9 > > > > > > +#define r9w_64 r9 > > > > > > +#define r9b_64 r9 > > > > > > +#define r10_8 r10b > > > > > > +#define r10d_8 r10b > > > > > > +#define r10w_8 r10b > > > > > > +#define r10b_8 r10b > > > > > > +#define r10_16 r10w > > > > > > +#define r10d_16 r10w > > > > > > +#define r10w_16 r10w > > > > > > +#define r10b_16 r10w > > > > > > +#define r10_32 r10d > > > > > > +#define r10d_32 r10d > > > > > > +#define r10w_32 r10d > > > > > > +#define r10b_32 r10d > > > > > > +#define r10_64 r10 > > > > > > +#define r10d_64 r10 > > > > > > +#define r10w_64 r10 > > > > > > +#define r10b_64 r10 > > > > > > +#define r11_8 r11b > > > > > > +#define r11d_8 r11b > > > > > > +#define r11w_8 r11b > > > > > > 
+#define r11b_8 r11b > > > > > > +#define r11_16 r11w > > > > > > +#define r11d_16 r11w > > > > > > +#define r11w_16 r11w > > > > > > +#define r11b_16 r11w > > > > > > +#define r11_32 r11d > > > > > > +#define r11d_32 r11d > > > > > > +#define r11w_32 r11d > > > > > > +#define r11b_32 r11d > > > > > > +#define r11_64 r11 > > > > > > +#define r11d_64 r11 > > > > > > +#define r11w_64 r11 > > > > > > +#define r11b_64 r11 > > > > > > +#define r12_8 r12b > > > > > > +#define r12d_8 r12b > > > > > > +#define r12w_8 r12b > > > > > > +#define r12b_8 r12b > > > > > > +#define r12_16 r12w > > > > > > +#define r12d_16 r12w > > > > > > +#define r12w_16 r12w > > > > > > +#define r12b_16 r12w > > > > > > +#define r12_32 r12d > > > > > > +#define r12d_32 r12d > > > > > > +#define r12w_32 r12d > > > > > > +#define r12b_32 r12d > > > > > > +#define r12_64 r12 > > > > > > +#define r12d_64 r12 > > > > > > +#define r12w_64 r12 > > > > > > +#define r12b_64 r12 > > > > > > +#define r13_8 r13b > > > > > > +#define r13d_8 r13b > > > > > > +#define r13w_8 r13b > > > > > > +#define r13b_8 r13b > > > > > > +#define r13_16 r13w > > > > > > +#define r13d_16 r13w > > > > > > +#define r13w_16 r13w > > > > > > +#define r13b_16 r13w > > > > > > +#define r13_32 r13d > > > > > > +#define r13d_32 r13d > > > > > > +#define r13w_32 r13d > > > > > > +#define r13b_32 r13d > > > > > > +#define r13_64 r13 > > > > > > +#define r13d_64 r13 > > > > > > +#define r13w_64 r13 > > > > > > +#define r13b_64 r13 > > > > > > +#define r14_8 r14b > > > > > > +#define r14d_8 r14b > > > > > > +#define r14w_8 r14b > > > > > > +#define r14b_8 r14b > > > > > > +#define r14_16 r14w > > > > > > +#define r14d_16 r14w > > > > > > +#define r14w_16 r14w > > > > > > +#define r14b_16 r14w > > > > > > +#define r14_32 r14d > > > > > > +#define r14d_32 r14d > > > > > > +#define r14w_32 r14d > > > > > > +#define r14b_32 r14d > > > > > > +#define r14_64 r14 > > > > > > +#define r14d_64 r14 > > > > > > +#define r14w_64 r14 > > > > > > 
+#define r14b_64 r14 > > > > > > +#define r15_8 r15b > > > > > > +#define r15d_8 r15b > > > > > > +#define r15w_8 r15b > > > > > > +#define r15b_8 r15b > > > > > > +#define r15_16 r15w > > > > > > +#define r15d_16 r15w > > > > > > +#define r15w_16 r15w > > > > > > +#define r15b_16 r15w > > > > > > +#define r15_32 r15d > > > > > > +#define r15d_32 r15d > > > > > > +#define r15w_32 r15d > > > > > > +#define r15b_32 r15d > > > > > > +#define r15_64 r15 > > > > > > +#define r15d_64 r15 > > > > > > +#define r15w_64 r15 > > > > > > +#define r15b_64 r15 > > > > > > + > > > > > > +#define VRAX VGPR(rax) > > > > > > +#define VRBX VGPR(rbx) > > > > > > +#define VRCX VGPR(rcx) > > > > > > +#define VRDX VGPR(rdx) > > > > > > +#define VRBP VGPR(rbp) > > > > > > +#define VRSP VGPR(rsp) > > > > > > +#define VRSI VGPR(rsi) > > > > > > +#define VRDI VGPR(rdi) > > > > > > +#define VR8 VGPR(r8) > > > > > > +#define VR9 VGPR(r9) > > > > > > +#define VR10 VGPR(r10) > > > > > > +#define VR11 VGPR(r11) > > > > > > +#define VR12 VGPR(r12) > > > > > > +#define VR13 VGPR(r13) > > > > > > +#define VR14 VGPR(r14) > > > > > > +#define VR15 VGPR(r15) > > > > > > + > > > > > > +#define kmov_8 kmovb > > > > > > +#define kmov_16 kmovw > > > > > > +#define kmov_32 kmovd > > > > > > +#define kmov_64 kmovq > > > > > > > > > > Only 32 and 64 are needed. > > > > > > > > Thats not entirely true for the wide-char impls. 
> > > > > > > > > > > +#define kortest_8 kortestb > > > > > > +#define kortest_16 kortestw > > > > > > +#define kortest_32 kortestd > > > > > > +#define kortest_64 kortestq > > > > > > +#define kor_8 korb > > > > > > +#define kor_16 korw > > > > > > +#define kor_32 kord > > > > > > +#define kor_64 korq > > > > > > +#define ktest_8 ktestb > > > > > > +#define ktest_16 ktestw > > > > > > +#define ktest_32 ktestd > > > > > > +#define ktest_64 ktestq > > > > > > +#define kand_8 kandb > > > > > > +#define kand_16 kandw > > > > > > +#define kand_32 kandd > > > > > > +#define kand_64 kandq > > > > > > +#define kxor_8 kxorb > > > > > > +#define kxor_16 kxorw > > > > > > +#define kxor_32 kxord > > > > > > +#define kxor_64 kxorq > > > > > > + > > > > > > +#define kmovV VKINSN_SZ(kmov, REG_WIDTH) > > > > > > +#define kortestV VKINSN_SZ(kortest, REG_WIDTH) > > > > > > +#define korV VKINSN_SZ(kor, REG_WIDTH) > > > > > > +#define ktestV VKINSN_SZ(ktest, REG_WIDTH) > > > > > > +#define kandV VKINSN_SZ(kand, REG_WIDTH) > > > > > > +#define kxorV VKINSN_SZ(kxor, REG_WIDTH) > > > > > > > > > > #define VKINSN(op) VKINSN_SZ(op, REG_WIDTH) > > > > > > > > Will fix for V5. > > > > > > > > > > > + > > > > > > +#ifndef REG_WIDTH > > > > > > +#define REG_WIDTH VEC_SIZE > > > > > > > > > > Since REG_WIDTH must be the same as VEC_SIZE, REG_WIDTH > > > > > can be dropped. > > > > > > > > Thats not quite true. > > > > > > > > For wide-char impls REG_WIDTH != VEC_SIZE. > > > > > > These register macros are used to operate vectors. Do you have > > > an example of REG_WIDTH != VEC_SIZE? > > > > But since wide-char instructions use 32-bit comparison the resulting > > mask is < 64-bit i.e: > > > > vpcmpd %zmm16, %zmm17, %k0 > > kmovd %k0, %eax > > will collect all the necessary bits and is prefered. > > > > Next version of Sunil's memchr-evex512 should have it. > > > > So it is based on CHAR_PER_VEC. When will 8-bit and 16-bit > registers be used? In a sense. 
Generally, even if CHAR_PER_VEC < 32, it's better to use 32, but in some cases where you want to use `inc{b|w}` to test for all 1s, it's useful for `VGPR_SZ(rax, CHAR_PER_VEC)` to work. > > H.J.
On Fri, Oct 14, 2022 at 12:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Fri, Oct 14, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Fri, Oct 14, 2022 at 11:38 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > On Fri, Oct 14, 2022 at 1:35 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > > > On Fri, Oct 14, 2022 at 11:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > > > On Fri, Oct 14, 2022 at 1:02 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > > > > > > > On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > > > > > > > This is to make it easier to do think like: > > > > > > > ``` > > > > > > > vpcmpb %VEC(0), %VEC(1), %k0 > > > > > > > kmov{d|q} %k0, %{eax|rax} > > > > > > > test %{eax|rax} > > > > > > > ``` > > > > > > > > > > > > Since all these register macros are based on VEC_SIZE which is either 32 > > > > > > bytes or 64 bytes, only 32-bit or 64-bit integer and mask register macros are > > > > > > needed. 8-bit and 16-bit macros aren't needed. > > > > > > > > > > > > > It adds macro s.t any GPR can get the proper width with: > > > > > > > `V{upper_case_GPR_name}` > > > > > > > > > > > > > > and any mask insn can get the proper width with: > > > > > > > `{mask_insn_without_postfix}V` > > > > > > > > > > > > All macros should be in upper cases. 
> > > > > > > > > > > > > This commit does not change libc.so > > > > > > > > > > > > > > Tested build on x86-64 > > > > > > > --- > > > > > > > sysdeps/x86_64/multiarch/reg-macros.h | 337 ++++++++++++++++++ > > > > > > > .../multiarch/scripts/gen-reg-map-macros.py | 97 +++++ > > > > > > > 2 files changed, 434 insertions(+) > > > > > > > create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h > > > > > > > create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py > > > > > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h > > > > > > > new file mode 100644 > > > > > > > index 0000000000..c4d7f57b66 > > > > > > > --- /dev/null > > > > > > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h > > > > > > > > > > > > vreg-macros.h to indicate macros based on vector size. Please > > > > > > add comments to indicate that register macros are expanded based > > > > > > on vector size. > > > > > > > > > > > > > @@ -0,0 +1,337 @@ > > > > > > > +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py. > > > > > > > + > > > > > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > > > > > + This file is part of the GNU C Library. > > > > > > > + > > > > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > > > > + modify it under the terms of the GNU Lesser General Public > > > > > > > + License as published by the Free Software Foundation; either > > > > > > > + version 2.1 of the License, or (at your option) any later version. > > > > > > > + > > > > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > > > > + Lesser General Public License for more details. 
> > > > > > > + > > > > > > > + You should have received a copy of the GNU Lesser General Public > > > > > > > + License along with the GNU C Library; if not, see > > > > > > > + <https://www.gnu.org/licenses/>. */ > > > > > > > + > > > > > > > +#ifndef _REG_MAP_MACROS_H > > > > > > > +#define _REG_MAP_MACROS_H 1 > > > > > > > + > > > > > > > +#define rax_8 al > > > > > > > +#define eax_8 al > > > > > > > +#define ax_8 al > > > > > > > +#define al_8 al > > > > > > > +#define rax_16 ax > > > > > > > +#define eax_16 ax > > > > > > > +#define ax_16 ax > > > > > > > +#define al_16 ax > > > > > > > +#define rax_32 eax > > > > > > > +#define eax_32 eax > > > > > > > +#define ax_32 eax > > > > > > > +#define al_32 eax > > > > > > > +#define rax_64 rax > > > > > > > +#define eax_64 rax > > > > > > > +#define ax_64 rax > > > > > > > +#define al_64 rax > > > > > > > > > > > > Only rax_32 and rax_64 are needed. > > > > > > > > > > > > > +#define rbx_8 bl > > > > > > > +#define ebx_8 bl > > > > > > > +#define bx_8 bl > > > > > > > +#define bl_8 bl > > > > > > > +#define rbx_16 bx > > > > > > > +#define ebx_16 bx > > > > > > > +#define bx_16 bx > > > > > > > +#define bl_16 bx > > > > > > > +#define rbx_32 ebx > > > > > > > +#define ebx_32 ebx > > > > > > > +#define bx_32 ebx > > > > > > > +#define bl_32 ebx > > > > > > > +#define rbx_64 rbx > > > > > > > +#define ebx_64 rbx > > > > > > > +#define bx_64 rbx > > > > > > > +#define bl_64 rbx > > > > > > > +#define rcx_8 cl > > > > > > > +#define ecx_8 cl > > > > > > > +#define cx_8 cl > > > > > > > +#define cl_8 cl > > > > > > > +#define rcx_16 cx > > > > > > > +#define ecx_16 cx > > > > > > > +#define cx_16 cx > > > > > > > +#define cl_16 cx > > > > > > > +#define rcx_32 ecx > > > > > > > +#define ecx_32 ecx > > > > > > > +#define cx_32 ecx > > > > > > > +#define cl_32 ecx > > > > > > > +#define rcx_64 rcx > > > > > > > +#define ecx_64 rcx > > > > > > > +#define cx_64 rcx > > > > > > > +#define cl_64 rcx > > > > > > > +#define 
rdx_8 dl > > > > > > > +#define edx_8 dl > > > > > > > +#define dx_8 dl > > > > > > > +#define dl_8 dl > > > > > > > +#define rdx_16 dx > > > > > > > +#define edx_16 dx > > > > > > > +#define dx_16 dx > > > > > > > +#define dl_16 dx > > > > > > > +#define rdx_32 edx > > > > > > > +#define edx_32 edx > > > > > > > +#define dx_32 edx > > > > > > > +#define dl_32 edx > > > > > > > +#define rdx_64 rdx > > > > > > > +#define edx_64 rdx > > > > > > > +#define dx_64 rdx > > > > > > > +#define dl_64 rdx > > > > > > > +#define rbp_8 bpl > > > > > > > +#define ebp_8 bpl > > > > > > > +#define bp_8 bpl > > > > > > > +#define bpl_8 bpl > > > > > > > +#define rbp_16 bp > > > > > > > +#define ebp_16 bp > > > > > > > +#define bp_16 bp > > > > > > > +#define bpl_16 bp > > > > > > > +#define rbp_32 ebp > > > > > > > +#define ebp_32 ebp > > > > > > > +#define bp_32 ebp > > > > > > > +#define bpl_32 ebp > > > > > > > +#define rbp_64 rbp > > > > > > > +#define ebp_64 rbp > > > > > > > +#define bp_64 rbp > > > > > > > +#define bpl_64 rbp > > > > > > > +#define rsp_8 spl > > > > > > > +#define esp_8 spl > > > > > > > +#define sp_8 spl > > > > > > > +#define spl_8 spl > > > > > > > +#define rsp_16 sp > > > > > > > +#define esp_16 sp > > > > > > > +#define sp_16 sp > > > > > > > +#define spl_16 sp > > > > > > > +#define rsp_32 esp > > > > > > > +#define esp_32 esp > > > > > > > +#define sp_32 esp > > > > > > > +#define spl_32 esp > > > > > > > +#define rsp_64 rsp > > > > > > > +#define esp_64 rsp > > > > > > > +#define sp_64 rsp > > > > > > > +#define spl_64 rsp > > > > > > > +#define rsi_8 sil > > > > > > > +#define esi_8 sil > > > > > > > +#define si_8 sil > > > > > > > +#define sil_8 sil > > > > > > > +#define rsi_16 si > > > > > > > +#define esi_16 si > > > > > > > +#define si_16 si > > > > > > > +#define sil_16 si > > > > > > > +#define rsi_32 esi > > > > > > > +#define esi_32 esi > > > > > > > +#define si_32 esi > > > > > > > +#define sil_32 esi > > > > > > > +#define rsi_64 rsi > > 
> > > > > +#define esi_64 rsi > > > > > > > +#define si_64 rsi > > > > > > > +#define sil_64 rsi > > > > > > > +#define rdi_8 dil > > > > > > > +#define edi_8 dil > > > > > > > +#define di_8 dil > > > > > > > +#define dil_8 dil > > > > > > > +#define rdi_16 di > > > > > > > +#define edi_16 di > > > > > > > +#define di_16 di > > > > > > > +#define dil_16 di > > > > > > > +#define rdi_32 edi > > > > > > > +#define edi_32 edi > > > > > > > +#define di_32 edi > > > > > > > +#define dil_32 edi > > > > > > > +#define rdi_64 rdi > > > > > > > +#define edi_64 rdi > > > > > > > +#define di_64 rdi > > > > > > > +#define dil_64 rdi > > > > > > > +#define r8_8 r8b > > > > > > > +#define r8d_8 r8b > > > > > > > +#define r8w_8 r8b > > > > > > > +#define r8b_8 r8b > > > > > > > +#define r8_16 r8w > > > > > > > +#define r8d_16 r8w > > > > > > > +#define r8w_16 r8w > > > > > > > +#define r8b_16 r8w > > > > > > > +#define r8_32 r8d > > > > > > > +#define r8d_32 r8d > > > > > > > +#define r8w_32 r8d > > > > > > > +#define r8b_32 r8d > > > > > > > +#define r8_64 r8 > > > > > > > +#define r8d_64 r8 > > > > > > > +#define r8w_64 r8 > > > > > > > +#define r8b_64 r8 > > > > > > > +#define r9_8 r9b > > > > > > > +#define r9d_8 r9b > > > > > > > +#define r9w_8 r9b > > > > > > > +#define r9b_8 r9b > > > > > > > +#define r9_16 r9w > > > > > > > +#define r9d_16 r9w > > > > > > > +#define r9w_16 r9w > > > > > > > +#define r9b_16 r9w > > > > > > > +#define r9_32 r9d > > > > > > > +#define r9d_32 r9d > > > > > > > +#define r9w_32 r9d > > > > > > > +#define r9b_32 r9d > > > > > > > +#define r9_64 r9 > > > > > > > +#define r9d_64 r9 > > > > > > > +#define r9w_64 r9 > > > > > > > +#define r9b_64 r9 > > > > > > > +#define r10_8 r10b > > > > > > > +#define r10d_8 r10b > > > > > > > +#define r10w_8 r10b > > > > > > > +#define r10b_8 r10b > > > > > > > +#define r10_16 r10w > > > > > > > +#define r10d_16 r10w > > > > > > > +#define r10w_16 r10w > > > > > > > +#define r10b_16 r10w > > > > > > > +#define 
r10_32 r10d > > > > > > > +#define r10d_32 r10d > > > > > > > +#define r10w_32 r10d > > > > > > > +#define r10b_32 r10d > > > > > > > +#define r10_64 r10 > > > > > > > +#define r10d_64 r10 > > > > > > > +#define r10w_64 r10 > > > > > > > +#define r10b_64 r10 > > > > > > > +#define r11_8 r11b > > > > > > > +#define r11d_8 r11b > > > > > > > +#define r11w_8 r11b > > > > > > > +#define r11b_8 r11b > > > > > > > +#define r11_16 r11w > > > > > > > +#define r11d_16 r11w > > > > > > > +#define r11w_16 r11w > > > > > > > +#define r11b_16 r11w > > > > > > > +#define r11_32 r11d > > > > > > > +#define r11d_32 r11d > > > > > > > +#define r11w_32 r11d > > > > > > > +#define r11b_32 r11d > > > > > > > +#define r11_64 r11 > > > > > > > +#define r11d_64 r11 > > > > > > > +#define r11w_64 r11 > > > > > > > +#define r11b_64 r11 > > > > > > > +#define r12_8 r12b > > > > > > > +#define r12d_8 r12b > > > > > > > +#define r12w_8 r12b > > > > > > > +#define r12b_8 r12b > > > > > > > +#define r12_16 r12w > > > > > > > +#define r12d_16 r12w > > > > > > > +#define r12w_16 r12w > > > > > > > +#define r12b_16 r12w > > > > > > > +#define r12_32 r12d > > > > > > > +#define r12d_32 r12d > > > > > > > +#define r12w_32 r12d > > > > > > > +#define r12b_32 r12d > > > > > > > +#define r12_64 r12 > > > > > > > +#define r12d_64 r12 > > > > > > > +#define r12w_64 r12 > > > > > > > +#define r12b_64 r12 > > > > > > > +#define r13_8 r13b > > > > > > > +#define r13d_8 r13b > > > > > > > +#define r13w_8 r13b > > > > > > > +#define r13b_8 r13b > > > > > > > +#define r13_16 r13w > > > > > > > +#define r13d_16 r13w > > > > > > > +#define r13w_16 r13w > > > > > > > +#define r13b_16 r13w > > > > > > > +#define r13_32 r13d > > > > > > > +#define r13d_32 r13d > > > > > > > +#define r13w_32 r13d > > > > > > > +#define r13b_32 r13d > > > > > > > +#define r13_64 r13 > > > > > > > +#define r13d_64 r13 > > > > > > > +#define r13w_64 r13 > > > > > > > +#define r13b_64 r13 > > > > > > > +#define r14_8 r14b > > > > > > > 
+#define r14d_8 r14b > > > > > > > +#define r14w_8 r14b > > > > > > > +#define r14b_8 r14b > > > > > > > +#define r14_16 r14w > > > > > > > +#define r14d_16 r14w > > > > > > > +#define r14w_16 r14w > > > > > > > +#define r14b_16 r14w > > > > > > > +#define r14_32 r14d > > > > > > > +#define r14d_32 r14d > > > > > > > +#define r14w_32 r14d > > > > > > > +#define r14b_32 r14d > > > > > > > +#define r14_64 r14 > > > > > > > +#define r14d_64 r14 > > > > > > > +#define r14w_64 r14 > > > > > > > +#define r14b_64 r14 > > > > > > > +#define r15_8 r15b > > > > > > > +#define r15d_8 r15b > > > > > > > +#define r15w_8 r15b > > > > > > > +#define r15b_8 r15b > > > > > > > +#define r15_16 r15w > > > > > > > +#define r15d_16 r15w > > > > > > > +#define r15w_16 r15w > > > > > > > +#define r15b_16 r15w > > > > > > > +#define r15_32 r15d > > > > > > > +#define r15d_32 r15d > > > > > > > +#define r15w_32 r15d > > > > > > > +#define r15b_32 r15d > > > > > > > +#define r15_64 r15 > > > > > > > +#define r15d_64 r15 > > > > > > > +#define r15w_64 r15 > > > > > > > +#define r15b_64 r15 > > > > > > > + > > > > > > > +#define VRAX VGPR(rax) > > > > > > > +#define VRBX VGPR(rbx) > > > > > > > +#define VRCX VGPR(rcx) > > > > > > > +#define VRDX VGPR(rdx) > > > > > > > +#define VRBP VGPR(rbp) > > > > > > > +#define VRSP VGPR(rsp) > > > > > > > +#define VRSI VGPR(rsi) > > > > > > > +#define VRDI VGPR(rdi) > > > > > > > +#define VR8 VGPR(r8) > > > > > > > +#define VR9 VGPR(r9) > > > > > > > +#define VR10 VGPR(r10) > > > > > > > +#define VR11 VGPR(r11) > > > > > > > +#define VR12 VGPR(r12) > > > > > > > +#define VR13 VGPR(r13) > > > > > > > +#define VR14 VGPR(r14) > > > > > > > +#define VR15 VGPR(r15) > > > > > > > + > > > > > > > +#define kmov_8 kmovb > > > > > > > +#define kmov_16 kmovw > > > > > > > +#define kmov_32 kmovd > > > > > > > +#define kmov_64 kmovq > > > > > > > > > > > > Only 32 and 64 are needed. > > > > > > > > > > Thats not entirely true for the wide-char impls. 
> > > > > > > > > > > > > +#define kortest_8 kortestb > > > > > > > +#define kortest_16 kortestw > > > > > > > +#define kortest_32 kortestd > > > > > > > +#define kortest_64 kortestq > > > > > > > +#define kor_8 korb > > > > > > > +#define kor_16 korw > > > > > > > +#define kor_32 kord > > > > > > > +#define kor_64 korq > > > > > > > +#define ktest_8 ktestb > > > > > > > +#define ktest_16 ktestw > > > > > > > +#define ktest_32 ktestd > > > > > > > +#define ktest_64 ktestq > > > > > > > +#define kand_8 kandb > > > > > > > +#define kand_16 kandw > > > > > > > +#define kand_32 kandd > > > > > > > +#define kand_64 kandq > > > > > > > +#define kxor_8 kxorb > > > > > > > +#define kxor_16 kxorw > > > > > > > +#define kxor_32 kxord > > > > > > > +#define kxor_64 kxorq > > > > > > > + > > > > > > > +#define kmovV VKINSN_SZ(kmov, REG_WIDTH) > > > > > > > +#define kortestV VKINSN_SZ(kortest, REG_WIDTH) > > > > > > > +#define korV VKINSN_SZ(kor, REG_WIDTH) > > > > > > > +#define ktestV VKINSN_SZ(ktest, REG_WIDTH) > > > > > > > +#define kandV VKINSN_SZ(kand, REG_WIDTH) > > > > > > > +#define kxorV VKINSN_SZ(kxor, REG_WIDTH) > > > > > > > > > > > > #define VKINSN(op) VKINSN_SZ(op, REG_WIDTH) > > > > > > > > > > Will fix for V5. > > > > > > > > > > > > > + > > > > > > > +#ifndef REG_WIDTH > > > > > > > +#define REG_WIDTH VEC_SIZE > > > > > > > > > > > > Since REG_WIDTH must be the same as VEC_SIZE, REG_WIDTH > > > > > > can be dropped. > > > > > > > > > > Thats not quite true. > > > > > > > > > > For wide-char impls REG_WIDTH != VEC_SIZE. > > > > > > > > These register macros are used to operate vectors. Do you have > > > > an example of REG_WIDTH != VEC_SIZE? > > > > > > But since wide-char instructions use 32-bit comparison the resulting > > > mask is < 64-bit i.e: > > > > > > vpcmpd %zmm16, %zmm17, %k0 > > > kmovd %k0, %eax > > > will collect all the necessary bits and is prefered. > > > > > > Next version of Sunil's memchr-evex512 should have it. 
> > > > > > > So it is based on CHAR_PER_VEC. When will 8-bit and 16-bit > > registers be used? > > In a sense. Generally, even if CHAR_PER_VEC < 32, it's better to use > 32, but in some cases where you want to use `inc{b|w}` to test for > all 1s it's useful for `VGPR_SZ(rax, CHAR_PER_VEC)` to work. We only need #define rax_8 al not #define eax_8 al #define ax_8 al #define al_8 al
On Fri, Oct 14, 2022 at 2:13 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Fri, Oct 14, 2022 at 12:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Fri, Oct 14, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > On Fri, Oct 14, 2022 at 11:38 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > On Fri, Oct 14, 2022 at 1:35 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > > > > > On Fri, Oct 14, 2022 at 11:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > > > > > On Fri, Oct 14, 2022 at 1:02 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > > > > > > > > > On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > > > > > > > > > This is to make it easier to do things like: > > > > > > > > ``` > > > > > > > > vpcmpb %VEC(0), %VEC(1), %k0 > > > > > > > > kmov{d|q} %k0, %{eax|rax} > > > > > > > > test %{eax|rax} > > > > > > > > ``` > > > > > > > > > > > > > > Since all these register macros are based on VEC_SIZE which is either 32 > > > > > > > bytes or 64 bytes, only 32-bit or 64-bit integer and mask register macros are > > > > > > > needed. 8-bit and 16-bit macros aren't needed. > > > > > > > > > > > > > > > It adds macros such that any GPR can get the proper width with: > > > > > > > > `V{upper_case_GPR_name}` > > > > > > > > > > > > > > > > and any mask insn can get the proper width with: > > > > > > > > `{mask_insn_without_postfix}V` > > > > > > > > > > > > > > All macros should be in upper case. 
> > > > > > > > > > > > > > > This commit does not change libc.so > > > > > > > > > > > > > > > > Tested build on x86-64 > > > > > > > > --- > > > > > > > > sysdeps/x86_64/multiarch/reg-macros.h | 337 ++++++++++++++++++ > > > > > > > > .../multiarch/scripts/gen-reg-map-macros.py | 97 +++++ > > > > > > > > 2 files changed, 434 insertions(+) > > > > > > > > create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h > > > > > > > > create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py > > > > > > > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h > > > > > > > > new file mode 100644 > > > > > > > > index 0000000000..c4d7f57b66 > > > > > > > > --- /dev/null > > > > > > > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h > > > > > > > > > > > > > > vreg-macros.h to indicate macros based on vector size. Please > > > > > > > add comments to indicate that register macros are expanded based > > > > > > > on vector size. > > > > > > > > > > > > > > > @@ -0,0 +1,337 @@ > > > > > > > > +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py. > > > > > > > > + > > > > > > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > > > > > > + This file is part of the GNU C Library. > > > > > > > > + > > > > > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > > > > > + modify it under the terms of the GNU Lesser General Public > > > > > > > > + License as published by the Free Software Foundation; either > > > > > > > > + version 2.1 of the License, or (at your option) any later version. > > > > > > > > + > > > > > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > > > > > + Lesser General Public License for more details. 
> > > > > > > > + > > > > > > > > + You should have received a copy of the GNU Lesser General Public > > > > > > > > + License along with the GNU C Library; if not, see > > > > > > > > + <https://www.gnu.org/licenses/>. */ > > > > > > > > + > > > > > > > > +#ifndef _REG_MAP_MACROS_H > > > > > > > > +#define _REG_MAP_MACROS_H 1 > > > > > > > > + > > > > > > > > +#define rax_8 al > > > > > > > > +#define eax_8 al > > > > > > > > +#define ax_8 al > > > > > > > > +#define al_8 al > > > > > > > > +#define rax_16 ax > > > > > > > > +#define eax_16 ax > > > > > > > > +#define ax_16 ax > > > > > > > > +#define al_16 ax > > > > > > > > +#define rax_32 eax > > > > > > > > +#define eax_32 eax > > > > > > > > +#define ax_32 eax > > > > > > > > +#define al_32 eax > > > > > > > > +#define rax_64 rax > > > > > > > > +#define eax_64 rax > > > > > > > > +#define ax_64 rax > > > > > > > > +#define al_64 rax > > > > > > > > > > > > > > Only rax_32 and rax_64 are needed. > > > > > > > > > > > > > > > +#define rbx_8 bl > > > > > > > > +#define ebx_8 bl > > > > > > > > +#define bx_8 bl > > > > > > > > +#define bl_8 bl > > > > > > > > +#define rbx_16 bx > > > > > > > > +#define ebx_16 bx > > > > > > > > +#define bx_16 bx > > > > > > > > +#define bl_16 bx > > > > > > > > +#define rbx_32 ebx > > > > > > > > +#define ebx_32 ebx > > > > > > > > +#define bx_32 ebx > > > > > > > > +#define bl_32 ebx > > > > > > > > +#define rbx_64 rbx > > > > > > > > +#define ebx_64 rbx > > > > > > > > +#define bx_64 rbx > > > > > > > > +#define bl_64 rbx > > > > > > > > +#define rcx_8 cl > > > > > > > > +#define ecx_8 cl > > > > > > > > +#define cx_8 cl > > > > > > > > +#define cl_8 cl > > > > > > > > +#define rcx_16 cx > > > > > > > > +#define ecx_16 cx > > > > > > > > +#define cx_16 cx > > > > > > > > +#define cl_16 cx > > > > > > > > +#define rcx_32 ecx > > > > > > > > +#define ecx_32 ecx > > > > > > > > +#define cx_32 ecx > > > > > > > > +#define cl_32 ecx > > > > > > > > +#define rcx_64 rcx > > > > > > > 
> +#define ecx_64 rcx > > > > > > > > +#define cx_64 rcx > > > > > > > > +#define cl_64 rcx > > > > > > > > +#define rdx_8 dl > > > > > > > > +#define edx_8 dl > > > > > > > > +#define dx_8 dl > > > > > > > > +#define dl_8 dl > > > > > > > > +#define rdx_16 dx > > > > > > > > +#define edx_16 dx > > > > > > > > +#define dx_16 dx > > > > > > > > +#define dl_16 dx > > > > > > > > +#define rdx_32 edx > > > > > > > > +#define edx_32 edx > > > > > > > > +#define dx_32 edx > > > > > > > > +#define dl_32 edx > > > > > > > > +#define rdx_64 rdx > > > > > > > > +#define edx_64 rdx > > > > > > > > +#define dx_64 rdx > > > > > > > > +#define dl_64 rdx > > > > > > > > +#define rbp_8 bpl > > > > > > > > +#define ebp_8 bpl > > > > > > > > +#define bp_8 bpl > > > > > > > > +#define bpl_8 bpl > > > > > > > > +#define rbp_16 bp > > > > > > > > +#define ebp_16 bp > > > > > > > > +#define bp_16 bp > > > > > > > > +#define bpl_16 bp > > > > > > > > +#define rbp_32 ebp > > > > > > > > +#define ebp_32 ebp > > > > > > > > +#define bp_32 ebp > > > > > > > > +#define bpl_32 ebp > > > > > > > > +#define rbp_64 rbp > > > > > > > > +#define ebp_64 rbp > > > > > > > > +#define bp_64 rbp > > > > > > > > +#define bpl_64 rbp > > > > > > > > +#define rsp_8 spl > > > > > > > > +#define esp_8 spl > > > > > > > > +#define sp_8 spl > > > > > > > > +#define spl_8 spl > > > > > > > > +#define rsp_16 sp > > > > > > > > +#define esp_16 sp > > > > > > > > +#define sp_16 sp > > > > > > > > +#define spl_16 sp > > > > > > > > +#define rsp_32 esp > > > > > > > > +#define esp_32 esp > > > > > > > > +#define sp_32 esp > > > > > > > > +#define spl_32 esp > > > > > > > > +#define rsp_64 rsp > > > > > > > > +#define esp_64 rsp > > > > > > > > +#define sp_64 rsp > > > > > > > > +#define spl_64 rsp > > > > > > > > +#define rsi_8 sil > > > > > > > > +#define esi_8 sil > > > > > > > > +#define si_8 sil > > > > > > > > +#define sil_8 sil > > > > > > > > +#define rsi_16 si > > > > > > > > +#define esi_16 si > > > > > > > 
> +#define si_16 si > > > > > > > > +#define sil_16 si > > > > > > > > +#define rsi_32 esi > > > > > > > > +#define esi_32 esi > > > > > > > > +#define si_32 esi > > > > > > > > +#define sil_32 esi > > > > > > > > +#define rsi_64 rsi > > > > > > > > +#define esi_64 rsi > > > > > > > > +#define si_64 rsi > > > > > > > > +#define sil_64 rsi > > > > > > > > +#define rdi_8 dil > > > > > > > > +#define edi_8 dil > > > > > > > > +#define di_8 dil > > > > > > > > +#define dil_8 dil > > > > > > > > +#define rdi_16 di > > > > > > > > +#define edi_16 di > > > > > > > > +#define di_16 di > > > > > > > > +#define dil_16 di > > > > > > > > +#define rdi_32 edi > > > > > > > > +#define edi_32 edi > > > > > > > > +#define di_32 edi > > > > > > > > +#define dil_32 edi > > > > > > > > +#define rdi_64 rdi > > > > > > > > +#define edi_64 rdi > > > > > > > > +#define di_64 rdi > > > > > > > > +#define dil_64 rdi > > > > > > > > +#define r8_8 r8b > > > > > > > > +#define r8d_8 r8b > > > > > > > > +#define r8w_8 r8b > > > > > > > > +#define r8b_8 r8b > > > > > > > > +#define r8_16 r8w > > > > > > > > +#define r8d_16 r8w > > > > > > > > +#define r8w_16 r8w > > > > > > > > +#define r8b_16 r8w > > > > > > > > +#define r8_32 r8d > > > > > > > > +#define r8d_32 r8d > > > > > > > > +#define r8w_32 r8d > > > > > > > > +#define r8b_32 r8d > > > > > > > > +#define r8_64 r8 > > > > > > > > +#define r8d_64 r8 > > > > > > > > +#define r8w_64 r8 > > > > > > > > +#define r8b_64 r8 > > > > > > > > +#define r9_8 r9b > > > > > > > > +#define r9d_8 r9b > > > > > > > > +#define r9w_8 r9b > > > > > > > > +#define r9b_8 r9b > > > > > > > > +#define r9_16 r9w > > > > > > > > +#define r9d_16 r9w > > > > > > > > +#define r9w_16 r9w > > > > > > > > +#define r9b_16 r9w > > > > > > > > +#define r9_32 r9d > > > > > > > > +#define r9d_32 r9d > > > > > > > > +#define r9w_32 r9d > > > > > > > > +#define r9b_32 r9d > > > > > > > > +#define r9_64 r9 > > > > > > > > +#define r9d_64 r9 > > > > > > > > +#define r9w_64 r9 > 
> > > > > > > +#define r9b_64 r9 > > > > > > > > +#define r10_8 r10b > > > > > > > > +#define r10d_8 r10b > > > > > > > > +#define r10w_8 r10b > > > > > > > > +#define r10b_8 r10b > > > > > > > > +#define r10_16 r10w > > > > > > > > +#define r10d_16 r10w > > > > > > > > +#define r10w_16 r10w > > > > > > > > +#define r10b_16 r10w > > > > > > > > +#define r10_32 r10d > > > > > > > > +#define r10d_32 r10d > > > > > > > > +#define r10w_32 r10d > > > > > > > > +#define r10b_32 r10d > > > > > > > > +#define r10_64 r10 > > > > > > > > +#define r10d_64 r10 > > > > > > > > +#define r10w_64 r10 > > > > > > > > +#define r10b_64 r10 > > > > > > > > +#define r11_8 r11b > > > > > > > > +#define r11d_8 r11b > > > > > > > > +#define r11w_8 r11b > > > > > > > > +#define r11b_8 r11b > > > > > > > > +#define r11_16 r11w > > > > > > > > +#define r11d_16 r11w > > > > > > > > +#define r11w_16 r11w > > > > > > > > +#define r11b_16 r11w > > > > > > > > +#define r11_32 r11d > > > > > > > > +#define r11d_32 r11d > > > > > > > > +#define r11w_32 r11d > > > > > > > > +#define r11b_32 r11d > > > > > > > > +#define r11_64 r11 > > > > > > > > +#define r11d_64 r11 > > > > > > > > +#define r11w_64 r11 > > > > > > > > +#define r11b_64 r11 > > > > > > > > +#define r12_8 r12b > > > > > > > > +#define r12d_8 r12b > > > > > > > > +#define r12w_8 r12b > > > > > > > > +#define r12b_8 r12b > > > > > > > > +#define r12_16 r12w > > > > > > > > +#define r12d_16 r12w > > > > > > > > +#define r12w_16 r12w > > > > > > > > +#define r12b_16 r12w > > > > > > > > +#define r12_32 r12d > > > > > > > > +#define r12d_32 r12d > > > > > > > > +#define r12w_32 r12d > > > > > > > > +#define r12b_32 r12d > > > > > > > > +#define r12_64 r12 > > > > > > > > +#define r12d_64 r12 > > > > > > > > +#define r12w_64 r12 > > > > > > > > +#define r12b_64 r12 > > > > > > > > +#define r13_8 r13b > > > > > > > > +#define r13d_8 r13b > > > > > > > > +#define r13w_8 r13b > > > > > > > > +#define r13b_8 r13b > > > > > > > > +#define r13_16 
r13w > > > > > > > > +#define r13d_16 r13w > > > > > > > > +#define r13w_16 r13w > > > > > > > > +#define r13b_16 r13w > > > > > > > > +#define r13_32 r13d > > > > > > > > +#define r13d_32 r13d > > > > > > > > +#define r13w_32 r13d > > > > > > > > +#define r13b_32 r13d > > > > > > > > +#define r13_64 r13 > > > > > > > > +#define r13d_64 r13 > > > > > > > > +#define r13w_64 r13 > > > > > > > > +#define r13b_64 r13 > > > > > > > > +#define r14_8 r14b > > > > > > > > +#define r14d_8 r14b > > > > > > > > +#define r14w_8 r14b > > > > > > > > +#define r14b_8 r14b > > > > > > > > +#define r14_16 r14w > > > > > > > > +#define r14d_16 r14w > > > > > > > > +#define r14w_16 r14w > > > > > > > > +#define r14b_16 r14w > > > > > > > > +#define r14_32 r14d > > > > > > > > +#define r14d_32 r14d > > > > > > > > +#define r14w_32 r14d > > > > > > > > +#define r14b_32 r14d > > > > > > > > +#define r14_64 r14 > > > > > > > > +#define r14d_64 r14 > > > > > > > > +#define r14w_64 r14 > > > > > > > > +#define r14b_64 r14 > > > > > > > > +#define r15_8 r15b > > > > > > > > +#define r15d_8 r15b > > > > > > > > +#define r15w_8 r15b > > > > > > > > +#define r15b_8 r15b > > > > > > > > +#define r15_16 r15w > > > > > > > > +#define r15d_16 r15w > > > > > > > > +#define r15w_16 r15w > > > > > > > > +#define r15b_16 r15w > > > > > > > > +#define r15_32 r15d > > > > > > > > +#define r15d_32 r15d > > > > > > > > +#define r15w_32 r15d > > > > > > > > +#define r15b_32 r15d > > > > > > > > +#define r15_64 r15 > > > > > > > > +#define r15d_64 r15 > > > > > > > > +#define r15w_64 r15 > > > > > > > > +#define r15b_64 r15 > > > > > > > > + > > > > > > > > +#define VRAX VGPR(rax) > > > > > > > > +#define VRBX VGPR(rbx) > > > > > > > > +#define VRCX VGPR(rcx) > > > > > > > > +#define VRDX VGPR(rdx) > > > > > > > > +#define VRBP VGPR(rbp) > > > > > > > > +#define VRSP VGPR(rsp) > > > > > > > > +#define VRSI VGPR(rsi) > > > > > > > > +#define VRDI VGPR(rdi) > > > > > > > > +#define VR8 VGPR(r8) > > > > > > > 
> +#define VR9 VGPR(r9) > > > > > > > > +#define VR10 VGPR(r10) > > > > > > > > +#define VR11 VGPR(r11) > > > > > > > > +#define VR12 VGPR(r12) > > > > > > > > +#define VR13 VGPR(r13) > > > > > > > > +#define VR14 VGPR(r14) > > > > > > > > +#define VR15 VGPR(r15) > > > > > > > > + > > > > > > > > +#define kmov_8 kmovb > > > > > > > > +#define kmov_16 kmovw > > > > > > > > +#define kmov_32 kmovd > > > > > > > > +#define kmov_64 kmovq > > > > > > > > > > > > > > Only 32 and 64 are needed. > > > > > > > > > > > > Thats not entirely true for the wide-char impls. > > > > > > > > > > > > > > > +#define kortest_8 kortestb > > > > > > > > +#define kortest_16 kortestw > > > > > > > > +#define kortest_32 kortestd > > > > > > > > +#define kortest_64 kortestq > > > > > > > > +#define kor_8 korb > > > > > > > > +#define kor_16 korw > > > > > > > > +#define kor_32 kord > > > > > > > > +#define kor_64 korq > > > > > > > > +#define ktest_8 ktestb > > > > > > > > +#define ktest_16 ktestw > > > > > > > > +#define ktest_32 ktestd > > > > > > > > +#define ktest_64 ktestq > > > > > > > > +#define kand_8 kandb > > > > > > > > +#define kand_16 kandw > > > > > > > > +#define kand_32 kandd > > > > > > > > +#define kand_64 kandq > > > > > > > > +#define kxor_8 kxorb > > > > > > > > +#define kxor_16 kxorw > > > > > > > > +#define kxor_32 kxord > > > > > > > > +#define kxor_64 kxorq > > > > > > > > + > > > > > > > > +#define kmovV VKINSN_SZ(kmov, REG_WIDTH) > > > > > > > > +#define kortestV VKINSN_SZ(kortest, REG_WIDTH) > > > > > > > > +#define korV VKINSN_SZ(kor, REG_WIDTH) > > > > > > > > +#define ktestV VKINSN_SZ(ktest, REG_WIDTH) > > > > > > > > +#define kandV VKINSN_SZ(kand, REG_WIDTH) > > > > > > > > +#define kxorV VKINSN_SZ(kxor, REG_WIDTH) > > > > > > > > > > > > > > #define VKINSN(op) VKINSN_SZ(op, REG_WIDTH) > > > > > > > > > > > > Will fix for V5. 
> > > > > > > > > > > > > > > + > > > > > > > > +#ifndef REG_WIDTH > > > > > > > > +#define REG_WIDTH VEC_SIZE > > > > > > > > > > > > > > Since REG_WIDTH must be the same as VEC_SIZE, REG_WIDTH > > > > > > > can be dropped. > > > > > > > > > > > > That's not quite true. > > > > > > > > > > > > For wide-char impls REG_WIDTH != VEC_SIZE. > > > > > > > > > > These register macros are used to operate on vectors. Do you have > > > > > an example of REG_WIDTH != VEC_SIZE? > > > > > > > > But since wide-char instructions use 32-bit comparison the resulting > > > > mask is < 64-bit, i.e.: > > > > > > > > vpcmpd %zmm16, %zmm17, %k0 > > > > kmovd %k0, %eax > > > > will collect all the necessary bits and is preferred. > > > > > > > > Next version of Sunil's memchr-evex512 should have it. > > > > > > > > > > So it is based on CHAR_PER_VEC. When will 8-bit and 16-bit > > > registers be used? > > > > In a sense. Generally, even if CHAR_PER_VEC < 32, it's better to use > > 32, but in some cases where you want to use `inc{b|w}` to test for > > all 1s it's useful for `VGPR_SZ(rax, CHAR_PER_VEC)` to work. > > We only need > > #define rax_8 al > > not > > #define eax_8 al > #define ax_8 al > #define al_8 al > That's fair. I guess I was thinking this would be the easiest as you can replace any but would you prefer for these only to apply to 64-bit variants? > -- > H.J.
diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h new file mode 100644 index 0000000000..c4d7f57b66 --- /dev/null +++ b/sysdeps/x86_64/multiarch/reg-macros.h @@ -0,0 +1,337 @@ +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py. + + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#ifndef _REG_MAP_MACROS_H +#define _REG_MAP_MACROS_H 1 + +#define rax_8 al +#define eax_8 al +#define ax_8 al +#define al_8 al +#define rax_16 ax +#define eax_16 ax +#define ax_16 ax +#define al_16 ax +#define rax_32 eax +#define eax_32 eax +#define ax_32 eax +#define al_32 eax +#define rax_64 rax +#define eax_64 rax +#define ax_64 rax +#define al_64 rax +#define rbx_8 bl +#define ebx_8 bl +#define bx_8 bl +#define bl_8 bl +#define rbx_16 bx +#define ebx_16 bx +#define bx_16 bx +#define bl_16 bx +#define rbx_32 ebx +#define ebx_32 ebx +#define bx_32 ebx +#define bl_32 ebx +#define rbx_64 rbx +#define ebx_64 rbx +#define bx_64 rbx +#define bl_64 rbx +#define rcx_8 cl +#define ecx_8 cl +#define cx_8 cl +#define cl_8 cl +#define rcx_16 cx +#define ecx_16 cx +#define cx_16 cx +#define cl_16 cx +#define rcx_32 ecx +#define ecx_32 ecx +#define cx_32 ecx +#define cl_32 ecx +#define rcx_64 rcx +#define ecx_64 rcx +#define cx_64 rcx +#define cl_64 rcx +#define rdx_8 dl +#define edx_8 dl +#define dx_8 dl +#define dl_8 dl +#define rdx_16 dx +#define edx_16 dx +#define dx_16 dx +#define dl_16 dx +#define rdx_32 edx +#define edx_32 edx +#define dx_32 edx +#define dl_32 edx +#define rdx_64 rdx +#define edx_64 rdx +#define dx_64 rdx +#define dl_64 rdx +#define rbp_8 bpl +#define ebp_8 bpl +#define bp_8 bpl +#define bpl_8 bpl +#define rbp_16 bp +#define ebp_16 bp +#define bp_16 bp +#define bpl_16 bp +#define rbp_32 ebp +#define ebp_32 ebp +#define bp_32 ebp +#define bpl_32 ebp +#define rbp_64 rbp +#define ebp_64 rbp +#define bp_64 rbp +#define bpl_64 rbp +#define rsp_8 spl +#define esp_8 spl +#define sp_8 spl +#define spl_8 spl +#define rsp_16 sp +#define esp_16 sp +#define sp_16 sp +#define spl_16 sp +#define rsp_32 esp +#define esp_32 esp +#define sp_32 esp +#define spl_32 esp +#define rsp_64 rsp +#define esp_64 rsp +#define sp_64 rsp +#define spl_64 rsp +#define rsi_8 sil +#define esi_8 sil +#define si_8 sil +#define sil_8 sil +#define rsi_16 si +#define esi_16 si 
+#define si_16 si +#define sil_16 si +#define rsi_32 esi +#define esi_32 esi +#define si_32 esi +#define sil_32 esi +#define rsi_64 rsi +#define esi_64 rsi +#define si_64 rsi +#define sil_64 rsi +#define rdi_8 dil +#define edi_8 dil +#define di_8 dil +#define dil_8 dil +#define rdi_16 di +#define edi_16 di +#define di_16 di +#define dil_16 di +#define rdi_32 edi +#define edi_32 edi +#define di_32 edi +#define dil_32 edi +#define rdi_64 rdi +#define edi_64 rdi +#define di_64 rdi +#define dil_64 rdi +#define r8_8 r8b +#define r8d_8 r8b +#define r8w_8 r8b +#define r8b_8 r8b +#define r8_16 r8w +#define r8d_16 r8w +#define r8w_16 r8w +#define r8b_16 r8w +#define r8_32 r8d +#define r8d_32 r8d +#define r8w_32 r8d +#define r8b_32 r8d +#define r8_64 r8 +#define r8d_64 r8 +#define r8w_64 r8 +#define r8b_64 r8 +#define r9_8 r9b +#define r9d_8 r9b +#define r9w_8 r9b +#define r9b_8 r9b +#define r9_16 r9w +#define r9d_16 r9w +#define r9w_16 r9w +#define r9b_16 r9w +#define r9_32 r9d +#define r9d_32 r9d +#define r9w_32 r9d +#define r9b_32 r9d +#define r9_64 r9 +#define r9d_64 r9 +#define r9w_64 r9 +#define r9b_64 r9 +#define r10_8 r10b +#define r10d_8 r10b +#define r10w_8 r10b +#define r10b_8 r10b +#define r10_16 r10w +#define r10d_16 r10w +#define r10w_16 r10w +#define r10b_16 r10w +#define r10_32 r10d +#define r10d_32 r10d +#define r10w_32 r10d +#define r10b_32 r10d +#define r10_64 r10 +#define r10d_64 r10 +#define r10w_64 r10 +#define r10b_64 r10 +#define r11_8 r11b +#define r11d_8 r11b +#define r11w_8 r11b +#define r11b_8 r11b +#define r11_16 r11w +#define r11d_16 r11w +#define r11w_16 r11w +#define r11b_16 r11w +#define r11_32 r11d +#define r11d_32 r11d +#define r11w_32 r11d +#define r11b_32 r11d +#define r11_64 r11 +#define r11d_64 r11 +#define r11w_64 r11 +#define r11b_64 r11 +#define r12_8 r12b +#define r12d_8 r12b +#define r12w_8 r12b +#define r12b_8 r12b +#define r12_16 r12w +#define r12d_16 r12w +#define r12w_16 r12w +#define r12b_16 r12w +#define r12_32 r12d +#define 
r12d_32 r12d +#define r12w_32 r12d +#define r12b_32 r12d +#define r12_64 r12 +#define r12d_64 r12 +#define r12w_64 r12 +#define r12b_64 r12 +#define r13_8 r13b +#define r13d_8 r13b +#define r13w_8 r13b +#define r13b_8 r13b +#define r13_16 r13w +#define r13d_16 r13w +#define r13w_16 r13w +#define r13b_16 r13w +#define r13_32 r13d +#define r13d_32 r13d +#define r13w_32 r13d +#define r13b_32 r13d +#define r13_64 r13 +#define r13d_64 r13 +#define r13w_64 r13 +#define r13b_64 r13 +#define r14_8 r14b +#define r14d_8 r14b +#define r14w_8 r14b +#define r14b_8 r14b +#define r14_16 r14w +#define r14d_16 r14w +#define r14w_16 r14w +#define r14b_16 r14w +#define r14_32 r14d +#define r14d_32 r14d +#define r14w_32 r14d +#define r14b_32 r14d +#define r14_64 r14 +#define r14d_64 r14 +#define r14w_64 r14 +#define r14b_64 r14 +#define r15_8 r15b +#define r15d_8 r15b +#define r15w_8 r15b +#define r15b_8 r15b +#define r15_16 r15w +#define r15d_16 r15w +#define r15w_16 r15w +#define r15b_16 r15w +#define r15_32 r15d +#define r15d_32 r15d +#define r15w_32 r15d +#define r15b_32 r15d +#define r15_64 r15 +#define r15d_64 r15 +#define r15w_64 r15 +#define r15b_64 r15 + +#define VRAX VGPR(rax) +#define VRBX VGPR(rbx) +#define VRCX VGPR(rcx) +#define VRDX VGPR(rdx) +#define VRBP VGPR(rbp) +#define VRSP VGPR(rsp) +#define VRSI VGPR(rsi) +#define VRDI VGPR(rdi) +#define VR8 VGPR(r8) +#define VR9 VGPR(r9) +#define VR10 VGPR(r10) +#define VR11 VGPR(r11) +#define VR12 VGPR(r12) +#define VR13 VGPR(r13) +#define VR14 VGPR(r14) +#define VR15 VGPR(r15) + +#define kmov_8 kmovb +#define kmov_16 kmovw +#define kmov_32 kmovd +#define kmov_64 kmovq +#define kortest_8 kortestb +#define kortest_16 kortestw +#define kortest_32 kortestd +#define kortest_64 kortestq +#define kor_8 korb +#define kor_16 korw +#define kor_32 kord +#define kor_64 korq +#define ktest_8 ktestb +#define ktest_16 ktestw +#define ktest_32 ktestd +#define ktest_64 ktestq +#define kand_8 kandb +#define kand_16 kandw +#define kand_32 kandd 
+#define kand_64 kandq +#define kxor_8 kxorb +#define kxor_16 kxorw +#define kxor_32 kxord +#define kxor_64 kxorq + +#define kmovV VKINSN_SZ(kmov, REG_WIDTH) +#define kortestV VKINSN_SZ(kortest, REG_WIDTH) +#define korV VKINSN_SZ(kor, REG_WIDTH) +#define ktestV VKINSN_SZ(ktest, REG_WIDTH) +#define kandV VKINSN_SZ(kand, REG_WIDTH) +#define kxorV VKINSN_SZ(kxor, REG_WIDTH) + +#ifndef REG_WIDTH +#define REG_WIDTH VEC_SIZE +#endif +#define PRIM_VGPR_SZ(reg_name, reg_size) reg_name##_##reg_size +#define VGPR_SZ(reg_name, reg_size) PRIM_VGPR_SZ(reg_name, reg_size) +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH) +#define VKINSN_SZ(insn, reg_size) PRIM_VGPR_SZ(insn, reg_size) + +#endif diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py new file mode 100644 index 0000000000..5b04e89ecb --- /dev/null +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py @@ -0,0 +1,97 @@ +#!/usr/bin/python3 +# Copyright (C) 2022 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <https://www.gnu.org/licenses/>. 
"""Generate macros for getting GPR name of a certain size

Inputs: None
Output: Prints header file to stdout

API:
    VGPR(reg_name)
        - Get register name REG_WIDTH component of `reg_name`
          (REG_WIDTH defaults to VEC_SIZE when not already defined)
    VGPR_SZ(reg_name, reg_size)
        - Get register name `reg_size` component of `reg_name`
    V{UPPER_CASE_GPR_NAME}
        - Shorthand for VGPR({64-bit gpr name}), e.g. VRAX, VR8
    {mask_insn}V
        - Get `mask_insn` with the size postfix selected by REG_WIDTH,
          e.g. kmovV, kortestV
"""

import sys
from datetime import datetime

# Every row lists the names of one general purpose register ordered from
# the widest (64-bit) to the narrowest (8-bit) view.
registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]

# Mask instruction stems and the postfixes for 8/16/32/64-bit operands
# (index i corresponds to a width of `8 << i`).
mask_insns = ["kmov", "kortest", "kor", "ktest", "kand", "kxor"]
mask_insns_ext = ["b", "w", "d", "q"]

# Remainder of the header's opening comment; `{}` receives the copyright
# year.  The comment itself is opened by the "generated by" line.
cr = """
   Copyright (C) {} Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
"""


def generate_header(generator, year):
    """Return the complete text of the register-map macro header.

    Args:
        generator: Name recorded in the "This file was generated by" line.
        year: Copyright year substituted into the license comment.

    Returns:
        The header as a single string terminated by a newline.
    """
    out = [
        "/* This file was generated by: {}.".format(generator),
        cr.format(year),
        "#ifndef _REG_MAP_MACROS_H",
        "#define _REG_MAP_MACROS_H\t1\n",
    ]

    # {any name of a register}_{width} -> name of that register at `width`.
    # Rows are ordered widest first, so walking a row backwards yields the
    # 8-, 16-, 32- and 64-bit names in turn.
    for reg in registers:
        for i, sized_name in enumerate(reversed(reg)):
            width = 8 << i
            out.extend("#define {}_{}\t{}".format(alias, width, sized_name)
                       for alias in reg)

    out.append("")
    out.extend("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0])
               for reg in registers)

    # {mask insn}_{width} -> mask insn carrying the matching size postfix.
    out.append("")
    for mask_insn in mask_insns:
        out.extend("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
                                                ext)
                   for i, ext in enumerate(mask_insns_ext))

    out.append("")
    out.extend("#define {}V \tVKINSN_SZ({}, REG_WIDTH)".format(mask_insn,
                                                              mask_insn)
               for mask_insn in mask_insns)
    out.append("")

    # The helper macros may follow their users: the C preprocessor only
    # expands macro bodies at the point of use, not at definition.
    out.extend([
        "#ifndef REG_WIDTH",
        "#define REG_WIDTH VEC_SIZE",
        "#endif",
        "#define PRIM_VGPR_SZ(reg_name, reg_size)\treg_name##_##reg_size",
        "#define VGPR_SZ(reg_name, reg_size)\tPRIM_VGPR_SZ(reg_name, reg_size)",
        "#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)",
        "#define VKINSN_SZ(insn, reg_size)\tPRIM_VGPR_SZ(insn, reg_size)",
        "\n#endif",
    ])
    return "\n".join(out) + "\n"


def main():
    """Write the generated header to stdout."""
    sys.stdout.write(generate_header(sys.argv[0], datetime.today().year))


if __name__ == "__main__":
    main()