From patchwork Sat Oct 15 00:06:13 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 1690237 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@legolas.ozlabs.org Authentication-Results: legolas.ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=sourceware.org (client-ip=2620:52:3:1:0:246e:9693:128c; helo=sourceware.org; envelope-from=libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org; receiver=) Authentication-Results: legolas.ozlabs.org; dkim=pass (1024-bit key; secure) header.d=sourceware.org header.i=@sourceware.org header.a=rsa-sha256 header.s=default header.b=vBQMhTCd; dkim-atps=neutral Received: from sourceware.org (server2.sourceware.org [IPv6:2620:52:3:1:0:246e:9693:128c]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature ECDSA (P-384) server-digest SHA384) (No client certificate requested) by legolas.ozlabs.org (Postfix) with ESMTPS id 4Mq3WR05Dlz23jn for ; Sat, 15 Oct 2022 11:08:35 +1100 (AEDT) Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id BAC243857811 for ; Sat, 15 Oct 2022 00:08:32 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org BAC243857811 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1665792512; bh=61OdAFPtk3YHmwVo71nL9Xv3KxO17ZxhHlTlC9+Fk/Y=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=vBQMhTCdPYNS9Qf/K18fjbPmHL01lcmsKyokcDWMuk3HMqiEC9XlKEtAjMIJ4GaWn ZjJ1+FaZAEJ9FDlISUs3L+D/uw0y3ljxcuwJTcclgqbE933DA0rubwuryAxX6VRwjL VBkve7VccK0tYAv1MzXu49l4/mUCV/x+q5qKHu34= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-pj1-x1033.google.com (mail-pj1-x1033.google.com [IPv6:2607:f8b0:4864:20::1033]) by sourceware.org (Postfix) with ESMTPS id AC19E3858418 for ; Sat, 15 Oct 2022 00:06:39 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org AC19E3858418 Received: by mail-pj1-x1033.google.com with SMTP id pq16so6310178pjb.2 for ; Fri, 14 Oct 2022 17:06:39 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=61OdAFPtk3YHmwVo71nL9Xv3KxO17ZxhHlTlC9+Fk/Y=; b=qC4Hqoe42W7hAy21gmGNgXsiAmTkyBVNTx15N/tvoN65EpGMXsj1UouYBqsdnb6c1/ f2M+VI1d0CoWVk7BCXOqGWPq7IQNrozYHbrO5QhkNaYOuuECRd17Y3I7zubnTuCpQIuY bCByjPWZIINGCHiShF3Kb56/O+uqdSQNMVGBoC24S0X4cf5TKR0ACWfe0KRaeK2BwzD5 7keoWXsNJ1yVrb1wZLTf9eaKY11GvRT4NisneA6eVCV3ZwuNm84FuOv3w380C/vcaQ3S Qy/0+VxeD5FF8R9sCT5tESkWoh7D8HYxTszSBTmv3xQKkX4YCDbvLJ7MFUtVQWM/TDh7 7h7Q== X-Gm-Message-State: ACrzQf2NO6k5u8/RJswygTG2o9io7+Z+KlumDeRvfMcAmqAiMEJIToCQ K5pXQx+UgPfidbCkF8fk2f/sUdNrAtAIOg== X-Google-Smtp-Source: AMsMyM7g21Fv3/lJUU4pZPGAf/TkQUlNPF5ztXYFPr67si4VIp777h34w531LbbHlmbwVMbj/x1rSA== X-Received: by 2002:a17:902:720a:b0:181:150c:fcc4 with SMTP id ba10-20020a170902720a00b00181150cfcc4mr330929plb.109.1665792398153; Fri, 14 Oct 2022 17:06:38 -0700 (PDT) Received: from noahgold-DESK.. ([192.55.60.38]) by smtp.gmail.com with ESMTPSA id o125-20020a62cd83000000b00561c179e17dsm2253717pfg.76.2022.10.14.17.06.36 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Fri, 14 Oct 2022 17:06:37 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v8 4/6] x86: Update memset to use new VEC macros Date: Fri, 14 Oct 2022 19:06:13 -0500 Message-Id: <20221015000615.126774-4-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20221015000615.126774-1-goldstein.w.n@gmail.com> References: <20221014164008.1325863-1-goldstein.w.n@gmail.com> <20221015000615.126774-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-11.1 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, URIBL_BLACK autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org Sender: "Libc-alpha" Replace %VEC(n) -> %VMM(n) This commit does not change libc.so Tested build on x86-64 --- .../memset-avx2-unaligned-erms-rtm.S | 8 +-- .../multiarch/memset-avx2-unaligned-erms.S | 14 +--- .../multiarch/memset-avx512-unaligned-erms.S | 20 +----- .../multiarch/memset-evex-unaligned-erms.S | 20 +----- .../multiarch/memset-sse2-unaligned-erms.S | 10 +-- .../multiarch/memset-vec-unaligned-erms.S | 70 ++++++++----------- 6 files changed, 43 insertions(+), 99 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S index 8ac3e479bb..bc8605faf3 100644 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S @@ -1,10 +1,6 @@ -#define ZERO_UPPER_VEC_REGISTERS_RETURN \ - ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST +#include "x86-avx-rtm-vecs.h" -#define VZEROUPPER_RETURN jmp L(return) - -#define SECTION(p) p##.avx.rtm #define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm #define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm -#include "memset-avx2-unaligned-erms.S" +# include "memset-avx2-unaligned-erms.S" diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S index a9054a9122..47cf5072a4 100644 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -4,14 +4,9 @@ # define USE_WITH_AVX2 1 -# define VEC_SIZE 32 -# define MOV_SIZE 4 -# define RET_SIZE 4 - -# define VEC(i) ymm##i - -# define VMOVU vmovdqu -# define VMOVA vmovdqa +# ifndef VEC_SIZE +# include "x86-avx-vecs.h" +# endif # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ vmovd d, %xmm0; \ @@ -26,9 +21,6 @@ # define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0 # define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0 -# ifndef SECTION -# define SECTION(p) p##.avx -# endif # ifndef MEMSET_SYMBOL # define MEMSET_SYMBOL(p,s) p##_avx2_##s # endif diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S index 47623b8ee8..84145b6c27 100644 --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S @@ -4,26 +4,14 @@ # define USE_WITH_AVX512 1 -# define VEC_SIZE 64 -# define MOV_SIZE 6 -# define RET_SIZE 1 - -# define XMM0 xmm16 -# define YMM0 ymm16 -# define VEC0 zmm16 -# define VEC(i) VEC##i - -# define VMOVU vmovdqu64 -# define VMOVA vmovdqa64 - -# define VZEROUPPER +# include "x86-evex512-vecs.h" # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ - vpbroadcastb d, %VEC0; \ + vpbroadcastb d, %VMM(0); \ movq r, %rax # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ - vpbroadcastd d, %VEC0; \ + vpbroadcastd d, %VMM(0); \ movq r, %rax # define MEMSET_VDUP_TO_VEC0_HIGH() @@ -32,8 +20,6 @@ # define WMEMSET_VDUP_TO_VEC0_HIGH() # define WMEMSET_VDUP_TO_VEC0_LOW() -# define SECTION(p) p##.evex512 - #ifndef MEMSET_SYMBOL # define MEMSET_SYMBOL(p,s) p##_avx512_##s #endif diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S index ac4b2d2d50..1f03b26bf8 100644 --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S @@ -4,26 +4,14 @@ # define USE_WITH_EVEX 1 -# define VEC_SIZE 32 -# define MOV_SIZE 6 -# define RET_SIZE 1 - -# define XMM0 xmm16 -# define YMM0 ymm16 -# define VEC0 ymm16 -# define VEC(i) VEC##i - -# define VMOVU vmovdqu64 -# define VMOVA vmovdqa64 - -# define VZEROUPPER +# include "x86-evex256-vecs.h" # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ - vpbroadcastb d, %VEC0; \ + vpbroadcastb d, %VMM(0); \ movq r, %rax # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ - vpbroadcastd d, %VEC0; \ + vpbroadcastd d, %VMM(0); \ movq r, %rax # define MEMSET_VDUP_TO_VEC0_HIGH() @@ -32,8 +20,6 @@ # define WMEMSET_VDUP_TO_VEC0_HIGH() # define WMEMSET_VDUP_TO_VEC0_LOW() -# define SECTION(p) p##.evex - #ifndef MEMSET_SYMBOL # define MEMSET_SYMBOL(p,s) p##_evex_##s #endif diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S index 44f9b8888b..34b245d8ca 100644 --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S @@ -26,13 +26,7 @@ # include # define USE_WITH_SSE2 1 -# define VEC_SIZE 16 -# define MOV_SIZE 3 -# define RET_SIZE 1 - -# define VEC(i) xmm##i -# define VMOVU movups -# define VMOVA movaps +# include "x86-sse2-vecs.h" # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ movd d, %xmm0; \ @@ -52,8 +46,6 @@ # define WMEMSET_VDUP_TO_VEC0_HIGH() # define WMEMSET_VDUP_TO_VEC0_LOW() -# define SECTION(p) p - # ifndef MEMSET_SYMBOL # define MEMSET_SYMBOL(p,s) p##_sse2_##s # endif diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 905d0fa464..03de0ab907 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -34,14 +34,6 @@ # define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s) #endif -#ifndef XMM0 -# define XMM0 xmm0 -#endif - -#ifndef YMM0 -# define YMM0 ymm0 -#endif - #ifndef VZEROUPPER # if VEC_SIZE > 16 # define VZEROUPPER vzeroupper @@ -150,8 +142,8 @@ L(entry_from_wmemset): cmpq $(VEC_SIZE * 2), %rdx ja L(more_2x_vec) /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ - VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) - VMOVU %VEC(0), (%rdi) + VMOVU %VMM(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VMM(0), (%rdi) VZEROUPPER_RETURN #if defined USE_MULTIARCH && IS_IN (libc) END (MEMSET_SYMBOL (__memset, unaligned)) @@ -175,19 +167,19 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) cmp $(VEC_SIZE * 2), %RDX_LP ja L(stosb_more_2x_vec) /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ - VMOVU %VEC(0), (%rdi) - VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + VMOVU %VMM(0), (%rdi) + VMOVU %VMM(0), (VEC_SIZE * -1)(%rdi, %rdx) VZEROUPPER_RETURN #endif .p2align 4,, 4 L(last_2x_vec): #ifdef USE_LESS_VEC_MASK_STORE - VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx) - VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + VMOVU %VMM(0), (VEC_SIZE * -2)(%rdi, %rdx) + VMOVU %VMM(0), (VEC_SIZE * -1)(%rdi, %rdx) #else - VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) - VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) + VMOVU %VMM(0), (VEC_SIZE * -2)(%rdi) + VMOVU %VMM(0), (VEC_SIZE * -1)(%rdi) #endif VZEROUPPER_RETURN @@ -221,7 +213,7 @@ L(less_vec_from_wmemset): bzhil %edx, %ecx, %ecx kmovd %ecx, %k1 # endif - vmovdqu8 %VEC(0), (%rax){%k1} + vmovdqu8 %VMM(0), (%rax){%k1} VZEROUPPER_RETURN # if defined USE_MULTIARCH && IS_IN (libc) @@ -249,8 +241,8 @@ L(stosb_more_2x_vec): and (4x, 8x] jump to target. */ L(more_2x_vec): /* Store next 2x vec regardless. */ - VMOVU %VEC(0), (%rdi) - VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi) + VMOVU %VMM(0), (%rdi) + VMOVU %VMM(0), (VEC_SIZE * 1)(%rdi) /* Two different methods of setting up pointers / compare. The two @@ -278,8 +270,8 @@ L(more_2x_vec): #endif /* Store next 2x vec regardless. */ - VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) - VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) + VMOVU %VMM(0), (VEC_SIZE * 2)(%rax) + VMOVU %VMM(0), (VEC_SIZE * 3)(%rax) #if defined USE_WITH_EVEX || defined USE_WITH_AVX512 @@ -304,20 +296,20 @@ L(more_2x_vec): andq $(VEC_SIZE * -2), %LOOP_REG .p2align 4 L(loop): - VMOVA %VEC(0), LOOP_4X_OFFSET(%LOOP_REG) - VMOVA %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG) - VMOVA %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG) - VMOVA %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG) + VMOVA %VMM(0), LOOP_4X_OFFSET(%LOOP_REG) + VMOVA %VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG) + VMOVA %VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG) + VMOVA %VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG) subq $-(VEC_SIZE * 4), %LOOP_REG cmpq %END_REG, %LOOP_REG jb L(loop) .p2align 4,, MOV_SIZE L(last_4x_vec): - VMOVU %VEC(0), LOOP_4X_OFFSET(%END_REG) - VMOVU %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG) - VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG) - VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG) -L(return): + VMOVU %VMM(0), LOOP_4X_OFFSET(%END_REG) + VMOVU %VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG) + VMOVU %VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG) + VMOVU %VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG) +L(return_vzeroupper): #if VEC_SIZE > 16 ZERO_UPPER_VEC_REGISTERS_RETURN #else @@ -355,7 +347,7 @@ L(cross_page): jge L(between_16_31) #endif #ifndef USE_XMM_LESS_VEC - MOVQ %XMM0, %SET_REG64 + MOVQ %VMM_128(0), %SET_REG64 #endif cmpl $8, %edx jge L(between_8_15) @@ -374,8 +366,8 @@ L(between_0_0): .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) /* From 32 to 63. No branch when size == 32. */ L(between_32_63): - VMOVU %YMM0, (%LESS_VEC_REG) - VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx) + VMOVU %VMM_256(0), (%LESS_VEC_REG) + VMOVU %VMM_256(0), -32(%LESS_VEC_REG, %rdx) VZEROUPPER_RETURN #endif @@ -383,8 +375,8 @@ L(between_32_63): .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1) L(between_16_31): /* From 16 to 31. No branch when size == 16. */ - VMOVU %XMM0, (%LESS_VEC_REG) - VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx) + VMOVU %VMM_128(0), (%LESS_VEC_REG) + VMOVU %VMM_128(0), -16(%LESS_VEC_REG, %rdx) ret #endif @@ -394,8 +386,8 @@ L(between_16_31): L(between_8_15): /* From 8 to 15. No branch when size == 8. */ #ifdef USE_XMM_LESS_VEC - MOVQ %XMM0, (%rdi) - MOVQ %XMM0, -8(%rdi, %rdx) + MOVQ %VMM_128(0), (%rdi) + MOVQ %VMM_128(0), -8(%rdi, %rdx) #else movq %SET_REG64, (%LESS_VEC_REG) movq %SET_REG64, -8(%LESS_VEC_REG, %rdx) @@ -408,8 +400,8 @@ L(between_8_15): L(between_4_7): /* From 4 to 7. No branch when size == 4. */ #ifdef USE_XMM_LESS_VEC - MOVD %XMM0, (%rdi) - MOVD %XMM0, -4(%rdi, %rdx) + MOVD %VMM_128(0), (%rdi) + MOVD %VMM_128(0), -4(%rdi, %rdx) #else movl %SET_REG32, (%LESS_VEC_REG) movl %SET_REG32, -4(%LESS_VEC_REG, %rdx)