diff mbox series

[2/2] aarch64: Reduce zva prologue to 64 bytes to reduce one instruction

Message ID 1505834596-21641-2-git-send-email-siddhesh@sourceware.org
State New
Headers show
Series [1/2] Hoist ZVA check out of the function | expand

Commit Message

Siddhesh Poyarekar Sept. 19, 2017, 3:23 p.m. UTC
The current zva copy of 64 bytes has a prologue and epilogue of 128
bytes, which is quite suboptimal for falkor as well as mustang (the
two arm machines I have access to).  Dropping it to 64 bytes, which is
the minimum alignment required for 64-byte zva to work correctly,
results in a decent gain in performance for falkor as well as mustang.
For falkor the gain against generic memset goes from about 80% to
about 120%.

Function: memset
Variant: walk
                                    simple_memset	__memset_nozva	__memset_zva_64	__memset_zva_default	__memset_generic
diff mbox series

Patch

========================================================================================================================
                  length=256, char=0:         1.82 (-87.23%)	       26.67 ( 87.51%)	       31.86 (123.99%)	       24.50 ( 72.23%)	       14.23
                  length=257, char=0:         1.82 (-87.27%)	       26.82 ( 87.83%)	       32.02 (124.27%)	       24.31 ( 70.27%)	       14.28
                  length=258, char=0:         1.82 (-87.36%)	       26.13 ( 81.75%)	       32.00 (122.52%)	       24.23 ( 68.50%)	       14.38
                  length=259, char=0:         1.82 (-87.39%)	       25.17 ( 74.45%)	       32.06 (122.23%)	       24.43 ( 69.36%)	       14.43
                  length=260, char=0:         1.82 (-87.40%)	       25.38 ( 76.04%)	       31.91 (121.27%)	       24.47 ( 69.70%)	       14.42
                  length=261, char=0:         1.82 (-87.42%)	       25.86 ( 78.74%)	       32.22 (122.72%)	       24.60 ( 70.01%)	       14.47
                  length=262, char=0:         1.82 (-87.51%)	       25.79 ( 76.97%)	       32.31 (121.76%)	       24.21 ( 66.17%)	       14.57
                  length=263, char=0:         1.82 (-87.54%)	       25.56 ( 74.92%)	       32.35 (121.37%)	       24.26 ( 66.00%)	       14.61
                  length=264, char=0:         1.82 (-87.54%)	       25.34 ( 73.35%)	       32.16 (120.05%)	       24.51 ( 67.68%)	       14.62
                  length=265, char=0:         1.82 (-87.57%)	       25.03 ( 70.62%)	       32.49 (121.49%)	       24.60 ( 67.68%)	       14.67
                  length=266, char=0:         1.82 (-87.61%)	       24.69 ( 67.82%)	       32.55 (121.28%)	       24.67 ( 67.72%)	       14.71
                  length=267, char=0:         1.82 (-87.70%)	       24.76 ( 67.09%)	       32.33 (118.20%)	       24.57 ( 65.84%)	       14.82
                  length=268, char=0:         1.82 (-87.75%)	       24.10 ( 62.09%)	       32.58 (119.14%)	       24.81 ( 66.87%)	       14.87
                  length=269, char=0:         1.82 (-87.72%)	       23.72 ( 59.63%)	       32.45 (118.36%)	       24.88 ( 67.41%)	       14.86
                  length=270, char=0:         1.83 (-87.76%)	       23.44 ( 57.22%)	       32.72 (119.47%)	       24.96 ( 67.44%)	       14.91
                  length=271, char=0:         1.83 (-87.75%)	       23.26 ( 56.06%)	       32.80 (120.04%)	       25.02 ( 67.90%)	       14.90
                  length=512, char=0:         1.90 (-92.54%)	       29.15 ( 14.39%)	       42.50 ( 66.77%)	       40.75 ( 59.92%)	       25.48
                  length=513, char=0:         1.90 (-92.59%)	       29.21 ( 13.75%)	       42.18 ( 64.27%)	       40.61 ( 58.13%)	       25.68
                  length=514, char=0:         1.90 (-92.56%)	       28.78 ( 12.50%)	       42.56 ( 66.34%)	       40.71 ( 59.12%)	       25.59
                  length=515, char=0:         1.90 (-92.61%)	       28.59 ( 11.01%)	       42.20 ( 63.85%)	       40.74 ( 58.18%)	       25.76
                  length=516, char=0:         1.90 (-92.59%)	       28.42 ( 10.75%)	       42.60 ( 66.00%)	       40.38 ( 57.34%)	       25.66
                  length=517, char=0:         1.90 (-92.64%)	       28.32 (  9.49%)	       42.32 ( 63.59%)	       40.49 ( 56.54%)	       25.87
                  length=518, char=0:         1.90 (-92.61%)	       28.17 (  9.43%)	       42.73 ( 65.98%)	       40.55 ( 57.49%)	       25.75
                  length=519, char=0:         1.90 (-92.66%)	       28.04 (  8.16%)	       42.37 ( 63.39%)	       40.58 ( 56.51%)	       25.93
                  length=520, char=0:         1.90 (-92.63%)	       27.58 (  6.71%)	       42.80 ( 65.61%)	       40.62 ( 57.19%)	       25.84
                  length=521, char=0:         1.90 (-92.68%)	       27.33 (  5.01%)	       42.39 ( 62.86%)	       40.67 ( 56.26%)	       26.03
                  length=522, char=0:         1.90 (-92.66%)	       27.28 (  5.28%)	       42.83 ( 65.27%)	       40.76 ( 57.27%)	       25.91
                  length=523, char=0:         1.90 (-92.70%)	       27.37 (  4.87%)	       42.47 ( 62.72%)	       40.77 ( 56.22%)	       26.10
                  length=524, char=0:         1.90 (-92.68%)	       27.25 (  4.82%)	       42.88 ( 64.92%)	       40.81 ( 56.98%)	       26.00
                  length=525, char=0:         1.91 (-92.72%)	       27.12 (  3.58%)	       42.52 ( 62.41%)	       40.82 ( 55.93%)	       26.18
                  length=526, char=0:         1.90 (-92.70%)	       27.00 (  3.54%)	       42.93 ( 64.65%)	       40.89 ( 56.81%)	       26.08
                  length=527, char=0:         1.91 (-92.74%)	       26.88 (  2.39%)	       42.93 ( 63.53%)	       40.90 ( 55.81%)	       26.25
                 length=1024, char=0:         1.95 (-95.35%)	       30.48 (-27.21%)	       51.02 ( 21.86%)	       49.74 ( 18.79%)	       41.87
                 length=1025, char=0:         1.95 (-95.31%)	       30.52 (-26.49%)	       51.05 ( 22.97%)	       49.75 ( 19.85%)	       41.51
                 length=1026, char=0:         1.95 (-95.35%)	       30.24 (-27.85%)	       51.05 ( 21.81%)	       49.28 ( 17.59%)	       41.91
                 length=1027, char=0:         1.95 (-95.35%)	       30.16 (-28.08%)	       50.61 ( 20.67%)	       49.77 ( 18.67%)	       41.94
                 length=1028, char=0:         1.95 (-95.36%)	       29.63 (-29.42%)	       51.06 ( 21.63%)	       49.79 ( 18.59%)	       41.98
                 length=1029, char=0:         1.95 (-95.36%)	       29.58 (-29.61%)	       51.06 ( 21.51%)	       49.82 ( 18.55%)	       42.02
                 length=1030, char=0:         1.95 (-95.33%)	       29.71 (-28.78%)	       51.07 ( 22.41%)	       49.79 ( 19.36%)	       41.72
                 length=1031, char=0:         1.95 (-95.37%)	       29.58 (-29.70%)	       51.09 ( 21.42%)	       49.38 ( 17.36%)	       42.07
                 length=1032, char=0:         1.95 (-95.37%)	       29.55 (-29.83%)	       50.62 ( 20.19%)	       49.82 ( 18.31%)	       42.11
                 length=1033, char=0:         1.95 (-95.37%)	       29.26 (-30.56%)	       51.09 ( 21.26%)	       49.85 ( 18.32%)	       42.14
                 length=1034, char=0:         1.95 (-95.38%)	       29.21 (-30.77%)	       51.13 ( 21.19%)	       49.86 ( 18.17%)	       42.19
                 length=1035, char=0:         1.95 (-95.38%)	       29.27 (-30.66%)	       51.14 ( 21.15%)	       49.86 ( 18.10%)	       42.22
                 length=1036, char=0:         1.95 (-95.35%)	       29.25 (-30.22%)	       51.13 ( 21.99%)	       49.88 ( 19.00%)	       41.92
                 length=1037, char=0:         1.95 (-95.39%)	       29.11 (-31.12%)	       51.16 ( 21.06%)	       49.43 ( 16.96%)	       42.26
                 length=1038, char=0:         1.95 (-95.39%)	       29.21 (-30.96%)	       50.67 ( 19.76%)	       49.89 ( 17.92%)	       42.31
                 length=1039, char=0:         1.95 (-95.39%)	       28.98 (-31.54%)	       51.16 ( 20.85%)	       49.90 ( 17.87%)	       42.33

	* sysdeps/aarch64/memset.S (do_zva_64): Set 64 bytes in
	prologue and epilogue instead of 128 bytes.
---
 sysdeps/aarch64/memset.S | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 8cff3a4..a4c1d30 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -62,20 +62,14 @@ 
 	str	q0, [dst, 16]
 	stp	q0, q0, [dst, 32]
 	bic	dst, dst, 63
-	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
-	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-	nop
+	add	dst, dst, 64
+	sub	dstend, dstend, 64
 1:	dc	zva, dst
 	add	dst, dst, 64
-	subs	count, count, 64
+	cmp	dstend, dst
 	b.hi	1b
-	stp	q0, q0, [dst, 0]
-	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
+	stp	q0, q0, [dstend]
+	stp	q0, q0, [dstend, 32]
 	ret
 .endm