Message ID | 20210403081215.2309505-1-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v8,1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S | expand |
On Sat, Apr 3, 2021 at 1:12 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > From: noah <goldstein.w.n@gmail.com> > > No Bug. This commit updates the large memcpy case (no overlap). The > update is to perform memcpy on either 2 or 4 contiguous pages at > once. This 1) helps to alleviate the effects of false memory aliasing > when destination and source have a close 4k alignment and 2) in most > cases and for most DRAM units is a modestly more efficient access > pattern. These changes are a clear performance improvement for > VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy, > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all > pass. > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > --- > Issue was alignment related AFAICT. Added `.p2align 4` in front of the > loops and no longer see any meaningful regression. > > Also added back the temporal stores for the tail. Saw a regression > when doing these tests. > > Two tables below for skylake and icelake numbers for the areas around > where you saw the regression. Below is all data from the tests. > > N = 10. 
> > Skylake > Len ,align1 ,align2 ,new mean ,old mean > 4103 ,0 ,64 ,84.5 ,88.6 > 4111 ,0 ,3 ,99.0 ,99.9 > 4127 ,3 ,0 ,102.1 ,102.3 > 4159 ,3 ,7 ,88.7 ,90.9 > 4223 ,9 ,5 ,88.1 ,87.4 > 8199 ,0 ,64 ,146.7 ,150.2 > 8207 ,0 ,3 ,167.9 ,168.5 > 8223 ,3 ,0 ,168.5 ,168.1 > 8255 ,3 ,7 ,157.0 ,159.2 > 8319 ,9 ,5 ,155.5 ,155.7 > 16391 ,0 ,64 ,286.2 ,288.8 > 16399 ,0 ,3 ,307.0 ,308.7 > 16415 ,3 ,0 ,307.4 ,307.6 > 16447 ,3 ,7 ,294.6 ,295.5 > 16511 ,9 ,5 ,291.5 ,462.1 > 32775 ,0 ,64 ,603.4 ,601.5 > 32783 ,0 ,3 ,604.8 ,606.4 > 32799 ,3 ,0 ,603.0 ,604.1 > 32831 ,3 ,7 ,600.2 ,737.3 > 32895 ,9 ,5 ,604.4 ,599.5 > 65543 ,0 ,64 ,1873.5 ,1854.3 > 65551 ,0 ,3 ,1862.9 ,1846.6 > 65567 ,3 ,0 ,1885.5 ,1966.0 > 65599 ,3 ,7 ,1833.2 ,1833.1 > 65663 ,9 ,5 ,1884.9 ,1887.4 > 131079 ,0 ,64 ,3944.3 ,3949.4 > 131087 ,0 ,3 ,3927.3 ,3913.3 > 131103 ,3 ,0 ,4415.8 ,4169.4 > 131135 ,3 ,7 ,4224.5 ,4157.6 > 131199 ,9 ,5 ,5974.0 ,4983.8 > 262151 ,0 ,64 ,11050.2 ,10620.6 > 262159 ,0 ,3 ,9932.8 ,10037.3 > 262175 ,3 ,0 ,10188.8 ,9206.6 > 262207 ,3 ,7 ,9633.3 ,9216.7 > 262271 ,9 ,5 ,9732.7 ,9345.3 > 524295 ,0 ,64 ,24823.9 ,24880.7 > 524303 ,0 ,3 ,24514.0 ,24556.7 > 524319 ,3 ,0 ,23974.4 ,24219.9 > 524351 ,3 ,7 ,24159.7 ,24207.0 > 524415 ,9 ,5 ,23946.5 ,24142.8 > > Icelake: > Len ,align1 ,align2 ,new mean ,old mean > 4103 ,0 ,64 ,50.2 ,63.7 > 4111 ,0 ,3 ,63.7 ,65.1 > 4127 ,3 ,0 ,68.2 ,69.4 > 4159 ,3 ,7 ,59.6 ,68.0 > 4223 ,9 ,5 ,68.2 ,66.8 > 8199 ,0 ,64 ,92.1 ,89.9 > 8207 ,0 ,3 ,119.7 ,118.3 > 8223 ,3 ,0 ,119.1 ,120.9 > 8255 ,3 ,7 ,122.9 ,123.7 > 8319 ,9 ,5 ,122.1 ,121.8 > 16391 ,0 ,64 ,162.7 ,158.0 > 16399 ,0 ,3 ,227.6 ,234.1 > 16415 ,3 ,0 ,230.8 ,232.7 > 16447 ,3 ,7 ,226.8 ,232.6 > 16511 ,9 ,5 ,233.4 ,233.8 > 32775 ,0 ,64 ,312.2 ,301.8 > 32783 ,0 ,3 ,449.7 ,450.0 > 32799 ,3 ,0 ,452.7 ,455.9 > 32831 ,3 ,7 ,449.8 ,458.0 > 32895 ,9 ,5 ,456.3 ,459.4 > 65543 ,0 ,64 ,1460.6 ,1463.9 > 65551 ,0 ,3 ,1462.0 ,1465.4 > 65567 ,3 ,0 ,1466.6 ,1480.4 > 65599 ,3 ,7 ,1488.0 ,1488.9 > 65663 ,9 ,5 ,1680.8 ,1499.5 > 131079 ,0 ,64 
,2988.5 ,3010.1 > 131087 ,0 ,3 ,2995.5 ,2996.4 > 131103 ,3 ,0 ,3006.2 ,3000.5 > 131135 ,3 ,7 ,3032.4 ,3073.7 > 131199 ,9 ,5 ,3010.4 ,3027.4 > 262151 ,0 ,64 ,6143.2 ,6079.1 > 262159 ,0 ,3 ,6085.1 ,6075.8 > 262175 ,3 ,0 ,6088.0 ,6064.9 > 262207 ,3 ,7 ,6018.7 ,6023.5 > 262271 ,9 ,5 ,6019.8 ,5959.2 > 524295 ,0 ,64 ,14464.2 ,14095.1 > 524303 ,0 ,3 ,14761.6 ,14050.2 > 524319 ,3 ,0 ,14534.1 ,14087.5 > 524351 ,3 ,7 ,14147.7 ,13903.8 > 524415 ,9 ,5 ,14157.0 ,13982.9 > > > > cpu ,version ,Len ,align1 ,align2 ,new mean ,old mean > skylake ,avx ,4103 ,0 ,64 ,84.5 ,88.6 > skylake ,avx ,4111 ,0 ,3 ,99.0 ,99.9 > skylake ,avx ,4127 ,3 ,0 ,102.1 ,102.3 > skylake ,avx ,4159 ,3 ,7 ,88.7 ,90.9 > skylake ,avx ,4223 ,9 ,5 ,88.1 ,87.4 > skylake ,avx ,8199 ,0 ,64 ,146.7 ,150.2 > skylake ,avx ,8207 ,0 ,3 ,167.9 ,168.5 > skylake ,avx ,8223 ,3 ,0 ,168.5 ,168.1 > skylake ,avx ,8255 ,3 ,7 ,157.0 ,159.2 > skylake ,avx ,8319 ,9 ,5 ,155.5 ,155.7 > skylake ,avx ,16391 ,0 ,64 ,286.2 ,288.8 > skylake ,avx ,16399 ,0 ,3 ,307.0 ,308.7 > skylake ,avx ,16415 ,3 ,0 ,307.4 ,307.6 > skylake ,avx ,16447 ,3 ,7 ,294.6 ,295.5 > skylake ,avx ,16511 ,9 ,5 ,291.5 ,462.1 > skylake ,avx ,32775 ,0 ,64 ,603.4 ,601.5 > skylake ,avx ,32783 ,0 ,3 ,604.8 ,606.4 > skylake ,avx ,32799 ,3 ,0 ,603.0 ,604.1 > skylake ,avx ,32831 ,3 ,7 ,600.2 ,737.3 > skylake ,avx ,32895 ,9 ,5 ,604.4 ,599.5 > skylake ,avx ,65543 ,0 ,64 ,1873.5 ,1854.3 > skylake ,avx ,65551 ,0 ,3 ,1862.9 ,1846.6 > skylake ,avx ,65567 ,3 ,0 ,1885.5 ,1966.0 > skylake ,avx ,65599 ,3 ,7 ,1833.2 ,1833.1 > skylake ,avx ,65663 ,9 ,5 ,1884.9 ,1887.4 > skylake ,avx ,131079 ,0 ,64 ,3944.3 ,3949.4 > skylake ,avx ,131087 ,0 ,3 ,3927.3 ,3913.3 > skylake ,avx ,131103 ,3 ,0 ,4415.8 ,4169.4 > skylake ,avx ,131135 ,3 ,7 ,4224.5 ,4157.6 > skylake ,avx ,131199 ,9 ,5 ,5974.0 ,4983.8 > skylake ,avx ,262151 ,0 ,64 ,11050.2 ,10620.6 > skylake ,avx ,262159 ,0 ,3 ,9932.8 ,10037.3 > skylake ,avx ,262175 ,3 ,0 ,10188.8 ,9206.6 > skylake ,avx ,262207 ,3 ,7 ,9633.3 ,9216.7 > skylake ,avx 
,262271 ,9 ,5 ,9732.7 ,9345.3 > skylake ,avx ,524295 ,0 ,64 ,24823.9 ,24880.7 > skylake ,avx ,524303 ,0 ,3 ,24514.0 ,24556.7 > skylake ,avx ,524319 ,3 ,0 ,23974.4 ,24219.9 > skylake ,avx ,524351 ,3 ,7 ,24159.7 ,24207.0 > skylake ,avx ,524415 ,9 ,5 ,23946.5 ,24142.8 > skylake ,avx ,1048583 ,0 ,64 ,49163.9 ,49454.6 > skylake ,avx ,1048591 ,0 ,3 ,49879.3 ,49400.8 > skylake ,avx ,1048607 ,3 ,0 ,49738.0 ,48864.6 > skylake ,avx ,1048639 ,3 ,7 ,48804.0 ,47588.5 > skylake ,avx ,1048703 ,9 ,5 ,49629.4 ,49796.3 > skylake ,avx ,2097159 ,0 ,64 ,98271.7 ,96330.6 > skylake ,avx ,2097167 ,0 ,3 ,97801.8 ,98638.1 > skylake ,avx ,2097183 ,3 ,0 ,98041.1 ,99287.6 > skylake ,avx ,2097215 ,3 ,7 ,96629.5 ,96521.9 > skylake ,avx ,2097279 ,9 ,5 ,98961.8 ,98909.8 > skylake ,avx ,4194311 ,0 ,64 ,194667.7 ,195377.1 > skylake ,avx ,4194319 ,0 ,3 ,194919.5 ,198576.2 > skylake ,avx ,4194335 ,3 ,0 ,192949.8 ,194584.7 > skylake ,avx ,4194367 ,3 ,7 ,189943.5 ,189177.9 > skylake ,avx ,4194431 ,9 ,5 ,192479.1 ,196494.2 > skylake ,avx ,8388615 ,0 ,64 ,588671.6 ,587215.4 > skylake ,avx ,8388623 ,0 ,3 ,581640.7 ,582812.5 > skylake ,avx ,8388639 ,3 ,0 ,549811.9 ,544697.6 > skylake ,avx ,8388671 ,3 ,7 ,591155.0 ,577951.8 > skylake ,avx ,8388735 ,9 ,5 ,547583.2 ,545133.3 > skylake ,avx ,16777223 ,0 ,64 ,1787503.0 ,1811146.0 > skylake ,avx ,16777231 ,0 ,3 ,1758671.0 ,1756343.0 > skylake ,avx ,16777247 ,3 ,0 ,1691781.0 ,1694661.0 > skylake ,avx ,16777279 ,3 ,7 ,1768150.0 ,1754785.0 > skylake ,avx ,16777343 ,9 ,5 ,1695179.0 ,1710794.0 > skylake ,sse2 ,4103 ,0 ,64 ,150.8 ,150.5 > skylake ,sse2 ,4111 ,0 ,3 ,156.8 ,158.4 > skylake ,sse2 ,4127 ,3 ,0 ,99.7 ,99.4 > skylake ,sse2 ,4159 ,3 ,7 ,154.8 ,154.5 > skylake ,sse2 ,4223 ,9 ,5 ,137.3 ,137.2 > skylake ,sse2 ,8199 ,0 ,64 ,284.8 ,285.5 > skylake ,sse2 ,8207 ,0 ,3 ,296.0 ,296.1 > skylake ,sse2 ,8223 ,3 ,0 ,168.0 ,168.2 > skylake ,sse2 ,8255 ,3 ,7 ,293.0 ,292.4 > skylake ,sse2 ,8319 ,9 ,5 ,251.3 ,250.7 > skylake ,sse2 ,16391 ,0 ,64 ,561.3 ,608.3 > skylake ,sse2 
,16399 ,0 ,3 ,571.0 ,574.8 > skylake ,sse2 ,16415 ,3 ,0 ,305.4 ,305.0 > skylake ,sse2 ,16447 ,3 ,7 ,563.2 ,565.0 > skylake ,sse2 ,16511 ,9 ,5 ,477.1 ,475.1 > skylake ,sse2 ,32775 ,0 ,64 ,1128.2 ,1131.7 > skylake ,sse2 ,32783 ,0 ,3 ,1126.6 ,1131.0 > skylake ,sse2 ,32799 ,3 ,0 ,587.6 ,590.8 > skylake ,sse2 ,32831 ,3 ,7 ,1130.6 ,1126.2 > skylake ,sse2 ,32895 ,9 ,5 ,957.6 ,953.0 > skylake ,sse2 ,65543 ,0 ,64 ,2718.9 ,2704.2 > skylake ,sse2 ,65551 ,0 ,3 ,2724.1 ,2725.0 > skylake ,sse2 ,65567 ,3 ,0 ,1888.4 ,1914.3 > skylake ,sse2 ,65599 ,3 ,7 ,2787.6 ,2748.7 > skylake ,sse2 ,65663 ,9 ,5 ,2400.5 ,2369.4 > skylake ,sse2 ,131079 ,0 ,64 ,5603.3 ,5654.9 > skylake ,sse2 ,131087 ,0 ,3 ,5939.3 ,5871.4 > skylake ,sse2 ,131103 ,3 ,0 ,4272.4 ,4190.0 > skylake ,sse2 ,131135 ,3 ,7 ,7601.4 ,7524.6 > skylake ,sse2 ,131199 ,9 ,5 ,7022.1 ,6864.7 > skylake ,sse2 ,262151 ,0 ,64 ,13736.2 ,14030.0 > skylake ,sse2 ,262159 ,0 ,3 ,12407.3 ,12334.1 > skylake ,sse2 ,262175 ,3 ,0 ,9661.1 ,9249.4 > skylake ,sse2 ,262207 ,3 ,7 ,12850.2 ,12351.6 > skylake ,sse2 ,262271 ,9 ,5 ,10792.6 ,10435.8 > skylake ,sse2 ,524295 ,0 ,64 ,27754.5 ,28177.7 > skylake ,sse2 ,524303 ,0 ,3 ,27766.2 ,28152.0 > skylake ,sse2 ,524319 ,3 ,0 ,24030.9 ,24438.3 > skylake ,sse2 ,524351 ,3 ,7 ,27787.5 ,27933.0 > skylake ,sse2 ,524415 ,9 ,5 ,24263.2 ,25249.1 > skylake ,sse2 ,1048583 ,0 ,64 ,56199.9 ,56039.8 > skylake ,sse2 ,1048591 ,0 ,3 ,56750.2 ,58889.7 > skylake ,sse2 ,1048607 ,3 ,0 ,56394.0 ,55115.3 > skylake ,sse2 ,1048639 ,3 ,7 ,57233.1 ,57473.8 > skylake ,sse2 ,1048703 ,9 ,5 ,56324.3 ,55917.9 > skylake ,sse2 ,2097159 ,0 ,64 ,113234.8 ,114346.4 > skylake ,sse2 ,2097167 ,0 ,3 ,114373.1 ,115522.5 > skylake ,sse2 ,2097183 ,3 ,0 ,108113.3 ,108513.3 > skylake ,sse2 ,2097215 ,3 ,7 ,116863.6 ,116549.9 > skylake ,sse2 ,2097279 ,9 ,5 ,108945.1 ,108843.7 > skylake ,sse2 ,4194311 ,0 ,64 ,230250.1 ,232350.0 > skylake ,sse2 ,4194319 ,0 ,3 ,231895.3 ,235055.6 > skylake ,sse2 ,4194335 ,3 ,0 ,218442.8 ,219199.8 > skylake ,sse2 ,4194367 ,3 
,7 ,242564.2 ,235587.7 > skylake ,sse2 ,4194431 ,9 ,5 ,224167.4 ,215261.8 > skylake ,sse2 ,8388615 ,0 ,64 ,679801.8 ,674832.0 > skylake ,sse2 ,8388623 ,0 ,3 ,684913.2 ,685238.7 > skylake ,sse2 ,8388639 ,3 ,0 ,644865.4 ,631388.6 > skylake ,sse2 ,8388671 ,3 ,7 ,698700.9 ,689316.1 > skylake ,sse2 ,8388735 ,9 ,5 ,644820.2 ,631366.8 > skylake ,sse2 ,16777223 ,0 ,64 ,1877984.0 ,1876437.0 > skylake ,sse2 ,16777231 ,0 ,3 ,1898086.0 ,1913053.0 > skylake ,sse2 ,16777247 ,3 ,0 ,1857018.0 ,1866949.0 > skylake ,sse2 ,16777279 ,3 ,7 ,1914905.0 ,1897134.0 > skylake ,sse2 ,16777343 ,9 ,5 ,1859937.0 ,1881939.0 > icelake ,avx512 ,4103 ,0 ,64 ,75.2 ,75.8 > icelake ,avx512 ,4111 ,0 ,3 ,56.9 ,56.4 > icelake ,avx512 ,4127 ,3 ,0 ,59.1 ,59.6 > icelake ,avx512 ,4159 ,3 ,7 ,50.7 ,51.3 > icelake ,avx512 ,4223 ,9 ,5 ,59.2 ,58.9 > icelake ,avx512 ,8199 ,0 ,64 ,67.8 ,63.9 > icelake ,avx512 ,8207 ,0 ,3 ,89.0 ,89.9 > icelake ,avx512 ,8223 ,3 ,0 ,90.2 ,90.1 > icelake ,avx512 ,8255 ,3 ,7 ,82.6 ,84.9 > icelake ,avx512 ,8319 ,9 ,5 ,91.5 ,92.8 > icelake ,avx512 ,16391 ,0 ,64 ,118.0 ,117.6 > icelake ,avx512 ,16399 ,0 ,3 ,156.5 ,157.0 > icelake ,avx512 ,16415 ,3 ,0 ,157.4 ,157.3 > icelake ,avx512 ,16447 ,3 ,7 ,151.0 ,151.6 > icelake ,avx512 ,16511 ,9 ,5 ,159.1 ,159.6 > icelake ,avx512 ,32775 ,0 ,64 ,231.8 ,230.8 > icelake ,avx512 ,32783 ,0 ,3 ,297.8 ,299.3 > icelake ,avx512 ,32799 ,3 ,0 ,299.1 ,299.0 > icelake ,avx512 ,32831 ,3 ,7 ,293.5 ,295.4 > icelake ,avx512 ,32895 ,9 ,5 ,300.3 ,302.5 > icelake ,avx512 ,65543 ,0 ,64 ,1473.4 ,1479.2 > icelake ,avx512 ,65551 ,0 ,3 ,1438.2 ,1445.3 > icelake ,avx512 ,65567 ,3 ,0 ,1450.3 ,1463.8 > icelake ,avx512 ,65599 ,3 ,7 ,1469.0 ,1473.8 > icelake ,avx512 ,65663 ,9 ,5 ,1480.0 ,1483.5 > icelake ,avx512 ,131079 ,0 ,64 ,3015.1 ,3037.5 > icelake ,avx512 ,131087 ,0 ,3 ,2952.3 ,2960.4 > icelake ,avx512 ,131103 ,3 ,0 ,2966.2 ,2964.4 > icelake ,avx512 ,131135 ,3 ,7 ,2961.6 ,3047.9 > icelake ,avx512 ,131199 ,9 ,5 ,2967.4 ,3183.8 > icelake ,avx512 ,262151 ,0 ,64 ,6206.0 
,6141.5 > icelake ,avx512 ,262159 ,0 ,3 ,5990.8 ,5959.2 > icelake ,avx512 ,262175 ,3 ,0 ,5976.7 ,5963.8 > icelake ,avx512 ,262207 ,3 ,7 ,5939.5 ,5924.3 > icelake ,avx512 ,262271 ,9 ,5 ,5944.6 ,5990.3 > icelake ,avx512 ,524295 ,0 ,64 ,14726.7 ,14307.0 > icelake ,avx512 ,524303 ,0 ,3 ,14344.2 ,14040.5 > icelake ,avx512 ,524319 ,3 ,0 ,14175.0 ,13862.2 > icelake ,avx512 ,524351 ,3 ,7 ,14261.4 ,13821.5 > icelake ,avx512 ,524415 ,9 ,5 ,14266.5 ,14064.7 > icelake ,avx512 ,1048583 ,0 ,64 ,35211.4 ,35414.6 > icelake ,avx512 ,1048591 ,0 ,3 ,35156.8 ,35591.2 > icelake ,avx512 ,1048607 ,3 ,0 ,35273.1 ,35503.3 > icelake ,avx512 ,1048639 ,3 ,7 ,35255.8 ,35725.0 > icelake ,avx512 ,1048703 ,9 ,5 ,35703.6 ,36289.9 > icelake ,avx512 ,2097159 ,0 ,64 ,72613.9 ,72063.2 > icelake ,avx512 ,2097167 ,0 ,3 ,72301.6 ,73504.2 > icelake ,avx512 ,2097183 ,3 ,0 ,73448.8 ,72133.6 > icelake ,avx512 ,2097215 ,3 ,7 ,73762.9 ,72825.8 > icelake ,avx512 ,2097279 ,9 ,5 ,72097.3 ,72914.6 > icelake ,avx512 ,4194311 ,0 ,64 ,144793.4 ,144182.1 > icelake ,avx512 ,4194319 ,0 ,3 ,143710.3 ,145063.3 > icelake ,avx512 ,4194335 ,3 ,0 ,146722.1 ,144046.4 > icelake ,avx512 ,4194367 ,3 ,7 ,144267.0 ,144874.6 > icelake ,avx512 ,4194431 ,9 ,5 ,143808.2 ,144560.0 > icelake ,avx512 ,8388615 ,0 ,64 ,427993.4 ,424521.5 > icelake ,avx512 ,8388623 ,0 ,3 ,470267.1 ,473290.8 > icelake ,avx512 ,8388639 ,3 ,0 ,457179.7 ,461797.7 > icelake ,avx512 ,8388671 ,3 ,7 ,472507.9 ,481561.4 > icelake ,avx512 ,8388735 ,9 ,5 ,463611.9 ,467388.7 > icelake ,avx512 ,16777223 ,0 ,64 ,1490426.0 ,1526996.0 > icelake ,avx512 ,16777231 ,0 ,3 ,1516687.0 ,1517095.0 > icelake ,avx512 ,16777247 ,3 ,0 ,1497688.0 ,1512766.0 > icelake ,avx512 ,16777279 ,3 ,7 ,1512331.0 ,1524317.0 > icelake ,avx512 ,16777343 ,9 ,5 ,1498908.0 ,1500526.0 > icelake ,avx ,4103 ,0 ,64 ,50.2 ,63.7 > icelake ,avx ,4111 ,0 ,3 ,63.7 ,65.1 > icelake ,avx ,4127 ,3 ,0 ,68.2 ,69.4 > icelake ,avx ,4159 ,3 ,7 ,59.6 ,68.0 > icelake ,avx ,4223 ,9 ,5 ,68.2 ,66.8 > icelake ,avx ,8199 ,0 ,64 
,92.1 ,89.9 > icelake ,avx ,8207 ,0 ,3 ,119.7 ,118.3 > icelake ,avx ,8223 ,3 ,0 ,119.1 ,120.9 > icelake ,avx ,8255 ,3 ,7 ,122.9 ,123.7 > icelake ,avx ,8319 ,9 ,5 ,122.1 ,121.8 > icelake ,avx ,16391 ,0 ,64 ,162.7 ,158.0 > icelake ,avx ,16399 ,0 ,3 ,227.6 ,234.1 > icelake ,avx ,16415 ,3 ,0 ,230.8 ,232.7 > icelake ,avx ,16447 ,3 ,7 ,226.8 ,232.6 > icelake ,avx ,16511 ,9 ,5 ,233.4 ,233.8 > icelake ,avx ,32775 ,0 ,64 ,312.2 ,301.8 > icelake ,avx ,32783 ,0 ,3 ,449.7 ,450.0 > icelake ,avx ,32799 ,3 ,0 ,452.7 ,455.9 > icelake ,avx ,32831 ,3 ,7 ,449.8 ,458.0 > icelake ,avx ,32895 ,9 ,5 ,456.3 ,459.4 > icelake ,avx ,65543 ,0 ,64 ,1460.6 ,1463.9 > icelake ,avx ,65551 ,0 ,3 ,1462.0 ,1465.4 > icelake ,avx ,65567 ,3 ,0 ,1466.6 ,1480.4 > icelake ,avx ,65599 ,3 ,7 ,1488.0 ,1488.9 > icelake ,avx ,65663 ,9 ,5 ,1680.8 ,1499.5 > icelake ,avx ,131079 ,0 ,64 ,2988.5 ,3010.1 > icelake ,avx ,131087 ,0 ,3 ,2995.5 ,2996.4 > icelake ,avx ,131103 ,3 ,0 ,3006.2 ,3000.5 > icelake ,avx ,131135 ,3 ,7 ,3032.4 ,3073.7 > icelake ,avx ,131199 ,9 ,5 ,3010.4 ,3027.4 > icelake ,avx ,262151 ,0 ,64 ,6143.2 ,6079.1 > icelake ,avx ,262159 ,0 ,3 ,6085.1 ,6075.8 > icelake ,avx ,262175 ,3 ,0 ,6088.0 ,6064.9 > icelake ,avx ,262207 ,3 ,7 ,6018.7 ,6023.5 > icelake ,avx ,262271 ,9 ,5 ,6019.8 ,5959.2 > icelake ,avx ,524295 ,0 ,64 ,14464.2 ,14095.1 > icelake ,avx ,524303 ,0 ,3 ,14761.6 ,14050.2 > icelake ,avx ,524319 ,3 ,0 ,14534.1 ,14087.5 > icelake ,avx ,524351 ,3 ,7 ,14147.7 ,13903.8 > icelake ,avx ,524415 ,9 ,5 ,14157.0 ,13982.9 > icelake ,avx ,1048583 ,0 ,64 ,36599.0 ,37461.4 > icelake ,avx ,1048591 ,0 ,3 ,36717.8 ,37454.9 > icelake ,avx ,1048607 ,3 ,0 ,36821.2 ,37343.3 > icelake ,avx ,1048639 ,3 ,7 ,36958.0 ,37507.2 > icelake ,avx ,1048703 ,9 ,5 ,36869.2 ,37413.1 > icelake ,avx ,2097159 ,0 ,64 ,74765.8 ,75330.9 > icelake ,avx ,2097167 ,0 ,3 ,75175.4 ,74891.9 > icelake ,avx ,2097183 ,3 ,0 ,75451.4 ,74787.7 > icelake ,avx ,2097215 ,3 ,7 ,75394.8 ,75839.1 > icelake ,avx ,2097279 ,9 ,5 ,75099.2 ,75421.2 > icelake 
,avx ,4194311 ,0 ,64 ,146809.6 ,146619.4 > icelake ,avx ,4194319 ,0 ,3 ,148866.4 ,149898.2 > icelake ,avx ,4194335 ,3 ,0 ,148719.7 ,150165.4 > icelake ,avx ,4194367 ,3 ,7 ,150600.1 ,150925.9 > icelake ,avx ,4194431 ,9 ,5 ,149457.3 ,150519.2 > icelake ,avx ,8388615 ,0 ,64 ,412709.8 ,423666.1 > icelake ,avx ,8388623 ,0 ,3 ,423717.4 ,424418.2 > icelake ,avx ,8388639 ,3 ,0 ,414387.5 ,413445.6 > icelake ,avx ,8388671 ,3 ,7 ,449010.7 ,417553.5 > icelake ,avx ,8388735 ,9 ,5 ,414128.6 ,411815.3 > icelake ,avx ,16777223 ,0 ,64 ,1490032.0 ,1510004.0 > icelake ,avx ,16777231 ,0 ,3 ,1379638.0 ,1422097.0 > icelake ,avx ,16777247 ,3 ,0 ,1418930.0 ,1367557.0 > icelake ,avx ,16777279 ,3 ,7 ,1515152.0 ,1500176.0 > icelake ,avx ,16777343 ,9 ,5 ,1344117.0 ,1411795.0 > icelake ,sse2 ,4103 ,0 ,64 ,113.2 ,114.6 > icelake ,sse2 ,4111 ,0 ,3 ,121.5 ,120.4 > icelake ,sse2 ,4127 ,3 ,0 ,1700.5 ,1771.5 > icelake ,sse2 ,4159 ,3 ,7 ,119.3 ,118.8 > icelake ,sse2 ,4223 ,9 ,5 ,1739.7 ,1735.2 > icelake ,sse2 ,8199 ,0 ,64 ,207.0 ,203.9 > icelake ,sse2 ,8207 ,0 ,3 ,225.5 ,220.8 > icelake ,sse2 ,8223 ,3 ,0 ,3444.3 ,3743.5 > icelake ,sse2 ,8255 ,3 ,7 ,219.9 ,216.8 > icelake ,sse2 ,8319 ,9 ,5 ,4117.1 ,3487.3 > icelake ,sse2 ,16391 ,0 ,64 ,397.1 ,394.3 > icelake ,sse2 ,16399 ,0 ,3 ,439.6 ,428.6 > icelake ,sse2 ,16415 ,3 ,0 ,6997.0 ,7031.2 > icelake ,sse2 ,16447 ,3 ,7 ,426.8 ,421.8 > icelake ,sse2 ,16511 ,9 ,5 ,7037.6 ,7038.3 > icelake ,sse2 ,32775 ,0 ,64 ,790.9 ,779.0 > icelake ,sse2 ,32783 ,0 ,3 ,863.1 ,849.6 > icelake ,sse2 ,32799 ,3 ,0 ,14043.0 ,14390.9 > icelake ,sse2 ,32831 ,3 ,7 ,841.6 ,833.1 > icelake ,sse2 ,32895 ,9 ,5 ,14277.6 ,14344.2 > icelake ,sse2 ,65543 ,0 ,64 ,1897.0 ,1897.3 > icelake ,sse2 ,65551 ,0 ,3 ,1927.1 ,1955.4 > icelake ,sse2 ,65567 ,3 ,0 ,28834.7 ,28727.8 > icelake ,sse2 ,65599 ,3 ,7 ,1961.4 ,1969.7 > icelake ,sse2 ,65663 ,9 ,5 ,28867.6 ,29019.8 > icelake ,sse2 ,131079 ,0 ,64 ,3879.3 ,3872.6 > icelake ,sse2 ,131087 ,0 ,3 ,3955.3 ,3990.7 > icelake ,sse2 ,131103 ,3 ,0 ,58001.8 
,60567.9 > icelake ,sse2 ,131135 ,3 ,7 ,3951.5 ,4002.6 > icelake ,sse2 ,131199 ,9 ,5 ,57886.7 ,58391.4 > icelake ,sse2 ,262151 ,0 ,64 ,7851.4 ,7894.7 > icelake ,sse2 ,262159 ,0 ,3 ,7947.5 ,8016.2 > icelake ,sse2 ,262175 ,3 ,0 ,115036.2 ,115968.6 > icelake ,sse2 ,262207 ,3 ,7 ,7883.9 ,7814.1 > icelake ,sse2 ,262271 ,9 ,5 ,113776.4 ,119733.6 > icelake ,sse2 ,524295 ,0 ,64 ,17198.1 ,16974.9 > icelake ,sse2 ,524303 ,0 ,3 ,17402.2 ,17096.3 > icelake ,sse2 ,524319 ,3 ,0 ,223980.4 ,225889.9 > icelake ,sse2 ,524351 ,3 ,7 ,17034.9 ,16910.3 > icelake ,sse2 ,524415 ,9 ,5 ,224027.7 ,224962.5 > icelake ,sse2 ,1048583 ,0 ,64 ,38822.3 ,39178.6 > icelake ,sse2 ,1048591 ,0 ,3 ,41686.7 ,40247.4 > icelake ,sse2 ,1048607 ,3 ,0 ,38814.8 ,39323.3 > icelake ,sse2 ,1048639 ,3 ,7 ,39568.3 ,41325.7 > icelake ,sse2 ,1048703 ,9 ,5 ,39354.2 ,39637.9 > icelake ,sse2 ,2097159 ,0 ,64 ,84074.7 ,84543.1 > icelake ,sse2 ,2097167 ,0 ,3 ,83665.7 ,82358.2 > icelake ,sse2 ,2097183 ,3 ,0 ,81817.8 ,79638.9 > icelake ,sse2 ,2097215 ,3 ,7 ,83649.1 ,83497.6 > icelake ,sse2 ,2097279 ,9 ,5 ,80287.6 ,79980.9 > icelake ,sse2 ,4194311 ,0 ,64 ,165409.8 ,168343.1 > icelake ,sse2 ,4194319 ,0 ,3 ,165216.7 ,177632.0 > icelake ,sse2 ,4194335 ,3 ,0 ,158718.7 ,160342.2 > icelake ,sse2 ,4194367 ,3 ,7 ,167944.9 ,167204.4 > icelake ,sse2 ,4194431 ,9 ,5 ,161530.1 ,164839.7 > icelake ,sse2 ,8388615 ,0 ,64 ,626504.3 ,629858.5 > icelake ,sse2 ,8388623 ,0 ,3 ,623969.5 ,631509.1 > icelake ,sse2 ,8388639 ,3 ,0 ,599366.7 ,600016.0 > icelake ,sse2 ,8388671 ,3 ,7 ,619964.2 ,619113.2 > icelake ,sse2 ,8388735 ,9 ,5 ,595338.1 ,604172.4 > icelake ,sse2 ,16777223 ,0 ,64 ,1709597.0 ,1725184.0 > icelake ,sse2 ,16777231 ,0 ,3 ,1725452.0 ,1719746.0 > icelake ,sse2 ,16777247 ,3 ,0 ,1614269.0 ,1607164.0 > icelake ,sse2 ,16777279 ,3 ,7 ,1705295.0 ,1733018.0 > icelake ,sse2 ,16777343 ,9 ,5 ,1604197.0 ,1595690.0 > I am having a hard time to convince myself that this patch is really necessary. 
What are the geomeans of all the different cases for each processor?
On Sat, Apr 3, 2021 at 1:46 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Sat, Apr 3, 2021 at 1:12 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > From: noah <goldstein.w.n@gmail.com> > > > > No Bug. This commit updates the large memcpy case (no overlap). The > > update is to perform memcpy on either 2 or 4 contiguous pages at > > once. This 1) helps to alleviate the effects of false memory aliasing > > when destination and source have a close 4k alignment and 2) in most > > cases and for most DRAM units is a modestly more efficient access > > pattern. These changes are a clear performance improvement for > > VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy, > > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all > > pass. > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > --- > > Issue was alignment related AFAICT. Added `.p2align 4` in front of the > > loops and no longer see any meaningful regression. > > > > Also added back the temporal stores for the tail. Saw a regression > > when doing these tests. > > > > Two tables below for skylake and icelake numbers for the areas around > > where you saw the regression. Below is all data from the tests. > > > > N = 10. 
> > > > Skylake > > Len ,align1 ,align2 ,new mean ,old mean > > 4103 ,0 ,64 ,84.5 ,88.6 > > 4111 ,0 ,3 ,99.0 ,99.9 > > 4127 ,3 ,0 ,102.1 ,102.3 > > 4159 ,3 ,7 ,88.7 ,90.9 > > 4223 ,9 ,5 ,88.1 ,87.4 > > 8199 ,0 ,64 ,146.7 ,150.2 > > 8207 ,0 ,3 ,167.9 ,168.5 > > 8223 ,3 ,0 ,168.5 ,168.1 > > 8255 ,3 ,7 ,157.0 ,159.2 > > 8319 ,9 ,5 ,155.5 ,155.7 > > 16391 ,0 ,64 ,286.2 ,288.8 > > 16399 ,0 ,3 ,307.0 ,308.7 > > 16415 ,3 ,0 ,307.4 ,307.6 > > 16447 ,3 ,7 ,294.6 ,295.5 > > 16511 ,9 ,5 ,291.5 ,462.1 > > 32775 ,0 ,64 ,603.4 ,601.5 > > 32783 ,0 ,3 ,604.8 ,606.4 > > 32799 ,3 ,0 ,603.0 ,604.1 > > 32831 ,3 ,7 ,600.2 ,737.3 > > 32895 ,9 ,5 ,604.4 ,599.5 > > 65543 ,0 ,64 ,1873.5 ,1854.3 > > 65551 ,0 ,3 ,1862.9 ,1846.6 > > 65567 ,3 ,0 ,1885.5 ,1966.0 > > 65599 ,3 ,7 ,1833.2 ,1833.1 > > 65663 ,9 ,5 ,1884.9 ,1887.4 > > 131079 ,0 ,64 ,3944.3 ,3949.4 > > 131087 ,0 ,3 ,3927.3 ,3913.3 > > 131103 ,3 ,0 ,4415.8 ,4169.4 > > 131135 ,3 ,7 ,4224.5 ,4157.6 > > 131199 ,9 ,5 ,5974.0 ,4983.8 > > 262151 ,0 ,64 ,11050.2 ,10620.6 > > 262159 ,0 ,3 ,9932.8 ,10037.3 > > 262175 ,3 ,0 ,10188.8 ,9206.6 > > 262207 ,3 ,7 ,9633.3 ,9216.7 > > 262271 ,9 ,5 ,9732.7 ,9345.3 > > 524295 ,0 ,64 ,24823.9 ,24880.7 > > 524303 ,0 ,3 ,24514.0 ,24556.7 > > 524319 ,3 ,0 ,23974.4 ,24219.9 > > 524351 ,3 ,7 ,24159.7 ,24207.0 > > 524415 ,9 ,5 ,23946.5 ,24142.8 > > > > Icelake: > > Len ,align1 ,align2 ,new mean ,old mean > > 4103 ,0 ,64 ,50.2 ,63.7 > > 4111 ,0 ,3 ,63.7 ,65.1 > > 4127 ,3 ,0 ,68.2 ,69.4 > > 4159 ,3 ,7 ,59.6 ,68.0 > > 4223 ,9 ,5 ,68.2 ,66.8 > > 8199 ,0 ,64 ,92.1 ,89.9 > > 8207 ,0 ,3 ,119.7 ,118.3 > > 8223 ,3 ,0 ,119.1 ,120.9 > > 8255 ,3 ,7 ,122.9 ,123.7 > > 8319 ,9 ,5 ,122.1 ,121.8 > > 16391 ,0 ,64 ,162.7 ,158.0 > > 16399 ,0 ,3 ,227.6 ,234.1 > > 16415 ,3 ,0 ,230.8 ,232.7 > > 16447 ,3 ,7 ,226.8 ,232.6 > > 16511 ,9 ,5 ,233.4 ,233.8 > > 32775 ,0 ,64 ,312.2 ,301.8 > > 32783 ,0 ,3 ,449.7 ,450.0 > > 32799 ,3 ,0 ,452.7 ,455.9 > > 32831 ,3 ,7 ,449.8 ,458.0 > > 32895 ,9 ,5 ,456.3 ,459.4 > > 65543 ,0 ,64 ,1460.6 ,1463.9 > > 
65551 ,0 ,3 ,1462.0 ,1465.4 > > 65567 ,3 ,0 ,1466.6 ,1480.4 > > 65599 ,3 ,7 ,1488.0 ,1488.9 > > 65663 ,9 ,5 ,1680.8 ,1499.5 > > 131079 ,0 ,64 ,2988.5 ,3010.1 > > 131087 ,0 ,3 ,2995.5 ,2996.4 > > 131103 ,3 ,0 ,3006.2 ,3000.5 > > 131135 ,3 ,7 ,3032.4 ,3073.7 > > 131199 ,9 ,5 ,3010.4 ,3027.4 > > 262151 ,0 ,64 ,6143.2 ,6079.1 > > 262159 ,0 ,3 ,6085.1 ,6075.8 > > 262175 ,3 ,0 ,6088.0 ,6064.9 > > 262207 ,3 ,7 ,6018.7 ,6023.5 > > 262271 ,9 ,5 ,6019.8 ,5959.2 > > 524295 ,0 ,64 ,14464.2 ,14095.1 > > 524303 ,0 ,3 ,14761.6 ,14050.2 > > 524319 ,3 ,0 ,14534.1 ,14087.5 > > 524351 ,3 ,7 ,14147.7 ,13903.8 > > 524415 ,9 ,5 ,14157.0 ,13982.9 > > > > > > > > cpu ,version ,Len ,align1 ,align2 ,new mean ,old mean > > skylake ,avx ,4103 ,0 ,64 ,84.5 ,88.6 > > skylake ,avx ,4111 ,0 ,3 ,99.0 ,99.9 > > skylake ,avx ,4127 ,3 ,0 ,102.1 ,102.3 > > skylake ,avx ,4159 ,3 ,7 ,88.7 ,90.9 > > skylake ,avx ,4223 ,9 ,5 ,88.1 ,87.4 > > skylake ,avx ,8199 ,0 ,64 ,146.7 ,150.2 > > skylake ,avx ,8207 ,0 ,3 ,167.9 ,168.5 > > skylake ,avx ,8223 ,3 ,0 ,168.5 ,168.1 > > skylake ,avx ,8255 ,3 ,7 ,157.0 ,159.2 > > skylake ,avx ,8319 ,9 ,5 ,155.5 ,155.7 > > skylake ,avx ,16391 ,0 ,64 ,286.2 ,288.8 > > skylake ,avx ,16399 ,0 ,3 ,307.0 ,308.7 > > skylake ,avx ,16415 ,3 ,0 ,307.4 ,307.6 > > skylake ,avx ,16447 ,3 ,7 ,294.6 ,295.5 > > skylake ,avx ,16511 ,9 ,5 ,291.5 ,462.1 > > skylake ,avx ,32775 ,0 ,64 ,603.4 ,601.5 > > skylake ,avx ,32783 ,0 ,3 ,604.8 ,606.4 > > skylake ,avx ,32799 ,3 ,0 ,603.0 ,604.1 > > skylake ,avx ,32831 ,3 ,7 ,600.2 ,737.3 > > skylake ,avx ,32895 ,9 ,5 ,604.4 ,599.5 > > skylake ,avx ,65543 ,0 ,64 ,1873.5 ,1854.3 > > skylake ,avx ,65551 ,0 ,3 ,1862.9 ,1846.6 > > skylake ,avx ,65567 ,3 ,0 ,1885.5 ,1966.0 > > skylake ,avx ,65599 ,3 ,7 ,1833.2 ,1833.1 > > skylake ,avx ,65663 ,9 ,5 ,1884.9 ,1887.4 > > skylake ,avx ,131079 ,0 ,64 ,3944.3 ,3949.4 > > skylake ,avx ,131087 ,0 ,3 ,3927.3 ,3913.3 > > skylake ,avx ,131103 ,3 ,0 ,4415.8 ,4169.4 > > skylake ,avx ,131135 ,3 ,7 ,4224.5 ,4157.6 > > skylake 
,avx ,131199 ,9 ,5 ,5974.0 ,4983.8 > > skylake ,avx ,262151 ,0 ,64 ,11050.2 ,10620.6 > > skylake ,avx ,262159 ,0 ,3 ,9932.8 ,10037.3 > > skylake ,avx ,262175 ,3 ,0 ,10188.8 ,9206.6 > > skylake ,avx ,262207 ,3 ,7 ,9633.3 ,9216.7 > > skylake ,avx ,262271 ,9 ,5 ,9732.7 ,9345.3 > > skylake ,avx ,524295 ,0 ,64 ,24823.9 ,24880.7 > > skylake ,avx ,524303 ,0 ,3 ,24514.0 ,24556.7 > > skylake ,avx ,524319 ,3 ,0 ,23974.4 ,24219.9 > > skylake ,avx ,524351 ,3 ,7 ,24159.7 ,24207.0 > > skylake ,avx ,524415 ,9 ,5 ,23946.5 ,24142.8 > > skylake ,avx ,1048583 ,0 ,64 ,49163.9 ,49454.6 > > skylake ,avx ,1048591 ,0 ,3 ,49879.3 ,49400.8 > > skylake ,avx ,1048607 ,3 ,0 ,49738.0 ,48864.6 > > skylake ,avx ,1048639 ,3 ,7 ,48804.0 ,47588.5 > > skylake ,avx ,1048703 ,9 ,5 ,49629.4 ,49796.3 > > skylake ,avx ,2097159 ,0 ,64 ,98271.7 ,96330.6 > > skylake ,avx ,2097167 ,0 ,3 ,97801.8 ,98638.1 > > skylake ,avx ,2097183 ,3 ,0 ,98041.1 ,99287.6 > > skylake ,avx ,2097215 ,3 ,7 ,96629.5 ,96521.9 > > skylake ,avx ,2097279 ,9 ,5 ,98961.8 ,98909.8 > > skylake ,avx ,4194311 ,0 ,64 ,194667.7 ,195377.1 > > skylake ,avx ,4194319 ,0 ,3 ,194919.5 ,198576.2 > > skylake ,avx ,4194335 ,3 ,0 ,192949.8 ,194584.7 > > skylake ,avx ,4194367 ,3 ,7 ,189943.5 ,189177.9 > > skylake ,avx ,4194431 ,9 ,5 ,192479.1 ,196494.2 > > skylake ,avx ,8388615 ,0 ,64 ,588671.6 ,587215.4 > > skylake ,avx ,8388623 ,0 ,3 ,581640.7 ,582812.5 > > skylake ,avx ,8388639 ,3 ,0 ,549811.9 ,544697.6 > > skylake ,avx ,8388671 ,3 ,7 ,591155.0 ,577951.8 > > skylake ,avx ,8388735 ,9 ,5 ,547583.2 ,545133.3 > > skylake ,avx ,16777223 ,0 ,64 ,1787503.0 ,1811146.0 > > skylake ,avx ,16777231 ,0 ,3 ,1758671.0 ,1756343.0 > > skylake ,avx ,16777247 ,3 ,0 ,1691781.0 ,1694661.0 > > skylake ,avx ,16777279 ,3 ,7 ,1768150.0 ,1754785.0 > > skylake ,avx ,16777343 ,9 ,5 ,1695179.0 ,1710794.0 > > skylake ,sse2 ,4103 ,0 ,64 ,150.8 ,150.5 > > skylake ,sse2 ,4111 ,0 ,3 ,156.8 ,158.4 > > skylake ,sse2 ,4127 ,3 ,0 ,99.7 ,99.4 > > skylake ,sse2 ,4159 ,3 ,7 ,154.8 ,154.5 > > 
skylake ,sse2 ,4223 ,9 ,5 ,137.3 ,137.2 > > skylake ,sse2 ,8199 ,0 ,64 ,284.8 ,285.5 > > skylake ,sse2 ,8207 ,0 ,3 ,296.0 ,296.1 > > skylake ,sse2 ,8223 ,3 ,0 ,168.0 ,168.2 > > skylake ,sse2 ,8255 ,3 ,7 ,293.0 ,292.4 > > skylake ,sse2 ,8319 ,9 ,5 ,251.3 ,250.7 > > skylake ,sse2 ,16391 ,0 ,64 ,561.3 ,608.3 > > skylake ,sse2 ,16399 ,0 ,3 ,571.0 ,574.8 > > skylake ,sse2 ,16415 ,3 ,0 ,305.4 ,305.0 > > skylake ,sse2 ,16447 ,3 ,7 ,563.2 ,565.0 > > skylake ,sse2 ,16511 ,9 ,5 ,477.1 ,475.1 > > skylake ,sse2 ,32775 ,0 ,64 ,1128.2 ,1131.7 > > skylake ,sse2 ,32783 ,0 ,3 ,1126.6 ,1131.0 > > skylake ,sse2 ,32799 ,3 ,0 ,587.6 ,590.8 > > skylake ,sse2 ,32831 ,3 ,7 ,1130.6 ,1126.2 > > skylake ,sse2 ,32895 ,9 ,5 ,957.6 ,953.0 > > skylake ,sse2 ,65543 ,0 ,64 ,2718.9 ,2704.2 > > skylake ,sse2 ,65551 ,0 ,3 ,2724.1 ,2725.0 > > skylake ,sse2 ,65567 ,3 ,0 ,1888.4 ,1914.3 > > skylake ,sse2 ,65599 ,3 ,7 ,2787.6 ,2748.7 > > skylake ,sse2 ,65663 ,9 ,5 ,2400.5 ,2369.4 > > skylake ,sse2 ,131079 ,0 ,64 ,5603.3 ,5654.9 > > skylake ,sse2 ,131087 ,0 ,3 ,5939.3 ,5871.4 > > skylake ,sse2 ,131103 ,3 ,0 ,4272.4 ,4190.0 > > skylake ,sse2 ,131135 ,3 ,7 ,7601.4 ,7524.6 > > skylake ,sse2 ,131199 ,9 ,5 ,7022.1 ,6864.7 > > skylake ,sse2 ,262151 ,0 ,64 ,13736.2 ,14030.0 > > skylake ,sse2 ,262159 ,0 ,3 ,12407.3 ,12334.1 > > skylake ,sse2 ,262175 ,3 ,0 ,9661.1 ,9249.4 > > skylake ,sse2 ,262207 ,3 ,7 ,12850.2 ,12351.6 > > skylake ,sse2 ,262271 ,9 ,5 ,10792.6 ,10435.8 > > skylake ,sse2 ,524295 ,0 ,64 ,27754.5 ,28177.7 > > skylake ,sse2 ,524303 ,0 ,3 ,27766.2 ,28152.0 > > skylake ,sse2 ,524319 ,3 ,0 ,24030.9 ,24438.3 > > skylake ,sse2 ,524351 ,3 ,7 ,27787.5 ,27933.0 > > skylake ,sse2 ,524415 ,9 ,5 ,24263.2 ,25249.1 > > skylake ,sse2 ,1048583 ,0 ,64 ,56199.9 ,56039.8 > > skylake ,sse2 ,1048591 ,0 ,3 ,56750.2 ,58889.7 > > skylake ,sse2 ,1048607 ,3 ,0 ,56394.0 ,55115.3 > > skylake ,sse2 ,1048639 ,3 ,7 ,57233.1 ,57473.8 > > skylake ,sse2 ,1048703 ,9 ,5 ,56324.3 ,55917.9 > > skylake ,sse2 ,2097159 ,0 ,64 ,113234.8 
,114346.4 > > skylake ,sse2 ,2097167 ,0 ,3 ,114373.1 ,115522.5 > > skylake ,sse2 ,2097183 ,3 ,0 ,108113.3 ,108513.3 > > skylake ,sse2 ,2097215 ,3 ,7 ,116863.6 ,116549.9 > > skylake ,sse2 ,2097279 ,9 ,5 ,108945.1 ,108843.7 > > skylake ,sse2 ,4194311 ,0 ,64 ,230250.1 ,232350.0 > > skylake ,sse2 ,4194319 ,0 ,3 ,231895.3 ,235055.6 > > skylake ,sse2 ,4194335 ,3 ,0 ,218442.8 ,219199.8 > > skylake ,sse2 ,4194367 ,3 ,7 ,242564.2 ,235587.7 > > skylake ,sse2 ,4194431 ,9 ,5 ,224167.4 ,215261.8 > > skylake ,sse2 ,8388615 ,0 ,64 ,679801.8 ,674832.0 > > skylake ,sse2 ,8388623 ,0 ,3 ,684913.2 ,685238.7 > > skylake ,sse2 ,8388639 ,3 ,0 ,644865.4 ,631388.6 > > skylake ,sse2 ,8388671 ,3 ,7 ,698700.9 ,689316.1 > > skylake ,sse2 ,8388735 ,9 ,5 ,644820.2 ,631366.8 > > skylake ,sse2 ,16777223 ,0 ,64 ,1877984.0 ,1876437.0 > > skylake ,sse2 ,16777231 ,0 ,3 ,1898086.0 ,1913053.0 > > skylake ,sse2 ,16777247 ,3 ,0 ,1857018.0 ,1866949.0 > > skylake ,sse2 ,16777279 ,3 ,7 ,1914905.0 ,1897134.0 > > skylake ,sse2 ,16777343 ,9 ,5 ,1859937.0 ,1881939.0 > > icelake ,avx512 ,4103 ,0 ,64 ,75.2 ,75.8 > > icelake ,avx512 ,4111 ,0 ,3 ,56.9 ,56.4 > > icelake ,avx512 ,4127 ,3 ,0 ,59.1 ,59.6 > > icelake ,avx512 ,4159 ,3 ,7 ,50.7 ,51.3 > > icelake ,avx512 ,4223 ,9 ,5 ,59.2 ,58.9 > > icelake ,avx512 ,8199 ,0 ,64 ,67.8 ,63.9 > > icelake ,avx512 ,8207 ,0 ,3 ,89.0 ,89.9 > > icelake ,avx512 ,8223 ,3 ,0 ,90.2 ,90.1 > > icelake ,avx512 ,8255 ,3 ,7 ,82.6 ,84.9 > > icelake ,avx512 ,8319 ,9 ,5 ,91.5 ,92.8 > > icelake ,avx512 ,16391 ,0 ,64 ,118.0 ,117.6 > > icelake ,avx512 ,16399 ,0 ,3 ,156.5 ,157.0 > > icelake ,avx512 ,16415 ,3 ,0 ,157.4 ,157.3 > > icelake ,avx512 ,16447 ,3 ,7 ,151.0 ,151.6 > > icelake ,avx512 ,16511 ,9 ,5 ,159.1 ,159.6 > > icelake ,avx512 ,32775 ,0 ,64 ,231.8 ,230.8 > > icelake ,avx512 ,32783 ,0 ,3 ,297.8 ,299.3 > > icelake ,avx512 ,32799 ,3 ,0 ,299.1 ,299.0 > > icelake ,avx512 ,32831 ,3 ,7 ,293.5 ,295.4 > > icelake ,avx512 ,32895 ,9 ,5 ,300.3 ,302.5 > > icelake ,avx512 ,65543 ,0 ,64 ,1473.4 ,1479.2 
> > icelake ,avx512 ,65551 ,0 ,3 ,1438.2 ,1445.3 > > icelake ,avx512 ,65567 ,3 ,0 ,1450.3 ,1463.8 > > icelake ,avx512 ,65599 ,3 ,7 ,1469.0 ,1473.8 > > icelake ,avx512 ,65663 ,9 ,5 ,1480.0 ,1483.5 > > icelake ,avx512 ,131079 ,0 ,64 ,3015.1 ,3037.5 > > icelake ,avx512 ,131087 ,0 ,3 ,2952.3 ,2960.4 > > icelake ,avx512 ,131103 ,3 ,0 ,2966.2 ,2964.4 > > icelake ,avx512 ,131135 ,3 ,7 ,2961.6 ,3047.9 > > icelake ,avx512 ,131199 ,9 ,5 ,2967.4 ,3183.8 > > icelake ,avx512 ,262151 ,0 ,64 ,6206.0 ,6141.5 > > icelake ,avx512 ,262159 ,0 ,3 ,5990.8 ,5959.2 > > icelake ,avx512 ,262175 ,3 ,0 ,5976.7 ,5963.8 > > icelake ,avx512 ,262207 ,3 ,7 ,5939.5 ,5924.3 > > icelake ,avx512 ,262271 ,9 ,5 ,5944.6 ,5990.3 > > icelake ,avx512 ,524295 ,0 ,64 ,14726.7 ,14307.0 > > icelake ,avx512 ,524303 ,0 ,3 ,14344.2 ,14040.5 > > icelake ,avx512 ,524319 ,3 ,0 ,14175.0 ,13862.2 > > icelake ,avx512 ,524351 ,3 ,7 ,14261.4 ,13821.5 > > icelake ,avx512 ,524415 ,9 ,5 ,14266.5 ,14064.7 > > icelake ,avx512 ,1048583 ,0 ,64 ,35211.4 ,35414.6 > > icelake ,avx512 ,1048591 ,0 ,3 ,35156.8 ,35591.2 > > icelake ,avx512 ,1048607 ,3 ,0 ,35273.1 ,35503.3 > > icelake ,avx512 ,1048639 ,3 ,7 ,35255.8 ,35725.0 > > icelake ,avx512 ,1048703 ,9 ,5 ,35703.6 ,36289.9 > > icelake ,avx512 ,2097159 ,0 ,64 ,72613.9 ,72063.2 > > icelake ,avx512 ,2097167 ,0 ,3 ,72301.6 ,73504.2 > > icelake ,avx512 ,2097183 ,3 ,0 ,73448.8 ,72133.6 > > icelake ,avx512 ,2097215 ,3 ,7 ,73762.9 ,72825.8 > > icelake ,avx512 ,2097279 ,9 ,5 ,72097.3 ,72914.6 > > icelake ,avx512 ,4194311 ,0 ,64 ,144793.4 ,144182.1 > > icelake ,avx512 ,4194319 ,0 ,3 ,143710.3 ,145063.3 > > icelake ,avx512 ,4194335 ,3 ,0 ,146722.1 ,144046.4 > > icelake ,avx512 ,4194367 ,3 ,7 ,144267.0 ,144874.6 > > icelake ,avx512 ,4194431 ,9 ,5 ,143808.2 ,144560.0 > > icelake ,avx512 ,8388615 ,0 ,64 ,427993.4 ,424521.5 > > icelake ,avx512 ,8388623 ,0 ,3 ,470267.1 ,473290.8 > > icelake ,avx512 ,8388639 ,3 ,0 ,457179.7 ,461797.7 > > icelake ,avx512 ,8388671 ,3 ,7 ,472507.9 ,481561.4 > > icelake 
,avx512 ,8388735 ,9 ,5 ,463611.9 ,467388.7 > > icelake ,avx512 ,16777223 ,0 ,64 ,1490426.0 ,1526996.0 > > icelake ,avx512 ,16777231 ,0 ,3 ,1516687.0 ,1517095.0 > > icelake ,avx512 ,16777247 ,3 ,0 ,1497688.0 ,1512766.0 > > icelake ,avx512 ,16777279 ,3 ,7 ,1512331.0 ,1524317.0 > > icelake ,avx512 ,16777343 ,9 ,5 ,1498908.0 ,1500526.0 > > icelake ,avx ,4103 ,0 ,64 ,50.2 ,63.7 > > icelake ,avx ,4111 ,0 ,3 ,63.7 ,65.1 > > icelake ,avx ,4127 ,3 ,0 ,68.2 ,69.4 > > icelake ,avx ,4159 ,3 ,7 ,59.6 ,68.0 > > icelake ,avx ,4223 ,9 ,5 ,68.2 ,66.8 > > icelake ,avx ,8199 ,0 ,64 ,92.1 ,89.9 > > icelake ,avx ,8207 ,0 ,3 ,119.7 ,118.3 > > icelake ,avx ,8223 ,3 ,0 ,119.1 ,120.9 > > icelake ,avx ,8255 ,3 ,7 ,122.9 ,123.7 > > icelake ,avx ,8319 ,9 ,5 ,122.1 ,121.8 > > icelake ,avx ,16391 ,0 ,64 ,162.7 ,158.0 > > icelake ,avx ,16399 ,0 ,3 ,227.6 ,234.1 > > icelake ,avx ,16415 ,3 ,0 ,230.8 ,232.7 > > icelake ,avx ,16447 ,3 ,7 ,226.8 ,232.6 > > icelake ,avx ,16511 ,9 ,5 ,233.4 ,233.8 > > icelake ,avx ,32775 ,0 ,64 ,312.2 ,301.8 > > icelake ,avx ,32783 ,0 ,3 ,449.7 ,450.0 > > icelake ,avx ,32799 ,3 ,0 ,452.7 ,455.9 > > icelake ,avx ,32831 ,3 ,7 ,449.8 ,458.0 > > icelake ,avx ,32895 ,9 ,5 ,456.3 ,459.4 > > icelake ,avx ,65543 ,0 ,64 ,1460.6 ,1463.9 > > icelake ,avx ,65551 ,0 ,3 ,1462.0 ,1465.4 > > icelake ,avx ,65567 ,3 ,0 ,1466.6 ,1480.4 > > icelake ,avx ,65599 ,3 ,7 ,1488.0 ,1488.9 > > icelake ,avx ,65663 ,9 ,5 ,1680.8 ,1499.5 > > icelake ,avx ,131079 ,0 ,64 ,2988.5 ,3010.1 > > icelake ,avx ,131087 ,0 ,3 ,2995.5 ,2996.4 > > icelake ,avx ,131103 ,3 ,0 ,3006.2 ,3000.5 > > icelake ,avx ,131135 ,3 ,7 ,3032.4 ,3073.7 > > icelake ,avx ,131199 ,9 ,5 ,3010.4 ,3027.4 > > icelake ,avx ,262151 ,0 ,64 ,6143.2 ,6079.1 > > icelake ,avx ,262159 ,0 ,3 ,6085.1 ,6075.8 > > icelake ,avx ,262175 ,3 ,0 ,6088.0 ,6064.9 > > icelake ,avx ,262207 ,3 ,7 ,6018.7 ,6023.5 > > icelake ,avx ,262271 ,9 ,5 ,6019.8 ,5959.2 > > icelake ,avx ,524295 ,0 ,64 ,14464.2 ,14095.1 > > icelake ,avx ,524303 ,0 ,3 ,14761.6 ,14050.2 > 
> icelake ,avx ,524319 ,3 ,0 ,14534.1 ,14087.5 > > icelake ,avx ,524351 ,3 ,7 ,14147.7 ,13903.8 > > icelake ,avx ,524415 ,9 ,5 ,14157.0 ,13982.9 > > icelake ,avx ,1048583 ,0 ,64 ,36599.0 ,37461.4 > > icelake ,avx ,1048591 ,0 ,3 ,36717.8 ,37454.9 > > icelake ,avx ,1048607 ,3 ,0 ,36821.2 ,37343.3 > > icelake ,avx ,1048639 ,3 ,7 ,36958.0 ,37507.2 > > icelake ,avx ,1048703 ,9 ,5 ,36869.2 ,37413.1 > > icelake ,avx ,2097159 ,0 ,64 ,74765.8 ,75330.9 > > icelake ,avx ,2097167 ,0 ,3 ,75175.4 ,74891.9 > > icelake ,avx ,2097183 ,3 ,0 ,75451.4 ,74787.7 > > icelake ,avx ,2097215 ,3 ,7 ,75394.8 ,75839.1 > > icelake ,avx ,2097279 ,9 ,5 ,75099.2 ,75421.2 > > icelake ,avx ,4194311 ,0 ,64 ,146809.6 ,146619.4 > > icelake ,avx ,4194319 ,0 ,3 ,148866.4 ,149898.2 > > icelake ,avx ,4194335 ,3 ,0 ,148719.7 ,150165.4 > > icelake ,avx ,4194367 ,3 ,7 ,150600.1 ,150925.9 > > icelake ,avx ,4194431 ,9 ,5 ,149457.3 ,150519.2 > > icelake ,avx ,8388615 ,0 ,64 ,412709.8 ,423666.1 > > icelake ,avx ,8388623 ,0 ,3 ,423717.4 ,424418.2 > > icelake ,avx ,8388639 ,3 ,0 ,414387.5 ,413445.6 > > icelake ,avx ,8388671 ,3 ,7 ,449010.7 ,417553.5 > > icelake ,avx ,8388735 ,9 ,5 ,414128.6 ,411815.3 > > icelake ,avx ,16777223 ,0 ,64 ,1490032.0 ,1510004.0 > > icelake ,avx ,16777231 ,0 ,3 ,1379638.0 ,1422097.0 > > icelake ,avx ,16777247 ,3 ,0 ,1418930.0 ,1367557.0 > > icelake ,avx ,16777279 ,3 ,7 ,1515152.0 ,1500176.0 > > icelake ,avx ,16777343 ,9 ,5 ,1344117.0 ,1411795.0 > > icelake ,sse2 ,4103 ,0 ,64 ,113.2 ,114.6 > > icelake ,sse2 ,4111 ,0 ,3 ,121.5 ,120.4 > > icelake ,sse2 ,4127 ,3 ,0 ,1700.5 ,1771.5 > > icelake ,sse2 ,4159 ,3 ,7 ,119.3 ,118.8 > > icelake ,sse2 ,4223 ,9 ,5 ,1739.7 ,1735.2 > > icelake ,sse2 ,8199 ,0 ,64 ,207.0 ,203.9 > > icelake ,sse2 ,8207 ,0 ,3 ,225.5 ,220.8 > > icelake ,sse2 ,8223 ,3 ,0 ,3444.3 ,3743.5 > > icelake ,sse2 ,8255 ,3 ,7 ,219.9 ,216.8 > > icelake ,sse2 ,8319 ,9 ,5 ,4117.1 ,3487.3 > > icelake ,sse2 ,16391 ,0 ,64 ,397.1 ,394.3 > > icelake ,sse2 ,16399 ,0 ,3 ,439.6 ,428.6 > > icelake 
,sse2 ,16415 ,3 ,0 ,6997.0 ,7031.2 > > icelake ,sse2 ,16447 ,3 ,7 ,426.8 ,421.8 > > icelake ,sse2 ,16511 ,9 ,5 ,7037.6 ,7038.3 > > icelake ,sse2 ,32775 ,0 ,64 ,790.9 ,779.0 > > icelake ,sse2 ,32783 ,0 ,3 ,863.1 ,849.6 > > icelake ,sse2 ,32799 ,3 ,0 ,14043.0 ,14390.9 > > icelake ,sse2 ,32831 ,3 ,7 ,841.6 ,833.1 > > icelake ,sse2 ,32895 ,9 ,5 ,14277.6 ,14344.2 > > icelake ,sse2 ,65543 ,0 ,64 ,1897.0 ,1897.3 > > icelake ,sse2 ,65551 ,0 ,3 ,1927.1 ,1955.4 > > icelake ,sse2 ,65567 ,3 ,0 ,28834.7 ,28727.8 > > icelake ,sse2 ,65599 ,3 ,7 ,1961.4 ,1969.7 > > icelake ,sse2 ,65663 ,9 ,5 ,28867.6 ,29019.8 > > icelake ,sse2 ,131079 ,0 ,64 ,3879.3 ,3872.6 > > icelake ,sse2 ,131087 ,0 ,3 ,3955.3 ,3990.7 > > icelake ,sse2 ,131103 ,3 ,0 ,58001.8 ,60567.9 > > icelake ,sse2 ,131135 ,3 ,7 ,3951.5 ,4002.6 > > icelake ,sse2 ,131199 ,9 ,5 ,57886.7 ,58391.4 > > icelake ,sse2 ,262151 ,0 ,64 ,7851.4 ,7894.7 > > icelake ,sse2 ,262159 ,0 ,3 ,7947.5 ,8016.2 > > icelake ,sse2 ,262175 ,3 ,0 ,115036.2 ,115968.6 > > icelake ,sse2 ,262207 ,3 ,7 ,7883.9 ,7814.1 > > icelake ,sse2 ,262271 ,9 ,5 ,113776.4 ,119733.6 > > icelake ,sse2 ,524295 ,0 ,64 ,17198.1 ,16974.9 > > icelake ,sse2 ,524303 ,0 ,3 ,17402.2 ,17096.3 > > icelake ,sse2 ,524319 ,3 ,0 ,223980.4 ,225889.9 > > icelake ,sse2 ,524351 ,3 ,7 ,17034.9 ,16910.3 > > icelake ,sse2 ,524415 ,9 ,5 ,224027.7 ,224962.5 > > icelake ,sse2 ,1048583 ,0 ,64 ,38822.3 ,39178.6 > > icelake ,sse2 ,1048591 ,0 ,3 ,41686.7 ,40247.4 > > icelake ,sse2 ,1048607 ,3 ,0 ,38814.8 ,39323.3 > > icelake ,sse2 ,1048639 ,3 ,7 ,39568.3 ,41325.7 > > icelake ,sse2 ,1048703 ,9 ,5 ,39354.2 ,39637.9 > > icelake ,sse2 ,2097159 ,0 ,64 ,84074.7 ,84543.1 > > icelake ,sse2 ,2097167 ,0 ,3 ,83665.7 ,82358.2 > > icelake ,sse2 ,2097183 ,3 ,0 ,81817.8 ,79638.9 > > icelake ,sse2 ,2097215 ,3 ,7 ,83649.1 ,83497.6 > > icelake ,sse2 ,2097279 ,9 ,5 ,80287.6 ,79980.9 > > icelake ,sse2 ,4194311 ,0 ,64 ,165409.8 ,168343.1 > > icelake ,sse2 ,4194319 ,0 ,3 ,165216.7 ,177632.0 > > icelake ,sse2 ,4194335 ,3 
,0 ,158718.7 ,160342.2 > > icelake ,sse2 ,4194367 ,3 ,7 ,167944.9 ,167204.4 > > icelake ,sse2 ,4194431 ,9 ,5 ,161530.1 ,164839.7 > > icelake ,sse2 ,8388615 ,0 ,64 ,626504.3 ,629858.5 > > icelake ,sse2 ,8388623 ,0 ,3 ,623969.5 ,631509.1 > > icelake ,sse2 ,8388639 ,3 ,0 ,599366.7 ,600016.0 > > icelake ,sse2 ,8388671 ,3 ,7 ,619964.2 ,619113.2 > > icelake ,sse2 ,8388735 ,9 ,5 ,595338.1 ,604172.4 > > icelake ,sse2 ,16777223 ,0 ,64 ,1709597.0 ,1725184.0 > > icelake ,sse2 ,16777231 ,0 ,3 ,1725452.0 ,1719746.0 > > icelake ,sse2 ,16777247 ,3 ,0 ,1614269.0 ,1607164.0 > > icelake ,sse2 ,16777279 ,3 ,7 ,1705295.0 ,1733018.0 > > icelake ,sse2 ,16777343 ,9 ,5 ,1604197.0 ,1595690.0 > > > > I am having a hard time convincing myself that this patch is really necessary. > What are the geomeans of all the different cases for each processor? N = 100. Geometric mean of Current vs New for bench-memcpy-large. Note that the bench-memmove-large numbers should be unaffected by this patch, as the new logic only applies to the no-overlap case. 
cpu ,inst ,Len ,align1 ,align2 ,new geomean ,cur geomean ,New/Cur icelake ,sse2 ,65543 ,0 ,0 ,5566.1 ,5564.7 ,1.0 icelake ,sse2 ,65551 ,0 ,3 ,5856.4 ,5725.7 ,1.02 icelake ,sse2 ,65567 ,3 ,0 ,5622.8 ,5892.9 ,0.95 icelake ,sse2 ,65599 ,3 ,5 ,5857.3 ,5723.8 ,1.02 icelake ,sse2 ,65536 ,0 ,127 ,5953.3 ,5831.1 ,1.02 icelake ,sse2 ,65536 ,0 ,255 ,5811.7 ,5789.5 ,1.0 icelake ,sse2 ,65536 ,0 ,256 ,5373.5 ,5284.1 ,1.02 icelake ,sse2 ,65536 ,0 ,4064 ,5820.1 ,5761.6 ,1.01 icelake ,sse2 ,131079 ,0 ,0 ,12421.5 ,12424.1 ,1.0 icelake ,sse2 ,131087 ,0 ,3 ,12389.5 ,12276.4 ,1.01 icelake ,sse2 ,131103 ,3 ,0 ,11587.0 ,12607.6 ,0.92 icelake ,sse2 ,131135 ,3 ,5 ,11596.9 ,11896.2 ,0.97 icelake ,sse2 ,131072 ,0 ,127 ,11746.4 ,12490.1 ,0.94 icelake ,sse2 ,131072 ,0 ,255 ,11486.8 ,11831.7 ,0.97 icelake ,sse2 ,131072 ,0 ,256 ,10453.5 ,10451.7 ,1.0 icelake ,sse2 ,131072 ,0 ,4064 ,11231.7 ,11223.6 ,1.0 icelake ,sse2 ,262151 ,0 ,0 ,29408.5 ,30831.2 ,0.95 icelake ,sse2 ,262159 ,0 ,3 ,30813.6 ,32235.6 ,0.96 icelake ,sse2 ,262175 ,3 ,0 ,30245.0 ,31392.5 ,0.96 icelake ,sse2 ,262207 ,3 ,5 ,30775.6 ,32298.6 ,0.95 icelake ,sse2 ,262144 ,0 ,127 ,31784.7 ,32791.5 ,0.97 icelake ,sse2 ,262144 ,0 ,255 ,30726.0 ,31997.5 ,0.96 icelake ,sse2 ,262144 ,0 ,256 ,28418.9 ,29440.9 ,0.97 icelake ,sse2 ,262144 ,0 ,4064 ,29984.1 ,31048.9 ,0.97 icelake ,sse2 ,524295 ,0 ,0 ,76079.0 ,75752.0 ,1.0 icelake ,sse2 ,524303 ,0 ,3 ,79939.3 ,80796.4 ,0.99 icelake ,sse2 ,524319 ,3 ,0 ,79018.1 ,79928.5 ,0.99 icelake ,sse2 ,524351 ,3 ,5 ,81219.4 ,81053.8 ,1.0 icelake ,sse2 ,524288 ,0 ,127 ,80111.8 ,80087.2 ,1.0 icelake ,sse2 ,524288 ,0 ,255 ,79334.0 ,79525.6 ,1.0 icelake ,sse2 ,524288 ,0 ,256 ,75766.9 ,75918.9 ,1.0 icelake ,sse2 ,524288 ,0 ,4064 ,78907.9 ,79550.8 ,0.99 icelake ,sse2 ,1048583 ,0 ,0 ,144672.6 ,147457.7 ,0.98 icelake ,sse2 ,1048591 ,0 ,3 ,173803.9 ,400563.2 ,0.43 icelake ,sse2 ,1048607 ,3 ,0 ,149391.9 ,151772.1 ,0.98 icelake ,sse2 ,1048639 ,3 ,5 ,174774.1 ,400657.4 ,0.44 icelake ,sse2 ,1048576 ,0 ,127 ,175350.9 
,347110.6 ,0.51 icelake ,sse2 ,1048576 ,0 ,255 ,150152.6 ,144242.9 ,1.04 icelake ,sse2 ,1048576 ,0 ,256 ,145869.7 ,147489.6 ,0.99 icelake ,sse2 ,1048576 ,0 ,4064 ,145814.7 ,147497.7 ,0.99 icelake ,sse2 ,2097159 ,0 ,0 ,289460.6 ,295574.6 ,0.98 icelake ,sse2 ,2097167 ,0 ,3 ,347057.0 ,799549.1 ,0.43 icelake ,sse2 ,2097183 ,3 ,0 ,298565.7 ,301424.3 ,0.99 icelake ,sse2 ,2097215 ,3 ,5 ,348620.4 ,797557.4 ,0.44 icelake ,sse2 ,2097152 ,0 ,127 ,348751.4 ,695260.9 ,0.5 icelake ,sse2 ,2097152 ,0 ,255 ,298960.5 ,286590.0 ,1.04 icelake ,sse2 ,2097152 ,0 ,256 ,290978.4 ,293225.6 ,0.99 icelake ,sse2 ,2097152 ,0 ,4064 ,290476.0 ,292283.2 ,0.99 icelake ,sse2 ,4194311 ,0 ,0 ,583386.3 ,588284.3 ,0.99 icelake ,sse2 ,4194319 ,0 ,3 ,703870.5 ,1595268.0 ,0.44 icelake ,sse2 ,4194335 ,3 ,0 ,599400.2 ,601591.6 ,1.0 icelake ,sse2 ,4194367 ,3 ,5 ,694569.7 ,1595608.0 ,0.44 icelake ,sse2 ,4194304 ,0 ,127 ,700229.1 ,1389061.9 ,0.5 icelake ,sse2 ,4194304 ,0 ,255 ,600779.0 ,573361.2 ,1.05 icelake ,sse2 ,4194304 ,0 ,256 ,586610.7 ,589269.6 ,1.0 icelake ,sse2 ,4194304 ,0 ,4064 ,583616.3 ,584806.4 ,1.0 icelake ,sse2 ,8388615 ,0 ,0 ,1214632.8 ,1266616.0 ,0.96 icelake ,sse2 ,8388623 ,0 ,3 ,1405136.9 ,3198827.1 ,0.44 icelake ,sse2 ,8388639 ,3 ,0 ,1244302.6 ,1297425.9 ,0.96 icelake ,sse2 ,8388671 ,3 ,5 ,1404685.1 ,3196389.9 ,0.44 icelake ,sse2 ,8388608 ,0 ,127 ,1419888.5 ,2792729.4 ,0.51 icelake ,sse2 ,8388608 ,0 ,255 ,1249044.6 ,1259726.7 ,0.99 icelake ,sse2 ,8388608 ,0 ,256 ,1234471.9 ,1300463.6 ,0.95 icelake ,sse2 ,8388608 ,0 ,4064 ,1220102.2 ,1265190.5 ,0.96 icelake ,sse2 ,16777223 ,0 ,0 ,2689516.3 ,2846521.1 ,0.94 icelake ,sse2 ,16777231 ,0 ,3 ,3001317.4 ,6428733.7 ,0.47 icelake ,sse2 ,16777247 ,3 ,0 ,2770040.8 ,2910434.9 ,0.95 icelake ,sse2 ,16777279 ,3 ,5 ,3002076.1 ,6415835.9 ,0.47 icelake ,sse2 ,16777216 ,0 ,127 ,3063786.3 ,5609895.3 ,0.55 icelake ,sse2 ,16777216 ,0 ,255 ,2821606.1 ,2833843.6 ,1.0 icelake ,sse2 ,16777216 ,0 ,256 ,2719765.5 ,2925344.2 ,0.93 icelake ,sse2 ,16777216 ,0 ,4064 
,2686189.2 ,2848017.5 ,0.94 icelake ,sse2 ,33554439 ,0 ,0 ,5577945.0 ,5913674.6 ,0.94 icelake ,sse2 ,33554447 ,0 ,3 ,6152758.8 ,12863855.0 ,0.48 icelake ,sse2 ,33554463 ,3 ,0 ,5773351.4 ,6035289.3 ,0.96 icelake ,sse2 ,33554495 ,3 ,5 ,6160006.2 ,12878153.9 ,0.48 icelake ,sse2 ,33554432 ,0 ,127 ,6303495.4 ,11221070.2 ,0.56 icelake ,sse2 ,33554432 ,0 ,255 ,5830879.6 ,5944978.6 ,0.98 icelake ,sse2 ,33554432 ,0 ,256 ,5611968.2 ,6068255.4 ,0.92 icelake ,sse2 ,33554432 ,0 ,4064 ,5570321.0 ,5964542.6 ,0.93 icelake ,avx ,65543 ,0 ,0 ,5561.1 ,5659.7 ,0.98 icelake ,avx ,65551 ,0 ,3 ,5859.9 ,5724.8 ,1.02 icelake ,avx ,65567 ,3 ,0 ,5636.7 ,5623.3 ,1.0 icelake ,avx ,65599 ,3 ,5 ,5856.3 ,5720.2 ,1.02 icelake ,avx ,65536 ,0 ,127 ,6011.1 ,5910.0 ,1.02 icelake ,avx ,65536 ,0 ,255 ,5854.5 ,5792.3 ,1.01 icelake ,avx ,65536 ,0 ,256 ,5213.0 ,5273.9 ,0.99 icelake ,avx ,65536 ,0 ,4064 ,5760.7 ,5661.1 ,1.02 icelake ,avx ,131079 ,0 ,0 ,12371.4 ,12707.0 ,0.97 icelake ,avx ,131087 ,0 ,3 ,13220.1 ,12515.7 ,1.06 icelake ,avx ,131103 ,3 ,0 ,11628.2 ,11546.9 ,1.01 icelake ,avx ,131135 ,3 ,5 ,13025.7 ,13967.6 ,0.93 icelake ,avx ,131072 ,0 ,127 ,11781.7 ,11936.4 ,0.99 icelake ,avx ,131072 ,0 ,255 ,11802.2 ,11583.9 ,1.02 icelake ,avx ,131072 ,0 ,256 ,10436.9 ,10693.1 ,0.98 icelake ,avx ,131072 ,0 ,4064 ,11880.9 ,11395.6 ,1.04 icelake ,avx ,262151 ,0 ,0 ,29132.6 ,30542.8 ,0.95 icelake ,avx ,262159 ,0 ,3 ,30533.5 ,31468.8 ,0.97 icelake ,avx ,262175 ,3 ,0 ,29879.5 ,30933.7 ,0.97 icelake ,avx ,262207 ,3 ,5 ,30263.1 ,31445.0 ,0.96 icelake ,avx ,262144 ,0 ,127 ,30180.9 ,31405.3 ,0.96 icelake ,avx ,262144 ,0 ,255 ,30152.9 ,31372.5 ,0.96 icelake ,avx ,262144 ,0 ,256 ,28121.9 ,28990.9 ,0.97 icelake ,avx ,262144 ,0 ,4064 ,29785.2 ,31078.4 ,0.96 icelake ,avx ,524295 ,0 ,0 ,76045.7 ,75824.3 ,1.0 icelake ,avx ,524303 ,0 ,3 ,79303.7 ,80433.3 ,0.99 icelake ,avx ,524319 ,3 ,0 ,79323.8 ,79411.3 ,1.0 icelake ,avx ,524351 ,3 ,5 ,79797.9 ,80179.4 ,1.0 icelake ,avx ,524288 ,0 ,127 ,80046.7 ,80254.1 ,1.0 icelake ,avx 
,524288 ,0 ,255 ,78580.6 ,79210.4 ,0.99 icelake ,avx ,524288 ,0 ,256 ,75464.4 ,75184.2 ,1.0 icelake ,avx ,524288 ,0 ,4064 ,78863.6 ,78677.9 ,1.0 icelake ,avx ,1048583 ,0 ,0 ,131017.9 ,133962.4 ,0.98 icelake ,avx ,1048591 ,0 ,3 ,143451.3 ,210311.7 ,0.68 icelake ,avx ,1048607 ,3 ,0 ,136944.0 ,138426.4 ,0.99 icelake ,avx ,1048639 ,3 ,5 ,143594.3 ,209887.9 ,0.68 icelake ,avx ,1048576 ,0 ,127 ,156462.0 ,218873.2 ,0.71 icelake ,avx ,1048576 ,0 ,255 ,148026.3 ,179419.0 ,0.83 icelake ,avx ,1048576 ,0 ,256 ,143365.7 ,137816.3 ,1.04 icelake ,avx ,1048576 ,0 ,4064 ,131683.4 ,132731.6 ,0.99 icelake ,avx ,2097159 ,0 ,0 ,263807.1 ,267984.5 ,0.98 icelake ,avx ,2097167 ,0 ,3 ,286949.8 ,422279.2 ,0.68 icelake ,avx ,2097183 ,3 ,0 ,274675.6 ,276702.2 ,0.99 icelake ,avx ,2097215 ,3 ,5 ,286681.7 ,420176.7 ,0.68 icelake ,avx ,2097152 ,0 ,127 ,314499.2 ,437864.2 ,0.72 icelake ,avx ,2097152 ,0 ,255 ,297458.4 ,359520.9 ,0.83 icelake ,avx ,2097152 ,0 ,256 ,285883.2 ,276043.2 ,1.04 icelake ,avx ,2097152 ,0 ,4064 ,263436.6 ,265516.6 ,0.99 icelake ,avx ,4194311 ,0 ,0 ,529119.4 ,536745.2 ,0.99 icelake ,avx ,4194319 ,0 ,3 ,573960.0 ,839002.3 ,0.68 icelake ,avx ,4194335 ,3 ,0 ,550617.2 ,553117.5 ,1.0 icelake ,avx ,4194367 ,3 ,5 ,572742.8 ,838784.5 ,0.68 icelake ,avx ,4194304 ,0 ,127 ,629413.6 ,876512.1 ,0.72 icelake ,avx ,4194304 ,0 ,255 ,594224.1 ,717425.1 ,0.83 icelake ,avx ,4194304 ,0 ,256 ,573365.0 ,552538.3 ,1.04 icelake ,avx ,4194304 ,0 ,4064 ,527459.3 ,531907.1 ,0.99 icelake ,avx ,8388615 ,0 ,0 ,1094256.8 ,1145619.9 ,0.96 icelake ,avx ,8388623 ,0 ,3 ,1170367.1 ,1700076.4 ,0.69 icelake ,avx ,8388639 ,3 ,0 ,1136168.1 ,1174752.4 ,0.97 icelake ,avx ,8388671 ,3 ,5 ,1172015.6 ,1703032.8 ,0.69 icelake ,avx ,8388608 ,0 ,127 ,1276748.6 ,1771351.9 ,0.72 icelake ,avx ,8388608 ,0 ,255 ,1207712.0 ,1449267.0 ,0.83 icelake ,avx ,8388608 ,0 ,256 ,1167958.9 ,1178243.1 ,0.99 icelake ,avx ,8388608 ,0 ,4064 ,1106155.9 ,1145128.6 ,0.97 icelake ,avx ,16777223 ,0 ,0 ,2479317.5 ,2630301.0 ,0.94 icelake ,avx 
,16777231 ,0 ,3 ,2643303.6 ,3536980.7 ,0.75 icelake ,avx ,16777247 ,3 ,0 ,2571967.0 ,2672246.4 ,0.96 icelake ,avx ,16777279 ,3 ,5 ,2641320.5 ,3538388.9 ,0.75 icelake ,avx ,16777216 ,0 ,127 ,2832921.6 ,3593702.5 ,0.79 icelake ,avx ,16777216 ,0 ,255 ,2700272.1 ,3025346.1 ,0.89 icelake ,avx ,16777216 ,0 ,256 ,2622133.7 ,2709087.6 ,0.97 icelake ,avx ,16777216 ,0 ,4064 ,2475020.7 ,2610977.8 ,0.95 icelake ,avx ,33554439 ,0 ,0 ,5190103.1 ,5576047.9 ,0.93 icelake ,avx ,33554447 ,0 ,3 ,5477752.1 ,7215479.2 ,0.76 icelake ,avx ,33554463 ,3 ,0 ,5338711.7 ,5625026.7 ,0.95 icelake ,avx ,33554495 ,3 ,5 ,5505164.8 ,7223660.8 ,0.76 icelake ,avx ,33554432 ,0 ,127 ,5859232.3 ,7279581.9 ,0.8 icelake ,avx ,33554432 ,0 ,255 ,5681634.7 ,6156488.6 ,0.92 icelake ,avx ,33554432 ,0 ,256 ,5440721.4 ,5728347.4 ,0.95 icelake ,avx ,33554432 ,0 ,4064 ,5191213.2 ,5538716.4 ,0.94 icelake ,avx512 ,65543 ,0 ,0 ,5563.5 ,5634.1 ,0.99 icelake ,avx512 ,65551 ,0 ,3 ,5864.1 ,5728.4 ,1.02 icelake ,avx512 ,65567 ,3 ,0 ,5720.2 ,5625.3 ,1.02 icelake ,avx512 ,65599 ,3 ,5 ,5857.2 ,5722.0 ,1.02 icelake ,avx512 ,65536 ,0 ,127 ,6040.7 ,5844.0 ,1.03 icelake ,avx512 ,65536 ,0 ,255 ,5826.5 ,5799.6 ,1.0 icelake ,avx512 ,65536 ,0 ,256 ,5234.4 ,5230.0 ,1.0 icelake ,avx512 ,65536 ,0 ,4064 ,5800.7 ,5655.4 ,1.03 icelake ,avx512 ,131079 ,0 ,0 ,12591.4 ,11767.1 ,1.07 icelake ,avx512 ,131087 ,0 ,3 ,12694.9 ,12292.1 ,1.03 icelake ,avx512 ,131103 ,3 ,0 ,11374.7 ,12236.3 ,0.93 icelake ,avx512 ,131135 ,3 ,5 ,11958.2 ,11745.5 ,1.02 icelake ,avx512 ,131072 ,0 ,127 ,11803.4 ,11908.6 ,0.99 icelake ,avx512 ,131072 ,0 ,255 ,11569.0 ,11487.9 ,1.01 icelake ,avx512 ,131072 ,0 ,256 ,11087.6 ,10456.4 ,1.06 icelake ,avx512 ,131072 ,0 ,4064 ,11166.0 ,11248.2 ,0.99 icelake ,avx512 ,262151 ,0 ,0 ,30232.1 ,29932.7 ,1.01 icelake ,avx512 ,262159 ,0 ,3 ,30093.8 ,31315.1 ,0.96 icelake ,avx512 ,262175 ,3 ,0 ,30147.7 ,30643.4 ,0.98 icelake ,avx512 ,262207 ,3 ,5 ,29985.9 ,31479.8 ,0.95 icelake ,avx512 ,262144 ,0 ,127 ,30099.7 ,31552.9 ,0.95 icelake 
,avx512 ,262144 ,0 ,255 ,29772.8 ,30698.1 ,0.97 icelake ,avx512 ,262144 ,0 ,256 ,28109.3 ,28957.9 ,0.97 icelake ,avx512 ,262144 ,0 ,4064 ,29787.5 ,30637.2 ,0.97 icelake ,avx512 ,524295 ,0 ,0 ,75920.7 ,75047.1 ,1.01 icelake ,avx512 ,524303 ,0 ,3 ,79218.6 ,79529.2 ,1.0 icelake ,avx512 ,524319 ,3 ,0 ,78446.9 ,78550.7 ,1.0 icelake ,avx512 ,524351 ,3 ,5 ,79055.0 ,79425.2 ,1.0 icelake ,avx512 ,524288 ,0 ,127 ,79070.6 ,79626.7 ,0.99 icelake ,avx512 ,524288 ,0 ,255 ,77891.8 ,78078.3 ,1.0 icelake ,avx512 ,524288 ,0 ,256 ,74797.3 ,74436.9 ,1.0 icelake ,avx512 ,524288 ,0 ,4064 ,78339.3 ,78337.2 ,1.0 icelake ,avx512 ,1048583 ,0 ,0 ,131427.6 ,133891.3 ,0.98 icelake ,avx512 ,1048591 ,0 ,3 ,143984.1 ,142003.7 ,1.01 icelake ,avx512 ,1048607 ,3 ,0 ,137547.9 ,134450.1 ,1.02 icelake ,avx512 ,1048639 ,3 ,5 ,144630.4 ,142174.6 ,1.02 icelake ,avx512 ,1048576 ,0 ,127 ,149810.7 ,142684.9 ,1.05 icelake ,avx512 ,1048576 ,0 ,255 ,156212.6 ,143509.2 ,1.09 icelake ,avx512 ,1048576 ,0 ,256 ,153776.9 ,139788.0 ,1.1 icelake ,avx512 ,1048576 ,0 ,4064 ,137926.6 ,134832.8 ,1.02 icelake ,avx512 ,2097159 ,0 ,0 ,263465.3 ,267681.6 ,0.98 icelake ,avx512 ,2097167 ,0 ,3 ,288947.7 ,284129.9 ,1.02 icelake ,avx512 ,2097183 ,3 ,0 ,275395.5 ,269216.0 ,1.02 icelake ,avx512 ,2097215 ,3 ,5 ,289131.5 ,284475.3 ,1.02 icelake ,avx512 ,2097152 ,0 ,127 ,299404.5 ,286193.2 ,1.05 icelake ,avx512 ,2097152 ,0 ,255 ,312913.2 ,286785.6 ,1.09 icelake ,avx512 ,2097152 ,0 ,256 ,307882.7 ,279708.7 ,1.1 icelake ,avx512 ,2097152 ,0 ,4064 ,275552.3 ,269867.0 ,1.02 icelake ,avx512 ,4194311 ,0 ,0 ,526480.1 ,536038.9 ,0.98 icelake ,avx512 ,4194319 ,0 ,3 ,579122.9 ,569512.5 ,1.02 icelake ,avx512 ,4194335 ,3 ,0 ,551658.1 ,542973.3 ,1.02 icelake ,avx512 ,4194367 ,3 ,5 ,578575.2 ,569497.2 ,1.02 icelake ,avx512 ,4194304 ,0 ,127 ,599943.6 ,569138.2 ,1.05 icelake ,avx512 ,4194304 ,0 ,255 ,628419.2 ,575908.4 ,1.09 icelake ,avx512 ,4194304 ,0 ,256 ,617242.8 ,561417.7 ,1.1 icelake ,avx512 ,4194304 ,0 ,4064 ,552012.3 ,540617.2 ,1.02 icelake 
,avx512 ,8388615 ,0 ,0 ,1092471.4 ,1133834.9 ,0.96 icelake ,avx512 ,8388623 ,0 ,3 ,1185623.5 ,1218150.0 ,0.97 icelake ,avx512 ,8388639 ,3 ,0 ,1142647.1 ,1139201.6 ,1.0 icelake ,avx512 ,8388671 ,3 ,5 ,1183702.5 ,1225474.6 ,0.97 icelake ,avx512 ,8388608 ,0 ,127 ,1231862.8 ,1221685.1 ,1.01 icelake ,avx512 ,8388608 ,0 ,255 ,1290816.7 ,1221576.2 ,1.06 icelake ,avx512 ,8388608 ,0 ,256 ,1299047.6 ,1195021.2 ,1.09 icelake ,avx512 ,8388608 ,0 ,4064 ,1139648.9 ,1140113.0 ,1.0 icelake ,avx512 ,16777223 ,0 ,0 ,2464861.2 ,2599120.4 ,0.95 icelake ,avx512 ,16777231 ,0 ,3 ,2651029.7 ,2758867.1 ,0.96 icelake ,avx512 ,16777247 ,3 ,0 ,2570099.8 ,2601099.4 ,0.99 icelake ,avx512 ,16777279 ,3 ,5 ,2660529.4 ,2762598.6 ,0.96 icelake ,avx512 ,16777216 ,0 ,127 ,2759531.7 ,2756811.1 ,1.0 icelake ,avx512 ,16777216 ,0 ,255 ,2878568.5 ,2777650.3 ,1.04 icelake ,avx512 ,16777216 ,0 ,256 ,2931879.3 ,2709687.7 ,1.08 icelake ,avx512 ,16777216 ,0 ,4064 ,2587161.1 ,2632011.2 ,0.98 icelake ,avx512 ,33554439 ,0 ,0 ,5175406.0 ,5528857.2 ,0.94 icelake ,avx512 ,33554447 ,0 ,3 ,5537561.9 ,5818119.1 ,0.95 icelake ,avx512 ,33554463 ,3 ,0 ,5435099.5 ,5560442.2 ,0.98 icelake ,avx512 ,33554495 ,3 ,5 ,5546314.9 ,5800995.0 ,0.96 icelake ,avx512 ,33554432 ,0 ,127 ,5770248.0 ,5781104.9 ,1.0 icelake ,avx512 ,33554432 ,0 ,255 ,6019120.7 ,5836023.3 ,1.03 icelake ,avx512 ,33554432 ,0 ,256 ,6107033.4 ,5681798.8 ,1.07 icelake ,avx512 ,33554432 ,0 ,4064 ,5356238.5 ,5598521.5 ,0.96 skylake ,sse2 ,65543 ,0 ,0 ,3091.4 ,2940.2 ,1.05 skylake ,sse2 ,65551 ,0 ,3 ,3682.6 ,3403.7 ,1.08 skylake ,sse2 ,65567 ,3 ,0 ,3031.3 ,3070.2 ,0.99 skylake ,sse2 ,65599 ,3 ,5 ,3731.2 ,3718.7 ,1.0 skylake ,sse2 ,65536 ,0 ,127 ,3642.3 ,3390.5 ,1.07 skylake ,sse2 ,65536 ,0 ,255 ,3493.9 ,3333.0 ,1.05 skylake ,sse2 ,65536 ,0 ,256 ,3043.2 ,2981.0 ,1.02 skylake ,sse2 ,65536 ,0 ,4064 ,2796.6 ,2843.9 ,0.98 skylake ,sse2 ,131079 ,0 ,0 ,6347.4 ,6309.8 ,1.01 skylake ,sse2 ,131087 ,0 ,3 ,7318.4 ,7486.2 ,0.98 skylake ,sse2 ,131103 ,3 ,0 ,6297.4 ,6516.8 ,0.97 
skylake ,sse2 ,131135 ,3 ,5 ,7544.5 ,7823.5 ,0.96 skylake ,sse2 ,131072 ,0 ,127 ,7426.4 ,7554.3 ,0.98 skylake ,sse2 ,131072 ,0 ,255 ,7349.0 ,7195.4 ,1.02 skylake ,sse2 ,131072 ,0 ,256 ,7068.1 ,6804.8 ,1.04 skylake ,sse2 ,131072 ,0 ,4064 ,6884.6 ,7566.7 ,0.91 skylake ,sse2 ,262151 ,0 ,0 ,15848.1 ,15552.2 ,1.02 skylake ,sse2 ,262159 ,0 ,3 ,17864.6 ,16787.9 ,1.06 skylake ,sse2 ,262175 ,3 ,0 ,15748.1 ,16266.0 ,0.97 skylake ,sse2 ,262207 ,3 ,5 ,17022.3 ,17229.8 ,0.99 skylake ,sse2 ,262144 ,0 ,127 ,16158.7 ,16093.6 ,1.0 skylake ,sse2 ,262144 ,0 ,255 ,15670.7 ,15949.2 ,0.98 skylake ,sse2 ,262144 ,0 ,256 ,14806.3 ,14970.3 ,0.99 skylake ,sse2 ,262144 ,0 ,4064 ,14751.7 ,15008.2 ,0.98 skylake ,sse2 ,524295 ,0 ,0 ,32874.8 ,33731.2 ,0.97 skylake ,sse2 ,524303 ,0 ,3 ,34035.1 ,34777.8 ,0.98 skylake ,sse2 ,524319 ,3 ,0 ,34325.6 ,34108.9 ,1.01 skylake ,sse2 ,524351 ,3 ,5 ,34853.5 ,35624.4 ,0.98 skylake ,sse2 ,524288 ,0 ,127 ,33437.4 ,33816.7 ,0.99 skylake ,sse2 ,524288 ,0 ,255 ,33256.1 ,33664.7 ,0.99 skylake ,sse2 ,524288 ,0 ,256 ,32006.3 ,32396.3 ,0.99 skylake ,sse2 ,524288 ,0 ,4064 ,32284.7 ,32713.9 ,0.99 skylake ,sse2 ,1048583 ,0 ,0 ,71891.7 ,73858.4 ,0.97 skylake ,sse2 ,1048591 ,0 ,3 ,74621.3 ,74389.7 ,1.0 skylake ,sse2 ,1048607 ,3 ,0 ,72515.0 ,73573.2 ,0.99 skylake ,sse2 ,1048639 ,3 ,5 ,72471.7 ,73782.6 ,0.98 skylake ,sse2 ,1048576 ,0 ,127 ,77638.6 ,82474.6 ,0.94 skylake ,sse2 ,1048576 ,0 ,255 ,71870.0 ,71933.6 ,1.0 skylake ,sse2 ,1048576 ,0 ,256 ,70410.0 ,73243.6 ,0.96 skylake ,sse2 ,1048576 ,0 ,4064 ,71267.1 ,72274.6 ,0.99 skylake ,sse2 ,2097159 ,0 ,0 ,140052.6 ,144880.1 ,0.97 skylake ,sse2 ,2097167 ,0 ,3 ,146626.5 ,147972.6 ,0.99 skylake ,sse2 ,2097183 ,3 ,0 ,141750.1 ,146353.6 ,0.97 skylake ,sse2 ,2097215 ,3 ,5 ,144169.0 ,148120.1 ,0.97 skylake ,sse2 ,2097152 ,0 ,127 ,156575.9 ,165844.4 ,0.94 skylake ,sse2 ,2097152 ,0 ,255 ,144277.7 ,146971.5 ,0.98 skylake ,sse2 ,2097152 ,0 ,256 ,143047.4 ,146810.9 ,0.97 skylake ,sse2 ,2097152 ,0 ,4064 ,142795.6 ,145805.8 ,0.98 skylake 
,sse2 ,4194311 ,0 ,0 ,284353.3 ,298092.5 ,0.95 skylake ,sse2 ,4194319 ,0 ,3 ,296656.4 ,311960.2 ,0.95 skylake ,sse2 ,4194335 ,3 ,0 ,285922.6 ,304100.5 ,0.94 skylake ,sse2 ,4194367 ,3 ,5 ,297135.4 ,312532.5 ,0.95 skylake ,sse2 ,4194304 ,0 ,127 ,323938.6 ,340414.3 ,0.95 skylake ,sse2 ,4194304 ,0 ,255 ,301460.9 ,310042.7 ,0.97 skylake ,sse2 ,4194304 ,0 ,256 ,287155.8 ,303580.6 ,0.95 skylake ,sse2 ,4194304 ,0 ,4064 ,291006.2 ,302441.3 ,0.96 skylake ,sse2 ,8388615 ,0 ,0 ,714424.7 ,747484.3 ,0.96 skylake ,sse2 ,8388623 ,0 ,3 ,748995.5 ,774116.5 ,0.97 skylake ,sse2 ,8388639 ,3 ,0 ,720563.4 ,757386.9 ,0.95 skylake ,sse2 ,8388671 ,3 ,5 ,748028.7 ,773907.8 ,0.97 skylake ,sse2 ,8388608 ,0 ,127 ,750775.3 ,780245.2 ,0.96 skylake ,sse2 ,8388608 ,0 ,255 ,724940.3 ,764197.8 ,0.95 skylake ,sse2 ,8388608 ,0 ,256 ,722035.0 ,759408.9 ,0.95 skylake ,sse2 ,8388608 ,0 ,4064 ,756977.8 ,755532.4 ,1.0 skylake ,sse2 ,16777223 ,0 ,0 ,1971686.0 ,2111263.4 ,0.93 skylake ,sse2 ,16777231 ,0 ,3 ,1953608.9 ,2128493.8 ,0.92 skylake ,sse2 ,16777247 ,3 ,0 ,1967075.6 ,2103772.3 ,0.94 skylake ,sse2 ,16777279 ,3 ,5 ,1950851.6 ,2133601.6 ,0.91 skylake ,sse2 ,16777216 ,0 ,127 ,1991168.2 ,2078249.3 ,0.96 skylake ,sse2 ,16777216 ,0 ,255 ,1958502.9 ,2111955.5 ,0.93 skylake ,sse2 ,16777216 ,0 ,256 ,1965103.7 ,2114293.0 ,0.93 skylake ,sse2 ,16777216 ,0 ,4064 ,1958381.3 ,2103438.6 ,0.93 skylake ,sse2 ,33554439 ,0 ,0 ,4456144.2 ,4660837.1 ,0.96 skylake ,sse2 ,33554447 ,0 ,3 ,4431097.0 ,4679042.6 ,0.95 skylake ,sse2 ,33554463 ,3 ,0 ,4448225.6 ,4648538.3 ,0.96 skylake ,sse2 ,33554495 ,3 ,5 ,4427743.0 ,4678340.1 ,0.95 skylake ,sse2 ,33554432 ,0 ,127 ,4437517.3 ,4552005.9 ,0.97 skylake ,sse2 ,33554432 ,0 ,255 ,4427135.1 ,4543412.0 ,0.97 skylake ,sse2 ,33554432 ,0 ,256 ,4441311.2 ,4658315.5 ,0.95 skylake ,sse2 ,33554432 ,0 ,4064 ,4429798.4 ,4659499.6 ,0.95 skylake ,avx ,65543 ,0 ,0 ,3115.8 ,3043.7 ,1.02 skylake ,avx ,65551 ,0 ,3 ,3673.2 ,3551.7 ,1.03 skylake ,avx ,65567 ,3 ,0 ,3024.6 ,2887.4 ,1.05 skylake ,avx ,65599 
,3 ,5 ,3907.8 ,3636.4 ,1.07 skylake ,avx ,65536 ,0 ,127 ,3539.2 ,3372.3 ,1.05 skylake ,avx ,65536 ,0 ,255 ,3489.9 ,3344.0 ,1.04 skylake ,avx ,65536 ,0 ,256 ,3059.0 ,2924.4 ,1.05 skylake ,avx ,65536 ,0 ,4064 ,2805.0 ,2869.3 ,0.98 skylake ,avx ,131079 ,0 ,0 ,6129.2 ,6263.4 ,0.98 skylake ,avx ,131087 ,0 ,3 ,7096.8 ,7570.0 ,0.94 skylake ,avx ,131103 ,3 ,0 ,6394.5 ,6842.5 ,0.93 skylake ,avx ,131135 ,3 ,5 ,7462.8 ,7776.0 ,0.96 skylake ,avx ,131072 ,0 ,127 ,7726.9 ,7428.5 ,1.04 skylake ,avx ,131072 ,0 ,255 ,7167.4 ,7278.9 ,0.98 skylake ,avx ,131072 ,0 ,256 ,7197.9 ,6284.3 ,1.15 skylake ,avx ,131072 ,0 ,4064 ,6984.0 ,6940.4 ,1.01 skylake ,avx ,262151 ,0 ,0 ,15787.3 ,16403.1 ,0.96 skylake ,avx ,262159 ,0 ,3 ,17800.1 ,17628.1 ,1.01 skylake ,avx ,262175 ,3 ,0 ,16622.8 ,16244.3 ,1.02 skylake ,avx ,262207 ,3 ,5 ,16989.7 ,17509.0 ,0.97 skylake ,avx ,262144 ,0 ,127 ,16190.8 ,15971.8 ,1.01 skylake ,avx ,262144 ,0 ,255 ,15787.1 ,15876.7 ,0.99 skylake ,avx ,262144 ,0 ,256 ,14840.1 ,14997.0 ,0.99 skylake ,avx ,262144 ,0 ,4064 ,15743.0 ,14976.2 ,1.05 skylake ,avx ,524295 ,0 ,0 ,32848.5 ,33397.8 ,0.98 skylake ,avx ,524303 ,0 ,3 ,34872.1 ,34862.2 ,1.0 skylake ,avx ,524319 ,3 ,0 ,33784.6 ,34023.8 ,0.99 skylake ,avx ,524351 ,3 ,5 ,35337.1 ,35364.5 ,1.0 skylake ,avx ,524288 ,0 ,127 ,33624.5 ,33596.5 ,1.0 skylake ,avx ,524288 ,0 ,255 ,33390.7 ,33842.8 ,0.99 skylake ,avx ,524288 ,0 ,256 ,31937.0 ,32357.2 ,0.99 skylake ,avx ,524288 ,0 ,4064 ,32233.5 ,32267.3 ,1.0 skylake ,avx ,1048583 ,0 ,0 ,100354.7 ,105840.6 ,0.95 skylake ,avx ,1048591 ,0 ,3 ,68102.5 ,67496.0 ,1.01 skylake ,avx ,1048607 ,3 ,0 ,66146.1 ,67540.0 ,0.98 skylake ,avx ,1048639 ,3 ,5 ,67530.8 ,67726.4 ,1.0 skylake ,avx ,1048576 ,0 ,127 ,67105.6 ,66533.5 ,1.01 skylake ,avx ,1048576 ,0 ,255 ,67101.8 ,65666.7 ,1.02 skylake ,avx ,1048576 ,0 ,256 ,65092.6 ,67103.0 ,0.97 skylake ,avx ,1048576 ,0 ,4064 ,65700.0 ,67031.5 ,0.98 skylake ,avx ,2097159 ,0 ,0 ,133101.0 ,135171.6 ,0.98 skylake ,avx ,2097167 ,0 ,3 ,134174.4 ,135782.1 ,0.99 
skylake ,avx ,2097183 ,3 ,0 ,132056.4 ,134170.0 ,0.98 skylake ,avx ,2097215 ,3 ,5 ,134413.5 ,136341.1 ,0.99 skylake ,avx ,2097152 ,0 ,127 ,133003.9 ,132992.1 ,1.0 skylake ,avx ,2097152 ,0 ,255 ,133344.3 ,132883.1 ,1.0 skylake ,avx ,2097152 ,0 ,256 ,134051.7 ,136185.8 ,0.98 skylake ,avx ,2097152 ,0 ,4064 ,132976.3 ,135029.4 ,0.98 skylake ,avx ,4194311 ,0 ,0 ,268004.1 ,282650.3 ,0.95 skylake ,avx ,4194319 ,0 ,3 ,270270.0 ,286700.3 ,0.94 skylake ,avx ,4194335 ,3 ,0 ,264288.5 ,279582.4 ,0.95 skylake ,avx ,4194367 ,3 ,5 ,270498.4 ,286294.5 ,0.94 skylake ,avx ,4194304 ,0 ,127 ,271219.3 ,275129.8 ,0.99 skylake ,avx ,4194304 ,0 ,255 ,269996.5 ,270227.6 ,1.0 skylake ,avx ,4194304 ,0 ,256 ,267901.1 ,281673.1 ,0.95 skylake ,avx ,4194304 ,0 ,4064 ,268390.0 ,279100.3 ,0.96 skylake ,avx ,8388615 ,0 ,0 ,803547.9 ,813229.9 ,0.99 skylake ,avx ,8388623 ,0 ,3 ,828872.4 ,869413.0 ,0.95 skylake ,avx ,8388639 ,3 ,0 ,818000.0 ,873781.7 ,0.94 skylake ,avx ,8388671 ,3 ,5 ,824679.0 ,863561.5 ,0.95 skylake ,avx ,8388608 ,0 ,127 ,800728.5 ,779000.8 ,1.03 skylake ,avx ,8388608 ,0 ,255 ,820071.4 ,770113.2 ,1.06 skylake ,avx ,8388608 ,0 ,256 ,825624.6 ,867247.7 ,0.95 skylake ,avx ,8388608 ,0 ,4064 ,830209.7 ,894086.6 ,0.93 skylake ,avx ,16777223 ,0 ,0 ,1989391.3 ,2132829.8 ,0.93 skylake ,avx ,16777231 ,0 ,3 ,1994225.1 ,2211556.0 ,0.9 skylake ,avx ,16777247 ,3 ,0 ,1993572.9 ,2213029.9 ,0.9 skylake ,avx ,16777279 ,3 ,5 ,2001956.9 ,2211769.7 ,0.91 skylake ,avx ,16777216 ,0 ,127 ,1968155.9 ,2127764.7 ,0.92 skylake ,avx ,16777216 ,0 ,255 ,1978305.1 ,2121371.3 ,0.93 skylake ,avx ,16777216 ,0 ,256 ,1993261.9 ,2206494.1 ,0.9 skylake ,avx ,16777216 ,0 ,4064 ,1993808.3 ,2198137.4 ,0.91 skylake ,avx ,33554439 ,0 ,0 ,4540216.7 ,4870021.8 ,0.93 skylake ,avx ,33554447 ,0 ,3 ,4483505.3 ,4850545.5 ,0.92 skylake ,avx ,33554463 ,3 ,0 ,4501944.5 ,4870922.4 ,0.92 skylake ,avx ,33554495 ,3 ,5 ,4484565.5 ,4845392.4 ,0.93 skylake ,avx ,33554432 ,0 ,127 ,4408639.3 ,4701698.6 ,0.94 skylake ,avx ,33554432 ,0 ,255 
,4445826.0 ,4678142.9 ,0.95 skylake ,avx ,33554432 ,0 ,256 ,4497953.2 ,4844498.6 ,0.93 skylake ,avx ,33554432 ,0 ,4064 ,4501572.4 ,4839209.4 ,0.93 > > -- > H.J.
Last message got formatted weirdly. Here is a file with the data. On Sat, Apr 3, 2021 at 3:41 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Sat, Apr 3, 2021 at 1:46 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Sat, Apr 3, 2021 at 1:12 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > From: noah <goldstein.w.n@gmail.com> > > > > > > No Bug. This commit updates the large memcpy case (no overlap). The > > > update is to perform memcpy on either 2 or 4 contiguous pages at > > > once. This 1) helps to alleviate the effects of false memory aliasing > > > when destination and source have a close 4k alignment and 2) In most > > > cases and for most DRAM units is a modestly more efficient access > > > pattern. These changes are a clear performance improvement for > > > VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy, > > > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all > > > pass. > > > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > > --- > > > Issue was alignment related AFAICT. Added `.p2align 4` in front of the > > > loops and no longer see any meaningful regression. > > > > > > Also added back the temporal stores for the tail. Saw a regression > > > when doing these tests. > > > > > > Two tables below for skylake and icelake numbers for the areas around > > > where you saw the regression. Below is all data from the tests. > > > > > > N = 10. 
> > > > > > Skylake > > > Len ,align1 ,align2 ,new mean ,old mean > > > 4103 ,0 ,64 ,84.5 ,88.6 > > > 4111 ,0 ,3 ,99.0 ,99.9 > > > 4127 ,3 ,0 ,102.1 ,102.3 > > > 4159 ,3 ,7 ,88.7 ,90.9 > > > 4223 ,9 ,5 ,88.1 ,87.4 > > > 8199 ,0 ,64 ,146.7 ,150.2 > > > 8207 ,0 ,3 ,167.9 ,168.5 > > > 8223 ,3 ,0 ,168.5 ,168.1 > > > 8255 ,3 ,7 ,157.0 ,159.2 > > > 8319 ,9 ,5 ,155.5 ,155.7 > > > 16391 ,0 ,64 ,286.2 ,288.8 > > > 16399 ,0 ,3 ,307.0 ,308.7 > > > 16415 ,3 ,0 ,307.4 ,307.6 > > > 16447 ,3 ,7 ,294.6 ,295.5 > > > 16511 ,9 ,5 ,291.5 ,462.1 > > > 32775 ,0 ,64 ,603.4 ,601.5 > > > 32783 ,0 ,3 ,604.8 ,606.4 > > > 32799 ,3 ,0 ,603.0 ,604.1 > > > 32831 ,3 ,7 ,600.2 ,737.3 > > > 32895 ,9 ,5 ,604.4 ,599.5 > > > 65543 ,0 ,64 ,1873.5 ,1854.3 > > > 65551 ,0 ,3 ,1862.9 ,1846.6 > > > 65567 ,3 ,0 ,1885.5 ,1966.0 > > > 65599 ,3 ,7 ,1833.2 ,1833.1 > > > 65663 ,9 ,5 ,1884.9 ,1887.4 > > > 131079 ,0 ,64 ,3944.3 ,3949.4 > > > 131087 ,0 ,3 ,3927.3 ,3913.3 > > > 131103 ,3 ,0 ,4415.8 ,4169.4 > > > 131135 ,3 ,7 ,4224.5 ,4157.6 > > > 131199 ,9 ,5 ,5974.0 ,4983.8 > > > 262151 ,0 ,64 ,11050.2 ,10620.6 > > > 262159 ,0 ,3 ,9932.8 ,10037.3 > > > 262175 ,3 ,0 ,10188.8 ,9206.6 > > > 262207 ,3 ,7 ,9633.3 ,9216.7 > > > 262271 ,9 ,5 ,9732.7 ,9345.3 > > > 524295 ,0 ,64 ,24823.9 ,24880.7 > > > 524303 ,0 ,3 ,24514.0 ,24556.7 > > > 524319 ,3 ,0 ,23974.4 ,24219.9 > > > 524351 ,3 ,7 ,24159.7 ,24207.0 > > > 524415 ,9 ,5 ,23946.5 ,24142.8 > > > > > > Icelake: > > > Len ,align1 ,align2 ,new mean ,old mean > > > 4103 ,0 ,64 ,50.2 ,63.7 > > > 4111 ,0 ,3 ,63.7 ,65.1 > > > 4127 ,3 ,0 ,68.2 ,69.4 > > > 4159 ,3 ,7 ,59.6 ,68.0 > > > 4223 ,9 ,5 ,68.2 ,66.8 > > > 8199 ,0 ,64 ,92.1 ,89.9 > > > 8207 ,0 ,3 ,119.7 ,118.3 > > > 8223 ,3 ,0 ,119.1 ,120.9 > > > 8255 ,3 ,7 ,122.9 ,123.7 > > > 8319 ,9 ,5 ,122.1 ,121.8 > > > 16391 ,0 ,64 ,162.7 ,158.0 > > > 16399 ,0 ,3 ,227.6 ,234.1 > > > 16415 ,3 ,0 ,230.8 ,232.7 > > > 16447 ,3 ,7 ,226.8 ,232.6 > > > 16511 ,9 ,5 ,233.4 ,233.8 > > > 32775 ,0 ,64 ,312.2 ,301.8 > > > 32783 ,0 ,3 ,449.7 ,450.0 > 
> > 32799 ,3 ,0 ,452.7 ,455.9 > > > 32831 ,3 ,7 ,449.8 ,458.0 > > > 32895 ,9 ,5 ,456.3 ,459.4 > > > 65543 ,0 ,64 ,1460.6 ,1463.9 > > > 65551 ,0 ,3 ,1462.0 ,1465.4 > > > 65567 ,3 ,0 ,1466.6 ,1480.4 > > > 65599 ,3 ,7 ,1488.0 ,1488.9 > > > 65663 ,9 ,5 ,1680.8 ,1499.5 > > > 131079 ,0 ,64 ,2988.5 ,3010.1 > > > 131087 ,0 ,3 ,2995.5 ,2996.4 > > > 131103 ,3 ,0 ,3006.2 ,3000.5 > > > 131135 ,3 ,7 ,3032.4 ,3073.7 > > > 131199 ,9 ,5 ,3010.4 ,3027.4 > > > 262151 ,0 ,64 ,6143.2 ,6079.1 > > > 262159 ,0 ,3 ,6085.1 ,6075.8 > > > 262175 ,3 ,0 ,6088.0 ,6064.9 > > > 262207 ,3 ,7 ,6018.7 ,6023.5 > > > 262271 ,9 ,5 ,6019.8 ,5959.2 > > > 524295 ,0 ,64 ,14464.2 ,14095.1 > > > 524303 ,0 ,3 ,14761.6 ,14050.2 > > > 524319 ,3 ,0 ,14534.1 ,14087.5 > > > 524351 ,3 ,7 ,14147.7 ,13903.8 > > > 524415 ,9 ,5 ,14157.0 ,13982.9 > > > > > > > > > > > > cpu ,version ,Len ,align1 ,align2 ,new mean ,old mean > > > skylake ,avx ,4103 ,0 ,64 ,84.5 ,88.6 > > > skylake ,avx ,4111 ,0 ,3 ,99.0 ,99.9 > > > skylake ,avx ,4127 ,3 ,0 ,102.1 ,102.3 > > > skylake ,avx ,4159 ,3 ,7 ,88.7 ,90.9 > > > skylake ,avx ,4223 ,9 ,5 ,88.1 ,87.4 > > > skylake ,avx ,8199 ,0 ,64 ,146.7 ,150.2 > > > skylake ,avx ,8207 ,0 ,3 ,167.9 ,168.5 > > > skylake ,avx ,8223 ,3 ,0 ,168.5 ,168.1 > > > skylake ,avx ,8255 ,3 ,7 ,157.0 ,159.2 > > > skylake ,avx ,8319 ,9 ,5 ,155.5 ,155.7 > > > skylake ,avx ,16391 ,0 ,64 ,286.2 ,288.8 > > > skylake ,avx ,16399 ,0 ,3 ,307.0 ,308.7 > > > skylake ,avx ,16415 ,3 ,0 ,307.4 ,307.6 > > > skylake ,avx ,16447 ,3 ,7 ,294.6 ,295.5 > > > skylake ,avx ,16511 ,9 ,5 ,291.5 ,462.1 > > > skylake ,avx ,32775 ,0 ,64 ,603.4 ,601.5 > > > skylake ,avx ,32783 ,0 ,3 ,604.8 ,606.4 > > > skylake ,avx ,32799 ,3 ,0 ,603.0 ,604.1 > > > skylake ,avx ,32831 ,3 ,7 ,600.2 ,737.3 > > > skylake ,avx ,32895 ,9 ,5 ,604.4 ,599.5 > > > skylake ,avx ,65543 ,0 ,64 ,1873.5 ,1854.3 > > > skylake ,avx ,65551 ,0 ,3 ,1862.9 ,1846.6 > > > skylake ,avx ,65567 ,3 ,0 ,1885.5 ,1966.0 > > > skylake ,avx ,65599 ,3 ,7 ,1833.2 ,1833.1 > > > skylake ,avx 
,65663 ,9 ,5 ,1884.9 ,1887.4 > > > skylake ,avx ,131079 ,0 ,64 ,3944.3 ,3949.4 > > > skylake ,avx ,131087 ,0 ,3 ,3927.3 ,3913.3 > > > skylake ,avx ,131103 ,3 ,0 ,4415.8 ,4169.4 > > > skylake ,avx ,131135 ,3 ,7 ,4224.5 ,4157.6 > > > skylake ,avx ,131199 ,9 ,5 ,5974.0 ,4983.8 > > > skylake ,avx ,262151 ,0 ,64 ,11050.2 ,10620.6 > > > skylake ,avx ,262159 ,0 ,3 ,9932.8 ,10037.3 > > > skylake ,avx ,262175 ,3 ,0 ,10188.8 ,9206.6 > > > skylake ,avx ,262207 ,3 ,7 ,9633.3 ,9216.7 > > > skylake ,avx ,262271 ,9 ,5 ,9732.7 ,9345.3 > > > skylake ,avx ,524295 ,0 ,64 ,24823.9 ,24880.7 > > > skylake ,avx ,524303 ,0 ,3 ,24514.0 ,24556.7 > > > skylake ,avx ,524319 ,3 ,0 ,23974.4 ,24219.9 > > > skylake ,avx ,524351 ,3 ,7 ,24159.7 ,24207.0 > > > skylake ,avx ,524415 ,9 ,5 ,23946.5 ,24142.8 > > > skylake ,avx ,1048583 ,0 ,64 ,49163.9 ,49454.6 > > > skylake ,avx ,1048591 ,0 ,3 ,49879.3 ,49400.8 > > > skylake ,avx ,1048607 ,3 ,0 ,49738.0 ,48864.6 > > > skylake ,avx ,1048639 ,3 ,7 ,48804.0 ,47588.5 > > > skylake ,avx ,1048703 ,9 ,5 ,49629.4 ,49796.3 > > > skylake ,avx ,2097159 ,0 ,64 ,98271.7 ,96330.6 > > > skylake ,avx ,2097167 ,0 ,3 ,97801.8 ,98638.1 > > > skylake ,avx ,2097183 ,3 ,0 ,98041.1 ,99287.6 > > > skylake ,avx ,2097215 ,3 ,7 ,96629.5 ,96521.9 > > > skylake ,avx ,2097279 ,9 ,5 ,98961.8 ,98909.8 > > > skylake ,avx ,4194311 ,0 ,64 ,194667.7 ,195377.1 > > > skylake ,avx ,4194319 ,0 ,3 ,194919.5 ,198576.2 > > > skylake ,avx ,4194335 ,3 ,0 ,192949.8 ,194584.7 > > > skylake ,avx ,4194367 ,3 ,7 ,189943.5 ,189177.9 > > > skylake ,avx ,4194431 ,9 ,5 ,192479.1 ,196494.2 > > > skylake ,avx ,8388615 ,0 ,64 ,588671.6 ,587215.4 > > > skylake ,avx ,8388623 ,0 ,3 ,581640.7 ,582812.5 > > > skylake ,avx ,8388639 ,3 ,0 ,549811.9 ,544697.6 > > > skylake ,avx ,8388671 ,3 ,7 ,591155.0 ,577951.8 > > > skylake ,avx ,8388735 ,9 ,5 ,547583.2 ,545133.3 > > > skylake ,avx ,16777223 ,0 ,64 ,1787503.0 ,1811146.0 > > > skylake ,avx ,16777231 ,0 ,3 ,1758671.0 ,1756343.0 > > > skylake ,avx ,16777247 ,3 ,0 
,1691781.0 ,1694661.0 > > > skylake ,avx ,16777279 ,3 ,7 ,1768150.0 ,1754785.0 > > > skylake ,avx ,16777343 ,9 ,5 ,1695179.0 ,1710794.0 > > > skylake ,sse2 ,4103 ,0 ,64 ,150.8 ,150.5 > > > skylake ,sse2 ,4111 ,0 ,3 ,156.8 ,158.4 > > > skylake ,sse2 ,4127 ,3 ,0 ,99.7 ,99.4 > > > skylake ,sse2 ,4159 ,3 ,7 ,154.8 ,154.5 > > > skylake ,sse2 ,4223 ,9 ,5 ,137.3 ,137.2 > > > skylake ,sse2 ,8199 ,0 ,64 ,284.8 ,285.5 > > > skylake ,sse2 ,8207 ,0 ,3 ,296.0 ,296.1 > > > skylake ,sse2 ,8223 ,3 ,0 ,168.0 ,168.2 > > > skylake ,sse2 ,8255 ,3 ,7 ,293.0 ,292.4 > > > skylake ,sse2 ,8319 ,9 ,5 ,251.3 ,250.7 > > > skylake ,sse2 ,16391 ,0 ,64 ,561.3 ,608.3 > > > skylake ,sse2 ,16399 ,0 ,3 ,571.0 ,574.8 > > > skylake ,sse2 ,16415 ,3 ,0 ,305.4 ,305.0 > > > skylake ,sse2 ,16447 ,3 ,7 ,563.2 ,565.0 > > > skylake ,sse2 ,16511 ,9 ,5 ,477.1 ,475.1 > > > skylake ,sse2 ,32775 ,0 ,64 ,1128.2 ,1131.7 > > > skylake ,sse2 ,32783 ,0 ,3 ,1126.6 ,1131.0 > > > skylake ,sse2 ,32799 ,3 ,0 ,587.6 ,590.8 > > > skylake ,sse2 ,32831 ,3 ,7 ,1130.6 ,1126.2 > > > skylake ,sse2 ,32895 ,9 ,5 ,957.6 ,953.0 > > > skylake ,sse2 ,65543 ,0 ,64 ,2718.9 ,2704.2 > > > skylake ,sse2 ,65551 ,0 ,3 ,2724.1 ,2725.0 > > > skylake ,sse2 ,65567 ,3 ,0 ,1888.4 ,1914.3 > > > skylake ,sse2 ,65599 ,3 ,7 ,2787.6 ,2748.7 > > > skylake ,sse2 ,65663 ,9 ,5 ,2400.5 ,2369.4 > > > skylake ,sse2 ,131079 ,0 ,64 ,5603.3 ,5654.9 > > > skylake ,sse2 ,131087 ,0 ,3 ,5939.3 ,5871.4 > > > skylake ,sse2 ,131103 ,3 ,0 ,4272.4 ,4190.0 > > > skylake ,sse2 ,131135 ,3 ,7 ,7601.4 ,7524.6 > > > skylake ,sse2 ,131199 ,9 ,5 ,7022.1 ,6864.7 > > > skylake ,sse2 ,262151 ,0 ,64 ,13736.2 ,14030.0 > > > skylake ,sse2 ,262159 ,0 ,3 ,12407.3 ,12334.1 > > > skylake ,sse2 ,262175 ,3 ,0 ,9661.1 ,9249.4 > > > skylake ,sse2 ,262207 ,3 ,7 ,12850.2 ,12351.6 > > > skylake ,sse2 ,262271 ,9 ,5 ,10792.6 ,10435.8 > > > skylake ,sse2 ,524295 ,0 ,64 ,27754.5 ,28177.7 > > > skylake ,sse2 ,524303 ,0 ,3 ,27766.2 ,28152.0 > > > skylake ,sse2 ,524319 ,3 ,0 ,24030.9 ,24438.3 > > > 
skylake ,sse2 ,524351 ,3 ,7 ,27787.5 ,27933.0 > > > skylake ,sse2 ,524415 ,9 ,5 ,24263.2 ,25249.1 > > > skylake ,sse2 ,1048583 ,0 ,64 ,56199.9 ,56039.8 > > > skylake ,sse2 ,1048591 ,0 ,3 ,56750.2 ,58889.7 > > > skylake ,sse2 ,1048607 ,3 ,0 ,56394.0 ,55115.3 > > > skylake ,sse2 ,1048639 ,3 ,7 ,57233.1 ,57473.8 > > > skylake ,sse2 ,1048703 ,9 ,5 ,56324.3 ,55917.9 > > > skylake ,sse2 ,2097159 ,0 ,64 ,113234.8 ,114346.4 > > > skylake ,sse2 ,2097167 ,0 ,3 ,114373.1 ,115522.5 > > > skylake ,sse2 ,2097183 ,3 ,0 ,108113.3 ,108513.3 > > > skylake ,sse2 ,2097215 ,3 ,7 ,116863.6 ,116549.9 > > > skylake ,sse2 ,2097279 ,9 ,5 ,108945.1 ,108843.7 > > > skylake ,sse2 ,4194311 ,0 ,64 ,230250.1 ,232350.0 > > > skylake ,sse2 ,4194319 ,0 ,3 ,231895.3 ,235055.6 > > > skylake ,sse2 ,4194335 ,3 ,0 ,218442.8 ,219199.8 > > > skylake ,sse2 ,4194367 ,3 ,7 ,242564.2 ,235587.7 > > > skylake ,sse2 ,4194431 ,9 ,5 ,224167.4 ,215261.8 > > > skylake ,sse2 ,8388615 ,0 ,64 ,679801.8 ,674832.0 > > > skylake ,sse2 ,8388623 ,0 ,3 ,684913.2 ,685238.7 > > > skylake ,sse2 ,8388639 ,3 ,0 ,644865.4 ,631388.6 > > > skylake ,sse2 ,8388671 ,3 ,7 ,698700.9 ,689316.1 > > > skylake ,sse2 ,8388735 ,9 ,5 ,644820.2 ,631366.8 > > > skylake ,sse2 ,16777223 ,0 ,64 ,1877984.0 ,1876437.0 > > > skylake ,sse2 ,16777231 ,0 ,3 ,1898086.0 ,1913053.0 > > > skylake ,sse2 ,16777247 ,3 ,0 ,1857018.0 ,1866949.0 > > > skylake ,sse2 ,16777279 ,3 ,7 ,1914905.0 ,1897134.0 > > > skylake ,sse2 ,16777343 ,9 ,5 ,1859937.0 ,1881939.0 > > > icelake ,avx512 ,4103 ,0 ,64 ,75.2 ,75.8 > > > icelake ,avx512 ,4111 ,0 ,3 ,56.9 ,56.4 > > > icelake ,avx512 ,4127 ,3 ,0 ,59.1 ,59.6 > > > icelake ,avx512 ,4159 ,3 ,7 ,50.7 ,51.3 > > > icelake ,avx512 ,4223 ,9 ,5 ,59.2 ,58.9 > > > icelake ,avx512 ,8199 ,0 ,64 ,67.8 ,63.9 > > > icelake ,avx512 ,8207 ,0 ,3 ,89.0 ,89.9 > > > icelake ,avx512 ,8223 ,3 ,0 ,90.2 ,90.1 > > > icelake ,avx512 ,8255 ,3 ,7 ,82.6 ,84.9 > > > icelake ,avx512 ,8319 ,9 ,5 ,91.5 ,92.8 > > > icelake ,avx512 ,16391 ,0 ,64 ,118.0 ,117.6 > > 
> icelake ,avx512 ,16399 ,0 ,3 ,156.5 ,157.0 > > > icelake ,avx512 ,16415 ,3 ,0 ,157.4 ,157.3 > > > icelake ,avx512 ,16447 ,3 ,7 ,151.0 ,151.6 > > > icelake ,avx512 ,16511 ,9 ,5 ,159.1 ,159.6 > > > icelake ,avx512 ,32775 ,0 ,64 ,231.8 ,230.8 > > > icelake ,avx512 ,32783 ,0 ,3 ,297.8 ,299.3 > > > icelake ,avx512 ,32799 ,3 ,0 ,299.1 ,299.0 > > > icelake ,avx512 ,32831 ,3 ,7 ,293.5 ,295.4 > > > icelake ,avx512 ,32895 ,9 ,5 ,300.3 ,302.5 > > > icelake ,avx512 ,65543 ,0 ,64 ,1473.4 ,1479.2 > > > icelake ,avx512 ,65551 ,0 ,3 ,1438.2 ,1445.3 > > > icelake ,avx512 ,65567 ,3 ,0 ,1450.3 ,1463.8 > > > icelake ,avx512 ,65599 ,3 ,7 ,1469.0 ,1473.8 > > > icelake ,avx512 ,65663 ,9 ,5 ,1480.0 ,1483.5 > > > icelake ,avx512 ,131079 ,0 ,64 ,3015.1 ,3037.5 > > > icelake ,avx512 ,131087 ,0 ,3 ,2952.3 ,2960.4 > > > icelake ,avx512 ,131103 ,3 ,0 ,2966.2 ,2964.4 > > > icelake ,avx512 ,131135 ,3 ,7 ,2961.6 ,3047.9 > > > icelake ,avx512 ,131199 ,9 ,5 ,2967.4 ,3183.8 > > > icelake ,avx512 ,262151 ,0 ,64 ,6206.0 ,6141.5 > > > icelake ,avx512 ,262159 ,0 ,3 ,5990.8 ,5959.2 > > > icelake ,avx512 ,262175 ,3 ,0 ,5976.7 ,5963.8 > > > icelake ,avx512 ,262207 ,3 ,7 ,5939.5 ,5924.3 > > > icelake ,avx512 ,262271 ,9 ,5 ,5944.6 ,5990.3 > > > icelake ,avx512 ,524295 ,0 ,64 ,14726.7 ,14307.0 > > > icelake ,avx512 ,524303 ,0 ,3 ,14344.2 ,14040.5 > > > icelake ,avx512 ,524319 ,3 ,0 ,14175.0 ,13862.2 > > > icelake ,avx512 ,524351 ,3 ,7 ,14261.4 ,13821.5 > > > icelake ,avx512 ,524415 ,9 ,5 ,14266.5 ,14064.7 > > > icelake ,avx512 ,1048583 ,0 ,64 ,35211.4 ,35414.6 > > > icelake ,avx512 ,1048591 ,0 ,3 ,35156.8 ,35591.2 > > > icelake ,avx512 ,1048607 ,3 ,0 ,35273.1 ,35503.3 > > > icelake ,avx512 ,1048639 ,3 ,7 ,35255.8 ,35725.0 > > > icelake ,avx512 ,1048703 ,9 ,5 ,35703.6 ,36289.9 > > > icelake ,avx512 ,2097159 ,0 ,64 ,72613.9 ,72063.2 > > > icelake ,avx512 ,2097167 ,0 ,3 ,72301.6 ,73504.2 > > > icelake ,avx512 ,2097183 ,3 ,0 ,73448.8 ,72133.6 > > > icelake ,avx512 ,2097215 ,3 ,7 ,73762.9 ,72825.8 > > > icelake 
,avx512 ,2097279 ,9 ,5 ,72097.3 ,72914.6 > > > icelake ,avx512 ,4194311 ,0 ,64 ,144793.4 ,144182.1 > > > icelake ,avx512 ,4194319 ,0 ,3 ,143710.3 ,145063.3 > > > icelake ,avx512 ,4194335 ,3 ,0 ,146722.1 ,144046.4 > > > icelake ,avx512 ,4194367 ,3 ,7 ,144267.0 ,144874.6 > > > icelake ,avx512 ,4194431 ,9 ,5 ,143808.2 ,144560.0 > > > icelake ,avx512 ,8388615 ,0 ,64 ,427993.4 ,424521.5 > > > icelake ,avx512 ,8388623 ,0 ,3 ,470267.1 ,473290.8 > > > icelake ,avx512 ,8388639 ,3 ,0 ,457179.7 ,461797.7 > > > icelake ,avx512 ,8388671 ,3 ,7 ,472507.9 ,481561.4 > > > icelake ,avx512 ,8388735 ,9 ,5 ,463611.9 ,467388.7 > > > icelake ,avx512 ,16777223 ,0 ,64 ,1490426.0 ,1526996.0 > > > icelake ,avx512 ,16777231 ,0 ,3 ,1516687.0 ,1517095.0 > > > icelake ,avx512 ,16777247 ,3 ,0 ,1497688.0 ,1512766.0 > > > icelake ,avx512 ,16777279 ,3 ,7 ,1512331.0 ,1524317.0 > > > icelake ,avx512 ,16777343 ,9 ,5 ,1498908.0 ,1500526.0 > > > icelake ,avx ,4103 ,0 ,64 ,50.2 ,63.7 > > > icelake ,avx ,4111 ,0 ,3 ,63.7 ,65.1 > > > icelake ,avx ,4127 ,3 ,0 ,68.2 ,69.4 > > > icelake ,avx ,4159 ,3 ,7 ,59.6 ,68.0 > > > icelake ,avx ,4223 ,9 ,5 ,68.2 ,66.8 > > > icelake ,avx ,8199 ,0 ,64 ,92.1 ,89.9 > > > icelake ,avx ,8207 ,0 ,3 ,119.7 ,118.3 > > > icelake ,avx ,8223 ,3 ,0 ,119.1 ,120.9 > > > icelake ,avx ,8255 ,3 ,7 ,122.9 ,123.7 > > > icelake ,avx ,8319 ,9 ,5 ,122.1 ,121.8 > > > icelake ,avx ,16391 ,0 ,64 ,162.7 ,158.0 > > > icelake ,avx ,16399 ,0 ,3 ,227.6 ,234.1 > > > icelake ,avx ,16415 ,3 ,0 ,230.8 ,232.7 > > > icelake ,avx ,16447 ,3 ,7 ,226.8 ,232.6 > > > icelake ,avx ,16511 ,9 ,5 ,233.4 ,233.8 > > > icelake ,avx ,32775 ,0 ,64 ,312.2 ,301.8 > > > icelake ,avx ,32783 ,0 ,3 ,449.7 ,450.0 > > > icelake ,avx ,32799 ,3 ,0 ,452.7 ,455.9 > > > icelake ,avx ,32831 ,3 ,7 ,449.8 ,458.0 > > > icelake ,avx ,32895 ,9 ,5 ,456.3 ,459.4 > > > icelake ,avx ,65543 ,0 ,64 ,1460.6 ,1463.9 > > > icelake ,avx ,65551 ,0 ,3 ,1462.0 ,1465.4 > > > icelake ,avx ,65567 ,3 ,0 ,1466.6 ,1480.4 > > > icelake ,avx ,65599 ,3 ,7 
,1488.0 ,1488.9 > > > icelake ,avx ,65663 ,9 ,5 ,1680.8 ,1499.5 > > > icelake ,avx ,131079 ,0 ,64 ,2988.5 ,3010.1 > > > icelake ,avx ,131087 ,0 ,3 ,2995.5 ,2996.4 > > > icelake ,avx ,131103 ,3 ,0 ,3006.2 ,3000.5 > > > icelake ,avx ,131135 ,3 ,7 ,3032.4 ,3073.7 > > > icelake ,avx ,131199 ,9 ,5 ,3010.4 ,3027.4 > > > icelake ,avx ,262151 ,0 ,64 ,6143.2 ,6079.1 > > > icelake ,avx ,262159 ,0 ,3 ,6085.1 ,6075.8 > > > icelake ,avx ,262175 ,3 ,0 ,6088.0 ,6064.9 > > > icelake ,avx ,262207 ,3 ,7 ,6018.7 ,6023.5 > > > icelake ,avx ,262271 ,9 ,5 ,6019.8 ,5959.2 > > > icelake ,avx ,524295 ,0 ,64 ,14464.2 ,14095.1 > > > icelake ,avx ,524303 ,0 ,3 ,14761.6 ,14050.2 > > > icelake ,avx ,524319 ,3 ,0 ,14534.1 ,14087.5 > > > icelake ,avx ,524351 ,3 ,7 ,14147.7 ,13903.8 > > > icelake ,avx ,524415 ,9 ,5 ,14157.0 ,13982.9 > > > icelake ,avx ,1048583 ,0 ,64 ,36599.0 ,37461.4 > > > icelake ,avx ,1048591 ,0 ,3 ,36717.8 ,37454.9 > > > icelake ,avx ,1048607 ,3 ,0 ,36821.2 ,37343.3 > > > icelake ,avx ,1048639 ,3 ,7 ,36958.0 ,37507.2 > > > icelake ,avx ,1048703 ,9 ,5 ,36869.2 ,37413.1 > > > icelake ,avx ,2097159 ,0 ,64 ,74765.8 ,75330.9 > > > icelake ,avx ,2097167 ,0 ,3 ,75175.4 ,74891.9 > > > icelake ,avx ,2097183 ,3 ,0 ,75451.4 ,74787.7 > > > icelake ,avx ,2097215 ,3 ,7 ,75394.8 ,75839.1 > > > icelake ,avx ,2097279 ,9 ,5 ,75099.2 ,75421.2 > > > icelake ,avx ,4194311 ,0 ,64 ,146809.6 ,146619.4 > > > icelake ,avx ,4194319 ,0 ,3 ,148866.4 ,149898.2 > > > icelake ,avx ,4194335 ,3 ,0 ,148719.7 ,150165.4 > > > icelake ,avx ,4194367 ,3 ,7 ,150600.1 ,150925.9 > > > icelake ,avx ,4194431 ,9 ,5 ,149457.3 ,150519.2 > > > icelake ,avx ,8388615 ,0 ,64 ,412709.8 ,423666.1 > > > icelake ,avx ,8388623 ,0 ,3 ,423717.4 ,424418.2 > > > icelake ,avx ,8388639 ,3 ,0 ,414387.5 ,413445.6 > > > icelake ,avx ,8388671 ,3 ,7 ,449010.7 ,417553.5 > > > icelake ,avx ,8388735 ,9 ,5 ,414128.6 ,411815.3 > > > icelake ,avx ,16777223 ,0 ,64 ,1490032.0 ,1510004.0 > > > icelake ,avx ,16777231 ,0 ,3 ,1379638.0 ,1422097.0 > > > 
icelake ,avx ,16777247 ,3 ,0 ,1418930.0 ,1367557.0 > > > icelake ,avx ,16777279 ,3 ,7 ,1515152.0 ,1500176.0 > > > icelake ,avx ,16777343 ,9 ,5 ,1344117.0 ,1411795.0 > > > icelake ,sse2 ,4103 ,0 ,64 ,113.2 ,114.6 > > > icelake ,sse2 ,4111 ,0 ,3 ,121.5 ,120.4 > > > icelake ,sse2 ,4127 ,3 ,0 ,1700.5 ,1771.5 > > > icelake ,sse2 ,4159 ,3 ,7 ,119.3 ,118.8 > > > icelake ,sse2 ,4223 ,9 ,5 ,1739.7 ,1735.2 > > > icelake ,sse2 ,8199 ,0 ,64 ,207.0 ,203.9 > > > icelake ,sse2 ,8207 ,0 ,3 ,225.5 ,220.8 > > > icelake ,sse2 ,8223 ,3 ,0 ,3444.3 ,3743.5 > > > icelake ,sse2 ,8255 ,3 ,7 ,219.9 ,216.8 > > > icelake ,sse2 ,8319 ,9 ,5 ,4117.1 ,3487.3 > > > icelake ,sse2 ,16391 ,0 ,64 ,397.1 ,394.3 > > > icelake ,sse2 ,16399 ,0 ,3 ,439.6 ,428.6 > > > icelake ,sse2 ,16415 ,3 ,0 ,6997.0 ,7031.2 > > > icelake ,sse2 ,16447 ,3 ,7 ,426.8 ,421.8 > > > icelake ,sse2 ,16511 ,9 ,5 ,7037.6 ,7038.3 > > > icelake ,sse2 ,32775 ,0 ,64 ,790.9 ,779.0 > > > icelake ,sse2 ,32783 ,0 ,3 ,863.1 ,849.6 > > > icelake ,sse2 ,32799 ,3 ,0 ,14043.0 ,14390.9 > > > icelake ,sse2 ,32831 ,3 ,7 ,841.6 ,833.1 > > > icelake ,sse2 ,32895 ,9 ,5 ,14277.6 ,14344.2 > > > icelake ,sse2 ,65543 ,0 ,64 ,1897.0 ,1897.3 > > > icelake ,sse2 ,65551 ,0 ,3 ,1927.1 ,1955.4 > > > icelake ,sse2 ,65567 ,3 ,0 ,28834.7 ,28727.8 > > > icelake ,sse2 ,65599 ,3 ,7 ,1961.4 ,1969.7 > > > icelake ,sse2 ,65663 ,9 ,5 ,28867.6 ,29019.8 > > > icelake ,sse2 ,131079 ,0 ,64 ,3879.3 ,3872.6 > > > icelake ,sse2 ,131087 ,0 ,3 ,3955.3 ,3990.7 > > > icelake ,sse2 ,131103 ,3 ,0 ,58001.8 ,60567.9 > > > icelake ,sse2 ,131135 ,3 ,7 ,3951.5 ,4002.6 > > > icelake ,sse2 ,131199 ,9 ,5 ,57886.7 ,58391.4 > > > icelake ,sse2 ,262151 ,0 ,64 ,7851.4 ,7894.7 > > > icelake ,sse2 ,262159 ,0 ,3 ,7947.5 ,8016.2 > > > icelake ,sse2 ,262175 ,3 ,0 ,115036.2 ,115968.6 > > > icelake ,sse2 ,262207 ,3 ,7 ,7883.9 ,7814.1 > > > icelake ,sse2 ,262271 ,9 ,5 ,113776.4 ,119733.6 > > > icelake ,sse2 ,524295 ,0 ,64 ,17198.1 ,16974.9 > > > icelake ,sse2 ,524303 ,0 ,3 ,17402.2 ,17096.3 > > > 
icelake ,sse2 ,524319 ,3 ,0 ,223980.4 ,225889.9 > > > icelake ,sse2 ,524351 ,3 ,7 ,17034.9 ,16910.3 > > > icelake ,sse2 ,524415 ,9 ,5 ,224027.7 ,224962.5 > > > icelake ,sse2 ,1048583 ,0 ,64 ,38822.3 ,39178.6 > > > icelake ,sse2 ,1048591 ,0 ,3 ,41686.7 ,40247.4 > > > icelake ,sse2 ,1048607 ,3 ,0 ,38814.8 ,39323.3 > > > icelake ,sse2 ,1048639 ,3 ,7 ,39568.3 ,41325.7 > > > icelake ,sse2 ,1048703 ,9 ,5 ,39354.2 ,39637.9 > > > icelake ,sse2 ,2097159 ,0 ,64 ,84074.7 ,84543.1 > > > icelake ,sse2 ,2097167 ,0 ,3 ,83665.7 ,82358.2 > > > icelake ,sse2 ,2097183 ,3 ,0 ,81817.8 ,79638.9 > > > icelake ,sse2 ,2097215 ,3 ,7 ,83649.1 ,83497.6 > > > icelake ,sse2 ,2097279 ,9 ,5 ,80287.6 ,79980.9 > > > icelake ,sse2 ,4194311 ,0 ,64 ,165409.8 ,168343.1 > > > icelake ,sse2 ,4194319 ,0 ,3 ,165216.7 ,177632.0 > > > icelake ,sse2 ,4194335 ,3 ,0 ,158718.7 ,160342.2 > > > icelake ,sse2 ,4194367 ,3 ,7 ,167944.9 ,167204.4 > > > icelake ,sse2 ,4194431 ,9 ,5 ,161530.1 ,164839.7 > > > icelake ,sse2 ,8388615 ,0 ,64 ,626504.3 ,629858.5 > > > icelake ,sse2 ,8388623 ,0 ,3 ,623969.5 ,631509.1 > > > icelake ,sse2 ,8388639 ,3 ,0 ,599366.7 ,600016.0 > > > icelake ,sse2 ,8388671 ,3 ,7 ,619964.2 ,619113.2 > > > icelake ,sse2 ,8388735 ,9 ,5 ,595338.1 ,604172.4 > > > icelake ,sse2 ,16777223 ,0 ,64 ,1709597.0 ,1725184.0 > > > icelake ,sse2 ,16777231 ,0 ,3 ,1725452.0 ,1719746.0 > > > icelake ,sse2 ,16777247 ,3 ,0 ,1614269.0 ,1607164.0 > > > icelake ,sse2 ,16777279 ,3 ,7 ,1705295.0 ,1733018.0 > > > icelake ,sse2 ,16777343 ,9 ,5 ,1604197.0 ,1595690.0 > > > > > > > I am having a hard time convincing myself that this patch is really necessary. > > What are the geomeans of all the different cases for each processor? > > N = 100, Geometric mean of Current vs New for memcpy-bench-large. Note the > bench-memmove-large numbers should be unaffected by this patch as the new > logic only applies to the no-overlap case.
> > cpu ,inst ,Len ,align1 ,align2 ,new > geomean ,cur geomean ,New/Cur > icelake ,sse2 ,65543 ,0 ,0 > ,5566.1 ,5564.7 ,1.0 > icelake ,sse2 ,65551 ,0 ,3 > ,5856.4 ,5725.7 ,1.02 > icelake ,sse2 ,65567 ,3 ,0 > ,5622.8 ,5892.9 ,0.95 > icelake ,sse2 ,65599 ,3 ,5 > ,5857.3 ,5723.8 ,1.02 > icelake ,sse2 ,65536 ,0 ,127 > ,5953.3 ,5831.1 ,1.02 > icelake ,sse2 ,65536 ,0 ,255 > ,5811.7 ,5789.5 ,1.0 > icelake ,sse2 ,65536 ,0 ,256 > ,5373.5 ,5284.1 ,1.02 > icelake ,sse2 ,65536 ,0 ,4064 > ,5820.1 ,5761.6 ,1.01 > icelake ,sse2 ,131079 ,0 ,0 > ,12421.5 ,12424.1 ,1.0 > icelake ,sse2 ,131087 ,0 ,3 > ,12389.5 ,12276.4 ,1.01 > icelake ,sse2 ,131103 ,3 ,0 > ,11587.0 ,12607.6 ,0.92 > icelake ,sse2 ,131135 ,3 ,5 > ,11596.9 ,11896.2 ,0.97 > icelake ,sse2 ,131072 ,0 ,127 > ,11746.4 ,12490.1 ,0.94 > icelake ,sse2 ,131072 ,0 ,255 > ,11486.8 ,11831.7 ,0.97 > icelake ,sse2 ,131072 ,0 ,256 > ,10453.5 ,10451.7 ,1.0 > icelake ,sse2 ,131072 ,0 ,4064 > ,11231.7 ,11223.6 ,1.0 > icelake ,sse2 ,262151 ,0 ,0 > ,29408.5 ,30831.2 ,0.95 > icelake ,sse2 ,262159 ,0 ,3 > ,30813.6 ,32235.6 ,0.96 > icelake ,sse2 ,262175 ,3 ,0 > ,30245.0 ,31392.5 ,0.96 > icelake ,sse2 ,262207 ,3 ,5 > ,30775.6 ,32298.6 ,0.95 > icelake ,sse2 ,262144 ,0 ,127 > ,31784.7 ,32791.5 ,0.97 > icelake ,sse2 ,262144 ,0 ,255 > ,30726.0 ,31997.5 ,0.96 > icelake ,sse2 ,262144 ,0 ,256 > ,28418.9 ,29440.9 ,0.97 > icelake ,sse2 ,262144 ,0 ,4064 > ,29984.1 ,31048.9 ,0.97 > icelake ,sse2 ,524295 ,0 ,0 > ,76079.0 ,75752.0 ,1.0 > icelake ,sse2 ,524303 ,0 ,3 > ,79939.3 ,80796.4 ,0.99 > icelake ,sse2 ,524319 ,3 ,0 > ,79018.1 ,79928.5 ,0.99 > icelake ,sse2 ,524351 ,3 ,5 > ,81219.4 ,81053.8 ,1.0 > icelake ,sse2 ,524288 ,0 ,127 > ,80111.8 ,80087.2 ,1.0 > icelake ,sse2 ,524288 ,0 ,255 > ,79334.0 ,79525.6 ,1.0 > icelake ,sse2 ,524288 ,0 ,256 > ,75766.9 ,75918.9 ,1.0 > icelake ,sse2 ,524288 ,0 ,4064 > ,78907.9 ,79550.8 ,0.99 > icelake ,sse2 ,1048583 ,0 ,0 > ,144672.6 ,147457.7 ,0.98 > icelake ,sse2 ,1048591 ,0 ,3 > ,173803.9 ,400563.2 ,0.43 > icelake ,sse2 
,1048607 ,3 ,0 > ,149391.9 ,151772.1 ,0.98 > icelake ,sse2 ,1048639 ,3 ,5 > ,174774.1 ,400657.4 ,0.44 > icelake ,sse2 ,1048576 ,0 ,127 > ,175350.9 ,347110.6 ,0.51 > icelake ,sse2 ,1048576 ,0 ,255 > ,150152.6 ,144242.9 ,1.04 > icelake ,sse2 ,1048576 ,0 ,256 > ,145869.7 ,147489.6 ,0.99 > icelake ,sse2 ,1048576 ,0 ,4064 > ,145814.7 ,147497.7 ,0.99 > icelake ,sse2 ,2097159 ,0 ,0 > ,289460.6 ,295574.6 ,0.98 > icelake ,sse2 ,2097167 ,0 ,3 > ,347057.0 ,799549.1 ,0.43 > icelake ,sse2 ,2097183 ,3 ,0 > ,298565.7 ,301424.3 ,0.99 > icelake ,sse2 ,2097215 ,3 ,5 > ,348620.4 ,797557.4 ,0.44 > icelake ,sse2 ,2097152 ,0 ,127 > ,348751.4 ,695260.9 ,0.5 > icelake ,sse2 ,2097152 ,0 ,255 > ,298960.5 ,286590.0 ,1.04 > icelake ,sse2 ,2097152 ,0 ,256 > ,290978.4 ,293225.6 ,0.99 > icelake ,sse2 ,2097152 ,0 ,4064 > ,290476.0 ,292283.2 ,0.99 > icelake ,sse2 ,4194311 ,0 ,0 > ,583386.3 ,588284.3 ,0.99 > icelake ,sse2 ,4194319 ,0 ,3 > ,703870.5 ,1595268.0 ,0.44 > icelake ,sse2 ,4194335 ,3 ,0 > ,599400.2 ,601591.6 ,1.0 > icelake ,sse2 ,4194367 ,3 ,5 > ,694569.7 ,1595608.0 ,0.44 > icelake ,sse2 ,4194304 ,0 ,127 > ,700229.1 ,1389061.9 ,0.5 > icelake ,sse2 ,4194304 ,0 ,255 > ,600779.0 ,573361.2 ,1.05 > icelake ,sse2 ,4194304 ,0 ,256 > ,586610.7 ,589269.6 ,1.0 > icelake ,sse2 ,4194304 ,0 ,4064 > ,583616.3 ,584806.4 ,1.0 > icelake ,sse2 ,8388615 ,0 ,0 > ,1214632.8 ,1266616.0 ,0.96 > icelake ,sse2 ,8388623 ,0 ,3 > ,1405136.9 ,3198827.1 ,0.44 > icelake ,sse2 ,8388639 ,3 ,0 > ,1244302.6 ,1297425.9 ,0.96 > icelake ,sse2 ,8388671 ,3 ,5 > ,1404685.1 ,3196389.9 ,0.44 > icelake ,sse2 ,8388608 ,0 ,127 > ,1419888.5 ,2792729.4 ,0.51 > icelake ,sse2 ,8388608 ,0 ,255 > ,1249044.6 ,1259726.7 ,0.99 > icelake ,sse2 ,8388608 ,0 ,256 > ,1234471.9 ,1300463.6 ,0.95 > icelake ,sse2 ,8388608 ,0 ,4064 > ,1220102.2 ,1265190.5 ,0.96 > icelake ,sse2 ,16777223 ,0 ,0 > ,2689516.3 ,2846521.1 ,0.94 > icelake ,sse2 ,16777231 ,0 ,3 > ,3001317.4 ,6428733.7 ,0.47 > icelake ,sse2 ,16777247 ,3 ,0 > ,2770040.8 ,2910434.9 ,0.95 > icelake 
,sse2 ,16777279 ,3 ,5 > ,3002076.1 ,6415835.9 ,0.47 > icelake ,sse2 ,16777216 ,0 ,127 > ,3063786.3 ,5609895.3 ,0.55 > icelake ,sse2 ,16777216 ,0 ,255 > ,2821606.1 ,2833843.6 ,1.0 > icelake ,sse2 ,16777216 ,0 ,256 > ,2719765.5 ,2925344.2 ,0.93 > icelake ,sse2 ,16777216 ,0 ,4064 > ,2686189.2 ,2848017.5 ,0.94 > icelake ,sse2 ,33554439 ,0 ,0 > ,5577945.0 ,5913674.6 ,0.94 > icelake ,sse2 ,33554447 ,0 ,3 > ,6152758.8 ,12863855.0 ,0.48 > icelake ,sse2 ,33554463 ,3 ,0 > ,5773351.4 ,6035289.3 ,0.96 > icelake ,sse2 ,33554495 ,3 ,5 > ,6160006.2 ,12878153.9 ,0.48 > icelake ,sse2 ,33554432 ,0 ,127 > ,6303495.4 ,11221070.2 ,0.56 > icelake ,sse2 ,33554432 ,0 ,255 > ,5830879.6 ,5944978.6 ,0.98 > icelake ,sse2 ,33554432 ,0 ,256 > ,5611968.2 ,6068255.4 ,0.92 > icelake ,sse2 ,33554432 ,0 ,4064 > ,5570321.0 ,5964542.6 ,0.93 > icelake ,avx ,65543 ,0 ,0 > ,5561.1 ,5659.7 ,0.98 > icelake ,avx ,65551 ,0 ,3 > ,5859.9 ,5724.8 ,1.02 > icelake ,avx ,65567 ,3 ,0 > ,5636.7 ,5623.3 ,1.0 > icelake ,avx ,65599 ,3 ,5 > ,5856.3 ,5720.2 ,1.02 > icelake ,avx ,65536 ,0 ,127 > ,6011.1 ,5910.0 ,1.02 > icelake ,avx ,65536 ,0 ,255 > ,5854.5 ,5792.3 ,1.01 > icelake ,avx ,65536 ,0 ,256 > ,5213.0 ,5273.9 ,0.99 > icelake ,avx ,65536 ,0 ,4064 > ,5760.7 ,5661.1 ,1.02 > icelake ,avx ,131079 ,0 ,0 > ,12371.4 ,12707.0 ,0.97 > icelake ,avx ,131087 ,0 ,3 > ,13220.1 ,12515.7 ,1.06 > icelake ,avx ,131103 ,3 ,0 > ,11628.2 ,11546.9 ,1.01 > icelake ,avx ,131135 ,3 ,5 > ,13025.7 ,13967.6 ,0.93 > icelake ,avx ,131072 ,0 ,127 > ,11781.7 ,11936.4 ,0.99 > icelake ,avx ,131072 ,0 ,255 > ,11802.2 ,11583.9 ,1.02 > icelake ,avx ,131072 ,0 ,256 > ,10436.9 ,10693.1 ,0.98 > icelake ,avx ,131072 ,0 ,4064 > ,11880.9 ,11395.6 ,1.04 > icelake ,avx ,262151 ,0 ,0 > ,29132.6 ,30542.8 ,0.95 > icelake ,avx ,262159 ,0 ,3 > ,30533.5 ,31468.8 ,0.97 > icelake ,avx ,262175 ,3 ,0 > ,29879.5 ,30933.7 ,0.97 > icelake ,avx ,262207 ,3 ,5 > ,30263.1 ,31445.0 ,0.96 > icelake ,avx ,262144 ,0 ,127 > ,30180.9 ,31405.3 ,0.96 > icelake ,avx ,262144 ,0 ,255 >
,30152.9 ,31372.5 ,0.96 > icelake ,avx ,262144 ,0 ,256 > ,28121.9 ,28990.9 ,0.97 > icelake ,avx ,262144 ,0 ,4064 > ,29785.2 ,31078.4 ,0.96 > icelake ,avx ,524295 ,0 ,0 > ,76045.7 ,75824.3 ,1.0 > icelake ,avx ,524303 ,0 ,3 > ,79303.7 ,80433.3 ,0.99 > icelake ,avx ,524319 ,3 ,0 > ,79323.8 ,79411.3 ,1.0 > icelake ,avx ,524351 ,3 ,5 > ,79797.9 ,80179.4 ,1.0 > icelake ,avx ,524288 ,0 ,127 > ,80046.7 ,80254.1 ,1.0 > icelake ,avx ,524288 ,0 ,255 > ,78580.6 ,79210.4 ,0.99 > icelake ,avx ,524288 ,0 ,256 > ,75464.4 ,75184.2 ,1.0 > icelake ,avx ,524288 ,0 ,4064 > ,78863.6 ,78677.9 ,1.0 > icelake ,avx ,1048583 ,0 ,0 > ,131017.9 ,133962.4 ,0.98 > icelake ,avx ,1048591 ,0 ,3 > ,143451.3 ,210311.7 ,0.68 > icelake ,avx ,1048607 ,3 ,0 > ,136944.0 ,138426.4 ,0.99 > icelake ,avx ,1048639 ,3 ,5 > ,143594.3 ,209887.9 ,0.68 > icelake ,avx ,1048576 ,0 ,127 > ,156462.0 ,218873.2 ,0.71 > icelake ,avx ,1048576 ,0 ,255 > ,148026.3 ,179419.0 ,0.83 > icelake ,avx ,1048576 ,0 ,256 > ,143365.7 ,137816.3 ,1.04 > icelake ,avx ,1048576 ,0 ,4064 > ,131683.4 ,132731.6 ,0.99 > icelake ,avx ,2097159 ,0 ,0 > ,263807.1 ,267984.5 ,0.98 > icelake ,avx ,2097167 ,0 ,3 > ,286949.8 ,422279.2 ,0.68 > icelake ,avx ,2097183 ,3 ,0 > ,274675.6 ,276702.2 ,0.99 > icelake ,avx ,2097215 ,3 ,5 > ,286681.7 ,420176.7 ,0.68 > icelake ,avx ,2097152 ,0 ,127 > ,314499.2 ,437864.2 ,0.72 > icelake ,avx ,2097152 ,0 ,255 > ,297458.4 ,359520.9 ,0.83 > icelake ,avx ,2097152 ,0 ,256 > ,285883.2 ,276043.2 ,1.04 > icelake ,avx ,2097152 ,0 ,4064 > ,263436.6 ,265516.6 ,0.99 > icelake ,avx ,4194311 ,0 ,0 > ,529119.4 ,536745.2 ,0.99 > icelake ,avx ,4194319 ,0 ,3 > ,573960.0 ,839002.3 ,0.68 > icelake ,avx ,4194335 ,3 ,0 > ,550617.2 ,553117.5 ,1.0 > icelake ,avx ,4194367 ,3 ,5 > ,572742.8 ,838784.5 ,0.68 > icelake ,avx ,4194304 ,0 ,127 > ,629413.6 ,876512.1 ,0.72 > icelake ,avx ,4194304 ,0 ,255 > ,594224.1 ,717425.1 ,0.83 > icelake ,avx ,4194304 ,0 ,256 > ,573365.0 ,552538.3 ,1.04 > icelake ,avx ,4194304 ,0 ,4064 > ,527459.3 ,531907.1 ,0.99 
> icelake ,avx ,8388615 ,0 ,0 > ,1094256.8 ,1145619.9 ,0.96 > icelake ,avx ,8388623 ,0 ,3 > ,1170367.1 ,1700076.4 ,0.69 > icelake ,avx ,8388639 ,3 ,0 > ,1136168.1 ,1174752.4 ,0.97 > icelake ,avx ,8388671 ,3 ,5 > ,1172015.6 ,1703032.8 ,0.69 > icelake ,avx ,8388608 ,0 ,127 > ,1276748.6 ,1771351.9 ,0.72 > icelake ,avx ,8388608 ,0 ,255 > ,1207712.0 ,1449267.0 ,0.83 > icelake ,avx ,8388608 ,0 ,256 > ,1167958.9 ,1178243.1 ,0.99 > icelake ,avx ,8388608 ,0 ,4064 > ,1106155.9 ,1145128.6 ,0.97 > icelake ,avx ,16777223 ,0 ,0 > ,2479317.5 ,2630301.0 ,0.94 > icelake ,avx ,16777231 ,0 ,3 > ,2643303.6 ,3536980.7 ,0.75 > icelake ,avx ,16777247 ,3 ,0 > ,2571967.0 ,2672246.4 ,0.96 > icelake ,avx ,16777279 ,3 ,5 > ,2641320.5 ,3538388.9 ,0.75 > icelake ,avx ,16777216 ,0 ,127 > ,2832921.6 ,3593702.5 ,0.79 > icelake ,avx ,16777216 ,0 ,255 > ,2700272.1 ,3025346.1 ,0.89 > icelake ,avx ,16777216 ,0 ,256 > ,2622133.7 ,2709087.6 ,0.97 > icelake ,avx ,16777216 ,0 ,4064 > ,2475020.7 ,2610977.8 ,0.95 > icelake ,avx ,33554439 ,0 ,0 > ,5190103.1 ,5576047.9 ,0.93 > icelake ,avx ,33554447 ,0 ,3 > ,5477752.1 ,7215479.2 ,0.76 > icelake ,avx ,33554463 ,3 ,0 > ,5338711.7 ,5625026.7 ,0.95 > icelake ,avx ,33554495 ,3 ,5 > ,5505164.8 ,7223660.8 ,0.76 > icelake ,avx ,33554432 ,0 ,127 > ,5859232.3 ,7279581.9 ,0.8 > icelake ,avx ,33554432 ,0 ,255 > ,5681634.7 ,6156488.6 ,0.92 > icelake ,avx ,33554432 ,0 ,256 > ,5440721.4 ,5728347.4 ,0.95 > icelake ,avx ,33554432 ,0 ,4064 > ,5191213.2 ,5538716.4 ,0.94 > icelake ,avx512 ,65543 ,0 ,0 > ,5563.5 ,5634.1 ,0.99 > icelake ,avx512 ,65551 ,0 ,3 > ,5864.1 ,5728.4 ,1.02 > icelake ,avx512 ,65567 ,3 ,0 > ,5720.2 ,5625.3 ,1.02 > icelake ,avx512 ,65599 ,3 ,5 > ,5857.2 ,5722.0 ,1.02 > icelake ,avx512 ,65536 ,0 ,127 > ,6040.7 ,5844.0 ,1.03 > icelake ,avx512 ,65536 ,0 ,255 > ,5826.5 ,5799.6 ,1.0 > icelake ,avx512 ,65536 ,0 ,256 > ,5234.4 ,5230.0 ,1.0 > icelake ,avx512 ,65536 ,0 ,4064 > ,5800.7 ,5655.4 ,1.03 > icelake ,avx512 ,131079 ,0 ,0 > ,12591.4 ,11767.1 ,1.07 > icelake 
,avx512 ,131087 ,0 ,3 > ,12694.9 ,12292.1 ,1.03 > icelake ,avx512 ,131103 ,3 ,0 > ,11374.7 ,12236.3 ,0.93 > icelake ,avx512 ,131135 ,3 ,5 > ,11958.2 ,11745.5 ,1.02 > icelake ,avx512 ,131072 ,0 ,127 > ,11803.4 ,11908.6 ,0.99 > icelake ,avx512 ,131072 ,0 ,255 > ,11569.0 ,11487.9 ,1.01 > icelake ,avx512 ,131072 ,0 ,256 > ,11087.6 ,10456.4 ,1.06 > icelake ,avx512 ,131072 ,0 ,4064 > ,11166.0 ,11248.2 ,0.99 > icelake ,avx512 ,262151 ,0 ,0 > ,30232.1 ,29932.7 ,1.01 > icelake ,avx512 ,262159 ,0 ,3 > ,30093.8 ,31315.1 ,0.96 > icelake ,avx512 ,262175 ,3 ,0 > ,30147.7 ,30643.4 ,0.98 > icelake ,avx512 ,262207 ,3 ,5 > ,29985.9 ,31479.8 ,0.95 > icelake ,avx512 ,262144 ,0 ,127 > ,30099.7 ,31552.9 ,0.95 > icelake ,avx512 ,262144 ,0 ,255 > ,29772.8 ,30698.1 ,0.97 > icelake ,avx512 ,262144 ,0 ,256 > ,28109.3 ,28957.9 ,0.97 > icelake ,avx512 ,262144 ,0 ,4064 > ,29787.5 ,30637.2 ,0.97 > icelake ,avx512 ,524295 ,0 ,0 > ,75920.7 ,75047.1 ,1.01 > icelake ,avx512 ,524303 ,0 ,3 > ,79218.6 ,79529.2 ,1.0 > icelake ,avx512 ,524319 ,3 ,0 > ,78446.9 ,78550.7 ,1.0 > icelake ,avx512 ,524351 ,3 ,5 > ,79055.0 ,79425.2 ,1.0 > icelake ,avx512 ,524288 ,0 ,127 > ,79070.6 ,79626.7 ,0.99 > icelake ,avx512 ,524288 ,0 ,255 > ,77891.8 ,78078.3 ,1.0 > icelake ,avx512 ,524288 ,0 ,256 > ,74797.3 ,74436.9 ,1.0 > icelake ,avx512 ,524288 ,0 ,4064 > ,78339.3 ,78337.2 ,1.0 > icelake ,avx512 ,1048583 ,0 ,0 > ,131427.6 ,133891.3 ,0.98 > icelake ,avx512 ,1048591 ,0 ,3 > ,143984.1 ,142003.7 ,1.01 > icelake ,avx512 ,1048607 ,3 ,0 > ,137547.9 ,134450.1 ,1.02 > icelake ,avx512 ,1048639 ,3 ,5 > ,144630.4 ,142174.6 ,1.02 > icelake ,avx512 ,1048576 ,0 ,127 > ,149810.7 ,142684.9 ,1.05 > icelake ,avx512 ,1048576 ,0 ,255 > ,156212.6 ,143509.2 ,1.09 > icelake ,avx512 ,1048576 ,0 ,256 > ,153776.9 ,139788.0 ,1.1 > icelake ,avx512 ,1048576 ,0 ,4064 > ,137926.6 ,134832.8 ,1.02 > icelake ,avx512 ,2097159 ,0 ,0 > ,263465.3 ,267681.6 ,0.98 > icelake ,avx512 ,2097167 ,0 ,3 > ,288947.7 ,284129.9 ,1.02 > icelake ,avx512 ,2097183 ,3 ,0 > 
,275395.5 ,269216.0 ,1.02 > icelake ,avx512 ,2097215 ,3 ,5 > ,289131.5 ,284475.3 ,1.02 > icelake ,avx512 ,2097152 ,0 ,127 > ,299404.5 ,286193.2 ,1.05 > icelake ,avx512 ,2097152 ,0 ,255 > ,312913.2 ,286785.6 ,1.09 > icelake ,avx512 ,2097152 ,0 ,256 > ,307882.7 ,279708.7 ,1.1 > icelake ,avx512 ,2097152 ,0 ,4064 > ,275552.3 ,269867.0 ,1.02 > icelake ,avx512 ,4194311 ,0 ,0 > ,526480.1 ,536038.9 ,0.98 > icelake ,avx512 ,4194319 ,0 ,3 > ,579122.9 ,569512.5 ,1.02 > icelake ,avx512 ,4194335 ,3 ,0 > ,551658.1 ,542973.3 ,1.02 > icelake ,avx512 ,4194367 ,3 ,5 > ,578575.2 ,569497.2 ,1.02 > icelake ,avx512 ,4194304 ,0 ,127 > ,599943.6 ,569138.2 ,1.05 > icelake ,avx512 ,4194304 ,0 ,255 > ,628419.2 ,575908.4 ,1.09 > icelake ,avx512 ,4194304 ,0 ,256 > ,617242.8 ,561417.7 ,1.1 > icelake ,avx512 ,4194304 ,0 ,4064 > ,552012.3 ,540617.2 ,1.02 > icelake ,avx512 ,8388615 ,0 ,0 > ,1092471.4 ,1133834.9 ,0.96 > icelake ,avx512 ,8388623 ,0 ,3 > ,1185623.5 ,1218150.0 ,0.97 > icelake ,avx512 ,8388639 ,3 ,0 > ,1142647.1 ,1139201.6 ,1.0 > icelake ,avx512 ,8388671 ,3 ,5 > ,1183702.5 ,1225474.6 ,0.97 > icelake ,avx512 ,8388608 ,0 ,127 > ,1231862.8 ,1221685.1 ,1.01 > icelake ,avx512 ,8388608 ,0 ,255 > ,1290816.7 ,1221576.2 ,1.06 > icelake ,avx512 ,8388608 ,0 ,256 > ,1299047.6 ,1195021.2 ,1.09 > icelake ,avx512 ,8388608 ,0 ,4064 > ,1139648.9 ,1140113.0 ,1.0 > icelake ,avx512 ,16777223 ,0 ,0 > ,2464861.2 ,2599120.4 ,0.95 > icelake ,avx512 ,16777231 ,0 ,3 > ,2651029.7 ,2758867.1 ,0.96 > icelake ,avx512 ,16777247 ,3 ,0 > ,2570099.8 ,2601099.4 ,0.99 > icelake ,avx512 ,16777279 ,3 ,5 > ,2660529.4 ,2762598.6 ,0.96 > icelake ,avx512 ,16777216 ,0 ,127 > ,2759531.7 ,2756811.1 ,1.0 > icelake ,avx512 ,16777216 ,0 ,255 > ,2878568.5 ,2777650.3 ,1.04 > icelake ,avx512 ,16777216 ,0 ,256 > ,2931879.3 ,2709687.7 ,1.08 > icelake ,avx512 ,16777216 ,0 ,4064 > ,2587161.1 ,2632011.2 ,0.98 > icelake ,avx512 ,33554439 ,0 ,0 > ,5175406.0 ,5528857.2 ,0.94 > icelake ,avx512 ,33554447 ,0 ,3 > ,5537561.9 ,5818119.1 ,0.95 > 
icelake ,avx512 ,33554463 ,3 ,0 > ,5435099.5 ,5560442.2 ,0.98 > icelake ,avx512 ,33554495 ,3 ,5 > ,5546314.9 ,5800995.0 ,0.96 > icelake ,avx512 ,33554432 ,0 ,127 > ,5770248.0 ,5781104.9 ,1.0 > icelake ,avx512 ,33554432 ,0 ,255 > ,6019120.7 ,5836023.3 ,1.03 > icelake ,avx512 ,33554432 ,0 ,256 > ,6107033.4 ,5681798.8 ,1.07 > icelake ,avx512 ,33554432 ,0 ,4064 > ,5356238.5 ,5598521.5 ,0.96 > skylake ,sse2 ,65543 ,0 ,0 > ,3091.4 ,2940.2 ,1.05 > skylake ,sse2 ,65551 ,0 ,3 > ,3682.6 ,3403.7 ,1.08 > skylake ,sse2 ,65567 ,3 ,0 > ,3031.3 ,3070.2 ,0.99 > skylake ,sse2 ,65599 ,3 ,5 > ,3731.2 ,3718.7 ,1.0 > skylake ,sse2 ,65536 ,0 ,127 > ,3642.3 ,3390.5 ,1.07 > skylake ,sse2 ,65536 ,0 ,255 > ,3493.9 ,3333.0 ,1.05 > skylake ,sse2 ,65536 ,0 ,256 > ,3043.2 ,2981.0 ,1.02 > skylake ,sse2 ,65536 ,0 ,4064 > ,2796.6 ,2843.9 ,0.98 > skylake ,sse2 ,131079 ,0 ,0 > ,6347.4 ,6309.8 ,1.01 > skylake ,sse2 ,131087 ,0 ,3 > ,7318.4 ,7486.2 ,0.98 > skylake ,sse2 ,131103 ,3 ,0 > ,6297.4 ,6516.8 ,0.97 > skylake ,sse2 ,131135 ,3 ,5 > ,7544.5 ,7823.5 ,0.96 > skylake ,sse2 ,131072 ,0 ,127 > ,7426.4 ,7554.3 ,0.98 > skylake ,sse2 ,131072 ,0 ,255 > ,7349.0 ,7195.4 ,1.02 > skylake ,sse2 ,131072 ,0 ,256 > ,7068.1 ,6804.8 ,1.04 > skylake ,sse2 ,131072 ,0 ,4064 > ,6884.6 ,7566.7 ,0.91 > skylake ,sse2 ,262151 ,0 ,0 > ,15848.1 ,15552.2 ,1.02 > skylake ,sse2 ,262159 ,0 ,3 > ,17864.6 ,16787.9 ,1.06 > skylake ,sse2 ,262175 ,3 ,0 > ,15748.1 ,16266.0 ,0.97 > skylake ,sse2 ,262207 ,3 ,5 > ,17022.3 ,17229.8 ,0.99 > skylake ,sse2 ,262144 ,0 ,127 > ,16158.7 ,16093.6 ,1.0 > skylake ,sse2 ,262144 ,0 ,255 > ,15670.7 ,15949.2 ,0.98 > skylake ,sse2 ,262144 ,0 ,256 > ,14806.3 ,14970.3 ,0.99 > skylake ,sse2 ,262144 ,0 ,4064 > ,14751.7 ,15008.2 ,0.98 > skylake ,sse2 ,524295 ,0 ,0 > ,32874.8 ,33731.2 ,0.97 > skylake ,sse2 ,524303 ,0 ,3 > ,34035.1 ,34777.8 ,0.98 > skylake ,sse2 ,524319 ,3 ,0 > ,34325.6 ,34108.9 ,1.01 > skylake ,sse2 ,524351 ,3 ,5 > ,34853.5 ,35624.4 ,0.98 > skylake ,sse2 ,524288 ,0 ,127 > ,33437.4 ,33816.7 
,0.99 > skylake ,sse2 ,524288 ,0 ,255 > ,33256.1 ,33664.7 ,0.99 > skylake ,sse2 ,524288 ,0 ,256 > ,32006.3 ,32396.3 ,0.99 > skylake ,sse2 ,524288 ,0 ,4064 > ,32284.7 ,32713.9 ,0.99 > skylake ,sse2 ,1048583 ,0 ,0 > ,71891.7 ,73858.4 ,0.97 > skylake ,sse2 ,1048591 ,0 ,3 > ,74621.3 ,74389.7 ,1.0 > skylake ,sse2 ,1048607 ,3 ,0 > ,72515.0 ,73573.2 ,0.99 > skylake ,sse2 ,1048639 ,3 ,5 > ,72471.7 ,73782.6 ,0.98 > skylake ,sse2 ,1048576 ,0 ,127 > ,77638.6 ,82474.6 ,0.94 > skylake ,sse2 ,1048576 ,0 ,255 > ,71870.0 ,71933.6 ,1.0 > skylake ,sse2 ,1048576 ,0 ,256 > ,70410.0 ,73243.6 ,0.96 > skylake ,sse2 ,1048576 ,0 ,4064 > ,71267.1 ,72274.6 ,0.99 > skylake ,sse2 ,2097159 ,0 ,0 > ,140052.6 ,144880.1 ,0.97 > skylake ,sse2 ,2097167 ,0 ,3 > ,146626.5 ,147972.6 ,0.99 > skylake ,sse2 ,2097183 ,3 ,0 > ,141750.1 ,146353.6 ,0.97 > skylake ,sse2 ,2097215 ,3 ,5 > ,144169.0 ,148120.1 ,0.97 > skylake ,sse2 ,2097152 ,0 ,127 > ,156575.9 ,165844.4 ,0.94 > skylake ,sse2 ,2097152 ,0 ,255 > ,144277.7 ,146971.5 ,0.98 > skylake ,sse2 ,2097152 ,0 ,256 > ,143047.4 ,146810.9 ,0.97 > skylake ,sse2 ,2097152 ,0 ,4064 > ,142795.6 ,145805.8 ,0.98 > skylake ,sse2 ,4194311 ,0 ,0 > ,284353.3 ,298092.5 ,0.95 > skylake ,sse2 ,4194319 ,0 ,3 > ,296656.4 ,311960.2 ,0.95 > skylake ,sse2 ,4194335 ,3 ,0 > ,285922.6 ,304100.5 ,0.94 > skylake ,sse2 ,4194367 ,3 ,5 > ,297135.4 ,312532.5 ,0.95 > skylake ,sse2 ,4194304 ,0 ,127 > ,323938.6 ,340414.3 ,0.95 > skylake ,sse2 ,4194304 ,0 ,255 > ,301460.9 ,310042.7 ,0.97 > skylake ,sse2 ,4194304 ,0 ,256 > ,287155.8 ,303580.6 ,0.95 > skylake ,sse2 ,4194304 ,0 ,4064 > ,291006.2 ,302441.3 ,0.96 > skylake ,sse2 ,8388615 ,0 ,0 > ,714424.7 ,747484.3 ,0.96 > skylake ,sse2 ,8388623 ,0 ,3 > ,748995.5 ,774116.5 ,0.97 > skylake ,sse2 ,8388639 ,3 ,0 > ,720563.4 ,757386.9 ,0.95 > skylake ,sse2 ,8388671 ,3 ,5 > ,748028.7 ,773907.8 ,0.97 > skylake ,sse2 ,8388608 ,0 ,127 > ,750775.3 ,780245.2 ,0.96 > skylake ,sse2 ,8388608 ,0 ,255 > ,724940.3 ,764197.8 ,0.95 > skylake ,sse2 ,8388608 ,0 ,256 > 
,722035.0 ,759408.9 ,0.95 > skylake ,sse2 ,8388608 ,0 ,4064 > ,756977.8 ,755532.4 ,1.0 > skylake ,sse2 ,16777223 ,0 ,0 > ,1971686.0 ,2111263.4 ,0.93 > skylake ,sse2 ,16777231 ,0 ,3 > ,1953608.9 ,2128493.8 ,0.92 > skylake ,sse2 ,16777247 ,3 ,0 > ,1967075.6 ,2103772.3 ,0.94 > skylake ,sse2 ,16777279 ,3 ,5 > ,1950851.6 ,2133601.6 ,0.91 > skylake ,sse2 ,16777216 ,0 ,127 > ,1991168.2 ,2078249.3 ,0.96 > skylake ,sse2 ,16777216 ,0 ,255 > ,1958502.9 ,2111955.5 ,0.93 > skylake ,sse2 ,16777216 ,0 ,256 > ,1965103.7 ,2114293.0 ,0.93 > skylake ,sse2 ,16777216 ,0 ,4064 > ,1958381.3 ,2103438.6 ,0.93 > skylake ,sse2 ,33554439 ,0 ,0 > ,4456144.2 ,4660837.1 ,0.96 > skylake ,sse2 ,33554447 ,0 ,3 > ,4431097.0 ,4679042.6 ,0.95 > skylake ,sse2 ,33554463 ,3 ,0 > ,4448225.6 ,4648538.3 ,0.96 > skylake ,sse2 ,33554495 ,3 ,5 > ,4427743.0 ,4678340.1 ,0.95 > skylake ,sse2 ,33554432 ,0 ,127 > ,4437517.3 ,4552005.9 ,0.97 > skylake ,sse2 ,33554432 ,0 ,255 > ,4427135.1 ,4543412.0 ,0.97 > skylake ,sse2 ,33554432 ,0 ,256 > ,4441311.2 ,4658315.5 ,0.95 > skylake ,sse2 ,33554432 ,0 ,4064 > ,4429798.4 ,4659499.6 ,0.95 skylake ,avx ,65543 > ,0 ,0 ,3115.8 ,3043.7 ,1.02 > skylake ,avx ,65551 ,0 ,3 > ,3673.2 ,3551.7 ,1.03 > skylake ,avx ,65567 ,3 ,0 > ,3024.6 ,2887.4 ,1.05 > skylake ,avx ,65599 ,3 ,5 > ,3907.8 ,3636.4 ,1.07 > skylake ,avx ,65536 ,0 ,127 > ,3539.2 ,3372.3 ,1.05 > skylake ,avx ,65536 ,0 ,255 > ,3489.9 ,3344.0 ,1.04 > skylake ,avx ,65536 ,0 ,256 > ,3059.0 ,2924.4 ,1.05 > skylake ,avx ,65536 ,0 ,4064 > ,2805.0 ,2869.3 ,0.98 > skylake ,avx ,131079 ,0 ,0 > ,6129.2 ,6263.4 ,0.98 > skylake ,avx ,131087 ,0 ,3 > ,7096.8 ,7570.0 ,0.94 > skylake ,avx ,131103 ,3 ,0 > ,6394.5 ,6842.5 ,0.93 > skylake ,avx ,131135 ,3 ,5 > ,7462.8 ,7776.0 ,0.96 > skylake ,avx ,131072 ,0 ,127 > ,7726.9 ,7428.5 ,1.04 > skylake ,avx ,131072 ,0 ,255 > ,7167.4 ,7278.9 ,0.98 > skylake ,avx ,131072 ,0 ,256 > ,7197.9 ,6284.3 ,1.15 > skylake ,avx ,131072 ,0 ,4064 > ,6984.0 ,6940.4 ,1.01 > skylake ,avx ,262151 ,0 ,0 > ,15787.3 
,16403.1 ,0.96 > skylake ,avx ,262159 ,0 ,3 > ,17800.1 ,17628.1 ,1.01 > skylake ,avx ,262175 ,3 ,0 > ,16622.8 ,16244.3 ,1.02 > skylake ,avx ,262207 ,3 ,5 > ,16989.7 ,17509.0 ,0.97 > skylake ,avx ,262144 ,0 ,127 > ,16190.8 ,15971.8 ,1.01 > skylake ,avx ,262144 ,0 ,255 > ,15787.1 ,15876.7 ,0.99 > skylake ,avx ,262144 ,0 ,256 > ,14840.1 ,14997.0 ,0.99 > skylake ,avx ,262144 ,0 ,4064 > ,15743.0 ,14976.2 ,1.05 > skylake ,avx ,524295 ,0 ,0 > ,32848.5 ,33397.8 ,0.98 > skylake ,avx ,524303 ,0 ,3 > ,34872.1 ,34862.2 ,1.0 > skylake ,avx ,524319 ,3 ,0 > ,33784.6 ,34023.8 ,0.99 > skylake ,avx ,524351 ,3 ,5 > ,35337.1 ,35364.5 ,1.0 > skylake ,avx ,524288 ,0 ,127 > ,33624.5 ,33596.5 ,1.0 > skylake ,avx ,524288 ,0 ,255 > ,33390.7 ,33842.8 ,0.99 > skylake ,avx ,524288 ,0 ,256 > ,31937.0 ,32357.2 ,0.99 > skylake ,avx ,524288 ,0 ,4064 > ,32233.5 ,32267.3 ,1.0 > skylake ,avx ,1048583 ,0 ,0 > ,100354.7 ,105840.6 ,0.95 > skylake ,avx ,1048591 ,0 ,3 > ,68102.5 ,67496.0 ,1.01 > skylake ,avx ,1048607 ,3 ,0 > ,66146.1 ,67540.0 ,0.98 > skylake ,avx ,1048639 ,3 ,5 > ,67530.8 ,67726.4 ,1.0 > skylake ,avx ,1048576 ,0 ,127 > ,67105.6 ,66533.5 ,1.01 > skylake ,avx ,1048576 ,0 ,255 > ,67101.8 ,65666.7 ,1.02 > skylake ,avx ,1048576 ,0 ,256 > ,65092.6 ,67103.0 ,0.97 > skylake ,avx ,1048576 ,0 ,4064 > ,65700.0 ,67031.5 ,0.98 > skylake ,avx ,2097159 ,0 ,0 > ,133101.0 ,135171.6 ,0.98 > skylake ,avx ,2097167 ,0 ,3 > ,134174.4 ,135782.1 ,0.99 > skylake ,avx ,2097183 ,3 ,0 > ,132056.4 ,134170.0 ,0.98 > skylake ,avx ,2097215 ,3 ,5 > ,134413.5 ,136341.1 ,0.99 > skylake ,avx ,2097152 ,0 ,127 > ,133003.9 ,132992.1 ,1.0 > skylake ,avx ,2097152 ,0 ,255 > ,133344.3 ,132883.1 ,1.0 > skylake ,avx ,2097152 ,0 ,256 > ,134051.7 ,136185.8 ,0.98 > skylake ,avx ,2097152 ,0 ,4064 > ,132976.3 ,135029.4 ,0.98 > skylake ,avx ,4194311 ,0 ,0 > ,268004.1 ,282650.3 ,0.95 > skylake ,avx ,4194319 ,0 ,3 > ,270270.0 ,286700.3 ,0.94 > skylake ,avx ,4194335 ,3 ,0 > ,264288.5 ,279582.4 ,0.95 > skylake ,avx ,4194367 ,3 ,5 > ,270498.4 
,286294.5 ,0.94 > skylake ,avx ,4194304 ,0 ,127 > ,271219.3 ,275129.8 ,0.99 > skylake ,avx ,4194304 ,0 ,255 > ,269996.5 ,270227.6 ,1.0 > skylake ,avx ,4194304 ,0 ,256 > ,267901.1 ,281673.1 ,0.95 > skylake ,avx ,4194304 ,0 ,4064 > ,268390.0 ,279100.3 ,0.96 > skylake ,avx ,8388615 ,0 ,0 > ,803547.9 ,813229.9 ,0.99 > skylake ,avx ,8388623 ,0 ,3 > ,828872.4 ,869413.0 ,0.95 > skylake ,avx ,8388639 ,3 ,0 > ,818000.0 ,873781.7 ,0.94 > skylake ,avx ,8388671 ,3 ,5 > ,824679.0 ,863561.5 ,0.95 > skylake ,avx ,8388608 ,0 ,127 > ,800728.5 ,779000.8 ,1.03 > skylake ,avx ,8388608 ,0 ,255 > ,820071.4 ,770113.2 ,1.06 > skylake ,avx ,8388608 ,0 ,256 > ,825624.6 ,867247.7 ,0.95 > skylake ,avx ,8388608 ,0 ,4064 > ,830209.7 ,894086.6 ,0.93 > skylake ,avx ,16777223 ,0 ,0 > ,1989391.3 ,2132829.8 ,0.93 > skylake ,avx ,16777231 ,0 ,3 > ,1994225.1 ,2211556.0 ,0.9 > skylake ,avx ,16777247 ,3 ,0 > ,1993572.9 ,2213029.9 ,0.9 > skylake ,avx ,16777279 ,3 ,5 > ,2001956.9 ,2211769.7 ,0.91 > skylake ,avx ,16777216 ,0 ,127 > ,1968155.9 ,2127764.7 ,0.92 > skylake ,avx ,16777216 ,0 ,255 > ,1978305.1 ,2121371.3 ,0.93 > skylake ,avx ,16777216 ,0 ,256 > ,1993261.9 ,2206494.1 ,0.9 > skylake ,avx ,16777216 ,0 ,4064 > ,1993808.3 ,2198137.4 ,0.91 > skylake ,avx ,33554439 ,0 ,0 > ,4540216.7 ,4870021.8 ,0.93 > skylake ,avx ,33554447 ,0 ,3 > ,4483505.3 ,4850545.5 ,0.92 > skylake ,avx ,33554463 ,3 ,0 > ,4501944.5 ,4870922.4 ,0.92 > skylake ,avx ,33554495 ,3 ,5 > ,4484565.5 ,4845392.4 ,0.93 > skylake ,avx ,33554432 ,0 ,127 > ,4408639.3 ,4701698.6 ,0.94 > skylake ,avx ,33554432 ,0 ,255 > ,4445826.0 ,4678142.9 ,0.95 > skylake ,avx ,33554432 ,0 ,256 > ,4497953.2 ,4844498.6 ,0.93 > skylake ,avx ,33554432 ,0 ,4064 > ,4501572.4 ,4839209.4 ,0.93 > > > > > -- > > H.J.
Sorry, there was a mistake in the last set of data. It was truncated to N = 10. Here is the N = 100 data. Sorry for the spam! On Sat, Apr 3, 2021 at 3:44 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > Last message got formatted weirdly. Here is a file with the data. > > On Sat, Apr 3, 2021 at 3:41 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Sat, Apr 3, 2021 at 1:46 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > On Sat, Apr 3, 2021 at 1:12 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > From: noah <goldstein.w.n@gmail.com> > > > > > > > > No Bug. This commit updates the large memcpy case (no overlap). The > > > > update is to perform memcpy on either 2 or 4 contiguous pages at > > > > once. This 1) helps to alleviate the effects of false memory aliasing > > > > when destination and source have a close 4k alignment and 2) In most > > > > cases and for most DRAM units is a modestly more efficient access > > > > pattern. These changes are a clear performance improvement for > > > > VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy, > > > > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all > > > > pass. > > > > > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > > > --- > > > > Issue was alignment related AFAICT. Added `.p2align 4` in front of the > > > > loops and no longer see any meaningful regression. > > > > > > > > Also added back the temporal stores for the tail. Saw a regression > > > > when doing these tests. > > > > > > > > Two tables below for skylake and icelake numbers for the areas around > > > > where you saw the regression. Below is all data from the tests. > > > > > > > > N = 10. 
> > > > > > > > Skylake > > > > Len ,align1 ,align2 ,new mean ,old mean > > > > 4103 ,0 ,64 ,84.5 ,88.6 > > > > 4111 ,0 ,3 ,99.0 ,99.9 > > > > 4127 ,3 ,0 ,102.1 ,102.3 > > > > 4159 ,3 ,7 ,88.7 ,90.9 > > > > 4223 ,9 ,5 ,88.1 ,87.4 > > > > 8199 ,0 ,64 ,146.7 ,150.2 > > > > 8207 ,0 ,3 ,167.9 ,168.5 > > > > 8223 ,3 ,0 ,168.5 ,168.1 > > > > 8255 ,3 ,7 ,157.0 ,159.2 > > > > 8319 ,9 ,5 ,155.5 ,155.7 > > > > 16391 ,0 ,64 ,286.2 ,288.8 > > > > 16399 ,0 ,3 ,307.0 ,308.7 > > > > 16415 ,3 ,0 ,307.4 ,307.6 > > > > 16447 ,3 ,7 ,294.6 ,295.5 > > > > 16511 ,9 ,5 ,291.5 ,462.1 > > > > 32775 ,0 ,64 ,603.4 ,601.5 > > > > 32783 ,0 ,3 ,604.8 ,606.4 > > > > 32799 ,3 ,0 ,603.0 ,604.1 > > > > 32831 ,3 ,7 ,600.2 ,737.3 > > > > 32895 ,9 ,5 ,604.4 ,599.5 > > > > 65543 ,0 ,64 ,1873.5 ,1854.3 > > > > 65551 ,0 ,3 ,1862.9 ,1846.6 > > > > 65567 ,3 ,0 ,1885.5 ,1966.0 > > > > 65599 ,3 ,7 ,1833.2 ,1833.1 > > > > 65663 ,9 ,5 ,1884.9 ,1887.4 > > > > 131079 ,0 ,64 ,3944.3 ,3949.4 > > > > 131087 ,0 ,3 ,3927.3 ,3913.3 > > > > 131103 ,3 ,0 ,4415.8 ,4169.4 > > > > 131135 ,3 ,7 ,4224.5 ,4157.6 > > > > 131199 ,9 ,5 ,5974.0 ,4983.8 > > > > 262151 ,0 ,64 ,11050.2 ,10620.6 > > > > 262159 ,0 ,3 ,9932.8 ,10037.3 > > > > 262175 ,3 ,0 ,10188.8 ,9206.6 > > > > 262207 ,3 ,7 ,9633.3 ,9216.7 > > > > 262271 ,9 ,5 ,9732.7 ,9345.3 > > > > 524295 ,0 ,64 ,24823.9 ,24880.7 > > > > 524303 ,0 ,3 ,24514.0 ,24556.7 > > > > 524319 ,3 ,0 ,23974.4 ,24219.9 > > > > 524351 ,3 ,7 ,24159.7 ,24207.0 > > > > 524415 ,9 ,5 ,23946.5 ,24142.8 > > > > > > > > Icelake: > > > > Len ,align1 ,align2 ,new mean ,old mean > > > > 4103 ,0 ,64 ,50.2 ,63.7 > > > > 4111 ,0 ,3 ,63.7 ,65.1 > > > > 4127 ,3 ,0 ,68.2 ,69.4 > > > > 4159 ,3 ,7 ,59.6 ,68.0 > > > > 4223 ,9 ,5 ,68.2 ,66.8 > > > > 8199 ,0 ,64 ,92.1 ,89.9 > > > > 8207 ,0 ,3 ,119.7 ,118.3 > > > > 8223 ,3 ,0 ,119.1 ,120.9 > > > > 8255 ,3 ,7 ,122.9 ,123.7 > > > > 8319 ,9 ,5 ,122.1 ,121.8 > > > > 16391 ,0 ,64 ,162.7 ,158.0 > > > > 16399 ,0 ,3 ,227.6 ,234.1 > > > > 16415 ,3 ,0 ,230.8 ,232.7 > > > > 
16447 ,3 ,7 ,226.8 ,232.6 > > > > 16511 ,9 ,5 ,233.4 ,233.8 > > > > 32775 ,0 ,64 ,312.2 ,301.8 > > > > 32783 ,0 ,3 ,449.7 ,450.0 > > > > 32799 ,3 ,0 ,452.7 ,455.9 > > > > 32831 ,3 ,7 ,449.8 ,458.0 > > > > 32895 ,9 ,5 ,456.3 ,459.4 > > > > 65543 ,0 ,64 ,1460.6 ,1463.9 > > > > 65551 ,0 ,3 ,1462.0 ,1465.4 > > > > 65567 ,3 ,0 ,1466.6 ,1480.4 > > > > 65599 ,3 ,7 ,1488.0 ,1488.9 > > > > 65663 ,9 ,5 ,1680.8 ,1499.5 > > > > 131079 ,0 ,64 ,2988.5 ,3010.1 > > > > 131087 ,0 ,3 ,2995.5 ,2996.4 > > > > 131103 ,3 ,0 ,3006.2 ,3000.5 > > > > 131135 ,3 ,7 ,3032.4 ,3073.7 > > > > 131199 ,9 ,5 ,3010.4 ,3027.4 > > > > 262151 ,0 ,64 ,6143.2 ,6079.1 > > > > 262159 ,0 ,3 ,6085.1 ,6075.8 > > > > 262175 ,3 ,0 ,6088.0 ,6064.9 > > > > 262207 ,3 ,7 ,6018.7 ,6023.5 > > > > 262271 ,9 ,5 ,6019.8 ,5959.2 > > > > 524295 ,0 ,64 ,14464.2 ,14095.1 > > > > 524303 ,0 ,3 ,14761.6 ,14050.2 > > > > 524319 ,3 ,0 ,14534.1 ,14087.5 > > > > 524351 ,3 ,7 ,14147.7 ,13903.8 > > > > 524415 ,9 ,5 ,14157.0 ,13982.9 > > > > > > > > > > > > > > > > cpu ,version ,Len ,align1 ,align2 ,new mean ,old mean > > > > skylake ,avx ,4103 ,0 ,64 ,84.5 ,88.6 > > > > skylake ,avx ,4111 ,0 ,3 ,99.0 ,99.9 > > > > skylake ,avx ,4127 ,3 ,0 ,102.1 ,102.3 > > > > skylake ,avx ,4159 ,3 ,7 ,88.7 ,90.9 > > > > skylake ,avx ,4223 ,9 ,5 ,88.1 ,87.4 > > > > skylake ,avx ,8199 ,0 ,64 ,146.7 ,150.2 > > > > skylake ,avx ,8207 ,0 ,3 ,167.9 ,168.5 > > > > skylake ,avx ,8223 ,3 ,0 ,168.5 ,168.1 > > > > skylake ,avx ,8255 ,3 ,7 ,157.0 ,159.2 > > > > skylake ,avx ,8319 ,9 ,5 ,155.5 ,155.7 > > > > skylake ,avx ,16391 ,0 ,64 ,286.2 ,288.8 > > > > skylake ,avx ,16399 ,0 ,3 ,307.0 ,308.7 > > > > skylake ,avx ,16415 ,3 ,0 ,307.4 ,307.6 > > > > skylake ,avx ,16447 ,3 ,7 ,294.6 ,295.5 > > > > skylake ,avx ,16511 ,9 ,5 ,291.5 ,462.1 > > > > skylake ,avx ,32775 ,0 ,64 ,603.4 ,601.5 > > > > skylake ,avx ,32783 ,0 ,3 ,604.8 ,606.4 > > > > skylake ,avx ,32799 ,3 ,0 ,603.0 ,604.1 > > > > skylake ,avx ,32831 ,3 ,7 ,600.2 ,737.3 > > > > skylake ,avx ,32895 ,9 ,5 
,604.4 ,599.5 > > > > skylake ,avx ,65543 ,0 ,64 ,1873.5 ,1854.3 > > > > skylake ,avx ,65551 ,0 ,3 ,1862.9 ,1846.6 > > > > skylake ,avx ,65567 ,3 ,0 ,1885.5 ,1966.0 > > > > skylake ,avx ,65599 ,3 ,7 ,1833.2 ,1833.1 > > > > skylake ,avx ,65663 ,9 ,5 ,1884.9 ,1887.4 > > > > skylake ,avx ,131079 ,0 ,64 ,3944.3 ,3949.4 > > > > skylake ,avx ,131087 ,0 ,3 ,3927.3 ,3913.3 > > > > skylake ,avx ,131103 ,3 ,0 ,4415.8 ,4169.4 > > > > skylake ,avx ,131135 ,3 ,7 ,4224.5 ,4157.6 > > > > skylake ,avx ,131199 ,9 ,5 ,5974.0 ,4983.8 > > > > skylake ,avx ,262151 ,0 ,64 ,11050.2 ,10620.6 > > > > skylake ,avx ,262159 ,0 ,3 ,9932.8 ,10037.3 > > > > skylake ,avx ,262175 ,3 ,0 ,10188.8 ,9206.6 > > > > skylake ,avx ,262207 ,3 ,7 ,9633.3 ,9216.7 > > > > skylake ,avx ,262271 ,9 ,5 ,9732.7 ,9345.3 > > > > skylake ,avx ,524295 ,0 ,64 ,24823.9 ,24880.7 > > > > skylake ,avx ,524303 ,0 ,3 ,24514.0 ,24556.7 > > > > skylake ,avx ,524319 ,3 ,0 ,23974.4 ,24219.9 > > > > skylake ,avx ,524351 ,3 ,7 ,24159.7 ,24207.0 > > > > skylake ,avx ,524415 ,9 ,5 ,23946.5 ,24142.8 > > > > skylake ,avx ,1048583 ,0 ,64 ,49163.9 ,49454.6 > > > > skylake ,avx ,1048591 ,0 ,3 ,49879.3 ,49400.8 > > > > skylake ,avx ,1048607 ,3 ,0 ,49738.0 ,48864.6 > > > > skylake ,avx ,1048639 ,3 ,7 ,48804.0 ,47588.5 > > > > skylake ,avx ,1048703 ,9 ,5 ,49629.4 ,49796.3 > > > > skylake ,avx ,2097159 ,0 ,64 ,98271.7 ,96330.6 > > > > skylake ,avx ,2097167 ,0 ,3 ,97801.8 ,98638.1 > > > > skylake ,avx ,2097183 ,3 ,0 ,98041.1 ,99287.6 > > > > skylake ,avx ,2097215 ,3 ,7 ,96629.5 ,96521.9 > > > > skylake ,avx ,2097279 ,9 ,5 ,98961.8 ,98909.8 > > > > skylake ,avx ,4194311 ,0 ,64 ,194667.7 ,195377.1 > > > > skylake ,avx ,4194319 ,0 ,3 ,194919.5 ,198576.2 > > > > skylake ,avx ,4194335 ,3 ,0 ,192949.8 ,194584.7 > > > > skylake ,avx ,4194367 ,3 ,7 ,189943.5 ,189177.9 > > > > skylake ,avx ,4194431 ,9 ,5 ,192479.1 ,196494.2 > > > > skylake ,avx ,8388615 ,0 ,64 ,588671.6 ,587215.4 > > > > skylake ,avx ,8388623 ,0 ,3 ,581640.7 ,582812.5 > > > > skylake 
,avx ,8388639 ,3 ,0 ,549811.9 ,544697.6 > > > > skylake ,avx ,8388671 ,3 ,7 ,591155.0 ,577951.8 > > > > skylake ,avx ,8388735 ,9 ,5 ,547583.2 ,545133.3 > > > > skylake ,avx ,16777223 ,0 ,64 ,1787503.0 ,1811146.0 > > > > skylake ,avx ,16777231 ,0 ,3 ,1758671.0 ,1756343.0 > > > > skylake ,avx ,16777247 ,3 ,0 ,1691781.0 ,1694661.0 > > > > skylake ,avx ,16777279 ,3 ,7 ,1768150.0 ,1754785.0 > > > > skylake ,avx ,16777343 ,9 ,5 ,1695179.0 ,1710794.0 > > > > skylake ,sse2 ,4103 ,0 ,64 ,150.8 ,150.5 > > > > skylake ,sse2 ,4111 ,0 ,3 ,156.8 ,158.4 > > > > skylake ,sse2 ,4127 ,3 ,0 ,99.7 ,99.4 > > > > skylake ,sse2 ,4159 ,3 ,7 ,154.8 ,154.5 > > > > skylake ,sse2 ,4223 ,9 ,5 ,137.3 ,137.2 > > > > skylake ,sse2 ,8199 ,0 ,64 ,284.8 ,285.5 > > > > skylake ,sse2 ,8207 ,0 ,3 ,296.0 ,296.1 > > > > skylake ,sse2 ,8223 ,3 ,0 ,168.0 ,168.2 > > > > skylake ,sse2 ,8255 ,3 ,7 ,293.0 ,292.4 > > > > skylake ,sse2 ,8319 ,9 ,5 ,251.3 ,250.7 > > > > skylake ,sse2 ,16391 ,0 ,64 ,561.3 ,608.3 > > > > skylake ,sse2 ,16399 ,0 ,3 ,571.0 ,574.8 > > > > skylake ,sse2 ,16415 ,3 ,0 ,305.4 ,305.0 > > > > skylake ,sse2 ,16447 ,3 ,7 ,563.2 ,565.0 > > > > skylake ,sse2 ,16511 ,9 ,5 ,477.1 ,475.1 > > > > skylake ,sse2 ,32775 ,0 ,64 ,1128.2 ,1131.7 > > > > skylake ,sse2 ,32783 ,0 ,3 ,1126.6 ,1131.0 > > > > skylake ,sse2 ,32799 ,3 ,0 ,587.6 ,590.8 > > > > skylake ,sse2 ,32831 ,3 ,7 ,1130.6 ,1126.2 > > > > skylake ,sse2 ,32895 ,9 ,5 ,957.6 ,953.0 > > > > skylake ,sse2 ,65543 ,0 ,64 ,2718.9 ,2704.2 > > > > skylake ,sse2 ,65551 ,0 ,3 ,2724.1 ,2725.0 > > > > skylake ,sse2 ,65567 ,3 ,0 ,1888.4 ,1914.3 > > > > skylake ,sse2 ,65599 ,3 ,7 ,2787.6 ,2748.7 > > > > skylake ,sse2 ,65663 ,9 ,5 ,2400.5 ,2369.4 > > > > skylake ,sse2 ,131079 ,0 ,64 ,5603.3 ,5654.9 > > > > skylake ,sse2 ,131087 ,0 ,3 ,5939.3 ,5871.4 > > > > skylake ,sse2 ,131103 ,3 ,0 ,4272.4 ,4190.0 > > > > skylake ,sse2 ,131135 ,3 ,7 ,7601.4 ,7524.6 > > > > skylake ,sse2 ,131199 ,9 ,5 ,7022.1 ,6864.7 > > > > skylake ,sse2 ,262151 ,0 ,64 ,13736.2 ,14030.0 > 
> > > skylake ,sse2 ,262159 ,0 ,3 ,12407.3 ,12334.1 > > > > skylake ,sse2 ,262175 ,3 ,0 ,9661.1 ,9249.4 > > > > skylake ,sse2 ,262207 ,3 ,7 ,12850.2 ,12351.6 > > > > skylake ,sse2 ,262271 ,9 ,5 ,10792.6 ,10435.8 > > > > skylake ,sse2 ,524295 ,0 ,64 ,27754.5 ,28177.7 > > > > skylake ,sse2 ,524303 ,0 ,3 ,27766.2 ,28152.0 > > > > skylake ,sse2 ,524319 ,3 ,0 ,24030.9 ,24438.3 > > > > skylake ,sse2 ,524351 ,3 ,7 ,27787.5 ,27933.0 > > > > skylake ,sse2 ,524415 ,9 ,5 ,24263.2 ,25249.1 > > > > skylake ,sse2 ,1048583 ,0 ,64 ,56199.9 ,56039.8 > > > > skylake ,sse2 ,1048591 ,0 ,3 ,56750.2 ,58889.7 > > > > skylake ,sse2 ,1048607 ,3 ,0 ,56394.0 ,55115.3 > > > > skylake ,sse2 ,1048639 ,3 ,7 ,57233.1 ,57473.8 > > > > skylake ,sse2 ,1048703 ,9 ,5 ,56324.3 ,55917.9 > > > > skylake ,sse2 ,2097159 ,0 ,64 ,113234.8 ,114346.4 > > > > skylake ,sse2 ,2097167 ,0 ,3 ,114373.1 ,115522.5 > > > > skylake ,sse2 ,2097183 ,3 ,0 ,108113.3 ,108513.3 > > > > skylake ,sse2 ,2097215 ,3 ,7 ,116863.6 ,116549.9 > > > > skylake ,sse2 ,2097279 ,9 ,5 ,108945.1 ,108843.7 > > > > skylake ,sse2 ,4194311 ,0 ,64 ,230250.1 ,232350.0 > > > > skylake ,sse2 ,4194319 ,0 ,3 ,231895.3 ,235055.6 > > > > skylake ,sse2 ,4194335 ,3 ,0 ,218442.8 ,219199.8 > > > > skylake ,sse2 ,4194367 ,3 ,7 ,242564.2 ,235587.7 > > > > skylake ,sse2 ,4194431 ,9 ,5 ,224167.4 ,215261.8 > > > > skylake ,sse2 ,8388615 ,0 ,64 ,679801.8 ,674832.0 > > > > skylake ,sse2 ,8388623 ,0 ,3 ,684913.2 ,685238.7 > > > > skylake ,sse2 ,8388639 ,3 ,0 ,644865.4 ,631388.6 > > > > skylake ,sse2 ,8388671 ,3 ,7 ,698700.9 ,689316.1 > > > > skylake ,sse2 ,8388735 ,9 ,5 ,644820.2 ,631366.8 > > > > skylake ,sse2 ,16777223 ,0 ,64 ,1877984.0 ,1876437.0 > > > > skylake ,sse2 ,16777231 ,0 ,3 ,1898086.0 ,1913053.0 > > > > skylake ,sse2 ,16777247 ,3 ,0 ,1857018.0 ,1866949.0 > > > > skylake ,sse2 ,16777279 ,3 ,7 ,1914905.0 ,1897134.0 > > > > skylake ,sse2 ,16777343 ,9 ,5 ,1859937.0 ,1881939.0 > > > > icelake ,avx512 ,4103 ,0 ,64 ,75.2 ,75.8 > > > > icelake ,avx512 ,4111 ,0 
,3 ,56.9 ,56.4 > > > > icelake ,avx512 ,4127 ,3 ,0 ,59.1 ,59.6 > > > > icelake ,avx512 ,4159 ,3 ,7 ,50.7 ,51.3 > > > > icelake ,avx512 ,4223 ,9 ,5 ,59.2 ,58.9 > > > > icelake ,avx512 ,8199 ,0 ,64 ,67.8 ,63.9 > > > > icelake ,avx512 ,8207 ,0 ,3 ,89.0 ,89.9 > > > > icelake ,avx512 ,8223 ,3 ,0 ,90.2 ,90.1 > > > > icelake ,avx512 ,8255 ,3 ,7 ,82.6 ,84.9 > > > > icelake ,avx512 ,8319 ,9 ,5 ,91.5 ,92.8 > > > > icelake ,avx512 ,16391 ,0 ,64 ,118.0 ,117.6 > > > > icelake ,avx512 ,16399 ,0 ,3 ,156.5 ,157.0 > > > > icelake ,avx512 ,16415 ,3 ,0 ,157.4 ,157.3 > > > > icelake ,avx512 ,16447 ,3 ,7 ,151.0 ,151.6 > > > > icelake ,avx512 ,16511 ,9 ,5 ,159.1 ,159.6 > > > > icelake ,avx512 ,32775 ,0 ,64 ,231.8 ,230.8 > > > > icelake ,avx512 ,32783 ,0 ,3 ,297.8 ,299.3 > > > > icelake ,avx512 ,32799 ,3 ,0 ,299.1 ,299.0 > > > > icelake ,avx512 ,32831 ,3 ,7 ,293.5 ,295.4 > > > > icelake ,avx512 ,32895 ,9 ,5 ,300.3 ,302.5 > > > > icelake ,avx512 ,65543 ,0 ,64 ,1473.4 ,1479.2 > > > > icelake ,avx512 ,65551 ,0 ,3 ,1438.2 ,1445.3 > > > > icelake ,avx512 ,65567 ,3 ,0 ,1450.3 ,1463.8 > > > > icelake ,avx512 ,65599 ,3 ,7 ,1469.0 ,1473.8 > > > > icelake ,avx512 ,65663 ,9 ,5 ,1480.0 ,1483.5 > > > > icelake ,avx512 ,131079 ,0 ,64 ,3015.1 ,3037.5 > > > > icelake ,avx512 ,131087 ,0 ,3 ,2952.3 ,2960.4 > > > > icelake ,avx512 ,131103 ,3 ,0 ,2966.2 ,2964.4 > > > > icelake ,avx512 ,131135 ,3 ,7 ,2961.6 ,3047.9 > > > > icelake ,avx512 ,131199 ,9 ,5 ,2967.4 ,3183.8 > > > > icelake ,avx512 ,262151 ,0 ,64 ,6206.0 ,6141.5 > > > > icelake ,avx512 ,262159 ,0 ,3 ,5990.8 ,5959.2 > > > > icelake ,avx512 ,262175 ,3 ,0 ,5976.7 ,5963.8 > > > > icelake ,avx512 ,262207 ,3 ,7 ,5939.5 ,5924.3 > > > > icelake ,avx512 ,262271 ,9 ,5 ,5944.6 ,5990.3 > > > > icelake ,avx512 ,524295 ,0 ,64 ,14726.7 ,14307.0 > > > > icelake ,avx512 ,524303 ,0 ,3 ,14344.2 ,14040.5 > > > > icelake ,avx512 ,524319 ,3 ,0 ,14175.0 ,13862.2 > > > > icelake ,avx512 ,524351 ,3 ,7 ,14261.4 ,13821.5 > > > > icelake ,avx512 ,524415 ,9 ,5 ,14266.5 
,14064.7 > > > > icelake ,avx512 ,1048583 ,0 ,64 ,35211.4 ,35414.6 > > > > icelake ,avx512 ,1048591 ,0 ,3 ,35156.8 ,35591.2 > > > > icelake ,avx512 ,1048607 ,3 ,0 ,35273.1 ,35503.3 > > > > icelake ,avx512 ,1048639 ,3 ,7 ,35255.8 ,35725.0 > > > > icelake ,avx512 ,1048703 ,9 ,5 ,35703.6 ,36289.9 > > > > icelake ,avx512 ,2097159 ,0 ,64 ,72613.9 ,72063.2 > > > > icelake ,avx512 ,2097167 ,0 ,3 ,72301.6 ,73504.2 > > > > icelake ,avx512 ,2097183 ,3 ,0 ,73448.8 ,72133.6 > > > > icelake ,avx512 ,2097215 ,3 ,7 ,73762.9 ,72825.8 > > > > icelake ,avx512 ,2097279 ,9 ,5 ,72097.3 ,72914.6 > > > > icelake ,avx512 ,4194311 ,0 ,64 ,144793.4 ,144182.1 > > > > icelake ,avx512 ,4194319 ,0 ,3 ,143710.3 ,145063.3 > > > > icelake ,avx512 ,4194335 ,3 ,0 ,146722.1 ,144046.4 > > > > icelake ,avx512 ,4194367 ,3 ,7 ,144267.0 ,144874.6 > > > > icelake ,avx512 ,4194431 ,9 ,5 ,143808.2 ,144560.0 > > > > icelake ,avx512 ,8388615 ,0 ,64 ,427993.4 ,424521.5 > > > > icelake ,avx512 ,8388623 ,0 ,3 ,470267.1 ,473290.8 > > > > icelake ,avx512 ,8388639 ,3 ,0 ,457179.7 ,461797.7 > > > > icelake ,avx512 ,8388671 ,3 ,7 ,472507.9 ,481561.4 > > > > icelake ,avx512 ,8388735 ,9 ,5 ,463611.9 ,467388.7 > > > > icelake ,avx512 ,16777223 ,0 ,64 ,1490426.0 ,1526996.0 > > > > icelake ,avx512 ,16777231 ,0 ,3 ,1516687.0 ,1517095.0 > > > > icelake ,avx512 ,16777247 ,3 ,0 ,1497688.0 ,1512766.0 > > > > icelake ,avx512 ,16777279 ,3 ,7 ,1512331.0 ,1524317.0 > > > > icelake ,avx512 ,16777343 ,9 ,5 ,1498908.0 ,1500526.0 > > > > icelake ,avx ,4103 ,0 ,64 ,50.2 ,63.7 > > > > icelake ,avx ,4111 ,0 ,3 ,63.7 ,65.1 > > > > icelake ,avx ,4127 ,3 ,0 ,68.2 ,69.4 > > > > icelake ,avx ,4159 ,3 ,7 ,59.6 ,68.0 > > > > icelake ,avx ,4223 ,9 ,5 ,68.2 ,66.8 > > > > icelake ,avx ,8199 ,0 ,64 ,92.1 ,89.9 > > > > icelake ,avx ,8207 ,0 ,3 ,119.7 ,118.3 > > > > icelake ,avx ,8223 ,3 ,0 ,119.1 ,120.9 > > > > icelake ,avx ,8255 ,3 ,7 ,122.9 ,123.7 > > > > icelake ,avx ,8319 ,9 ,5 ,122.1 ,121.8 > > > > icelake ,avx ,16391 ,0 ,64 ,162.7 ,158.0 > > > 
> icelake ,avx ,16399 ,0 ,3 ,227.6 ,234.1 > > > > icelake ,avx ,16415 ,3 ,0 ,230.8 ,232.7 > > > > icelake ,avx ,16447 ,3 ,7 ,226.8 ,232.6 > > > > icelake ,avx ,16511 ,9 ,5 ,233.4 ,233.8 > > > > icelake ,avx ,32775 ,0 ,64 ,312.2 ,301.8 > > > > icelake ,avx ,32783 ,0 ,3 ,449.7 ,450.0 > > > > icelake ,avx ,32799 ,3 ,0 ,452.7 ,455.9 > > > > icelake ,avx ,32831 ,3 ,7 ,449.8 ,458.0 > > > > icelake ,avx ,32895 ,9 ,5 ,456.3 ,459.4 > > > > icelake ,avx ,65543 ,0 ,64 ,1460.6 ,1463.9 > > > > icelake ,avx ,65551 ,0 ,3 ,1462.0 ,1465.4 > > > > icelake ,avx ,65567 ,3 ,0 ,1466.6 ,1480.4 > > > > icelake ,avx ,65599 ,3 ,7 ,1488.0 ,1488.9 > > > > icelake ,avx ,65663 ,9 ,5 ,1680.8 ,1499.5 > > > > icelake ,avx ,131079 ,0 ,64 ,2988.5 ,3010.1 > > > > icelake ,avx ,131087 ,0 ,3 ,2995.5 ,2996.4 > > > > icelake ,avx ,131103 ,3 ,0 ,3006.2 ,3000.5 > > > > icelake ,avx ,131135 ,3 ,7 ,3032.4 ,3073.7 > > > > icelake ,avx ,131199 ,9 ,5 ,3010.4 ,3027.4 > > > > icelake ,avx ,262151 ,0 ,64 ,6143.2 ,6079.1 > > > > icelake ,avx ,262159 ,0 ,3 ,6085.1 ,6075.8 > > > > icelake ,avx ,262175 ,3 ,0 ,6088.0 ,6064.9 > > > > icelake ,avx ,262207 ,3 ,7 ,6018.7 ,6023.5 > > > > icelake ,avx ,262271 ,9 ,5 ,6019.8 ,5959.2 > > > > icelake ,avx ,524295 ,0 ,64 ,14464.2 ,14095.1 > > > > icelake ,avx ,524303 ,0 ,3 ,14761.6 ,14050.2 > > > > icelake ,avx ,524319 ,3 ,0 ,14534.1 ,14087.5 > > > > icelake ,avx ,524351 ,3 ,7 ,14147.7 ,13903.8 > > > > icelake ,avx ,524415 ,9 ,5 ,14157.0 ,13982.9 > > > > icelake ,avx ,1048583 ,0 ,64 ,36599.0 ,37461.4 > > > > icelake ,avx ,1048591 ,0 ,3 ,36717.8 ,37454.9 > > > > icelake ,avx ,1048607 ,3 ,0 ,36821.2 ,37343.3 > > > > icelake ,avx ,1048639 ,3 ,7 ,36958.0 ,37507.2 > > > > icelake ,avx ,1048703 ,9 ,5 ,36869.2 ,37413.1 > > > > icelake ,avx ,2097159 ,0 ,64 ,74765.8 ,75330.9 > > > > icelake ,avx ,2097167 ,0 ,3 ,75175.4 ,74891.9 > > > > icelake ,avx ,2097183 ,3 ,0 ,75451.4 ,74787.7 > > > > icelake ,avx ,2097215 ,3 ,7 ,75394.8 ,75839.1 > > > > icelake ,avx ,2097279 ,9 ,5 ,75099.2 ,75421.2 > 
> > > icelake ,avx ,4194311 ,0 ,64 ,146809.6 ,146619.4 > > > > icelake ,avx ,4194319 ,0 ,3 ,148866.4 ,149898.2 > > > > icelake ,avx ,4194335 ,3 ,0 ,148719.7 ,150165.4 > > > > icelake ,avx ,4194367 ,3 ,7 ,150600.1 ,150925.9 > > > > icelake ,avx ,4194431 ,9 ,5 ,149457.3 ,150519.2 > > > > icelake ,avx ,8388615 ,0 ,64 ,412709.8 ,423666.1 > > > > icelake ,avx ,8388623 ,0 ,3 ,423717.4 ,424418.2 > > > > icelake ,avx ,8388639 ,3 ,0 ,414387.5 ,413445.6 > > > > icelake ,avx ,8388671 ,3 ,7 ,449010.7 ,417553.5 > > > > icelake ,avx ,8388735 ,9 ,5 ,414128.6 ,411815.3 > > > > icelake ,avx ,16777223 ,0 ,64 ,1490032.0 ,1510004.0 > > > > icelake ,avx ,16777231 ,0 ,3 ,1379638.0 ,1422097.0 > > > > icelake ,avx ,16777247 ,3 ,0 ,1418930.0 ,1367557.0 > > > > icelake ,avx ,16777279 ,3 ,7 ,1515152.0 ,1500176.0 > > > > icelake ,avx ,16777343 ,9 ,5 ,1344117.0 ,1411795.0 > > > > icelake ,sse2 ,4103 ,0 ,64 ,113.2 ,114.6 > > > > icelake ,sse2 ,4111 ,0 ,3 ,121.5 ,120.4 > > > > icelake ,sse2 ,4127 ,3 ,0 ,1700.5 ,1771.5 > > > > icelake ,sse2 ,4159 ,3 ,7 ,119.3 ,118.8 > > > > icelake ,sse2 ,4223 ,9 ,5 ,1739.7 ,1735.2 > > > > icelake ,sse2 ,8199 ,0 ,64 ,207.0 ,203.9 > > > > icelake ,sse2 ,8207 ,0 ,3 ,225.5 ,220.8 > > > > icelake ,sse2 ,8223 ,3 ,0 ,3444.3 ,3743.5 > > > > icelake ,sse2 ,8255 ,3 ,7 ,219.9 ,216.8 > > > > icelake ,sse2 ,8319 ,9 ,5 ,4117.1 ,3487.3 > > > > icelake ,sse2 ,16391 ,0 ,64 ,397.1 ,394.3 > > > > icelake ,sse2 ,16399 ,0 ,3 ,439.6 ,428.6 > > > > icelake ,sse2 ,16415 ,3 ,0 ,6997.0 ,7031.2 > > > > icelake ,sse2 ,16447 ,3 ,7 ,426.8 ,421.8 > > > > icelake ,sse2 ,16511 ,9 ,5 ,7037.6 ,7038.3 > > > > icelake ,sse2 ,32775 ,0 ,64 ,790.9 ,779.0 > > > > icelake ,sse2 ,32783 ,0 ,3 ,863.1 ,849.6 > > > > icelake ,sse2 ,32799 ,3 ,0 ,14043.0 ,14390.9 > > > > icelake ,sse2 ,32831 ,3 ,7 ,841.6 ,833.1 > > > > icelake ,sse2 ,32895 ,9 ,5 ,14277.6 ,14344.2 > > > > icelake ,sse2 ,65543 ,0 ,64 ,1897.0 ,1897.3 > > > > icelake ,sse2 ,65551 ,0 ,3 ,1927.1 ,1955.4 > > > > icelake ,sse2 ,65567 ,3 ,0 ,28834.7 
,28727.8 > > > > icelake ,sse2 ,65599 ,3 ,7 ,1961.4 ,1969.7 > > > > icelake ,sse2 ,65663 ,9 ,5 ,28867.6 ,29019.8 > > > > icelake ,sse2 ,131079 ,0 ,64 ,3879.3 ,3872.6 > > > > icelake ,sse2 ,131087 ,0 ,3 ,3955.3 ,3990.7 > > > > icelake ,sse2 ,131103 ,3 ,0 ,58001.8 ,60567.9 > > > > icelake ,sse2 ,131135 ,3 ,7 ,3951.5 ,4002.6 > > > > icelake ,sse2 ,131199 ,9 ,5 ,57886.7 ,58391.4 > > > > icelake ,sse2 ,262151 ,0 ,64 ,7851.4 ,7894.7 > > > > icelake ,sse2 ,262159 ,0 ,3 ,7947.5 ,8016.2 > > > > icelake ,sse2 ,262175 ,3 ,0 ,115036.2 ,115968.6 > > > > icelake ,sse2 ,262207 ,3 ,7 ,7883.9 ,7814.1 > > > > icelake ,sse2 ,262271 ,9 ,5 ,113776.4 ,119733.6 > > > > icelake ,sse2 ,524295 ,0 ,64 ,17198.1 ,16974.9 > > > > icelake ,sse2 ,524303 ,0 ,3 ,17402.2 ,17096.3 > > > > icelake ,sse2 ,524319 ,3 ,0 ,223980.4 ,225889.9 > > > > icelake ,sse2 ,524351 ,3 ,7 ,17034.9 ,16910.3 > > > > icelake ,sse2 ,524415 ,9 ,5 ,224027.7 ,224962.5 > > > > icelake ,sse2 ,1048583 ,0 ,64 ,38822.3 ,39178.6 > > > > icelake ,sse2 ,1048591 ,0 ,3 ,41686.7 ,40247.4 > > > > icelake ,sse2 ,1048607 ,3 ,0 ,38814.8 ,39323.3 > > > > icelake ,sse2 ,1048639 ,3 ,7 ,39568.3 ,41325.7 > > > > icelake ,sse2 ,1048703 ,9 ,5 ,39354.2 ,39637.9 > > > > icelake ,sse2 ,2097159 ,0 ,64 ,84074.7 ,84543.1 > > > > icelake ,sse2 ,2097167 ,0 ,3 ,83665.7 ,82358.2 > > > > icelake ,sse2 ,2097183 ,3 ,0 ,81817.8 ,79638.9 > > > > icelake ,sse2 ,2097215 ,3 ,7 ,83649.1 ,83497.6 > > > > icelake ,sse2 ,2097279 ,9 ,5 ,80287.6 ,79980.9 > > > > icelake ,sse2 ,4194311 ,0 ,64 ,165409.8 ,168343.1 > > > > icelake ,sse2 ,4194319 ,0 ,3 ,165216.7 ,177632.0 > > > > icelake ,sse2 ,4194335 ,3 ,0 ,158718.7 ,160342.2 > > > > icelake ,sse2 ,4194367 ,3 ,7 ,167944.9 ,167204.4 > > > > icelake ,sse2 ,4194431 ,9 ,5 ,161530.1 ,164839.7 > > > > icelake ,sse2 ,8388615 ,0 ,64 ,626504.3 ,629858.5 > > > > icelake ,sse2 ,8388623 ,0 ,3 ,623969.5 ,631509.1 > > > > icelake ,sse2 ,8388639 ,3 ,0 ,599366.7 ,600016.0 > > > > icelake ,sse2 ,8388671 ,3 ,7 ,619964.2 ,619113.2 > > > > 
icelake ,sse2 ,8388735 ,9 ,5 ,595338.1 ,604172.4 > > > > icelake ,sse2 ,16777223 ,0 ,64 ,1709597.0 ,1725184.0 > > > > icelake ,sse2 ,16777231 ,0 ,3 ,1725452.0 ,1719746.0 > > > > icelake ,sse2 ,16777247 ,3 ,0 ,1614269.0 ,1607164.0 > > > > icelake ,sse2 ,16777279 ,3 ,7 ,1705295.0 ,1733018.0 > > > > icelake ,sse2 ,16777343 ,9 ,5 ,1604197.0 ,1595690.0 > > > > > > > > > > I am having a hard time convincing myself that this patch is really necessary. > > > What are the geomeans of all the different cases for each processor? > > > > N = 100, Geometric mean of Current vs New for memcpy-bench-large. Note the > > bench-memmove-large numbers should be unaffected by this patch as the new > > logic only applies to the no overlap case. > > > > cpu ,inst ,Len ,align1 ,align2 ,new > > geomean ,cur geomean ,New/Cur > > icelake ,sse2 ,65543 ,0 ,0 > > ,5566.1 ,5564.7 ,1.0 > > icelake ,sse2 ,65551 ,0 ,3 > > ,5856.4 ,5725.7 ,1.02 > > icelake ,sse2 ,65567 ,3 ,0 > > ,5622.8 ,5892.9 ,0.95 > > icelake ,sse2 ,65599 ,3 ,5 > > ,5857.3 ,5723.8 ,1.02 > > icelake ,sse2 ,65536 ,0 ,127 > > ,5953.3 ,5831.1 ,1.02 > > icelake ,sse2 ,65536 ,0 ,255 > > ,5811.7 ,5789.5 ,1.0 > > icelake ,sse2 ,65536 ,0 ,256 > > ,5373.5 ,5284.1 ,1.02 > > icelake ,sse2 ,65536 ,0 ,4064 > > ,5820.1 ,5761.6 ,1.01 > > icelake ,sse2 ,131079 ,0 ,0 > > ,12421.5 ,12424.1 ,1.0 > > icelake ,sse2 ,131087 ,0 ,3 > > ,12389.5 ,12276.4 ,1.01 > > icelake ,sse2 ,131103 ,3 ,0 > > ,11587.0 ,12607.6 ,0.92 > > icelake ,sse2 ,131135 ,3 ,5 > > ,11596.9 ,11896.2 ,0.97 > > icelake ,sse2 ,131072 ,0 ,127 > > ,11746.4 ,12490.1 ,0.94 > > icelake ,sse2 ,131072 ,0 ,255 > > ,11486.8 ,11831.7 ,0.97 > > icelake ,sse2 ,131072 ,0 ,256 > > ,10453.5 ,10451.7 ,1.0 > > icelake ,sse2 ,131072 ,0 ,4064 > > ,11231.7 ,11223.6 ,1.0 > > icelake ,sse2 ,262151 ,0 ,0 > > ,29408.5 ,30831.2 ,0.95 > > icelake ,sse2 ,262159 ,0 ,3 > > ,30813.6 ,32235.6 ,0.96 > > icelake ,sse2 ,262175 ,3 ,0 > > ,30245.0 ,31392.5 ,0.96 > > icelake ,sse2 ,262207 ,3 ,5 > > ,30775.6 ,32298.6 ,0.95 > > 
icelake ,sse2 ,262144 ,0 ,127 > > ,31784.7 ,32791.5 ,0.97 > > icelake ,sse2 ,262144 ,0 ,255 > > ,30726.0 ,31997.5 ,0.96 > > icelake ,sse2 ,262144 ,0 ,256 > > ,28418.9 ,29440.9 ,0.97 > > icelake ,sse2 ,262144 ,0 ,4064 > > ,29984.1 ,31048.9 ,0.97 > > icelake ,sse2 ,524295 ,0 ,0 > > ,76079.0 ,75752.0 ,1.0 > > icelake ,sse2 ,524303 ,0 ,3 > > ,79939.3 ,80796.4 ,0.99 > > icelake ,sse2 ,524319 ,3 ,0 > > ,79018.1 ,79928.5 ,0.99 > > icelake ,sse2 ,524351 ,3 ,5 > > ,81219.4 ,81053.8 ,1.0 > > icelake ,sse2 ,524288 ,0 ,127 > > ,80111.8 ,80087.2 ,1.0 > > icelake ,sse2 ,524288 ,0 ,255 > > ,79334.0 ,79525.6 ,1.0 > > icelake ,sse2 ,524288 ,0 ,256 > > ,75766.9 ,75918.9 ,1.0 > > icelake ,sse2 ,524288 ,0 ,4064 > > ,78907.9 ,79550.8 ,0.99 > > icelake ,sse2 ,1048583 ,0 ,0 > > ,144672.6 ,147457.7 ,0.98 > > icelake ,sse2 ,1048591 ,0 ,3 > > ,173803.9 ,400563.2 ,0.43 > > icelake ,sse2 ,1048607 ,3 ,0 > > ,149391.9 ,151772.1 ,0.98 > > icelake ,sse2 ,1048639 ,3 ,5 > > ,174774.1 ,400657.4 ,0.44 > > icelake ,sse2 ,1048576 ,0 ,127 > > ,175350.9 ,347110.6 ,0.51 > > icelake ,sse2 ,1048576 ,0 ,255 > > ,150152.6 ,144242.9 ,1.04 > > icelake ,sse2 ,1048576 ,0 ,256 > > ,145869.7 ,147489.6 ,0.99 > > icelake ,sse2 ,1048576 ,0 ,4064 > > ,145814.7 ,147497.7 ,0.99 > > icelake ,sse2 ,2097159 ,0 ,0 > > ,289460.6 ,295574.6 ,0.98 > > icelake ,sse2 ,2097167 ,0 ,3 > > ,347057.0 ,799549.1 ,0.43 > > icelake ,sse2 ,2097183 ,3 ,0 > > ,298565.7 ,301424.3 ,0.99 > > icelake ,sse2 ,2097215 ,3 ,5 > > ,348620.4 ,797557.4 ,0.44 > > icelake ,sse2 ,2097152 ,0 ,127 > > ,348751.4 ,695260.9 ,0.5 > > icelake ,sse2 ,2097152 ,0 ,255 > > ,298960.5 ,286590.0 ,1.04 > > icelake ,sse2 ,2097152 ,0 ,256 > > ,290978.4 ,293225.6 ,0.99 > > icelake ,sse2 ,2097152 ,0 ,4064 > > ,290476.0 ,292283.2 ,0.99 > > icelake ,sse2 ,4194311 ,0 ,0 > > ,583386.3 ,588284.3 ,0.99 > > icelake ,sse2 ,4194319 ,0 ,3 > > ,703870.5 ,1595268.0 ,0.44 > > icelake ,sse2 ,4194335 ,3 ,0 > > ,599400.2 ,601591.6 ,1.0 > > icelake ,sse2 ,4194367 ,3 ,5 > > ,694569.7 
,1595608.0 ,0.44 > > icelake ,sse2 ,4194304 ,0 ,127 > > ,700229.1 ,1389061.9 ,0.5 > > icelake ,sse2 ,4194304 ,0 ,255 > > ,600779.0 ,573361.2 ,1.05 > > icelake ,sse2 ,4194304 ,0 ,256 > > ,586610.7 ,589269.6 ,1.0 > > icelake ,sse2 ,4194304 ,0 ,4064 > > ,583616.3 ,584806.4 ,1.0 > > icelake ,sse2 ,8388615 ,0 ,0 > > ,1214632.8 ,1266616.0 ,0.96 > > icelake ,sse2 ,8388623 ,0 ,3 > > ,1405136.9 ,3198827.1 ,0.44 > > icelake ,sse2 ,8388639 ,3 ,0 > > ,1244302.6 ,1297425.9 ,0.96 > > icelake ,sse2 ,8388671 ,3 ,5 > > ,1404685.1 ,3196389.9 ,0.44 > > icelake ,sse2 ,8388608 ,0 ,127 > > ,1419888.5 ,2792729.4 ,0.51 > > icelake ,sse2 ,8388608 ,0 ,255 > > ,1249044.6 ,1259726.7 ,0.99 > > icelake ,sse2 ,8388608 ,0 ,256 > > ,1234471.9 ,1300463.6 ,0.95 > > icelake ,sse2 ,8388608 ,0 ,4064 > > ,1220102.2 ,1265190.5 ,0.96 > > icelake ,sse2 ,16777223 ,0 ,0 > > ,2689516.3 ,2846521.1 ,0.94 > > icelake ,sse2 ,16777231 ,0 ,3 > > ,3001317.4 ,6428733.7 ,0.47 > > icelake ,sse2 ,16777247 ,3 ,0 > > ,2770040.8 ,2910434.9 ,0.95 > > icelake ,sse2 ,16777279 ,3 ,5 > > ,3002076.1 ,6415835.9 ,0.47 > > icelake ,sse2 ,16777216 ,0 ,127 > > ,3063786.3 ,5609895.3 ,0.55 > > icelake ,sse2 ,16777216 ,0 ,255 > > ,2821606.1 ,2833843.6 ,1.0 > > icelake ,sse2 ,16777216 ,0 ,256 > > ,2719765.5 ,2925344.2 ,0.93 > > icelake ,sse2 ,16777216 ,0 ,4064 > > ,2686189.2 ,2848017.5 ,0.94 > > icelake ,sse2 ,33554439 ,0 ,0 > > ,5577945.0 ,5913674.6 ,0.94 > > icelake ,sse2 ,33554447 ,0 ,3 > > ,6152758.8 ,12863855.0 ,0.48 > > icelake ,sse2 ,33554463 ,3 ,0 > > ,5773351.4 ,6035289.3 ,0.96 > > icelake ,sse2 ,33554495 ,3 ,5 > > ,6160006.2 ,12878153.9 ,0.48 > > icelake ,sse2 ,33554432 ,0 ,127 > > ,6303495.4 ,11221070.2 ,0.56 > > icelake ,sse2 ,33554432 ,0 ,255 > > ,5830879.6 ,5944978.6 ,0.98 > > icelake ,sse2 ,33554432 ,0 ,256 > > ,5611968.2 ,6068255.4 ,0.92 > > icelake ,sse2 ,33554432 ,0 ,4064 > > ,5570321.0 ,5964542.6 ,0.93 icelake ,avx ,65543 > > ,0 ,0 ,5561.1 ,5659.7 ,0.98 > > icelake ,avx ,65551 ,0 ,3 > > ,5859.9 ,5724.8 ,1.02 > > 
icelake ,avx ,65567 ,3 ,0 > > ,5636.7 ,5623.3 ,1.0 > > icelake ,avx ,65599 ,3 ,5 > > ,5856.3 ,5720.2 ,1.02 > > icelake ,avx ,65536 ,0 ,127 > > ,6011.1 ,5910.0 ,1.02 > > icelake ,avx ,65536 ,0 ,255 > > ,5854.5 ,5792.3 ,1.01 > > icelake ,avx ,65536 ,0 ,256 > > ,5213.0 ,5273.9 ,0.99 > > icelake ,avx ,65536 ,0 ,4064 > > ,5760.7 ,5661.1 ,1.02 > > icelake ,avx ,131079 ,0 ,0 > > ,12371.4 ,12707.0 ,0.97 > > icelake ,avx ,131087 ,0 ,3 > > ,13220.1 ,12515.7 ,1.06 > > icelake ,avx ,131103 ,3 ,0 > > ,11628.2 ,11546.9 ,1.01 > > icelake ,avx ,131135 ,3 ,5 > > ,13025.7 ,13967.6 ,0.93 > > icelake ,avx ,131072 ,0 ,127 > > ,11781.7 ,11936.4 ,0.99 > > icelake ,avx ,131072 ,0 ,255 > > ,11802.2 ,11583.9 ,1.02 > > icelake ,avx ,131072 ,0 ,256 > > ,10436.9 ,10693.1 ,0.98 > > icelake ,avx ,131072 ,0 ,4064 > > ,11880.9 ,11395.6 ,1.04 > > icelake ,avx ,262151 ,0 ,0 > > ,29132.6 ,30542.8 ,0.95 > > icelake ,avx ,262159 ,0 ,3 > > ,30533.5 ,31468.8 ,0.97 > > icelake ,avx ,262175 ,3 ,0 > > ,29879.5 ,30933.7 ,0.97 > > icelake ,avx ,262207 ,3 ,5 > > ,30263.1 ,31445.0 ,0.96 > > icelake ,avx ,262144 ,0 ,127 > > ,30180.9 ,31405.3 ,0.96 > > icelake ,avx ,262144 ,0 ,255 > > ,30152.9 ,31372.5 ,0.96 > > icelake ,avx ,262144 ,0 ,256 > > ,28121.9 ,28990.9 ,0.97 > > icelake ,avx ,262144 ,0 ,4064 > > ,29785.2 ,31078.4 ,0.96 > > icelake ,avx ,524295 ,0 ,0 > > ,76045.7 ,75824.3 ,1.0 > > icelake ,avx ,524303 ,0 ,3 > > ,79303.7 ,80433.3 ,0.99 > > icelake ,avx ,524319 ,3 ,0 > > ,79323.8 ,79411.3 ,1.0 > > icelake ,avx ,524351 ,3 ,5 > > ,79797.9 ,80179.4 ,1.0 > > icelake ,avx ,524288 ,0 ,127 > > ,80046.7 ,80254.1 ,1.0 > > icelake ,avx ,524288 ,0 ,255 > > ,78580.6 ,79210.4 ,0.99 > > icelake ,avx ,524288 ,0 ,256 > > ,75464.4 ,75184.2 ,1.0 > > icelake ,avx ,524288 ,0 ,4064 > > ,78863.6 ,78677.9 ,1.0 > > icelake ,avx ,1048583 ,0 ,0 > > ,131017.9 ,133962.4 ,0.98 > > icelake ,avx ,1048591 ,0 ,3 > > ,143451.3 ,210311.7 ,0.68 > > icelake ,avx ,1048607 ,3 ,0 > > ,136944.0 ,138426.4 ,0.99 > > icelake ,avx ,1048639 ,3 ,5 > > 
,143594.3 ,209887.9 ,0.68 > > icelake ,avx ,1048576 ,0 ,127 > > ,156462.0 ,218873.2 ,0.71 > > icelake ,avx ,1048576 ,0 ,255 > > ,148026.3 ,179419.0 ,0.83 > > icelake ,avx ,1048576 ,0 ,256 > > ,143365.7 ,137816.3 ,1.04 > > icelake ,avx ,1048576 ,0 ,4064 > > ,131683.4 ,132731.6 ,0.99 > > icelake ,avx ,2097159 ,0 ,0 > > ,263807.1 ,267984.5 ,0.98 > > icelake ,avx ,2097167 ,0 ,3 > > ,286949.8 ,422279.2 ,0.68 > > icelake ,avx ,2097183 ,3 ,0 > > ,274675.6 ,276702.2 ,0.99 > > icelake ,avx ,2097215 ,3 ,5 > > ,286681.7 ,420176.7 ,0.68 > > icelake ,avx ,2097152 ,0 ,127 > > ,314499.2 ,437864.2 ,0.72 > > icelake ,avx ,2097152 ,0 ,255 > > ,297458.4 ,359520.9 ,0.83 > > icelake ,avx ,2097152 ,0 ,256 > > ,285883.2 ,276043.2 ,1.04 > > icelake ,avx ,2097152 ,0 ,4064 > > ,263436.6 ,265516.6 ,0.99 > > icelake ,avx ,4194311 ,0 ,0 > > ,529119.4 ,536745.2 ,0.99 > > icelake ,avx ,4194319 ,0 ,3 > > ,573960.0 ,839002.3 ,0.68 > > icelake ,avx ,4194335 ,3 ,0 > > ,550617.2 ,553117.5 ,1.0 > > icelake ,avx ,4194367 ,3 ,5 > > ,572742.8 ,838784.5 ,0.68 > > icelake ,avx ,4194304 ,0 ,127 > > ,629413.6 ,876512.1 ,0.72 > > icelake ,avx ,4194304 ,0 ,255 > > ,594224.1 ,717425.1 ,0.83 > > icelake ,avx ,4194304 ,0 ,256 > > ,573365.0 ,552538.3 ,1.04 > > icelake ,avx ,4194304 ,0 ,4064 > > ,527459.3 ,531907.1 ,0.99 > > icelake ,avx ,8388615 ,0 ,0 > > ,1094256.8 ,1145619.9 ,0.96 > > icelake ,avx ,8388623 ,0 ,3 > > ,1170367.1 ,1700076.4 ,0.69 > > icelake ,avx ,8388639 ,3 ,0 > > ,1136168.1 ,1174752.4 ,0.97 > > icelake ,avx ,8388671 ,3 ,5 > > ,1172015.6 ,1703032.8 ,0.69 > > icelake ,avx ,8388608 ,0 ,127 > > ,1276748.6 ,1771351.9 ,0.72 > > icelake ,avx ,8388608 ,0 ,255 > > ,1207712.0 ,1449267.0 ,0.83 > > icelake ,avx ,8388608 ,0 ,256 > > ,1167958.9 ,1178243.1 ,0.99 > > icelake ,avx ,8388608 ,0 ,4064 > > ,1106155.9 ,1145128.6 ,0.97 > > icelake ,avx ,16777223 ,0 ,0 > > ,2479317.5 ,2630301.0 ,0.94 > > icelake ,avx ,16777231 ,0 ,3 > > ,2643303.6 ,3536980.7 ,0.75 > > icelake ,avx ,16777247 ,3 ,0 > > ,2571967.0 
,2672246.4 ,0.96 > > icelake ,avx ,16777279 ,3 ,5 > > ,2641320.5 ,3538388.9 ,0.75 > > icelake ,avx ,16777216 ,0 ,127 > > ,2832921.6 ,3593702.5 ,0.79 > > icelake ,avx ,16777216 ,0 ,255 > > ,2700272.1 ,3025346.1 ,0.89 > > icelake ,avx ,16777216 ,0 ,256 > > ,2622133.7 ,2709087.6 ,0.97 > > icelake ,avx ,16777216 ,0 ,4064 > > ,2475020.7 ,2610977.8 ,0.95 > > icelake ,avx ,33554439 ,0 ,0 > > ,5190103.1 ,5576047.9 ,0.93 > > icelake ,avx ,33554447 ,0 ,3 > > ,5477752.1 ,7215479.2 ,0.76 > > icelake ,avx ,33554463 ,3 ,0 > > ,5338711.7 ,5625026.7 ,0.95 > > icelake ,avx ,33554495 ,3 ,5 > > ,5505164.8 ,7223660.8 ,0.76 > > icelake ,avx ,33554432 ,0 ,127 > > ,5859232.3 ,7279581.9 ,0.8 > > icelake ,avx ,33554432 ,0 ,255 > > ,5681634.7 ,6156488.6 ,0.92 > > icelake ,avx ,33554432 ,0 ,256 > > ,5440721.4 ,5728347.4 ,0.95 > > icelake ,avx ,33554432 ,0 ,4064 > > ,5191213.2 ,5538716.4 ,0.94 > > icelake ,avx512 ,65543 ,0 ,0 > > ,5563.5 ,5634.1 ,0.99 > > icelake ,avx512 ,65551 ,0 ,3 > > ,5864.1 ,5728.4 ,1.02 > > icelake ,avx512 ,65567 ,3 ,0 > > ,5720.2 ,5625.3 ,1.02 > > icelake ,avx512 ,65599 ,3 ,5 > > ,5857.2 ,5722.0 ,1.02 > > icelake ,avx512 ,65536 ,0 ,127 > > ,6040.7 ,5844.0 ,1.03 > > icelake ,avx512 ,65536 ,0 ,255 > > ,5826.5 ,5799.6 ,1.0 > > icelake ,avx512 ,65536 ,0 ,256 > > ,5234.4 ,5230.0 ,1.0 > > icelake ,avx512 ,65536 ,0 ,4064 > > ,5800.7 ,5655.4 ,1.03 > > icelake ,avx512 ,131079 ,0 ,0 > > ,12591.4 ,11767.1 ,1.07 > > icelake ,avx512 ,131087 ,0 ,3 > > ,12694.9 ,12292.1 ,1.03 > > icelake ,avx512 ,131103 ,3 ,0 > > ,11374.7 ,12236.3 ,0.93 > > icelake ,avx512 ,131135 ,3 ,5 > > ,11958.2 ,11745.5 ,1.02 > > icelake ,avx512 ,131072 ,0 ,127 > > ,11803.4 ,11908.6 ,0.99 > > icelake ,avx512 ,131072 ,0 ,255 > > ,11569.0 ,11487.9 ,1.01 > > icelake ,avx512 ,131072 ,0 ,256 > > ,11087.6 ,10456.4 ,1.06 > > icelake ,avx512 ,131072 ,0 ,4064 > > ,11166.0 ,11248.2 ,0.99 > > icelake ,avx512 ,262151 ,0 ,0 > > ,30232.1 ,29932.7 ,1.01 > > icelake ,avx512 ,262159 ,0 ,3 > > ,30093.8 ,31315.1 ,0.96 > > icelake 
,avx512 ,262175 ,3 ,0 > > ,30147.7 ,30643.4 ,0.98 > > icelake ,avx512 ,262207 ,3 ,5 > > ,29985.9 ,31479.8 ,0.95 > > icelake ,avx512 ,262144 ,0 ,127 > > ,30099.7 ,31552.9 ,0.95 > > icelake ,avx512 ,262144 ,0 ,255 > > ,29772.8 ,30698.1 ,0.97 > > icelake ,avx512 ,262144 ,0 ,256 > > ,28109.3 ,28957.9 ,0.97 > > icelake ,avx512 ,262144 ,0 ,4064 > > ,29787.5 ,30637.2 ,0.97 > > icelake ,avx512 ,524295 ,0 ,0 > > ,75920.7 ,75047.1 ,1.01 > > icelake ,avx512 ,524303 ,0 ,3 > > ,79218.6 ,79529.2 ,1.0 > > icelake ,avx512 ,524319 ,3 ,0 > > ,78446.9 ,78550.7 ,1.0 > > icelake ,avx512 ,524351 ,3 ,5 > > ,79055.0 ,79425.2 ,1.0 > > icelake ,avx512 ,524288 ,0 ,127 > > ,79070.6 ,79626.7 ,0.99 > > icelake ,avx512 ,524288 ,0 ,255 > > ,77891.8 ,78078.3 ,1.0 > > icelake ,avx512 ,524288 ,0 ,256 > > ,74797.3 ,74436.9 ,1.0 > > icelake ,avx512 ,524288 ,0 ,4064 > > ,78339.3 ,78337.2 ,1.0 > > icelake ,avx512 ,1048583 ,0 ,0 > > ,131427.6 ,133891.3 ,0.98 > > icelake ,avx512 ,1048591 ,0 ,3 > > ,143984.1 ,142003.7 ,1.01 > > icelake ,avx512 ,1048607 ,3 ,0 > > ,137547.9 ,134450.1 ,1.02 > > icelake ,avx512 ,1048639 ,3 ,5 > > ,144630.4 ,142174.6 ,1.02 > > icelake ,avx512 ,1048576 ,0 ,127 > > ,149810.7 ,142684.9 ,1.05 > > icelake ,avx512 ,1048576 ,0 ,255 > > ,156212.6 ,143509.2 ,1.09 > > icelake ,avx512 ,1048576 ,0 ,256 > > ,153776.9 ,139788.0 ,1.1 > > icelake ,avx512 ,1048576 ,0 ,4064 > > ,137926.6 ,134832.8 ,1.02 > > icelake ,avx512 ,2097159 ,0 ,0 > > ,263465.3 ,267681.6 ,0.98 > > icelake ,avx512 ,2097167 ,0 ,3 > > ,288947.7 ,284129.9 ,1.02 > > icelake ,avx512 ,2097183 ,3 ,0 > > ,275395.5 ,269216.0 ,1.02 > > icelake ,avx512 ,2097215 ,3 ,5 > > ,289131.5 ,284475.3 ,1.02 > > icelake ,avx512 ,2097152 ,0 ,127 > > ,299404.5 ,286193.2 ,1.05 > > icelake ,avx512 ,2097152 ,0 ,255 > > ,312913.2 ,286785.6 ,1.09 > > icelake ,avx512 ,2097152 ,0 ,256 > > ,307882.7 ,279708.7 ,1.1 > > icelake ,avx512 ,2097152 ,0 ,4064 > > ,275552.3 ,269867.0 ,1.02 > > icelake ,avx512 ,4194311 ,0 ,0 > > ,526480.1 ,536038.9 ,0.98 > > 
icelake ,avx512 ,4194319 ,0 ,3 > > ,579122.9 ,569512.5 ,1.02 > > icelake ,avx512 ,4194335 ,3 ,0 > > ,551658.1 ,542973.3 ,1.02 > > icelake ,avx512 ,4194367 ,3 ,5 > > ,578575.2 ,569497.2 ,1.02 > > icelake ,avx512 ,4194304 ,0 ,127 > > ,599943.6 ,569138.2 ,1.05 > > icelake ,avx512 ,4194304 ,0 ,255 > > ,628419.2 ,575908.4 ,1.09 > > icelake ,avx512 ,4194304 ,0 ,256 > > ,617242.8 ,561417.7 ,1.1 > > icelake ,avx512 ,4194304 ,0 ,4064 > > ,552012.3 ,540617.2 ,1.02 > > icelake ,avx512 ,8388615 ,0 ,0 > > ,1092471.4 ,1133834.9 ,0.96 > > icelake ,avx512 ,8388623 ,0 ,3 > > ,1185623.5 ,1218150.0 ,0.97 > > icelake ,avx512 ,8388639 ,3 ,0 > > ,1142647.1 ,1139201.6 ,1.0 > > icelake ,avx512 ,8388671 ,3 ,5 > > ,1183702.5 ,1225474.6 ,0.97 > > icelake ,avx512 ,8388608 ,0 ,127 > > ,1231862.8 ,1221685.1 ,1.01 > > icelake ,avx512 ,8388608 ,0 ,255 > > ,1290816.7 ,1221576.2 ,1.06 > > icelake ,avx512 ,8388608 ,0 ,256 > > ,1299047.6 ,1195021.2 ,1.09 > > icelake ,avx512 ,8388608 ,0 ,4064 > > ,1139648.9 ,1140113.0 ,1.0 > > icelake ,avx512 ,16777223 ,0 ,0 > > ,2464861.2 ,2599120.4 ,0.95 > > icelake ,avx512 ,16777231 ,0 ,3 > > ,2651029.7 ,2758867.1 ,0.96 > > icelake ,avx512 ,16777247 ,3 ,0 > > ,2570099.8 ,2601099.4 ,0.99 > > icelake ,avx512 ,16777279 ,3 ,5 > > ,2660529.4 ,2762598.6 ,0.96 > > icelake ,avx512 ,16777216 ,0 ,127 > > ,2759531.7 ,2756811.1 ,1.0 > > icelake ,avx512 ,16777216 ,0 ,255 > > ,2878568.5 ,2777650.3 ,1.04 > > icelake ,avx512 ,16777216 ,0 ,256 > > ,2931879.3 ,2709687.7 ,1.08 > > icelake ,avx512 ,16777216 ,0 ,4064 > > ,2587161.1 ,2632011.2 ,0.98 > > icelake ,avx512 ,33554439 ,0 ,0 > > ,5175406.0 ,5528857.2 ,0.94 > > icelake ,avx512 ,33554447 ,0 ,3 > > ,5537561.9 ,5818119.1 ,0.95 > > icelake ,avx512 ,33554463 ,3 ,0 > > ,5435099.5 ,5560442.2 ,0.98 > > icelake ,avx512 ,33554495 ,3 ,5 > > ,5546314.9 ,5800995.0 ,0.96 > > icelake ,avx512 ,33554432 ,0 ,127 > > ,5770248.0 ,5781104.9 ,1.0 > > icelake ,avx512 ,33554432 ,0 ,255 > > ,6019120.7 ,5836023.3 ,1.03 > > icelake ,avx512 ,33554432 ,0 
,256 > > ,6107033.4 ,5681798.8 ,1.07 > > icelake ,avx512 ,33554432 ,0 ,4064 > > ,5356238.5 ,5598521.5 ,0.96 > > skylake ,sse2 ,65543 ,0 ,0 > > ,3091.4 ,2940.2 ,1.05 > > skylake ,sse2 ,65551 ,0 ,3 > > ,3682.6 ,3403.7 ,1.08 > > skylake ,sse2 ,65567 ,3 ,0 > > ,3031.3 ,3070.2 ,0.99 > > skylake ,sse2 ,65599 ,3 ,5 > > ,3731.2 ,3718.7 ,1.0 > > skylake ,sse2 ,65536 ,0 ,127 > > ,3642.3 ,3390.5 ,1.07 > > skylake ,sse2 ,65536 ,0 ,255 > > ,3493.9 ,3333.0 ,1.05 > > skylake ,sse2 ,65536 ,0 ,256 > > ,3043.2 ,2981.0 ,1.02 > > skylake ,sse2 ,65536 ,0 ,4064 > > ,2796.6 ,2843.9 ,0.98 > > skylake ,sse2 ,131079 ,0 ,0 > > ,6347.4 ,6309.8 ,1.01 > > skylake ,sse2 ,131087 ,0 ,3 > > ,7318.4 ,7486.2 ,0.98 > > skylake ,sse2 ,131103 ,3 ,0 > > ,6297.4 ,6516.8 ,0.97 > > skylake ,sse2 ,131135 ,3 ,5 > > ,7544.5 ,7823.5 ,0.96 > > skylake ,sse2 ,131072 ,0 ,127 > > ,7426.4 ,7554.3 ,0.98 > > skylake ,sse2 ,131072 ,0 ,255 > > ,7349.0 ,7195.4 ,1.02 > > skylake ,sse2 ,131072 ,0 ,256 > > ,7068.1 ,6804.8 ,1.04 > > skylake ,sse2 ,131072 ,0 ,4064 > > ,6884.6 ,7566.7 ,0.91 > > skylake ,sse2 ,262151 ,0 ,0 > > ,15848.1 ,15552.2 ,1.02 > > skylake ,sse2 ,262159 ,0 ,3 > > ,17864.6 ,16787.9 ,1.06 > > skylake ,sse2 ,262175 ,3 ,0 > > ,15748.1 ,16266.0 ,0.97 > > skylake ,sse2 ,262207 ,3 ,5 > > ,17022.3 ,17229.8 ,0.99 > > skylake ,sse2 ,262144 ,0 ,127 > > ,16158.7 ,16093.6 ,1.0 > > skylake ,sse2 ,262144 ,0 ,255 > > ,15670.7 ,15949.2 ,0.98 > > skylake ,sse2 ,262144 ,0 ,256 > > ,14806.3 ,14970.3 ,0.99 > > skylake ,sse2 ,262144 ,0 ,4064 > > ,14751.7 ,15008.2 ,0.98 > > skylake ,sse2 ,524295 ,0 ,0 > > ,32874.8 ,33731.2 ,0.97 > > skylake ,sse2 ,524303 ,0 ,3 > > ,34035.1 ,34777.8 ,0.98 > > skylake ,sse2 ,524319 ,3 ,0 > > ,34325.6 ,34108.9 ,1.01 > > skylake ,sse2 ,524351 ,3 ,5 > > ,34853.5 ,35624.4 ,0.98 > > skylake ,sse2 ,524288 ,0 ,127 > > ,33437.4 ,33816.7 ,0.99 > > skylake ,sse2 ,524288 ,0 ,255 > > ,33256.1 ,33664.7 ,0.99 > > skylake ,sse2 ,524288 ,0 ,256 > > ,32006.3 ,32396.3 ,0.99 > > skylake ,sse2 ,524288 ,0 ,4064 > > 
,32284.7 ,32713.9 ,0.99 > > skylake ,sse2 ,1048583 ,0 ,0 > > ,71891.7 ,73858.4 ,0.97 > > skylake ,sse2 ,1048591 ,0 ,3 > > ,74621.3 ,74389.7 ,1.0 > > skylake ,sse2 ,1048607 ,3 ,0 > > ,72515.0 ,73573.2 ,0.99 > > skylake ,sse2 ,1048639 ,3 ,5 > > ,72471.7 ,73782.6 ,0.98 > > skylake ,sse2 ,1048576 ,0 ,127 > > ,77638.6 ,82474.6 ,0.94 > > skylake ,sse2 ,1048576 ,0 ,255 > > ,71870.0 ,71933.6 ,1.0 > > skylake ,sse2 ,1048576 ,0 ,256 > > ,70410.0 ,73243.6 ,0.96 > > skylake ,sse2 ,1048576 ,0 ,4064 > > ,71267.1 ,72274.6 ,0.99 > > skylake ,sse2 ,2097159 ,0 ,0 > > ,140052.6 ,144880.1 ,0.97 > > skylake ,sse2 ,2097167 ,0 ,3 > > ,146626.5 ,147972.6 ,0.99 > > skylake ,sse2 ,2097183 ,3 ,0 > > ,141750.1 ,146353.6 ,0.97 > > skylake ,sse2 ,2097215 ,3 ,5 > > ,144169.0 ,148120.1 ,0.97 > > skylake ,sse2 ,2097152 ,0 ,127 > > ,156575.9 ,165844.4 ,0.94 > > skylake ,sse2 ,2097152 ,0 ,255 > > ,144277.7 ,146971.5 ,0.98 > > skylake ,sse2 ,2097152 ,0 ,256 > > ,143047.4 ,146810.9 ,0.97 > > skylake ,sse2 ,2097152 ,0 ,4064 > > ,142795.6 ,145805.8 ,0.98 > > skylake ,sse2 ,4194311 ,0 ,0 > > ,284353.3 ,298092.5 ,0.95 > > skylake ,sse2 ,4194319 ,0 ,3 > > ,296656.4 ,311960.2 ,0.95 > > skylake ,sse2 ,4194335 ,3 ,0 > > ,285922.6 ,304100.5 ,0.94 > > skylake ,sse2 ,4194367 ,3 ,5 > > ,297135.4 ,312532.5 ,0.95 > > skylake ,sse2 ,4194304 ,0 ,127 > > ,323938.6 ,340414.3 ,0.95 > > skylake ,sse2 ,4194304 ,0 ,255 > > ,301460.9 ,310042.7 ,0.97 > > skylake ,sse2 ,4194304 ,0 ,256 > > ,287155.8 ,303580.6 ,0.95 > > skylake ,sse2 ,4194304 ,0 ,4064 > > ,291006.2 ,302441.3 ,0.96 > > skylake ,sse2 ,8388615 ,0 ,0 > > ,714424.7 ,747484.3 ,0.96 > > skylake ,sse2 ,8388623 ,0 ,3 > > ,748995.5 ,774116.5 ,0.97 > > skylake ,sse2 ,8388639 ,3 ,0 > > ,720563.4 ,757386.9 ,0.95 > > skylake ,sse2 ,8388671 ,3 ,5 > > ,748028.7 ,773907.8 ,0.97 > > skylake ,sse2 ,8388608 ,0 ,127 > > ,750775.3 ,780245.2 ,0.96 > > skylake ,sse2 ,8388608 ,0 ,255 > > ,724940.3 ,764197.8 ,0.95 > > skylake ,sse2 ,8388608 ,0 ,256 > > ,722035.0 ,759408.9 ,0.95 > > 
skylake ,sse2 ,8388608 ,0 ,4064 > > ,756977.8 ,755532.4 ,1.0 > > skylake ,sse2 ,16777223 ,0 ,0 > > ,1971686.0 ,2111263.4 ,0.93 > > skylake ,sse2 ,16777231 ,0 ,3 > > ,1953608.9 ,2128493.8 ,0.92 > > skylake ,sse2 ,16777247 ,3 ,0 > > ,1967075.6 ,2103772.3 ,0.94 > > skylake ,sse2 ,16777279 ,3 ,5 > > ,1950851.6 ,2133601.6 ,0.91 > > skylake ,sse2 ,16777216 ,0 ,127 > > ,1991168.2 ,2078249.3 ,0.96 > > skylake ,sse2 ,16777216 ,0 ,255 > > ,1958502.9 ,2111955.5 ,0.93 > > skylake ,sse2 ,16777216 ,0 ,256 > > ,1965103.7 ,2114293.0 ,0.93 > > skylake ,sse2 ,16777216 ,0 ,4064 > > ,1958381.3 ,2103438.6 ,0.93 > > skylake ,sse2 ,33554439 ,0 ,0 > > ,4456144.2 ,4660837.1 ,0.96 > > skylake ,sse2 ,33554447 ,0 ,3 > > ,4431097.0 ,4679042.6 ,0.95 > > skylake ,sse2 ,33554463 ,3 ,0 > > ,4448225.6 ,4648538.3 ,0.96 > > skylake ,sse2 ,33554495 ,3 ,5 > > ,4427743.0 ,4678340.1 ,0.95 > > skylake ,sse2 ,33554432 ,0 ,127 > > ,4437517.3 ,4552005.9 ,0.97 > > skylake ,sse2 ,33554432 ,0 ,255 > > ,4427135.1 ,4543412.0 ,0.97 > > skylake ,sse2 ,33554432 ,0 ,256 > > ,4441311.2 ,4658315.5 ,0.95 > > skylake ,sse2 ,33554432 ,0 ,4064 > > ,4429798.4 ,4659499.6 ,0.95 skylake ,avx ,65543 > > ,0 ,0 ,3115.8 ,3043.7 ,1.02 > > skylake ,avx ,65551 ,0 ,3 > > ,3673.2 ,3551.7 ,1.03 > > skylake ,avx ,65567 ,3 ,0 > > ,3024.6 ,2887.4 ,1.05 > > skylake ,avx ,65599 ,3 ,5 > > ,3907.8 ,3636.4 ,1.07 > > skylake ,avx ,65536 ,0 ,127 > > ,3539.2 ,3372.3 ,1.05 > > skylake ,avx ,65536 ,0 ,255 > > ,3489.9 ,3344.0 ,1.04 > > skylake ,avx ,65536 ,0 ,256 > > ,3059.0 ,2924.4 ,1.05 > > skylake ,avx ,65536 ,0 ,4064 > > ,2805.0 ,2869.3 ,0.98 > > skylake ,avx ,131079 ,0 ,0 > > ,6129.2 ,6263.4 ,0.98 > > skylake ,avx ,131087 ,0 ,3 > > ,7096.8 ,7570.0 ,0.94 > > skylake ,avx ,131103 ,3 ,0 > > ,6394.5 ,6842.5 ,0.93 > > skylake ,avx ,131135 ,3 ,5 > > ,7462.8 ,7776.0 ,0.96 > > skylake ,avx ,131072 ,0 ,127 > > ,7726.9 ,7428.5 ,1.04 > > skylake ,avx ,131072 ,0 ,255 > > ,7167.4 ,7278.9 ,0.98 > > skylake ,avx ,131072 ,0 ,256 > > ,7197.9 ,6284.3 ,1.15 > > 
skylake ,avx ,131072 ,0 ,4064 > > ,6984.0 ,6940.4 ,1.01 > > skylake ,avx ,262151 ,0 ,0 > > ,15787.3 ,16403.1 ,0.96 > > skylake ,avx ,262159 ,0 ,3 > > ,17800.1 ,17628.1 ,1.01 > > skylake ,avx ,262175 ,3 ,0 > > ,16622.8 ,16244.3 ,1.02 > > skylake ,avx ,262207 ,3 ,5 > > ,16989.7 ,17509.0 ,0.97 > > skylake ,avx ,262144 ,0 ,127 > > ,16190.8 ,15971.8 ,1.01 > > skylake ,avx ,262144 ,0 ,255 > > ,15787.1 ,15876.7 ,0.99 > > skylake ,avx ,262144 ,0 ,256 > > ,14840.1 ,14997.0 ,0.99 > > skylake ,avx ,262144 ,0 ,4064 > > ,15743.0 ,14976.2 ,1.05 > > skylake ,avx ,524295 ,0 ,0 > > ,32848.5 ,33397.8 ,0.98 > > skylake ,avx ,524303 ,0 ,3 > > ,34872.1 ,34862.2 ,1.0 > > skylake ,avx ,524319 ,3 ,0 > > ,33784.6 ,34023.8 ,0.99 > > skylake ,avx ,524351 ,3 ,5 > > ,35337.1 ,35364.5 ,1.0 > > skylake ,avx ,524288 ,0 ,127 > > ,33624.5 ,33596.5 ,1.0 > > skylake ,avx ,524288 ,0 ,255 > > ,33390.7 ,33842.8 ,0.99 > > skylake ,avx ,524288 ,0 ,256 > > ,31937.0 ,32357.2 ,0.99 > > skylake ,avx ,524288 ,0 ,4064 > > ,32233.5 ,32267.3 ,1.0 > > skylake ,avx ,1048583 ,0 ,0 > > ,100354.7 ,105840.6 ,0.95 > > skylake ,avx ,1048591 ,0 ,3 > > ,68102.5 ,67496.0 ,1.01 > > skylake ,avx ,1048607 ,3 ,0 > > ,66146.1 ,67540.0 ,0.98 > > skylake ,avx ,1048639 ,3 ,5 > > ,67530.8 ,67726.4 ,1.0 > > skylake ,avx ,1048576 ,0 ,127 > > ,67105.6 ,66533.5 ,1.01 > > skylake ,avx ,1048576 ,0 ,255 > > ,67101.8 ,65666.7 ,1.02 > > skylake ,avx ,1048576 ,0 ,256 > > ,65092.6 ,67103.0 ,0.97 > > skylake ,avx ,1048576 ,0 ,4064 > > ,65700.0 ,67031.5 ,0.98 > > skylake ,avx ,2097159 ,0 ,0 > > ,133101.0 ,135171.6 ,0.98 > > skylake ,avx ,2097167 ,0 ,3 > > ,134174.4 ,135782.1 ,0.99 > > skylake ,avx ,2097183 ,3 ,0 > > ,132056.4 ,134170.0 ,0.98 > > skylake ,avx ,2097215 ,3 ,5 > > ,134413.5 ,136341.1 ,0.99 > > skylake ,avx ,2097152 ,0 ,127 > > ,133003.9 ,132992.1 ,1.0 > > skylake ,avx ,2097152 ,0 ,255 > > ,133344.3 ,132883.1 ,1.0 > > skylake ,avx ,2097152 ,0 ,256 > > ,134051.7 ,136185.8 ,0.98 > > skylake ,avx ,2097152 ,0 ,4064 > > ,132976.3 
,135029.4 ,0.98 > > skylake ,avx ,4194311 ,0 ,0 > > ,268004.1 ,282650.3 ,0.95 > > skylake ,avx ,4194319 ,0 ,3 > > ,270270.0 ,286700.3 ,0.94 > > skylake ,avx ,4194335 ,3 ,0 > > ,264288.5 ,279582.4 ,0.95 > > skylake ,avx ,4194367 ,3 ,5 > > ,270498.4 ,286294.5 ,0.94 > > skylake ,avx ,4194304 ,0 ,127 > > ,271219.3 ,275129.8 ,0.99 > > skylake ,avx ,4194304 ,0 ,255 > > ,269996.5 ,270227.6 ,1.0 > > skylake ,avx ,4194304 ,0 ,256 > > ,267901.1 ,281673.1 ,0.95 > > skylake ,avx ,4194304 ,0 ,4064 > > ,268390.0 ,279100.3 ,0.96 > > skylake ,avx ,8388615 ,0 ,0 > > ,803547.9 ,813229.9 ,0.99 > > skylake ,avx ,8388623 ,0 ,3 > > ,828872.4 ,869413.0 ,0.95 > > skylake ,avx ,8388639 ,3 ,0 > > ,818000.0 ,873781.7 ,0.94 > > skylake ,avx ,8388671 ,3 ,5 > > ,824679.0 ,863561.5 ,0.95 > > skylake ,avx ,8388608 ,0 ,127 > > ,800728.5 ,779000.8 ,1.03 > > skylake ,avx ,8388608 ,0 ,255 > > ,820071.4 ,770113.2 ,1.06 > > skylake ,avx ,8388608 ,0 ,256 > > ,825624.6 ,867247.7 ,0.95 > > skylake ,avx ,8388608 ,0 ,4064 > > ,830209.7 ,894086.6 ,0.93 > > skylake ,avx ,16777223 ,0 ,0 > > ,1989391.3 ,2132829.8 ,0.93 > > skylake ,avx ,16777231 ,0 ,3 > > ,1994225.1 ,2211556.0 ,0.9 > > skylake ,avx ,16777247 ,3 ,0 > > ,1993572.9 ,2213029.9 ,0.9 > > skylake ,avx ,16777279 ,3 ,5 > > ,2001956.9 ,2211769.7 ,0.91 > > skylake ,avx ,16777216 ,0 ,127 > > ,1968155.9 ,2127764.7 ,0.92 > > skylake ,avx ,16777216 ,0 ,255 > > ,1978305.1 ,2121371.3 ,0.93 > > skylake ,avx ,16777216 ,0 ,256 > > ,1993261.9 ,2206494.1 ,0.9 > > skylake ,avx ,16777216 ,0 ,4064 > > ,1993808.3 ,2198137.4 ,0.91 > > skylake ,avx ,33554439 ,0 ,0 > > ,4540216.7 ,4870021.8 ,0.93 > > skylake ,avx ,33554447 ,0 ,3 > > ,4483505.3 ,4850545.5 ,0.92 > > skylake ,avx ,33554463 ,3 ,0 > > ,4501944.5 ,4870922.4 ,0.92 > > skylake ,avx ,33554495 ,3 ,5 > > ,4484565.5 ,4845392.4 ,0.93 > > skylake ,avx ,33554432 ,0 ,127 > > ,4408639.3 ,4701698.6 ,0.94 > > skylake ,avx ,33554432 ,0 ,255 > > ,4445826.0 ,4678142.9 ,0.95 > > skylake ,avx ,33554432 ,0 ,256 > > ,4497953.2 
,4844498.6 ,0.93 > > skylake ,avx ,33554432 ,0 ,4064 > > ,4501572.4 ,4839209.4 ,0.93 > > > > > > > > -- > > > H.J.
On Sat, Apr 03, 2021 at 04:12:15AM -0400, Noah Goldstein wrote: > From: noah <goldstein.w.n@gmail.com> > > No Bug. This commit updates the large memcpy case (no overlap). The > update is to perform memcpy on either 2 or 4 contiguous pages at > once. This 1) helps to alleviate the effects of false memory aliasing > when destination and source have a close 4k alignment and 2) in most > cases and for most DRAM units is a modestly more efficient access > pattern. These changes are a clear performance improvement for > VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy, > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all > pass. > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > --- > Issue was alignment related AFAICT. Added `.p2align 4` in front of the > loops and no longer see any meaningful regression. > > Also added back the temporal stores for the tail. Saw a regression > when doing these tests. > > Two tables below for skylake and icelake numbers for the areas around > where you saw the regression. Below is all data from the tests. > > N = 10. 
> > Skylake > Len ,align1 ,align2 ,new mean ,old mean > 4103 ,0 ,64 ,84.5 ,88.6 > 4111 ,0 ,3 ,99.0 ,99.9 > 4127 ,3 ,0 ,102.1 ,102.3 > 4159 ,3 ,7 ,88.7 ,90.9 > 4223 ,9 ,5 ,88.1 ,87.4 > 8199 ,0 ,64 ,146.7 ,150.2 > 8207 ,0 ,3 ,167.9 ,168.5 > 8223 ,3 ,0 ,168.5 ,168.1 > 8255 ,3 ,7 ,157.0 ,159.2 > 8319 ,9 ,5 ,155.5 ,155.7 > 16391 ,0 ,64 ,286.2 ,288.8 > 16399 ,0 ,3 ,307.0 ,308.7 > 16415 ,3 ,0 ,307.4 ,307.6 > 16447 ,3 ,7 ,294.6 ,295.5 > 16511 ,9 ,5 ,291.5 ,462.1 > 32775 ,0 ,64 ,603.4 ,601.5 > 32783 ,0 ,3 ,604.8 ,606.4 > 32799 ,3 ,0 ,603.0 ,604.1 > 32831 ,3 ,7 ,600.2 ,737.3 > 32895 ,9 ,5 ,604.4 ,599.5 > 65543 ,0 ,64 ,1873.5 ,1854.3 > 65551 ,0 ,3 ,1862.9 ,1846.6 > 65567 ,3 ,0 ,1885.5 ,1966.0 > 65599 ,3 ,7 ,1833.2 ,1833.1 > 65663 ,9 ,5 ,1884.9 ,1887.4 > 131079 ,0 ,64 ,3944.3 ,3949.4 > 131087 ,0 ,3 ,3927.3 ,3913.3 > 131103 ,3 ,0 ,4415.8 ,4169.4 > 131135 ,3 ,7 ,4224.5 ,4157.6 > 131199 ,9 ,5 ,5974.0 ,4983.8 > 262151 ,0 ,64 ,11050.2 ,10620.6 > 262159 ,0 ,3 ,9932.8 ,10037.3 > 262175 ,3 ,0 ,10188.8 ,9206.6 > 262207 ,3 ,7 ,9633.3 ,9216.7 > 262271 ,9 ,5 ,9732.7 ,9345.3 > 524295 ,0 ,64 ,24823.9 ,24880.7 > 524303 ,0 ,3 ,24514.0 ,24556.7 > 524319 ,3 ,0 ,23974.4 ,24219.9 > 524351 ,3 ,7 ,24159.7 ,24207.0 > 524415 ,9 ,5 ,23946.5 ,24142.8 > > Icelake: > Len ,align1 ,align2 ,new mean ,old mean > 4103 ,0 ,64 ,50.2 ,63.7 > 4111 ,0 ,3 ,63.7 ,65.1 > 4127 ,3 ,0 ,68.2 ,69.4 > 4159 ,3 ,7 ,59.6 ,68.0 > 4223 ,9 ,5 ,68.2 ,66.8 > 8199 ,0 ,64 ,92.1 ,89.9 > 8207 ,0 ,3 ,119.7 ,118.3 > 8223 ,3 ,0 ,119.1 ,120.9 > 8255 ,3 ,7 ,122.9 ,123.7 > 8319 ,9 ,5 ,122.1 ,121.8 > 16391 ,0 ,64 ,162.7 ,158.0 > 16399 ,0 ,3 ,227.6 ,234.1 > 16415 ,3 ,0 ,230.8 ,232.7 > 16447 ,3 ,7 ,226.8 ,232.6 > 16511 ,9 ,5 ,233.4 ,233.8 > 32775 ,0 ,64 ,312.2 ,301.8 > 32783 ,0 ,3 ,449.7 ,450.0 > 32799 ,3 ,0 ,452.7 ,455.9 > 32831 ,3 ,7 ,449.8 ,458.0 > 32895 ,9 ,5 ,456.3 ,459.4 > 65543 ,0 ,64 ,1460.6 ,1463.9 > 65551 ,0 ,3 ,1462.0 ,1465.4 > 65567 ,3 ,0 ,1466.6 ,1480.4 > 65599 ,3 ,7 ,1488.0 ,1488.9 > 65663 ,9 ,5 ,1680.8 ,1499.5 > 131079 ,0 ,64 
,2988.5 ,3010.1 > 131087 ,0 ,3 ,2995.5 ,2996.4 > 131103 ,3 ,0 ,3006.2 ,3000.5 > 131135 ,3 ,7 ,3032.4 ,3073.7 > 131199 ,9 ,5 ,3010.4 ,3027.4 > 262151 ,0 ,64 ,6143.2 ,6079.1 > 262159 ,0 ,3 ,6085.1 ,6075.8 > 262175 ,3 ,0 ,6088.0 ,6064.9 > 262207 ,3 ,7 ,6018.7 ,6023.5 > 262271 ,9 ,5 ,6019.8 ,5959.2 > 524295 ,0 ,64 ,14464.2 ,14095.1 > 524303 ,0 ,3 ,14761.6 ,14050.2 > 524319 ,3 ,0 ,14534.1 ,14087.5 > 524351 ,3 ,7 ,14147.7 ,13903.8 > 524415 ,9 ,5 ,14157.0 ,13982.9 > > > > cpu ,version ,Len ,align1 ,align2 ,new mean ,old mean > skylake ,avx ,4103 ,0 ,64 ,84.5 ,88.6 > skylake ,avx ,4111 ,0 ,3 ,99.0 ,99.9 > skylake ,avx ,4127 ,3 ,0 ,102.1 ,102.3 > skylake ,avx ,4159 ,3 ,7 ,88.7 ,90.9 > skylake ,avx ,4223 ,9 ,5 ,88.1 ,87.4 > skylake ,avx ,8199 ,0 ,64 ,146.7 ,150.2 > skylake ,avx ,8207 ,0 ,3 ,167.9 ,168.5 > skylake ,avx ,8223 ,3 ,0 ,168.5 ,168.1 > skylake ,avx ,8255 ,3 ,7 ,157.0 ,159.2 > skylake ,avx ,8319 ,9 ,5 ,155.5 ,155.7 > skylake ,avx ,16391 ,0 ,64 ,286.2 ,288.8 > skylake ,avx ,16399 ,0 ,3 ,307.0 ,308.7 > skylake ,avx ,16415 ,3 ,0 ,307.4 ,307.6 > skylake ,avx ,16447 ,3 ,7 ,294.6 ,295.5 > skylake ,avx ,16511 ,9 ,5 ,291.5 ,462.1 > skylake ,avx ,32775 ,0 ,64 ,603.4 ,601.5 > skylake ,avx ,32783 ,0 ,3 ,604.8 ,606.4 > skylake ,avx ,32799 ,3 ,0 ,603.0 ,604.1 > skylake ,avx ,32831 ,3 ,7 ,600.2 ,737.3 > skylake ,avx ,32895 ,9 ,5 ,604.4 ,599.5 > skylake ,avx ,65543 ,0 ,64 ,1873.5 ,1854.3 > skylake ,avx ,65551 ,0 ,3 ,1862.9 ,1846.6 > skylake ,avx ,65567 ,3 ,0 ,1885.5 ,1966.0 > skylake ,avx ,65599 ,3 ,7 ,1833.2 ,1833.1 > skylake ,avx ,65663 ,9 ,5 ,1884.9 ,1887.4 > skylake ,avx ,131079 ,0 ,64 ,3944.3 ,3949.4 > skylake ,avx ,131087 ,0 ,3 ,3927.3 ,3913.3 > skylake ,avx ,131103 ,3 ,0 ,4415.8 ,4169.4 > skylake ,avx ,131135 ,3 ,7 ,4224.5 ,4157.6 > skylake ,avx ,131199 ,9 ,5 ,5974.0 ,4983.8 > skylake ,avx ,262151 ,0 ,64 ,11050.2 ,10620.6 > skylake ,avx ,262159 ,0 ,3 ,9932.8 ,10037.3 > skylake ,avx ,262175 ,3 ,0 ,10188.8 ,9206.6 > skylake ,avx ,262207 ,3 ,7 ,9633.3 ,9216.7 > skylake ,avx 
,262271 ,9 ,5 ,9732.7 ,9345.3 > skylake ,avx ,524295 ,0 ,64 ,24823.9 ,24880.7 > skylake ,avx ,524303 ,0 ,3 ,24514.0 ,24556.7 > skylake ,avx ,524319 ,3 ,0 ,23974.4 ,24219.9 > skylake ,avx ,524351 ,3 ,7 ,24159.7 ,24207.0 > skylake ,avx ,524415 ,9 ,5 ,23946.5 ,24142.8 > skylake ,avx ,1048583 ,0 ,64 ,49163.9 ,49454.6 > skylake ,avx ,1048591 ,0 ,3 ,49879.3 ,49400.8 > skylake ,avx ,1048607 ,3 ,0 ,49738.0 ,48864.6 > skylake ,avx ,1048639 ,3 ,7 ,48804.0 ,47588.5 > skylake ,avx ,1048703 ,9 ,5 ,49629.4 ,49796.3 > skylake ,avx ,2097159 ,0 ,64 ,98271.7 ,96330.6 > skylake ,avx ,2097167 ,0 ,3 ,97801.8 ,98638.1 > skylake ,avx ,2097183 ,3 ,0 ,98041.1 ,99287.6 > skylake ,avx ,2097215 ,3 ,7 ,96629.5 ,96521.9 > skylake ,avx ,2097279 ,9 ,5 ,98961.8 ,98909.8 > skylake ,avx ,4194311 ,0 ,64 ,194667.7 ,195377.1 > skylake ,avx ,4194319 ,0 ,3 ,194919.5 ,198576.2 > skylake ,avx ,4194335 ,3 ,0 ,192949.8 ,194584.7 > skylake ,avx ,4194367 ,3 ,7 ,189943.5 ,189177.9 > skylake ,avx ,4194431 ,9 ,5 ,192479.1 ,196494.2 > skylake ,avx ,8388615 ,0 ,64 ,588671.6 ,587215.4 > skylake ,avx ,8388623 ,0 ,3 ,581640.7 ,582812.5 > skylake ,avx ,8388639 ,3 ,0 ,549811.9 ,544697.6 > skylake ,avx ,8388671 ,3 ,7 ,591155.0 ,577951.8 > skylake ,avx ,8388735 ,9 ,5 ,547583.2 ,545133.3 > skylake ,avx ,16777223 ,0 ,64 ,1787503.0 ,1811146.0 > skylake ,avx ,16777231 ,0 ,3 ,1758671.0 ,1756343.0 > skylake ,avx ,16777247 ,3 ,0 ,1691781.0 ,1694661.0 > skylake ,avx ,16777279 ,3 ,7 ,1768150.0 ,1754785.0 > skylake ,avx ,16777343 ,9 ,5 ,1695179.0 ,1710794.0 > skylake ,sse2 ,4103 ,0 ,64 ,150.8 ,150.5 > skylake ,sse2 ,4111 ,0 ,3 ,156.8 ,158.4 > skylake ,sse2 ,4127 ,3 ,0 ,99.7 ,99.4 > skylake ,sse2 ,4159 ,3 ,7 ,154.8 ,154.5 > skylake ,sse2 ,4223 ,9 ,5 ,137.3 ,137.2 > skylake ,sse2 ,8199 ,0 ,64 ,284.8 ,285.5 > skylake ,sse2 ,8207 ,0 ,3 ,296.0 ,296.1 > skylake ,sse2 ,8223 ,3 ,0 ,168.0 ,168.2 > skylake ,sse2 ,8255 ,3 ,7 ,293.0 ,292.4 > skylake ,sse2 ,8319 ,9 ,5 ,251.3 ,250.7 > skylake ,sse2 ,16391 ,0 ,64 ,561.3 ,608.3 > skylake ,sse2 
,16399 ,0 ,3 ,571.0 ,574.8 > skylake ,sse2 ,16415 ,3 ,0 ,305.4 ,305.0 > skylake ,sse2 ,16447 ,3 ,7 ,563.2 ,565.0 > skylake ,sse2 ,16511 ,9 ,5 ,477.1 ,475.1 > skylake ,sse2 ,32775 ,0 ,64 ,1128.2 ,1131.7 > skylake ,sse2 ,32783 ,0 ,3 ,1126.6 ,1131.0 > skylake ,sse2 ,32799 ,3 ,0 ,587.6 ,590.8 > skylake ,sse2 ,32831 ,3 ,7 ,1130.6 ,1126.2 > skylake ,sse2 ,32895 ,9 ,5 ,957.6 ,953.0 > skylake ,sse2 ,65543 ,0 ,64 ,2718.9 ,2704.2 > skylake ,sse2 ,65551 ,0 ,3 ,2724.1 ,2725.0 > skylake ,sse2 ,65567 ,3 ,0 ,1888.4 ,1914.3 > skylake ,sse2 ,65599 ,3 ,7 ,2787.6 ,2748.7 > skylake ,sse2 ,65663 ,9 ,5 ,2400.5 ,2369.4 > skylake ,sse2 ,131079 ,0 ,64 ,5603.3 ,5654.9 > skylake ,sse2 ,131087 ,0 ,3 ,5939.3 ,5871.4 > skylake ,sse2 ,131103 ,3 ,0 ,4272.4 ,4190.0 > skylake ,sse2 ,131135 ,3 ,7 ,7601.4 ,7524.6 > skylake ,sse2 ,131199 ,9 ,5 ,7022.1 ,6864.7 > skylake ,sse2 ,262151 ,0 ,64 ,13736.2 ,14030.0 > skylake ,sse2 ,262159 ,0 ,3 ,12407.3 ,12334.1 > skylake ,sse2 ,262175 ,3 ,0 ,9661.1 ,9249.4 > skylake ,sse2 ,262207 ,3 ,7 ,12850.2 ,12351.6 > skylake ,sse2 ,262271 ,9 ,5 ,10792.6 ,10435.8 > skylake ,sse2 ,524295 ,0 ,64 ,27754.5 ,28177.7 > skylake ,sse2 ,524303 ,0 ,3 ,27766.2 ,28152.0 > skylake ,sse2 ,524319 ,3 ,0 ,24030.9 ,24438.3 > skylake ,sse2 ,524351 ,3 ,7 ,27787.5 ,27933.0 > skylake ,sse2 ,524415 ,9 ,5 ,24263.2 ,25249.1 > skylake ,sse2 ,1048583 ,0 ,64 ,56199.9 ,56039.8 > skylake ,sse2 ,1048591 ,0 ,3 ,56750.2 ,58889.7 > skylake ,sse2 ,1048607 ,3 ,0 ,56394.0 ,55115.3 > skylake ,sse2 ,1048639 ,3 ,7 ,57233.1 ,57473.8 > skylake ,sse2 ,1048703 ,9 ,5 ,56324.3 ,55917.9 > skylake ,sse2 ,2097159 ,0 ,64 ,113234.8 ,114346.4 > skylake ,sse2 ,2097167 ,0 ,3 ,114373.1 ,115522.5 > skylake ,sse2 ,2097183 ,3 ,0 ,108113.3 ,108513.3 > skylake ,sse2 ,2097215 ,3 ,7 ,116863.6 ,116549.9 > skylake ,sse2 ,2097279 ,9 ,5 ,108945.1 ,108843.7 > skylake ,sse2 ,4194311 ,0 ,64 ,230250.1 ,232350.0 > skylake ,sse2 ,4194319 ,0 ,3 ,231895.3 ,235055.6 > skylake ,sse2 ,4194335 ,3 ,0 ,218442.8 ,219199.8 > skylake ,sse2 ,4194367 ,3 
,7 ,242564.2 ,235587.7 > skylake ,sse2 ,4194431 ,9 ,5 ,224167.4 ,215261.8 > skylake ,sse2 ,8388615 ,0 ,64 ,679801.8 ,674832.0 > skylake ,sse2 ,8388623 ,0 ,3 ,684913.2 ,685238.7 > skylake ,sse2 ,8388639 ,3 ,0 ,644865.4 ,631388.6 > skylake ,sse2 ,8388671 ,3 ,7 ,698700.9 ,689316.1 > skylake ,sse2 ,8388735 ,9 ,5 ,644820.2 ,631366.8 > skylake ,sse2 ,16777223 ,0 ,64 ,1877984.0 ,1876437.0 > skylake ,sse2 ,16777231 ,0 ,3 ,1898086.0 ,1913053.0 > skylake ,sse2 ,16777247 ,3 ,0 ,1857018.0 ,1866949.0 > skylake ,sse2 ,16777279 ,3 ,7 ,1914905.0 ,1897134.0 > skylake ,sse2 ,16777343 ,9 ,5 ,1859937.0 ,1881939.0 > icelake ,avx512 ,4103 ,0 ,64 ,75.2 ,75.8 > icelake ,avx512 ,4111 ,0 ,3 ,56.9 ,56.4 > icelake ,avx512 ,4127 ,3 ,0 ,59.1 ,59.6 > icelake ,avx512 ,4159 ,3 ,7 ,50.7 ,51.3 > icelake ,avx512 ,4223 ,9 ,5 ,59.2 ,58.9 > icelake ,avx512 ,8199 ,0 ,64 ,67.8 ,63.9 > icelake ,avx512 ,8207 ,0 ,3 ,89.0 ,89.9 > icelake ,avx512 ,8223 ,3 ,0 ,90.2 ,90.1 > icelake ,avx512 ,8255 ,3 ,7 ,82.6 ,84.9 > icelake ,avx512 ,8319 ,9 ,5 ,91.5 ,92.8 > icelake ,avx512 ,16391 ,0 ,64 ,118.0 ,117.6 > icelake ,avx512 ,16399 ,0 ,3 ,156.5 ,157.0 > icelake ,avx512 ,16415 ,3 ,0 ,157.4 ,157.3 > icelake ,avx512 ,16447 ,3 ,7 ,151.0 ,151.6 > icelake ,avx512 ,16511 ,9 ,5 ,159.1 ,159.6 > icelake ,avx512 ,32775 ,0 ,64 ,231.8 ,230.8 > icelake ,avx512 ,32783 ,0 ,3 ,297.8 ,299.3 > icelake ,avx512 ,32799 ,3 ,0 ,299.1 ,299.0 > icelake ,avx512 ,32831 ,3 ,7 ,293.5 ,295.4 > icelake ,avx512 ,32895 ,9 ,5 ,300.3 ,302.5 > icelake ,avx512 ,65543 ,0 ,64 ,1473.4 ,1479.2 > icelake ,avx512 ,65551 ,0 ,3 ,1438.2 ,1445.3 > icelake ,avx512 ,65567 ,3 ,0 ,1450.3 ,1463.8 > icelake ,avx512 ,65599 ,3 ,7 ,1469.0 ,1473.8 > icelake ,avx512 ,65663 ,9 ,5 ,1480.0 ,1483.5 > icelake ,avx512 ,131079 ,0 ,64 ,3015.1 ,3037.5 > icelake ,avx512 ,131087 ,0 ,3 ,2952.3 ,2960.4 > icelake ,avx512 ,131103 ,3 ,0 ,2966.2 ,2964.4 > icelake ,avx512 ,131135 ,3 ,7 ,2961.6 ,3047.9 > icelake ,avx512 ,131199 ,9 ,5 ,2967.4 ,3183.8 > icelake ,avx512 ,262151 ,0 ,64 ,6206.0 
,6141.5 > icelake ,avx512 ,262159 ,0 ,3 ,5990.8 ,5959.2 > icelake ,avx512 ,262175 ,3 ,0 ,5976.7 ,5963.8 > icelake ,avx512 ,262207 ,3 ,7 ,5939.5 ,5924.3 > icelake ,avx512 ,262271 ,9 ,5 ,5944.6 ,5990.3 > icelake ,avx512 ,524295 ,0 ,64 ,14726.7 ,14307.0 > icelake ,avx512 ,524303 ,0 ,3 ,14344.2 ,14040.5 > icelake ,avx512 ,524319 ,3 ,0 ,14175.0 ,13862.2 > icelake ,avx512 ,524351 ,3 ,7 ,14261.4 ,13821.5 > icelake ,avx512 ,524415 ,9 ,5 ,14266.5 ,14064.7 > icelake ,avx512 ,1048583 ,0 ,64 ,35211.4 ,35414.6 > icelake ,avx512 ,1048591 ,0 ,3 ,35156.8 ,35591.2 > icelake ,avx512 ,1048607 ,3 ,0 ,35273.1 ,35503.3 > icelake ,avx512 ,1048639 ,3 ,7 ,35255.8 ,35725.0 > icelake ,avx512 ,1048703 ,9 ,5 ,35703.6 ,36289.9 > icelake ,avx512 ,2097159 ,0 ,64 ,72613.9 ,72063.2 > icelake ,avx512 ,2097167 ,0 ,3 ,72301.6 ,73504.2 > icelake ,avx512 ,2097183 ,3 ,0 ,73448.8 ,72133.6 > icelake ,avx512 ,2097215 ,3 ,7 ,73762.9 ,72825.8 > icelake ,avx512 ,2097279 ,9 ,5 ,72097.3 ,72914.6 > icelake ,avx512 ,4194311 ,0 ,64 ,144793.4 ,144182.1 > icelake ,avx512 ,4194319 ,0 ,3 ,143710.3 ,145063.3 > icelake ,avx512 ,4194335 ,3 ,0 ,146722.1 ,144046.4 > icelake ,avx512 ,4194367 ,3 ,7 ,144267.0 ,144874.6 > icelake ,avx512 ,4194431 ,9 ,5 ,143808.2 ,144560.0 > icelake ,avx512 ,8388615 ,0 ,64 ,427993.4 ,424521.5 > icelake ,avx512 ,8388623 ,0 ,3 ,470267.1 ,473290.8 > icelake ,avx512 ,8388639 ,3 ,0 ,457179.7 ,461797.7 > icelake ,avx512 ,8388671 ,3 ,7 ,472507.9 ,481561.4 > icelake ,avx512 ,8388735 ,9 ,5 ,463611.9 ,467388.7 > icelake ,avx512 ,16777223 ,0 ,64 ,1490426.0 ,1526996.0 > icelake ,avx512 ,16777231 ,0 ,3 ,1516687.0 ,1517095.0 > icelake ,avx512 ,16777247 ,3 ,0 ,1497688.0 ,1512766.0 > icelake ,avx512 ,16777279 ,3 ,7 ,1512331.0 ,1524317.0 > icelake ,avx512 ,16777343 ,9 ,5 ,1498908.0 ,1500526.0 > icelake ,avx ,4103 ,0 ,64 ,50.2 ,63.7 > icelake ,avx ,4111 ,0 ,3 ,63.7 ,65.1 > icelake ,avx ,4127 ,3 ,0 ,68.2 ,69.4 > icelake ,avx ,4159 ,3 ,7 ,59.6 ,68.0 > icelake ,avx ,4223 ,9 ,5 ,68.2 ,66.8 > icelake ,avx ,8199 ,0 ,64 
,92.1 ,89.9 > icelake ,avx ,8207 ,0 ,3 ,119.7 ,118.3 > icelake ,avx ,8223 ,3 ,0 ,119.1 ,120.9 > icelake ,avx ,8255 ,3 ,7 ,122.9 ,123.7 > icelake ,avx ,8319 ,9 ,5 ,122.1 ,121.8 > icelake ,avx ,16391 ,0 ,64 ,162.7 ,158.0 > icelake ,avx ,16399 ,0 ,3 ,227.6 ,234.1 > icelake ,avx ,16415 ,3 ,0 ,230.8 ,232.7 > icelake ,avx ,16447 ,3 ,7 ,226.8 ,232.6 > icelake ,avx ,16511 ,9 ,5 ,233.4 ,233.8 > icelake ,avx ,32775 ,0 ,64 ,312.2 ,301.8 > icelake ,avx ,32783 ,0 ,3 ,449.7 ,450.0 > icelake ,avx ,32799 ,3 ,0 ,452.7 ,455.9 > icelake ,avx ,32831 ,3 ,7 ,449.8 ,458.0 > icelake ,avx ,32895 ,9 ,5 ,456.3 ,459.4 > icelake ,avx ,65543 ,0 ,64 ,1460.6 ,1463.9 > icelake ,avx ,65551 ,0 ,3 ,1462.0 ,1465.4 > icelake ,avx ,65567 ,3 ,0 ,1466.6 ,1480.4 > icelake ,avx ,65599 ,3 ,7 ,1488.0 ,1488.9 > icelake ,avx ,65663 ,9 ,5 ,1680.8 ,1499.5 > icelake ,avx ,131079 ,0 ,64 ,2988.5 ,3010.1 > icelake ,avx ,131087 ,0 ,3 ,2995.5 ,2996.4 > icelake ,avx ,131103 ,3 ,0 ,3006.2 ,3000.5 > icelake ,avx ,131135 ,3 ,7 ,3032.4 ,3073.7 > icelake ,avx ,131199 ,9 ,5 ,3010.4 ,3027.4 > icelake ,avx ,262151 ,0 ,64 ,6143.2 ,6079.1 > icelake ,avx ,262159 ,0 ,3 ,6085.1 ,6075.8 > icelake ,avx ,262175 ,3 ,0 ,6088.0 ,6064.9 > icelake ,avx ,262207 ,3 ,7 ,6018.7 ,6023.5 > icelake ,avx ,262271 ,9 ,5 ,6019.8 ,5959.2 > icelake ,avx ,524295 ,0 ,64 ,14464.2 ,14095.1 > icelake ,avx ,524303 ,0 ,3 ,14761.6 ,14050.2 > icelake ,avx ,524319 ,3 ,0 ,14534.1 ,14087.5 > icelake ,avx ,524351 ,3 ,7 ,14147.7 ,13903.8 > icelake ,avx ,524415 ,9 ,5 ,14157.0 ,13982.9 > icelake ,avx ,1048583 ,0 ,64 ,36599.0 ,37461.4 > icelake ,avx ,1048591 ,0 ,3 ,36717.8 ,37454.9 > icelake ,avx ,1048607 ,3 ,0 ,36821.2 ,37343.3 > icelake ,avx ,1048639 ,3 ,7 ,36958.0 ,37507.2 > icelake ,avx ,1048703 ,9 ,5 ,36869.2 ,37413.1 > icelake ,avx ,2097159 ,0 ,64 ,74765.8 ,75330.9 > icelake ,avx ,2097167 ,0 ,3 ,75175.4 ,74891.9 > icelake ,avx ,2097183 ,3 ,0 ,75451.4 ,74787.7 > icelake ,avx ,2097215 ,3 ,7 ,75394.8 ,75839.1 > icelake ,avx ,2097279 ,9 ,5 ,75099.2 ,75421.2 > icelake 
,avx ,4194311 ,0 ,64 ,146809.6 ,146619.4 > icelake ,avx ,4194319 ,0 ,3 ,148866.4 ,149898.2 > icelake ,avx ,4194335 ,3 ,0 ,148719.7 ,150165.4 > icelake ,avx ,4194367 ,3 ,7 ,150600.1 ,150925.9 > icelake ,avx ,4194431 ,9 ,5 ,149457.3 ,150519.2 > icelake ,avx ,8388615 ,0 ,64 ,412709.8 ,423666.1 > icelake ,avx ,8388623 ,0 ,3 ,423717.4 ,424418.2 > icelake ,avx ,8388639 ,3 ,0 ,414387.5 ,413445.6 > icelake ,avx ,8388671 ,3 ,7 ,449010.7 ,417553.5 > icelake ,avx ,8388735 ,9 ,5 ,414128.6 ,411815.3 > icelake ,avx ,16777223 ,0 ,64 ,1490032.0 ,1510004.0 > icelake ,avx ,16777231 ,0 ,3 ,1379638.0 ,1422097.0 > icelake ,avx ,16777247 ,3 ,0 ,1418930.0 ,1367557.0 > icelake ,avx ,16777279 ,3 ,7 ,1515152.0 ,1500176.0 > icelake ,avx ,16777343 ,9 ,5 ,1344117.0 ,1411795.0 > icelake ,sse2 ,4103 ,0 ,64 ,113.2 ,114.6 > icelake ,sse2 ,4111 ,0 ,3 ,121.5 ,120.4 > icelake ,sse2 ,4127 ,3 ,0 ,1700.5 ,1771.5 > icelake ,sse2 ,4159 ,3 ,7 ,119.3 ,118.8 > icelake ,sse2 ,4223 ,9 ,5 ,1739.7 ,1735.2 > icelake ,sse2 ,8199 ,0 ,64 ,207.0 ,203.9 > icelake ,sse2 ,8207 ,0 ,3 ,225.5 ,220.8 > icelake ,sse2 ,8223 ,3 ,0 ,3444.3 ,3743.5 > icelake ,sse2 ,8255 ,3 ,7 ,219.9 ,216.8 > icelake ,sse2 ,8319 ,9 ,5 ,4117.1 ,3487.3 > icelake ,sse2 ,16391 ,0 ,64 ,397.1 ,394.3 > icelake ,sse2 ,16399 ,0 ,3 ,439.6 ,428.6 > icelake ,sse2 ,16415 ,3 ,0 ,6997.0 ,7031.2 > icelake ,sse2 ,16447 ,3 ,7 ,426.8 ,421.8 > icelake ,sse2 ,16511 ,9 ,5 ,7037.6 ,7038.3 > icelake ,sse2 ,32775 ,0 ,64 ,790.9 ,779.0 > icelake ,sse2 ,32783 ,0 ,3 ,863.1 ,849.6 > icelake ,sse2 ,32799 ,3 ,0 ,14043.0 ,14390.9 > icelake ,sse2 ,32831 ,3 ,7 ,841.6 ,833.1 > icelake ,sse2 ,32895 ,9 ,5 ,14277.6 ,14344.2 > icelake ,sse2 ,65543 ,0 ,64 ,1897.0 ,1897.3 > icelake ,sse2 ,65551 ,0 ,3 ,1927.1 ,1955.4 > icelake ,sse2 ,65567 ,3 ,0 ,28834.7 ,28727.8 > icelake ,sse2 ,65599 ,3 ,7 ,1961.4 ,1969.7 > icelake ,sse2 ,65663 ,9 ,5 ,28867.6 ,29019.8 > icelake ,sse2 ,131079 ,0 ,64 ,3879.3 ,3872.6 > icelake ,sse2 ,131087 ,0 ,3 ,3955.3 ,3990.7 > icelake ,sse2 ,131103 ,3 ,0 ,58001.8 
,60567.9 > icelake ,sse2 ,131135 ,3 ,7 ,3951.5 ,4002.6 > icelake ,sse2 ,131199 ,9 ,5 ,57886.7 ,58391.4 > icelake ,sse2 ,262151 ,0 ,64 ,7851.4 ,7894.7 > icelake ,sse2 ,262159 ,0 ,3 ,7947.5 ,8016.2 > icelake ,sse2 ,262175 ,3 ,0 ,115036.2 ,115968.6 > icelake ,sse2 ,262207 ,3 ,7 ,7883.9 ,7814.1 > icelake ,sse2 ,262271 ,9 ,5 ,113776.4 ,119733.6 > icelake ,sse2 ,524295 ,0 ,64 ,17198.1 ,16974.9 > icelake ,sse2 ,524303 ,0 ,3 ,17402.2 ,17096.3 > icelake ,sse2 ,524319 ,3 ,0 ,223980.4 ,225889.9 > icelake ,sse2 ,524351 ,3 ,7 ,17034.9 ,16910.3 > icelake ,sse2 ,524415 ,9 ,5 ,224027.7 ,224962.5 > icelake ,sse2 ,1048583 ,0 ,64 ,38822.3 ,39178.6 > icelake ,sse2 ,1048591 ,0 ,3 ,41686.7 ,40247.4 > icelake ,sse2 ,1048607 ,3 ,0 ,38814.8 ,39323.3 > icelake ,sse2 ,1048639 ,3 ,7 ,39568.3 ,41325.7 > icelake ,sse2 ,1048703 ,9 ,5 ,39354.2 ,39637.9 > icelake ,sse2 ,2097159 ,0 ,64 ,84074.7 ,84543.1 > icelake ,sse2 ,2097167 ,0 ,3 ,83665.7 ,82358.2 > icelake ,sse2 ,2097183 ,3 ,0 ,81817.8 ,79638.9 > icelake ,sse2 ,2097215 ,3 ,7 ,83649.1 ,83497.6 > icelake ,sse2 ,2097279 ,9 ,5 ,80287.6 ,79980.9 > icelake ,sse2 ,4194311 ,0 ,64 ,165409.8 ,168343.1 > icelake ,sse2 ,4194319 ,0 ,3 ,165216.7 ,177632.0 > icelake ,sse2 ,4194335 ,3 ,0 ,158718.7 ,160342.2 > icelake ,sse2 ,4194367 ,3 ,7 ,167944.9 ,167204.4 > icelake ,sse2 ,4194431 ,9 ,5 ,161530.1 ,164839.7 > icelake ,sse2 ,8388615 ,0 ,64 ,626504.3 ,629858.5 > icelake ,sse2 ,8388623 ,0 ,3 ,623969.5 ,631509.1 > icelake ,sse2 ,8388639 ,3 ,0 ,599366.7 ,600016.0 > icelake ,sse2 ,8388671 ,3 ,7 ,619964.2 ,619113.2 > icelake ,sse2 ,8388735 ,9 ,5 ,595338.1 ,604172.4 > icelake ,sse2 ,16777223 ,0 ,64 ,1709597.0 ,1725184.0 > icelake ,sse2 ,16777231 ,0 ,3 ,1725452.0 ,1719746.0 > icelake ,sse2 ,16777247 ,3 ,0 ,1614269.0 ,1607164.0 > icelake ,sse2 ,16777279 ,3 ,7 ,1705295.0 ,1733018.0 > icelake ,sse2 ,16777343 ,9 ,5 ,1604197.0 ,1595690.0 > > > .../multiarch/memmove-vec-unaligned-erms.S | 338 ++++++++++++++---- > 1 file changed, 265 insertions(+), 73 deletions(-) > > diff 
--git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > index 897a3d9762..5e4a071f16 100644 > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > @@ -35,7 +35,16 @@ > __x86_rep_movsb_stop_threshold, then REP MOVSB will be used. > 7. If size >= __x86_shared_non_temporal_threshold and there is no > overlap between destination and source, use non-temporal store > - instead of aligned store. */ > + instead of aligned store copying from either 2 or 4 pages at > + once. > + 8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold > + and source and destination do not page alias, copy from 2 pages > + at once using non-temporal stores. Page aliasing in this case is > + considered true if destination's page alignment - sources' page > + alignment is less than 8 * VEC_SIZE. > + 9. If size >= 16 * __x86_shared_non_temporal_threshold or source > + and destination do page alias copy from 4 pages at once using > + non-temporal stores. */ > > #include <sysdep.h> > > @@ -67,6 +76,34 @@ > # endif > #endif > > +#ifndef PAGE_SIZE > +# define PAGE_SIZE 4096 > +#endif > + > +#if PAGE_SIZE != 4096 > +# error Unsupported PAGE_SIZE > +#endif > + > +#ifndef LOG_PAGE_SIZE > +# define LOG_PAGE_SIZE 12 > +#endif > + > +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE) > +# error Invalid LOG_PAGE_SIZE > +#endif > + > +/* Byte per page for large_memcpy inner loop. */ > +#if VEC_SIZE == 64 > +# define LARGE_LOAD_SIZE (VEC_SIZE * 2) > +#else > +# define LARGE_LOAD_SIZE (VEC_SIZE * 4) > +#endif > + > +/* Amount to shift rdx by to compare for memcpy_large_4x. */ > +#ifndef LOG_4X_MEMCPY_THRESH > +# define LOG_4X_MEMCPY_THRESH 4 > +#endif > + > /* Avoid short distance rep movsb only with non-SSE vector. */ > #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB > # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) > @@ -106,6 +143,28 @@ > # error Unsupported PREFETCH_SIZE! 
> #endif > > +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2) > +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ > + VMOVU (offset)base, vec0; \ > + VMOVU ((offset) + VEC_SIZE)base, vec1; > +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \ > + VMOVNT vec0, (offset)base; \ > + VMOVNT vec1, ((offset) + VEC_SIZE)base; > +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) > +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ > + VMOVU (offset)base, vec0; \ > + VMOVU ((offset) + VEC_SIZE)base, vec1; \ > + VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ > + VMOVU ((offset) + VEC_SIZE * 3)base, vec3; > +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ > + VMOVNT vec0, (offset)base; \ > + VMOVNT vec1, ((offset) + VEC_SIZE)base; \ > + VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ > + VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; > +#else > +# error Invalid LARGE_LOAD_SIZE > +#endif > + > #ifndef SECTION > # error SECTION is not defined! > #endif > @@ -393,6 +452,15 @@ L(last_4x_vec): > VZEROUPPER_RETURN > > L(more_8x_vec): > + /* Check if non-temporal move candidate. */ > +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > + /* Check non-temporal store threshold. */ > + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > + ja L(large_memcpy_2x) > +#endif > + /* Entry if rdx is greater than non-temporal threshold but there > + is overlap. */ > +L(more_8x_vec_check): > cmpq %rsi, %rdi > ja L(more_8x_vec_backward) > /* Source == destination is less common. */ > @@ -419,24 +487,21 @@ L(more_8x_vec): > subq %r8, %rdi > /* Adjust length. */ > addq %r8, %rdx > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > - /* Check non-temporal store threshold. */ > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > - ja L(large_forward) > -#endif > + > + .p2align 4 > L(loop_4x_vec_forward): > /* Copy 4 * VEC a time forward. 
*/ > VMOVU (%rsi), %VEC(0) > VMOVU VEC_SIZE(%rsi), %VEC(1) > VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > - addq $(VEC_SIZE * 4), %rsi > - subq $(VEC_SIZE * 4), %rdx > + subq $-(VEC_SIZE * 4), %rsi > + addq $-(VEC_SIZE * 4), %rdx > VMOVA %VEC(0), (%rdi) > VMOVA %VEC(1), VEC_SIZE(%rdi) > VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > - addq $(VEC_SIZE * 4), %rdi > + subq $-(VEC_SIZE * 4), %rdi > cmpq $(VEC_SIZE * 4), %rdx > ja L(loop_4x_vec_forward) > /* Store the last 4 * VEC. */ > @@ -470,24 +535,21 @@ L(more_8x_vec_backward): > subq %r8, %r9 > /* Adjust length. */ > subq %r8, %rdx > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > - /* Check non-temporal store threshold. */ > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > - ja L(large_backward) > -#endif > + > + .p2align 4 > L(loop_4x_vec_backward): > /* Copy 4 * VEC a time backward. */ > VMOVU (%rcx), %VEC(0) > VMOVU -VEC_SIZE(%rcx), %VEC(1) > VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) > VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) > - subq $(VEC_SIZE * 4), %rcx > - subq $(VEC_SIZE * 4), %rdx > + addq $-(VEC_SIZE * 4), %rcx > + addq $-(VEC_SIZE * 4), %rdx > VMOVA %VEC(0), (%r9) > VMOVA %VEC(1), -VEC_SIZE(%r9) > VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) > VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) > - subq $(VEC_SIZE * 4), %r9 > + addq $-(VEC_SIZE * 4), %r9 > cmpq $(VEC_SIZE * 4), %rdx > ja L(loop_4x_vec_backward) > /* Store the first 4 * VEC. */ > @@ -500,72 +562,202 @@ L(loop_4x_vec_backward): > VZEROUPPER_RETURN > > #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > -L(large_forward): > + .p2align 4 > +L(large_memcpy_2x): > + /* Compute absolute value of difference between source and > + destination. 
*/ > + movq %rdi, %r9 > + subq %rsi, %r9 > + movq %r9, %r8 > + leaq -1(%r9), %rcx > + sarq $63, %r8 > + xorq %r8, %r9 > + subq %r8, %r9 > /* Don't use non-temporal store if there is overlap between > - destination and source since destination may be in cache > - when source is loaded. */ > - leaq (%rdi, %rdx), %r10 > - cmpq %r10, %rsi > - jb L(loop_4x_vec_forward) > -L(loop_large_forward): > + destination and source since destination may be in cache when > + source is loaded. */ > + cmpq %r9, %rdx > + ja L(more_8x_vec_check) > + > + /* Cache align destination. First store the first 64 bytes then > + adjust alignments. */ > + VMOVU (%rsi), %VEC(8) > +#if VEC_SIZE < 64 > + VMOVU VEC_SIZE(%rsi), %VEC(9) > +#if VEC_SIZE < 32 > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) > +#endif > +#endif > + VMOVU %VEC(8), (%rdi) > +#if VEC_SIZE < 64 > + VMOVU %VEC(9), VEC_SIZE(%rdi) > +#if VEC_SIZE < 32 > + VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) > + VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) > +#endif > +#endif > + /* Adjust source, destination, and size. */ > + movq %rdi, %r8 > + andq $63, %r8 > + /* Get the negative of offset for alignment. */ > + subq $64, %r8 > + /* Adjust source. */ > + subq %r8, %rsi > + /* Adjust destination which should be aligned now. */ > + subq %r8, %rdi > + /* Adjust length. */ > + addq %r8, %rdx > + > + /* Test if source and destination addresses will alias. If they do > + the larger pipeline in large_memcpy_4x alleviated the > + performance drop. */ > + testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx > + jz L(large_memcpy_4x) > + > + movq %rdx, %r10 > + shrq $LOG_4X_MEMCPY_THRESH, %r10 > + cmp __x86_shared_non_temporal_threshold(%rip), %r10 > + jae L(large_memcpy_4x) > + > + /* edx will store remainder size for copying tail. */ > + andl $(PAGE_SIZE * 2 - 1), %edx > + /* r10 stores outer loop counter. */ > + shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 > + /* Copy 4x VEC at a time from 2 pages. 
*/ > + .p2align 4 > +L(loop_large_memcpy_2x_outer): > + /* ecx stores inner loop counter. */ > + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx > +L(loop_large_memcpy_2x_inner): > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) > + /* Load vectors from rsi. */ > + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > + subq $-LARGE_LOAD_SIZE, %rsi > + /* Non-temporal store vectors to rdi. */ > + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > + subq $-LARGE_LOAD_SIZE, %rdi > + decl %ecx > + jnz L(loop_large_memcpy_2x_inner) > + addq $PAGE_SIZE, %rdi > + addq $PAGE_SIZE, %rsi > + decq %r10 > + jne L(loop_large_memcpy_2x_outer) > + sfence > + > + /* Check if only last 4 loads are needed. */ > + cmpl $(VEC_SIZE * 4), %edx > + jbe L(large_memcpy_2x_end) > + > + /* Handle the last 2 * PAGE_SIZE bytes. */ > +L(loop_large_memcpy_2x_tail): > /* Copy 4 * VEC a time forward with non-temporal stores. 
*/ > - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) > - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) > + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) > + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) > VMOVU (%rsi), %VEC(0) > VMOVU VEC_SIZE(%rsi), %VEC(1) > VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > - addq $PREFETCHED_LOAD_SIZE, %rsi > - subq $PREFETCHED_LOAD_SIZE, %rdx > - VMOVNT %VEC(0), (%rdi) > - VMOVNT %VEC(1), VEC_SIZE(%rdi) > - VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) > - VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) > - addq $PREFETCHED_LOAD_SIZE, %rdi > - cmpq $PREFETCHED_LOAD_SIZE, %rdx > - ja L(loop_large_forward) > - sfence > + subq $-(VEC_SIZE * 4), %rsi > + addl $-(VEC_SIZE * 4), %edx > + VMOVA %VEC(0), (%rdi) > + VMOVA %VEC(1), VEC_SIZE(%rdi) > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > + subq $-(VEC_SIZE * 4), %rdi > + cmpl $(VEC_SIZE * 4), %edx > + ja L(loop_large_memcpy_2x_tail) > + > +L(large_memcpy_2x_end): > /* Store the last 4 * VEC. */ > - VMOVU %VEC(5), (%rcx) > - VMOVU %VEC(6), -VEC_SIZE(%rcx) > - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) > - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) > - /* Store the first VEC. */ > - VMOVU %VEC(4), (%r11) > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) > + > + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) > + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) > + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) > + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) > VZEROUPPER_RETURN > > -L(large_backward): > - /* Don't use non-temporal store if there is overlap between > - destination and source since destination may be in cache > - when source is loaded. */ > - leaq (%rcx, %rdx), %r10 > - cmpq %r10, %r9 > - jb L(loop_4x_vec_backward) > -L(loop_large_backward): > - /* Copy 4 * VEC a time backward with non-temporal stores. 
*/ > - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) > - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) > - VMOVU (%rcx), %VEC(0) > - VMOVU -VEC_SIZE(%rcx), %VEC(1) > - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) > - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) > - subq $PREFETCHED_LOAD_SIZE, %rcx > - subq $PREFETCHED_LOAD_SIZE, %rdx > - VMOVNT %VEC(0), (%r9) > - VMOVNT %VEC(1), -VEC_SIZE(%r9) > - VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) > - VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) > - subq $PREFETCHED_LOAD_SIZE, %r9 > - cmpq $PREFETCHED_LOAD_SIZE, %rdx > - ja L(loop_large_backward) > + .p2align 4 > +L(large_memcpy_4x): > + movq %rdx, %r10 > + /* edx will store remainder size for copying tail. */ > + andl $(PAGE_SIZE * 4 - 1), %edx > + /* r10 stores outer loop counter. */ > + shrq $(LOG_PAGE_SIZE + 2), %r10 > + /* Copy 4x VEC at a time from 4 pages. */ > + .p2align 4 > +L(loop_large_memcpy_4x_outer): > + /* ecx stores inner loop counter. */ > + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx > +L(loop_large_memcpy_4x_inner): > + /* Only one prefetch set per page as doing 4 pages give more time > + for prefetcher to keep up. */ > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) > + /* Load vectors from rsi. */ > + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > + LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) > + LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) > + subq $-LARGE_LOAD_SIZE, %rsi > + /* Non-temporal store vectors to rdi. 
*/ > + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > + STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) > + STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) > + subq $-LARGE_LOAD_SIZE, %rdi > + decl %ecx > + jnz L(loop_large_memcpy_4x_inner) > + addq $(PAGE_SIZE * 3), %rdi > + addq $(PAGE_SIZE * 3), %rsi > + decq %r10 > + jne L(loop_large_memcpy_4x_outer) > sfence > - /* Store the first 4 * VEC. */ > - VMOVU %VEC(4), (%rdi) > - VMOVU %VEC(5), VEC_SIZE(%rdi) > - VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) > - VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) > - /* Store the last VEC. */ > - VMOVU %VEC(8), (%r11) > + /* Check if only last 4 loads are needed. */ > + cmpl $(VEC_SIZE * 4), %edx > + jbe L(large_memcpy_4x_end) > + > + /* Handle the last 4 * PAGE_SIZE bytes. */ > +L(loop_large_memcpy_4x_tail): > + /* Copy 4 * VEC a time forward with non-temporal stores. */ > + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) > + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) > + VMOVU (%rsi), %VEC(0) > + VMOVU VEC_SIZE(%rsi), %VEC(1) > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > + subq $-(VEC_SIZE * 4), %rsi > + addl $-(VEC_SIZE * 4), %edx > + VMOVA %VEC(0), (%rdi) > + VMOVA %VEC(1), VEC_SIZE(%rdi) > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > + subq $-(VEC_SIZE * 4), %rdi > + cmpl $(VEC_SIZE * 4), %edx > + ja L(loop_large_memcpy_4x_tail) > + > +L(large_memcpy_4x_end): > + /* Store the last 4 * VEC. 
*/ > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) > + > + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) > + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) > + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) > + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) > VZEROUPPER_RETURN > #endif > END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) > -- > 2.29.2 > LGTM. Please commit it. Thanks. H.J.
> LGTM. Please commit it. Are you saying that to me or someone else? If it's to me, what do you mean? Is the patch not enough? > Thanks. On Fri, Apr 16, 2021 at 8:59 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Sat, Apr 03, 2021 at 04:12:15AM -0400, Noah Goldstein wrote: > > From: noah <goldstein.w.n@gmail.com> > > > > No Bug. This commit updates the large memcpy case (no overlap). The > > update is to perform memcpy on either 2 or 4 contiguous pages at > > once. This 1) helps to alleviate the effects of false memory aliasing > > when destination and source have a close 4k alignment and 2) In most > > cases and for most DRAM units is a modestly more efficient access > > pattern. These changes are a clear performance improvement for > > VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy, > > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all > > pass. > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > --- > > Issue was alignment related AFAICT. Added `.p2align 4` in front of the > > loops and no longer see any meaningful regression. > > > > Also added back the temporal stores for the tail. Saw a regression > > when doing these tests. > > > > Two tables below for skylake and icelake numbers for the areas around > > where you saw the regression. Below is all data from the tests. > > > > N = 10. 
> > > > Skylake > > Len ,align1 ,align2 ,new mean ,old mean > > 4103 ,0 ,64 ,84.5 ,88.6 > > 4111 ,0 ,3 ,99.0 ,99.9 > > 4127 ,3 ,0 ,102.1 ,102.3 > > 4159 ,3 ,7 ,88.7 ,90.9 > > 4223 ,9 ,5 ,88.1 ,87.4 > > 8199 ,0 ,64 ,146.7 ,150.2 > > 8207 ,0 ,3 ,167.9 ,168.5 > > 8223 ,3 ,0 ,168.5 ,168.1 > > 8255 ,3 ,7 ,157.0 ,159.2 > > 8319 ,9 ,5 ,155.5 ,155.7 > > 16391 ,0 ,64 ,286.2 ,288.8 > > 16399 ,0 ,3 ,307.0 ,308.7 > > 16415 ,3 ,0 ,307.4 ,307.6 > > 16447 ,3 ,7 ,294.6 ,295.5 > > 16511 ,9 ,5 ,291.5 ,462.1 > > 32775 ,0 ,64 ,603.4 ,601.5 > > 32783 ,0 ,3 ,604.8 ,606.4 > > 32799 ,3 ,0 ,603.0 ,604.1 > > 32831 ,3 ,7 ,600.2 ,737.3 > > 32895 ,9 ,5 ,604.4 ,599.5 > > 65543 ,0 ,64 ,1873.5 ,1854.3 > > 65551 ,0 ,3 ,1862.9 ,1846.6 > > 65567 ,3 ,0 ,1885.5 ,1966.0 > > 65599 ,3 ,7 ,1833.2 ,1833.1 > > 65663 ,9 ,5 ,1884.9 ,1887.4 > > 131079 ,0 ,64 ,3944.3 ,3949.4 > > 131087 ,0 ,3 ,3927.3 ,3913.3 > > 131103 ,3 ,0 ,4415.8 ,4169.4 > > 131135 ,3 ,7 ,4224.5 ,4157.6 > > 131199 ,9 ,5 ,5974.0 ,4983.8 > > 262151 ,0 ,64 ,11050.2 ,10620.6 > > 262159 ,0 ,3 ,9932.8 ,10037.3 > > 262175 ,3 ,0 ,10188.8 ,9206.6 > > 262207 ,3 ,7 ,9633.3 ,9216.7 > > 262271 ,9 ,5 ,9732.7 ,9345.3 > > 524295 ,0 ,64 ,24823.9 ,24880.7 > > 524303 ,0 ,3 ,24514.0 ,24556.7 > > 524319 ,3 ,0 ,23974.4 ,24219.9 > > 524351 ,3 ,7 ,24159.7 ,24207.0 > > 524415 ,9 ,5 ,23946.5 ,24142.8 > > > > Icelake: > > Len ,align1 ,align2 ,new mean ,old mean > > 4103 ,0 ,64 ,50.2 ,63.7 > > 4111 ,0 ,3 ,63.7 ,65.1 > > 4127 ,3 ,0 ,68.2 ,69.4 > > 4159 ,3 ,7 ,59.6 ,68.0 > > 4223 ,9 ,5 ,68.2 ,66.8 > > 8199 ,0 ,64 ,92.1 ,89.9 > > 8207 ,0 ,3 ,119.7 ,118.3 > > 8223 ,3 ,0 ,119.1 ,120.9 > > 8255 ,3 ,7 ,122.9 ,123.7 > > 8319 ,9 ,5 ,122.1 ,121.8 > > 16391 ,0 ,64 ,162.7 ,158.0 > > 16399 ,0 ,3 ,227.6 ,234.1 > > 16415 ,3 ,0 ,230.8 ,232.7 > > 16447 ,3 ,7 ,226.8 ,232.6 > > 16511 ,9 ,5 ,233.4 ,233.8 > > 32775 ,0 ,64 ,312.2 ,301.8 > > 32783 ,0 ,3 ,449.7 ,450.0 > > 32799 ,3 ,0 ,452.7 ,455.9 > > 32831 ,3 ,7 ,449.8 ,458.0 > > 32895 ,9 ,5 ,456.3 ,459.4 > > 65543 ,0 ,64 ,1460.6 ,1463.9 > > 
65551 ,0 ,3 ,1462.0 ,1465.4 > > 65567 ,3 ,0 ,1466.6 ,1480.4 > > 65599 ,3 ,7 ,1488.0 ,1488.9 > > 65663 ,9 ,5 ,1680.8 ,1499.5 > > 131079 ,0 ,64 ,2988.5 ,3010.1 > > 131087 ,0 ,3 ,2995.5 ,2996.4 > > 131103 ,3 ,0 ,3006.2 ,3000.5 > > 131135 ,3 ,7 ,3032.4 ,3073.7 > > 131199 ,9 ,5 ,3010.4 ,3027.4 > > 262151 ,0 ,64 ,6143.2 ,6079.1 > > 262159 ,0 ,3 ,6085.1 ,6075.8 > > 262175 ,3 ,0 ,6088.0 ,6064.9 > > 262207 ,3 ,7 ,6018.7 ,6023.5 > > 262271 ,9 ,5 ,6019.8 ,5959.2 > > 524295 ,0 ,64 ,14464.2 ,14095.1 > > 524303 ,0 ,3 ,14761.6 ,14050.2 > > 524319 ,3 ,0 ,14534.1 ,14087.5 > > 524351 ,3 ,7 ,14147.7 ,13903.8 > > 524415 ,9 ,5 ,14157.0 ,13982.9 > > > > > > > > cpu ,version ,Len ,align1 ,align2 ,new mean ,old mean > > skylake ,avx ,4103 ,0 ,64 ,84.5 ,88.6 > > skylake ,avx ,4111 ,0 ,3 ,99.0 ,99.9 > > skylake ,avx ,4127 ,3 ,0 ,102.1 ,102.3 > > skylake ,avx ,4159 ,3 ,7 ,88.7 ,90.9 > > skylake ,avx ,4223 ,9 ,5 ,88.1 ,87.4 > > skylake ,avx ,8199 ,0 ,64 ,146.7 ,150.2 > > skylake ,avx ,8207 ,0 ,3 ,167.9 ,168.5 > > skylake ,avx ,8223 ,3 ,0 ,168.5 ,168.1 > > skylake ,avx ,8255 ,3 ,7 ,157.0 ,159.2 > > skylake ,avx ,8319 ,9 ,5 ,155.5 ,155.7 > > skylake ,avx ,16391 ,0 ,64 ,286.2 ,288.8 > > skylake ,avx ,16399 ,0 ,3 ,307.0 ,308.7 > > skylake ,avx ,16415 ,3 ,0 ,307.4 ,307.6 > > skylake ,avx ,16447 ,3 ,7 ,294.6 ,295.5 > > skylake ,avx ,16511 ,9 ,5 ,291.5 ,462.1 > > skylake ,avx ,32775 ,0 ,64 ,603.4 ,601.5 > > skylake ,avx ,32783 ,0 ,3 ,604.8 ,606.4 > > skylake ,avx ,32799 ,3 ,0 ,603.0 ,604.1 > > skylake ,avx ,32831 ,3 ,7 ,600.2 ,737.3 > > skylake ,avx ,32895 ,9 ,5 ,604.4 ,599.5 > > skylake ,avx ,65543 ,0 ,64 ,1873.5 ,1854.3 > > skylake ,avx ,65551 ,0 ,3 ,1862.9 ,1846.6 > > skylake ,avx ,65567 ,3 ,0 ,1885.5 ,1966.0 > > skylake ,avx ,65599 ,3 ,7 ,1833.2 ,1833.1 > > skylake ,avx ,65663 ,9 ,5 ,1884.9 ,1887.4 > > skylake ,avx ,131079 ,0 ,64 ,3944.3 ,3949.4 > > skylake ,avx ,131087 ,0 ,3 ,3927.3 ,3913.3 > > skylake ,avx ,131103 ,3 ,0 ,4415.8 ,4169.4 > > skylake ,avx ,131135 ,3 ,7 ,4224.5 ,4157.6 > > skylake 
,avx ,131199 ,9 ,5 ,5974.0 ,4983.8 > > skylake ,avx ,262151 ,0 ,64 ,11050.2 ,10620.6 > > skylake ,avx ,262159 ,0 ,3 ,9932.8 ,10037.3 > > skylake ,avx ,262175 ,3 ,0 ,10188.8 ,9206.6 > > skylake ,avx ,262207 ,3 ,7 ,9633.3 ,9216.7 > > skylake ,avx ,262271 ,9 ,5 ,9732.7 ,9345.3 > > skylake ,avx ,524295 ,0 ,64 ,24823.9 ,24880.7 > > skylake ,avx ,524303 ,0 ,3 ,24514.0 ,24556.7 > > skylake ,avx ,524319 ,3 ,0 ,23974.4 ,24219.9 > > skylake ,avx ,524351 ,3 ,7 ,24159.7 ,24207.0 > > skylake ,avx ,524415 ,9 ,5 ,23946.5 ,24142.8 > > skylake ,avx ,1048583 ,0 ,64 ,49163.9 ,49454.6 > > skylake ,avx ,1048591 ,0 ,3 ,49879.3 ,49400.8 > > skylake ,avx ,1048607 ,3 ,0 ,49738.0 ,48864.6 > > skylake ,avx ,1048639 ,3 ,7 ,48804.0 ,47588.5 > > skylake ,avx ,1048703 ,9 ,5 ,49629.4 ,49796.3 > > skylake ,avx ,2097159 ,0 ,64 ,98271.7 ,96330.6 > > skylake ,avx ,2097167 ,0 ,3 ,97801.8 ,98638.1 > > skylake ,avx ,2097183 ,3 ,0 ,98041.1 ,99287.6 > > skylake ,avx ,2097215 ,3 ,7 ,96629.5 ,96521.9 > > skylake ,avx ,2097279 ,9 ,5 ,98961.8 ,98909.8 > > skylake ,avx ,4194311 ,0 ,64 ,194667.7 ,195377.1 > > skylake ,avx ,4194319 ,0 ,3 ,194919.5 ,198576.2 > > skylake ,avx ,4194335 ,3 ,0 ,192949.8 ,194584.7 > > skylake ,avx ,4194367 ,3 ,7 ,189943.5 ,189177.9 > > skylake ,avx ,4194431 ,9 ,5 ,192479.1 ,196494.2 > > skylake ,avx ,8388615 ,0 ,64 ,588671.6 ,587215.4 > > skylake ,avx ,8388623 ,0 ,3 ,581640.7 ,582812.5 > > skylake ,avx ,8388639 ,3 ,0 ,549811.9 ,544697.6 > > skylake ,avx ,8388671 ,3 ,7 ,591155.0 ,577951.8 > > skylake ,avx ,8388735 ,9 ,5 ,547583.2 ,545133.3 > > skylake ,avx ,16777223 ,0 ,64 ,1787503.0 ,1811146.0 > > skylake ,avx ,16777231 ,0 ,3 ,1758671.0 ,1756343.0 > > skylake ,avx ,16777247 ,3 ,0 ,1691781.0 ,1694661.0 > > skylake ,avx ,16777279 ,3 ,7 ,1768150.0 ,1754785.0 > > skylake ,avx ,16777343 ,9 ,5 ,1695179.0 ,1710794.0 > > skylake ,sse2 ,4103 ,0 ,64 ,150.8 ,150.5 > > skylake ,sse2 ,4111 ,0 ,3 ,156.8 ,158.4 > > skylake ,sse2 ,4127 ,3 ,0 ,99.7 ,99.4 > > skylake ,sse2 ,4159 ,3 ,7 ,154.8 ,154.5 > > 
skylake ,sse2 ,4223 ,9 ,5 ,137.3 ,137.2 > > skylake ,sse2 ,8199 ,0 ,64 ,284.8 ,285.5 > > skylake ,sse2 ,8207 ,0 ,3 ,296.0 ,296.1 > > skylake ,sse2 ,8223 ,3 ,0 ,168.0 ,168.2 > > skylake ,sse2 ,8255 ,3 ,7 ,293.0 ,292.4 > > skylake ,sse2 ,8319 ,9 ,5 ,251.3 ,250.7 > > skylake ,sse2 ,16391 ,0 ,64 ,561.3 ,608.3 > > skylake ,sse2 ,16399 ,0 ,3 ,571.0 ,574.8 > > skylake ,sse2 ,16415 ,3 ,0 ,305.4 ,305.0 > > skylake ,sse2 ,16447 ,3 ,7 ,563.2 ,565.0 > > skylake ,sse2 ,16511 ,9 ,5 ,477.1 ,475.1 > > skylake ,sse2 ,32775 ,0 ,64 ,1128.2 ,1131.7 > > skylake ,sse2 ,32783 ,0 ,3 ,1126.6 ,1131.0 > > skylake ,sse2 ,32799 ,3 ,0 ,587.6 ,590.8 > > skylake ,sse2 ,32831 ,3 ,7 ,1130.6 ,1126.2 > > skylake ,sse2 ,32895 ,9 ,5 ,957.6 ,953.0 > > skylake ,sse2 ,65543 ,0 ,64 ,2718.9 ,2704.2 > > skylake ,sse2 ,65551 ,0 ,3 ,2724.1 ,2725.0 > > skylake ,sse2 ,65567 ,3 ,0 ,1888.4 ,1914.3 > > skylake ,sse2 ,65599 ,3 ,7 ,2787.6 ,2748.7 > > skylake ,sse2 ,65663 ,9 ,5 ,2400.5 ,2369.4 > > skylake ,sse2 ,131079 ,0 ,64 ,5603.3 ,5654.9 > > skylake ,sse2 ,131087 ,0 ,3 ,5939.3 ,5871.4 > > skylake ,sse2 ,131103 ,3 ,0 ,4272.4 ,4190.0 > > skylake ,sse2 ,131135 ,3 ,7 ,7601.4 ,7524.6 > > skylake ,sse2 ,131199 ,9 ,5 ,7022.1 ,6864.7 > > skylake ,sse2 ,262151 ,0 ,64 ,13736.2 ,14030.0 > > skylake ,sse2 ,262159 ,0 ,3 ,12407.3 ,12334.1 > > skylake ,sse2 ,262175 ,3 ,0 ,9661.1 ,9249.4 > > skylake ,sse2 ,262207 ,3 ,7 ,12850.2 ,12351.6 > > skylake ,sse2 ,262271 ,9 ,5 ,10792.6 ,10435.8 > > skylake ,sse2 ,524295 ,0 ,64 ,27754.5 ,28177.7 > > skylake ,sse2 ,524303 ,0 ,3 ,27766.2 ,28152.0 > > skylake ,sse2 ,524319 ,3 ,0 ,24030.9 ,24438.3 > > skylake ,sse2 ,524351 ,3 ,7 ,27787.5 ,27933.0 > > skylake ,sse2 ,524415 ,9 ,5 ,24263.2 ,25249.1 > > skylake ,sse2 ,1048583 ,0 ,64 ,56199.9 ,56039.8 > > skylake ,sse2 ,1048591 ,0 ,3 ,56750.2 ,58889.7 > > skylake ,sse2 ,1048607 ,3 ,0 ,56394.0 ,55115.3 > > skylake ,sse2 ,1048639 ,3 ,7 ,57233.1 ,57473.8 > > skylake ,sse2 ,1048703 ,9 ,5 ,56324.3 ,55917.9 > > skylake ,sse2 ,2097159 ,0 ,64 ,113234.8 
,114346.4 > > skylake ,sse2 ,2097167 ,0 ,3 ,114373.1 ,115522.5 > > skylake ,sse2 ,2097183 ,3 ,0 ,108113.3 ,108513.3 > > skylake ,sse2 ,2097215 ,3 ,7 ,116863.6 ,116549.9 > > skylake ,sse2 ,2097279 ,9 ,5 ,108945.1 ,108843.7 > > skylake ,sse2 ,4194311 ,0 ,64 ,230250.1 ,232350.0 > > skylake ,sse2 ,4194319 ,0 ,3 ,231895.3 ,235055.6 > > skylake ,sse2 ,4194335 ,3 ,0 ,218442.8 ,219199.8 > > skylake ,sse2 ,4194367 ,3 ,7 ,242564.2 ,235587.7 > > skylake ,sse2 ,4194431 ,9 ,5 ,224167.4 ,215261.8 > > skylake ,sse2 ,8388615 ,0 ,64 ,679801.8 ,674832.0 > > skylake ,sse2 ,8388623 ,0 ,3 ,684913.2 ,685238.7 > > skylake ,sse2 ,8388639 ,3 ,0 ,644865.4 ,631388.6 > > skylake ,sse2 ,8388671 ,3 ,7 ,698700.9 ,689316.1 > > skylake ,sse2 ,8388735 ,9 ,5 ,644820.2 ,631366.8 > > skylake ,sse2 ,16777223 ,0 ,64 ,1877984.0 ,1876437.0 > > skylake ,sse2 ,16777231 ,0 ,3 ,1898086.0 ,1913053.0 > > skylake ,sse2 ,16777247 ,3 ,0 ,1857018.0 ,1866949.0 > > skylake ,sse2 ,16777279 ,3 ,7 ,1914905.0 ,1897134.0 > > skylake ,sse2 ,16777343 ,9 ,5 ,1859937.0 ,1881939.0 > > icelake ,avx512 ,4103 ,0 ,64 ,75.2 ,75.8 > > icelake ,avx512 ,4111 ,0 ,3 ,56.9 ,56.4 > > icelake ,avx512 ,4127 ,3 ,0 ,59.1 ,59.6 > > icelake ,avx512 ,4159 ,3 ,7 ,50.7 ,51.3 > > icelake ,avx512 ,4223 ,9 ,5 ,59.2 ,58.9 > > icelake ,avx512 ,8199 ,0 ,64 ,67.8 ,63.9 > > icelake ,avx512 ,8207 ,0 ,3 ,89.0 ,89.9 > > icelake ,avx512 ,8223 ,3 ,0 ,90.2 ,90.1 > > icelake ,avx512 ,8255 ,3 ,7 ,82.6 ,84.9 > > icelake ,avx512 ,8319 ,9 ,5 ,91.5 ,92.8 > > icelake ,avx512 ,16391 ,0 ,64 ,118.0 ,117.6 > > icelake ,avx512 ,16399 ,0 ,3 ,156.5 ,157.0 > > icelake ,avx512 ,16415 ,3 ,0 ,157.4 ,157.3 > > icelake ,avx512 ,16447 ,3 ,7 ,151.0 ,151.6 > > icelake ,avx512 ,16511 ,9 ,5 ,159.1 ,159.6 > > icelake ,avx512 ,32775 ,0 ,64 ,231.8 ,230.8 > > icelake ,avx512 ,32783 ,0 ,3 ,297.8 ,299.3 > > icelake ,avx512 ,32799 ,3 ,0 ,299.1 ,299.0 > > icelake ,avx512 ,32831 ,3 ,7 ,293.5 ,295.4 > > icelake ,avx512 ,32895 ,9 ,5 ,300.3 ,302.5 > > icelake ,avx512 ,65543 ,0 ,64 ,1473.4 ,1479.2 
> > icelake ,avx512 ,65551 ,0 ,3 ,1438.2 ,1445.3 > > icelake ,avx512 ,65567 ,3 ,0 ,1450.3 ,1463.8 > > icelake ,avx512 ,65599 ,3 ,7 ,1469.0 ,1473.8 > > icelake ,avx512 ,65663 ,9 ,5 ,1480.0 ,1483.5 > > icelake ,avx512 ,131079 ,0 ,64 ,3015.1 ,3037.5 > > icelake ,avx512 ,131087 ,0 ,3 ,2952.3 ,2960.4 > > icelake ,avx512 ,131103 ,3 ,0 ,2966.2 ,2964.4 > > icelake ,avx512 ,131135 ,3 ,7 ,2961.6 ,3047.9 > > icelake ,avx512 ,131199 ,9 ,5 ,2967.4 ,3183.8 > > icelake ,avx512 ,262151 ,0 ,64 ,6206.0 ,6141.5 > > icelake ,avx512 ,262159 ,0 ,3 ,5990.8 ,5959.2 > > icelake ,avx512 ,262175 ,3 ,0 ,5976.7 ,5963.8 > > icelake ,avx512 ,262207 ,3 ,7 ,5939.5 ,5924.3 > > icelake ,avx512 ,262271 ,9 ,5 ,5944.6 ,5990.3 > > icelake ,avx512 ,524295 ,0 ,64 ,14726.7 ,14307.0 > > icelake ,avx512 ,524303 ,0 ,3 ,14344.2 ,14040.5 > > icelake ,avx512 ,524319 ,3 ,0 ,14175.0 ,13862.2 > > icelake ,avx512 ,524351 ,3 ,7 ,14261.4 ,13821.5 > > icelake ,avx512 ,524415 ,9 ,5 ,14266.5 ,14064.7 > > icelake ,avx512 ,1048583 ,0 ,64 ,35211.4 ,35414.6 > > icelake ,avx512 ,1048591 ,0 ,3 ,35156.8 ,35591.2 > > icelake ,avx512 ,1048607 ,3 ,0 ,35273.1 ,35503.3 > > icelake ,avx512 ,1048639 ,3 ,7 ,35255.8 ,35725.0 > > icelake ,avx512 ,1048703 ,9 ,5 ,35703.6 ,36289.9 > > icelake ,avx512 ,2097159 ,0 ,64 ,72613.9 ,72063.2 > > icelake ,avx512 ,2097167 ,0 ,3 ,72301.6 ,73504.2 > > icelake ,avx512 ,2097183 ,3 ,0 ,73448.8 ,72133.6 > > icelake ,avx512 ,2097215 ,3 ,7 ,73762.9 ,72825.8 > > icelake ,avx512 ,2097279 ,9 ,5 ,72097.3 ,72914.6 > > icelake ,avx512 ,4194311 ,0 ,64 ,144793.4 ,144182.1 > > icelake ,avx512 ,4194319 ,0 ,3 ,143710.3 ,145063.3 > > icelake ,avx512 ,4194335 ,3 ,0 ,146722.1 ,144046.4 > > icelake ,avx512 ,4194367 ,3 ,7 ,144267.0 ,144874.6 > > icelake ,avx512 ,4194431 ,9 ,5 ,143808.2 ,144560.0 > > icelake ,avx512 ,8388615 ,0 ,64 ,427993.4 ,424521.5 > > icelake ,avx512 ,8388623 ,0 ,3 ,470267.1 ,473290.8 > > icelake ,avx512 ,8388639 ,3 ,0 ,457179.7 ,461797.7 > > icelake ,avx512 ,8388671 ,3 ,7 ,472507.9 ,481561.4 > > icelake 
,avx512 ,8388735 ,9 ,5 ,463611.9 ,467388.7 > > icelake ,avx512 ,16777223 ,0 ,64 ,1490426.0 ,1526996.0 > > icelake ,avx512 ,16777231 ,0 ,3 ,1516687.0 ,1517095.0 > > icelake ,avx512 ,16777247 ,3 ,0 ,1497688.0 ,1512766.0 > > icelake ,avx512 ,16777279 ,3 ,7 ,1512331.0 ,1524317.0 > > icelake ,avx512 ,16777343 ,9 ,5 ,1498908.0 ,1500526.0 > > icelake ,avx ,4103 ,0 ,64 ,50.2 ,63.7 > > icelake ,avx ,4111 ,0 ,3 ,63.7 ,65.1 > > icelake ,avx ,4127 ,3 ,0 ,68.2 ,69.4 > > icelake ,avx ,4159 ,3 ,7 ,59.6 ,68.0 > > icelake ,avx ,4223 ,9 ,5 ,68.2 ,66.8 > > icelake ,avx ,8199 ,0 ,64 ,92.1 ,89.9 > > icelake ,avx ,8207 ,0 ,3 ,119.7 ,118.3 > > icelake ,avx ,8223 ,3 ,0 ,119.1 ,120.9 > > icelake ,avx ,8255 ,3 ,7 ,122.9 ,123.7 > > icelake ,avx ,8319 ,9 ,5 ,122.1 ,121.8 > > icelake ,avx ,16391 ,0 ,64 ,162.7 ,158.0 > > icelake ,avx ,16399 ,0 ,3 ,227.6 ,234.1 > > icelake ,avx ,16415 ,3 ,0 ,230.8 ,232.7 > > icelake ,avx ,16447 ,3 ,7 ,226.8 ,232.6 > > icelake ,avx ,16511 ,9 ,5 ,233.4 ,233.8 > > icelake ,avx ,32775 ,0 ,64 ,312.2 ,301.8 > > icelake ,avx ,32783 ,0 ,3 ,449.7 ,450.0 > > icelake ,avx ,32799 ,3 ,0 ,452.7 ,455.9 > > icelake ,avx ,32831 ,3 ,7 ,449.8 ,458.0 > > icelake ,avx ,32895 ,9 ,5 ,456.3 ,459.4 > > icelake ,avx ,65543 ,0 ,64 ,1460.6 ,1463.9 > > icelake ,avx ,65551 ,0 ,3 ,1462.0 ,1465.4 > > icelake ,avx ,65567 ,3 ,0 ,1466.6 ,1480.4 > > icelake ,avx ,65599 ,3 ,7 ,1488.0 ,1488.9 > > icelake ,avx ,65663 ,9 ,5 ,1680.8 ,1499.5 > > icelake ,avx ,131079 ,0 ,64 ,2988.5 ,3010.1 > > icelake ,avx ,131087 ,0 ,3 ,2995.5 ,2996.4 > > icelake ,avx ,131103 ,3 ,0 ,3006.2 ,3000.5 > > icelake ,avx ,131135 ,3 ,7 ,3032.4 ,3073.7 > > icelake ,avx ,131199 ,9 ,5 ,3010.4 ,3027.4 > > icelake ,avx ,262151 ,0 ,64 ,6143.2 ,6079.1 > > icelake ,avx ,262159 ,0 ,3 ,6085.1 ,6075.8 > > icelake ,avx ,262175 ,3 ,0 ,6088.0 ,6064.9 > > icelake ,avx ,262207 ,3 ,7 ,6018.7 ,6023.5 > > icelake ,avx ,262271 ,9 ,5 ,6019.8 ,5959.2 > > icelake ,avx ,524295 ,0 ,64 ,14464.2 ,14095.1 > > icelake ,avx ,524303 ,0 ,3 ,14761.6 ,14050.2 > 
> icelake ,avx ,524319 ,3 ,0 ,14534.1 ,14087.5 > > icelake ,avx ,524351 ,3 ,7 ,14147.7 ,13903.8 > > icelake ,avx ,524415 ,9 ,5 ,14157.0 ,13982.9 > > icelake ,avx ,1048583 ,0 ,64 ,36599.0 ,37461.4 > > icelake ,avx ,1048591 ,0 ,3 ,36717.8 ,37454.9 > > icelake ,avx ,1048607 ,3 ,0 ,36821.2 ,37343.3 > > icelake ,avx ,1048639 ,3 ,7 ,36958.0 ,37507.2 > > icelake ,avx ,1048703 ,9 ,5 ,36869.2 ,37413.1 > > icelake ,avx ,2097159 ,0 ,64 ,74765.8 ,75330.9 > > icelake ,avx ,2097167 ,0 ,3 ,75175.4 ,74891.9 > > icelake ,avx ,2097183 ,3 ,0 ,75451.4 ,74787.7 > > icelake ,avx ,2097215 ,3 ,7 ,75394.8 ,75839.1 > > icelake ,avx ,2097279 ,9 ,5 ,75099.2 ,75421.2 > > icelake ,avx ,4194311 ,0 ,64 ,146809.6 ,146619.4 > > icelake ,avx ,4194319 ,0 ,3 ,148866.4 ,149898.2 > > icelake ,avx ,4194335 ,3 ,0 ,148719.7 ,150165.4 > > icelake ,avx ,4194367 ,3 ,7 ,150600.1 ,150925.9 > > icelake ,avx ,4194431 ,9 ,5 ,149457.3 ,150519.2 > > icelake ,avx ,8388615 ,0 ,64 ,412709.8 ,423666.1 > > icelake ,avx ,8388623 ,0 ,3 ,423717.4 ,424418.2 > > icelake ,avx ,8388639 ,3 ,0 ,414387.5 ,413445.6 > > icelake ,avx ,8388671 ,3 ,7 ,449010.7 ,417553.5 > > icelake ,avx ,8388735 ,9 ,5 ,414128.6 ,411815.3 > > icelake ,avx ,16777223 ,0 ,64 ,1490032.0 ,1510004.0 > > icelake ,avx ,16777231 ,0 ,3 ,1379638.0 ,1422097.0 > > icelake ,avx ,16777247 ,3 ,0 ,1418930.0 ,1367557.0 > > icelake ,avx ,16777279 ,3 ,7 ,1515152.0 ,1500176.0 > > icelake ,avx ,16777343 ,9 ,5 ,1344117.0 ,1411795.0 > > icelake ,sse2 ,4103 ,0 ,64 ,113.2 ,114.6 > > icelake ,sse2 ,4111 ,0 ,3 ,121.5 ,120.4 > > icelake ,sse2 ,4127 ,3 ,0 ,1700.5 ,1771.5 > > icelake ,sse2 ,4159 ,3 ,7 ,119.3 ,118.8 > > icelake ,sse2 ,4223 ,9 ,5 ,1739.7 ,1735.2 > > icelake ,sse2 ,8199 ,0 ,64 ,207.0 ,203.9 > > icelake ,sse2 ,8207 ,0 ,3 ,225.5 ,220.8 > > icelake ,sse2 ,8223 ,3 ,0 ,3444.3 ,3743.5 > > icelake ,sse2 ,8255 ,3 ,7 ,219.9 ,216.8 > > icelake ,sse2 ,8319 ,9 ,5 ,4117.1 ,3487.3 > > icelake ,sse2 ,16391 ,0 ,64 ,397.1 ,394.3 > > icelake ,sse2 ,16399 ,0 ,3 ,439.6 ,428.6 > > icelake 
,sse2 ,16415 ,3 ,0 ,6997.0 ,7031.2 > > icelake ,sse2 ,16447 ,3 ,7 ,426.8 ,421.8 > > icelake ,sse2 ,16511 ,9 ,5 ,7037.6 ,7038.3 > > icelake ,sse2 ,32775 ,0 ,64 ,790.9 ,779.0 > > icelake ,sse2 ,32783 ,0 ,3 ,863.1 ,849.6 > > icelake ,sse2 ,32799 ,3 ,0 ,14043.0 ,14390.9 > > icelake ,sse2 ,32831 ,3 ,7 ,841.6 ,833.1 > > icelake ,sse2 ,32895 ,9 ,5 ,14277.6 ,14344.2 > > icelake ,sse2 ,65543 ,0 ,64 ,1897.0 ,1897.3 > > icelake ,sse2 ,65551 ,0 ,3 ,1927.1 ,1955.4 > > icelake ,sse2 ,65567 ,3 ,0 ,28834.7 ,28727.8 > > icelake ,sse2 ,65599 ,3 ,7 ,1961.4 ,1969.7 > > icelake ,sse2 ,65663 ,9 ,5 ,28867.6 ,29019.8 > > icelake ,sse2 ,131079 ,0 ,64 ,3879.3 ,3872.6 > > icelake ,sse2 ,131087 ,0 ,3 ,3955.3 ,3990.7 > > icelake ,sse2 ,131103 ,3 ,0 ,58001.8 ,60567.9 > > icelake ,sse2 ,131135 ,3 ,7 ,3951.5 ,4002.6 > > icelake ,sse2 ,131199 ,9 ,5 ,57886.7 ,58391.4 > > icelake ,sse2 ,262151 ,0 ,64 ,7851.4 ,7894.7 > > icelake ,sse2 ,262159 ,0 ,3 ,7947.5 ,8016.2 > > icelake ,sse2 ,262175 ,3 ,0 ,115036.2 ,115968.6 > > icelake ,sse2 ,262207 ,3 ,7 ,7883.9 ,7814.1 > > icelake ,sse2 ,262271 ,9 ,5 ,113776.4 ,119733.6 > > icelake ,sse2 ,524295 ,0 ,64 ,17198.1 ,16974.9 > > icelake ,sse2 ,524303 ,0 ,3 ,17402.2 ,17096.3 > > icelake ,sse2 ,524319 ,3 ,0 ,223980.4 ,225889.9 > > icelake ,sse2 ,524351 ,3 ,7 ,17034.9 ,16910.3 > > icelake ,sse2 ,524415 ,9 ,5 ,224027.7 ,224962.5 > > icelake ,sse2 ,1048583 ,0 ,64 ,38822.3 ,39178.6 > > icelake ,sse2 ,1048591 ,0 ,3 ,41686.7 ,40247.4 > > icelake ,sse2 ,1048607 ,3 ,0 ,38814.8 ,39323.3 > > icelake ,sse2 ,1048639 ,3 ,7 ,39568.3 ,41325.7 > > icelake ,sse2 ,1048703 ,9 ,5 ,39354.2 ,39637.9 > > icelake ,sse2 ,2097159 ,0 ,64 ,84074.7 ,84543.1 > > icelake ,sse2 ,2097167 ,0 ,3 ,83665.7 ,82358.2 > > icelake ,sse2 ,2097183 ,3 ,0 ,81817.8 ,79638.9 > > icelake ,sse2 ,2097215 ,3 ,7 ,83649.1 ,83497.6 > > icelake ,sse2 ,2097279 ,9 ,5 ,80287.6 ,79980.9 > > icelake ,sse2 ,4194311 ,0 ,64 ,165409.8 ,168343.1 > > icelake ,sse2 ,4194319 ,0 ,3 ,165216.7 ,177632.0 > > icelake ,sse2 ,4194335 ,3 
,0 ,158718.7 ,160342.2 > > icelake ,sse2 ,4194367 ,3 ,7 ,167944.9 ,167204.4 > > icelake ,sse2 ,4194431 ,9 ,5 ,161530.1 ,164839.7 > > icelake ,sse2 ,8388615 ,0 ,64 ,626504.3 ,629858.5 > > icelake ,sse2 ,8388623 ,0 ,3 ,623969.5 ,631509.1 > > icelake ,sse2 ,8388639 ,3 ,0 ,599366.7 ,600016.0 > > icelake ,sse2 ,8388671 ,3 ,7 ,619964.2 ,619113.2 > > icelake ,sse2 ,8388735 ,9 ,5 ,595338.1 ,604172.4 > > icelake ,sse2 ,16777223 ,0 ,64 ,1709597.0 ,1725184.0 > > icelake ,sse2 ,16777231 ,0 ,3 ,1725452.0 ,1719746.0 > > icelake ,sse2 ,16777247 ,3 ,0 ,1614269.0 ,1607164.0 > > icelake ,sse2 ,16777279 ,3 ,7 ,1705295.0 ,1733018.0 > > icelake ,sse2 ,16777343 ,9 ,5 ,1604197.0 ,1595690.0 > > > > > > .../multiarch/memmove-vec-unaligned-erms.S | 338 ++++++++++++++---- > > 1 file changed, 265 insertions(+), 73 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > > index 897a3d9762..5e4a071f16 100644 > > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > > @@ -35,7 +35,16 @@ > > __x86_rep_movsb_stop_threshold, then REP MOVSB will be used. > > 7. If size >= __x86_shared_non_temporal_threshold and there is no > > overlap between destination and source, use non-temporal store > > - instead of aligned store. */ > > + instead of aligned store copying from either 2 or 4 pages at > > + once. > > + 8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold > > + and source and destination do not page alias, copy from 2 pages > > + at once using non-temporal stores. Page aliasing in this case is > > + considered true if destination's page alignment - sources' page > > + alignment is less than 8 * VEC_SIZE. > > + 9. If size >= 16 * __x86_shared_non_temporal_threshold or source > > + and destination do page alias copy from 4 pages at once using > > + non-temporal stores. 
*/ > > > > #include <sysdep.h> > > > > @@ -67,6 +76,34 @@ > > # endif > > #endif > > > > +#ifndef PAGE_SIZE > > +# define PAGE_SIZE 4096 > > +#endif > > + > > +#if PAGE_SIZE != 4096 > > +# error Unsupported PAGE_SIZE > > +#endif > > + > > +#ifndef LOG_PAGE_SIZE > > +# define LOG_PAGE_SIZE 12 > > +#endif > > + > > +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE) > > +# error Invalid LOG_PAGE_SIZE > > +#endif > > + > > +/* Byte per page for large_memcpy inner loop. */ > > +#if VEC_SIZE == 64 > > +# define LARGE_LOAD_SIZE (VEC_SIZE * 2) > > +#else > > +# define LARGE_LOAD_SIZE (VEC_SIZE * 4) > > +#endif > > + > > +/* Amount to shift rdx by to compare for memcpy_large_4x. */ > > +#ifndef LOG_4X_MEMCPY_THRESH > > +# define LOG_4X_MEMCPY_THRESH 4 > > +#endif > > + > > /* Avoid short distance rep movsb only with non-SSE vector. */ > > #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB > > # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) > > @@ -106,6 +143,28 @@ > > # error Unsupported PREFETCH_SIZE! > > #endif > > > > +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2) > > +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ > > + VMOVU (offset)base, vec0; \ > > + VMOVU ((offset) + VEC_SIZE)base, vec1; > > +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \ > > + VMOVNT vec0, (offset)base; \ > > + VMOVNT vec1, ((offset) + VEC_SIZE)base; > > +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) > > +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ > > + VMOVU (offset)base, vec0; \ > > + VMOVU ((offset) + VEC_SIZE)base, vec1; \ > > + VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ > > + VMOVU ((offset) + VEC_SIZE * 3)base, vec3; > > +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ > > + VMOVNT vec0, (offset)base; \ > > + VMOVNT vec1, ((offset) + VEC_SIZE)base; \ > > + VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ > > + VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; > > +#else > > +# error Invalid LARGE_LOAD_SIZE > > +#endif > > + > > #ifndef SECTION > > # error SECTION is not defined! 
> > #endif > > @@ -393,6 +452,15 @@ L(last_4x_vec): > > VZEROUPPER_RETURN > > > > L(more_8x_vec): > > + /* Check if non-temporal move candidate. */ > > +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > + /* Check non-temporal store threshold. */ > > + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > > + ja L(large_memcpy_2x) > > +#endif > > + /* Entry if rdx is greater than non-temporal threshold but there > > + is overlap. */ > > +L(more_8x_vec_check): > > cmpq %rsi, %rdi > > ja L(more_8x_vec_backward) > > /* Source == destination is less common. */ > > @@ -419,24 +487,21 @@ L(more_8x_vec): > > subq %r8, %rdi > > /* Adjust length. */ > > addq %r8, %rdx > > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > - /* Check non-temporal store threshold. */ > > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > > - ja L(large_forward) > > -#endif > > + > > + .p2align 4 > > L(loop_4x_vec_forward): > > /* Copy 4 * VEC a time forward. */ > > VMOVU (%rsi), %VEC(0) > > VMOVU VEC_SIZE(%rsi), %VEC(1) > > VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > > VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > > - addq $(VEC_SIZE * 4), %rsi > > - subq $(VEC_SIZE * 4), %rdx > > + subq $-(VEC_SIZE * 4), %rsi > > + addq $-(VEC_SIZE * 4), %rdx > > VMOVA %VEC(0), (%rdi) > > VMOVA %VEC(1), VEC_SIZE(%rdi) > > VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > > VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > > - addq $(VEC_SIZE * 4), %rdi > > + subq $-(VEC_SIZE * 4), %rdi > > cmpq $(VEC_SIZE * 4), %rdx > > ja L(loop_4x_vec_forward) > > /* Store the last 4 * VEC. */ > > @@ -470,24 +535,21 @@ L(more_8x_vec_backward): > > subq %r8, %r9 > > /* Adjust length. */ > > subq %r8, %rdx > > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > - /* Check non-temporal store threshold. */ > > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > > - ja L(large_backward) > > -#endif > > + > > + .p2align 4 > > L(loop_4x_vec_backward): > > /* Copy 4 * VEC a time backward. 
*/ > > VMOVU (%rcx), %VEC(0) > > VMOVU -VEC_SIZE(%rcx), %VEC(1) > > VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) > > VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) > > - subq $(VEC_SIZE * 4), %rcx > > - subq $(VEC_SIZE * 4), %rdx > > + addq $-(VEC_SIZE * 4), %rcx > > + addq $-(VEC_SIZE * 4), %rdx > > VMOVA %VEC(0), (%r9) > > VMOVA %VEC(1), -VEC_SIZE(%r9) > > VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) > > VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) > > - subq $(VEC_SIZE * 4), %r9 > > + addq $-(VEC_SIZE * 4), %r9 > > cmpq $(VEC_SIZE * 4), %rdx > > ja L(loop_4x_vec_backward) > > /* Store the first 4 * VEC. */ > > @@ -500,72 +562,202 @@ L(loop_4x_vec_backward): > > VZEROUPPER_RETURN > > > > #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > -L(large_forward): > > + .p2align 4 > > +L(large_memcpy_2x): > > + /* Compute absolute value of difference between source and > > + destination. */ > > + movq %rdi, %r9 > > + subq %rsi, %r9 > > + movq %r9, %r8 > > + leaq -1(%r9), %rcx > > + sarq $63, %r8 > > + xorq %r8, %r9 > > + subq %r8, %r9 > > /* Don't use non-temporal store if there is overlap between > > - destination and source since destination may be in cache > > - when source is loaded. */ > > - leaq (%rdi, %rdx), %r10 > > - cmpq %r10, %rsi > > - jb L(loop_4x_vec_forward) > > -L(loop_large_forward): > > + destination and source since destination may be in cache when > > + source is loaded. */ > > + cmpq %r9, %rdx > > + ja L(more_8x_vec_check) > > + > > + /* Cache align destination. First store the first 64 bytes then > > + adjust alignments. 
*/ > > + VMOVU (%rsi), %VEC(8) > > +#if VEC_SIZE < 64 > > + VMOVU VEC_SIZE(%rsi), %VEC(9) > > +#if VEC_SIZE < 32 > > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) > > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) > > +#endif > > +#endif > > + VMOVU %VEC(8), (%rdi) > > +#if VEC_SIZE < 64 > > + VMOVU %VEC(9), VEC_SIZE(%rdi) > > +#if VEC_SIZE < 32 > > + VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) > > + VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) > > +#endif > > +#endif > > + /* Adjust source, destination, and size. */ > > + movq %rdi, %r8 > > + andq $63, %r8 > > + /* Get the negative of offset for alignment. */ > > + subq $64, %r8 > > + /* Adjust source. */ > > + subq %r8, %rsi > > + /* Adjust destination which should be aligned now. */ > > + subq %r8, %rdi > > + /* Adjust length. */ > > + addq %r8, %rdx > > + > > + /* Test if source and destination addresses will alias. If they do > > + the larger pipeline in large_memcpy_4x alleviated the > > + performance drop. */ > > + testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx > > + jz L(large_memcpy_4x) > > + > > + movq %rdx, %r10 > > + shrq $LOG_4X_MEMCPY_THRESH, %r10 > > + cmp __x86_shared_non_temporal_threshold(%rip), %r10 > > + jae L(large_memcpy_4x) > > + > > + /* edx will store remainder size for copying tail. */ > > + andl $(PAGE_SIZE * 2 - 1), %edx > > + /* r10 stores outer loop counter. */ > > + shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 > > + /* Copy 4x VEC at a time from 2 pages. */ > > + .p2align 4 > > +L(loop_large_memcpy_2x_outer): > > + /* ecx stores inner loop counter. */ > > + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx > > +L(loop_large_memcpy_2x_inner): > > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) > > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) > > + /* Load vectors from rsi. 
*/ > > + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > + subq $-LARGE_LOAD_SIZE, %rsi > > + /* Non-temporal store vectors to rdi. */ > > + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > + subq $-LARGE_LOAD_SIZE, %rdi > > + decl %ecx > > + jnz L(loop_large_memcpy_2x_inner) > > + addq $PAGE_SIZE, %rdi > > + addq $PAGE_SIZE, %rsi > > + decq %r10 > > + jne L(loop_large_memcpy_2x_outer) > > + sfence > > + > > + /* Check if only last 4 loads are needed. */ > > + cmpl $(VEC_SIZE * 4), %edx > > + jbe L(large_memcpy_2x_end) > > + > > + /* Handle the last 2 * PAGE_SIZE bytes. */ > > +L(loop_large_memcpy_2x_tail): > > /* Copy 4 * VEC a time forward with non-temporal stores. */ > > - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) > > - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) > > + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) > > + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) > > VMOVU (%rsi), %VEC(0) > > VMOVU VEC_SIZE(%rsi), %VEC(1) > > VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > > VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > > - addq $PREFETCHED_LOAD_SIZE, %rsi > > - subq $PREFETCHED_LOAD_SIZE, %rdx > > - VMOVNT %VEC(0), (%rdi) > > - VMOVNT %VEC(1), VEC_SIZE(%rdi) > > - VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) > > - VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) > > - addq $PREFETCHED_LOAD_SIZE, %rdi > > - cmpq $PREFETCHED_LOAD_SIZE, %rdx > > - ja L(loop_large_forward) > > - sfence > > + subq $-(VEC_SIZE * 4), %rsi > > + addl $-(VEC_SIZE * 4), %edx > > + VMOVA %VEC(0), (%rdi) > > + VMOVA %VEC(1), VEC_SIZE(%rdi) > > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > > + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > > + subq $-(VEC_SIZE * 4), %rdi > > + cmpl $(VEC_SIZE * 4), %edx > > + ja L(loop_large_memcpy_2x_tail) > > + > > +L(large_memcpy_2x_end): > > /* Store the last 4 * VEC. 
*/ > > - VMOVU %VEC(5), (%rcx) > > - VMOVU %VEC(6), -VEC_SIZE(%rcx) > > - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) > > - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) > > - /* Store the first VEC. */ > > - VMOVU %VEC(4), (%r11) > > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) > > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) > > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) > > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) > > + > > + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) > > + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) > > + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) > > + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) > > VZEROUPPER_RETURN > > > > -L(large_backward): > > - /* Don't use non-temporal store if there is overlap between > > - destination and source since destination may be in cache > > - when source is loaded. */ > > - leaq (%rcx, %rdx), %r10 > > - cmpq %r10, %r9 > > - jb L(loop_4x_vec_backward) > > -L(loop_large_backward): > > - /* Copy 4 * VEC a time backward with non-temporal stores. */ > > - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) > > - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) > > - VMOVU (%rcx), %VEC(0) > > - VMOVU -VEC_SIZE(%rcx), %VEC(1) > > - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) > > - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) > > - subq $PREFETCHED_LOAD_SIZE, %rcx > > - subq $PREFETCHED_LOAD_SIZE, %rdx > > - VMOVNT %VEC(0), (%r9) > > - VMOVNT %VEC(1), -VEC_SIZE(%r9) > > - VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) > > - VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) > > - subq $PREFETCHED_LOAD_SIZE, %r9 > > - cmpq $PREFETCHED_LOAD_SIZE, %rdx > > - ja L(loop_large_backward) > > + .p2align 4 > > +L(large_memcpy_4x): > > + movq %rdx, %r10 > > + /* edx will store remainder size for copying tail. */ > > + andl $(PAGE_SIZE * 4 - 1), %edx > > + /* r10 stores outer loop counter. */ > > + shrq $(LOG_PAGE_SIZE + 2), %r10 > > + /* Copy 4x VEC at a time from 4 pages. */ > > + .p2align 4 > > +L(loop_large_memcpy_4x_outer): > > + /* ecx stores inner loop counter. 
*/ > > + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx > > +L(loop_large_memcpy_4x_inner): > > + /* Only one prefetch set per page as doing 4 pages give more time > > + for prefetcher to keep up. */ > > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) > > + /* Load vectors from rsi. */ > > + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > + LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) > > + LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) > > + subq $-LARGE_LOAD_SIZE, %rsi > > + /* Non-temporal store vectors to rdi. */ > > + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > + STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) > > + STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) > > + subq $-LARGE_LOAD_SIZE, %rdi > > + decl %ecx > > + jnz L(loop_large_memcpy_4x_inner) > > + addq $(PAGE_SIZE * 3), %rdi > > + addq $(PAGE_SIZE * 3), %rsi > > + decq %r10 > > + jne L(loop_large_memcpy_4x_outer) > > sfence > > - /* Store the first 4 * VEC. */ > > - VMOVU %VEC(4), (%rdi) > > - VMOVU %VEC(5), VEC_SIZE(%rdi) > > - VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) > > - VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) > > - /* Store the last VEC. */ > > - VMOVU %VEC(8), (%r11) > > + /* Check if only last 4 loads are needed. */ > > + cmpl $(VEC_SIZE * 4), %edx > > + jbe L(large_memcpy_4x_end) > > + > > + /* Handle the last 4 * PAGE_SIZE bytes. */ > > +L(loop_large_memcpy_4x_tail): > > + /* Copy 4 * VEC a time forward with non-temporal stores. 
*/ > > + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) > > + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) > > + VMOVU (%rsi), %VEC(0) > > + VMOVU VEC_SIZE(%rsi), %VEC(1) > > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > > + subq $-(VEC_SIZE * 4), %rsi > > + addl $-(VEC_SIZE * 4), %edx > > + VMOVA %VEC(0), (%rdi) > > + VMOVA %VEC(1), VEC_SIZE(%rdi) > > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > > + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > > + subq $-(VEC_SIZE * 4), %rdi > > + cmpl $(VEC_SIZE * 4), %edx > > + ja L(loop_large_memcpy_4x_tail) > > + > > +L(large_memcpy_4x_end): > > + /* Store the last 4 * VEC. */ > > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) > > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) > > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) > > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) > > + > > + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) > > + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) > > + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) > > + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) > > VZEROUPPER_RETURN > > #endif > > END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) > > -- > > 2.29.2 > > > > LGTM. Please commit it. > > Thanks. > > > H.J.
On Fri, Apr 16, 2021 at 9:35 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > LGTM. Please commit it. > > Are you saying that to me or someone else? If it's to me, what do you > mean, is the patch not enough? I will commit it for you. > > Thanks. > > On Fri, Apr 16, 2021 at 8:59 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Sat, Apr 03, 2021 at 04:12:15AM -0400, Noah Goldstein wrote: > > > From: noah <goldstein.w.n@gmail.com> > > > > > > No Bug. This commit updates the large memcpy case (no overlap). The > > > update is to perform memcpy on either 2 or 4 contiguous pages at > > > once. This 1) helps to alleviate the effects of false memory aliasing > > > when destination and source have a close 4k alignment and 2) In most > > > cases and for most DRAM units is a modestly more efficient access > > > pattern. These changes are a clear performance improvement for > > > VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy, > > > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all > > > pass. > > > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > > --- > > > Issue was alignment related AFAICT. Added `.p2align 4` in front of the > > > loops and no longer see any meaningful regression. > > > > > > Also added back the temporal stores for the tail. Saw a regression > > > when doing these tests. > > > > > > Two tables below for skylake and icelake numbers for the areas around > > > where you saw the regression. Below is all data from the tests. > > > > > > N = 10.
> > > > > > Skylake > > > Len ,align1 ,align2 ,new mean ,old mean > > > 4103 ,0 ,64 ,84.5 ,88.6 > > > 4111 ,0 ,3 ,99.0 ,99.9 > > > 4127 ,3 ,0 ,102.1 ,102.3 > > > 4159 ,3 ,7 ,88.7 ,90.9 > > > 4223 ,9 ,5 ,88.1 ,87.4 > > > 8199 ,0 ,64 ,146.7 ,150.2 > > > 8207 ,0 ,3 ,167.9 ,168.5 > > > 8223 ,3 ,0 ,168.5 ,168.1 > > > 8255 ,3 ,7 ,157.0 ,159.2 > > > 8319 ,9 ,5 ,155.5 ,155.7 > > > 16391 ,0 ,64 ,286.2 ,288.8 > > > 16399 ,0 ,3 ,307.0 ,308.7 > > > 16415 ,3 ,0 ,307.4 ,307.6 > > > 16447 ,3 ,7 ,294.6 ,295.5 > > > 16511 ,9 ,5 ,291.5 ,462.1 > > > 32775 ,0 ,64 ,603.4 ,601.5 > > > 32783 ,0 ,3 ,604.8 ,606.4 > > > 32799 ,3 ,0 ,603.0 ,604.1 > > > 32831 ,3 ,7 ,600.2 ,737.3 > > > 32895 ,9 ,5 ,604.4 ,599.5 > > > 65543 ,0 ,64 ,1873.5 ,1854.3 > > > 65551 ,0 ,3 ,1862.9 ,1846.6 > > > 65567 ,3 ,0 ,1885.5 ,1966.0 > > > 65599 ,3 ,7 ,1833.2 ,1833.1 > > > 65663 ,9 ,5 ,1884.9 ,1887.4 > > > 131079 ,0 ,64 ,3944.3 ,3949.4 > > > 131087 ,0 ,3 ,3927.3 ,3913.3 > > > 131103 ,3 ,0 ,4415.8 ,4169.4 > > > 131135 ,3 ,7 ,4224.5 ,4157.6 > > > 131199 ,9 ,5 ,5974.0 ,4983.8 > > > 262151 ,0 ,64 ,11050.2 ,10620.6 > > > 262159 ,0 ,3 ,9932.8 ,10037.3 > > > 262175 ,3 ,0 ,10188.8 ,9206.6 > > > 262207 ,3 ,7 ,9633.3 ,9216.7 > > > 262271 ,9 ,5 ,9732.7 ,9345.3 > > > 524295 ,0 ,64 ,24823.9 ,24880.7 > > > 524303 ,0 ,3 ,24514.0 ,24556.7 > > > 524319 ,3 ,0 ,23974.4 ,24219.9 > > > 524351 ,3 ,7 ,24159.7 ,24207.0 > > > 524415 ,9 ,5 ,23946.5 ,24142.8 > > > > > > Icelake: > > > Len ,align1 ,align2 ,new mean ,old mean > > > 4103 ,0 ,64 ,50.2 ,63.7 > > > 4111 ,0 ,3 ,63.7 ,65.1 > > > 4127 ,3 ,0 ,68.2 ,69.4 > > > 4159 ,3 ,7 ,59.6 ,68.0 > > > 4223 ,9 ,5 ,68.2 ,66.8 > > > 8199 ,0 ,64 ,92.1 ,89.9 > > > 8207 ,0 ,3 ,119.7 ,118.3 > > > 8223 ,3 ,0 ,119.1 ,120.9 > > > 8255 ,3 ,7 ,122.9 ,123.7 > > > 8319 ,9 ,5 ,122.1 ,121.8 > > > 16391 ,0 ,64 ,162.7 ,158.0 > > > 16399 ,0 ,3 ,227.6 ,234.1 > > > 16415 ,3 ,0 ,230.8 ,232.7 > > > 16447 ,3 ,7 ,226.8 ,232.6 > > > 16511 ,9 ,5 ,233.4 ,233.8 > > > 32775 ,0 ,64 ,312.2 ,301.8 > > > 32783 ,0 ,3 ,449.7 ,450.0 > 
> > 32799 ,3 ,0 ,452.7 ,455.9 > > > 32831 ,3 ,7 ,449.8 ,458.0 > > > 32895 ,9 ,5 ,456.3 ,459.4 > > > 65543 ,0 ,64 ,1460.6 ,1463.9 > > > 65551 ,0 ,3 ,1462.0 ,1465.4 > > > 65567 ,3 ,0 ,1466.6 ,1480.4 > > > 65599 ,3 ,7 ,1488.0 ,1488.9 > > > 65663 ,9 ,5 ,1680.8 ,1499.5 > > > 131079 ,0 ,64 ,2988.5 ,3010.1 > > > 131087 ,0 ,3 ,2995.5 ,2996.4 > > > 131103 ,3 ,0 ,3006.2 ,3000.5 > > > 131135 ,3 ,7 ,3032.4 ,3073.7 > > > 131199 ,9 ,5 ,3010.4 ,3027.4 > > > 262151 ,0 ,64 ,6143.2 ,6079.1 > > > 262159 ,0 ,3 ,6085.1 ,6075.8 > > > 262175 ,3 ,0 ,6088.0 ,6064.9 > > > 262207 ,3 ,7 ,6018.7 ,6023.5 > > > 262271 ,9 ,5 ,6019.8 ,5959.2 > > > 524295 ,0 ,64 ,14464.2 ,14095.1 > > > 524303 ,0 ,3 ,14761.6 ,14050.2 > > > 524319 ,3 ,0 ,14534.1 ,14087.5 > > > 524351 ,3 ,7 ,14147.7 ,13903.8 > > > 524415 ,9 ,5 ,14157.0 ,13982.9 > > > > > > > > > > > > cpu ,version ,Len ,align1 ,align2 ,new mean ,old mean > > > skylake ,avx ,4103 ,0 ,64 ,84.5 ,88.6 > > > skylake ,avx ,4111 ,0 ,3 ,99.0 ,99.9 > > > skylake ,avx ,4127 ,3 ,0 ,102.1 ,102.3 > > > skylake ,avx ,4159 ,3 ,7 ,88.7 ,90.9 > > > skylake ,avx ,4223 ,9 ,5 ,88.1 ,87.4 > > > skylake ,avx ,8199 ,0 ,64 ,146.7 ,150.2 > > > skylake ,avx ,8207 ,0 ,3 ,167.9 ,168.5 > > > skylake ,avx ,8223 ,3 ,0 ,168.5 ,168.1 > > > skylake ,avx ,8255 ,3 ,7 ,157.0 ,159.2 > > > skylake ,avx ,8319 ,9 ,5 ,155.5 ,155.7 > > > skylake ,avx ,16391 ,0 ,64 ,286.2 ,288.8 > > > skylake ,avx ,16399 ,0 ,3 ,307.0 ,308.7 > > > skylake ,avx ,16415 ,3 ,0 ,307.4 ,307.6 > > > skylake ,avx ,16447 ,3 ,7 ,294.6 ,295.5 > > > skylake ,avx ,16511 ,9 ,5 ,291.5 ,462.1 > > > skylake ,avx ,32775 ,0 ,64 ,603.4 ,601.5 > > > skylake ,avx ,32783 ,0 ,3 ,604.8 ,606.4 > > > skylake ,avx ,32799 ,3 ,0 ,603.0 ,604.1 > > > skylake ,avx ,32831 ,3 ,7 ,600.2 ,737.3 > > > skylake ,avx ,32895 ,9 ,5 ,604.4 ,599.5 > > > skylake ,avx ,65543 ,0 ,64 ,1873.5 ,1854.3 > > > skylake ,avx ,65551 ,0 ,3 ,1862.9 ,1846.6 > > > skylake ,avx ,65567 ,3 ,0 ,1885.5 ,1966.0 > > > skylake ,avx ,65599 ,3 ,7 ,1833.2 ,1833.1 > > > skylake ,avx 
,65663 ,9 ,5 ,1884.9 ,1887.4 > > > skylake ,avx ,131079 ,0 ,64 ,3944.3 ,3949.4 > > > skylake ,avx ,131087 ,0 ,3 ,3927.3 ,3913.3 > > > skylake ,avx ,131103 ,3 ,0 ,4415.8 ,4169.4 > > > skylake ,avx ,131135 ,3 ,7 ,4224.5 ,4157.6 > > > skylake ,avx ,131199 ,9 ,5 ,5974.0 ,4983.8 > > > skylake ,avx ,262151 ,0 ,64 ,11050.2 ,10620.6 > > > skylake ,avx ,262159 ,0 ,3 ,9932.8 ,10037.3 > > > skylake ,avx ,262175 ,3 ,0 ,10188.8 ,9206.6 > > > skylake ,avx ,262207 ,3 ,7 ,9633.3 ,9216.7 > > > skylake ,avx ,262271 ,9 ,5 ,9732.7 ,9345.3 > > > skylake ,avx ,524295 ,0 ,64 ,24823.9 ,24880.7 > > > skylake ,avx ,524303 ,0 ,3 ,24514.0 ,24556.7 > > > skylake ,avx ,524319 ,3 ,0 ,23974.4 ,24219.9 > > > skylake ,avx ,524351 ,3 ,7 ,24159.7 ,24207.0 > > > skylake ,avx ,524415 ,9 ,5 ,23946.5 ,24142.8 > > > skylake ,avx ,1048583 ,0 ,64 ,49163.9 ,49454.6 > > > skylake ,avx ,1048591 ,0 ,3 ,49879.3 ,49400.8 > > > skylake ,avx ,1048607 ,3 ,0 ,49738.0 ,48864.6 > > > skylake ,avx ,1048639 ,3 ,7 ,48804.0 ,47588.5 > > > skylake ,avx ,1048703 ,9 ,5 ,49629.4 ,49796.3 > > > skylake ,avx ,2097159 ,0 ,64 ,98271.7 ,96330.6 > > > skylake ,avx ,2097167 ,0 ,3 ,97801.8 ,98638.1 > > > skylake ,avx ,2097183 ,3 ,0 ,98041.1 ,99287.6 > > > skylake ,avx ,2097215 ,3 ,7 ,96629.5 ,96521.9 > > > skylake ,avx ,2097279 ,9 ,5 ,98961.8 ,98909.8 > > > skylake ,avx ,4194311 ,0 ,64 ,194667.7 ,195377.1 > > > skylake ,avx ,4194319 ,0 ,3 ,194919.5 ,198576.2 > > > skylake ,avx ,4194335 ,3 ,0 ,192949.8 ,194584.7 > > > skylake ,avx ,4194367 ,3 ,7 ,189943.5 ,189177.9 > > > skylake ,avx ,4194431 ,9 ,5 ,192479.1 ,196494.2 > > > skylake ,avx ,8388615 ,0 ,64 ,588671.6 ,587215.4 > > > skylake ,avx ,8388623 ,0 ,3 ,581640.7 ,582812.5 > > > skylake ,avx ,8388639 ,3 ,0 ,549811.9 ,544697.6 > > > skylake ,avx ,8388671 ,3 ,7 ,591155.0 ,577951.8 > > > skylake ,avx ,8388735 ,9 ,5 ,547583.2 ,545133.3 > > > skylake ,avx ,16777223 ,0 ,64 ,1787503.0 ,1811146.0 > > > skylake ,avx ,16777231 ,0 ,3 ,1758671.0 ,1756343.0 > > > skylake ,avx ,16777247 ,3 ,0 
,1691781.0 ,1694661.0 > > > skylake ,avx ,16777279 ,3 ,7 ,1768150.0 ,1754785.0 > > > skylake ,avx ,16777343 ,9 ,5 ,1695179.0 ,1710794.0 > > > skylake ,sse2 ,4103 ,0 ,64 ,150.8 ,150.5 > > > skylake ,sse2 ,4111 ,0 ,3 ,156.8 ,158.4 > > > skylake ,sse2 ,4127 ,3 ,0 ,99.7 ,99.4 > > > skylake ,sse2 ,4159 ,3 ,7 ,154.8 ,154.5 > > > skylake ,sse2 ,4223 ,9 ,5 ,137.3 ,137.2 > > > skylake ,sse2 ,8199 ,0 ,64 ,284.8 ,285.5 > > > skylake ,sse2 ,8207 ,0 ,3 ,296.0 ,296.1 > > > skylake ,sse2 ,8223 ,3 ,0 ,168.0 ,168.2 > > > skylake ,sse2 ,8255 ,3 ,7 ,293.0 ,292.4 > > > skylake ,sse2 ,8319 ,9 ,5 ,251.3 ,250.7 > > > skylake ,sse2 ,16391 ,0 ,64 ,561.3 ,608.3 > > > skylake ,sse2 ,16399 ,0 ,3 ,571.0 ,574.8 > > > skylake ,sse2 ,16415 ,3 ,0 ,305.4 ,305.0 > > > skylake ,sse2 ,16447 ,3 ,7 ,563.2 ,565.0 > > > skylake ,sse2 ,16511 ,9 ,5 ,477.1 ,475.1 > > > skylake ,sse2 ,32775 ,0 ,64 ,1128.2 ,1131.7 > > > skylake ,sse2 ,32783 ,0 ,3 ,1126.6 ,1131.0 > > > skylake ,sse2 ,32799 ,3 ,0 ,587.6 ,590.8 > > > skylake ,sse2 ,32831 ,3 ,7 ,1130.6 ,1126.2 > > > skylake ,sse2 ,32895 ,9 ,5 ,957.6 ,953.0 > > > skylake ,sse2 ,65543 ,0 ,64 ,2718.9 ,2704.2 > > > skylake ,sse2 ,65551 ,0 ,3 ,2724.1 ,2725.0 > > > skylake ,sse2 ,65567 ,3 ,0 ,1888.4 ,1914.3 > > > skylake ,sse2 ,65599 ,3 ,7 ,2787.6 ,2748.7 > > > skylake ,sse2 ,65663 ,9 ,5 ,2400.5 ,2369.4 > > > skylake ,sse2 ,131079 ,0 ,64 ,5603.3 ,5654.9 > > > skylake ,sse2 ,131087 ,0 ,3 ,5939.3 ,5871.4 > > > skylake ,sse2 ,131103 ,3 ,0 ,4272.4 ,4190.0 > > > skylake ,sse2 ,131135 ,3 ,7 ,7601.4 ,7524.6 > > > skylake ,sse2 ,131199 ,9 ,5 ,7022.1 ,6864.7 > > > skylake ,sse2 ,262151 ,0 ,64 ,13736.2 ,14030.0 > > > skylake ,sse2 ,262159 ,0 ,3 ,12407.3 ,12334.1 > > > skylake ,sse2 ,262175 ,3 ,0 ,9661.1 ,9249.4 > > > skylake ,sse2 ,262207 ,3 ,7 ,12850.2 ,12351.6 > > > skylake ,sse2 ,262271 ,9 ,5 ,10792.6 ,10435.8 > > > skylake ,sse2 ,524295 ,0 ,64 ,27754.5 ,28177.7 > > > skylake ,sse2 ,524303 ,0 ,3 ,27766.2 ,28152.0 > > > skylake ,sse2 ,524319 ,3 ,0 ,24030.9 ,24438.3 > > > 
skylake ,sse2 ,524351 ,3 ,7 ,27787.5 ,27933.0 > > > skylake ,sse2 ,524415 ,9 ,5 ,24263.2 ,25249.1 > > > skylake ,sse2 ,1048583 ,0 ,64 ,56199.9 ,56039.8 > > > skylake ,sse2 ,1048591 ,0 ,3 ,56750.2 ,58889.7 > > > skylake ,sse2 ,1048607 ,3 ,0 ,56394.0 ,55115.3 > > > skylake ,sse2 ,1048639 ,3 ,7 ,57233.1 ,57473.8 > > > skylake ,sse2 ,1048703 ,9 ,5 ,56324.3 ,55917.9 > > > skylake ,sse2 ,2097159 ,0 ,64 ,113234.8 ,114346.4 > > > skylake ,sse2 ,2097167 ,0 ,3 ,114373.1 ,115522.5 > > > skylake ,sse2 ,2097183 ,3 ,0 ,108113.3 ,108513.3 > > > skylake ,sse2 ,2097215 ,3 ,7 ,116863.6 ,116549.9 > > > skylake ,sse2 ,2097279 ,9 ,5 ,108945.1 ,108843.7 > > > skylake ,sse2 ,4194311 ,0 ,64 ,230250.1 ,232350.0 > > > skylake ,sse2 ,4194319 ,0 ,3 ,231895.3 ,235055.6 > > > skylake ,sse2 ,4194335 ,3 ,0 ,218442.8 ,219199.8 > > > skylake ,sse2 ,4194367 ,3 ,7 ,242564.2 ,235587.7 > > > skylake ,sse2 ,4194431 ,9 ,5 ,224167.4 ,215261.8 > > > skylake ,sse2 ,8388615 ,0 ,64 ,679801.8 ,674832.0 > > > skylake ,sse2 ,8388623 ,0 ,3 ,684913.2 ,685238.7 > > > skylake ,sse2 ,8388639 ,3 ,0 ,644865.4 ,631388.6 > > > skylake ,sse2 ,8388671 ,3 ,7 ,698700.9 ,689316.1 > > > skylake ,sse2 ,8388735 ,9 ,5 ,644820.2 ,631366.8 > > > skylake ,sse2 ,16777223 ,0 ,64 ,1877984.0 ,1876437.0 > > > skylake ,sse2 ,16777231 ,0 ,3 ,1898086.0 ,1913053.0 > > > skylake ,sse2 ,16777247 ,3 ,0 ,1857018.0 ,1866949.0 > > > skylake ,sse2 ,16777279 ,3 ,7 ,1914905.0 ,1897134.0 > > > skylake ,sse2 ,16777343 ,9 ,5 ,1859937.0 ,1881939.0 > > > icelake ,avx512 ,4103 ,0 ,64 ,75.2 ,75.8 > > > icelake ,avx512 ,4111 ,0 ,3 ,56.9 ,56.4 > > > icelake ,avx512 ,4127 ,3 ,0 ,59.1 ,59.6 > > > icelake ,avx512 ,4159 ,3 ,7 ,50.7 ,51.3 > > > icelake ,avx512 ,4223 ,9 ,5 ,59.2 ,58.9 > > > icelake ,avx512 ,8199 ,0 ,64 ,67.8 ,63.9 > > > icelake ,avx512 ,8207 ,0 ,3 ,89.0 ,89.9 > > > icelake ,avx512 ,8223 ,3 ,0 ,90.2 ,90.1 > > > icelake ,avx512 ,8255 ,3 ,7 ,82.6 ,84.9 > > > icelake ,avx512 ,8319 ,9 ,5 ,91.5 ,92.8 > > > icelake ,avx512 ,16391 ,0 ,64 ,118.0 ,117.6 > > 
> icelake ,avx512 ,16399 ,0 ,3 ,156.5 ,157.0 > > > icelake ,avx512 ,16415 ,3 ,0 ,157.4 ,157.3 > > > icelake ,avx512 ,16447 ,3 ,7 ,151.0 ,151.6 > > > icelake ,avx512 ,16511 ,9 ,5 ,159.1 ,159.6 > > > icelake ,avx512 ,32775 ,0 ,64 ,231.8 ,230.8 > > > icelake ,avx512 ,32783 ,0 ,3 ,297.8 ,299.3 > > > icelake ,avx512 ,32799 ,3 ,0 ,299.1 ,299.0 > > > icelake ,avx512 ,32831 ,3 ,7 ,293.5 ,295.4 > > > icelake ,avx512 ,32895 ,9 ,5 ,300.3 ,302.5 > > > icelake ,avx512 ,65543 ,0 ,64 ,1473.4 ,1479.2 > > > icelake ,avx512 ,65551 ,0 ,3 ,1438.2 ,1445.3 > > > icelake ,avx512 ,65567 ,3 ,0 ,1450.3 ,1463.8 > > > icelake ,avx512 ,65599 ,3 ,7 ,1469.0 ,1473.8 > > > icelake ,avx512 ,65663 ,9 ,5 ,1480.0 ,1483.5 > > > icelake ,avx512 ,131079 ,0 ,64 ,3015.1 ,3037.5 > > > icelake ,avx512 ,131087 ,0 ,3 ,2952.3 ,2960.4 > > > icelake ,avx512 ,131103 ,3 ,0 ,2966.2 ,2964.4 > > > icelake ,avx512 ,131135 ,3 ,7 ,2961.6 ,3047.9 > > > icelake ,avx512 ,131199 ,9 ,5 ,2967.4 ,3183.8 > > > icelake ,avx512 ,262151 ,0 ,64 ,6206.0 ,6141.5 > > > icelake ,avx512 ,262159 ,0 ,3 ,5990.8 ,5959.2 > > > icelake ,avx512 ,262175 ,3 ,0 ,5976.7 ,5963.8 > > > icelake ,avx512 ,262207 ,3 ,7 ,5939.5 ,5924.3 > > > icelake ,avx512 ,262271 ,9 ,5 ,5944.6 ,5990.3 > > > icelake ,avx512 ,524295 ,0 ,64 ,14726.7 ,14307.0 > > > icelake ,avx512 ,524303 ,0 ,3 ,14344.2 ,14040.5 > > > icelake ,avx512 ,524319 ,3 ,0 ,14175.0 ,13862.2 > > > icelake ,avx512 ,524351 ,3 ,7 ,14261.4 ,13821.5 > > > icelake ,avx512 ,524415 ,9 ,5 ,14266.5 ,14064.7 > > > icelake ,avx512 ,1048583 ,0 ,64 ,35211.4 ,35414.6 > > > icelake ,avx512 ,1048591 ,0 ,3 ,35156.8 ,35591.2 > > > icelake ,avx512 ,1048607 ,3 ,0 ,35273.1 ,35503.3 > > > icelake ,avx512 ,1048639 ,3 ,7 ,35255.8 ,35725.0 > > > icelake ,avx512 ,1048703 ,9 ,5 ,35703.6 ,36289.9 > > > icelake ,avx512 ,2097159 ,0 ,64 ,72613.9 ,72063.2 > > > icelake ,avx512 ,2097167 ,0 ,3 ,72301.6 ,73504.2 > > > icelake ,avx512 ,2097183 ,3 ,0 ,73448.8 ,72133.6 > > > icelake ,avx512 ,2097215 ,3 ,7 ,73762.9 ,72825.8 > > > icelake 
,avx512 ,2097279 ,9 ,5 ,72097.3 ,72914.6 > > > icelake ,avx512 ,4194311 ,0 ,64 ,144793.4 ,144182.1 > > > icelake ,avx512 ,4194319 ,0 ,3 ,143710.3 ,145063.3 > > > icelake ,avx512 ,4194335 ,3 ,0 ,146722.1 ,144046.4 > > > icelake ,avx512 ,4194367 ,3 ,7 ,144267.0 ,144874.6 > > > icelake ,avx512 ,4194431 ,9 ,5 ,143808.2 ,144560.0 > > > icelake ,avx512 ,8388615 ,0 ,64 ,427993.4 ,424521.5 > > > icelake ,avx512 ,8388623 ,0 ,3 ,470267.1 ,473290.8 > > > icelake ,avx512 ,8388639 ,3 ,0 ,457179.7 ,461797.7 > > > icelake ,avx512 ,8388671 ,3 ,7 ,472507.9 ,481561.4 > > > icelake ,avx512 ,8388735 ,9 ,5 ,463611.9 ,467388.7 > > > icelake ,avx512 ,16777223 ,0 ,64 ,1490426.0 ,1526996.0 > > > icelake ,avx512 ,16777231 ,0 ,3 ,1516687.0 ,1517095.0 > > > icelake ,avx512 ,16777247 ,3 ,0 ,1497688.0 ,1512766.0 > > > icelake ,avx512 ,16777279 ,3 ,7 ,1512331.0 ,1524317.0 > > > icelake ,avx512 ,16777343 ,9 ,5 ,1498908.0 ,1500526.0 > > > icelake ,avx ,4103 ,0 ,64 ,50.2 ,63.7 > > > icelake ,avx ,4111 ,0 ,3 ,63.7 ,65.1 > > > icelake ,avx ,4127 ,3 ,0 ,68.2 ,69.4 > > > icelake ,avx ,4159 ,3 ,7 ,59.6 ,68.0 > > > icelake ,avx ,4223 ,9 ,5 ,68.2 ,66.8 > > > icelake ,avx ,8199 ,0 ,64 ,92.1 ,89.9 > > > icelake ,avx ,8207 ,0 ,3 ,119.7 ,118.3 > > > icelake ,avx ,8223 ,3 ,0 ,119.1 ,120.9 > > > icelake ,avx ,8255 ,3 ,7 ,122.9 ,123.7 > > > icelake ,avx ,8319 ,9 ,5 ,122.1 ,121.8 > > > icelake ,avx ,16391 ,0 ,64 ,162.7 ,158.0 > > > icelake ,avx ,16399 ,0 ,3 ,227.6 ,234.1 > > > icelake ,avx ,16415 ,3 ,0 ,230.8 ,232.7 > > > icelake ,avx ,16447 ,3 ,7 ,226.8 ,232.6 > > > icelake ,avx ,16511 ,9 ,5 ,233.4 ,233.8 > > > icelake ,avx ,32775 ,0 ,64 ,312.2 ,301.8 > > > icelake ,avx ,32783 ,0 ,3 ,449.7 ,450.0 > > > icelake ,avx ,32799 ,3 ,0 ,452.7 ,455.9 > > > icelake ,avx ,32831 ,3 ,7 ,449.8 ,458.0 > > > icelake ,avx ,32895 ,9 ,5 ,456.3 ,459.4 > > > icelake ,avx ,65543 ,0 ,64 ,1460.6 ,1463.9 > > > icelake ,avx ,65551 ,0 ,3 ,1462.0 ,1465.4 > > > icelake ,avx ,65567 ,3 ,0 ,1466.6 ,1480.4 > > > icelake ,avx ,65599 ,3 ,7 
,1488.0 ,1488.9 > > > icelake ,avx ,65663 ,9 ,5 ,1680.8 ,1499.5 > > > icelake ,avx ,131079 ,0 ,64 ,2988.5 ,3010.1 > > > icelake ,avx ,131087 ,0 ,3 ,2995.5 ,2996.4 > > > icelake ,avx ,131103 ,3 ,0 ,3006.2 ,3000.5 > > > icelake ,avx ,131135 ,3 ,7 ,3032.4 ,3073.7 > > > icelake ,avx ,131199 ,9 ,5 ,3010.4 ,3027.4 > > > icelake ,avx ,262151 ,0 ,64 ,6143.2 ,6079.1 > > > icelake ,avx ,262159 ,0 ,3 ,6085.1 ,6075.8 > > > icelake ,avx ,262175 ,3 ,0 ,6088.0 ,6064.9 > > > icelake ,avx ,262207 ,3 ,7 ,6018.7 ,6023.5 > > > icelake ,avx ,262271 ,9 ,5 ,6019.8 ,5959.2 > > > icelake ,avx ,524295 ,0 ,64 ,14464.2 ,14095.1 > > > icelake ,avx ,524303 ,0 ,3 ,14761.6 ,14050.2 > > > icelake ,avx ,524319 ,3 ,0 ,14534.1 ,14087.5 > > > icelake ,avx ,524351 ,3 ,7 ,14147.7 ,13903.8 > > > icelake ,avx ,524415 ,9 ,5 ,14157.0 ,13982.9 > > > icelake ,avx ,1048583 ,0 ,64 ,36599.0 ,37461.4 > > > icelake ,avx ,1048591 ,0 ,3 ,36717.8 ,37454.9 > > > icelake ,avx ,1048607 ,3 ,0 ,36821.2 ,37343.3 > > > icelake ,avx ,1048639 ,3 ,7 ,36958.0 ,37507.2 > > > icelake ,avx ,1048703 ,9 ,5 ,36869.2 ,37413.1 > > > icelake ,avx ,2097159 ,0 ,64 ,74765.8 ,75330.9 > > > icelake ,avx ,2097167 ,0 ,3 ,75175.4 ,74891.9 > > > icelake ,avx ,2097183 ,3 ,0 ,75451.4 ,74787.7 > > > icelake ,avx ,2097215 ,3 ,7 ,75394.8 ,75839.1 > > > icelake ,avx ,2097279 ,9 ,5 ,75099.2 ,75421.2 > > > icelake ,avx ,4194311 ,0 ,64 ,146809.6 ,146619.4 > > > icelake ,avx ,4194319 ,0 ,3 ,148866.4 ,149898.2 > > > icelake ,avx ,4194335 ,3 ,0 ,148719.7 ,150165.4 > > > icelake ,avx ,4194367 ,3 ,7 ,150600.1 ,150925.9 > > > icelake ,avx ,4194431 ,9 ,5 ,149457.3 ,150519.2 > > > icelake ,avx ,8388615 ,0 ,64 ,412709.8 ,423666.1 > > > icelake ,avx ,8388623 ,0 ,3 ,423717.4 ,424418.2 > > > icelake ,avx ,8388639 ,3 ,0 ,414387.5 ,413445.6 > > > icelake ,avx ,8388671 ,3 ,7 ,449010.7 ,417553.5 > > > icelake ,avx ,8388735 ,9 ,5 ,414128.6 ,411815.3 > > > icelake ,avx ,16777223 ,0 ,64 ,1490032.0 ,1510004.0 > > > icelake ,avx ,16777231 ,0 ,3 ,1379638.0 ,1422097.0 > > > 
icelake ,avx ,16777247 ,3 ,0 ,1418930.0 ,1367557.0 > > > icelake ,avx ,16777279 ,3 ,7 ,1515152.0 ,1500176.0 > > > icelake ,avx ,16777343 ,9 ,5 ,1344117.0 ,1411795.0 > > > icelake ,sse2 ,4103 ,0 ,64 ,113.2 ,114.6 > > > icelake ,sse2 ,4111 ,0 ,3 ,121.5 ,120.4 > > > icelake ,sse2 ,4127 ,3 ,0 ,1700.5 ,1771.5 > > > icelake ,sse2 ,4159 ,3 ,7 ,119.3 ,118.8 > > > icelake ,sse2 ,4223 ,9 ,5 ,1739.7 ,1735.2 > > > icelake ,sse2 ,8199 ,0 ,64 ,207.0 ,203.9 > > > icelake ,sse2 ,8207 ,0 ,3 ,225.5 ,220.8 > > > icelake ,sse2 ,8223 ,3 ,0 ,3444.3 ,3743.5 > > > icelake ,sse2 ,8255 ,3 ,7 ,219.9 ,216.8 > > > icelake ,sse2 ,8319 ,9 ,5 ,4117.1 ,3487.3 > > > icelake ,sse2 ,16391 ,0 ,64 ,397.1 ,394.3 > > > icelake ,sse2 ,16399 ,0 ,3 ,439.6 ,428.6 > > > icelake ,sse2 ,16415 ,3 ,0 ,6997.0 ,7031.2 > > > icelake ,sse2 ,16447 ,3 ,7 ,426.8 ,421.8 > > > icelake ,sse2 ,16511 ,9 ,5 ,7037.6 ,7038.3 > > > icelake ,sse2 ,32775 ,0 ,64 ,790.9 ,779.0 > > > icelake ,sse2 ,32783 ,0 ,3 ,863.1 ,849.6 > > > icelake ,sse2 ,32799 ,3 ,0 ,14043.0 ,14390.9 > > > icelake ,sse2 ,32831 ,3 ,7 ,841.6 ,833.1 > > > icelake ,sse2 ,32895 ,9 ,5 ,14277.6 ,14344.2 > > > icelake ,sse2 ,65543 ,0 ,64 ,1897.0 ,1897.3 > > > icelake ,sse2 ,65551 ,0 ,3 ,1927.1 ,1955.4 > > > icelake ,sse2 ,65567 ,3 ,0 ,28834.7 ,28727.8 > > > icelake ,sse2 ,65599 ,3 ,7 ,1961.4 ,1969.7 > > > icelake ,sse2 ,65663 ,9 ,5 ,28867.6 ,29019.8 > > > icelake ,sse2 ,131079 ,0 ,64 ,3879.3 ,3872.6 > > > icelake ,sse2 ,131087 ,0 ,3 ,3955.3 ,3990.7 > > > icelake ,sse2 ,131103 ,3 ,0 ,58001.8 ,60567.9 > > > icelake ,sse2 ,131135 ,3 ,7 ,3951.5 ,4002.6 > > > icelake ,sse2 ,131199 ,9 ,5 ,57886.7 ,58391.4 > > > icelake ,sse2 ,262151 ,0 ,64 ,7851.4 ,7894.7 > > > icelake ,sse2 ,262159 ,0 ,3 ,7947.5 ,8016.2 > > > icelake ,sse2 ,262175 ,3 ,0 ,115036.2 ,115968.6 > > > icelake ,sse2 ,262207 ,3 ,7 ,7883.9 ,7814.1 > > > icelake ,sse2 ,262271 ,9 ,5 ,113776.4 ,119733.6 > > > icelake ,sse2 ,524295 ,0 ,64 ,17198.1 ,16974.9 > > > icelake ,sse2 ,524303 ,0 ,3 ,17402.2 ,17096.3 > > > 
icelake ,sse2 ,524319 ,3 ,0 ,223980.4 ,225889.9 > > > icelake ,sse2 ,524351 ,3 ,7 ,17034.9 ,16910.3 > > > icelake ,sse2 ,524415 ,9 ,5 ,224027.7 ,224962.5 > > > icelake ,sse2 ,1048583 ,0 ,64 ,38822.3 ,39178.6 > > > icelake ,sse2 ,1048591 ,0 ,3 ,41686.7 ,40247.4 > > > icelake ,sse2 ,1048607 ,3 ,0 ,38814.8 ,39323.3 > > > icelake ,sse2 ,1048639 ,3 ,7 ,39568.3 ,41325.7 > > > icelake ,sse2 ,1048703 ,9 ,5 ,39354.2 ,39637.9 > > > icelake ,sse2 ,2097159 ,0 ,64 ,84074.7 ,84543.1 > > > icelake ,sse2 ,2097167 ,0 ,3 ,83665.7 ,82358.2 > > > icelake ,sse2 ,2097183 ,3 ,0 ,81817.8 ,79638.9 > > > icelake ,sse2 ,2097215 ,3 ,7 ,83649.1 ,83497.6 > > > icelake ,sse2 ,2097279 ,9 ,5 ,80287.6 ,79980.9 > > > icelake ,sse2 ,4194311 ,0 ,64 ,165409.8 ,168343.1 > > > icelake ,sse2 ,4194319 ,0 ,3 ,165216.7 ,177632.0 > > > icelake ,sse2 ,4194335 ,3 ,0 ,158718.7 ,160342.2 > > > icelake ,sse2 ,4194367 ,3 ,7 ,167944.9 ,167204.4 > > > icelake ,sse2 ,4194431 ,9 ,5 ,161530.1 ,164839.7 > > > icelake ,sse2 ,8388615 ,0 ,64 ,626504.3 ,629858.5 > > > icelake ,sse2 ,8388623 ,0 ,3 ,623969.5 ,631509.1 > > > icelake ,sse2 ,8388639 ,3 ,0 ,599366.7 ,600016.0 > > > icelake ,sse2 ,8388671 ,3 ,7 ,619964.2 ,619113.2 > > > icelake ,sse2 ,8388735 ,9 ,5 ,595338.1 ,604172.4 > > > icelake ,sse2 ,16777223 ,0 ,64 ,1709597.0 ,1725184.0 > > > icelake ,sse2 ,16777231 ,0 ,3 ,1725452.0 ,1719746.0 > > > icelake ,sse2 ,16777247 ,3 ,0 ,1614269.0 ,1607164.0 > > > icelake ,sse2 ,16777279 ,3 ,7 ,1705295.0 ,1733018.0 > > > icelake ,sse2 ,16777343 ,9 ,5 ,1604197.0 ,1595690.0 > > > > > > > > > .../multiarch/memmove-vec-unaligned-erms.S | 338 ++++++++++++++---- > > > 1 file changed, 265 insertions(+), 73 deletions(-) > > > > > > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > > > index 897a3d9762..5e4a071f16 100644 > > > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > > > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > > > @@ -35,7 
+35,16 @@ > > > __x86_rep_movsb_stop_threshold, then REP MOVSB will be used. > > > 7. If size >= __x86_shared_non_temporal_threshold and there is no > > > overlap between destination and source, use non-temporal store > > > - instead of aligned store. */ > > > + instead of aligned store copying from either 2 or 4 pages at > > > + once. > > > + 8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold > > > + and source and destination do not page alias, copy from 2 pages > > > + at once using non-temporal stores. Page aliasing in this case is > > > + considered true if destination's page alignment - sources' page > > > + alignment is less than 8 * VEC_SIZE. > > > + 9. If size >= 16 * __x86_shared_non_temporal_threshold or source > > > + and destination do page alias copy from 4 pages at once using > > > + non-temporal stores. */ > > > > > > #include <sysdep.h> > > > > > > @@ -67,6 +76,34 @@ > > > # endif > > > #endif > > > > > > +#ifndef PAGE_SIZE > > > +# define PAGE_SIZE 4096 > > > +#endif > > > + > > > +#if PAGE_SIZE != 4096 > > > +# error Unsupported PAGE_SIZE > > > +#endif > > > + > > > +#ifndef LOG_PAGE_SIZE > > > +# define LOG_PAGE_SIZE 12 > > > +#endif > > > + > > > +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE) > > > +# error Invalid LOG_PAGE_SIZE > > > +#endif > > > + > > > +/* Byte per page for large_memcpy inner loop. */ > > > +#if VEC_SIZE == 64 > > > +# define LARGE_LOAD_SIZE (VEC_SIZE * 2) > > > +#else > > > +# define LARGE_LOAD_SIZE (VEC_SIZE * 4) > > > +#endif > > > + > > > +/* Amount to shift rdx by to compare for memcpy_large_4x. */ > > > +#ifndef LOG_4X_MEMCPY_THRESH > > > +# define LOG_4X_MEMCPY_THRESH 4 > > > +#endif > > > + > > > /* Avoid short distance rep movsb only with non-SSE vector. */ > > > #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB > > > # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) > > > @@ -106,6 +143,28 @@ > > > # error Unsupported PREFETCH_SIZE! 
> > > #endif > > > > > > +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2) > > > +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ > > > + VMOVU (offset)base, vec0; \ > > > + VMOVU ((offset) + VEC_SIZE)base, vec1; > > > +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \ > > > + VMOVNT vec0, (offset)base; \ > > > + VMOVNT vec1, ((offset) + VEC_SIZE)base; > > > +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) > > > +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ > > > + VMOVU (offset)base, vec0; \ > > > + VMOVU ((offset) + VEC_SIZE)base, vec1; \ > > > + VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ > > > + VMOVU ((offset) + VEC_SIZE * 3)base, vec3; > > > +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ > > > + VMOVNT vec0, (offset)base; \ > > > + VMOVNT vec1, ((offset) + VEC_SIZE)base; \ > > > + VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ > > > + VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; > > > +#else > > > +# error Invalid LARGE_LOAD_SIZE > > > +#endif > > > + > > > #ifndef SECTION > > > # error SECTION is not defined! > > > #endif > > > @@ -393,6 +452,15 @@ L(last_4x_vec): > > > VZEROUPPER_RETURN > > > > > > L(more_8x_vec): > > > + /* Check if non-temporal move candidate. */ > > > +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > > + /* Check non-temporal store threshold. */ > > > + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > > > + ja L(large_memcpy_2x) > > > +#endif > > > + /* Entry if rdx is greater than non-temporal threshold but there > > > + is overlap. */ > > > +L(more_8x_vec_check): > > > cmpq %rsi, %rdi > > > ja L(more_8x_vec_backward) > > > /* Source == destination is less common. */ > > > @@ -419,24 +487,21 @@ L(more_8x_vec): > > > subq %r8, %rdi > > > /* Adjust length. */ > > > addq %r8, %rdx > > > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > > - /* Check non-temporal store threshold. 
*/ > > > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > > > - ja L(large_forward) > > > -#endif > > > + > > > + .p2align 4 > > > L(loop_4x_vec_forward): > > > /* Copy 4 * VEC a time forward. */ > > > VMOVU (%rsi), %VEC(0) > > > VMOVU VEC_SIZE(%rsi), %VEC(1) > > > VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > > > VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > > > - addq $(VEC_SIZE * 4), %rsi > > > - subq $(VEC_SIZE * 4), %rdx > > > + subq $-(VEC_SIZE * 4), %rsi > > > + addq $-(VEC_SIZE * 4), %rdx > > > VMOVA %VEC(0), (%rdi) > > > VMOVA %VEC(1), VEC_SIZE(%rdi) > > > VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > > > VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > > > - addq $(VEC_SIZE * 4), %rdi > > > + subq $-(VEC_SIZE * 4), %rdi > > > cmpq $(VEC_SIZE * 4), %rdx > > > ja L(loop_4x_vec_forward) > > > /* Store the last 4 * VEC. */ > > > @@ -470,24 +535,21 @@ L(more_8x_vec_backward): > > > subq %r8, %r9 > > > /* Adjust length. */ > > > subq %r8, %rdx > > > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > > - /* Check non-temporal store threshold. */ > > > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > > > - ja L(large_backward) > > > -#endif > > > + > > > + .p2align 4 > > > L(loop_4x_vec_backward): > > > /* Copy 4 * VEC a time backward. */ > > > VMOVU (%rcx), %VEC(0) > > > VMOVU -VEC_SIZE(%rcx), %VEC(1) > > > VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) > > > VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) > > > - subq $(VEC_SIZE * 4), %rcx > > > - subq $(VEC_SIZE * 4), %rdx > > > + addq $-(VEC_SIZE * 4), %rcx > > > + addq $-(VEC_SIZE * 4), %rdx > > > VMOVA %VEC(0), (%r9) > > > VMOVA %VEC(1), -VEC_SIZE(%r9) > > > VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) > > > VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) > > > - subq $(VEC_SIZE * 4), %r9 > > > + addq $-(VEC_SIZE * 4), %r9 > > > cmpq $(VEC_SIZE * 4), %rdx > > > ja L(loop_4x_vec_backward) > > > /* Store the first 4 * VEC. 
*/ > > > @@ -500,72 +562,202 @@ L(loop_4x_vec_backward): > > > VZEROUPPER_RETURN > > > > > > #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > > -L(large_forward): > > > + .p2align 4 > > > +L(large_memcpy_2x): > > > + /* Compute absolute value of difference between source and > > > + destination. */ > > > + movq %rdi, %r9 > > > + subq %rsi, %r9 > > > + movq %r9, %r8 > > > + leaq -1(%r9), %rcx > > > + sarq $63, %r8 > > > + xorq %r8, %r9 > > > + subq %r8, %r9 > > > /* Don't use non-temporal store if there is overlap between > > > - destination and source since destination may be in cache > > > - when source is loaded. */ > > > - leaq (%rdi, %rdx), %r10 > > > - cmpq %r10, %rsi > > > - jb L(loop_4x_vec_forward) > > > -L(loop_large_forward): > > > + destination and source since destination may be in cache when > > > + source is loaded. */ > > > + cmpq %r9, %rdx > > > + ja L(more_8x_vec_check) > > > + > > > + /* Cache align destination. First store the first 64 bytes then > > > + adjust alignments. */ > > > + VMOVU (%rsi), %VEC(8) > > > +#if VEC_SIZE < 64 > > > + VMOVU VEC_SIZE(%rsi), %VEC(9) > > > +#if VEC_SIZE < 32 > > > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) > > > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) > > > +#endif > > > +#endif > > > + VMOVU %VEC(8), (%rdi) > > > +#if VEC_SIZE < 64 > > > + VMOVU %VEC(9), VEC_SIZE(%rdi) > > > +#if VEC_SIZE < 32 > > > + VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) > > > + VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) > > > +#endif > > > +#endif > > > + /* Adjust source, destination, and size. */ > > > + movq %rdi, %r8 > > > + andq $63, %r8 > > > + /* Get the negative of offset for alignment. */ > > > + subq $64, %r8 > > > + /* Adjust source. */ > > > + subq %r8, %rsi > > > + /* Adjust destination which should be aligned now. */ > > > + subq %r8, %rdi > > > + /* Adjust length. */ > > > + addq %r8, %rdx > > > + > > > + /* Test if source and destination addresses will alias. 
If they do > > > + the larger pipeline in large_memcpy_4x alleviated the > > > + performance drop. */ > > > + testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx > > > + jz L(large_memcpy_4x) > > > + > > > + movq %rdx, %r10 > > > + shrq $LOG_4X_MEMCPY_THRESH, %r10 > > > + cmp __x86_shared_non_temporal_threshold(%rip), %r10 > > > + jae L(large_memcpy_4x) > > > + > > > + /* edx will store remainder size for copying tail. */ > > > + andl $(PAGE_SIZE * 2 - 1), %edx > > > + /* r10 stores outer loop counter. */ > > > + shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 > > > + /* Copy 4x VEC at a time from 2 pages. */ > > > + .p2align 4 > > > +L(loop_large_memcpy_2x_outer): > > > + /* ecx stores inner loop counter. */ > > > + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx > > > +L(loop_large_memcpy_2x_inner): > > > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) > > > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) > > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) > > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) > > > + /* Load vectors from rsi. */ > > > + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > > + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > > + subq $-LARGE_LOAD_SIZE, %rsi > > > + /* Non-temporal store vectors to rdi. */ > > > + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > > + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > > + subq $-LARGE_LOAD_SIZE, %rdi > > > + decl %ecx > > > + jnz L(loop_large_memcpy_2x_inner) > > > + addq $PAGE_SIZE, %rdi > > > + addq $PAGE_SIZE, %rsi > > > + decq %r10 > > > + jne L(loop_large_memcpy_2x_outer) > > > + sfence > > > + > > > + /* Check if only last 4 loads are needed. */ > > > + cmpl $(VEC_SIZE * 4), %edx > > > + jbe L(large_memcpy_2x_end) > > > + > > > + /* Handle the last 2 * PAGE_SIZE bytes. 
*/ > > > +L(loop_large_memcpy_2x_tail): > > > /* Copy 4 * VEC a time forward with non-temporal stores. */ > > > - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) > > > - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) > > > + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) > > > + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) > > > VMOVU (%rsi), %VEC(0) > > > VMOVU VEC_SIZE(%rsi), %VEC(1) > > > VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > > > VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > > > - addq $PREFETCHED_LOAD_SIZE, %rsi > > > - subq $PREFETCHED_LOAD_SIZE, %rdx > > > - VMOVNT %VEC(0), (%rdi) > > > - VMOVNT %VEC(1), VEC_SIZE(%rdi) > > > - VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) > > > - VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) > > > - addq $PREFETCHED_LOAD_SIZE, %rdi > > > - cmpq $PREFETCHED_LOAD_SIZE, %rdx > > > - ja L(loop_large_forward) > > > - sfence > > > + subq $-(VEC_SIZE * 4), %rsi > > > + addl $-(VEC_SIZE * 4), %edx > > > + VMOVA %VEC(0), (%rdi) > > > + VMOVA %VEC(1), VEC_SIZE(%rdi) > > > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > > > + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > > > + subq $-(VEC_SIZE * 4), %rdi > > > + cmpl $(VEC_SIZE * 4), %edx > > > + ja L(loop_large_memcpy_2x_tail) > > > + > > > +L(large_memcpy_2x_end): > > > /* Store the last 4 * VEC. */ > > > - VMOVU %VEC(5), (%rcx) > > > - VMOVU %VEC(6), -VEC_SIZE(%rcx) > > > - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) > > > - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) > > > - /* Store the first VEC. 
*/ > > > - VMOVU %VEC(4), (%r11) > > > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) > > > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) > > > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) > > > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) > > > + > > > + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) > > > + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) > > > + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) > > > + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) > > > VZEROUPPER_RETURN > > > > > > -L(large_backward): > > > - /* Don't use non-temporal store if there is overlap between > > > - destination and source since destination may be in cache > > > - when source is loaded. */ > > > - leaq (%rcx, %rdx), %r10 > > > - cmpq %r10, %r9 > > > - jb L(loop_4x_vec_backward) > > > -L(loop_large_backward): > > > - /* Copy 4 * VEC a time backward with non-temporal stores. */ > > > - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) > > > - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) > > > - VMOVU (%rcx), %VEC(0) > > > - VMOVU -VEC_SIZE(%rcx), %VEC(1) > > > - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) > > > - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) > > > - subq $PREFETCHED_LOAD_SIZE, %rcx > > > - subq $PREFETCHED_LOAD_SIZE, %rdx > > > - VMOVNT %VEC(0), (%r9) > > > - VMOVNT %VEC(1), -VEC_SIZE(%r9) > > > - VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) > > > - VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) > > > - subq $PREFETCHED_LOAD_SIZE, %r9 > > > - cmpq $PREFETCHED_LOAD_SIZE, %rdx > > > - ja L(loop_large_backward) > > > + .p2align 4 > > > +L(large_memcpy_4x): > > > + movq %rdx, %r10 > > > + /* edx will store remainder size for copying tail. */ > > > + andl $(PAGE_SIZE * 4 - 1), %edx > > > + /* r10 stores outer loop counter. */ > > > + shrq $(LOG_PAGE_SIZE + 2), %r10 > > > + /* Copy 4x VEC at a time from 4 pages. */ > > > + .p2align 4 > > > +L(loop_large_memcpy_4x_outer): > > > + /* ecx stores inner loop counter. 
*/ > > > + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx > > > +L(loop_large_memcpy_4x_inner): > > > + /* Only one prefetch set per page as doing 4 pages give more time > > > + for prefetcher to keep up. */ > > > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) > > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) > > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) > > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) > > > + /* Load vectors from rsi. */ > > > + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > > + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > > + LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) > > > + LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) > > > + subq $-LARGE_LOAD_SIZE, %rsi > > > + /* Non-temporal store vectors to rdi. */ > > > + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > > + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > > + STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) > > > + STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) > > > + subq $-LARGE_LOAD_SIZE, %rdi > > > + decl %ecx > > > + jnz L(loop_large_memcpy_4x_inner) > > > + addq $(PAGE_SIZE * 3), %rdi > > > + addq $(PAGE_SIZE * 3), %rsi > > > + decq %r10 > > > + jne L(loop_large_memcpy_4x_outer) > > > sfence > > > - /* Store the first 4 * VEC. */ > > > - VMOVU %VEC(4), (%rdi) > > > - VMOVU %VEC(5), VEC_SIZE(%rdi) > > > - VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) > > > - VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) > > > - /* Store the last VEC. */ > > > - VMOVU %VEC(8), (%r11) > > > + /* Check if only last 4 loads are needed. */ > > > + cmpl $(VEC_SIZE * 4), %edx > > > + jbe L(large_memcpy_4x_end) > > > + > > > + /* Handle the last 4 * PAGE_SIZE bytes. 
*/ > > > +L(loop_large_memcpy_4x_tail): > > > + /* Copy 4 * VEC a time forward with non-temporal stores. */ > > > + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) > > > + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) > > > + VMOVU (%rsi), %VEC(0) > > > + VMOVU VEC_SIZE(%rsi), %VEC(1) > > > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > > > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > > > + subq $-(VEC_SIZE * 4), %rsi > > > + addl $-(VEC_SIZE * 4), %edx > > > + VMOVA %VEC(0), (%rdi) > > > + VMOVA %VEC(1), VEC_SIZE(%rdi) > > > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > > > + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > > > + subq $-(VEC_SIZE * 4), %rdi > > > + cmpl $(VEC_SIZE * 4), %edx > > > + ja L(loop_large_memcpy_4x_tail) > > > + > > > +L(large_memcpy_4x_end): > > > + /* Store the last 4 * VEC. */ > > > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) > > > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) > > > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) > > > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) > > > + > > > + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) > > > + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) > > > + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) > > > + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) > > > VZEROUPPER_RETURN > > > #endif > > > END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) > > > -- > > > 2.29.2 > > > > > > > LGTM. Please commit it. > > > > Thanks. > > > > > > H.J.
On Fri, Apr 16, 2021 at 1:05 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Fri, Apr 16, 2021 at 9:35 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > LGTM. Please commit it. > > > > Are you saying that to me or someone else? If it's to me what do you > > mean, is the patch not enough? > > I will commit it for you. Thanks! Are you planning on accepting the bench / testing changes as well? > > > > Thanks. > > > > On Fri, Apr 16, 2021 at 8:59 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > On Sat, Apr 03, 2021 at 04:12:15AM -0400, Noah Goldstein wrote: > > > > From: noah <goldstein.w.n@gmail.com> > > > > > > > > No Bug. This commit updates the large memcpy case (no overlap). The > > > > update is to perform memcpy on either 2 or 4 contiguous pages at > > > > once. This 1) helps to alleviate the effects of false memory aliasing > > > > when destination and source have a close 4k alignment and 2) In most > > > > cases and for most DRAM units is a modestly more efficient access > > > > pattern. These changes are a clear performance improvement for > > > > VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy, > > > > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all > > > > pass. > > > > > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > > > --- > > > > Issue was alignment related AFAICT. Added `.p2align 4` in front of the > > > > loops and no longer see any meaningful regression. > > > > > > > > Also added back the temporal stores for the tail. Saw a regression > > > > when doing these tests. > > > > > > > > Two tables below for skylake and icelake numbers for the areas around > > > > where you saw the regression. Below is all data from the tests. > > > > > > > > N = 10. 
> > > > > > > > Skylake > > > > Len ,align1 ,align2 ,new mean ,old mean > > > > 4103 ,0 ,64 ,84.5 ,88.6 > > > > 4111 ,0 ,3 ,99.0 ,99.9 > > > > 4127 ,3 ,0 ,102.1 ,102.3 > > > > 4159 ,3 ,7 ,88.7 ,90.9 > > > > 4223 ,9 ,5 ,88.1 ,87.4 > > > > 8199 ,0 ,64 ,146.7 ,150.2 > > > > 8207 ,0 ,3 ,167.9 ,168.5 > > > > 8223 ,3 ,0 ,168.5 ,168.1 > > > > 8255 ,3 ,7 ,157.0 ,159.2 > > > > 8319 ,9 ,5 ,155.5 ,155.7 > > > > 16391 ,0 ,64 ,286.2 ,288.8 > > > > 16399 ,0 ,3 ,307.0 ,308.7 > > > > 16415 ,3 ,0 ,307.4 ,307.6 > > > > 16447 ,3 ,7 ,294.6 ,295.5 > > > > 16511 ,9 ,5 ,291.5 ,462.1 > > > > 32775 ,0 ,64 ,603.4 ,601.5 > > > > 32783 ,0 ,3 ,604.8 ,606.4 > > > > 32799 ,3 ,0 ,603.0 ,604.1 > > > > 32831 ,3 ,7 ,600.2 ,737.3 > > > > 32895 ,9 ,5 ,604.4 ,599.5 > > > > 65543 ,0 ,64 ,1873.5 ,1854.3 > > > > 65551 ,0 ,3 ,1862.9 ,1846.6 > > > > 65567 ,3 ,0 ,1885.5 ,1966.0 > > > > 65599 ,3 ,7 ,1833.2 ,1833.1 > > > > 65663 ,9 ,5 ,1884.9 ,1887.4 > > > > 131079 ,0 ,64 ,3944.3 ,3949.4 > > > > 131087 ,0 ,3 ,3927.3 ,3913.3 > > > > 131103 ,3 ,0 ,4415.8 ,4169.4 > > > > 131135 ,3 ,7 ,4224.5 ,4157.6 > > > > 131199 ,9 ,5 ,5974.0 ,4983.8 > > > > 262151 ,0 ,64 ,11050.2 ,10620.6 > > > > 262159 ,0 ,3 ,9932.8 ,10037.3 > > > > 262175 ,3 ,0 ,10188.8 ,9206.6 > > > > 262207 ,3 ,7 ,9633.3 ,9216.7 > > > > 262271 ,9 ,5 ,9732.7 ,9345.3 > > > > 524295 ,0 ,64 ,24823.9 ,24880.7 > > > > 524303 ,0 ,3 ,24514.0 ,24556.7 > > > > 524319 ,3 ,0 ,23974.4 ,24219.9 > > > > 524351 ,3 ,7 ,24159.7 ,24207.0 > > > > 524415 ,9 ,5 ,23946.5 ,24142.8 > > > > > > > > Icelake: > > > > Len ,align1 ,align2 ,new mean ,old mean > > > > 4103 ,0 ,64 ,50.2 ,63.7 > > > > 4111 ,0 ,3 ,63.7 ,65.1 > > > > 4127 ,3 ,0 ,68.2 ,69.4 > > > > 4159 ,3 ,7 ,59.6 ,68.0 > > > > 4223 ,9 ,5 ,68.2 ,66.8 > > > > 8199 ,0 ,64 ,92.1 ,89.9 > > > > 8207 ,0 ,3 ,119.7 ,118.3 > > > > 8223 ,3 ,0 ,119.1 ,120.9 > > > > 8255 ,3 ,7 ,122.9 ,123.7 > > > > 8319 ,9 ,5 ,122.1 ,121.8 > > > > 16391 ,0 ,64 ,162.7 ,158.0 > > > > 16399 ,0 ,3 ,227.6 ,234.1 > > > > 16415 ,3 ,0 ,230.8 ,232.7 > > > > 
16447 ,3 ,7 ,226.8 ,232.6 > > > > 16511 ,9 ,5 ,233.4 ,233.8 > > > > 32775 ,0 ,64 ,312.2 ,301.8 > > > > 32783 ,0 ,3 ,449.7 ,450.0 > > > > 32799 ,3 ,0 ,452.7 ,455.9 > > > > 32831 ,3 ,7 ,449.8 ,458.0 > > > > 32895 ,9 ,5 ,456.3 ,459.4 > > > > 65543 ,0 ,64 ,1460.6 ,1463.9 > > > > 65551 ,0 ,3 ,1462.0 ,1465.4 > > > > 65567 ,3 ,0 ,1466.6 ,1480.4 > > > > 65599 ,3 ,7 ,1488.0 ,1488.9 > > > > 65663 ,9 ,5 ,1680.8 ,1499.5 > > > > 131079 ,0 ,64 ,2988.5 ,3010.1 > > > > 131087 ,0 ,3 ,2995.5 ,2996.4 > > > > 131103 ,3 ,0 ,3006.2 ,3000.5 > > > > 131135 ,3 ,7 ,3032.4 ,3073.7 > > > > 131199 ,9 ,5 ,3010.4 ,3027.4 > > > > 262151 ,0 ,64 ,6143.2 ,6079.1 > > > > 262159 ,0 ,3 ,6085.1 ,6075.8 > > > > 262175 ,3 ,0 ,6088.0 ,6064.9 > > > > 262207 ,3 ,7 ,6018.7 ,6023.5 > > > > 262271 ,9 ,5 ,6019.8 ,5959.2 > > > > 524295 ,0 ,64 ,14464.2 ,14095.1 > > > > 524303 ,0 ,3 ,14761.6 ,14050.2 > > > > 524319 ,3 ,0 ,14534.1 ,14087.5 > > > > 524351 ,3 ,7 ,14147.7 ,13903.8 > > > > 524415 ,9 ,5 ,14157.0 ,13982.9 > > > > > > > > > > > > > > > > cpu ,version ,Len ,align1 ,align2 ,new mean ,old mean > > > > skylake ,avx ,4103 ,0 ,64 ,84.5 ,88.6 > > > > skylake ,avx ,4111 ,0 ,3 ,99.0 ,99.9 > > > > skylake ,avx ,4127 ,3 ,0 ,102.1 ,102.3 > > > > skylake ,avx ,4159 ,3 ,7 ,88.7 ,90.9 > > > > skylake ,avx ,4223 ,9 ,5 ,88.1 ,87.4 > > > > skylake ,avx ,8199 ,0 ,64 ,146.7 ,150.2 > > > > skylake ,avx ,8207 ,0 ,3 ,167.9 ,168.5 > > > > skylake ,avx ,8223 ,3 ,0 ,168.5 ,168.1 > > > > skylake ,avx ,8255 ,3 ,7 ,157.0 ,159.2 > > > > skylake ,avx ,8319 ,9 ,5 ,155.5 ,155.7 > > > > skylake ,avx ,16391 ,0 ,64 ,286.2 ,288.8 > > > > skylake ,avx ,16399 ,0 ,3 ,307.0 ,308.7 > > > > skylake ,avx ,16415 ,3 ,0 ,307.4 ,307.6 > > > > skylake ,avx ,16447 ,3 ,7 ,294.6 ,295.5 > > > > skylake ,avx ,16511 ,9 ,5 ,291.5 ,462.1 > > > > skylake ,avx ,32775 ,0 ,64 ,603.4 ,601.5 > > > > skylake ,avx ,32783 ,0 ,3 ,604.8 ,606.4 > > > > skylake ,avx ,32799 ,3 ,0 ,603.0 ,604.1 > > > > skylake ,avx ,32831 ,3 ,7 ,600.2 ,737.3 > > > > skylake ,avx ,32895 ,9 ,5 
,604.4 ,599.5 > > > > skylake ,avx ,65543 ,0 ,64 ,1873.5 ,1854.3 > > > > skylake ,avx ,65551 ,0 ,3 ,1862.9 ,1846.6 > > > > skylake ,avx ,65567 ,3 ,0 ,1885.5 ,1966.0 > > > > skylake ,avx ,65599 ,3 ,7 ,1833.2 ,1833.1 > > > > skylake ,avx ,65663 ,9 ,5 ,1884.9 ,1887.4 > > > > skylake ,avx ,131079 ,0 ,64 ,3944.3 ,3949.4 > > > > skylake ,avx ,131087 ,0 ,3 ,3927.3 ,3913.3 > > > > skylake ,avx ,131103 ,3 ,0 ,4415.8 ,4169.4 > > > > skylake ,avx ,131135 ,3 ,7 ,4224.5 ,4157.6 > > > > skylake ,avx ,131199 ,9 ,5 ,5974.0 ,4983.8 > > > > skylake ,avx ,262151 ,0 ,64 ,11050.2 ,10620.6 > > > > skylake ,avx ,262159 ,0 ,3 ,9932.8 ,10037.3 > > > > skylake ,avx ,262175 ,3 ,0 ,10188.8 ,9206.6 > > > > skylake ,avx ,262207 ,3 ,7 ,9633.3 ,9216.7 > > > > skylake ,avx ,262271 ,9 ,5 ,9732.7 ,9345.3 > > > > skylake ,avx ,524295 ,0 ,64 ,24823.9 ,24880.7 > > > > skylake ,avx ,524303 ,0 ,3 ,24514.0 ,24556.7 > > > > skylake ,avx ,524319 ,3 ,0 ,23974.4 ,24219.9 > > > > skylake ,avx ,524351 ,3 ,7 ,24159.7 ,24207.0 > > > > skylake ,avx ,524415 ,9 ,5 ,23946.5 ,24142.8 > > > > skylake ,avx ,1048583 ,0 ,64 ,49163.9 ,49454.6 > > > > skylake ,avx ,1048591 ,0 ,3 ,49879.3 ,49400.8 > > > > skylake ,avx ,1048607 ,3 ,0 ,49738.0 ,48864.6 > > > > skylake ,avx ,1048639 ,3 ,7 ,48804.0 ,47588.5 > > > > skylake ,avx ,1048703 ,9 ,5 ,49629.4 ,49796.3 > > > > skylake ,avx ,2097159 ,0 ,64 ,98271.7 ,96330.6 > > > > skylake ,avx ,2097167 ,0 ,3 ,97801.8 ,98638.1 > > > > skylake ,avx ,2097183 ,3 ,0 ,98041.1 ,99287.6 > > > > skylake ,avx ,2097215 ,3 ,7 ,96629.5 ,96521.9 > > > > skylake ,avx ,2097279 ,9 ,5 ,98961.8 ,98909.8 > > > > skylake ,avx ,4194311 ,0 ,64 ,194667.7 ,195377.1 > > > > skylake ,avx ,4194319 ,0 ,3 ,194919.5 ,198576.2 > > > > skylake ,avx ,4194335 ,3 ,0 ,192949.8 ,194584.7 > > > > skylake ,avx ,4194367 ,3 ,7 ,189943.5 ,189177.9 > > > > skylake ,avx ,4194431 ,9 ,5 ,192479.1 ,196494.2 > > > > skylake ,avx ,8388615 ,0 ,64 ,588671.6 ,587215.4 > > > > skylake ,avx ,8388623 ,0 ,3 ,581640.7 ,582812.5 > > > > skylake 
,avx ,8388639 ,3 ,0 ,549811.9 ,544697.6 > > > > skylake ,avx ,8388671 ,3 ,7 ,591155.0 ,577951.8 > > > > skylake ,avx ,8388735 ,9 ,5 ,547583.2 ,545133.3 > > > > skylake ,avx ,16777223 ,0 ,64 ,1787503.0 ,1811146.0 > > > > skylake ,avx ,16777231 ,0 ,3 ,1758671.0 ,1756343.0 > > > > skylake ,avx ,16777247 ,3 ,0 ,1691781.0 ,1694661.0 > > > > skylake ,avx ,16777279 ,3 ,7 ,1768150.0 ,1754785.0 > > > > skylake ,avx ,16777343 ,9 ,5 ,1695179.0 ,1710794.0 > > > > skylake ,sse2 ,4103 ,0 ,64 ,150.8 ,150.5 > > > > skylake ,sse2 ,4111 ,0 ,3 ,156.8 ,158.4 > > > > skylake ,sse2 ,4127 ,3 ,0 ,99.7 ,99.4 > > > > skylake ,sse2 ,4159 ,3 ,7 ,154.8 ,154.5 > > > > skylake ,sse2 ,4223 ,9 ,5 ,137.3 ,137.2 > > > > skylake ,sse2 ,8199 ,0 ,64 ,284.8 ,285.5 > > > > skylake ,sse2 ,8207 ,0 ,3 ,296.0 ,296.1 > > > > skylake ,sse2 ,8223 ,3 ,0 ,168.0 ,168.2 > > > > skylake ,sse2 ,8255 ,3 ,7 ,293.0 ,292.4 > > > > skylake ,sse2 ,8319 ,9 ,5 ,251.3 ,250.7 > > > > skylake ,sse2 ,16391 ,0 ,64 ,561.3 ,608.3 > > > > skylake ,sse2 ,16399 ,0 ,3 ,571.0 ,574.8 > > > > skylake ,sse2 ,16415 ,3 ,0 ,305.4 ,305.0 > > > > skylake ,sse2 ,16447 ,3 ,7 ,563.2 ,565.0 > > > > skylake ,sse2 ,16511 ,9 ,5 ,477.1 ,475.1 > > > > skylake ,sse2 ,32775 ,0 ,64 ,1128.2 ,1131.7 > > > > skylake ,sse2 ,32783 ,0 ,3 ,1126.6 ,1131.0 > > > > skylake ,sse2 ,32799 ,3 ,0 ,587.6 ,590.8 > > > > skylake ,sse2 ,32831 ,3 ,7 ,1130.6 ,1126.2 > > > > skylake ,sse2 ,32895 ,9 ,5 ,957.6 ,953.0 > > > > skylake ,sse2 ,65543 ,0 ,64 ,2718.9 ,2704.2 > > > > skylake ,sse2 ,65551 ,0 ,3 ,2724.1 ,2725.0 > > > > skylake ,sse2 ,65567 ,3 ,0 ,1888.4 ,1914.3 > > > > skylake ,sse2 ,65599 ,3 ,7 ,2787.6 ,2748.7 > > > > skylake ,sse2 ,65663 ,9 ,5 ,2400.5 ,2369.4 > > > > skylake ,sse2 ,131079 ,0 ,64 ,5603.3 ,5654.9 > > > > skylake ,sse2 ,131087 ,0 ,3 ,5939.3 ,5871.4 > > > > skylake ,sse2 ,131103 ,3 ,0 ,4272.4 ,4190.0 > > > > skylake ,sse2 ,131135 ,3 ,7 ,7601.4 ,7524.6 > > > > skylake ,sse2 ,131199 ,9 ,5 ,7022.1 ,6864.7 > > > > skylake ,sse2 ,262151 ,0 ,64 ,13736.2 ,14030.0 > 
> > > skylake ,sse2 ,262159 ,0 ,3 ,12407.3 ,12334.1 > > > > skylake ,sse2 ,262175 ,3 ,0 ,9661.1 ,9249.4 > > > > skylake ,sse2 ,262207 ,3 ,7 ,12850.2 ,12351.6 > > > > skylake ,sse2 ,262271 ,9 ,5 ,10792.6 ,10435.8 > > > > skylake ,sse2 ,524295 ,0 ,64 ,27754.5 ,28177.7 > > > > skylake ,sse2 ,524303 ,0 ,3 ,27766.2 ,28152.0 > > > > skylake ,sse2 ,524319 ,3 ,0 ,24030.9 ,24438.3 > > > > skylake ,sse2 ,524351 ,3 ,7 ,27787.5 ,27933.0 > > > > skylake ,sse2 ,524415 ,9 ,5 ,24263.2 ,25249.1 > > > > skylake ,sse2 ,1048583 ,0 ,64 ,56199.9 ,56039.8 > > > > skylake ,sse2 ,1048591 ,0 ,3 ,56750.2 ,58889.7 > > > > skylake ,sse2 ,1048607 ,3 ,0 ,56394.0 ,55115.3 > > > > skylake ,sse2 ,1048639 ,3 ,7 ,57233.1 ,57473.8 > > > > skylake ,sse2 ,1048703 ,9 ,5 ,56324.3 ,55917.9 > > > > skylake ,sse2 ,2097159 ,0 ,64 ,113234.8 ,114346.4 > > > > skylake ,sse2 ,2097167 ,0 ,3 ,114373.1 ,115522.5 > > > > skylake ,sse2 ,2097183 ,3 ,0 ,108113.3 ,108513.3 > > > > skylake ,sse2 ,2097215 ,3 ,7 ,116863.6 ,116549.9 > > > > skylake ,sse2 ,2097279 ,9 ,5 ,108945.1 ,108843.7 > > > > skylake ,sse2 ,4194311 ,0 ,64 ,230250.1 ,232350.0 > > > > skylake ,sse2 ,4194319 ,0 ,3 ,231895.3 ,235055.6 > > > > skylake ,sse2 ,4194335 ,3 ,0 ,218442.8 ,219199.8 > > > > skylake ,sse2 ,4194367 ,3 ,7 ,242564.2 ,235587.7 > > > > skylake ,sse2 ,4194431 ,9 ,5 ,224167.4 ,215261.8 > > > > skylake ,sse2 ,8388615 ,0 ,64 ,679801.8 ,674832.0 > > > > skylake ,sse2 ,8388623 ,0 ,3 ,684913.2 ,685238.7 > > > > skylake ,sse2 ,8388639 ,3 ,0 ,644865.4 ,631388.6 > > > > skylake ,sse2 ,8388671 ,3 ,7 ,698700.9 ,689316.1 > > > > skylake ,sse2 ,8388735 ,9 ,5 ,644820.2 ,631366.8 > > > > skylake ,sse2 ,16777223 ,0 ,64 ,1877984.0 ,1876437.0 > > > > skylake ,sse2 ,16777231 ,0 ,3 ,1898086.0 ,1913053.0 > > > > skylake ,sse2 ,16777247 ,3 ,0 ,1857018.0 ,1866949.0 > > > > skylake ,sse2 ,16777279 ,3 ,7 ,1914905.0 ,1897134.0 > > > > skylake ,sse2 ,16777343 ,9 ,5 ,1859937.0 ,1881939.0 > > > > icelake ,avx512 ,4103 ,0 ,64 ,75.2 ,75.8 > > > > icelake ,avx512 ,4111 ,0 
,3 ,56.9 ,56.4 > > > > icelake ,avx512 ,4127 ,3 ,0 ,59.1 ,59.6 > > > > icelake ,avx512 ,4159 ,3 ,7 ,50.7 ,51.3 > > > > icelake ,avx512 ,4223 ,9 ,5 ,59.2 ,58.9 > > > > icelake ,avx512 ,8199 ,0 ,64 ,67.8 ,63.9 > > > > icelake ,avx512 ,8207 ,0 ,3 ,89.0 ,89.9 > > > > icelake ,avx512 ,8223 ,3 ,0 ,90.2 ,90.1 > > > > icelake ,avx512 ,8255 ,3 ,7 ,82.6 ,84.9 > > > > icelake ,avx512 ,8319 ,9 ,5 ,91.5 ,92.8 > > > > icelake ,avx512 ,16391 ,0 ,64 ,118.0 ,117.6 > > > > icelake ,avx512 ,16399 ,0 ,3 ,156.5 ,157.0 > > > > icelake ,avx512 ,16415 ,3 ,0 ,157.4 ,157.3 > > > > icelake ,avx512 ,16447 ,3 ,7 ,151.0 ,151.6 > > > > icelake ,avx512 ,16511 ,9 ,5 ,159.1 ,159.6 > > > > icelake ,avx512 ,32775 ,0 ,64 ,231.8 ,230.8 > > > > icelake ,avx512 ,32783 ,0 ,3 ,297.8 ,299.3 > > > > icelake ,avx512 ,32799 ,3 ,0 ,299.1 ,299.0 > > > > icelake ,avx512 ,32831 ,3 ,7 ,293.5 ,295.4 > > > > icelake ,avx512 ,32895 ,9 ,5 ,300.3 ,302.5 > > > > icelake ,avx512 ,65543 ,0 ,64 ,1473.4 ,1479.2 > > > > icelake ,avx512 ,65551 ,0 ,3 ,1438.2 ,1445.3 > > > > icelake ,avx512 ,65567 ,3 ,0 ,1450.3 ,1463.8 > > > > icelake ,avx512 ,65599 ,3 ,7 ,1469.0 ,1473.8 > > > > icelake ,avx512 ,65663 ,9 ,5 ,1480.0 ,1483.5 > > > > icelake ,avx512 ,131079 ,0 ,64 ,3015.1 ,3037.5 > > > > icelake ,avx512 ,131087 ,0 ,3 ,2952.3 ,2960.4 > > > > icelake ,avx512 ,131103 ,3 ,0 ,2966.2 ,2964.4 > > > > icelake ,avx512 ,131135 ,3 ,7 ,2961.6 ,3047.9 > > > > icelake ,avx512 ,131199 ,9 ,5 ,2967.4 ,3183.8 > > > > icelake ,avx512 ,262151 ,0 ,64 ,6206.0 ,6141.5 > > > > icelake ,avx512 ,262159 ,0 ,3 ,5990.8 ,5959.2 > > > > icelake ,avx512 ,262175 ,3 ,0 ,5976.7 ,5963.8 > > > > icelake ,avx512 ,262207 ,3 ,7 ,5939.5 ,5924.3 > > > > icelake ,avx512 ,262271 ,9 ,5 ,5944.6 ,5990.3 > > > > icelake ,avx512 ,524295 ,0 ,64 ,14726.7 ,14307.0 > > > > icelake ,avx512 ,524303 ,0 ,3 ,14344.2 ,14040.5 > > > > icelake ,avx512 ,524319 ,3 ,0 ,14175.0 ,13862.2 > > > > icelake ,avx512 ,524351 ,3 ,7 ,14261.4 ,13821.5 > > > > icelake ,avx512 ,524415 ,9 ,5 ,14266.5 
,14064.7 > > > > icelake ,avx512 ,1048583 ,0 ,64 ,35211.4 ,35414.6 > > > > icelake ,avx512 ,1048591 ,0 ,3 ,35156.8 ,35591.2 > > > > icelake ,avx512 ,1048607 ,3 ,0 ,35273.1 ,35503.3 > > > > icelake ,avx512 ,1048639 ,3 ,7 ,35255.8 ,35725.0 > > > > icelake ,avx512 ,1048703 ,9 ,5 ,35703.6 ,36289.9 > > > > icelake ,avx512 ,2097159 ,0 ,64 ,72613.9 ,72063.2 > > > > icelake ,avx512 ,2097167 ,0 ,3 ,72301.6 ,73504.2 > > > > icelake ,avx512 ,2097183 ,3 ,0 ,73448.8 ,72133.6 > > > > icelake ,avx512 ,2097215 ,3 ,7 ,73762.9 ,72825.8 > > > > icelake ,avx512 ,2097279 ,9 ,5 ,72097.3 ,72914.6 > > > > icelake ,avx512 ,4194311 ,0 ,64 ,144793.4 ,144182.1 > > > > icelake ,avx512 ,4194319 ,0 ,3 ,143710.3 ,145063.3 > > > > icelake ,avx512 ,4194335 ,3 ,0 ,146722.1 ,144046.4 > > > > icelake ,avx512 ,4194367 ,3 ,7 ,144267.0 ,144874.6 > > > > icelake ,avx512 ,4194431 ,9 ,5 ,143808.2 ,144560.0 > > > > icelake ,avx512 ,8388615 ,0 ,64 ,427993.4 ,424521.5 > > > > icelake ,avx512 ,8388623 ,0 ,3 ,470267.1 ,473290.8 > > > > icelake ,avx512 ,8388639 ,3 ,0 ,457179.7 ,461797.7 > > > > icelake ,avx512 ,8388671 ,3 ,7 ,472507.9 ,481561.4 > > > > icelake ,avx512 ,8388735 ,9 ,5 ,463611.9 ,467388.7 > > > > icelake ,avx512 ,16777223 ,0 ,64 ,1490426.0 ,1526996.0 > > > > icelake ,avx512 ,16777231 ,0 ,3 ,1516687.0 ,1517095.0 > > > > icelake ,avx512 ,16777247 ,3 ,0 ,1497688.0 ,1512766.0 > > > > icelake ,avx512 ,16777279 ,3 ,7 ,1512331.0 ,1524317.0 > > > > icelake ,avx512 ,16777343 ,9 ,5 ,1498908.0 ,1500526.0 > > > > icelake ,avx ,4103 ,0 ,64 ,50.2 ,63.7 > > > > icelake ,avx ,4111 ,0 ,3 ,63.7 ,65.1 > > > > icelake ,avx ,4127 ,3 ,0 ,68.2 ,69.4 > > > > icelake ,avx ,4159 ,3 ,7 ,59.6 ,68.0 > > > > icelake ,avx ,4223 ,9 ,5 ,68.2 ,66.8 > > > > icelake ,avx ,8199 ,0 ,64 ,92.1 ,89.9 > > > > icelake ,avx ,8207 ,0 ,3 ,119.7 ,118.3 > > > > icelake ,avx ,8223 ,3 ,0 ,119.1 ,120.9 > > > > icelake ,avx ,8255 ,3 ,7 ,122.9 ,123.7 > > > > icelake ,avx ,8319 ,9 ,5 ,122.1 ,121.8 > > > > icelake ,avx ,16391 ,0 ,64 ,162.7 ,158.0 > > > 
> icelake ,avx ,16399 ,0 ,3 ,227.6 ,234.1 > > > > icelake ,avx ,16415 ,3 ,0 ,230.8 ,232.7 > > > > icelake ,avx ,16447 ,3 ,7 ,226.8 ,232.6 > > > > icelake ,avx ,16511 ,9 ,5 ,233.4 ,233.8 > > > > icelake ,avx ,32775 ,0 ,64 ,312.2 ,301.8 > > > > icelake ,avx ,32783 ,0 ,3 ,449.7 ,450.0 > > > > icelake ,avx ,32799 ,3 ,0 ,452.7 ,455.9 > > > > icelake ,avx ,32831 ,3 ,7 ,449.8 ,458.0 > > > > icelake ,avx ,32895 ,9 ,5 ,456.3 ,459.4 > > > > icelake ,avx ,65543 ,0 ,64 ,1460.6 ,1463.9 > > > > icelake ,avx ,65551 ,0 ,3 ,1462.0 ,1465.4 > > > > icelake ,avx ,65567 ,3 ,0 ,1466.6 ,1480.4 > > > > icelake ,avx ,65599 ,3 ,7 ,1488.0 ,1488.9 > > > > icelake ,avx ,65663 ,9 ,5 ,1680.8 ,1499.5 > > > > icelake ,avx ,131079 ,0 ,64 ,2988.5 ,3010.1 > > > > icelake ,avx ,131087 ,0 ,3 ,2995.5 ,2996.4 > > > > icelake ,avx ,131103 ,3 ,0 ,3006.2 ,3000.5 > > > > icelake ,avx ,131135 ,3 ,7 ,3032.4 ,3073.7 > > > > icelake ,avx ,131199 ,9 ,5 ,3010.4 ,3027.4 > > > > icelake ,avx ,262151 ,0 ,64 ,6143.2 ,6079.1 > > > > icelake ,avx ,262159 ,0 ,3 ,6085.1 ,6075.8 > > > > icelake ,avx ,262175 ,3 ,0 ,6088.0 ,6064.9 > > > > icelake ,avx ,262207 ,3 ,7 ,6018.7 ,6023.5 > > > > icelake ,avx ,262271 ,9 ,5 ,6019.8 ,5959.2 > > > > icelake ,avx ,524295 ,0 ,64 ,14464.2 ,14095.1 > > > > icelake ,avx ,524303 ,0 ,3 ,14761.6 ,14050.2 > > > > icelake ,avx ,524319 ,3 ,0 ,14534.1 ,14087.5 > > > > icelake ,avx ,524351 ,3 ,7 ,14147.7 ,13903.8 > > > > icelake ,avx ,524415 ,9 ,5 ,14157.0 ,13982.9 > > > > icelake ,avx ,1048583 ,0 ,64 ,36599.0 ,37461.4 > > > > icelake ,avx ,1048591 ,0 ,3 ,36717.8 ,37454.9 > > > > icelake ,avx ,1048607 ,3 ,0 ,36821.2 ,37343.3 > > > > icelake ,avx ,1048639 ,3 ,7 ,36958.0 ,37507.2 > > > > icelake ,avx ,1048703 ,9 ,5 ,36869.2 ,37413.1 > > > > icelake ,avx ,2097159 ,0 ,64 ,74765.8 ,75330.9 > > > > icelake ,avx ,2097167 ,0 ,3 ,75175.4 ,74891.9 > > > > icelake ,avx ,2097183 ,3 ,0 ,75451.4 ,74787.7 > > > > icelake ,avx ,2097215 ,3 ,7 ,75394.8 ,75839.1 > > > > icelake ,avx ,2097279 ,9 ,5 ,75099.2 ,75421.2 > 
> > > icelake ,avx ,4194311 ,0 ,64 ,146809.6 ,146619.4 > > > > icelake ,avx ,4194319 ,0 ,3 ,148866.4 ,149898.2 > > > > icelake ,avx ,4194335 ,3 ,0 ,148719.7 ,150165.4 > > > > icelake ,avx ,4194367 ,3 ,7 ,150600.1 ,150925.9 > > > > icelake ,avx ,4194431 ,9 ,5 ,149457.3 ,150519.2 > > > > icelake ,avx ,8388615 ,0 ,64 ,412709.8 ,423666.1 > > > > icelake ,avx ,8388623 ,0 ,3 ,423717.4 ,424418.2 > > > > icelake ,avx ,8388639 ,3 ,0 ,414387.5 ,413445.6 > > > > icelake ,avx ,8388671 ,3 ,7 ,449010.7 ,417553.5 > > > > icelake ,avx ,8388735 ,9 ,5 ,414128.6 ,411815.3 > > > > icelake ,avx ,16777223 ,0 ,64 ,1490032.0 ,1510004.0 > > > > icelake ,avx ,16777231 ,0 ,3 ,1379638.0 ,1422097.0 > > > > icelake ,avx ,16777247 ,3 ,0 ,1418930.0 ,1367557.0 > > > > icelake ,avx ,16777279 ,3 ,7 ,1515152.0 ,1500176.0 > > > > icelake ,avx ,16777343 ,9 ,5 ,1344117.0 ,1411795.0 > > > > icelake ,sse2 ,4103 ,0 ,64 ,113.2 ,114.6 > > > > icelake ,sse2 ,4111 ,0 ,3 ,121.5 ,120.4 > > > > icelake ,sse2 ,4127 ,3 ,0 ,1700.5 ,1771.5 > > > > icelake ,sse2 ,4159 ,3 ,7 ,119.3 ,118.8 > > > > icelake ,sse2 ,4223 ,9 ,5 ,1739.7 ,1735.2 > > > > icelake ,sse2 ,8199 ,0 ,64 ,207.0 ,203.9 > > > > icelake ,sse2 ,8207 ,0 ,3 ,225.5 ,220.8 > > > > icelake ,sse2 ,8223 ,3 ,0 ,3444.3 ,3743.5 > > > > icelake ,sse2 ,8255 ,3 ,7 ,219.9 ,216.8 > > > > icelake ,sse2 ,8319 ,9 ,5 ,4117.1 ,3487.3 > > > > icelake ,sse2 ,16391 ,0 ,64 ,397.1 ,394.3 > > > > icelake ,sse2 ,16399 ,0 ,3 ,439.6 ,428.6 > > > > icelake ,sse2 ,16415 ,3 ,0 ,6997.0 ,7031.2 > > > > icelake ,sse2 ,16447 ,3 ,7 ,426.8 ,421.8 > > > > icelake ,sse2 ,16511 ,9 ,5 ,7037.6 ,7038.3 > > > > icelake ,sse2 ,32775 ,0 ,64 ,790.9 ,779.0 > > > > icelake ,sse2 ,32783 ,0 ,3 ,863.1 ,849.6 > > > > icelake ,sse2 ,32799 ,3 ,0 ,14043.0 ,14390.9 > > > > icelake ,sse2 ,32831 ,3 ,7 ,841.6 ,833.1 > > > > icelake ,sse2 ,32895 ,9 ,5 ,14277.6 ,14344.2 > > > > icelake ,sse2 ,65543 ,0 ,64 ,1897.0 ,1897.3 > > > > icelake ,sse2 ,65551 ,0 ,3 ,1927.1 ,1955.4 > > > > icelake ,sse2 ,65567 ,3 ,0 ,28834.7 
,28727.8 > > > > icelake ,sse2 ,65599 ,3 ,7 ,1961.4 ,1969.7 > > > > icelake ,sse2 ,65663 ,9 ,5 ,28867.6 ,29019.8 > > > > icelake ,sse2 ,131079 ,0 ,64 ,3879.3 ,3872.6 > > > > icelake ,sse2 ,131087 ,0 ,3 ,3955.3 ,3990.7 > > > > icelake ,sse2 ,131103 ,3 ,0 ,58001.8 ,60567.9 > > > > icelake ,sse2 ,131135 ,3 ,7 ,3951.5 ,4002.6 > > > > icelake ,sse2 ,131199 ,9 ,5 ,57886.7 ,58391.4 > > > > icelake ,sse2 ,262151 ,0 ,64 ,7851.4 ,7894.7 > > > > icelake ,sse2 ,262159 ,0 ,3 ,7947.5 ,8016.2 > > > > icelake ,sse2 ,262175 ,3 ,0 ,115036.2 ,115968.6 > > > > icelake ,sse2 ,262207 ,3 ,7 ,7883.9 ,7814.1 > > > > icelake ,sse2 ,262271 ,9 ,5 ,113776.4 ,119733.6 > > > > icelake ,sse2 ,524295 ,0 ,64 ,17198.1 ,16974.9 > > > > icelake ,sse2 ,524303 ,0 ,3 ,17402.2 ,17096.3 > > > > icelake ,sse2 ,524319 ,3 ,0 ,223980.4 ,225889.9 > > > > icelake ,sse2 ,524351 ,3 ,7 ,17034.9 ,16910.3 > > > > icelake ,sse2 ,524415 ,9 ,5 ,224027.7 ,224962.5 > > > > icelake ,sse2 ,1048583 ,0 ,64 ,38822.3 ,39178.6 > > > > icelake ,sse2 ,1048591 ,0 ,3 ,41686.7 ,40247.4 > > > > icelake ,sse2 ,1048607 ,3 ,0 ,38814.8 ,39323.3 > > > > icelake ,sse2 ,1048639 ,3 ,7 ,39568.3 ,41325.7 > > > > icelake ,sse2 ,1048703 ,9 ,5 ,39354.2 ,39637.9 > > > > icelake ,sse2 ,2097159 ,0 ,64 ,84074.7 ,84543.1 > > > > icelake ,sse2 ,2097167 ,0 ,3 ,83665.7 ,82358.2 > > > > icelake ,sse2 ,2097183 ,3 ,0 ,81817.8 ,79638.9 > > > > icelake ,sse2 ,2097215 ,3 ,7 ,83649.1 ,83497.6 > > > > icelake ,sse2 ,2097279 ,9 ,5 ,80287.6 ,79980.9 > > > > icelake ,sse2 ,4194311 ,0 ,64 ,165409.8 ,168343.1 > > > > icelake ,sse2 ,4194319 ,0 ,3 ,165216.7 ,177632.0 > > > > icelake ,sse2 ,4194335 ,3 ,0 ,158718.7 ,160342.2 > > > > icelake ,sse2 ,4194367 ,3 ,7 ,167944.9 ,167204.4 > > > > icelake ,sse2 ,4194431 ,9 ,5 ,161530.1 ,164839.7 > > > > icelake ,sse2 ,8388615 ,0 ,64 ,626504.3 ,629858.5 > > > > icelake ,sse2 ,8388623 ,0 ,3 ,623969.5 ,631509.1 > > > > icelake ,sse2 ,8388639 ,3 ,0 ,599366.7 ,600016.0 > > > > icelake ,sse2 ,8388671 ,3 ,7 ,619964.2 ,619113.2 > > > > 
icelake ,sse2 ,8388735 ,9 ,5 ,595338.1 ,604172.4 > > > > icelake ,sse2 ,16777223 ,0 ,64 ,1709597.0 ,1725184.0 > > > > icelake ,sse2 ,16777231 ,0 ,3 ,1725452.0 ,1719746.0 > > > > icelake ,sse2 ,16777247 ,3 ,0 ,1614269.0 ,1607164.0 > > > > icelake ,sse2 ,16777279 ,3 ,7 ,1705295.0 ,1733018.0 > > > > icelake ,sse2 ,16777343 ,9 ,5 ,1604197.0 ,1595690.0 > > > > > > > > > > > > .../multiarch/memmove-vec-unaligned-erms.S | 338 ++++++++++++++---- > > > > 1 file changed, 265 insertions(+), 73 deletions(-) > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > > > > index 897a3d9762..5e4a071f16 100644 > > > > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > > > > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > > > > @@ -35,7 +35,16 @@ > > > > __x86_rep_movsb_stop_threshold, then REP MOVSB will be used. > > > > 7. If size >= __x86_shared_non_temporal_threshold and there is no > > > > overlap between destination and source, use non-temporal store > > > > - instead of aligned store. */ > > > > + instead of aligned store copying from either 2 or 4 pages at > > > > + once. > > > > + 8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold > > > > + and source and destination do not page alias, copy from 2 pages > > > > + at once using non-temporal stores. Page aliasing in this case is > > > > + considered true if destination's page alignment - sources' page > > > > + alignment is less than 8 * VEC_SIZE. > > > > + 9. If size >= 16 * __x86_shared_non_temporal_threshold or source > > > > + and destination do page alias copy from 4 pages at once using > > > > + non-temporal stores. 
*/ > > > > > > > > #include <sysdep.h> > > > > > > > > @@ -67,6 +76,34 @@ > > > > # endif > > > > #endif > > > > > > > > +#ifndef PAGE_SIZE > > > > +# define PAGE_SIZE 4096 > > > > +#endif > > > > + > > > > +#if PAGE_SIZE != 4096 > > > > +# error Unsupported PAGE_SIZE > > > > +#endif > > > > + > > > > +#ifndef LOG_PAGE_SIZE > > > > +# define LOG_PAGE_SIZE 12 > > > > +#endif > > > > + > > > > +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE) > > > > +# error Invalid LOG_PAGE_SIZE > > > > +#endif > > > > + > > > > +/* Byte per page for large_memcpy inner loop. */ > > > > +#if VEC_SIZE == 64 > > > > +# define LARGE_LOAD_SIZE (VEC_SIZE * 2) > > > > +#else > > > > +# define LARGE_LOAD_SIZE (VEC_SIZE * 4) > > > > +#endif > > > > + > > > > +/* Amount to shift rdx by to compare for memcpy_large_4x. */ > > > > +#ifndef LOG_4X_MEMCPY_THRESH > > > > +# define LOG_4X_MEMCPY_THRESH 4 > > > > +#endif > > > > + > > > > /* Avoid short distance rep movsb only with non-SSE vector. */ > > > > #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB > > > > # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) > > > > @@ -106,6 +143,28 @@ > > > > # error Unsupported PREFETCH_SIZE! > > > > #endif > > > > > > > > +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2) > > > > +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ > > > > + VMOVU (offset)base, vec0; \ > > > > + VMOVU ((offset) + VEC_SIZE)base, vec1; > > > > +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) 
\ > > > > + VMOVNT vec0, (offset)base; \ > > > > + VMOVNT vec1, ((offset) + VEC_SIZE)base; > > > > +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) > > > > +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ > > > > + VMOVU (offset)base, vec0; \ > > > > + VMOVU ((offset) + VEC_SIZE)base, vec1; \ > > > > + VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ > > > > + VMOVU ((offset) + VEC_SIZE * 3)base, vec3; > > > > +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ > > > > + VMOVNT vec0, (offset)base; \ > > > > + VMOVNT vec1, ((offset) + VEC_SIZE)base; \ > > > > + VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ > > > > + VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; > > > > +#else > > > > +# error Invalid LARGE_LOAD_SIZE > > > > +#endif > > > > + > > > > #ifndef SECTION > > > > # error SECTION is not defined! > > > > #endif > > > > @@ -393,6 +452,15 @@ L(last_4x_vec): > > > > VZEROUPPER_RETURN > > > > > > > > L(more_8x_vec): > > > > + /* Check if non-temporal move candidate. */ > > > > +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > > > + /* Check non-temporal store threshold. */ > > > > + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > > > > + ja L(large_memcpy_2x) > > > > +#endif > > > > + /* Entry if rdx is greater than non-temporal threshold but there > > > > + is overlap. */ > > > > +L(more_8x_vec_check): > > > > cmpq %rsi, %rdi > > > > ja L(more_8x_vec_backward) > > > > /* Source == destination is less common. */ > > > > @@ -419,24 +487,21 @@ L(more_8x_vec): > > > > subq %r8, %rdi > > > > /* Adjust length. */ > > > > addq %r8, %rdx > > > > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > > > - /* Check non-temporal store threshold. */ > > > > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > > > > - ja L(large_forward) > > > > -#endif > > > > + > > > > + .p2align 4 > > > > L(loop_4x_vec_forward): > > > > /* Copy 4 * VEC a time forward. 
*/ > > > > VMOVU (%rsi), %VEC(0) > > > > VMOVU VEC_SIZE(%rsi), %VEC(1) > > > > VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > > > > VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > > > > - addq $(VEC_SIZE * 4), %rsi > > > > - subq $(VEC_SIZE * 4), %rdx > > > > + subq $-(VEC_SIZE * 4), %rsi > > > > + addq $-(VEC_SIZE * 4), %rdx > > > > VMOVA %VEC(0), (%rdi) > > > > VMOVA %VEC(1), VEC_SIZE(%rdi) > > > > VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > > > > VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > > > > - addq $(VEC_SIZE * 4), %rdi > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > cmpq $(VEC_SIZE * 4), %rdx > > > > ja L(loop_4x_vec_forward) > > > > /* Store the last 4 * VEC. */ > > > > @@ -470,24 +535,21 @@ L(more_8x_vec_backward): > > > > subq %r8, %r9 > > > > /* Adjust length. */ > > > > subq %r8, %rdx > > > > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > > > - /* Check non-temporal store threshold. */ > > > > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > > > > - ja L(large_backward) > > > > -#endif > > > > + > > > > + .p2align 4 > > > > L(loop_4x_vec_backward): > > > > /* Copy 4 * VEC a time backward. */ > > > > VMOVU (%rcx), %VEC(0) > > > > VMOVU -VEC_SIZE(%rcx), %VEC(1) > > > > VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) > > > > VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) > > > > - subq $(VEC_SIZE * 4), %rcx > > > > - subq $(VEC_SIZE * 4), %rdx > > > > + addq $-(VEC_SIZE * 4), %rcx > > > > + addq $-(VEC_SIZE * 4), %rdx > > > > VMOVA %VEC(0), (%r9) > > > > VMOVA %VEC(1), -VEC_SIZE(%r9) > > > > VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) > > > > VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) > > > > - subq $(VEC_SIZE * 4), %r9 > > > > + addq $-(VEC_SIZE * 4), %r9 > > > > cmpq $(VEC_SIZE * 4), %rdx > > > > ja L(loop_4x_vec_backward) > > > > /* Store the first 4 * VEC. 
*/ > > > > @@ -500,72 +562,202 @@ L(loop_4x_vec_backward): > > > > VZEROUPPER_RETURN > > > > > > > > #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > > > -L(large_forward): > > > > + .p2align 4 > > > > +L(large_memcpy_2x): > > > > + /* Compute absolute value of difference between source and > > > > + destination. */ > > > > + movq %rdi, %r9 > > > > + subq %rsi, %r9 > > > > + movq %r9, %r8 > > > > + leaq -1(%r9), %rcx > > > > + sarq $63, %r8 > > > > + xorq %r8, %r9 > > > > + subq %r8, %r9 > > > > /* Don't use non-temporal store if there is overlap between > > > > - destination and source since destination may be in cache > > > > - when source is loaded. */ > > > > - leaq (%rdi, %rdx), %r10 > > > > - cmpq %r10, %rsi > > > > - jb L(loop_4x_vec_forward) > > > > -L(loop_large_forward): > > > > + destination and source since destination may be in cache when > > > > + source is loaded. */ > > > > + cmpq %r9, %rdx > > > > + ja L(more_8x_vec_check) > > > > + > > > > + /* Cache align destination. First store the first 64 bytes then > > > > + adjust alignments. */ > > > > + VMOVU (%rsi), %VEC(8) > > > > +#if VEC_SIZE < 64 > > > > + VMOVU VEC_SIZE(%rsi), %VEC(9) > > > > +#if VEC_SIZE < 32 > > > > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) > > > > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) > > > > +#endif > > > > +#endif > > > > + VMOVU %VEC(8), (%rdi) > > > > +#if VEC_SIZE < 64 > > > > + VMOVU %VEC(9), VEC_SIZE(%rdi) > > > > +#if VEC_SIZE < 32 > > > > + VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) > > > > + VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) > > > > +#endif > > > > +#endif > > > > + /* Adjust source, destination, and size. */ > > > > + movq %rdi, %r8 > > > > + andq $63, %r8 > > > > + /* Get the negative of offset for alignment. */ > > > > + subq $64, %r8 > > > > + /* Adjust source. */ > > > > + subq %r8, %rsi > > > > + /* Adjust destination which should be aligned now. */ > > > > + subq %r8, %rdi > > > > + /* Adjust length. 
*/ > > > > + addq %r8, %rdx > > > > + > > > > + /* Test if source and destination addresses will alias. If they do > > > > + the larger pipeline in large_memcpy_4x alleviated the > > > > + performance drop. */ > > > > + testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx > > > > + jz L(large_memcpy_4x) > > > > + > > > > + movq %rdx, %r10 > > > > + shrq $LOG_4X_MEMCPY_THRESH, %r10 > > > > + cmp __x86_shared_non_temporal_threshold(%rip), %r10 > > > > + jae L(large_memcpy_4x) > > > > + > > > > + /* edx will store remainder size for copying tail. */ > > > > + andl $(PAGE_SIZE * 2 - 1), %edx > > > > + /* r10 stores outer loop counter. */ > > > > + shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 > > > > + /* Copy 4x VEC at a time from 2 pages. */ > > > > + .p2align 4 > > > > +L(loop_large_memcpy_2x_outer): > > > > + /* ecx stores inner loop counter. */ > > > > + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx > > > > +L(loop_large_memcpy_2x_inner): > > > > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) > > > > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) > > > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) > > > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) > > > > + /* Load vectors from rsi. */ > > > > + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > > > + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > > > + subq $-LARGE_LOAD_SIZE, %rsi > > > > + /* Non-temporal store vectors to rdi. */ > > > > + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > > > + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > > > + subq $-LARGE_LOAD_SIZE, %rdi > > > > + decl %ecx > > > > + jnz L(loop_large_memcpy_2x_inner) > > > > + addq $PAGE_SIZE, %rdi > > > > + addq $PAGE_SIZE, %rsi > > > > + decq %r10 > > > > + jne L(loop_large_memcpy_2x_outer) > > > > + sfence > > > > + > > > > + /* Check if only last 4 loads are needed. 
*/ > > > > + cmpl $(VEC_SIZE * 4), %edx > > > > + jbe L(large_memcpy_2x_end) > > > > + > > > > + /* Handle the last 2 * PAGE_SIZE bytes. */ > > > > +L(loop_large_memcpy_2x_tail): > > > > /* Copy 4 * VEC a time forward with non-temporal stores. */ > > > > - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) > > > > - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) > > > > + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) > > > > + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) > > > > VMOVU (%rsi), %VEC(0) > > > > VMOVU VEC_SIZE(%rsi), %VEC(1) > > > > VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > > > > VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > > > > - addq $PREFETCHED_LOAD_SIZE, %rsi > > > > - subq $PREFETCHED_LOAD_SIZE, %rdx > > > > - VMOVNT %VEC(0), (%rdi) > > > > - VMOVNT %VEC(1), VEC_SIZE(%rdi) > > > > - VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) > > > > - VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) > > > > - addq $PREFETCHED_LOAD_SIZE, %rdi > > > > - cmpq $PREFETCHED_LOAD_SIZE, %rdx > > > > - ja L(loop_large_forward) > > > > - sfence > > > > + subq $-(VEC_SIZE * 4), %rsi > > > > + addl $-(VEC_SIZE * 4), %edx > > > > + VMOVA %VEC(0), (%rdi) > > > > + VMOVA %VEC(1), VEC_SIZE(%rdi) > > > > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > > > > + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > + cmpl $(VEC_SIZE * 4), %edx > > > > + ja L(loop_large_memcpy_2x_tail) > > > > + > > > > +L(large_memcpy_2x_end): > > > > /* Store the last 4 * VEC. */ > > > > - VMOVU %VEC(5), (%rcx) > > > > - VMOVU %VEC(6), -VEC_SIZE(%rcx) > > > > - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) > > > > - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) > > > > - /* Store the first VEC. 
*/ > > > > - VMOVU %VEC(4), (%r11) > > > > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) > > > > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) > > > > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) > > > > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) > > > > + > > > > + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) > > > > + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) > > > > + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) > > > > + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) > > > > VZEROUPPER_RETURN > > > > > > > > -L(large_backward): > > > > - /* Don't use non-temporal store if there is overlap between > > > > - destination and source since destination may be in cache > > > > - when source is loaded. */ > > > > - leaq (%rcx, %rdx), %r10 > > > > - cmpq %r10, %r9 > > > > - jb L(loop_4x_vec_backward) > > > > -L(loop_large_backward): > > > > - /* Copy 4 * VEC a time backward with non-temporal stores. */ > > > > - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) > > > > - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) > > > > - VMOVU (%rcx), %VEC(0) > > > > - VMOVU -VEC_SIZE(%rcx), %VEC(1) > > > > - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) > > > > - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) > > > > - subq $PREFETCHED_LOAD_SIZE, %rcx > > > > - subq $PREFETCHED_LOAD_SIZE, %rdx > > > > - VMOVNT %VEC(0), (%r9) > > > > - VMOVNT %VEC(1), -VEC_SIZE(%r9) > > > > - VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) > > > > - VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) > > > > - subq $PREFETCHED_LOAD_SIZE, %r9 > > > > - cmpq $PREFETCHED_LOAD_SIZE, %rdx > > > > - ja L(loop_large_backward) > > > > + .p2align 4 > > > > +L(large_memcpy_4x): > > > > + movq %rdx, %r10 > > > > + /* edx will store remainder size for copying tail. */ > > > > + andl $(PAGE_SIZE * 4 - 1), %edx > > > > + /* r10 stores outer loop counter. */ > > > > + shrq $(LOG_PAGE_SIZE + 2), %r10 > > > > + /* Copy 4x VEC at a time from 4 pages. */ > > > > + .p2align 4 > > > > +L(loop_large_memcpy_4x_outer): > > > > + /* ecx stores inner loop counter. 
*/ > > > > + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx > > > > +L(loop_large_memcpy_4x_inner): > > > > + /* Only one prefetch set per page as doing 4 pages give more time > > > > + for prefetcher to keep up. */ > > > > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) > > > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) > > > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) > > > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) > > > > + /* Load vectors from rsi. */ > > > > + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > > > + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > > > + LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) > > > > + LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) > > > > + subq $-LARGE_LOAD_SIZE, %rsi > > > > + /* Non-temporal store vectors to rdi. */ > > > > + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > > > + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > > > + STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) > > > > + STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) > > > > + subq $-LARGE_LOAD_SIZE, %rdi > > > > + decl %ecx > > > > + jnz L(loop_large_memcpy_4x_inner) > > > > + addq $(PAGE_SIZE * 3), %rdi > > > > + addq $(PAGE_SIZE * 3), %rsi > > > > + decq %r10 > > > > + jne L(loop_large_memcpy_4x_outer) > > > > sfence > > > > - /* Store the first 4 * VEC. */ > > > > - VMOVU %VEC(4), (%rdi) > > > > - VMOVU %VEC(5), VEC_SIZE(%rdi) > > > > - VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) > > > > - VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) > > > > - /* Store the last VEC. */ > > > > - VMOVU %VEC(8), (%r11) > > > > + /* Check if only last 4 loads are needed. */ > > > > + cmpl $(VEC_SIZE * 4), %edx > > > > + jbe L(large_memcpy_4x_end) > > > > + > > > > + /* Handle the last 4 * PAGE_SIZE bytes. 
*/ > > > > +L(loop_large_memcpy_4x_tail): > > > > + /* Copy 4 * VEC a time forward with non-temporal stores. */ > > > > + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) > > > > + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) > > > > + VMOVU (%rsi), %VEC(0) > > > > + VMOVU VEC_SIZE(%rsi), %VEC(1) > > > > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > > > > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > > > > + subq $-(VEC_SIZE * 4), %rsi > > > > + addl $-(VEC_SIZE * 4), %edx > > > > + VMOVA %VEC(0), (%rdi) > > > > + VMOVA %VEC(1), VEC_SIZE(%rdi) > > > > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > > > > + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > + cmpl $(VEC_SIZE * 4), %edx > > > > + ja L(loop_large_memcpy_4x_tail) > > > > + > > > > +L(large_memcpy_4x_end): > > > > + /* Store the last 4 * VEC. */ > > > > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) > > > > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) > > > > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) > > > > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) > > > > + > > > > + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) > > > > + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) > > > > + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) > > > > + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) > > > > VZEROUPPER_RETURN > > > > #endif > > > > END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) > > > > -- > > > > 2.29.2 > > > > > > > > > > LGTM. Please commit it. > > > > > > Thanks. > > > > > > > > > H.J. > > > > -- > H.J.
On Fri, Apr 16, 2021 at 12:25 PM Noah Goldstein via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Fri, Apr 16, 2021 at 1:05 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Fri, Apr 16, 2021 at 9:35 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > LGTM. Please commit it. > > > > > > Are you saying that to me or someone else? If it's to me what do you > > > mean, is the patch not enough? > > > > I will commit it for you. > > Thanks! Are you planning on accepting the bench / testing changes as well? > > > > > > > Thanks. > > > > > > On Fri, Apr 16, 2021 at 8:59 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > > > On Sat, Apr 03, 2021 at 04:12:15AM -0400, Noah Goldstein wrote: > > > > > From: noah <goldstein.w.n@gmail.com> > > > > > > > > > > No Bug. This commit updates the large memcpy case (no overlap). The > > > > > update is to perform memcpy on either 2 or 4 contiguous pages at > > > > > once. This 1) helps to alleviate the effects of false memory aliasing > > > > > when destination and source have a close 4k alignment and 2) In most > > > > > cases and for most DRAM units is a modestly more efficient access > > > > > pattern. These changes are a clear performance improvement for > > > > > VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy, > > > > > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all > > > > > pass. > > > > > > > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > > > > --- > > > > > Issue was alignment related AFAICT. Added `.p2align 4` in front of the > > > > > loops and no longer see any meaningful regression. > > > > > > > > > > Also added back the temporal stores for the tail. Saw a regression > > > > > when doing these tests. > > > > > > > > > > Two tables below for skylake and icelake numbers for the areas around > > > > > where you saw the regression. Below is all data from the tests. > > > > > > > > > > N = 10. 
> > > > > > > > > > Skylake > > > > > Len ,align1 ,align2 ,new mean ,old mean > > > > > 4103 ,0 ,64 ,84.5 ,88.6 > > > > > 4111 ,0 ,3 ,99.0 ,99.9 > > > > > 4127 ,3 ,0 ,102.1 ,102.3 > > > > > 4159 ,3 ,7 ,88.7 ,90.9 > > > > > 4223 ,9 ,5 ,88.1 ,87.4 > > > > > 8199 ,0 ,64 ,146.7 ,150.2 > > > > > 8207 ,0 ,3 ,167.9 ,168.5 > > > > > 8223 ,3 ,0 ,168.5 ,168.1 > > > > > 8255 ,3 ,7 ,157.0 ,159.2 > > > > > 8319 ,9 ,5 ,155.5 ,155.7 > > > > > 16391 ,0 ,64 ,286.2 ,288.8 > > > > > 16399 ,0 ,3 ,307.0 ,308.7 > > > > > 16415 ,3 ,0 ,307.4 ,307.6 > > > > > 16447 ,3 ,7 ,294.6 ,295.5 > > > > > 16511 ,9 ,5 ,291.5 ,462.1 > > > > > 32775 ,0 ,64 ,603.4 ,601.5 > > > > > 32783 ,0 ,3 ,604.8 ,606.4 > > > > > 32799 ,3 ,0 ,603.0 ,604.1 > > > > > 32831 ,3 ,7 ,600.2 ,737.3 > > > > > 32895 ,9 ,5 ,604.4 ,599.5 > > > > > 65543 ,0 ,64 ,1873.5 ,1854.3 > > > > > 65551 ,0 ,3 ,1862.9 ,1846.6 > > > > > 65567 ,3 ,0 ,1885.5 ,1966.0 > > > > > 65599 ,3 ,7 ,1833.2 ,1833.1 > > > > > 65663 ,9 ,5 ,1884.9 ,1887.4 > > > > > 131079 ,0 ,64 ,3944.3 ,3949.4 > > > > > 131087 ,0 ,3 ,3927.3 ,3913.3 > > > > > 131103 ,3 ,0 ,4415.8 ,4169.4 > > > > > 131135 ,3 ,7 ,4224.5 ,4157.6 > > > > > 131199 ,9 ,5 ,5974.0 ,4983.8 > > > > > 262151 ,0 ,64 ,11050.2 ,10620.6 > > > > > 262159 ,0 ,3 ,9932.8 ,10037.3 > > > > > 262175 ,3 ,0 ,10188.8 ,9206.6 > > > > > 262207 ,3 ,7 ,9633.3 ,9216.7 > > > > > 262271 ,9 ,5 ,9732.7 ,9345.3 > > > > > 524295 ,0 ,64 ,24823.9 ,24880.7 > > > > > 524303 ,0 ,3 ,24514.0 ,24556.7 > > > > > 524319 ,3 ,0 ,23974.4 ,24219.9 > > > > > 524351 ,3 ,7 ,24159.7 ,24207.0 > > > > > 524415 ,9 ,5 ,23946.5 ,24142.8 > > > > > > > > > > Icelake: > > > > > Len ,align1 ,align2 ,new mean ,old mean > > > > > 4103 ,0 ,64 ,50.2 ,63.7 > > > > > 4111 ,0 ,3 ,63.7 ,65.1 > > > > > 4127 ,3 ,0 ,68.2 ,69.4 > > > > > 4159 ,3 ,7 ,59.6 ,68.0 > > > > > 4223 ,9 ,5 ,68.2 ,66.8 > > > > > 8199 ,0 ,64 ,92.1 ,89.9 > > > > > 8207 ,0 ,3 ,119.7 ,118.3 > > > > > 8223 ,3 ,0 ,119.1 ,120.9 > > > > > 8255 ,3 ,7 ,122.9 ,123.7 > > > > > 8319 ,9 ,5 ,122.1 ,121.8 > > 
> > > 16391 ,0 ,64 ,162.7 ,158.0 > > > > > 16399 ,0 ,3 ,227.6 ,234.1 > > > > > 16415 ,3 ,0 ,230.8 ,232.7 > > > > > 16447 ,3 ,7 ,226.8 ,232.6 > > > > > 16511 ,9 ,5 ,233.4 ,233.8 > > > > > 32775 ,0 ,64 ,312.2 ,301.8 > > > > > 32783 ,0 ,3 ,449.7 ,450.0 > > > > > 32799 ,3 ,0 ,452.7 ,455.9 > > > > > 32831 ,3 ,7 ,449.8 ,458.0 > > > > > 32895 ,9 ,5 ,456.3 ,459.4 > > > > > 65543 ,0 ,64 ,1460.6 ,1463.9 > > > > > 65551 ,0 ,3 ,1462.0 ,1465.4 > > > > > 65567 ,3 ,0 ,1466.6 ,1480.4 > > > > > 65599 ,3 ,7 ,1488.0 ,1488.9 > > > > > 65663 ,9 ,5 ,1680.8 ,1499.5 > > > > > 131079 ,0 ,64 ,2988.5 ,3010.1 > > > > > 131087 ,0 ,3 ,2995.5 ,2996.4 > > > > > 131103 ,3 ,0 ,3006.2 ,3000.5 > > > > > 131135 ,3 ,7 ,3032.4 ,3073.7 > > > > > 131199 ,9 ,5 ,3010.4 ,3027.4 > > > > > 262151 ,0 ,64 ,6143.2 ,6079.1 > > > > > 262159 ,0 ,3 ,6085.1 ,6075.8 > > > > > 262175 ,3 ,0 ,6088.0 ,6064.9 > > > > > 262207 ,3 ,7 ,6018.7 ,6023.5 > > > > > 262271 ,9 ,5 ,6019.8 ,5959.2 > > > > > 524295 ,0 ,64 ,14464.2 ,14095.1 > > > > > 524303 ,0 ,3 ,14761.6 ,14050.2 > > > > > 524319 ,3 ,0 ,14534.1 ,14087.5 > > > > > 524351 ,3 ,7 ,14147.7 ,13903.8 > > > > > 524415 ,9 ,5 ,14157.0 ,13982.9 > > > > > > > > > > > > > > > > > > > > cpu ,version ,Len ,align1 ,align2 ,new mean ,old mean > > > > > skylake ,avx ,4103 ,0 ,64 ,84.5 ,88.6 > > > > > skylake ,avx ,4111 ,0 ,3 ,99.0 ,99.9 > > > > > skylake ,avx ,4127 ,3 ,0 ,102.1 ,102.3 > > > > > skylake ,avx ,4159 ,3 ,7 ,88.7 ,90.9 > > > > > skylake ,avx ,4223 ,9 ,5 ,88.1 ,87.4 > > > > > skylake ,avx ,8199 ,0 ,64 ,146.7 ,150.2 > > > > > skylake ,avx ,8207 ,0 ,3 ,167.9 ,168.5 > > > > > skylake ,avx ,8223 ,3 ,0 ,168.5 ,168.1 > > > > > skylake ,avx ,8255 ,3 ,7 ,157.0 ,159.2 > > > > > skylake ,avx ,8319 ,9 ,5 ,155.5 ,155.7 > > > > > skylake ,avx ,16391 ,0 ,64 ,286.2 ,288.8 > > > > > skylake ,avx ,16399 ,0 ,3 ,307.0 ,308.7 > > > > > skylake ,avx ,16415 ,3 ,0 ,307.4 ,307.6 > > > > > skylake ,avx ,16447 ,3 ,7 ,294.6 ,295.5 > > > > > skylake ,avx ,16511 ,9 ,5 ,291.5 ,462.1 > > > > > skylake ,avx 
,32775 ,0 ,64 ,603.4 ,601.5 > > > > > skylake ,avx ,32783 ,0 ,3 ,604.8 ,606.4 > > > > > skylake ,avx ,32799 ,3 ,0 ,603.0 ,604.1 > > > > > skylake ,avx ,32831 ,3 ,7 ,600.2 ,737.3 > > > > > skylake ,avx ,32895 ,9 ,5 ,604.4 ,599.5 > > > > > skylake ,avx ,65543 ,0 ,64 ,1873.5 ,1854.3 > > > > > skylake ,avx ,65551 ,0 ,3 ,1862.9 ,1846.6 > > > > > skylake ,avx ,65567 ,3 ,0 ,1885.5 ,1966.0 > > > > > skylake ,avx ,65599 ,3 ,7 ,1833.2 ,1833.1 > > > > > skylake ,avx ,65663 ,9 ,5 ,1884.9 ,1887.4 > > > > > skylake ,avx ,131079 ,0 ,64 ,3944.3 ,3949.4 > > > > > skylake ,avx ,131087 ,0 ,3 ,3927.3 ,3913.3 > > > > > skylake ,avx ,131103 ,3 ,0 ,4415.8 ,4169.4 > > > > > skylake ,avx ,131135 ,3 ,7 ,4224.5 ,4157.6 > > > > > skylake ,avx ,131199 ,9 ,5 ,5974.0 ,4983.8 > > > > > skylake ,avx ,262151 ,0 ,64 ,11050.2 ,10620.6 > > > > > skylake ,avx ,262159 ,0 ,3 ,9932.8 ,10037.3 > > > > > skylake ,avx ,262175 ,3 ,0 ,10188.8 ,9206.6 > > > > > skylake ,avx ,262207 ,3 ,7 ,9633.3 ,9216.7 > > > > > skylake ,avx ,262271 ,9 ,5 ,9732.7 ,9345.3 > > > > > skylake ,avx ,524295 ,0 ,64 ,24823.9 ,24880.7 > > > > > skylake ,avx ,524303 ,0 ,3 ,24514.0 ,24556.7 > > > > > skylake ,avx ,524319 ,3 ,0 ,23974.4 ,24219.9 > > > > > skylake ,avx ,524351 ,3 ,7 ,24159.7 ,24207.0 > > > > > skylake ,avx ,524415 ,9 ,5 ,23946.5 ,24142.8 > > > > > skylake ,avx ,1048583 ,0 ,64 ,49163.9 ,49454.6 > > > > > skylake ,avx ,1048591 ,0 ,3 ,49879.3 ,49400.8 > > > > > skylake ,avx ,1048607 ,3 ,0 ,49738.0 ,48864.6 > > > > > skylake ,avx ,1048639 ,3 ,7 ,48804.0 ,47588.5 > > > > > skylake ,avx ,1048703 ,9 ,5 ,49629.4 ,49796.3 > > > > > skylake ,avx ,2097159 ,0 ,64 ,98271.7 ,96330.6 > > > > > skylake ,avx ,2097167 ,0 ,3 ,97801.8 ,98638.1 > > > > > skylake ,avx ,2097183 ,3 ,0 ,98041.1 ,99287.6 > > > > > skylake ,avx ,2097215 ,3 ,7 ,96629.5 ,96521.9 > > > > > skylake ,avx ,2097279 ,9 ,5 ,98961.8 ,98909.8 > > > > > skylake ,avx ,4194311 ,0 ,64 ,194667.7 ,195377.1 > > > > > skylake ,avx ,4194319 ,0 ,3 ,194919.5 ,198576.2 > > > > > skylake 
,avx ,4194335 ,3 ,0 ,192949.8 ,194584.7 > > > > > skylake ,avx ,4194367 ,3 ,7 ,189943.5 ,189177.9 > > > > > skylake ,avx ,4194431 ,9 ,5 ,192479.1 ,196494.2 > > > > > skylake ,avx ,8388615 ,0 ,64 ,588671.6 ,587215.4 > > > > > skylake ,avx ,8388623 ,0 ,3 ,581640.7 ,582812.5 > > > > > skylake ,avx ,8388639 ,3 ,0 ,549811.9 ,544697.6 > > > > > skylake ,avx ,8388671 ,3 ,7 ,591155.0 ,577951.8 > > > > > skylake ,avx ,8388735 ,9 ,5 ,547583.2 ,545133.3 > > > > > skylake ,avx ,16777223 ,0 ,64 ,1787503.0 ,1811146.0 > > > > > skylake ,avx ,16777231 ,0 ,3 ,1758671.0 ,1756343.0 > > > > > skylake ,avx ,16777247 ,3 ,0 ,1691781.0 ,1694661.0 > > > > > skylake ,avx ,16777279 ,3 ,7 ,1768150.0 ,1754785.0 > > > > > skylake ,avx ,16777343 ,9 ,5 ,1695179.0 ,1710794.0 > > > > > skylake ,sse2 ,4103 ,0 ,64 ,150.8 ,150.5 > > > > > skylake ,sse2 ,4111 ,0 ,3 ,156.8 ,158.4 > > > > > skylake ,sse2 ,4127 ,3 ,0 ,99.7 ,99.4 > > > > > skylake ,sse2 ,4159 ,3 ,7 ,154.8 ,154.5 > > > > > skylake ,sse2 ,4223 ,9 ,5 ,137.3 ,137.2 > > > > > skylake ,sse2 ,8199 ,0 ,64 ,284.8 ,285.5 > > > > > skylake ,sse2 ,8207 ,0 ,3 ,296.0 ,296.1 > > > > > skylake ,sse2 ,8223 ,3 ,0 ,168.0 ,168.2 > > > > > skylake ,sse2 ,8255 ,3 ,7 ,293.0 ,292.4 > > > > > skylake ,sse2 ,8319 ,9 ,5 ,251.3 ,250.7 > > > > > skylake ,sse2 ,16391 ,0 ,64 ,561.3 ,608.3 > > > > > skylake ,sse2 ,16399 ,0 ,3 ,571.0 ,574.8 > > > > > skylake ,sse2 ,16415 ,3 ,0 ,305.4 ,305.0 > > > > > skylake ,sse2 ,16447 ,3 ,7 ,563.2 ,565.0 > > > > > skylake ,sse2 ,16511 ,9 ,5 ,477.1 ,475.1 > > > > > skylake ,sse2 ,32775 ,0 ,64 ,1128.2 ,1131.7 > > > > > skylake ,sse2 ,32783 ,0 ,3 ,1126.6 ,1131.0 > > > > > skylake ,sse2 ,32799 ,3 ,0 ,587.6 ,590.8 > > > > > skylake ,sse2 ,32831 ,3 ,7 ,1130.6 ,1126.2 > > > > > skylake ,sse2 ,32895 ,9 ,5 ,957.6 ,953.0 > > > > > skylake ,sse2 ,65543 ,0 ,64 ,2718.9 ,2704.2 > > > > > skylake ,sse2 ,65551 ,0 ,3 ,2724.1 ,2725.0 > > > > > skylake ,sse2 ,65567 ,3 ,0 ,1888.4 ,1914.3 > > > > > skylake ,sse2 ,65599 ,3 ,7 ,2787.6 ,2748.7 > > > > > 
skylake ,sse2 ,65663 ,9 ,5 ,2400.5 ,2369.4 > > > > > skylake ,sse2 ,131079 ,0 ,64 ,5603.3 ,5654.9 > > > > > skylake ,sse2 ,131087 ,0 ,3 ,5939.3 ,5871.4 > > > > > skylake ,sse2 ,131103 ,3 ,0 ,4272.4 ,4190.0 > > > > > skylake ,sse2 ,131135 ,3 ,7 ,7601.4 ,7524.6 > > > > > skylake ,sse2 ,131199 ,9 ,5 ,7022.1 ,6864.7 > > > > > skylake ,sse2 ,262151 ,0 ,64 ,13736.2 ,14030.0 > > > > > skylake ,sse2 ,262159 ,0 ,3 ,12407.3 ,12334.1 > > > > > skylake ,sse2 ,262175 ,3 ,0 ,9661.1 ,9249.4 > > > > > skylake ,sse2 ,262207 ,3 ,7 ,12850.2 ,12351.6 > > > > > skylake ,sse2 ,262271 ,9 ,5 ,10792.6 ,10435.8 > > > > > skylake ,sse2 ,524295 ,0 ,64 ,27754.5 ,28177.7 > > > > > skylake ,sse2 ,524303 ,0 ,3 ,27766.2 ,28152.0 > > > > > skylake ,sse2 ,524319 ,3 ,0 ,24030.9 ,24438.3 > > > > > skylake ,sse2 ,524351 ,3 ,7 ,27787.5 ,27933.0 > > > > > skylake ,sse2 ,524415 ,9 ,5 ,24263.2 ,25249.1 > > > > > skylake ,sse2 ,1048583 ,0 ,64 ,56199.9 ,56039.8 > > > > > skylake ,sse2 ,1048591 ,0 ,3 ,56750.2 ,58889.7 > > > > > skylake ,sse2 ,1048607 ,3 ,0 ,56394.0 ,55115.3 > > > > > skylake ,sse2 ,1048639 ,3 ,7 ,57233.1 ,57473.8 > > > > > skylake ,sse2 ,1048703 ,9 ,5 ,56324.3 ,55917.9 > > > > > skylake ,sse2 ,2097159 ,0 ,64 ,113234.8 ,114346.4 > > > > > skylake ,sse2 ,2097167 ,0 ,3 ,114373.1 ,115522.5 > > > > > skylake ,sse2 ,2097183 ,3 ,0 ,108113.3 ,108513.3 > > > > > skylake ,sse2 ,2097215 ,3 ,7 ,116863.6 ,116549.9 > > > > > skylake ,sse2 ,2097279 ,9 ,5 ,108945.1 ,108843.7 > > > > > skylake ,sse2 ,4194311 ,0 ,64 ,230250.1 ,232350.0 > > > > > skylake ,sse2 ,4194319 ,0 ,3 ,231895.3 ,235055.6 > > > > > skylake ,sse2 ,4194335 ,3 ,0 ,218442.8 ,219199.8 > > > > > skylake ,sse2 ,4194367 ,3 ,7 ,242564.2 ,235587.7 > > > > > skylake ,sse2 ,4194431 ,9 ,5 ,224167.4 ,215261.8 > > > > > skylake ,sse2 ,8388615 ,0 ,64 ,679801.8 ,674832.0 > > > > > skylake ,sse2 ,8388623 ,0 ,3 ,684913.2 ,685238.7 > > > > > skylake ,sse2 ,8388639 ,3 ,0 ,644865.4 ,631388.6 > > > > > skylake ,sse2 ,8388671 ,3 ,7 ,698700.9 ,689316.1 > > > > > 
skylake ,sse2 ,8388735 ,9 ,5 ,644820.2 ,631366.8 > > > > > skylake ,sse2 ,16777223 ,0 ,64 ,1877984.0 ,1876437.0 > > > > > skylake ,sse2 ,16777231 ,0 ,3 ,1898086.0 ,1913053.0 > > > > > skylake ,sse2 ,16777247 ,3 ,0 ,1857018.0 ,1866949.0 > > > > > skylake ,sse2 ,16777279 ,3 ,7 ,1914905.0 ,1897134.0 > > > > > skylake ,sse2 ,16777343 ,9 ,5 ,1859937.0 ,1881939.0 > > > > > icelake ,avx512 ,4103 ,0 ,64 ,75.2 ,75.8 > > > > > icelake ,avx512 ,4111 ,0 ,3 ,56.9 ,56.4 > > > > > icelake ,avx512 ,4127 ,3 ,0 ,59.1 ,59.6 > > > > > icelake ,avx512 ,4159 ,3 ,7 ,50.7 ,51.3 > > > > > icelake ,avx512 ,4223 ,9 ,5 ,59.2 ,58.9 > > > > > icelake ,avx512 ,8199 ,0 ,64 ,67.8 ,63.9 > > > > > icelake ,avx512 ,8207 ,0 ,3 ,89.0 ,89.9 > > > > > icelake ,avx512 ,8223 ,3 ,0 ,90.2 ,90.1 > > > > > icelake ,avx512 ,8255 ,3 ,7 ,82.6 ,84.9 > > > > > icelake ,avx512 ,8319 ,9 ,5 ,91.5 ,92.8 > > > > > icelake ,avx512 ,16391 ,0 ,64 ,118.0 ,117.6 > > > > > icelake ,avx512 ,16399 ,0 ,3 ,156.5 ,157.0 > > > > > icelake ,avx512 ,16415 ,3 ,0 ,157.4 ,157.3 > > > > > icelake ,avx512 ,16447 ,3 ,7 ,151.0 ,151.6 > > > > > icelake ,avx512 ,16511 ,9 ,5 ,159.1 ,159.6 > > > > > icelake ,avx512 ,32775 ,0 ,64 ,231.8 ,230.8 > > > > > icelake ,avx512 ,32783 ,0 ,3 ,297.8 ,299.3 > > > > > icelake ,avx512 ,32799 ,3 ,0 ,299.1 ,299.0 > > > > > icelake ,avx512 ,32831 ,3 ,7 ,293.5 ,295.4 > > > > > icelake ,avx512 ,32895 ,9 ,5 ,300.3 ,302.5 > > > > > icelake ,avx512 ,65543 ,0 ,64 ,1473.4 ,1479.2 > > > > > icelake ,avx512 ,65551 ,0 ,3 ,1438.2 ,1445.3 > > > > > icelake ,avx512 ,65567 ,3 ,0 ,1450.3 ,1463.8 > > > > > icelake ,avx512 ,65599 ,3 ,7 ,1469.0 ,1473.8 > > > > > icelake ,avx512 ,65663 ,9 ,5 ,1480.0 ,1483.5 > > > > > icelake ,avx512 ,131079 ,0 ,64 ,3015.1 ,3037.5 > > > > > icelake ,avx512 ,131087 ,0 ,3 ,2952.3 ,2960.4 > > > > > icelake ,avx512 ,131103 ,3 ,0 ,2966.2 ,2964.4 > > > > > icelake ,avx512 ,131135 ,3 ,7 ,2961.6 ,3047.9 > > > > > icelake ,avx512 ,131199 ,9 ,5 ,2967.4 ,3183.8 > > > > > icelake ,avx512 ,262151 ,0 ,64 ,6206.0 
,6141.5 > > > > > icelake ,avx512 ,262159 ,0 ,3 ,5990.8 ,5959.2 > > > > > icelake ,avx512 ,262175 ,3 ,0 ,5976.7 ,5963.8 > > > > > icelake ,avx512 ,262207 ,3 ,7 ,5939.5 ,5924.3 > > > > > icelake ,avx512 ,262271 ,9 ,5 ,5944.6 ,5990.3 > > > > > icelake ,avx512 ,524295 ,0 ,64 ,14726.7 ,14307.0 > > > > > icelake ,avx512 ,524303 ,0 ,3 ,14344.2 ,14040.5 > > > > > icelake ,avx512 ,524319 ,3 ,0 ,14175.0 ,13862.2 > > > > > icelake ,avx512 ,524351 ,3 ,7 ,14261.4 ,13821.5 > > > > > icelake ,avx512 ,524415 ,9 ,5 ,14266.5 ,14064.7 > > > > > icelake ,avx512 ,1048583 ,0 ,64 ,35211.4 ,35414.6 > > > > > icelake ,avx512 ,1048591 ,0 ,3 ,35156.8 ,35591.2 > > > > > icelake ,avx512 ,1048607 ,3 ,0 ,35273.1 ,35503.3 > > > > > icelake ,avx512 ,1048639 ,3 ,7 ,35255.8 ,35725.0 > > > > > icelake ,avx512 ,1048703 ,9 ,5 ,35703.6 ,36289.9 > > > > > icelake ,avx512 ,2097159 ,0 ,64 ,72613.9 ,72063.2 > > > > > icelake ,avx512 ,2097167 ,0 ,3 ,72301.6 ,73504.2 > > > > > icelake ,avx512 ,2097183 ,3 ,0 ,73448.8 ,72133.6 > > > > > icelake ,avx512 ,2097215 ,3 ,7 ,73762.9 ,72825.8 > > > > > icelake ,avx512 ,2097279 ,9 ,5 ,72097.3 ,72914.6 > > > > > icelake ,avx512 ,4194311 ,0 ,64 ,144793.4 ,144182.1 > > > > > icelake ,avx512 ,4194319 ,0 ,3 ,143710.3 ,145063.3 > > > > > icelake ,avx512 ,4194335 ,3 ,0 ,146722.1 ,144046.4 > > > > > icelake ,avx512 ,4194367 ,3 ,7 ,144267.0 ,144874.6 > > > > > icelake ,avx512 ,4194431 ,9 ,5 ,143808.2 ,144560.0 > > > > > icelake ,avx512 ,8388615 ,0 ,64 ,427993.4 ,424521.5 > > > > > icelake ,avx512 ,8388623 ,0 ,3 ,470267.1 ,473290.8 > > > > > icelake ,avx512 ,8388639 ,3 ,0 ,457179.7 ,461797.7 > > > > > icelake ,avx512 ,8388671 ,3 ,7 ,472507.9 ,481561.4 > > > > > icelake ,avx512 ,8388735 ,9 ,5 ,463611.9 ,467388.7 > > > > > icelake ,avx512 ,16777223 ,0 ,64 ,1490426.0 ,1526996.0 > > > > > icelake ,avx512 ,16777231 ,0 ,3 ,1516687.0 ,1517095.0 > > > > > icelake ,avx512 ,16777247 ,3 ,0 ,1497688.0 ,1512766.0 > > > > > icelake ,avx512 ,16777279 ,3 ,7 ,1512331.0 ,1524317.0 > > > > > 
icelake ,avx512 ,16777343 ,9 ,5 ,1498908.0 ,1500526.0 > > > > > icelake ,avx ,4103 ,0 ,64 ,50.2 ,63.7 > > > > > icelake ,avx ,4111 ,0 ,3 ,63.7 ,65.1 > > > > > icelake ,avx ,4127 ,3 ,0 ,68.2 ,69.4 > > > > > icelake ,avx ,4159 ,3 ,7 ,59.6 ,68.0 > > > > > icelake ,avx ,4223 ,9 ,5 ,68.2 ,66.8 > > > > > icelake ,avx ,8199 ,0 ,64 ,92.1 ,89.9 > > > > > icelake ,avx ,8207 ,0 ,3 ,119.7 ,118.3 > > > > > icelake ,avx ,8223 ,3 ,0 ,119.1 ,120.9 > > > > > icelake ,avx ,8255 ,3 ,7 ,122.9 ,123.7 > > > > > icelake ,avx ,8319 ,9 ,5 ,122.1 ,121.8 > > > > > icelake ,avx ,16391 ,0 ,64 ,162.7 ,158.0 > > > > > icelake ,avx ,16399 ,0 ,3 ,227.6 ,234.1 > > > > > icelake ,avx ,16415 ,3 ,0 ,230.8 ,232.7 > > > > > icelake ,avx ,16447 ,3 ,7 ,226.8 ,232.6 > > > > > icelake ,avx ,16511 ,9 ,5 ,233.4 ,233.8 > > > > > icelake ,avx ,32775 ,0 ,64 ,312.2 ,301.8 > > > > > icelake ,avx ,32783 ,0 ,3 ,449.7 ,450.0 > > > > > icelake ,avx ,32799 ,3 ,0 ,452.7 ,455.9 > > > > > icelake ,avx ,32831 ,3 ,7 ,449.8 ,458.0 > > > > > icelake ,avx ,32895 ,9 ,5 ,456.3 ,459.4 > > > > > icelake ,avx ,65543 ,0 ,64 ,1460.6 ,1463.9 > > > > > icelake ,avx ,65551 ,0 ,3 ,1462.0 ,1465.4 > > > > > icelake ,avx ,65567 ,3 ,0 ,1466.6 ,1480.4 > > > > > icelake ,avx ,65599 ,3 ,7 ,1488.0 ,1488.9 > > > > > icelake ,avx ,65663 ,9 ,5 ,1680.8 ,1499.5 > > > > > icelake ,avx ,131079 ,0 ,64 ,2988.5 ,3010.1 > > > > > icelake ,avx ,131087 ,0 ,3 ,2995.5 ,2996.4 > > > > > icelake ,avx ,131103 ,3 ,0 ,3006.2 ,3000.5 > > > > > icelake ,avx ,131135 ,3 ,7 ,3032.4 ,3073.7 > > > > > icelake ,avx ,131199 ,9 ,5 ,3010.4 ,3027.4 > > > > > icelake ,avx ,262151 ,0 ,64 ,6143.2 ,6079.1 > > > > > icelake ,avx ,262159 ,0 ,3 ,6085.1 ,6075.8 > > > > > icelake ,avx ,262175 ,3 ,0 ,6088.0 ,6064.9 > > > > > icelake ,avx ,262207 ,3 ,7 ,6018.7 ,6023.5 > > > > > icelake ,avx ,262271 ,9 ,5 ,6019.8 ,5959.2 > > > > > icelake ,avx ,524295 ,0 ,64 ,14464.2 ,14095.1 > > > > > icelake ,avx ,524303 ,0 ,3 ,14761.6 ,14050.2 > > > > > icelake ,avx ,524319 ,3 ,0 ,14534.1 ,14087.5 > > 
> > > icelake ,avx ,524351 ,3 ,7 ,14147.7 ,13903.8 > > > > > icelake ,avx ,524415 ,9 ,5 ,14157.0 ,13982.9 > > > > > icelake ,avx ,1048583 ,0 ,64 ,36599.0 ,37461.4 > > > > > icelake ,avx ,1048591 ,0 ,3 ,36717.8 ,37454.9 > > > > > icelake ,avx ,1048607 ,3 ,0 ,36821.2 ,37343.3 > > > > > icelake ,avx ,1048639 ,3 ,7 ,36958.0 ,37507.2 > > > > > icelake ,avx ,1048703 ,9 ,5 ,36869.2 ,37413.1 > > > > > icelake ,avx ,2097159 ,0 ,64 ,74765.8 ,75330.9 > > > > > icelake ,avx ,2097167 ,0 ,3 ,75175.4 ,74891.9 > > > > > icelake ,avx ,2097183 ,3 ,0 ,75451.4 ,74787.7 > > > > > icelake ,avx ,2097215 ,3 ,7 ,75394.8 ,75839.1 > > > > > icelake ,avx ,2097279 ,9 ,5 ,75099.2 ,75421.2 > > > > > icelake ,avx ,4194311 ,0 ,64 ,146809.6 ,146619.4 > > > > > icelake ,avx ,4194319 ,0 ,3 ,148866.4 ,149898.2 > > > > > icelake ,avx ,4194335 ,3 ,0 ,148719.7 ,150165.4 > > > > > icelake ,avx ,4194367 ,3 ,7 ,150600.1 ,150925.9 > > > > > icelake ,avx ,4194431 ,9 ,5 ,149457.3 ,150519.2 > > > > > icelake ,avx ,8388615 ,0 ,64 ,412709.8 ,423666.1 > > > > > icelake ,avx ,8388623 ,0 ,3 ,423717.4 ,424418.2 > > > > > icelake ,avx ,8388639 ,3 ,0 ,414387.5 ,413445.6 > > > > > icelake ,avx ,8388671 ,3 ,7 ,449010.7 ,417553.5 > > > > > icelake ,avx ,8388735 ,9 ,5 ,414128.6 ,411815.3 > > > > > icelake ,avx ,16777223 ,0 ,64 ,1490032.0 ,1510004.0 > > > > > icelake ,avx ,16777231 ,0 ,3 ,1379638.0 ,1422097.0 > > > > > icelake ,avx ,16777247 ,3 ,0 ,1418930.0 ,1367557.0 > > > > > icelake ,avx ,16777279 ,3 ,7 ,1515152.0 ,1500176.0 > > > > > icelake ,avx ,16777343 ,9 ,5 ,1344117.0 ,1411795.0 > > > > > icelake ,sse2 ,4103 ,0 ,64 ,113.2 ,114.6 > > > > > icelake ,sse2 ,4111 ,0 ,3 ,121.5 ,120.4 > > > > > icelake ,sse2 ,4127 ,3 ,0 ,1700.5 ,1771.5 > > > > > icelake ,sse2 ,4159 ,3 ,7 ,119.3 ,118.8 > > > > > icelake ,sse2 ,4223 ,9 ,5 ,1739.7 ,1735.2 > > > > > icelake ,sse2 ,8199 ,0 ,64 ,207.0 ,203.9 > > > > > icelake ,sse2 ,8207 ,0 ,3 ,225.5 ,220.8 > > > > > icelake ,sse2 ,8223 ,3 ,0 ,3444.3 ,3743.5 > > > > > icelake ,sse2 ,8255 ,3 ,7 
,219.9 ,216.8 > > > > > icelake ,sse2 ,8319 ,9 ,5 ,4117.1 ,3487.3 > > > > > icelake ,sse2 ,16391 ,0 ,64 ,397.1 ,394.3 > > > > > icelake ,sse2 ,16399 ,0 ,3 ,439.6 ,428.6 > > > > > icelake ,sse2 ,16415 ,3 ,0 ,6997.0 ,7031.2 > > > > > icelake ,sse2 ,16447 ,3 ,7 ,426.8 ,421.8 > > > > > icelake ,sse2 ,16511 ,9 ,5 ,7037.6 ,7038.3 > > > > > icelake ,sse2 ,32775 ,0 ,64 ,790.9 ,779.0 > > > > > icelake ,sse2 ,32783 ,0 ,3 ,863.1 ,849.6 > > > > > icelake ,sse2 ,32799 ,3 ,0 ,14043.0 ,14390.9 > > > > > icelake ,sse2 ,32831 ,3 ,7 ,841.6 ,833.1 > > > > > icelake ,sse2 ,32895 ,9 ,5 ,14277.6 ,14344.2 > > > > > icelake ,sse2 ,65543 ,0 ,64 ,1897.0 ,1897.3 > > > > > icelake ,sse2 ,65551 ,0 ,3 ,1927.1 ,1955.4 > > > > > icelake ,sse2 ,65567 ,3 ,0 ,28834.7 ,28727.8 > > > > > icelake ,sse2 ,65599 ,3 ,7 ,1961.4 ,1969.7 > > > > > icelake ,sse2 ,65663 ,9 ,5 ,28867.6 ,29019.8 > > > > > icelake ,sse2 ,131079 ,0 ,64 ,3879.3 ,3872.6 > > > > > icelake ,sse2 ,131087 ,0 ,3 ,3955.3 ,3990.7 > > > > > icelake ,sse2 ,131103 ,3 ,0 ,58001.8 ,60567.9 > > > > > icelake ,sse2 ,131135 ,3 ,7 ,3951.5 ,4002.6 > > > > > icelake ,sse2 ,131199 ,9 ,5 ,57886.7 ,58391.4 > > > > > icelake ,sse2 ,262151 ,0 ,64 ,7851.4 ,7894.7 > > > > > icelake ,sse2 ,262159 ,0 ,3 ,7947.5 ,8016.2 > > > > > icelake ,sse2 ,262175 ,3 ,0 ,115036.2 ,115968.6 > > > > > icelake ,sse2 ,262207 ,3 ,7 ,7883.9 ,7814.1 > > > > > icelake ,sse2 ,262271 ,9 ,5 ,113776.4 ,119733.6 > > > > > icelake ,sse2 ,524295 ,0 ,64 ,17198.1 ,16974.9 > > > > > icelake ,sse2 ,524303 ,0 ,3 ,17402.2 ,17096.3 > > > > > icelake ,sse2 ,524319 ,3 ,0 ,223980.4 ,225889.9 > > > > > icelake ,sse2 ,524351 ,3 ,7 ,17034.9 ,16910.3 > > > > > icelake ,sse2 ,524415 ,9 ,5 ,224027.7 ,224962.5 > > > > > icelake ,sse2 ,1048583 ,0 ,64 ,38822.3 ,39178.6 > > > > > icelake ,sse2 ,1048591 ,0 ,3 ,41686.7 ,40247.4 > > > > > icelake ,sse2 ,1048607 ,3 ,0 ,38814.8 ,39323.3 > > > > > icelake ,sse2 ,1048639 ,3 ,7 ,39568.3 ,41325.7 > > > > > icelake ,sse2 ,1048703 ,9 ,5 ,39354.2 ,39637.9 > > > > > 
icelake ,sse2 ,2097159 ,0 ,64 ,84074.7 ,84543.1 > > > > > icelake ,sse2 ,2097167 ,0 ,3 ,83665.7 ,82358.2 > > > > > icelake ,sse2 ,2097183 ,3 ,0 ,81817.8 ,79638.9 > > > > > icelake ,sse2 ,2097215 ,3 ,7 ,83649.1 ,83497.6 > > > > > icelake ,sse2 ,2097279 ,9 ,5 ,80287.6 ,79980.9 > > > > > icelake ,sse2 ,4194311 ,0 ,64 ,165409.8 ,168343.1 > > > > > icelake ,sse2 ,4194319 ,0 ,3 ,165216.7 ,177632.0 > > > > > icelake ,sse2 ,4194335 ,3 ,0 ,158718.7 ,160342.2 > > > > > icelake ,sse2 ,4194367 ,3 ,7 ,167944.9 ,167204.4 > > > > > icelake ,sse2 ,4194431 ,9 ,5 ,161530.1 ,164839.7 > > > > > icelake ,sse2 ,8388615 ,0 ,64 ,626504.3 ,629858.5 > > > > > icelake ,sse2 ,8388623 ,0 ,3 ,623969.5 ,631509.1 > > > > > icelake ,sse2 ,8388639 ,3 ,0 ,599366.7 ,600016.0 > > > > > icelake ,sse2 ,8388671 ,3 ,7 ,619964.2 ,619113.2 > > > > > icelake ,sse2 ,8388735 ,9 ,5 ,595338.1 ,604172.4 > > > > > icelake ,sse2 ,16777223 ,0 ,64 ,1709597.0 ,1725184.0 > > > > > icelake ,sse2 ,16777231 ,0 ,3 ,1725452.0 ,1719746.0 > > > > > icelake ,sse2 ,16777247 ,3 ,0 ,1614269.0 ,1607164.0 > > > > > icelake ,sse2 ,16777279 ,3 ,7 ,1705295.0 ,1733018.0 > > > > > icelake ,sse2 ,16777343 ,9 ,5 ,1604197.0 ,1595690.0 > > > > > > > > > > > > > > > .../multiarch/memmove-vec-unaligned-erms.S | 338 ++++++++++++++---- > > > > > 1 file changed, 265 insertions(+), 73 deletions(-) > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > > > > > index 897a3d9762..5e4a071f16 100644 > > > > > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > > > > > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > > > > > @@ -35,7 +35,16 @@ > > > > > __x86_rep_movsb_stop_threshold, then REP MOVSB will be used. > > > > > 7. If size >= __x86_shared_non_temporal_threshold and there is no > > > > > overlap between destination and source, use non-temporal store > > > > > - instead of aligned store. 
*/ > > > > > + instead of aligned store copying from either 2 or 4 pages at > > > > > + once. > > > > > + 8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold > > > > > + and source and destination do not page alias, copy from 2 pages > > > > > + at once using non-temporal stores. Page aliasing in this case is > > > > > + considered true if destination's page alignment - sources' page > > > > > + alignment is less than 8 * VEC_SIZE. > > > > > + 9. If size >= 16 * __x86_shared_non_temporal_threshold or source > > > > > + and destination do page alias copy from 4 pages at once using > > > > > + non-temporal stores. */ > > > > > > > > > > #include <sysdep.h> > > > > > > > > > > @@ -67,6 +76,34 @@ > > > > > # endif > > > > > #endif > > > > > > > > > > +#ifndef PAGE_SIZE > > > > > +# define PAGE_SIZE 4096 > > > > > +#endif > > > > > + > > > > > +#if PAGE_SIZE != 4096 > > > > > +# error Unsupported PAGE_SIZE > > > > > +#endif > > > > > + > > > > > +#ifndef LOG_PAGE_SIZE > > > > > +# define LOG_PAGE_SIZE 12 > > > > > +#endif > > > > > + > > > > > +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE) > > > > > +# error Invalid LOG_PAGE_SIZE > > > > > +#endif > > > > > + > > > > > +/* Byte per page for large_memcpy inner loop. */ > > > > > +#if VEC_SIZE == 64 > > > > > +# define LARGE_LOAD_SIZE (VEC_SIZE * 2) > > > > > +#else > > > > > +# define LARGE_LOAD_SIZE (VEC_SIZE * 4) > > > > > +#endif > > > > > + > > > > > +/* Amount to shift rdx by to compare for memcpy_large_4x. */ > > > > > +#ifndef LOG_4X_MEMCPY_THRESH > > > > > +# define LOG_4X_MEMCPY_THRESH 4 > > > > > +#endif > > > > > + > > > > > /* Avoid short distance rep movsb only with non-SSE vector. */ > > > > > #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB > > > > > # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) > > > > > @@ -106,6 +143,28 @@ > > > > > # error Unsupported PREFETCH_SIZE! 
> > > > > #endif > > > > > > > > > > +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2) > > > > > +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ > > > > > + VMOVU (offset)base, vec0; \ > > > > > + VMOVU ((offset) + VEC_SIZE)base, vec1; > > > > > +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \ > > > > > + VMOVNT vec0, (offset)base; \ > > > > > + VMOVNT vec1, ((offset) + VEC_SIZE)base; > > > > > +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) > > > > > +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ > > > > > + VMOVU (offset)base, vec0; \ > > > > > + VMOVU ((offset) + VEC_SIZE)base, vec1; \ > > > > > + VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ > > > > > + VMOVU ((offset) + VEC_SIZE * 3)base, vec3; > > > > > +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ > > > > > + VMOVNT vec0, (offset)base; \ > > > > > + VMOVNT vec1, ((offset) + VEC_SIZE)base; \ > > > > > + VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ > > > > > + VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; > > > > > +#else > > > > > +# error Invalid LARGE_LOAD_SIZE > > > > > +#endif > > > > > + > > > > > #ifndef SECTION > > > > > # error SECTION is not defined! > > > > > #endif > > > > > @@ -393,6 +452,15 @@ L(last_4x_vec): > > > > > VZEROUPPER_RETURN > > > > > > > > > > L(more_8x_vec): > > > > > + /* Check if non-temporal move candidate. */ > > > > > +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > > > > + /* Check non-temporal store threshold. */ > > > > > + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > > > > > + ja L(large_memcpy_2x) > > > > > +#endif > > > > > + /* Entry if rdx is greater than non-temporal threshold but there > > > > > + is overlap. */ > > > > > +L(more_8x_vec_check): > > > > > cmpq %rsi, %rdi > > > > > ja L(more_8x_vec_backward) > > > > > /* Source == destination is less common. */ > > > > > @@ -419,24 +487,21 @@ L(more_8x_vec): > > > > > subq %r8, %rdi > > > > > /* Adjust length. 
*/ > > > > > addq %r8, %rdx > > > > > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > > > > - /* Check non-temporal store threshold. */ > > > > > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > > > > > - ja L(large_forward) > > > > > -#endif > > > > > + > > > > > + .p2align 4 > > > > > L(loop_4x_vec_forward): > > > > > /* Copy 4 * VEC a time forward. */ > > > > > VMOVU (%rsi), %VEC(0) > > > > > VMOVU VEC_SIZE(%rsi), %VEC(1) > > > > > VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > > > > > VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > > > > > - addq $(VEC_SIZE * 4), %rsi > > > > > - subq $(VEC_SIZE * 4), %rdx > > > > > + subq $-(VEC_SIZE * 4), %rsi > > > > > + addq $-(VEC_SIZE * 4), %rdx > > > > > VMOVA %VEC(0), (%rdi) > > > > > VMOVA %VEC(1), VEC_SIZE(%rdi) > > > > > VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > > > > > VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > > > > > - addq $(VEC_SIZE * 4), %rdi > > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > > cmpq $(VEC_SIZE * 4), %rdx > > > > > ja L(loop_4x_vec_forward) > > > > > /* Store the last 4 * VEC. */ > > > > > @@ -470,24 +535,21 @@ L(more_8x_vec_backward): > > > > > subq %r8, %r9 > > > > > /* Adjust length. */ > > > > > subq %r8, %rdx > > > > > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > > > > - /* Check non-temporal store threshold. */ > > > > > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > > > > > - ja L(large_backward) > > > > > -#endif > > > > > + > > > > > + .p2align 4 > > > > > L(loop_4x_vec_backward): > > > > > /* Copy 4 * VEC a time backward. 
*/ > > > > > VMOVU (%rcx), %VEC(0) > > > > > VMOVU -VEC_SIZE(%rcx), %VEC(1) > > > > > VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) > > > > > VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) > > > > > - subq $(VEC_SIZE * 4), %rcx > > > > > - subq $(VEC_SIZE * 4), %rdx > > > > > + addq $-(VEC_SIZE * 4), %rcx > > > > > + addq $-(VEC_SIZE * 4), %rdx > > > > > VMOVA %VEC(0), (%r9) > > > > > VMOVA %VEC(1), -VEC_SIZE(%r9) > > > > > VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) > > > > > VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) > > > > > - subq $(VEC_SIZE * 4), %r9 > > > > > + addq $-(VEC_SIZE * 4), %r9 > > > > > cmpq $(VEC_SIZE * 4), %rdx > > > > > ja L(loop_4x_vec_backward) > > > > > /* Store the first 4 * VEC. */ > > > > > @@ -500,72 +562,202 @@ L(loop_4x_vec_backward): > > > > > VZEROUPPER_RETURN > > > > > > > > > > #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > > > > -L(large_forward): > > > > > + .p2align 4 > > > > > +L(large_memcpy_2x): > > > > > + /* Compute absolute value of difference between source and > > > > > + destination. */ > > > > > + movq %rdi, %r9 > > > > > + subq %rsi, %r9 > > > > > + movq %r9, %r8 > > > > > + leaq -1(%r9), %rcx > > > > > + sarq $63, %r8 > > > > > + xorq %r8, %r9 > > > > > + subq %r8, %r9 > > > > > /* Don't use non-temporal store if there is overlap between > > > > > - destination and source since destination may be in cache > > > > > - when source is loaded. */ > > > > > - leaq (%rdi, %rdx), %r10 > > > > > - cmpq %r10, %rsi > > > > > - jb L(loop_4x_vec_forward) > > > > > -L(loop_large_forward): > > > > > + destination and source since destination may be in cache when > > > > > + source is loaded. */ > > > > > + cmpq %r9, %rdx > > > > > + ja L(more_8x_vec_check) > > > > > + > > > > > + /* Cache align destination. First store the first 64 bytes then > > > > > + adjust alignments. 
*/ > > > > > + VMOVU (%rsi), %VEC(8) > > > > > +#if VEC_SIZE < 64 > > > > > + VMOVU VEC_SIZE(%rsi), %VEC(9) > > > > > +#if VEC_SIZE < 32 > > > > > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) > > > > > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) > > > > > +#endif > > > > > +#endif > > > > > + VMOVU %VEC(8), (%rdi) > > > > > +#if VEC_SIZE < 64 > > > > > + VMOVU %VEC(9), VEC_SIZE(%rdi) > > > > > +#if VEC_SIZE < 32 > > > > > + VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) > > > > > + VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) > > > > > +#endif > > > > > +#endif > > > > > + /* Adjust source, destination, and size. */ > > > > > + movq %rdi, %r8 > > > > > + andq $63, %r8 > > > > > + /* Get the negative of offset for alignment. */ > > > > > + subq $64, %r8 > > > > > + /* Adjust source. */ > > > > > + subq %r8, %rsi > > > > > + /* Adjust destination which should be aligned now. */ > > > > > + subq %r8, %rdi > > > > > + /* Adjust length. */ > > > > > + addq %r8, %rdx > > > > > + > > > > > + /* Test if source and destination addresses will alias. If they do > > > > > + the larger pipeline in large_memcpy_4x alleviated the > > > > > + performance drop. */ > > > > > + testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx > > > > > + jz L(large_memcpy_4x) > > > > > + > > > > > + movq %rdx, %r10 > > > > > + shrq $LOG_4X_MEMCPY_THRESH, %r10 > > > > > + cmp __x86_shared_non_temporal_threshold(%rip), %r10 > > > > > + jae L(large_memcpy_4x) > > > > > + > > > > > + /* edx will store remainder size for copying tail. */ > > > > > + andl $(PAGE_SIZE * 2 - 1), %edx > > > > > + /* r10 stores outer loop counter. */ > > > > > + shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 > > > > > + /* Copy 4x VEC at a time from 2 pages. */ > > > > > + .p2align 4 > > > > > +L(loop_large_memcpy_2x_outer): > > > > > + /* ecx stores inner loop counter. 
*/ > > > > > + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx > > > > > +L(loop_large_memcpy_2x_inner): > > > > > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) > > > > > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) > > > > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) > > > > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) > > > > > + /* Load vectors from rsi. */ > > > > > + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > > > > + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > > > > + subq $-LARGE_LOAD_SIZE, %rsi > > > > > + /* Non-temporal store vectors to rdi. */ > > > > > + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > > > > + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > > > > + subq $-LARGE_LOAD_SIZE, %rdi > > > > > + decl %ecx > > > > > + jnz L(loop_large_memcpy_2x_inner) > > > > > + addq $PAGE_SIZE, %rdi > > > > > + addq $PAGE_SIZE, %rsi > > > > > + decq %r10 > > > > > + jne L(loop_large_memcpy_2x_outer) > > > > > + sfence > > > > > + > > > > > + /* Check if only last 4 loads are needed. */ > > > > > + cmpl $(VEC_SIZE * 4), %edx > > > > > + jbe L(large_memcpy_2x_end) > > > > > + > > > > > + /* Handle the last 2 * PAGE_SIZE bytes. */ > > > > > +L(loop_large_memcpy_2x_tail): > > > > > /* Copy 4 * VEC a time forward with non-temporal stores. 
*/ > > > > > - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) > > > > > - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) > > > > > + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) > > > > > + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) > > > > > VMOVU (%rsi), %VEC(0) > > > > > VMOVU VEC_SIZE(%rsi), %VEC(1) > > > > > VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > > > > > VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > > > > > - addq $PREFETCHED_LOAD_SIZE, %rsi > > > > > - subq $PREFETCHED_LOAD_SIZE, %rdx > > > > > - VMOVNT %VEC(0), (%rdi) > > > > > - VMOVNT %VEC(1), VEC_SIZE(%rdi) > > > > > - VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) > > > > > - VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) > > > > > - addq $PREFETCHED_LOAD_SIZE, %rdi > > > > > - cmpq $PREFETCHED_LOAD_SIZE, %rdx > > > > > - ja L(loop_large_forward) > > > > > - sfence > > > > > + subq $-(VEC_SIZE * 4), %rsi > > > > > + addl $-(VEC_SIZE * 4), %edx > > > > > + VMOVA %VEC(0), (%rdi) > > > > > + VMOVA %VEC(1), VEC_SIZE(%rdi) > > > > > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > > > > > + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > > + cmpl $(VEC_SIZE * 4), %edx > > > > > + ja L(loop_large_memcpy_2x_tail) > > > > > + > > > > > +L(large_memcpy_2x_end): > > > > > /* Store the last 4 * VEC. */ > > > > > - VMOVU %VEC(5), (%rcx) > > > > > - VMOVU %VEC(6), -VEC_SIZE(%rcx) > > > > > - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) > > > > > - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) > > > > > - /* Store the first VEC. 
*/ > > > > > - VMOVU %VEC(4), (%r11) > > > > > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) > > > > > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) > > > > > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) > > > > > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) > > > > > + > > > > > + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) > > > > > + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) > > > > > + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) > > > > > + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) > > > > > VZEROUPPER_RETURN > > > > > > > > > > -L(large_backward): > > > > > - /* Don't use non-temporal store if there is overlap between > > > > > - destination and source since destination may be in cache > > > > > - when source is loaded. */ > > > > > - leaq (%rcx, %rdx), %r10 > > > > > - cmpq %r10, %r9 > > > > > - jb L(loop_4x_vec_backward) > > > > > -L(loop_large_backward): > > > > > - /* Copy 4 * VEC a time backward with non-temporal stores. */ > > > > > - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) > > > > > - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) > > > > > - VMOVU (%rcx), %VEC(0) > > > > > - VMOVU -VEC_SIZE(%rcx), %VEC(1) > > > > > - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) > > > > > - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) > > > > > - subq $PREFETCHED_LOAD_SIZE, %rcx > > > > > - subq $PREFETCHED_LOAD_SIZE, %rdx > > > > > - VMOVNT %VEC(0), (%r9) > > > > > - VMOVNT %VEC(1), -VEC_SIZE(%r9) > > > > > - VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) > > > > > - VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) > > > > > - subq $PREFETCHED_LOAD_SIZE, %r9 > > > > > - cmpq $PREFETCHED_LOAD_SIZE, %rdx > > > > > - ja L(loop_large_backward) > > > > > + .p2align 4 > > > > > +L(large_memcpy_4x): > > > > > + movq %rdx, %r10 > > > > > + /* edx will store remainder size for copying tail. */ > > > > > + andl $(PAGE_SIZE * 4 - 1), %edx > > > > > + /* r10 stores outer loop counter. */ > > > > > + shrq $(LOG_PAGE_SIZE + 2), %r10 > > > > > + /* Copy 4x VEC at a time from 4 pages. 
*/ > > > > > + .p2align 4 > > > > > +L(loop_large_memcpy_4x_outer): > > > > > + /* ecx stores inner loop counter. */ > > > > > + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx > > > > > +L(loop_large_memcpy_4x_inner): > > > > > + /* Only one prefetch set per page as doing 4 pages give more time > > > > > + for prefetcher to keep up. */ > > > > > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) > > > > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) > > > > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) > > > > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) > > > > > + /* Load vectors from rsi. */ > > > > > + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > > > > + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > > > > + LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) > > > > > + LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) > > > > > + subq $-LARGE_LOAD_SIZE, %rsi > > > > > + /* Non-temporal store vectors to rdi. */ > > > > > + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > > > > + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > > > > + STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) > > > > > + STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) > > > > > + subq $-LARGE_LOAD_SIZE, %rdi > > > > > + decl %ecx > > > > > + jnz L(loop_large_memcpy_4x_inner) > > > > > + addq $(PAGE_SIZE * 3), %rdi > > > > > + addq $(PAGE_SIZE * 3), %rsi > > > > > + decq %r10 > > > > > + jne L(loop_large_memcpy_4x_outer) > > > > > sfence > > > > > - /* Store the first 4 * VEC. */ > > > > > - VMOVU %VEC(4), (%rdi) > > > > > - VMOVU %VEC(5), VEC_SIZE(%rdi) > > > > > - VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) > > > > > - VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) > > > > > - /* Store the last VEC. 
*/ > > > > > - VMOVU %VEC(8), (%r11) > > > > > + /* Check if only last 4 loads are needed. */ > > > > > + cmpl $(VEC_SIZE * 4), %edx > > > > > + jbe L(large_memcpy_4x_end) > > > > > + > > > > > + /* Handle the last 4 * PAGE_SIZE bytes. */ > > > > > +L(loop_large_memcpy_4x_tail): > > > > > + /* Copy 4 * VEC a time forward with non-temporal stores. */ > > > > > + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) > > > > > + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) > > > > > + VMOVU (%rsi), %VEC(0) > > > > > + VMOVU VEC_SIZE(%rsi), %VEC(1) > > > > > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > > > > > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > > > > > + subq $-(VEC_SIZE * 4), %rsi > > > > > + addl $-(VEC_SIZE * 4), %edx > > > > > + VMOVA %VEC(0), (%rdi) > > > > > + VMOVA %VEC(1), VEC_SIZE(%rdi) > > > > > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > > > > > + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > > + cmpl $(VEC_SIZE * 4), %edx > > > > > + ja L(loop_large_memcpy_4x_tail) > > > > > + > > > > > +L(large_memcpy_4x_end): > > > > > + /* Store the last 4 * VEC. */ > > > > > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) > > > > > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) > > > > > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) > > > > > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) > > > > > + > > > > > + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) > > > > > + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) > > > > > + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) > > > > > + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) > > > > > VZEROUPPER_RETURN > > > > > #endif > > > > > END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) > > > > > -- > > > > > 2.29.2 > > > > > > > > > > > > > LGTM. Please commit it. > > > > > > > > Thanks. > > > > > > > > > > > > H.J. > > > > > > > > -- > > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index 897a3d9762..5e4a071f16 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -35,7 +35,16 @@ __x86_rep_movsb_stop_threshold, then REP MOVSB will be used. 7. If size >= __x86_shared_non_temporal_threshold and there is no overlap between destination and source, use non-temporal store - instead of aligned store. */ + instead of aligned store copying from either 2 or 4 pages at + once. + 8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold + and source and destination do not page alias, copy from 2 pages + at once using non-temporal stores. Page aliasing in this case is + considered true if destination's page alignment - source's page + alignment is less than 8 * VEC_SIZE. + 9. If size >= 16 * __x86_shared_non_temporal_threshold or source + and destination do page alias, copy from 4 pages at once using + non-temporal stores. */ #include <sysdep.h> @@ -67,6 +76,34 @@ # endif #endif +#ifndef PAGE_SIZE +# define PAGE_SIZE 4096 +#endif + +#if PAGE_SIZE != 4096 +# error Unsupported PAGE_SIZE +#endif + +#ifndef LOG_PAGE_SIZE +# define LOG_PAGE_SIZE 12 +#endif + +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE) +# error Invalid LOG_PAGE_SIZE +#endif + +/* Bytes per page for large_memcpy inner loop. */ +#if VEC_SIZE == 64 +# define LARGE_LOAD_SIZE (VEC_SIZE * 2) +#else +# define LARGE_LOAD_SIZE (VEC_SIZE * 4) +#endif + +/* Amount to shift rdx by to compare for memcpy_large_4x. */ +#ifndef LOG_4X_MEMCPY_THRESH +# define LOG_4X_MEMCPY_THRESH 4 +#endif + /* Avoid short distance rep movsb only with non-SSE vector. */ #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) @@ -106,6 +143,28 @@ # error Unsupported PREFETCH_SIZE! #endif +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2) +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) 
\ + VMOVU (offset)base, vec0; \ + VMOVU ((offset) + VEC_SIZE)base, vec1; +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \ + VMOVNT vec0, (offset)base; \ + VMOVNT vec1, ((offset) + VEC_SIZE)base; +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ + VMOVU (offset)base, vec0; \ + VMOVU ((offset) + VEC_SIZE)base, vec1; \ + VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ + VMOVU ((offset) + VEC_SIZE * 3)base, vec3; +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ + VMOVNT vec0, (offset)base; \ + VMOVNT vec1, ((offset) + VEC_SIZE)base; \ + VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ + VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; +#else +# error Invalid LARGE_LOAD_SIZE +#endif + #ifndef SECTION # error SECTION is not defined! #endif @@ -393,6 +452,15 @@ L(last_4x_vec): VZEROUPPER_RETURN L(more_8x_vec): + /* Check if non-temporal move candidate. */ +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + ja L(large_memcpy_2x) +#endif + /* Entry if rdx is greater than non-temporal threshold but there + is overlap. */ +L(more_8x_vec_check): cmpq %rsi, %rdi ja L(more_8x_vec_backward) /* Source == destination is less common. */ @@ -419,24 +487,21 @@ L(more_8x_vec): subq %r8, %rdi /* Adjust length. */ addq %r8, %rdx -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) - /* Check non-temporal store threshold. */ - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP - ja L(large_forward) -#endif + + .p2align 4 L(loop_4x_vec_forward): /* Copy 4 * VEC a time forward. 
*/ VMOVU (%rsi), %VEC(0) VMOVU VEC_SIZE(%rsi), %VEC(1) VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) - addq $(VEC_SIZE * 4), %rsi - subq $(VEC_SIZE * 4), %rdx + subq $-(VEC_SIZE * 4), %rsi + addq $-(VEC_SIZE * 4), %rdx VMOVA %VEC(0), (%rdi) VMOVA %VEC(1), VEC_SIZE(%rdi) VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) - addq $(VEC_SIZE * 4), %rdi + subq $-(VEC_SIZE * 4), %rdi cmpq $(VEC_SIZE * 4), %rdx ja L(loop_4x_vec_forward) /* Store the last 4 * VEC. */ @@ -470,24 +535,21 @@ L(more_8x_vec_backward): subq %r8, %r9 /* Adjust length. */ subq %r8, %rdx -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) - /* Check non-temporal store threshold. */ - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP - ja L(large_backward) -#endif + + .p2align 4 L(loop_4x_vec_backward): /* Copy 4 * VEC a time backward. */ VMOVU (%rcx), %VEC(0) VMOVU -VEC_SIZE(%rcx), %VEC(1) VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) - subq $(VEC_SIZE * 4), %rcx - subq $(VEC_SIZE * 4), %rdx + addq $-(VEC_SIZE * 4), %rcx + addq $-(VEC_SIZE * 4), %rdx VMOVA %VEC(0), (%r9) VMOVA %VEC(1), -VEC_SIZE(%r9) VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) - subq $(VEC_SIZE * 4), %r9 + addq $-(VEC_SIZE * 4), %r9 cmpq $(VEC_SIZE * 4), %rdx ja L(loop_4x_vec_backward) /* Store the first 4 * VEC. */ @@ -500,72 +562,202 @@ L(loop_4x_vec_backward): VZEROUPPER_RETURN #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) -L(large_forward): + .p2align 4 +L(large_memcpy_2x): + /* Compute absolute value of difference between source and + destination. */ + movq %rdi, %r9 + subq %rsi, %r9 + movq %r9, %r8 + leaq -1(%r9), %rcx + sarq $63, %r8 + xorq %r8, %r9 + subq %r8, %r9 /* Don't use non-temporal store if there is overlap between - destination and source since destination may be in cache - when source is loaded. 
*/ - leaq (%rdi, %rdx), %r10 - cmpq %r10, %rsi - jb L(loop_4x_vec_forward) -L(loop_large_forward): + destination and source since destination may be in cache when + source is loaded. */ + cmpq %r9, %rdx + ja L(more_8x_vec_check) + + /* Cache align destination. First store the first 64 bytes then + adjust alignments. */ + VMOVU (%rsi), %VEC(8) +#if VEC_SIZE < 64 + VMOVU VEC_SIZE(%rsi), %VEC(9) +#if VEC_SIZE < 32 + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) +#endif +#endif + VMOVU %VEC(8), (%rdi) +#if VEC_SIZE < 64 + VMOVU %VEC(9), VEC_SIZE(%rdi) +#if VEC_SIZE < 32 + VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) +#endif +#endif + /* Adjust source, destination, and size. */ + movq %rdi, %r8 + andq $63, %r8 + /* Get the negative of offset for alignment. */ + subq $64, %r8 + /* Adjust source. */ + subq %r8, %rsi + /* Adjust destination which should be aligned now. */ + subq %r8, %rdi + /* Adjust length. */ + addq %r8, %rdx + + /* Test if source and destination addresses will alias. If they do, + the larger pipeline in large_memcpy_4x alleviates the + performance drop. */ + testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx + jz L(large_memcpy_4x) + + movq %rdx, %r10 + shrq $LOG_4X_MEMCPY_THRESH, %r10 + cmp __x86_shared_non_temporal_threshold(%rip), %r10 + jae L(large_memcpy_4x) + + /* edx will store remainder size for copying tail. */ + andl $(PAGE_SIZE * 2 - 1), %edx + /* r10 stores outer loop counter. */ + shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 + /* Copy 4x VEC at a time from 2 pages. */ + .p2align 4 +L(loop_large_memcpy_2x_outer): + /* ecx stores inner loop counter. 
*/ + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx +L(loop_large_memcpy_2x_inner): + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) + /* Load vectors from rsi. */ + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) + subq $-LARGE_LOAD_SIZE, %rsi + /* Non-temporal store vectors to rdi. */ + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) + subq $-LARGE_LOAD_SIZE, %rdi + decl %ecx + jnz L(loop_large_memcpy_2x_inner) + addq $PAGE_SIZE, %rdi + addq $PAGE_SIZE, %rsi + decq %r10 + jne L(loop_large_memcpy_2x_outer) + sfence + + /* Check if only last 4 loads are needed. */ + cmpl $(VEC_SIZE * 4), %edx + jbe L(large_memcpy_2x_end) + + /* Handle the last 2 * PAGE_SIZE bytes. */ +L(loop_large_memcpy_2x_tail): /* Copy 4 * VEC a time forward with non-temporal stores. 
*/ - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) VMOVU (%rsi), %VEC(0) VMOVU VEC_SIZE(%rsi), %VEC(1) VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) - addq $PREFETCHED_LOAD_SIZE, %rsi - subq $PREFETCHED_LOAD_SIZE, %rdx - VMOVNT %VEC(0), (%rdi) - VMOVNT %VEC(1), VEC_SIZE(%rdi) - VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) - VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) - addq $PREFETCHED_LOAD_SIZE, %rdi - cmpq $PREFETCHED_LOAD_SIZE, %rdx - ja L(loop_large_forward) - sfence + subq $-(VEC_SIZE * 4), %rsi + addl $-(VEC_SIZE * 4), %edx + VMOVA %VEC(0), (%rdi) + VMOVA %VEC(1), VEC_SIZE(%rdi) + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) + subq $-(VEC_SIZE * 4), %rdi + cmpl $(VEC_SIZE * 4), %edx + ja L(loop_large_memcpy_2x_tail) + +L(large_memcpy_2x_end): /* Store the last 4 * VEC. */ - VMOVU %VEC(5), (%rcx) - VMOVU %VEC(6), -VEC_SIZE(%rcx) - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) - /* Store the first VEC. */ - VMOVU %VEC(4), (%r11) + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) + + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) VZEROUPPER_RETURN -L(large_backward): - /* Don't use non-temporal store if there is overlap between - destination and source since destination may be in cache - when source is loaded. */ - leaq (%rcx, %rdx), %r10 - cmpq %r10, %r9 - jb L(loop_4x_vec_backward) -L(loop_large_backward): - /* Copy 4 * VEC a time backward with non-temporal stores. 
*/ - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) - VMOVU (%rcx), %VEC(0) - VMOVU -VEC_SIZE(%rcx), %VEC(1) - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) - subq $PREFETCHED_LOAD_SIZE, %rcx - subq $PREFETCHED_LOAD_SIZE, %rdx - VMOVNT %VEC(0), (%r9) - VMOVNT %VEC(1), -VEC_SIZE(%r9) - VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) - VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) - subq $PREFETCHED_LOAD_SIZE, %r9 - cmpq $PREFETCHED_LOAD_SIZE, %rdx - ja L(loop_large_backward) + .p2align 4 +L(large_memcpy_4x): + movq %rdx, %r10 + /* edx will store remainder size for copying tail. */ + andl $(PAGE_SIZE * 4 - 1), %edx + /* r10 stores outer loop counter. */ + shrq $(LOG_PAGE_SIZE + 2), %r10 + /* Copy 4x VEC at a time from 4 pages. */ + .p2align 4 +L(loop_large_memcpy_4x_outer): + /* ecx stores inner loop counter. */ + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx +L(loop_large_memcpy_4x_inner): + /* Only one prefetch set per page as doing 4 pages gives more time + for the prefetcher to keep up. */ + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) + /* Load vectors from rsi. */ + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) + LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) + LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) + subq $-LARGE_LOAD_SIZE, %rsi + /* Non-temporal store vectors to rdi. 
*/ + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) + STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) + STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) + subq $-LARGE_LOAD_SIZE, %rdi + decl %ecx + jnz L(loop_large_memcpy_4x_inner) + addq $(PAGE_SIZE * 3), %rdi + addq $(PAGE_SIZE * 3), %rsi + decq %r10 + jne L(loop_large_memcpy_4x_outer) sfence - /* Store the first 4 * VEC. */ - VMOVU %VEC(4), (%rdi) - VMOVU %VEC(5), VEC_SIZE(%rdi) - VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) - VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) - /* Store the last VEC. */ - VMOVU %VEC(8), (%r11) + /* Check if only last 4 loads are needed. */ + cmpl $(VEC_SIZE * 4), %edx + jbe L(large_memcpy_4x_end) + + /* Handle the last 4 * PAGE_SIZE bytes. */ +L(loop_large_memcpy_4x_tail): + /* Copy 4 * VEC at a time forward with aligned (temporal) stores. */ + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + subq $-(VEC_SIZE * 4), %rsi + addl $-(VEC_SIZE * 4), %edx + VMOVA %VEC(0), (%rdi) + VMOVA %VEC(1), VEC_SIZE(%rdi) + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) + subq $-(VEC_SIZE * 4), %rdi + cmpl $(VEC_SIZE * 4), %edx + ja L(loop_large_memcpy_4x_tail) + +L(large_memcpy_4x_end): + /* Store the last 4 * VEC. */ + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) + + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) VZEROUPPER_RETURN #endif END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))