
PowerPC: optimized memmove for POWER7/PPC64

Message ID 53A77FA6.5070908@linux.vnet.ibm.com
State New

Commit Message

Adhemerval Zanella June 23, 2014, 1:15 a.m. UTC
This patch adds an optimized memmove implementation for POWER7/powerpc64.
The basic idea is to use the POWER7 memcpy for non-overlapping memory
regions and an optimized backward memcpy for memory regions that overlap
(similar to the idea of string/memmove.c).
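
For reference, a minimal C sketch of that dispatch (the real code is PowerPC
assembly; memmove_sketch and its byte loops are purely illustrative, mirroring
the string/memmove.c idea rather than the optimized routine):

#include <stddef.h>
#include <stdint.h>

/* Illustrative only: a backward copy is needed exactly when dest lies
   inside [src, src + len), which a single unsigned subtraction and
   compare detects.  */
void *
memmove_sketch (void *dest, const void *src, size_t len)
{
  unsigned char *d = dest;
  const unsigned char *s = src;

  if ((uintptr_t) d - (uintptr_t) s >= len)
    {
      /* No harmful overlap: an ascending, memcpy-style copy is safe.  */
      while (len--)
        *d++ = *s++;
    }
  else
    {
      /* dest overlaps the tail of src: copy descending so every byte is
         read before it is overwritten.  */
      d += len;
      s += len;
      while (len--)
        *--d = *--s;
    }
  return dest;
}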

The backward memcpy algorithm is similar to the one used by the POWER7
memcpy, with adjustments for alignment.  The difference is that memory is
always aligned to 16 bytes before the VSX/altivec instructions are used
(I sent a patch to change this for memcpy as well).
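
A rough C sketch of the backward path's shape, assuming only that 16-byte
accesses are cheapest when the destination is 16-byte aligned;
copy_backward_sketch is hypothetical, and the 16-byte memcpy through a
temporary merely stands in for the vector load/store pairs (lxvd2x/stxvd2x,
or lvx plus vperm when source and destination alignments differ):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Backward copy sketch for the overlapping case (dest > src): peel bytes
   until the destination end pointer is 16-byte aligned, then move 16 bytes
   per iteration, then finish the remaining head bytes.  */
static void
copy_backward_sketch (unsigned char *dest, const unsigned char *src, size_t len)
{
  unsigned char *d = dest + len;        /* The copy runs from the end downwards.  */
  const unsigned char *s = src + len;

  /* 0..15 single bytes until d is 16-byte aligned.  */
  while (len > 0 && ((uintptr_t) d & 15) != 0)
    {
      *--d = *--s;
      len--;
    }

  /* Aligned 16-byte chunks.  Loading into a temporary before storing
     mirrors a vector load followed by a vector store and stays valid even
     when the 16-byte regions themselves overlap.  */
  while (len >= 16)
    {
      unsigned char tmp[16];
      d -= 16;
      s -= 16;
      memcpy (tmp, s, 16);
      memcpy (d, tmp, 16);
      len -= 16;
    }

  /* Remaining head bytes.  */
  while (len--)
    *--d = *--s;
}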

I see a large improvement in the memmove benchtest (I have attached the
output from a POWER8/LE machine) and also in code affected by compiler loop
transformations (the nbench stringsort kernel, where copy loops are turned
into memmove calls by -ftree-loop-distribute-patterns).
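
To illustrate that transformation (the loop below is hypothetical, not the
nbench code): with -ftree-loop-distribute-patterns (enabled at -O3) GCC can
replace an overlapping copy loop like this with a memmove call, so such
loops end up exercising the library routine:

/* Shift n elements of buf up by one position; buf must have room for
   n + 1 elements.  GCC may emit memmove (buf + 1, buf, n) instead of
   the loop.  */
void
shift_up (char *buf, unsigned long n)
{
  for (unsigned long i = n; i > 0; i--)
    buf[i] = buf[i - 1];
}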

Tested on powerpc64be and powerpc64le.  Ok to commit?

--

	* string/bcopy.c: Use full path to include memmove.c.
	* sysdeps/powerpc/powerpc64/multiarch/Makefile: Add memmove and bcopy
	multiarch objects.
	* sysdeps/powerpc/powerpc64/multiarch/bcopy-ppc64.c: New file: default
	bcopy for powerpc64.
	* sysdeps/powerpc/powerpc64/multiarch/bcopy.c: New file: multiarch
	bcopy for powerpc64.
	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c: Add bcopy
	and memmove implementations.
	* sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S: New file:
	optimized multiarch memmove for POWER7/powerpc64.
	* sysdeps/powerpc/powerpc64/multiarch/memmove-ppc64.c: New file:
	default multiarch memmove for powerpc64.
	* sysdeps/powerpc/powerpc64/multiarch/memmove.c: New file: memmove
	multiarch for powerpc64.
	* sysdeps/powerpc/powerpc64/power7/bcopy.c: New file: optimized bcopy
	for POWER7/powerpc64.
	* sysdeps/powerpc/powerpc64/power7/memmove.S: New file: optimized
	memmove for POWER7/powerpc64.

--
simple_memmove	__memmove_power7	__memmove_ppc
Length    1, alignment  0/32:	12.6094	13.6719	33.6094
Length    1, alignment 32/ 0:	15.2969	18.9375	33.625
Length    1, alignment  0/ 0:	14.75	19.5625	16.9062
Length    1, alignment  0/ 0:	14.7344	21.7969	23.8438
Length    2, alignment  0/32:	14.8438	17.0312	31.1875
Length    2, alignment 32/ 0:	17.9062	17.3594	26.7031
Length    2, alignment  0/ 1:	14.6719	19.6094	21.75
Length    2, alignment  1/ 0:	17.6094	17.0312	31.1406
Length    4, alignment  0/32:	19.6719	17.1719	31.25
Length    4, alignment 32/ 0:	27.25	17.4531	30.7344
Length    4, alignment  0/ 2:	19.4062	19.4688	26.5156
Length    4, alignment  2/ 0:	26.7188	16.875	31.6562
Length    8, alignment  0/32:	28.9844	12.2031	26.0156
Length    8, alignment 32/ 0:	32.2656	12.1562	26.8594
Length    8, alignment  0/ 3:	28.7969	14.9219	27.9531
Length    8, alignment  3/ 0:	31.3281	11.8125	26.5938
Length   16, alignment  0/32:	52.375	24.8438	36.1094
Length   16, alignment 32/ 0:	24.4219	23.7812	36.3438
Length   16, alignment  0/ 4:	52.5469	24.5156	50.7656
Length   16, alignment  4/ 0:	55.5312	24.2969	36.0312
Length   32, alignment  0/32:	84.0156	29.1406	45.2031
Length   32, alignment 32/ 0:	29.8125	29.1094	44.9375
Length   32, alignment  0/ 5:	89.8281	36.9062	60.6719
Length   32, alignment  5/ 0:	94.3438	24.75	38.7969
Length   64, alignment  0/32:	164.703	36.5156	45.0156
Length   64, alignment 32/ 0:	39.5	31.8125	47.8594
Length   64, alignment  0/ 6:	165.047	37.0469	65.6875
Length   64, alignment  6/ 0:	169.734	29.8906	43.5469
Length  128, alignment  0/32:	307.859	38.9531	60.0469
Length  128, alignment 32/ 0:	57.9219	36.3125	50.4688
Length  128, alignment  0/ 7:	305.266	48.6875	90.8438
Length  128, alignment  7/ 0:	309.719	34.7188	47.875
Length  256, alignment  0/32:	589.406	39.7188	73.2969
Length  256, alignment 32/ 0:	91.25	43.9688	52.1406
Length  256, alignment  0/ 8:	590.328	63.5156	69
Length  256, alignment  8/ 0:	602.344	52.1094	61.2031
Length  512, alignment  0/32:	1070.52	55.6406	125.719
Length  512, alignment 32/ 0:	179.875	49.9219	70.75
Length  512, alignment  0/ 9:	1020.77	109.469	242.828
Length  512, alignment  9/ 0:	917.125	93.1406	112.812
Length 1024, alignment  0/32:	1123.58	102.234	224.031
Length 1024, alignment 32/ 0:	318.531	95.1562	110.078
Length 1024, alignment  0/10:	953.969	175.797	344.891
Length 1024, alignment 10/ 0:	1356.89	162.578	165.453
Length 2048, alignment  0/32:	1674.66	155.359	230.078
Length 2048, alignment 32/ 0:	342.953	152.078	153.984
Length 2048, alignment  0/11:	1675.23	217.25	306.297
Length 2048, alignment 11/ 0:	1737.81	217.562	216.359
Length 4096, alignment  0/32:	3173.81	210.016	329.688
Length 4096, alignment 32/ 0:	473.594	213.719	211.75
Length 4096, alignment  0/12:	3144.8	274.328	451.375
Length 4096, alignment 12/ 0:	3236.78	272.5	277.906
Length 8192, alignment  0/32:	6112.25	274.609	507.547
Length 8192, alignment 32/ 0:	673.469	268.609	275.156
Length 8192, alignment  0/13:	6112.08	375.547	734.578
Length 8192, alignment 13/ 0:	6301.75	365.984	370.578
Length    0, alignment  0/32:	12.5938	19.3906	26.5469
Length    0, alignment 32/ 0:	10.2188	19.4844	33.6562
Length    0, alignment  0/ 0:	10.1094	13.5469	26.9844
Length    0, alignment  0/ 0:	9.95312	19.1406	33.1875
Length    1, alignment  0/32:	12.5156	19.4531	33.7031
Length    1, alignment 32/ 0:	15.4531	18.9375	33.4531
Length    1, alignment  0/ 1:	12.1875	19.1719	34.25
Length    1, alignment  1/ 0:	14.7656	19.3281	33.5469
Length    2, alignment  0/32:	14.8281	16.7188	31.1875
Length    2, alignment 32/ 0:	11.7188	13.0156	30.9531
Length    2, alignment  0/ 2:	14.4688	16.5312	30.8906
Length    2, alignment  2/ 0:	17.4375	16.875	30.8125
Length    3, alignment  0/32:	17.9688	17.0625	31.4062
Length    3, alignment 32/ 0:	20.5469	17.5781	31.3281
Length    3, alignment  0/ 3:	17.7812	17.1719	26.3125
Length    3, alignment  3/ 0:	19.7188	17.25	31.2656
Length    4, alignment  0/32:	19.4062	17.2188	31.4844
Length    4, alignment 32/ 0:	26.6094	16.9844	31.5938
Length    4, alignment  0/ 4:	19.1406	17.0312	31.25
Length    4, alignment  4/ 0:	26.5781	16.7969	31.0625
Length    5, alignment  0/32:	27.0312	17.0469	31.2031
Length    5, alignment 32/ 0:	24.9844	17.2344	31.5469
Length    5, alignment  0/ 5:	25.8906	12.4062	31.4062
Length    5, alignment  5/ 0:	24.7656	16.7656	31.0781
Length    6, alignment  0/32:	24.6562	14.9531	29.2188
Length    6, alignment 32/ 0:	27.3438	14.2969	29.2031
Length    6, alignment  0/ 6:	23.8906	14.3125	28.8125
Length    6, alignment  6/ 0:	26.7656	14.875	28.7812
Length    7, alignment  0/32:	27.2656	17.0938	31.8594
Length    7, alignment 32/ 0:	29.4219	17.0938	31.4688
Length    7, alignment  0/ 7:	26.0469	16.8125	31.6094
Length    7, alignment  7/ 0:	22.5625	12.9062	31.1406
Length    8, alignment  0/32:	29.1406	11.8906	26.2969
Length    8, alignment 32/ 0:	31.5938	12.2344	26.2656
Length    8, alignment  0/ 8:	28.9375	11.9062	26.2656
Length    8, alignment  8/ 0:	31.1719	12.0156	26.2812
Length    9, alignment  0/32:	30.9375	23.9219	35.9844
Length    9, alignment 32/ 0:	34.4688	23.9062	35.0156
Length    9, alignment  0/ 9:	31.7188	23.9844	35.375
Length    9, alignment  9/ 0:	33.9062	22.7969	33.8594
Length   10, alignment  0/32:	34.5625	22.1875	33.9062
Length   10, alignment 32/ 0:	37.7344	21.4531	33.9688
Length   10, alignment  0/10:	33.3438	21.0625	33.4844
Length   10, alignment 10/ 0:	35.7031	26.8594	39.1875
Length   11, alignment  0/32:	36.1875	21.8438	34.125
Length   11, alignment 32/ 0:	39.2812	21.3438	33.7812
Length   11, alignment  0/11:	35.4531	21.5	33.8594
Length   11, alignment 11/ 0:	38.4688	24.5	38.7031
Length   12, alignment  0/32:	38.7031	22.1875	33.8125
Length   12, alignment 32/ 0:	45.75	21.8281	33.5469
Length   12, alignment  0/12:	37.75	22.0781	34.25
Length   12, alignment 12/ 0:	45.2031	21.8281	33.9062
Length   13, alignment  0/32:	45.3906	21.7812	33.6094
Length   13, alignment 32/ 0:	47.6719	21.6562	33.4062
Length   13, alignment  0/13:	45.8281	21.6406	26.7812
Length   13, alignment 13/ 0:	48.2812	23.9219	36.5781
Length   14, alignment  0/32:	47.7969	21.7812	30.8281
Length   14, alignment 32/ 0:	50.1719	21.5469	31.1406
Length   14, alignment  0/14:	47.375	21.2656	31.7656
Length   14, alignment 14/ 0:	50.2969	24.0312	35.8438
Length   15, alignment  0/32:	49.6562	21.8125	33.375
Length   15, alignment 32/ 0:	52.6406	21.2812	33.8125
Length   15, alignment  0/15:	49.9844	22.0312	33.6719
Length   15, alignment 15/ 0:	53.8125	25.7812	35.9844
Length   16, alignment  0/32:	52.375	24.0781	36.0781
Length   16, alignment 32/ 0:	24.8281	23.8281	36.0156
Length   16, alignment  0/16:	52.4219	24.0156	35.9062
Length   16, alignment 16/ 0:	24.6875	23.7969	35.6406
Length   17, alignment  0/32:	54.25	23.8281	35.3906
Length   17, alignment 32/ 0:	24.4375	24.2031	35.8594
Length   17, alignment  0/17:	54.9688	23.5312	36.0312
Length   17, alignment 17/ 0:	24.5781	24.3594	33.8594
Length   18, alignment  0/32:	56.9688	21.7344	33.625
Length   18, alignment 32/ 0:	27.375	21.6719	33.6719
Length   18, alignment  0/18:	58.375	21.9844	33.125
Length   18, alignment 18/ 0:	27.2031	27.4844	38.3125
Length   19, alignment  0/32:	54.5	21.5781	33.5312
Length   19, alignment 32/ 0:	27.125	21.4531	33.1875
Length   19, alignment  0/19:	54.625	22.1406	34.125
Length   19, alignment 19/ 0:	26.7656	25.0625	38.6875
Length   20, alignment  0/32:	57.2812	21.8125	33.6406
Length   20, alignment 32/ 0:	29.2344	21.8125	33.2656
Length   20, alignment  0/20:	57.2969	21.3438	33.6562
Length   20, alignment 20/ 0:	28.5469	21.6719	33.9219
Length   21, alignment  0/32:	58.7656	21.5781	33.5469
Length   21, alignment 32/ 0:	29.0781	22.125	33.4688
Length   21, alignment  0/21:	66.8906	21.875	33.9844
Length   21, alignment 21/ 0:	29	24.8438	38.375
Length   22, alignment  0/32:	66.7812	19.5156	31.2969
Length   22, alignment 32/ 0:	31.875	19.2969	31.1719
Length   22, alignment  0/22:	66.6719	19.4688	31.0625
Length   22, alignment 22/ 0:	32.1094	24.6562	36.4531
Length   23, alignment  0/32:	68.8594	21.8125	33.875
Length   23, alignment 32/ 0:	31.1562	21.3906	33.7812
Length   23, alignment  0/23:	68.5156	21.9375	33.6562
Length   23, alignment 23/ 0:	31.625	22.3125	35.7656
Length   24, alignment  0/32:	70.8281	24.0781	36.0156
Length   24, alignment 32/ 0:	33.625	24.2188	35.8594
Length   24, alignment  0/24:	71.0312	23.7656	35.9219
Length   24, alignment 24/ 0:	34.3281	24.0312	35.5
Length   25, alignment  0/32:	73.3125	20.2812	26.9062
Length   25, alignment 32/ 0:	33.9844	24.0156	35.4375
Length   25, alignment  0/25:	73.2188	24.5	36.5156
Length   25, alignment 25/ 0:	34.0781	22.3281	36.1406
Length   26, alignment  0/32:	75.8281	22.1406	33.5938
Length   26, alignment 32/ 0:	36.2031	21.8438	32.9844
Length   26, alignment  0/26:	75.8594	21.6719	32.9062
Length   26, alignment 26/ 0:	36.4375	26.7656	39.0312
Length   27, alignment  0/32:	77.7656	21.6875	33.9688
Length   27, alignment 32/ 0:	36.2812	21.375	33.1562
Length   27, alignment  0/27:	78.1406	21.9688	33.625
Length   27, alignment 27/ 0:	36.25	24.4375	38.125
Length   28, alignment  0/32:	80.6719	15.625	33.3594
Length   28, alignment 32/ 0:	38.6562	21.1875	33.4688
Length   28, alignment  0/28:	80.3438	21.25	33.9688
Length   28, alignment 28/ 0:	38.2344	22.0156	33.1562
Length   29, alignment  0/32:	82.2969	21.5156	33.5781
Length   29, alignment 32/ 0:	38.125	21.4219	33.2344
Length   29, alignment  0/29:	80.5469	21.5	33.2188
Length   29, alignment 29/ 0:	84.5156	24.4531	38.1406
Length   30, alignment  0/32:	84.7344	25.9375	33.5781
Length   30, alignment 32/ 0:	40.7188	21.875	33.9688
Length   30, alignment  0/30:	85.0625	21.5	33.0312
Length   30, alignment 30/ 0:	40.4688	24.7031	36.0156
Length   31, alignment  0/32:	87.7344	21.3594	33.8594
Length   31, alignment 32/ 0:	38.3281	21.7031	33.6406
Length   31, alignment  0/31:	87.9844	21.4688	33.5469
Length   31, alignment 31/ 0:	38.3281	24.3594	38.5625
Length   48, alignment  0/32:	126.859	32.0469	48.5938
Length   48, alignment 32/ 0:	34.375	27.1562	42.9219
Length   48, alignment  0/ 3:	127.062	38.9219	64.875
Length   48, alignment  3/ 0:	131.422	24.625	38.5312
Length   80, alignment  0/32:	202.234	33.7969	53.0312
Length   80, alignment 32/ 0:	44.2812	29.1875	45.7344
Length   80, alignment  0/ 5:	188.516	41.5	75.6875
Length   80, alignment  5/ 0:	207.469	29.2344	43.3594
Length   96, alignment  0/32:	239.344	36.2344	54.8281
Length   96, alignment 32/ 0:	47.7031	31.8906	47.3906
Length   96, alignment  0/ 6:	239.938	41.0781	74.0938
Length   96, alignment  6/ 0:	246.109	33.7812	48.3594
Length  112, alignment  0/32:	277.328	33.9844	57.9688
Length  112, alignment 32/ 0:	52.5	29.7812	45.1719
Length  112, alignment  0/ 7:	277.484	48.7656	86.125
Length  112, alignment  7/ 0:	285.375	33.875	40.6094
Length  144, alignment  0/32:	338.203	35.8125	63.0625
Length  144, alignment 32/ 0:	62.4844	34.0469	47.4219
Length  144, alignment  0/ 9:	351.938	52.8906	94.9844
Length  144, alignment  9/ 0:	361.797	38.6406	52.9062
Length  160, alignment  0/32:	389.016	30.875	62.6719
Length  160, alignment 32/ 0:	58	36.4531	50.0625
Length  160, alignment  0/10:	390.047	50.9844	94.6406
Length  160, alignment 10/ 0:	385.891	43.8438	58.0312
Length  176, alignment  0/32:	414.594	36.6094	68.1719
Length  176, alignment 32/ 0:	59.3438	33.7656	48.375
Length  176, alignment  0/11:	402.469	50.9375	97.4688
Length  176, alignment 11/ 0:	421.312	37.875	55.1094
Length  192, alignment  0/32:	444.203	35.1875	64.2812
Length  192, alignment 32/ 0:	73.1875	35.7031	51.5781
Length  192, alignment  0/12:	444.078	51.1562	92.0625
Length  192, alignment 12/ 0:	452.062	43.625	54.25
Length  208, alignment  0/32:	462.188	33.3438	66.9219
Length  208, alignment 32/ 0:	87.4375	28.4219	44.3438
Length  208, alignment  0/13:	467.703	55.6406	100.188
Length  208, alignment 13/ 0:	377.969	42.1719	61.1094
Length  224, alignment  0/32:	403.797	35.0625	68.2188
Length  224, alignment 32/ 0:	92.875	33.0938	52.5156
Length  224, alignment  0/14:	431.094	53.9844	105.578
Length  224, alignment 14/ 0:	346.516	46.9375	59.75
Length  240, alignment  0/32:	417.5	31.7969	71.0781
Length  240, alignment 32/ 0:	97.4688	36.2188	40.6875
Length  240, alignment  0/15:	440.953	61.3438	119.359
Length  240, alignment 15/ 0:	357.969	46.8281	58.7656
Length  272, alignment  0/32:	443.297	38.4688	76.25
Length  272, alignment 32/ 0:	106.359	33.7969	56.1875
Length  272, alignment  0/17:	464.906	66.7344	129.891
Length  272, alignment 17/ 0:	107.203	49.3125	71.2656
Length  288, alignment  0/32:	453	42.5156	88.2188
Length  288, alignment 32/ 0:	101.031	44.4531	57.0781
Length  288, alignment  0/18:	529.5	63.8125	128.688
Length  288, alignment 18/ 0:	111.266	55.5	69.3125
Length  304, alignment  0/32:	452.438	38.9219	91.4219
Length  304, alignment 32/ 0:	104.906	41.7344	50.5
Length  304, alignment  0/19:	530.141	70.0781	137.453
Length  304, alignment 19/ 0:	116.5	52.9844	69.4844
Length  320, alignment  0/32:	460.344	42.5625	91.4375
Length  320, alignment 32/ 0:	111.203	45.7969	51.8594
Length  320, alignment  0/20:	508.203	72.1406	136.516
Length  320, alignment 20/ 0:	119.953	54.6094	72.7969
Length  336, alignment  0/32:	479.609	39.6406	89.7031
Length  336, alignment 32/ 0:	113.156	47.75	49.6562
Length  336, alignment  0/21:	518.594	75.0938	146.578
Length  336, alignment 21/ 0:	120.719	66.9844	67.3281
Length  352, alignment  0/32:	473.234	43.4688	92.8281
Length  352, alignment 32/ 0:	123.406	45.5469	50.7031
Length  352, alignment  0/22:	520.656	74.0625	145.016
Length  352, alignment 22/ 0:	124.703	72.1562	77.2031
Length  368, alignment  0/32:	511.125	40.5156	95.6094
Length  368, alignment 32/ 0:	123.75	42.7969	50.2969
Length  368, alignment  0/23:	544.328	79.8281	154.453
Length  368, alignment 23/ 0:	130.094	66.5469	80.1719
Length  384, alignment  0/32:	527.5	48.1094	98.1719
Length  384, alignment 32/ 0:	134.188	43.75	59.6562
Length  384, alignment  0/24:	543.938	81.7656	98.5625
Length  384, alignment 24/ 0:	133.141	74.9062	95.8594
Length  400, alignment  0/32:	486.562	45.7656	100.844
Length  400, alignment 32/ 0:	136	48.1094	52.6719
Length  400, alignment  0/25:	526.5	91.1094	167.75
Length  400, alignment 25/ 0:	143.219	79.6719	81.7031
Length  416, alignment  0/32:	498.094	48.1562	103.016
Length  416, alignment 32/ 0:	138.25	50.5	56.1719
Length  416, alignment  0/26:	566.047	90.1719	166.188
Length  416, alignment 26/ 0:	144.141	76.6406	95.0625
Length  432, alignment  0/32:	539.188	45.4375	105.141
Length  432, alignment 32/ 0:	148.359	42.4375	50.375
Length  432, alignment  0/27:	564.391	93.8906	172.266
Length  432, alignment 27/ 0:	140.391	84.3906	86.0781
Length  448, alignment  0/32:	513.234	49.9531	108.547
Length  448, alignment 32/ 0:	152.328	46.4062	67.9844
Length  448, alignment  0/28:	546.734	96.7344	175.203
Length  448, alignment 28/ 0:	140.031	84.3438	90.7812
Length  464, alignment  0/32:	520.516	48.2969	111.781
Length  464, alignment 32/ 0:	151.703	50.8594	54.5625
Length  464, alignment  0/29:	573.438	100.531	181.203
Length  464, alignment 29/ 0:	156.859	90.3906	95.8906
Length  480, alignment  0/32:	524.656	49.7031	113.891
Length  480, alignment 32/ 0:	161.5	47.1719	58.2812
Length  480, alignment  0/30:	579.859	98.5781	182.094
Length  480, alignment 30/ 0:	145.359	91.4375	101.75
Length  496, alignment  0/32:	529.688	48.1562	115.656
Length  496, alignment 32/ 0:	165.531	44.3125	63.8281
Length  496, alignment  0/31:	592.734	104.844	194.219
Length  496, alignment 31/ 0:	167.594	88.9219	102.844

Patch

diff --git a/string/bcopy.c b/string/bcopy.c
index 7c1225c..f497b5d 100644
--- a/string/bcopy.c
+++ b/string/bcopy.c
@@ -25,4 +25,4 @@ 
 #define	a2		dest
 #define	a2const
 
-#include <memmove.c>
+#include <string/memmove.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 05744e9..a9d35b1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -17,7 +17,8 @@  sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
 		   strspn-power7 strspn-ppc64 strcspn-power7 strcspn-ppc64 \
 		   strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \
-		   stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64
+		   stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
+		   memmove-power7 memmove-ppc64 bcopy-ppc64
 
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy-ppc64.c
new file mode 100644
index 0000000..f8d5aa5
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy-ppc64.c
@@ -0,0 +1,25 @@ 
+/* PowerPC64 default bcopy.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+
+extern __typeof (bcopy) __bcopy_ppc attribute_hidden;
+
+#define bcopy __bcopy_ppc
+
+#include <string/bcopy.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
new file mode 100644
index 0000000..0688907
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
@@ -0,0 +1,29 @@ 
+/* PowerPC64 multiarch bcopy.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include "init-arch.h"
+
+extern __typeof (bcopy) __bcopy_ppc attribute_hidden;
+/* The __bcopy_power7 symbol is implemented in memmove-power7.S.  */
+extern __typeof (bcopy) __bcopy_power7 attribute_hidden;
+
+libc_ifunc (bcopy,
+            (hwcap & PPC_FEATURE_HAS_VSX)
+            ? __bcopy_power7
+            : __bcopy_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index b3933a5..9aae96e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -61,6 +61,12 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcpy_power4)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ppc))
 
+  /* Support sysdeps/powerpc/powerpc64/multiarch/memmove.c.  */
+  IFUNC_IMPL (i, name, memmove,
+	      IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
+			      __memmove_power7)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc))
+
   /* Support sysdeps/powerpc/powerpc64/multiarch/memset.c.  */
   IFUNC_IMPL (i, name, memset,
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
@@ -136,6 +142,12 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __bzero_power4)
 	      IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ppc))
 
+  /* Support sysdeps/powerpc/powerpc64/multiarch/memmove.c.  */
+  IFUNC_IMPL (i, name, bcopy,
+	      IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX,
+			      __bcopy_power7)
+	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ppc))
+
   /* Support sysdeps/powerpc/powerpc64/multiarch/mempcpy.c.  */
   IFUNC_IMPL (i, name, mempcpy,
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
new file mode 100644
index 0000000..667e7bc
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
@@ -0,0 +1,43 @@ 
+/* Optimized memmove implementation for PowerPC64/POWER7.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__memmove_power7)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__memmove_power7):					\
+  cfi_startproc;						\
+  LOCALENTRY(__memmove_power7)
+
+#undef END_GEN_TB
+#define END_GEN_TB(name, mask)					\
+  cfi_endproc;							\
+  TRACEBACK_MASK(__memmove_power7,mask)				\
+  END_2(__memmove_power7)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#undef bcopy
+#define bcopy __bcopy_power7
+
+#include <sysdeps/powerpc/powerpc64/power7/memmove.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/memmove-ppc64.c
new file mode 100644
index 0000000..6039c90
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-ppc64.c
@@ -0,0 +1,29 @@ 
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+
+#define MEMMOVE __memmove_ppc
+#if !defined(NOT_IN_libc) && defined(SHARED)
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+#endif
+
+extern __typeof (memmove) __memmove_ppc attribute_hidden;
+
+#define MEMCPY_OK_FOR_FWD_MEMMOVE 1
+#include <string/memmove.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
new file mode 100644
index 0000000..9a1ce8f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
@@ -0,0 +1,45 @@ 
+/* Multiple versions of memmove. PowerPC64 version.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  In static binaries we need memmove before the initialization
+   happened.  */
+#if defined SHARED && !defined NOT_IN_libc
+/* Redefine memmove so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memmove
+# define memmove __redirect_memmove
+# include <string.h>
+# include "init-arch.h"
+
+extern __typeof (__redirect_memmove) __libc_memmove;
+
+extern __typeof (__redirect_memmove) __memmove_ppc attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_power7 attribute_hidden;
+
+libc_ifunc (__libc_memmove,
+            (hwcap & PPC_FEATURE_HAS_VSX)
+            ? __memmove_power7
+            : __memmove_ppc);
+
+#undef memmove
+strong_alias (__libc_memmove, memmove);
+libc_hidden_ver (__libc_memmove, memmove);
+#else
+# include <string/memmove.c>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/bcopy.c b/sysdeps/powerpc/powerpc64/power7/bcopy.c
new file mode 100644
index 0000000..4a6a400
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/bcopy.c
@@ -0,0 +1 @@ 
+/* Implemented in memmove.S.  */
diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
new file mode 100644
index 0000000..b18d7fb
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
@@ -0,0 +1,831 @@ 
+/* Optimized memcpy implementation for PowerPC64/POWER7.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+
+/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
+
+   This optimization checks whether 'dest' overlaps with 'src'.  If it does
+   not, then an optimized forward memcpy is used (similar to memcpy for
+   POWER7, embedded here to gain some cycles).
+   If source and destination overlap, an optimized backwards memcpy is used
+   instead.  */
+
+	.machine power7
+EALIGN (memmove, 5, 0)
+	CALL_MCOUNT 3
+
+L(_memmove):
+	subf    r9,r4,r3
+	cmpld   cr7,r9,r5
+	blt	cr7,L(memmove_bwd)
+
+	cmpldi	cr1,r5,31
+	neg	0,3
+	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
+				       code.  */
+
+	andi.	10,3,15
+	clrldi	11,4,60
+	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
+
+	mr	r11,3
+	bne	cr6,L(copy_GE_32_unaligned)
+	beq	L(aligned_copy)
+
+	mtocrf	0x01,0
+	clrldi	0,0,60
+
+/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
+1:
+	bf	31,2f
+	lbz	6,0(r4)
+	addi	r4,r4,1
+	stb	6,0(r11)
+	addi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	6,0(r4)
+	addi	r4,r4,2
+	sth	6,0(r11)
+	addi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	6,0(r4)
+	addi	r4,r4,4
+	stw	6,0(r11)
+	addi	r11,r11,4
+8:
+	bf	28,16f
+	ld	6,0(r4)
+	addi	r4,r4,8
+	std	6,0(r11)
+	addi	r11,r11,8
+16:
+	subf	r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+	li	6,16
+	li	7,32
+	li	8,48
+	mtocrf	0x02,r5
+	srdi	12,r5,7
+	cmpdi	12,0
+	beq	L(aligned_tail)
+	lxvd2x	6,0,r4
+	lxvd2x	7,r4,6
+	mtctr	12
+	b	L(aligned_128loop)
+
+	.align  4
+L(aligned_128head):
+	/* for the 2nd + iteration of this loop. */
+	lxvd2x	6,0,r4
+	lxvd2x	7,r4,6
+L(aligned_128loop):
+	lxvd2x	8,r4,7
+	lxvd2x	9,r4,8
+	stxvd2x	6,0,r11
+	addi	r4,r4,64
+	stxvd2x	7,r11,6
+	stxvd2x	8,r11,7
+	stxvd2x	9,r11,8
+	lxvd2x	6,0,r4
+	lxvd2x	7,r4,6
+	addi	r11,r11,64
+	lxvd2x	8,r4,7
+	lxvd2x	9,r4,8
+	addi	r4,r4,64
+	stxvd2x	6,0,r11
+	stxvd2x	7,r11,6
+	stxvd2x	8,r11,7
+	stxvd2x	9,r11,8
+	addi	r11,r11,64
+	bdnz	L(aligned_128head)
+
+L(aligned_tail):
+	mtocrf	0x01,r5
+	bf	25,32f
+	lxvd2x	6,0,r4
+	lxvd2x	7,r4,6
+	lxvd2x	8,r4,7
+	lxvd2x	9,r4,8
+	addi	r4,r4,64
+	stxvd2x	6,0,r11
+	stxvd2x	7,r11,6
+	stxvd2x	8,r11,7
+	stxvd2x	9,r11,8
+	addi	r11,r11,64
+32:
+	bf	26,16f
+	lxvd2x	6,0,r4
+	lxvd2x	7,r4,6
+	addi	r4,r4,32
+	stxvd2x	6,0,r11
+	stxvd2x	7,r11,6
+	addi	r11,r11,32
+16:
+	bf	27,8f
+	lxvd2x	6,0,r4
+	addi	r4,r4,16
+	stxvd2x	6,0,r11
+	addi	r11,r11,16
+8:
+	bf	28,4f
+	ld	6,0(r4)
+	addi	r4,r4,8
+	std     6,0(r11)
+	addi	r11,r11,8
+4:	/* Copies 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(r4)
+	stw     6,0(r11)
+	bf      30,L(tail5)
+	lhz     7,4(r4)
+	sth     7,4(r11)
+	bflr	31
+	lbz     8,6(r4)
+	stb     8,6(r11)
+	/* Return original DST pointer.  */
+	blr
+
+/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32):
+	mr	r11,3
+	cmpldi	cr6,r5,8
+	mtocrf	0x01,r5
+	ble	cr6,L(copy_LE_8)
+
+	/* At least 9 bytes to go.  */
+	neg	8,4
+	andi.	0,8,3
+	cmpldi	cr1,r5,16
+	beq	L(copy_LT_32_aligned)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,0
+	subf	r5,0,r5
+2:
+	bf	30,1f
+	lhz	6,0(r4)
+	addi	r4,r4,2
+	sth	6,0(r11)
+	addi	r11,r11,2
+1:
+	bf	31,L(end_4bytes_alignment)
+	lbz	6,0(r4)
+	addi	r4,r4,1
+	stb	6,0(r11)
+	addi	r11,r11,1
+
+	.align	4
+L(end_4bytes_alignment):
+	cmpldi	cr1,r5,16
+	mtocrf	0x01,r5
+
+L(copy_LT_32_aligned):
+	/* At least 6 bytes to go, and SRC is word-aligned.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	lwz	6,0(r4)
+	lwz	7,4(r4)
+	stw	6,0(r11)
+	lwz	8,8(r4)
+	stw	7,4(r11)
+	lwz	6,12(r4)
+	addi	r4,r4,16
+	stw	8,8(r11)
+	stw	6,12(r11)
+	addi	r11,r11,16
+8:	/* Copy 8 bytes.  */
+	bf	28,L(tail4)
+	lwz	6,0(r4)
+	lwz	7,4(r4)
+	addi	r4,r4,8
+	stw	6,0(r11)
+	stw	7,4(r11)
+	addi	r11,r11,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	lwz	6,0(r4)
+	stw	6,0(r11)
+	bf	30,L(tail5)
+	lhz	7,4(r4)
+	sth	7,4(r11)
+	bflr	31
+	lbz	8,6(r4)
+	stb	8,6(r11)
+	/* Return original DST pointer.  */
+	blr
+
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2):
+	bf	30,1f
+	lhz	6,0(r4)
+	sth	6,0(r11)
+	bflr	31
+	lbz	7,2(r4)
+	stb	7,2(r11)
+	blr
+
+	.align	4
+L(tail5):
+	bflr	31
+	lbz	6,4(r4)
+	stb	6,4(r11)
+	blr
+
+	.align	4
+1:
+	bflr	31
+	lbz	6,0(r4)
+	stb	6,0(r11)
+	/* Return original DST pointer.  */
+	blr
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8):
+	bne	cr6,L(tail4)
+
+	/* Though we could've used ld/std here, they are still
+	slow for unaligned cases.  */
+
+	lwz	6,0(r4)
+	lwz	7,4(r4)
+	stw	6,0(r11)
+	stw	7,4(r11)
+	blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+	.align	4
+L(copy_GE_32_unaligned):
+	clrldi	0,0,60	      /* Number of bytes until the 1st r11 quadword.  */
+	srdi	9,r5,4	      /* Number of full quadwords remaining.  */
+
+	beq	L(copy_GE_32_unaligned_cont)
+
+	/* DST is not quadword aligned, get it aligned.  */
+
+	mtocrf	0x01,0
+	subf	r5,0,r5
+
+	/* Vector instructions work best when proper alignment (16-bytes)
+	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:
+	bf	31,2f
+	lbz	6,0(r4)
+	addi	r4,r4,1
+	stb	6,0(r11)
+	addi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	6,0(r4)
+	addi	r4,r4,2
+	sth	6,0(r11)
+	addi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	6,0(r4)
+	addi	r4,r4,4
+	stw	6,0(r11)
+	addi	r11,r11,4
+8:
+	bf	28,0f
+	ld	6,0(r4)
+	addi	r4,r4,8
+	std	6,0(r11)
+	addi	r11,r11,8
+0:
+	srdi	9,r5,4	      /* Number of full quadwords remaining.  */
+
+	/* The proper alignment is present, it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont):
+
+	/* Setup two indexes to speed up the indexed vector operations.  */
+	clrldi	10,r5,60
+	li	6,16	      /* Index for 16-bytes offsets.  */
+	li	7,32	      /* Index for 32-bytes offsets.  */
+	cmpldi	cr1,10,0
+	srdi	8,r5,5	      /* Setup the loop counter.  */
+	mtocrf	0x01,9
+	cmpldi	cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr	5,0,r4
+#else
+	lvsl	5,0,r4
+#endif
+	lvx	3,0,r4
+	li	0,0
+	bf	31,L(setup_unaligned_loop)
+
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+	lvx	4,r4,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	addi	r4,r4,16
+	stvx	6,0,r11
+	addi	r11,r11,16
+	vor	3,4,4
+	clrrdi	0,r4,60
+
+L(setup_unaligned_loop):
+	mtctr	8
+	ble	cr6,L(end_unaligned_loop)
+
+	/* Copy 32 bytes at a time using vector instructions.  */
+	.align	4
+L(unaligned_loop):
+
+	/* Note: vr6/vr10 may contain data that was already copied,
+	but in order to get proper alignment, we may have to copy
+	some portions again. This is faster than having unaligned
+	vector instructions though.  */
+
+	lvx	4,r4,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	lvx	3,r4,7
+#ifdef __LITTLE_ENDIAN__
+	vperm	10,3,4,5
+#else
+	vperm	10,4,3,5
+#endif
+	addi	r4,r4,32
+	stvx	6,0,r11
+	stvx	10,r11,6
+	addi	r11,r11,32
+	bdnz	L(unaligned_loop)
+
+	clrrdi	0,r4,60
+
+	.align	4
+L(end_unaligned_loop):
+
+	/* Check for tail bytes.  */
+	mtocrf	0x01,r5
+	beqlr	cr1
+
+	add	r4,r4,0
+
+	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+	/* Copy 8 bytes.  */
+	bf	28,4f
+	lwz	6,0(r4)
+	lwz	7,4(r4)
+	addi	r4,r4,8
+	stw	6,0(r11)
+	stw	7,4(r11)
+	addi	r11,r11,8
+4:	/* Copy 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(r4)
+	stw	6,0(r11)
+	bf	30,L(tail5)
+	lhz	7,4(r4)
+	sth	7,4(r11)
+	bflr	31
+	lbz	8,6(r4)
+	stb	8,6(r11)
+	/* Return original DST pointer.  */
+	blr
+
+	/* Start of the backward memcpy implementation: the algorithm first
+	   checks whether src and dest have the same alignment; if they do,
+	   it aligns both to 16 bytes and copies using VSX instructions.
+	   If they do not, it aligns dest to 16 bytes and uses VMX (altivec)
+	   instructions to read two 16-byte chunks at a time, shift/permute
+	   the bytes read, and write them aligned to dest.  */
+L(memmove_bwd):
+	cmpldi	cr1,r5,31
+	/* Copy is done backwards: update the pointers and check alignment.  */
+	add	r11,r3,r5
+	add	r4,r4,r5
+	mr	r0,r11
+	ble	cr1, L(copy_LT_32_bwd)  /* If move < 32 bytes use short move
+				           code.  */
+
+	andi.	r10,r11,15	    /* Check if r11 is aligned to 16 bytes  */
+	clrldi	r9,r4,60	    /* Check if r4 is aligned to 16 bytes  */
+	cmpld	cr6,r10,r9	    /* SRC and DST alignments match?  */
+	
+	bne     cr6,L(copy_GE_32_unaligned_bwd)
+	beq     L(aligned_copy_bwd)
+
+	mtocrf	0x01,r0
+	clrldi	r0,r0,60
+
+/* Get the DST and SRC aligned to 16 bytes.  */
+1:
+	bf	31,2f
+	lbz	r6,-1(r4)
+	subi	r4,r4,1
+	stb	r6,-1(r11)
+	subi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	r6,-2(r4)
+	subi	r4,r4,2
+	sth	r6,-2(r11)
+	subi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	r6,-4(r4)
+	subi	r4,r4,4
+	stw	r6,-4(r11)
+	subi	r11,r11,4
+8:
+	bf	28,16f
+	ld	r6,-8(r4)
+	subi	r4,r4,8
+	std	r6,-8(r11)
+	subi	r11,r11,8
+16:
+	subf	r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy_bwd):
+	li	r6,-16
+	li	r7,-32
+	li	r8,-48
+	li	r9,-64
+	mtocrf	0x02,r5
+	srdi	r12,r5,7
+	cmpdi	r12,0
+	beq	L(aligned_tail_bwd)
+	lxvd2x	v6,r4,r6
+	lxvd2x	v7,r4,r7
+	mtctr	12
+	b	L(aligned_128loop_bwd)
+
+	.align  4
+L(aligned_128head_bwd):
+	/* for the 2nd + iteration of this loop. */
+	lxvd2x	v6,r4,r6
+	lxvd2x	v7,r4,r7
+L(aligned_128loop_bwd):
+	lxvd2x	v8,r4,r8
+	lxvd2x	v9,r4,r9
+	stxvd2x	v6,r11,r6
+	subi	r4,r4,64
+	stxvd2x	v7,r11,r7
+	stxvd2x	v8,r11,r8
+	stxvd2x	v9,r11,r9
+	lxvd2x	v6,r4,r6
+	lxvd2x	v7,r4,7
+	subi	r11,r11,64
+	lxvd2x	v8,r4,r8
+	lxvd2x	v9,r4,r9
+	subi	r4,r4,64
+	stxvd2x	v6,r11,r6
+	stxvd2x	v7,r11,r7
+	stxvd2x	v8,r11,r8
+	stxvd2x	v9,r11,r9
+	subi	r11,r11,64
+	bdnz	L(aligned_128head_bwd)
+
+L(aligned_tail_bwd):
+	mtocrf	0x01,r5
+	bf	25,32f
+	lxvd2x	v6,r4,r6
+	lxvd2x	v7,r4,r7
+	lxvd2x	v8,r4,r8
+	lxvd2x	v9,r4,r9
+	subi	r4,r4,64
+	stxvd2x	v6,r11,r6
+	stxvd2x	v7,r11,r7
+	stxvd2x	v8,r11,r8
+	stxvd2x	v9,r11,r9
+	subi	r11,r11,64
+32:
+	bf	26,16f
+	lxvd2x	v6,r4,r6
+	lxvd2x	v7,r4,r7
+	subi	r4,r4,32
+	stxvd2x	v6,r11,r6
+	stxvd2x	v7,r11,r7
+	subi	r11,r11,32
+16:
+	bf	27,8f
+	lxvd2x	v6,r4,r6
+	subi	r4,r4,16
+	stxvd2x	v6,r11,r6
+	subi	r11,r11,16
+8:
+	bf	28,4f
+	ld	r6,-8(r4)
+	subi	r4,r4,8
+	std     r6,-8(r11)
+	subi	r11,r11,8
+4:	/* Copies 4~7 bytes.  */
+	bf	29,L(tail2_bwd)
+	lwz	r6,-4(r4)
+	stw     r6,-4(r11)
+	bf      30,L(tail5_bwd)
+	lhz     r7,-6(r4)
+	sth     r7,-6(r11)
+	bflr	31
+	lbz     r8,-7(r4)
+	stb     r8,-7(r11)
+	/* Return original DST pointer.  */
+	blr
+
+/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32_bwd):
+	cmpldi	cr6,r5,8
+	mtocrf	0x01,r5
+	ble	cr6,L(copy_LE_8_bwd)
+
+	/* At least 9 bytes to go.  */
+	neg	r8,r4
+	andi.	r0,r8,3
+	cmpldi	cr1,r5,16
+	beq	L(copy_LT_32_aligned_bwd)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,0
+	subf	r5,0,r5
+2:
+	bf	30,1f
+	lhz	r6,-2(r4)
+	subi	r4,r4,2
+	sth	r6,-2(r11)
+	subi	r11,r11,2
+1:
+	bf	31,L(end_4bytes_alignment_bwd)
+	lbz	6,-1(r4)
+	subi	r4,r4,1
+	stb	6,-1(r11)
+	subi	r11,r11,1
+
+	.align	4
+L(end_4bytes_alignment_bwd):
+	cmpldi	cr1,r5,16
+	mtocrf	0x01,r5
+
+L(copy_LT_32_aligned_bwd):
+	/* At least 6 bytes to go, and SRC is word-aligned.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	lwz	r6,-4(r4)
+	lwz	r7,-8(r4)
+	stw	r6,-4(r11)
+	lwz	r8,-12(r4)
+	stw	r7,-8(r11)
+	lwz	r6,-16(r4)
+	subi	r4,r4,16
+	stw	r8,-12(r11)
+	stw	r6,-16(r11)
+	subi	r11,r11,16
+8:	/* Copy 8 bytes.  */
+	bf	28,L(tail4_bwd)
+	lwz	r6,-4(r4)
+	lwz	r7,-8(r4)
+	subi	r4,r4,8
+	stw	r6,-4(r11)
+	stw	r7,-8(r11)
+	subi	r11,r11,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4_bwd):
+	bf	29,L(tail2_bwd)
+	lwz	6,-4(r4)
+	stw	6,-4(r11)
+	bf	30,L(tail5_bwd)
+	lhz	7,-6(r4)
+	sth	7,-6(r11)
+	bflr	31
+	lbz	8,-7(r4)
+	stb	8,-7(r11)
+	/* Return original DST pointer.  */
+	blr
+
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2_bwd):
+	bf	30,1f
+	lhz	6,-2(r4)
+	sth	6,-2(r11)
+	bflr	31
+	lbz	7,-3(r4)
+	stb	7,-3(r11)
+	blr
+
+	.align	4
+L(tail5_bwd):
+	bflr	31
+	lbz	6,-5(r4)
+	stb	6,-5(r11)
+	blr
+
+	.align	4
+1:
+	bflr	31
+	lbz	6,-1(r4)
+	stb	6,-1(r11)
+	/* Return original DST pointer.  */
+	blr
+
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8_bwd):
+	bne	cr6,L(tail4_bwd)
+
+	/* Though we could've used ld/std here, they are still
+	   slow for unaligned cases.  */
+	lwz	6,-8(r4)
+	lwz	7,-4(r4)
+	stw	6,-8(r11)
+	stw	7,-4(r11)
+	blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+	.align	4
+L(copy_GE_32_unaligned_bwd):
+	andi.	r10,r11,15      /* Check alignment of DST against 16 bytes.  */
+	srdi	r9,r5,4		/* Number of full quadwords remaining.  */
+
+	beq	L(copy_GE_32_unaligned_cont_bwd)
+
+	/* DST is not quadword aligned and r10 holds the address masked to
+           compare alignments.  */
+	mtocrf	0x01,r10
+	subf	r5,r10,r5
+
+	/* Vector instructions work best when proper alignment (16-bytes)
+	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:
+	bf	31,2f
+	lbz	r6,-1(r4)
+	subi	r4,r4,1
+	stb	r6,-1(r11)
+	subi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	r6,-2(r4)
+	subi	r4,r4,2
+	sth	r6,-2(r11)
+	subi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	r6,-4(r4)
+	subi	r4,r4,4
+	stw	r6,-4(r11)
+	subi	r11,r11,4
+8:
+	bf	28,0f
+	ld	r6,-8(r4)
+	subi	r4,r4,8
+	std	r6,-8(r11)
+	subi	r11,r11,8
+0:
+	srdi	r9,r5,4	      /* Number of full quadwords remaining.  */
+
+	/* The proper alignment is present, it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont_bwd):
+
+	/* Setup two indexes to speed up the indexed vector operations.  */
+	clrldi	r10,r5,60
+	li	r6,-16	      /* Index for 16-bytes offsets.  */
+	li	r7,-32	      /* Index for 32-bytes offsets.  */
+	cmpldi	cr1,10,0
+	srdi	r8,r5,5	      /* Setup the loop counter.  */
+	mtocrf	0x01,9
+	cmpldi	cr6,r9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr	v5,r0,r4
+#else
+	lvsl	v5,r0,r4
+#endif
+	lvx	v3,0,r4
+	li	r0,0
+	bf	31,L(setup_unaligned_loop_bwd)
+
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+	lvx	v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+	vperm	v6,v3,v4,v5
+#else
+	vperm	v6,v4,v3,v5
+#endif
+	subi	r4,r4,16
+	stvx	v6,r11,r6
+	subi	r11,r11,16
+	vor	v3,v4,v4
+	clrrdi	r0,r4,60
+
+L(setup_unaligned_loop_bwd):
+	mtctr	r8
+	ble	cr6,L(end_unaligned_loop_bwd)
+
+	/* Copy 32 bytes at a time using vector instructions.  */
+	.align	4
+L(unaligned_loop_bwd):
+
+	/* Note: vr6/vr10 may contain data that was already copied,
+	but in order to get proper alignment, we may have to copy
+	some portions again. This is faster than having unaligned
+	vector instructions though.  */
+
+	lvx	v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+	vperm	v6,v3,v4,v5
+#else
+	vperm	v6,v4,v3,v5
+#endif
+	lvx	v3,r4,r7
+#ifdef __LITTLE_ENDIAN__
+	vperm	v10,v4,v3,v5
+#else
+	vperm	v10,v3,v4,v5
+#endif
+	subi	r4,r4,32
+	stvx	v6,r11,r6
+	stvx	v10,r11,r7
+	subi	r11,r11,32
+	bdnz	L(unaligned_loop_bwd)
+
+	clrrdi	r0,r4,60
+
+	.align	4
+L(end_unaligned_loop_bwd):
+
+	/* Check for tail bytes.  */
+	mtocrf	0x01,r5
+	beqlr	cr1
+
+	add	r4,r4,0
+
+	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+	/* Copy 8 bytes.  */
+	bf	28,4f
+	lwz	r6,-4(r4)
+	lwz	r7,-8(r4)
+	subi	r4,r4,8
+	stw	r6,-4(r11)
+	stw	r7,-8(r11)
+	subi	r11,r11,8
+4:	/* Copy 4~7 bytes.  */
+	bf	29,L(tail2_bwd)
+	lwz	r6,-4(r4)
+	stw	r6,-4(r11)
+	bf	30,L(tail5_bwd)
+	lhz	r7,-6(r4)
+	sth	r7,-6(r11)
+	bflr	31
+	lbz	r8,-7(r4)
+	stb	r8,-7(r11)
+	/* Return original DST pointer.  */
+	blr
+END_GEN_TB (memmove, TB_TOCLESS)
+libc_hidden_builtin_def (memmove)
+
+
+/* void bcopy(const void *src [r3], void *dest [r4], size_t n [r5])
+   Implemented in this file to avoid the linker creating a stub function
+   call for the branch to '_memmove'.  */
+ENTRY (bcopy)
+	mr	r6,r3
+	mr	r3,r4
+	mr	r4,r6
+	b	L(_memmove)
+END (bcopy)