Message ID | 1408489897-2393-1-git-send-email-dann.frazier@canonical.com |
---|---|
State | New |
Buglink? SRU justification? etc.

On 08/19/2014 04:11 PM, dann frazier wrote:
> From: Feng Kan <fkan@apm.com>
>
> Using the glibc cortex strings work authored by Linaro as a base to
> create new copy to/from user kernel routines.
>
> Iperf performance increase:
>             -l (size)   1 core result
> Optimized   64B         44-51Mb/s
>             1500B       4.9Gb/s
>             30000B      16.2Gb/s
> Original    64B         34-50.7Mb/s
>             1500B       4.7Gb/s
>             30000B      14.5Gb/s
>
> Signed-off-by: Feng Kan <fkan@apm.com>
> (v2 submittal)
> Reference: http://www.spinics.net/lists/arm-kernel/msg353650.html
> Signed-off-by: dann frazier <dann.frazier@canonical.com>
> ---
>  arch/arm64/lib/copy_from_user.S |  36 +-----
>  arch/arm64/lib/copy_template.S  | 278 ++++++++++++++++++++++++++++++++++++++++
>  arch/arm64/lib/copy_to_user.S   |  31 +----
>  3 files changed, 284 insertions(+), 61 deletions(-)
>  create mode 100644 arch/arm64/lib/copy_template.S
>
> diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
> index 5e27add..c4c5187 100644
> --- a/arch/arm64/lib/copy_from_user.S
> +++ b/arch/arm64/lib/copy_from_user.S
> @@ -15,7 +15,6 @@
>   */
>
>  #include <linux/linkage.h>
> -#include <asm/assembler.h>
>
>  /*
>   * Copy from user space to a kernel buffer (alignment handled by the hardware)
> @@ -28,39 +27,10 @@
>   * x0 - bytes not copied
>   */
>  ENTRY(__copy_from_user)
> -	add	x4, x1, x2		// upper user buffer boundary
> -	subs	x2, x2, #8
> -	b.mi	2f
> -1:
> -USER(9f, ldr	x3, [x1], #8	)
> -	subs	x2, x2, #8
> -	str	x3, [x0], #8
> -	b.pl	1b
> -2:	adds	x2, x2, #4
> -	b.mi	3f
> -USER(9f, ldr	w3, [x1], #4	)
> -	sub	x2, x2, #4
> -	str	w3, [x0], #4
> -3:	adds	x2, x2, #2
> -	b.mi	4f
> -USER(9f, ldrh	w3, [x1], #2	)
> -	sub	x2, x2, #2
> -	strh	w3, [x0], #2
> -4:	adds	x2, x2, #1
> -	b.mi	5f
> -USER(9f, ldrb	w3, [x1]	)
> -	strb	w3, [x0]
> -5:	mov	x0, #0
> -	ret
> +#include "copy_template.S"
>  ENDPROC(__copy_from_user)
>
>  	.section .fixup,"ax"
> -	.align	2
> -9:	sub	x2, x4, x1
> -	mov	x3, x2
> -10:	strb	wzr, [x0], #1		// zero remaining buffer space
> -	subs	x3, x3, #1
> -	b.ne	10b
> -	mov	x0, x2			// bytes not copied
> -	ret
> +	.align	2
> +	copy_abort_table
>  .previous
> diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
> new file mode 100644
> index 0000000..f2c7003
> --- /dev/null
> +++ b/arch/arm64/lib/copy_template.S
> @@ -0,0 +1,278 @@
> +/*
> + * Copyright (c) 2013, Applied Micro Circuits Corporation
> + * Copyright (c) 2012-2013, Linaro Limited
> + *
> + * Author: Feng Kan <fkan@apm.com>
> + * Author: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
> + *
> + * The code is adapted from the memcpy routine by Linaro Limited.
> + *
> + * This file is free software: you may copy, redistribute and/or modify it
> + * under the terms of the GNU General Public License as published by the
> + * Free Software Foundation, either version 2 of the License, or (at your
> + * option) any later version.
> + *
> + * This file is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + *
> + * This file incorporates work covered by the following copyright and
> + * permission notice:
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + * 3. Neither the name of Linaro nor the names of its contributors may
> + *    be used to endorse or promote products derived from this software
> + *    without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +#include <asm/assembler.h>
> +
> +dstin	.req	x0
> +src	.req	x1
> +count	.req	x2
> +tmp1	.req	x3
> +tmp1w	.req	w3
> +tmp2	.req	x4
> +tmp2w	.req	w4
> +tmp3	.req	x5
> +tmp3w	.req	w5
> +dst	.req	x6
> +
> +A_l	.req	x7
> +A_h	.req	x8
> +B_l	.req	x9
> +B_h	.req	x10
> +C_l	.req	x11
> +C_h	.req	x12
> +D_l	.req	x13
> +D_h	.req	x14
> +
> +	mov	dst, dstin
> +	cmp	count, #64
> +	b.ge	.Lcpy_not_short
> +	cmp	count, #15
> +	b.le	.Ltail15tiny
> +
> +	/*
> +	 * Deal with small copies quickly by dropping straight into the
> +	 * exit block.
> +	 */
> +.Ltail63:
> +	/*
> +	 * Copy up to 48 bytes of data. At this point we only need the
> +	 * bottom 6 bits of count to be accurate.
> +	 */
> +	ands	tmp1, count, #0x30
> +	b.eq	.Ltail15
> +	add	dst, dst, tmp1
> +	add	src, src, tmp1
> +	cmp	tmp1w, #0x20
> +	b.eq	1f
> +	b.lt	2f
> +	USER(8f, ldp A_l, A_h, [src, #-48])
> +	USER(8f, stp A_l, A_h, [dst, #-48])
> +1:
> +	USER(8f, ldp A_l, A_h, [src, #-32])
> +	USER(8f, stp A_l, A_h, [dst, #-32])
> +2:
> +	USER(8f, ldp A_l, A_h, [src, #-16])
> +	USER(8f, stp A_l, A_h, [dst, #-16])
> +
> +.Ltail15:
> +	ands	count, count, #15
> +	b.eq	1f
> +	add	src, src, count
> +	USER(9f, ldp A_l, A_h, [src, #-16])
> +	add	dst, dst, count
> +	USER(9f, stp A_l, A_h, [dst, #-16])
> +1:
> +	b	.Lsuccess
> +
> +.Ltail15tiny:
> +	/*
> +	 * Copy up to 15 bytes of data. Does not assume additional data
> +	 * is being copied.
> +	 */
> +	tbz	count, #3, 1f
> +	USER(10f, ldr tmp1, [src], #8)
> +	USER(10f, str tmp1, [dst], #8)
> +1:
> +	tbz	count, #2, 1f
> +	USER(10f, ldr tmp1w, [src], #4)
> +	USER(10f, str tmp1w, [dst], #4)
> +1:
> +	tbz	count, #1, 1f
> +	USER(10f, ldrh tmp1w, [src], #2)
> +	USER(10f, strh tmp1w, [dst], #2)
> +1:
> +	tbz	count, #0, 1f
> +	USER(10f, ldrb tmp1w, [src])
> +	USER(10f, strb tmp1w, [dst])
> +1:
> +	b	.Lsuccess
> +
> +.Lcpy_not_short:
> +	/*
> +	 * We don't much care about the alignment of DST, but we want SRC
> +	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
> +	 * boundaries on both loads and stores.
> +	 */
> +	neg	tmp2, src
> +	ands	tmp2, tmp2, #15		/* Bytes to reach alignment. */
> +	b.eq	2f
> +	sub	count, count, tmp2
> +	/*
> +	 * Copy more data than needed; it's faster than jumping
> +	 * around copying sub-quadword quantities. We know that
> +	 * it can't overrun.
> +	 */
> +	USER(11f, ldp A_l, A_h, [src])
> +	add	src, src, tmp2
> +	USER(11f, stp A_l, A_h, [dst])
> +	add	dst, dst, tmp2
> +	/* There may be less than 63 bytes to go now. */
> +	cmp	count, #63
> +	b.le	.Ltail63
> +2:
> +	subs	count, count, #128
> +	b.ge	.Lcpy_body_large
> +	/*
> +	 * Less than 128 bytes to copy, so handle 64 here and then jump
> +	 * to the tail.
> +	 */
> +	USER(12f, ldp A_l, A_h, [src])
> +	USER(12f, ldp B_l, B_h, [src, #16])
> +	USER(12f, ldp C_l, C_h, [src, #32])
> +	USER(12f, ldp D_l, D_h, [src, #48])
> +	USER(12f, stp A_l, A_h, [dst])
> +	USER(12f, stp B_l, B_h, [dst, #16])
> +	USER(12f, stp C_l, C_h, [dst, #32])
> +	USER(12f, stp D_l, D_h, [dst, #48])
> +	tst	count, #0x3f
> +	add	src, src, #64
> +	add	dst, dst, #64
> +	b.ne	.Ltail63
> +	b	.Lsuccess
> +
> +	/*
> +	 * Critical loop. Start at a new cache line boundary. Assuming
> +	 * 64 bytes per line this ensures the entire loop is in one line.
> +	 */
> +	.p2align 6
> +.Lcpy_body_large:
> +	/* There are at least 128 bytes to copy. */
> +	USER(12f, ldp A_l, A_h, [src, #0])
> +	sub	dst, dst, #16			/* Pre-bias. */
> +	USER(13f, ldp B_l, B_h, [src, #16])
> +	USER(13f, ldp C_l, C_h, [src, #32])
> +	USER(13f, ldp D_l, D_h, [src, #48]!)	/* src += 64 - pre-bias. */
> +1:
> +	USER(13f, stp A_l, A_h, [dst, #16])
> +	USER(13f, ldp A_l, A_h, [src, #16])
> +	USER(13f, stp B_l, B_h, [dst, #32])
> +	USER(13f, ldp B_l, B_h, [src, #32])
> +	USER(13f, stp C_l, C_h, [dst, #48])
> +	USER(13f, ldp C_l, C_h, [src, #48])
> +	USER(13f, stp D_l, D_h, [dst, #64]!)
> +	USER(13f, ldp D_l, D_h, [src, #64]!)
> +	subs	count, count, #64
> +	b.ge	1b
> +	USER(14f, stp A_l, A_h, [dst, #16])
> +	USER(14f, stp B_l, B_h, [dst, #32])
> +	USER(14f, stp C_l, C_h, [dst, #48])
> +	USER(14f, stp D_l, D_h, [dst, #64])
> +	add	src, src, #16
> +	add	dst, dst, #64 + 16
> +	tst	count, #0x3f
> +	b.ne	.Ltail63
> +.Lsuccess:
> +	/* Nothing left to copy */
> +	mov	x0, #0
> +	ret
> +
> +	.macro	copy_abort_table
> +8:
> +	/*
> +	 * count bytes remain
> +	 * dst points to (dst + tmp1)
> +	 */
> +	mov	x0, count
> +	sub	dst, dst, tmp1
> +	b	.Lfinalize
> +9:
> +	/*
> +	 * 16 bytes remain
> +	 * dst is accurate
> +	 */
> +	mov	x0, #16
> +	b	.Lfinalize
> +10:
> +	/*
> +	 * count is accurate
> +	 * dst is accurate
> +	 */
> +	mov	x0, count
> +	b	.Lfinalize
> +11:
> +	/*
> +	 * (count + tmp2) bytes remain
> +	 * dst points to the start of the remaining bytes
> +	 */
> +	add	x0, count, tmp2
> +	b	.Lfinalize
> +12:
> +	/*
> +	 * (count + 128) bytes remain
> +	 * dst is accurate
> +	 */
> +	add	x0, count, #128
> +	b	.Lfinalize
> +13:
> +	/*
> +	 * (count + 128) bytes remain
> +	 * dst is pre-biased to (dst + 16)
> +	 */
> +	add	x0, count, #128
> +	sub	dst, dst, #16
> +	b	.Lfinalize
> +14:
> +	/*
> +	 * count is accurate
> +	 * dst is pre-biased to (dst + 16)
> +	 */
> +	mov	x0, count
> +	sub	dst, dst, #16
> +	/* fall through */
> +.Lfinalize:
> +	/* Zero the remaining destination buffer. */
> +	mov	count, x0
> +20:
> +	strb	wzr, [dst], #1
> +	subs	count, count, #1
> +	b.ne	20b
> +	ret
> +	.endm
> diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
> index a0aeeb9..08787b0 100644
> --- a/arch/arm64/lib/copy_to_user.S
> +++ b/arch/arm64/lib/copy_to_user.S
> @@ -15,7 +15,6 @@
>   */
>
>  #include <linux/linkage.h>
> -#include <asm/assembler.h>
>
>  /*
>   * Copy to user space from a kernel buffer (alignment handled by the hardware)
> @@ -28,34 +27,10 @@
>   * x0 - bytes not copied
>   */
>  ENTRY(__copy_to_user)
> -	add	x4, x0, x2		// upper user buffer boundary
> -	subs	x2, x2, #8
> -	b.mi	2f
> -1:
> -	ldr	x3, [x1], #8
> -	subs	x2, x2, #8
> -USER(9f, str	x3, [x0], #8	)
> -	b.pl	1b
> -2:	adds	x2, x2, #4
> -	b.mi	3f
> -	ldr	w3, [x1], #4
> -	sub	x2, x2, #4
> -USER(9f, str	w3, [x0], #4	)
> -3:	adds	x2, x2, #2
> -	b.mi	4f
> -	ldrh	w3, [x1], #2
> -	sub	x2, x2, #2
> -USER(9f, strh	w3, [x0], #2	)
> -4:	adds	x2, x2, #1
> -	b.mi	5f
> -	ldrb	w3, [x1]
> -USER(9f, strb	w3, [x0]	)
> -5:	mov	x0, #0
> -	ret
> +#include "copy_template.S"
>  ENDPROC(__copy_to_user)
>
>  	.section .fixup,"ax"
> -	.align	2
> -9:	sub	x0, x4, x0		// bytes not copied
> -	ret
> +	.align	2
> +	copy_abort_table
>  .previous
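A note on the semantics the copy_abort_table has to preserve: on a fault,
__copy_from_user must return the number of bytes not copied, and must zero
the uncopied tail of the kernel buffer so stale kernel memory is never
exposed. A minimal C sketch of that contract (illustrative only, not the
kernel implementation; the fault point is simulated by a parameter, where
the real code learns it from the exception-table fixup):

    #include <stddef.h>
    #include <string.h>

    /*
     * Sketch of the __copy_from_user contract. "faulted_after"
     * simulates how many bytes were copied before a user access
     * faulted; n means no fault occurred.
     */
    static size_t copy_from_user_sketch(void *to, const void *from,
                                        size_t n, size_t faulted_after)
    {
            size_t copied = faulted_after < n ? faulted_after : n;
            size_t not_copied = n - copied;

            memcpy(to, from, copied);
            /* Zero remaining buffer space, as the .Lfinalize loop does. */
            memset((char *)to + copied, 0, not_copied);
            return not_copied;      /* returned in x0 */
    }

For __copy_to_user only the return value matters, since the destination is
user memory and there is nothing to zero; this patch applies the same zeroing
loop to both, which is harmless but not strictly needed on the to_user path.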
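The alignment preamble in .Lcpy_not_short relies on the identity that
(-src) & 15 is the distance from src to the next 16-byte boundary (0 if
already aligned), which is what the neg/ands pair computes into tmp2.
A self-contained check of that identity (plain C, nothing kernel-specific
assumed):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            /* (-src) & 15 == bytes needed to reach 16-byte alignment */
            for (uintptr_t src = 0; src < 256; src++) {
                    uintptr_t to_align = (0 - src) & 15;

                    assert(to_align < 16);
                    assert(((src + to_align) & 15) == 0);
            }
            return 0;
    }

Because this path is only taken with count >= 64, the unconditional 16-byte
header ldp/stp can overlap the aligned copy that follows without any risk of
overrunning the buffer, as the in-line comment notes.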