From patchwork Thu Aug 17 22:25:25 2017
From: Alexey Brodkin <abrodkin@synopsys.com>
To: devel@uclibc-ng.org
Cc: Vineet Gupta, Alexey Brodkin
Date: Fri, 18 Aug 2017 01:25:25 +0300
Message-Id: <1503008725-21115-1-git-send-email-abrodkin@synopsys.com>
X-Patchwork-Id: 802930
Subject: [uclibc-ng-devel] [PATCH] arc: Merge ARCv2 string routines in generic ARC .S files

In cde74b83f9b2 ("ARC: remove special CFLAGS/LDFLAGS handling") we got
rid of CONFIG_ARC_CPU_HS, which was used to select the ARCv2-specific
implementations of the optimized string routines.

So now the ARCv2-tuned memset/memcpy/strcmp are not used at all;
instead, the ARC700 versions are used for both ARC700 and ARCHS.

Without a uClibc config option we can only tell which CPU type we're
targeting from GCC's built-in defines, i.e. conditional file inclusion
in Makefiles is no longer possible. That leaves us only one option:
merge both implementations into one file and select between them with
ifdefs.

Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
---
FWIW I ran this change through LMbench, LTP and uClibc-ng-testsuite
and didn't see any significant differences or regressions.
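For reference, each merged file now has the following shape (memcpy
shown; memset and strcmp follow the same pattern, with the routine
bodies elided here):

#if !defined(__ARC700__) && !defined(__ARCHS__)
#error "Neither ARC700 nor ARCHS is defined!"
#endif

ENTRY(memcpy)

#ifdef __ARC700__
	; original ARC700-tuned body, unchanged
#endif /* __ARC700__ */

#ifdef __ARCHS__
	; former libc/string/arc/arcv2 body, moved here verbatim
#endif /* __ARCHS__ */

END(memcpy)

GCC defines exactly one of __ARC700__/__ARCHS__ depending on -mcpu, so
exactly one body gets assembled, and the #error catches toolchains
targeting neither CPU family.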
 extra/Configs/Config.in        |   1 -
 libc/string/arc/arcv2/memcpy.S | 236 ----------------------------------------
 libc/string/arc/arcv2/memset.S | 115 --------------------
 libc/string/arc/arcv2/strcmp.S |  83 --------------
 libc/string/arc/memcpy.S       | 238 ++++++++++++++++++++++++++++++++++++++++-
 libc/string/arc/memset.S       | 115 +++++++++++++++++++-
 libc/string/arc/strcmp.S       |  81 +++++++++++++-
 7 files changed, 428 insertions(+), 441 deletions(-)
 delete mode 100644 libc/string/arc/arcv2/memcpy.S
 delete mode 100644 libc/string/arc/arcv2/memset.S
 delete mode 100644 libc/string/arc/arcv2/strcmp.S

diff --git a/extra/Configs/Config.in b/extra/Configs/Config.in
index 59ef31c47dec..ce832b55bcc7 100644
--- a/extra/Configs/Config.in
+++ b/extra/Configs/Config.in
@@ -250,7 +250,6 @@ config TARGET_SUBARCH
 	default "i486" if CONFIG_486
 	default "i586" if CONFIG_586
 	default "i686" if CONFIG_686
-	default "arcv2" if CONFIG_ARC_CPU_HS
 	default ""
 
 source "extra/Configs/Config.in.arch"
diff --git a/libc/string/arc/arcv2/memcpy.S b/libc/string/arc/arcv2/memcpy.S
deleted file mode 100644
index ba29e8790721..000000000000
--- a/libc/string/arc/arcv2/memcpy.S
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
- *
- * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
- */
-
-#include <features.h>
-#include <sysdep.h>
-
-#ifdef __LITTLE_ENDIAN__
-# define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
-# define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
-# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM
-# define MERGE_2(RX,RY,IMM)
-# define EXTRACT_1(RX,RY,IMM)	and	RX, RY, 0xFFFF
-# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, IMM
-#else
-# define SHIFT_1(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
-# define SHIFT_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
-# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
-# define MERGE_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
-# define EXTRACT_1(RX,RY,IMM)	lsr	RX, RY, IMM
-# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, 0x08
-#endif
-
-#if defined(__LL64__) || defined(__ARC_LL64__)
-# define PREFETCH_READ(RX)	prefetch    [RX, 56]
-# define PREFETCH_WRITE(RX)	prefetchw   [RX, 64]
-# define LOADX(DST,RX)		ldd.ab	DST, [RX, 8]
-# define STOREX(SRC,RX)		std.ab	SRC, [RX, 8]
-# define ZOLSHFT		5
-# define ZOLAND			0x1F
-#else
-# define PREFETCH_READ(RX)	prefetch    [RX, 28]
-# define PREFETCH_WRITE(RX)	prefetchw   [RX, 32]
-# define LOADX(DST,RX)		ld.ab	DST, [RX, 4]
-# define STOREX(SRC,RX)		st.ab	SRC, [RX, 4]
-# define ZOLSHFT		4
-# define ZOLAND			0xF
-#endif
-
-ENTRY(memcpy)
-	prefetch [r1]	; Prefetch the read location
-	prefetchw [r0]	; Prefetch the write location
-	mov.f	0, r2
-;;; if size is zero
-	jz.d	[blink]
-	mov	r3, r0	; don't clobber ret val
-
-;;; if size <= 8
-	cmp	r2, 8
-	bls.d	@.Lsmallchunk
-	mov.f	lp_count, r2
-
-	and.f	r4, r0, 0x03
-	rsub	lp_count, r4, 4
-	lpnz	@.Laligndestination
-	;; LOOP BEGIN
-	ldb.ab	r5, [r1,1]
-	sub	r2, r2, 1
-	stb.ab	r5, [r3,1]
-.Laligndestination:
-
-;;; Check the alignment of the source
-	and.f	r4, r1, 0x03
-	bnz.d	@.Lsourceunaligned
-
-;;; CASE 0: Both source and destination are 32bit aligned
-;;; Convert len to Dwords, unfold x4
-	lsr.f	lp_count, r2, ZOLSHFT
-	lpnz	@.Lcopy32_64bytes
-	;; LOOP START
-	LOADX (r6, r1)
-	PREFETCH_READ (r1)
-	PREFETCH_WRITE (r3)
-	LOADX (r8, r1)
-	LOADX (r10, r1)
-	LOADX (r4, r1)
-	STOREX (r6, r3)
-	STOREX (r8, r3)
-	STOREX (r10, r3)
-	STOREX (r4, r3)
-.Lcopy32_64bytes:
-
-	and.f	lp_count, r2, ZOLAND	;Last remaining 31 bytes
-.Lsmallchunk:
-	lpnz	@.Lcopyremainingbytes
-	;; LOOP START
-	ldb.ab	r5, [r1,1]
-	stb.ab	r5, [r3,1]
-.Lcopyremainingbytes:
-
-	j	[blink]
-;;; END CASE 0
-
-.Lsourceunaligned:
-	cmp	r4, 2
-	beq.d	@.LunalignedOffby2
-	sub	r2, r2, 1
-
-	bhi.d	@.LunalignedOffby3
-	ldb.ab	r5, [r1, 1]
-
-;;; CASE 1: The source is unaligned, off by 1
-	;; Hence I need to read 1 byte for a 16bit alignment
-	;; and 2bytes to reach 32bit alignment
-	ldh.ab	r6, [r1, 2]
-	sub	r2, r2, 2
-	;; Convert to words, unfold x2
-	lsr.f	lp_count, r2, 3
-	MERGE_1 (r6, r6, 8)
-	MERGE_2 (r5, r5, 24)
-	or	r5, r5, r6
-
-	;; Both src and dst are aligned
-	lpnz	@.Lcopy8bytes_1
-	;; LOOP START
-	ld.ab	r6, [r1, 4]
-	prefetch [r1, 28]	;Prefetch the next read location
-	ld.ab	r8, [r1,4]
-	prefetchw [r3, 32]	;Prefetch the next write location
-
-	SHIFT_1	(r7, r6, 24)
-	or	r7, r7, r5
-	SHIFT_2	(r5, r6, 8)
-
-	SHIFT_1	(r9, r8, 24)
-	or	r9, r9, r5
-	SHIFT_2	(r5, r8, 8)
-
-	st.ab	r7, [r3, 4]
-	st.ab	r9, [r3, 4]
-.Lcopy8bytes_1:
-
-	;; Write back the remaining 16bits
-	EXTRACT_1 (r6, r5, 16)
-	sth.ab	r6, [r3, 2]
-	;; Write back the remaining 8bits
-	EXTRACT_2 (r5, r5, 16)
-	stb.ab	r5, [r3, 1]
-
-	and.f	lp_count, r2, 0x07	;Last 8bytes
-	lpnz	@.Lcopybytewise_1
-	;; LOOP START
-	ldb.ab	r6, [r1,1]
-	stb.ab	r6, [r3,1]
-.Lcopybytewise_1:
-	j	[blink]
-
-.LunalignedOffby2:
-;;; CASE 2: The source is unaligned, off by 2
-	ldh.ab	r5, [r1, 2]
-	sub	r2, r2, 1
-
-	;; Both src and dst are aligned
-	;; Convert to words, unfold x2
-	lsr.f	lp_count, r2, 3
-#ifdef __BIG_ENDIAN__
-	asl.nz	r5, r5, 16
-#endif
-	lpnz	@.Lcopy8bytes_2
-	;; LOOP START
-	ld.ab	r6, [r1, 4]
-	prefetch [r1, 28]	;Prefetch the next read location
-	ld.ab	r8, [r1,4]
-	prefetchw [r3, 32]	;Prefetch the next write location
-
-	SHIFT_1	(r7, r6, 16)
-	or	r7, r7, r5
-	SHIFT_2	(r5, r6, 16)
-
-	SHIFT_1	(r9, r8, 16)
-	or	r9, r9, r5
-	SHIFT_2	(r5, r8, 16)
-
-	st.ab	r7, [r3, 4]
-	st.ab	r9, [r3, 4]
-.Lcopy8bytes_2:
-
-#ifdef __BIG_ENDIAN__
-	lsr.nz	r5, r5, 16
-#endif
-	sth.ab	r5, [r3, 2]
-
-	and.f	lp_count, r2, 0x07	;Last 8bytes
-	lpnz	@.Lcopybytewise_2
-	;; LOOP START
-	ldb.ab	r6, [r1,1]
-	stb.ab	r6, [r3,1]
-.Lcopybytewise_2:
-	j	[blink]
-
-.LunalignedOffby3:
-;;; CASE 3: The source is unaligned, off by 3
-;;; Hence, I need to read 1byte for achieve the 32bit alignment
-
-	;; Both src and dst are aligned
-	;; Convert to words, unfold x2
-	lsr.f	lp_count, r2, 3
-#ifdef __BIG_ENDIAN__
-	asl.ne	r5, r5, 24
-#endif
-	lpnz	@.Lcopy8bytes_3
-	;; LOOP START
-	ld.ab	r6, [r1, 4]
-	prefetch [r1, 28]	;Prefetch the next read location
-	ld.ab	r8, [r1,4]
-	prefetchw [r3, 32]	;Prefetch the next write location
-
-	SHIFT_1	(r7, r6, 8)
-	or	r7, r7, r5
-	SHIFT_2	(r5, r6, 24)
-
-	SHIFT_1	(r9, r8, 8)
-	or	r9, r9, r5
-	SHIFT_2	(r5, r8, 24)
-
-	st.ab	r7, [r3, 4]
-	st.ab	r9, [r3, 4]
-.Lcopy8bytes_3:
-
-#ifdef __BIG_ENDIAN__
-	lsr.nz	r5, r5, 24
-#endif
-	stb.ab	r5, [r3, 1]
-
-	and.f	lp_count, r2, 0x07	;Last 8bytes
-	lpnz	@.Lcopybytewise_3
-	;; LOOP START
-	ldb.ab	r6, [r1,1]
-	stb.ab	r6, [r3,1]
-.Lcopybytewise_3:
-	j	[blink]
-
-END(memcpy)
-libc_hidden_def(memcpy)
diff --git a/libc/string/arc/arcv2/memset.S b/libc/string/arc/arcv2/memset.S
deleted file mode 100644
index 343cfaf81c6d..000000000000
--- a/libc/string/arc/arcv2/memset.S
+++ /dev/null
@@ -1,115 +0,0 @@
-
-/*
- * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
- *
- * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
- */
-
-#include <features.h>
-#include <sysdep.h>
-
-#ifdef DONT_USE_PREALLOC
-#define PREWRITE(A,B)	prefetchw [(A),(B)]
-#else
-#define PREWRITE(A,B)	prealloc [(A),(B)]
-#endif
-
-ENTRY(memset)
-	prefetchw [r0]	; Prefetch the write location
-	mov.f	0, r2
-;;; if size is zero
-	jz.d	[blink]
-	mov	r3, r0	; don't clobber ret val
-
-;;; if length < 8
-	brls.d.nt	r2, 8, .Lsmallchunk
-	mov.f	lp_count,r2
-
-	and.f	r4, r0, 0x03
-	rsub	lp_count, r4, 4
-	lpnz	@.Laligndestination
-	;; LOOP BEGIN
-	stb.ab	r1, [r3,1]
-	sub	r2, r2, 1
-.Laligndestination:
-
-;;; Destination is aligned
-	and	r1, r1, 0xFF
-	asl	r4, r1, 8
-	or	r4, r4, r1
-	asl	r5, r4, 16
-	or	r5, r5, r4
-	mov	r4, r5
-
-	sub3	lp_count, r2, 8
-	cmp	r2, 64
-	bmsk.hi	r2, r2, 5
-	mov.ls	lp_count, 0
-	add3.hi	r2, r2, 8
-
-;;; Convert len to Dwords, unfold x8
-	lsr.f	lp_count, lp_count, 6
-	lpnz	@.Lset64bytes
-	;; LOOP START
-	PREWRITE(r3, 64)	;Prefetch the next write location
-#if defined(__LL64__) || defined(__ARC_LL64__)
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-#else
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-#endif
-.Lset64bytes:
-
-	lsr.f	lp_count, r2, 5	;Last remaining max 124 bytes
-	lpnz	.Lset32bytes
-	;; LOOP START
-	prefetchw [r3, 32]	;Prefetch the next write location
-#if defined(__LL64__) || defined(__ARC_LL64__)
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-#else
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-#endif
-.Lset32bytes:
-
-	and.f	lp_count, r2, 0x1F	;Last remaining 31 bytes
-.Lsmallchunk:
-	lpnz	.Lcopy3bytes
-	;; LOOP START
-	stb.ab	r1, [r3, 1]
-.Lcopy3bytes:
-
-	j	[blink]
-
-END(memset)
-libc_hidden_def(memset)
diff --git a/libc/string/arc/arcv2/strcmp.S b/libc/string/arc/arcv2/strcmp.S
deleted file mode 100644
index 2e0e64a0c394..000000000000
--- a/libc/string/arc/arcv2/strcmp.S
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
- *
- * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
- */
-
-#include <features.h>
-#include <sysdep.h>
-
-ENTRY(strcmp)
-	or	r2, r0, r1
-	bmsk_s	r2, r2, 1
-	brne	r2, 0, @.Lcharloop
-
-;;; s1 and s2 are word aligned
-	ld.ab	r2, [r0, 4]
-
-	mov_s	r12, 0x01010101
-	ror	r11, r12
-	.align	4
-.LwordLoop:
-	ld.ab	r3, [r1, 4]
-	;; Detect NULL char in str1
-	sub	r4, r2, r12
-	ld.ab	r5, [r0, 4]
-	bic	r4, r4, r2
-	and	r4, r4, r11
-	brne.d.nt	r4, 0, .LfoundNULL
-	;; Check if the read locations are the same
-	cmp	r2, r3
-	beq.d	.LwordLoop
-	mov.eq	r2, r5
-
-	;; A match is found, spot it out
-#ifdef __LITTLE_ENDIAN__
-	swape	r3, r3
-	mov_s	r0, 1
-	swape	r2, r2
-#else
-	mov_s	r0, 1
-#endif
-	cmp_s	r2, r3
-	j_s.d	[blink]
-	bset.lo	r0, r0, 31
-
-	.align 4
-.LfoundNULL:
-#ifdef __BIG_ENDIAN__
-	swape	r4, r4
-	swape	r2, r2
-	swape	r3, r3
-#endif
-	;; Find null byte
-	ffs	r0, r4
-	bmsk	r2, r2, r0
-	bmsk	r3, r3, r0
-	swape	r2, r2
-	swape	r3, r3
-	;; make the return value
-	sub.f	r0, r2, r3
-	mov.hi	r0, 1
-	j_s.d	[blink]
-	bset.lo	r0, r0, 31
-
-	.align 4
-.Lcharloop:
-	ldb.ab	r2, [r0, 1]
-	ldb.ab	r3, [r1, 1]
-	nop
-	breq	r2, 0, .Lcmpend
-	breq	r2, r3, .Lcharloop
-
-	.align 4
-.Lcmpend:
-	j_s.d	[blink]
-	sub	r0, r2, r3
-END(strcmp)
-libc_hidden_def(strcmp)
-
-#ifndef __UCLIBC_HAS_LOCALE__
-strong_alias(strcmp,strcoll)
-libc_hidden_def(strcoll)
-#endif
diff --git a/libc/string/arc/memcpy.S b/libc/string/arc/memcpy.S
index 1c11951e40d2..69d7220b8a7e 100644
--- a/libc/string/arc/memcpy.S
+++ b/libc/string/arc/memcpy.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,12 +7,18 @@
 
 #include <sysdep.h>
 
+#if !defined(__ARC700__) && !defined(__ARCHS__)
+#error "Neither ARC700 nor ARCHS is defined!"
+#endif
+
+ENTRY(memcpy)
+
+#ifdef __ARC700__
 /* This memcpy implementation does not support objects of 1GB or larger -
    the check for alignment does not work then.  */
 /* We assume that most sources and destinations are aligned, and
    that also lengths are mostly a multiple of four, although to a lesser
    extent.  */
-ENTRY(memcpy)
 	or	r3,r0,r1
 	asl_s	r3,r3,30
 	mov_s	r5,r0
@@ -67,5 +73,233 @@ ENTRY(memcpy)
 .Lendbloop:
 	j_s.d	[blink]
 	stb	r12,[r5,0]
+#endif /* __ARC700__ */
+
+#ifdef __ARCHS__
+#ifdef __LITTLE_ENDIAN__
+# define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
+# define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
+# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM
+# define MERGE_2(RX,RY,IMM)
+# define EXTRACT_1(RX,RY,IMM)	and	RX, RY, 0xFFFF
+# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, IMM
+#else
+# define SHIFT_1(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
+# define SHIFT_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
+# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
+# define MERGE_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
+# define EXTRACT_1(RX,RY,IMM)	lsr	RX, RY, IMM
+# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, 0x08
+#endif
+
+#if defined(__LL64__) || defined(__ARC_LL64__)
+# define PREFETCH_READ(RX)	prefetch    [RX, 56]
+# define PREFETCH_WRITE(RX)	prefetchw   [RX, 64]
+# define LOADX(DST,RX)		ldd.ab	DST, [RX, 8]
+# define STOREX(SRC,RX)		std.ab	SRC, [RX, 8]
+# define ZOLSHFT		5
+# define ZOLAND			0x1F
+#else
+# define PREFETCH_READ(RX)	prefetch    [RX, 28]
+# define PREFETCH_WRITE(RX)	prefetchw   [RX, 32]
+# define LOADX(DST,RX)		ld.ab	DST, [RX, 4]
+# define STOREX(SRC,RX)		st.ab	SRC, [RX, 4]
+# define ZOLSHFT		4
+# define ZOLAND			0xF
+#endif
+
+	prefetch [r1]	; Prefetch the read location
+	prefetchw [r0]	; Prefetch the write location
+	mov.f	0, r2
+;;; if size is zero
+	jz.d	[blink]
+	mov	r3, r0	; don't clobber ret val
+
+;;; if size <= 8
+	cmp	r2, 8
+	bls.d	@.Lsmallchunk
+	mov.f	lp_count, r2
+
+	and.f	r4, r0, 0x03
+	rsub	lp_count, r4, 4
+	lpnz	@.Laligndestination
+	;; LOOP BEGIN
+	ldb.ab	r5, [r1,1]
+	sub	r2, r2, 1
+	stb.ab	r5, [r3,1]
+.Laligndestination:
+
+;;; Check the alignment of the source
+	and.f	r4, r1, 0x03
+	bnz.d	@.Lsourceunaligned
+
+;;; CASE 0: Both source and destination are 32bit aligned
+;;; Convert len to Dwords, unfold x4
+	lsr.f	lp_count, r2, ZOLSHFT
+	lpnz	@.Lcopy32_64bytes
+	;; LOOP START
+	LOADX (r6, r1)
+	PREFETCH_READ (r1)
+	PREFETCH_WRITE (r3)
+	LOADX (r8, r1)
+	LOADX (r10, r1)
+	LOADX (r4, r1)
+	STOREX (r6, r3)
+	STOREX (r8, r3)
+	STOREX (r10, r3)
+	STOREX (r4, r3)
+.Lcopy32_64bytes:
+
+	and.f	lp_count, r2, ZOLAND	;Last remaining 31 bytes
+.Lsmallchunk:
+	lpnz	@.Lcopyremainingbytes
+	;; LOOP START
+	ldb.ab	r5, [r1,1]
+	stb.ab	r5, [r3,1]
+.Lcopyremainingbytes:
+
+	j	[blink]
+;;; END CASE 0
+
+.Lsourceunaligned:
+	cmp	r4, 2
+	beq.d	@.LunalignedOffby2
+	sub	r2, r2, 1
+
+	bhi.d	@.LunalignedOffby3
+	ldb.ab	r5, [r1, 1]
+
+;;; CASE 1: The source is unaligned, off by 1
+	;; Hence I need to read 1 byte for a 16bit alignment
+	;; and 2bytes to reach 32bit alignment
+	ldh.ab	r6, [r1, 2]
+	sub	r2, r2, 2
+	;; Convert to words, unfold x2
+	lsr.f	lp_count, r2, 3
+	MERGE_1 (r6, r6, 8)
+	MERGE_2 (r5, r5, 24)
+	or	r5, r5, r6
+
+	;; Both src and dst are aligned
+	lpnz	@.Lcopy8bytes_1
+	;; LOOP START
+	ld.ab	r6, [r1, 4]
+	prefetch [r1, 28]	;Prefetch the next read location
+	ld.ab	r8, [r1,4]
+	prefetchw [r3, 32]	;Prefetch the next write location
+
+	SHIFT_1	(r7, r6, 24)
+	or	r7, r7, r5
+	SHIFT_2	(r5, r6, 8)
+
+	SHIFT_1	(r9, r8, 24)
+	or	r9, r9, r5
+	SHIFT_2	(r5, r8, 8)
+
+	st.ab	r7, [r3, 4]
+	st.ab	r9, [r3, 4]
+.Lcopy8bytes_1:
+
+	;; Write back the remaining 16bits
+	EXTRACT_1 (r6, r5, 16)
+	sth.ab	r6, [r3, 2]
+	;; Write back the remaining 8bits
+	EXTRACT_2 (r5, r5, 16)
+	stb.ab	r5, [r3, 1]
+
+	and.f	lp_count, r2, 0x07	;Last 8bytes
+	lpnz	@.Lcopybytewise_1
+	;; LOOP START
+	ldb.ab	r6, [r1,1]
+	stb.ab	r6, [r3,1]
+.Lcopybytewise_1:
+	j	[blink]
+
+.LunalignedOffby2:
+;;; CASE 2: The source is unaligned, off by 2
+	ldh.ab	r5, [r1, 2]
+	sub	r2, r2, 1
+
+	;; Both src and dst are aligned
+	;; Convert to words, unfold x2
+	lsr.f	lp_count, r2, 3
+#ifdef __BIG_ENDIAN__
+	asl.nz	r5, r5, 16
+#endif
+	lpnz	@.Lcopy8bytes_2
+	;; LOOP START
+	ld.ab	r6, [r1, 4]
+	prefetch [r1, 28]	;Prefetch the next read location
+	ld.ab	r8, [r1,4]
+	prefetchw [r3, 32]	;Prefetch the next write location
+
+	SHIFT_1	(r7, r6, 16)
+	or	r7, r7, r5
+	SHIFT_2	(r5, r6, 16)
+
+	SHIFT_1	(r9, r8, 16)
+	or	r9, r9, r5
+	SHIFT_2	(r5, r8, 16)
+
+	st.ab	r7, [r3, 4]
+	st.ab	r9, [r3, 4]
+.Lcopy8bytes_2:
+
+#ifdef __BIG_ENDIAN__
+	lsr.nz	r5, r5, 16
+#endif
+	sth.ab	r5, [r3, 2]
+
+	and.f	lp_count, r2, 0x07	;Last 8bytes
+	lpnz	@.Lcopybytewise_2
+	;; LOOP START
+	ldb.ab	r6, [r1,1]
+	stb.ab	r6, [r3,1]
+.Lcopybytewise_2:
+	j	[blink]
+
+.LunalignedOffby3:
+;;; CASE 3: The source is unaligned, off by 3
+;;; Hence, I need to read 1byte for achieve the 32bit alignment
+
+	;; Both src and dst are aligned
+	;; Convert to words, unfold x2
+	lsr.f	lp_count, r2, 3
+#ifdef __BIG_ENDIAN__
+	asl.ne	r5, r5, 24
+#endif
+	lpnz	@.Lcopy8bytes_3
+	;; LOOP START
+	ld.ab	r6, [r1, 4]
+	prefetch [r1, 28]	;Prefetch the next read location
+	ld.ab	r8, [r1,4]
+	prefetchw [r3, 32]	;Prefetch the next write location
+
+	SHIFT_1	(r7, r6, 8)
+	or	r7, r7, r5
+	SHIFT_2	(r5, r6, 24)
+
+	SHIFT_1	(r9, r8, 8)
+	or	r9, r9, r5
+	SHIFT_2	(r5, r8, 24)
+
+	st.ab	r7, [r3, 4]
+	st.ab	r9, [r3, 4]
+.Lcopy8bytes_3:
+
+#ifdef __BIG_ENDIAN__
+	lsr.nz	r5, r5, 24
+#endif
+	stb.ab	r5, [r3, 1]
+
+	and.f	lp_count, r2, 0x07	;Last 8bytes
+	lpnz	@.Lcopybytewise_3
+	;; LOOP START
+	ldb.ab	r6, [r1,1]
+	stb.ab	r6, [r3,1]
+.Lcopybytewise_3:
+	j	[blink]
+#endif /* __ARCHS__ */
+
 END(memcpy)
 libc_hidden_def(memcpy)
diff --git a/libc/string/arc/memset.S b/libc/string/arc/memset.S
index f4048455a433..0b74ddc7fca8 100644
--- a/libc/string/arc/memset.S
+++ b/libc/string/arc/memset.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,10 +7,15 @@
 
 #include <sysdep.h>
 
-#define SMALL	7 /* Must be at least 6 to deal with alignment/loop issues.  */
+#if !defined(__ARC700__) && !defined(__ARCHS__)
+#error "Neither ARC700 nor ARCHS is defined!"
+#endif
 
 ENTRY(memset)
 
+#ifdef __ARC700__
+#define SMALL	7 /* Must be at least 6 to deal with alignment/loop issues.  */
+
 	mov_s	r4,r0
 	or	r12,r0,r2
 	bmsk.f	r12,r12,1
@@ -47,5 +52,111 @@ ENTRY(memset)
 	stb.ab	r1,[r4,1]
 .Ltiny_end:
 	j_s	[blink]
+#endif /* __ARC700__ */
+
+#ifdef __ARCHS__
+#ifdef DONT_USE_PREALLOC
+#define PREWRITE(A,B)	prefetchw [(A),(B)]
+#else
+#define PREWRITE(A,B)	prealloc [(A),(B)]
+#endif
+
+	prefetchw [r0]	; Prefetch the write location
+	mov.f	0, r2
+;;; if size is zero
+	jz.d	[blink]
+	mov	r3, r0	; don't clobber ret val
+
+;;; if length < 8
+	brls.d.nt	r2, 8, .Lsmallchunk
+	mov.f	lp_count,r2
+
+	and.f	r4, r0, 0x03
+	rsub	lp_count, r4, 4
+	lpnz	@.Laligndestination
+	;; LOOP BEGIN
+	stb.ab	r1, [r3,1]
+	sub	r2, r2, 1
+.Laligndestination:
+
+;;; Destination is aligned
+	and	r1, r1, 0xFF
+	asl	r4, r1, 8
+	or	r4, r4, r1
+	asl	r5, r4, 16
+	or	r5, r5, r4
+	mov	r4, r5
+
+	sub3	lp_count, r2, 8
+	cmp	r2, 64
+	bmsk.hi	r2, r2, 5
+	mov.ls	lp_count, 0
+	add3.hi	r2, r2, 8
+
+;;; Convert len to Dwords, unfold x8
+	lsr.f	lp_count, lp_count, 6
+	lpnz	@.Lset64bytes
+	;; LOOP START
+	PREWRITE(r3, 64)	;Prefetch the next write location
+#if defined(__LL64__) || defined(__ARC_LL64__)
+	std.ab	r4, [r3, 8]
+	std.ab	r4, [r3, 8]
+	std.ab	r4, [r3, 8]
+	std.ab	r4, [r3, 8]
+	std.ab	r4, [r3, 8]
+	std.ab	r4, [r3, 8]
+	std.ab	r4, [r3, 8]
+	std.ab	r4, [r3, 8]
+#else
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+#endif
+.Lset64bytes:
+
+	lsr.f	lp_count, r2, 5	;Last remaining max 124 bytes
+	lpnz	.Lset32bytes
+	;; LOOP START
+	prefetchw [r3, 32]	;Prefetch the next write location
+#if defined(__LL64__) || defined(__ARC_LL64__)
+	std.ab	r4, [r3, 8]
+	std.ab	r4, [r3, 8]
+	std.ab	r4, [r3, 8]
+	std.ab	r4, [r3, 8]
+#else
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r4, [r3, 4]
+#endif
+.Lset32bytes:
+
+	and.f	lp_count, r2, 0x1F	;Last remaining 31 bytes
+.Lsmallchunk:
+	lpnz	.Lcopy3bytes
+	;; LOOP START
+	stb.ab	r1, [r3, 1]
+.Lcopy3bytes:
+
+	j	[blink]
+#endif /* __ARCHS__ */
+
 END(memset)
 libc_hidden_def(memset)
diff --git a/libc/string/arc/strcmp.S b/libc/string/arc/strcmp.S
index 5a0e56045c44..ad38d9e00c8a 100644
--- a/libc/string/arc/strcmp.S
+++ b/libc/string/arc/strcmp.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -8,6 +8,13 @@
 #include <features.h>
 #include <sysdep.h>
 
+#if !defined(__ARC700__) && !defined(__ARCHS__)
+#error "Neither ARC700 nor ARCHS is defined!"
+#endif
+
+ENTRY(strcmp)
+
+#ifdef __ARC700__
 /* This is optimized primarily for the ARC700.
    It would be possible to speed up the loops by one cycle / word
    respective one cycle / byte by forcing double source 1 alignment, unrolling
@@ -15,7 +22,6 @@
    source 1; however, that would increase the overhead for loop setup / finish,
    and strcmp might often terminate early.  */
-ENTRY(strcmp)
 	or	r2,r0,r1
 	bmsk_s	r2,r2,1
 	brne	r2,0,.Lcharloop
@@ -93,6 +99,77 @@ ENTRY(strcmp)
 .Lcmpend:
 	j_s.d	[blink]
 	sub	r0,r2,r3
+#endif /* __ARC700__ */
+
+#ifdef __ARCHS__
+	or	r2, r0, r1
+	bmsk_s	r2, r2, 1
+	brne	r2, 0, @.Lcharloop
+
+;;; s1 and s2 are word aligned
+	ld.ab	r2, [r0, 4]
+
+	mov_s	r12, 0x01010101
+	ror	r11, r12
+	.align	4
+.LwordLoop:
+	ld.ab	r3, [r1, 4]
+	;; Detect NULL char in str1
+	sub	r4, r2, r12
+	ld.ab	r5, [r0, 4]
+	bic	r4, r4, r2
+	and	r4, r4, r11
+	brne.d.nt	r4, 0, .LfoundNULL
+	;; Check if the read locations are the same
+	cmp	r2, r3
+	beq.d	.LwordLoop
+	mov.eq	r2, r5
+
+	;; A match is found, spot it out
+#ifdef __LITTLE_ENDIAN__
+	swape	r3, r3
+	mov_s	r0, 1
+	swape	r2, r2
+#else
+	mov_s	r0, 1
+#endif
+	cmp_s	r2, r3
+	j_s.d	[blink]
+	bset.lo	r0, r0, 31
+
+	.align 4
+.LfoundNULL:
+#ifdef __BIG_ENDIAN__
+	swape	r4, r4
+	swape	r2, r2
+	swape	r3, r3
+#endif
+	;; Find null byte
+	ffs	r0, r4
+	bmsk	r2, r2, r0
+	bmsk	r3, r3, r0
+	swape	r2, r2
+	swape	r3, r3
+	;; make the return value
+	sub.f	r0, r2, r3
+	mov.hi	r0, 1
+	j_s.d	[blink]
+	bset.lo	r0, r0, 31
+
+	.align 4
+.Lcharloop:
+	ldb.ab	r2, [r0, 1]
+	ldb.ab	r3, [r1, 1]
+	nop
+	breq	r2, 0, .Lcmpend
+	breq	r2, r3, .Lcharloop
+
+	.align 4
+.Lcmpend:
+	j_s.d	[blink]
+	sub	r0, r2, r3
+#endif /* __ARCHS__ */
+
 END(strcmp)
 libc_hidden_def(strcmp)