From patchwork Mon Jul 11 17:09:44 2016
X-Patchwork-Submitter: "Andre Vieira (lists)"
X-Patchwork-Id: 646999
Subject: Re: [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
To: gcc-patches@gcc.gnu.org
References: <561D38F7.809@arm.com> <577672E3.6080002@foss.arm.com> <577CE2F2.1080805@arm.com>
Cc: Ramana Radhakrishnan
From: "Andre Vieira (lists)"
Message-ID: <5783D2D8.2090707@arm.com>
Date: Mon, 11 Jul 2016 18:09:44 +0100
In-Reply-To: <577CE2F2.1080805@arm.com>

On 06/07/16 11:52, Andre Vieira (lists) wrote: > On 01/07/16 14:40, Ramana Radhakrishnan wrote: >> >> >> On 13/10/15 18:01, Andre Vieira wrote: >>> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings (https://git.linaro.org/toolchain/cortex-strings.git), which was
contributed by ARM under Free BSD license. >>> >>> The new aeabi_idiv routine is used to replace the one in libgcc/config/arm/lib1funcs.S. This replacement happens within the Thumb1 wrapper. The new routine is under LGPLv3 license. >> >> This is not under LGPLv3 . It is under GPLv3 with the runtime library exception license, there's a difference. Assuming your licensing expectation is ok .... read on for more of a review. >> >>> >>> The main advantage of this version is that it can improve the performance of the aeabi_idiv function for Thumb1. This solution will also increase the code size. So it will only be used if __OPTIMIZE_SIZE__ is not defined. >>> >>> Make check passed for armv6-m. >>> >>> libgcc/ChangeLog: >>> 2015-08-10 Hale Wang >>> Andre Vieira >>> >>> * config/arm/lib1funcs.S: Add new wrapper. >>> >>> 0001-integer-division.patch >>> >>> >>> From 832a3d6af6f06399f70b5a4ac3727d55960c93b7 Mon Sep 17 00:00:00 2001 >>> From: Andre Simoes Dias Vieira >>> Date: Fri, 21 Aug 2015 14:23:28 +0100 >>> Subject: [PATCH] new wrapper idivmod >>> >>> --- >>> libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------ >>> 1 file changed, 217 insertions(+), 33 deletions(-) >>> >>> diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S >>> index 252efcbd5385cc58a5ce1e48c6816d36a6f4c797..c9e544114590da8cde88382bea0f67206e593816 100644 >>> --- a/libgcc/config/arm/lib1funcs.S >>> +++ b/libgcc/config/arm/lib1funcs.S >>> @@ -306,34 +306,12 @@ LSYM(Lend_fde): >>> #ifdef __ARM_EABI__ >>> .macro THUMB_LDIV0 name signed >>> #if defined(__ARM_ARCH_6M__) >>> - .ifc \signed, unsigned >>> - cmp r0, #0 >>> - beq 1f >>> - mov r0, #0 >>> - mvn r0, r0 @ 0xffffffff >>> -1: >>> - .else >>> - cmp r0, #0 >>> - beq 2f >>> - blt 3f >>> + >>> + push {r0, lr} >>> mov r0, #0 >>> - mvn r0, r0 >>> - lsr r0, r0, #1 @ 0x7fffffff >>> - b 2f >>> -3: mov r0, #0x80 >>> - lsl r0, r0, #24 @ 0x80000000 >>> -2: >>> - .endif >>> - push {r0, r1, r2} >>> - ldr r0, 4f >>> - adr r1, 4f >>> - add r0, r1 >>> - str r0, [sp, #8] >>> - @ We know we are not on armv4t, so pop pc is safe. >>> - pop {r0, r1, pc} >>> - .align 2 >>> -4: >>> - .word __aeabi_idiv0 - 4b >>> + bl SYM(__aeabi_idiv0) >>> + pop {r1, pc} >>> + >> >> I'd still retain the comment about pop pc here because there's often a misconception of merging armv4t and armv6m code. >> >>> #elif defined(__thumb2__) >>> .syntax unified >>> .ifc \signed, unsigned >>> @@ -945,7 +923,170 @@ LSYM(Lover7): >>> add dividend, work >>> .endif >>> LSYM(Lgot_result): >>> -.endm >>> +.endm >>> + >>> +#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__) >>> +/* If performance is preferred, the following functions are provided. */ >>> + >> >> Comment above #if please and also check elsewhere in patch. >> >>> +/* Branch to div(n), and jump to label if curbit is lo than divisior. */ >>> +.macro BranchToDiv n, label >>> + lsr curbit, dividend, \n >>> + cmp curbit, divisor >>> + blo \label >>> +.endm >>> + >>> +/* Body of div(n). Shift the divisor in n bits and compare the divisor >>> + and dividend. Update the dividend as the substruction result. */ >>> +.macro DoDiv n >>> + lsr curbit, dividend, \n >>> + cmp curbit, divisor >>> + bcc 1f >>> + lsl curbit, divisor, \n >>> + sub dividend, dividend, curbit >>> + >>> +1: adc result, result >>> +.endm >>> + >>> +/* The body of division with positive divisor. Unless the divisor is very >>> + big, shift it up in multiples of four bits, since this is the amount of >>> + unwinding in the main division loop. 
Continue shifting until the divisor >>> + is larger than the dividend. */ >>> +.macro THUMB1_Div_Positive >>> + mov result, #0 >>> + BranchToDiv #1, LSYM(Lthumb1_div1) >>> + BranchToDiv #4, LSYM(Lthumb1_div4) >>> + BranchToDiv #8, LSYM(Lthumb1_div8) >>> + BranchToDiv #12, LSYM(Lthumb1_div12) >>> + BranchToDiv #16, LSYM(Lthumb1_div16) >>> +LSYM(Lthumb1_div_large_positive): >>> + mov result, #0xff >>> + lsl divisor, divisor, #8 >>> + rev result, result >>> + lsr curbit, dividend, #16 >>> + cmp curbit, divisor >>> + blo 1f >>> + asr result, #8 >>> + lsl divisor, divisor, #8 >>> + beq LSYM(Ldivbyzero_waypoint) >>> + >>> +1: lsr curbit, dividend, #12 >>> + cmp curbit, divisor >>> + blo LSYM(Lthumb1_div12) >>> + b LSYM(Lthumb1_div16) >>> +LSYM(Lthumb1_div_loop): >>> + lsr divisor, divisor, #8 >>> +LSYM(Lthumb1_div16): >>> + Dodiv #15 >>> + Dodiv #14 >>> + Dodiv #13 >>> + Dodiv #12 >>> +LSYM(Lthumb1_div12): >>> + Dodiv #11 >>> + Dodiv #10 >>> + Dodiv #9 >>> + Dodiv #8 >>> + bcs LSYM(Lthumb1_div_loop) >>> +LSYM(Lthumb1_div8): >>> + Dodiv #7 >>> + Dodiv #6 >>> + Dodiv #5 >>> +LSYM(Lthumb1_div5): >>> + Dodiv #4 >>> +LSYM(Lthumb1_div4): >>> + Dodiv #3 >>> +LSYM(Lthumb1_div3): >>> + Dodiv #2 >>> +LSYM(Lthumb1_div2): >>> + Dodiv #1 >>> +LSYM(Lthumb1_div1): >>> + sub divisor, dividend, divisor >>> + bcs 1f >>> + cpy divisor, dividend >>> + >>> +1: adc result, result >>> + cpy dividend, result >>> + RET >>> + >>> +LSYM(Ldivbyzero_waypoint): >>> + b LSYM(Ldiv0) >>> +.endm >>> + >>> +/* The body of division with negative divisor. Similar with >>> + THUMB1_Div_Positive except that the shift steps are in multiples >>> + of six bits. */ >>> +.macro THUMB1_Div_Negative >>> + lsr result, divisor, #31 >>> + beq 1f >>> + neg divisor, divisor >>> + >>> +1: asr curbit, dividend, #32 >>> + bcc 2f >>> + neg dividend, dividend >>> + >>> +2: eor curbit, result >>> + mov result, #0 >>> + cpy ip, curbit >>> + BranchToDiv #4, LSYM(Lthumb1_div_negative4) >>> + BranchToDiv #8, LSYM(Lthumb1_div_negative8) >>> +LSYM(Lthumb1_div_large): >>> + mov result, #0xfc >>> + lsl divisor, divisor, #6 >>> + rev result, result >>> + lsr curbit, dividend, #8 >>> + cmp curbit, divisor >>> + blo LSYM(Lthumb1_div_negative8) >>> + >>> + lsl divisor, divisor, #6 >>> + asr result, result, #6 >>> + cmp curbit, divisor >>> + blo LSYM(Lthumb1_div_negative8) >>> + >>> + lsl divisor, divisor, #6 >>> + asr result, result, #6 >>> + cmp curbit, divisor >>> + blo LSYM(Lthumb1_div_negative8) >>> + >>> + lsl divisor, divisor, #6 >>> + beq LSYM(Ldivbyzero_negative) >>> + asr result, result, #6 >>> + b LSYM(Lthumb1_div_negative8) >>> +LSYM(Lthumb1_div_negative_loop): >>> + lsr divisor, divisor, #6 >>> +LSYM(Lthumb1_div_negative8): >>> + DoDiv #7 >>> + DoDiv #6 >>> + DoDiv #5 >>> + DoDiv #4 >>> +LSYM(Lthumb1_div_negative4): >>> + DoDiv #3 >>> + DoDiv #2 >>> + bcs LSYM(Lthumb1_div_negative_loop) >>> + DoDiv #1 >>> + sub divisor, dividend, divisor >>> + bcs 1f >>> + cpy divisor, dividend >>> + >>> +1: cpy curbit, ip >>> + adc result, result >>> + asr curbit, curbit, #1 >>> + cpy dividend, result >>> + bcc 2f >>> + neg dividend, dividend >>> + cmp curbit, #0 >>> + >>> +2: bpl 3f >>> + neg divisor, divisor >>> + >>> +3: RET >>> + >>> +LSYM(Ldivbyzero_negative): >>> + cpy curbit, ip >>> + asr curbit, curbit, #1 >>> + bcc LSYM(Ldiv0) >>> + neg dividend, dividend >>> +.endm >>> +#endif /* ARM Thumb version. 
*/ >>> + >>> /* ------------------------------------------------------------------------ */ >>> /* Start of the Real Functions */ >>> /* ------------------------------------------------------------------------ */ >>> @@ -955,6 +1096,7 @@ LSYM(Lgot_result): >>> >>> FUNC_START udivsi3 >>> FUNC_ALIAS aeabi_uidiv udivsi3 >>> +#if defined(__OPTIMIZE_SIZE__) >>> >>> cmp divisor, #0 >>> beq LSYM(Ldiv0) >>> @@ -972,6 +1114,14 @@ LSYM(udivsi3_skip_div0_test): >>> pop { work } >>> RET >>> >>> +#else >>> + /* Implementation of aeabi_uidiv for ARMv6m. This version is only >>> + used in ARMv6-M when we need an efficient implementation. */ >>> +LSYM(udivsi3_skip_div0_test): >>> + THUMB1_Div_Positive >>> + >>> +#endif /* __OPTIMIZE_SIZE__ */ >>> + >>> #elif defined(__ARM_ARCH_EXT_IDIV__) >>> >>> ARM_FUNC_START udivsi3 >>> @@ -1023,12 +1173,21 @@ LSYM(udivsi3_skip_div0_test): >>> FUNC_START aeabi_uidivmod >>> cmp r1, #0 >>> beq LSYM(Ldiv0) >>> +# if defined(__OPTIMIZE_SIZE__) >>> push {r0, r1, lr} >>> bl LSYM(udivsi3_skip_div0_test) >>> POP {r1, r2, r3} >>> mul r2, r0 >>> sub r1, r1, r2 >>> bx r3 >>> +# else >>> + /* Both the quotient and remainder are calculated simultaneously >>> + in THUMB1_Div_Positive. There is no need to calculate the >>> + remainder again here. */ >>> + b LSYM(udivsi3_skip_div0_test) >>> + RET >>> +# endif /* __OPTIMIZE_SIZE__ */ >>> + >>> #elif defined(__ARM_ARCH_EXT_IDIV__) >>> ARM_FUNC_START aeabi_uidivmod >>> cmp r1, #0 >>> @@ -1084,7 +1243,7 @@ LSYM(Lover10): >>> RET >>> >>> #else /* ARM version. */ >>> - >>> + >>> FUNC_START umodsi3 >>> >>> subs r2, r1, #1 @ compare divisor with 1 >>> @@ -1109,8 +1268,9 @@ LSYM(Lover10): >>> >>> #if defined(__prefer_thumb__) >>> >>> - FUNC_START divsi3 >>> + FUNC_START divsi3 >>> FUNC_ALIAS aeabi_idiv divsi3 >>> +#if defined(__OPTIMIZE_SIZE__) >>> >>> cmp divisor, #0 >>> beq LSYM(Ldiv0) >>> @@ -1133,7 +1293,7 @@ LSYM(Lover11): >>> blo LSYM(Lgot_result) >>> >>> THUMB_DIV_MOD_BODY 0 >>> - >>> + >>> mov r0, result >>> mov work, ip >>> cmp work, #0 >>> @@ -1142,6 +1302,21 @@ LSYM(Lover11): >>> LSYM(Lover12): >>> pop { work } >>> RET >>> +#else >>> + /* Implementation of aeabi_idiv for ARMv6m. This version is only >>> + used in ARMv6-M when we need an efficient implementation. */ >>> +LSYM(divsi3_skip_div0_test): >>> + cpy curbit, dividend >>> + orr curbit, divisor >>> + bmi LSYM(Lthumb1_div_negative) >>> + >>> +LSYM(Lthumb1_div_positive): >>> + THUMB1_Div_Positive >>> + >>> +LSYM(Lthumb1_div_negative): >>> + THUMB1_Div_Negative >>> + >>> +#endif /* __OPTIMIZE_SIZE__ */ >>> >>> #elif defined(__ARM_ARCH_EXT_IDIV__) >>> >>> @@ -1154,8 +1329,8 @@ LSYM(Lover12): >>> RET >>> >>> #else /* ARM/Thumb-2 version. */ >>> - >>> - ARM_FUNC_START divsi3 >>> + >>> + ARM_FUNC_START divsi3 >>> ARM_FUNC_ALIAS aeabi_idiv divsi3 >>> >>> cmp r1, #0 >>> @@ -1209,12 +1384,21 @@ LSYM(divsi3_skip_div0_test): >>> FUNC_START aeabi_idivmod >>> cmp r1, #0 >>> beq LSYM(Ldiv0) >>> +# if defined(__OPTIMIZE_SIZE__) >>> push {r0, r1, lr} >>> bl LSYM(divsi3_skip_div0_test) >>> POP {r1, r2, r3} >>> mul r2, r0 >>> sub r1, r1, r2 >>> bx r3 >>> +# else >>> + /* Both the quotient and remainder are calculated simultaneously >>> + in THUMB1_Div_Positive and THUMB1_Div_Negative. There is no >>> + need to calculate the remainder again here. 
*/ >>> + b LSYM(divsi3_skip_div0_test) >>> + RET >>> +# endif /* __OPTIMIZE_SIZE__ */ >>> + >>> #elif defined(__ARM_ARCH_EXT_IDIV__) >>> ARM_FUNC_START aeabi_idivmod >>> cmp r1, #0 >>> -- 1.9.1 >>> >> >> Otherwise OK if no regressions and the following request passes. >> >> Can you ensure that libgcc for one ARM state and one Thumb2 state non-v6m configuration gives identical binaries with and without your patch? >> >> regards >> Ramana >> > Hi Ramana, > > Thank you for the comments. Sorry about the license; it must have been a > mixup somewhere. > > I put back the 'pop pc is safe' assembly comment and I moved some > comments before the #if and #else as requested. I left some in place > because they did not apply to the whole block but simply to the first > assembly instruction after the #if/else. > > I checked that the assembly generated for libgcc was the same with and > without the patch for armv7-a in arm mode and armv7-m in thumb mode. > > Is this OK? > > Cheers, > Andre > > libgcc/ChangeLog: > 2016-07-06 Hale Wang > Andre Vieira > > * config/arm/lib1funcs.S: Add new wrapper. > I had to rebase the patch on top of the ARMv8-M patches, which meant changing a context line, since the code for ARMv6-M is now reused for ARMv8-M Baseline. I ran regression tests for both ARMv6-M and ARMv8-M Baseline and compared the generated libgcc for ARMv7-A in ARM mode and ARMv7-M in Thumb mode, observing no changes. Applying the patch, as it was previously OK'ed. Cheers, Andre From ad757144fad2d9608ed840153071bb5d470193ef Mon Sep 17 00:00:00 2001 From: Andre Simoes Dias Vieira Date: Thu, 7 Jul 2016 10:49:19 +0100 Subject: [PATCH] integer division --- libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 218 insertions(+), 32 deletions(-) diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S index 96e206ee542126c5d68091087446afe9f01aa51f..ba52e7b762f5573445349a574a3878859a992f13 100644 --- a/libgcc/config/arm/lib1funcs.S +++ b/libgcc/config/arm/lib1funcs.S @@ -311,34 +311,13 @@ LSYM(Lend_fde): #ifdef __ARM_EABI__ .macro THUMB_LDIV0 name signed #ifdef NOT_ISA_TARGET_32BIT - .ifc \signed, unsigned - cmp r0, #0 - beq 1f - mov r0, #0 - mvn r0, r0 @ 0xffffffff -1: - .else - cmp r0, #0 - beq 2f - blt 3f + + push {r0, lr} mov r0, #0 - mvn r0, r0 - lsr r0, r0, #1 @ 0x7fffffff - b 2f -3: mov r0, #0x80 - lsl r0, r0, #24 @ 0x80000000 -2: - .endif - push {r0, r1, r2} - ldr r0, 4f - adr r1, 4f - add r0, r1 - str r0, [sp, #8] + bl SYM(__aeabi_idiv0) @ We know we are not on armv4t, so pop pc is safe. - pop {r0, r1, pc} - .align 2 -4: - .word __aeabi_idiv0 - 4b + pop {r1, pc} + #elif defined(__thumb2__) .syntax unified .ifc \signed, unsigned @@ -950,7 +929,170 @@ LSYM(Lover7): add dividend, work .endif LSYM(Lgot_result): -.endm +.endm + +/* If performance is preferred, the following functions are provided. */ +#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__) + +/* Branch to div(n), and jump to the label if curbit is lower than the divisor. */ +.macro BranchToDiv n, label + lsr curbit, dividend, \n + cmp curbit, divisor + blo \label +.endm + +/* Body of div(n). Shift the divisor left by n bits and compare it + with the dividend. Update the dividend with the subtraction result. */ +.macro DoDiv n + lsr curbit, dividend, \n + cmp curbit, divisor + bcc 1f + lsl curbit, divisor, \n + sub dividend, dividend, curbit + +1: adc result, result +.endm + +/* The body of division with positive divisor. 
Unless the divisor is very + big, shift it up in multiples of four bits, since this is the amount of + unwinding in the main division loop. Continue shifting until the divisor + is larger than the dividend. */ +.macro THUMB1_Div_Positive + mov result, #0 + BranchToDiv #1, LSYM(Lthumb1_div1) + BranchToDiv #4, LSYM(Lthumb1_div4) + BranchToDiv #8, LSYM(Lthumb1_div8) + BranchToDiv #12, LSYM(Lthumb1_div12) + BranchToDiv #16, LSYM(Lthumb1_div16) +LSYM(Lthumb1_div_large_positive): + mov result, #0xff + lsl divisor, divisor, #8 + rev result, result + lsr curbit, dividend, #16 + cmp curbit, divisor + blo 1f + asr result, #8 + lsl divisor, divisor, #8 + beq LSYM(Ldivbyzero_waypoint) + +1: lsr curbit, dividend, #12 + cmp curbit, divisor + blo LSYM(Lthumb1_div12) + b LSYM(Lthumb1_div16) +LSYM(Lthumb1_div_loop): + lsr divisor, divisor, #8 +LSYM(Lthumb1_div16): + Dodiv #15 + Dodiv #14 + Dodiv #13 + Dodiv #12 +LSYM(Lthumb1_div12): + Dodiv #11 + Dodiv #10 + Dodiv #9 + Dodiv #8 + bcs LSYM(Lthumb1_div_loop) +LSYM(Lthumb1_div8): + Dodiv #7 + Dodiv #6 + Dodiv #5 +LSYM(Lthumb1_div5): + Dodiv #4 +LSYM(Lthumb1_div4): + Dodiv #3 +LSYM(Lthumb1_div3): + Dodiv #2 +LSYM(Lthumb1_div2): + Dodiv #1 +LSYM(Lthumb1_div1): + sub divisor, dividend, divisor + bcs 1f + cpy divisor, dividend + +1: adc result, result + cpy dividend, result + RET + +LSYM(Ldivbyzero_waypoint): + b LSYM(Ldiv0) +.endm + +/* The body of division with negative divisor. Similar to + THUMB1_Div_Positive, except that the shift steps are in multiples + of six bits. */ +.macro THUMB1_Div_Negative + lsr result, divisor, #31 + beq 1f + neg divisor, divisor + +1: asr curbit, dividend, #32 + bcc 2f + neg dividend, dividend + +2: eor curbit, result + mov result, #0 + cpy ip, curbit + BranchToDiv #4, LSYM(Lthumb1_div_negative4) + BranchToDiv #8, LSYM(Lthumb1_div_negative8) +LSYM(Lthumb1_div_large): + mov result, #0xfc + lsl divisor, divisor, #6 + rev result, result + lsr curbit, dividend, #8 + cmp curbit, divisor + blo LSYM(Lthumb1_div_negative8) + + lsl divisor, divisor, #6 + asr result, result, #6 + cmp curbit, divisor + blo LSYM(Lthumb1_div_negative8) + + lsl divisor, divisor, #6 + asr result, result, #6 + cmp curbit, divisor + blo LSYM(Lthumb1_div_negative8) + + lsl divisor, divisor, #6 + beq LSYM(Ldivbyzero_negative) + asr result, result, #6 + b LSYM(Lthumb1_div_negative8) +LSYM(Lthumb1_div_negative_loop): + lsr divisor, divisor, #6 +LSYM(Lthumb1_div_negative8): + DoDiv #7 + DoDiv #6 + DoDiv #5 + DoDiv #4 +LSYM(Lthumb1_div_negative4): + DoDiv #3 + DoDiv #2 + bcs LSYM(Lthumb1_div_negative_loop) + DoDiv #1 + sub divisor, dividend, divisor + bcs 1f + cpy divisor, dividend + +1: cpy curbit, ip + adc result, result + asr curbit, curbit, #1 + cpy dividend, result + bcc 2f + neg dividend, dividend + cmp curbit, #0 + +2: bpl 3f + neg divisor, divisor + +3: RET + +LSYM(Ldivbyzero_negative): + cpy curbit, ip + asr curbit, curbit, #1 + bcc LSYM(Ldiv0) + neg dividend, dividend +.endm +#endif /* ARM Thumb version. */ + /* ------------------------------------------------------------------------ */ /* Start of the Real Functions */ /* ------------------------------------------------------------------------ */ @@ -960,6 +1102,7 @@ LSYM(Lgot_result): FUNC_START udivsi3 FUNC_ALIAS aeabi_uidiv udivsi3 +#if defined(__OPTIMIZE_SIZE__) cmp divisor, #0 beq LSYM(Ldiv0) @@ -977,6 +1120,14 @@ LSYM(udivsi3_skip_div0_test): pop { work } RET +/* Implementation of aeabi_uidiv for ARMv6m. This version is only + used in ARMv6-M when we need an efficient implementation. 
*/ +#else +LSYM(udivsi3_skip_div0_test): + THUMB1_Div_Positive + +#endif /* __OPTIMIZE_SIZE__ */ + #elif defined(__ARM_ARCH_EXT_IDIV__) ARM_FUNC_START udivsi3 @@ -1028,12 +1179,21 @@ LSYM(udivsi3_skip_div0_test): FUNC_START aeabi_uidivmod cmp r1, #0 beq LSYM(Ldiv0) +# if defined(__OPTIMIZE_SIZE__) push {r0, r1, lr} bl LSYM(udivsi3_skip_div0_test) POP {r1, r2, r3} mul r2, r0 sub r1, r1, r2 bx r3 +# else + /* Both the quotient and remainder are calculated simultaneously + in THUMB1_Div_Positive. There is no need to calculate the + remainder again here. */ + b LSYM(udivsi3_skip_div0_test) + RET +# endif /* __OPTIMIZE_SIZE__ */ + #elif defined(__ARM_ARCH_EXT_IDIV__) ARM_FUNC_START aeabi_uidivmod cmp r1, #0 @@ -1089,7 +1249,7 @@ LSYM(Lover10): RET #else /* ARM version. */ - + FUNC_START umodsi3 subs r2, r1, #1 @ compare divisor with 1 @@ -1114,8 +1274,9 @@ LSYM(Lover10): #if defined(__prefer_thumb__) - FUNC_START divsi3 + FUNC_START divsi3 FUNC_ALIAS aeabi_idiv divsi3 +#if defined(__OPTIMIZE_SIZE__) cmp divisor, #0 beq LSYM(Ldiv0) @@ -1138,7 +1299,7 @@ LSYM(Lover11): blo LSYM(Lgot_result) THUMB_DIV_MOD_BODY 0 - + mov r0, result mov work, ip cmp work, #0 @@ -1148,6 +1309,22 @@ LSYM(Lover12): pop { work } RET +/* Implementation of aeabi_idiv for ARMv6m. This version is only + used in ARMv6-M when we need an efficient implementation. */ +#else +LSYM(divsi3_skip_div0_test): + cpy curbit, dividend + orr curbit, divisor + bmi LSYM(Lthumb1_div_negative) + +LSYM(Lthumb1_div_positive): + THUMB1_Div_Positive + +LSYM(Lthumb1_div_negative): + THUMB1_Div_Negative + +#endif /* __OPTIMIZE_SIZE__ */ + #elif defined(__ARM_ARCH_EXT_IDIV__) ARM_FUNC_START divsi3 @@ -1159,8 +1336,8 @@ LSYM(Lover12): RET #else /* ARM/Thumb-2 version. */ - - ARM_FUNC_START divsi3 + + ARM_FUNC_START divsi3 ARM_FUNC_ALIAS aeabi_idiv divsi3 cmp r1, #0 @@ -1214,12 +1391,21 @@ LSYM(divsi3_skip_div0_test): FUNC_START aeabi_idivmod cmp r1, #0 beq LSYM(Ldiv0) +# if defined(__OPTIMIZE_SIZE__) push {r0, r1, lr} bl LSYM(divsi3_skip_div0_test) POP {r1, r2, r3} mul r2, r0 sub r1, r1, r2 bx r3 +# else + /* Both the quotient and remainder are calculated simultaneously + in THUMB1_Div_Positive and THUMB1_Div_Negative. There is no + need to calculate the remainder again here. */ + b LSYM(divsi3_skip_div0_test) + RET +# endif /* __OPTIMIZE_SIZE__ */ + #elif defined(__ARM_ARCH_EXT_IDIV__) ARM_FUNC_START aeabi_idivmod cmp r1, #0 -- 1.9.1
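
A note for readers following the algorithm: THUMB1_Div_Positive is a classic restoring (shift-and-subtract) divider. BranchToDiv probes for a starting bit position, and the unrolled DoDiv steps then subtract the shifted divisor whenever it still fits, shifting the resulting carry bit into the quotient. The following is a minimal C model of the same idea, without the probing and the unrolling; udivmod_model is an illustrative name, not a libgcc symbol, and this sketch is not part of the patch.

#include <stdio.h>

/* Editor's sketch of the shift-and-subtract scheme behind
   THUMB1_Div_Positive.  Each iteration mirrors one DoDiv n step: if
   the divisor shifted left by n bits still fits into what is left of
   the dividend, subtract it and set bit n of the quotient.  Division
   by zero is dispatched to __aeabi_idiv0 separately in the real code
   and is not modelled here.  */
static unsigned
udivmod_model (unsigned dividend, unsigned divisor, unsigned *remainder)
{
  unsigned result = 0;
  int n;

  for (n = 31; n >= 0; n--)
    /* The first test guards against bits being shifted out the top.  */
    if ((divisor << n) >> n == divisor && dividend >= (divisor << n))
      {
        dividend -= divisor << n;
        result |= 1u << n;
      }

  *remainder = dividend;  /* whatever is left of the dividend */
  return result;          /* the quotient */
}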
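
The "calculated simultaneously" comments lean on the ARM RTABI convention that __aeabi_uidivmod returns the quotient in r0 and the remainder in r1: THUMB1_Div_Positive already finishes with the quotient in r0 and the leftover dividend in r1, so the fast-path wrapper reduces to a branch, while the __OPTIMIZE_SIZE__ path reconstructs the remainder with the "mul r2, r0; sub r1, r1, r2" pair. Continuing the editor's sketch above, the two strategies in C terms:

int
main (void)
{
  unsigned n = 1000u, d = 7u, rem;
  unsigned quot = udivmod_model (n, d, &rem);

  /* The size-optimised wrapper's fix-up: multiply the quotient back
     and subtract, instead of reading the remainder directly.  */
  unsigned rem_fixup = n - (n / d) * d;

  printf ("q=%u r=%u fixup=%u\n", quot, rem, rem_fixup);
  return 0;                       /* prints q=142 r=6 fixup=6 */
}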