diff mbox

[PATCHv2,ARM,libgcc] New aeabi_idiv function for armv6-m

Message ID 5783D2D8.2090707@arm.com
State New
Headers show

Commit Message

Andre Vieira (lists) July 11, 2016, 5:09 p.m. UTC
On 06/07/16 11:52, Andre Vieira (lists) wrote:
> On 01/07/16 14:40, Ramana Radhakrishnan wrote:
>>
>>
>> On 13/10/15 18:01, Andre Vieira wrote:
>>> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings (https://git.linaro.org/toolchain/cortex-strings.git), which was contributed by ARM under Free BSD license.
>>>
>>> The new aeabi_idiv routine is used to replace the one in libgcc/config/arm/lib1funcs.S. This replacement happens within the Thumb1 wrapper. The new routine is under LGPLv3 license.
>>
>> This is not under LGPLv3 . It is under GPLv3 with the runtime library exception license, there's a difference. Assuming your licensing expectation is ok .... read on for more of a review.
>>
>>>
>>> The main advantage of this version is that it can improve the performance of the aeabi_idiv function for Thumb1. This solution will also increase the code size. So it will only be used if __OPTIMIZE_SIZE__ is not defined.
>>>
>>> Make check passed for armv6-m.
>>>
>>> libgcc/ChangeLog:
>>> 2015-08-10  Hale Wang  <hale.wang@arm.com>
>>>             Andre Vieira  <andre.simoesdiasvieira@arm.com>
>>>
>>>   * config/arm/lib1funcs.S: Add new wrapper.
>>>
>>> 0001-integer-division.patch
>>>
>>>
>>> From 832a3d6af6f06399f70b5a4ac3727d55960c93b7 Mon Sep 17 00:00:00 2001
>>> From: Andre Simoes Dias Vieira <andsim01@arm.com>
>>> Date: Fri, 21 Aug 2015 14:23:28 +0100
>>> Subject: [PATCH] new wrapper idivmod
>>>
>>> ---
>>>  libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
>>>  1 file changed, 217 insertions(+), 33 deletions(-)
>>>
>>> diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
>>> index 252efcbd5385cc58a5ce1e48c6816d36a6f4c797..c9e544114590da8cde88382bea0f67206e593816 100644
>>> --- a/libgcc/config/arm/lib1funcs.S
>>> +++ b/libgcc/config/arm/lib1funcs.S
>>> @@ -306,34 +306,12 @@ LSYM(Lend_fde):
>>>  #ifdef __ARM_EABI__
>>>  .macro THUMB_LDIV0 name signed
>>>  #if defined(__ARM_ARCH_6M__)
>>> -	.ifc \signed, unsigned
>>> -	cmp	r0, #0
>>> -	beq	1f
>>> -	mov	r0, #0
>>> -	mvn	r0, r0		@ 0xffffffff
>>> -1:
>>> -	.else
>>> -	cmp	r0, #0
>>> -	beq	2f
>>> -	blt	3f
>>> +
>>> +	push	{r0, lr}
>>>  	mov	r0, #0
>>> -	mvn	r0, r0
>>> -	lsr	r0, r0, #1	@ 0x7fffffff
>>> -	b	2f
>>> -3:	mov	r0, #0x80
>>> -	lsl	r0, r0, #24	@ 0x80000000
>>> -2:
>>> -	.endif
>>> -	push	{r0, r1, r2}
>>> -	ldr	r0, 4f
>>> -	adr	r1, 4f
>>> -	add	r0, r1
>>> -	str	r0, [sp, #8]
>>> -	@ We know we are not on armv4t, so pop pc is safe.
>>> -	pop	{r0, r1, pc}
>>> -	.align	2
>>> -4:
>>> -	.word	__aeabi_idiv0 - 4b
>>> +	bl	SYM(__aeabi_idiv0)
>>> +	pop	{r1, pc}
>>> +
>>
>> I'd still retain the comment about pop pc here because there's often a misconception of merging armv4t and armv6m code.
>>
>>>  #elif defined(__thumb2__)
>>>  	.syntax unified
>>>  	.ifc \signed, unsigned
>>> @@ -945,7 +923,170 @@ LSYM(Lover7):
>>>  	add	dividend, work
>>>    .endif
>>>  LSYM(Lgot_result):
>>> -.endm	
>>> +.endm
>>> +
>>> +#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
>>> +/* If performance is preferred, the following functions are provided.  */
>>> +
>>
>> Comment above #if please and also check elsewhere in patch.
>>
>>> +/* Branch to div(n), and jump to label if curbit is less than divisor.  */
>>> +.macro BranchToDiv n, label
>>> +	lsr	curbit, dividend, \n
>>> +	cmp	curbit, divisor
>>> +	blo	\label
>>> +.endm
>>> +
>>> +/* Body of div(n).  Shift the divisor in n bits and compare the divisor
>>> +   and dividend.  Update the dividend as the subtraction result.  */
>>> +.macro DoDiv n
>>> +	lsr	curbit, dividend, \n
>>> +	cmp	curbit, divisor
>>> +	bcc	1f
>>> +	lsl	curbit, divisor, \n
>>> +	sub	dividend, dividend, curbit
>>> +
>>> +1:	adc	result, result
>>> +.endm
>>> +
>>> +/* The body of division with positive divisor.  Unless the divisor is very
>>> +   big, shift it up in multiples of four bits, since this is the amount of
>>> +   unwinding in the main division loop.  Continue shifting until the divisor
>>> +   is larger than the dividend.  */
>>> +.macro THUMB1_Div_Positive
>>> +	mov	result, #0
>>> +	BranchToDiv #1, LSYM(Lthumb1_div1)
>>> +	BranchToDiv #4, LSYM(Lthumb1_div4)
>>> +	BranchToDiv #8, LSYM(Lthumb1_div8)
>>> +	BranchToDiv #12, LSYM(Lthumb1_div12)
>>> +	BranchToDiv #16, LSYM(Lthumb1_div16)
>>> +LSYM(Lthumb1_div_large_positive):
>>> +	mov	result, #0xff
>>> +	lsl	divisor, divisor, #8
>>> +	rev	result, result
>>> +	lsr	curbit, dividend, #16
>>> +	cmp	curbit, divisor
>>> +	blo	1f
>>> +	asr	result, #8
>>> +	lsl	divisor, divisor, #8
>>> +	beq	LSYM(Ldivbyzero_waypoint)
>>> +
>>> +1:	lsr	curbit, dividend, #12
>>> +	cmp	curbit, divisor
>>> +	blo	LSYM(Lthumb1_div12)
>>> +	b	LSYM(Lthumb1_div16)
>>> +LSYM(Lthumb1_div_loop):
>>> +	lsr	divisor, divisor, #8
>>> +LSYM(Lthumb1_div16):
>>> +	Dodiv	#15
>>> +	Dodiv	#14
>>> +	Dodiv	#13
>>> +	Dodiv	#12
>>> +LSYM(Lthumb1_div12):
>>> +	Dodiv	#11
>>> +	Dodiv	#10
>>> +	Dodiv	#9
>>> +	Dodiv	#8
>>> +	bcs	LSYM(Lthumb1_div_loop)
>>> +LSYM(Lthumb1_div8):
>>> +	Dodiv	#7
>>> +	Dodiv	#6
>>> +	Dodiv	#5
>>> +LSYM(Lthumb1_div5):
>>> +	Dodiv	#4
>>> +LSYM(Lthumb1_div4):
>>> +	Dodiv	#3
>>> +LSYM(Lthumb1_div3):
>>> +	Dodiv	#2
>>> +LSYM(Lthumb1_div2):
>>> +	Dodiv	#1
>>> +LSYM(Lthumb1_div1):
>>> +	sub	divisor, dividend, divisor
>>> +	bcs	1f
>>> +	cpy	divisor, dividend
>>> +
>>> +1:	adc	result, result
>>> +	cpy	dividend, result
>>> +	RET
>>> +
>>> +LSYM(Ldivbyzero_waypoint):
>>> +	b	LSYM(Ldiv0)
>>> +.endm
>>> +
>>> +/* The body of division with negative divisor.  Similar to
>>> +   THUMB1_Div_Positive except that the shift steps are in multiples
>>> +   of six bits.  */
>>> +.macro THUMB1_Div_Negative
>>> +	lsr	result, divisor, #31
>>> +	beq	1f
>>> +	neg	divisor, divisor
>>> +
>>> +1:	asr	curbit, dividend, #32
>>> +	bcc	2f
>>> +	neg	dividend, dividend
>>> +
>>> +2:	eor	curbit, result
>>> +	mov	result, #0
>>> +	cpy	ip, curbit
>>> +	BranchToDiv #4, LSYM(Lthumb1_div_negative4)
>>> +	BranchToDiv #8, LSYM(Lthumb1_div_negative8)
>>> +LSYM(Lthumb1_div_large):
>>> +	mov	result, #0xfc
>>> +	lsl	divisor, divisor, #6
>>> +	rev	result, result
>>> +	lsr	curbit, dividend, #8
>>> +	cmp	curbit, divisor
>>> +	blo	LSYM(Lthumb1_div_negative8)
>>> +
>>> +	lsl	divisor, divisor, #6
>>> +	asr	result, result, #6
>>> +	cmp	curbit, divisor
>>> +	blo	LSYM(Lthumb1_div_negative8)
>>> +
>>> +	lsl	divisor, divisor, #6
>>> +	asr	result, result, #6
>>> +	cmp	curbit, divisor
>>> +	blo	LSYM(Lthumb1_div_negative8)
>>> +
>>> +	lsl	divisor, divisor, #6
>>> +	beq	LSYM(Ldivbyzero_negative)
>>> +	asr	result, result, #6
>>> +	b	LSYM(Lthumb1_div_negative8)
>>> +LSYM(Lthumb1_div_negative_loop):
>>> +	lsr	divisor, divisor, #6
>>> +LSYM(Lthumb1_div_negative8):
>>> +	DoDiv	#7
>>> +	DoDiv	#6
>>> +	DoDiv	#5
>>> +	DoDiv	#4
>>> +LSYM(Lthumb1_div_negative4):
>>> +	DoDiv	#3
>>> +	DoDiv	#2
>>> +	bcs	LSYM(Lthumb1_div_negative_loop)
>>> +	DoDiv	#1
>>> +	sub	divisor, dividend, divisor
>>> +	bcs	1f
>>> +	cpy	divisor, dividend
>>> +
>>> +1:	cpy	curbit, ip
>>> +	adc	result, result
>>> +	asr	curbit, curbit, #1
>>> +	cpy	dividend, result
>>> +	bcc	2f
>>> +	neg	dividend, dividend
>>> +	cmp	curbit, #0
>>> +
>>> +2:	bpl	3f
>>> +	neg	divisor, divisor
>>> +
>>> +3:	RET
>>> +
>>> +LSYM(Ldivbyzero_negative):
>>> +	cpy	curbit, ip
>>> +	asr	curbit, curbit, #1
>>> +	bcc	LSYM(Ldiv0)
>>> +	neg	dividend, dividend
>>> +.endm
>>> +#endif /* ARM Thumb version.  */
>>> +
>>>  /* ------------------------------------------------------------------------ */
>>>  /*		Start of the Real Functions				    */
>>>  /* ------------------------------------------------------------------------ */
>>> @@ -955,6 +1096,7 @@ LSYM(Lgot_result):
>>>  
>>>  	FUNC_START udivsi3
>>>  	FUNC_ALIAS aeabi_uidiv udivsi3
>>> +#if defined(__OPTIMIZE_SIZE__)
>>>  
>>>  	cmp	divisor, #0
>>>  	beq	LSYM(Ldiv0)
>>> @@ -972,6 +1114,14 @@ LSYM(udivsi3_skip_div0_test):
>>>  	pop	{ work }
>>>  	RET
>>>  
>>> +#else
>>> +	/* Implementation of aeabi_uidiv for ARMv6m.  This version is only
>>> +	   used in ARMv6-M when we need an efficient implementation.  */
>>> +LSYM(udivsi3_skip_div0_test):
>>> +	THUMB1_Div_Positive
>>> +
>>> +#endif /* __OPTIMIZE_SIZE__ */
>>> +
>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>  
>>>  	ARM_FUNC_START udivsi3
>>> @@ -1023,12 +1173,21 @@ LSYM(udivsi3_skip_div0_test):
>>>  FUNC_START aeabi_uidivmod
>>>  	cmp	r1, #0
>>>  	beq	LSYM(Ldiv0)
>>> +# if defined(__OPTIMIZE_SIZE__)
>>>  	push	{r0, r1, lr}
>>>  	bl	LSYM(udivsi3_skip_div0_test)
>>>  	POP	{r1, r2, r3}
>>>  	mul	r2, r0
>>>  	sub	r1, r1, r2
>>>  	bx	r3
>>> +# else
>>> +	/* Both the quotient and remainder are calculated simultaneously
>>> +	   in THUMB1_Div_Positive.  There is no need to calculate the
>>> +	   remainder again here.  */
>>> +	b	LSYM(udivsi3_skip_div0_test)
>>> +	RET
>>> +# endif /* __OPTIMIZE_SIZE__ */
>>> +
>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>  ARM_FUNC_START aeabi_uidivmod
>>>  	cmp	r1, #0
>>> @@ -1084,7 +1243,7 @@ LSYM(Lover10):
>>>  	RET
>>>  	
>>>  #else  /* ARM version.  */
>>> -	
>>> +
>>>  	FUNC_START umodsi3
>>>  
>>>  	subs	r2, r1, #1			@ compare divisor with 1
>>> @@ -1109,8 +1268,9 @@ LSYM(Lover10):
>>>  
>>>  #if defined(__prefer_thumb__)
>>>  
>>> -	FUNC_START divsi3	
>>> +	FUNC_START divsi3
>>>  	FUNC_ALIAS aeabi_idiv divsi3
>>> +#if defined(__OPTIMIZE_SIZE__)
>>>  
>>>  	cmp	divisor, #0
>>>  	beq	LSYM(Ldiv0)
>>> @@ -1133,7 +1293,7 @@ LSYM(Lover11):
>>>  	blo	LSYM(Lgot_result)
>>>  
>>>  	THUMB_DIV_MOD_BODY 0
>>> -	
>>> +
>>>  	mov	r0, result
>>>  	mov	work, ip
>>>  	cmp	work, #0
>>> @@ -1142,6 +1302,21 @@ LSYM(Lover11):
>>>  LSYM(Lover12):
>>>  	pop	{ work }
>>>  	RET
>>> +#else
>>> +	/* Implementation of aeabi_idiv for ARMv6m.  This version is only
>>> +	   used in ARMv6-M when we need an efficient implementation.  */
>>> +LSYM(divsi3_skip_div0_test):
>>> +	cpy	curbit, dividend
>>> +	orr	curbit, divisor
>>> +	bmi	LSYM(Lthumb1_div_negative)
>>> +
>>> +LSYM(Lthumb1_div_positive):
>>> +	THUMB1_Div_Positive
>>> +
>>> +LSYM(Lthumb1_div_negative):
>>> +	THUMB1_Div_Negative
>>> +
>>> +#endif /* __OPTIMIZE_SIZE__ */
>>>  
>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>  
>>> @@ -1154,8 +1329,8 @@ LSYM(Lover12):
>>>  	RET
>>>  
>>>  #else /* ARM/Thumb-2 version.  */
>>> -	
>>> -	ARM_FUNC_START divsi3	
>>> +
>>> +	ARM_FUNC_START divsi3
>>>  	ARM_FUNC_ALIAS aeabi_idiv divsi3
>>>  
>>>  	cmp	r1, #0
>>> @@ -1209,12 +1384,21 @@ LSYM(divsi3_skip_div0_test):
>>>  FUNC_START aeabi_idivmod
>>>  	cmp	r1, #0
>>>  	beq	LSYM(Ldiv0)
>>> +# if defined(__OPTIMIZE_SIZE__)
>>>  	push	{r0, r1, lr}
>>>  	bl	LSYM(divsi3_skip_div0_test)
>>>  	POP	{r1, r2, r3}
>>>  	mul	r2, r0
>>>  	sub	r1, r1, r2
>>>  	bx	r3
>>> +# else
>>> +	/* Both the quotient and remainder are calculated simultaneously
>>> +	   in THUMB1_Div_Positive and THUMB1_Div_Negative.  There is no
>>> +	   need to calculate the remainder again here.  */
>>> +	b	LSYM(divsi3_skip_div0_test)
>>> +	RET
>>> +# endif /* __OPTIMIZE_SIZE__ */
>>> +
>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>  ARM_FUNC_START aeabi_idivmod
>>>  	cmp 	r1, #0
>>> -- 1.9.1
>>>
>>
>> Otherwise OK if no regressions and the following request passes.
>>
>> Can you ensure that libgcc for one ARM state and one Thumb2 state non-v6m configuration should give identical binaries with and without your patch, no ? 
>>
>> regards
>> Ramana
>>
> Hi Ramana,
> 
> Thank you for the comments. Sorry about the license, must have been a
> mixup somewhere.
> 
> I put back the 'pop pc is safe' assembly comment and I moved some
> comments before the #if and #else as requested. I left some in place
> because they did not apply to the whole block but simply to the first
> assembly instruction after the #if/else.
> 
> I checked that the assembly generated for libgcc was the same with and
> without the patch for armv7-a in arm mode and armv7-m in thumb mode.
> 
> Is this OK?
> 
> Cheers,
> Andre
> 
> libgcc/ChangeLog:
> 2016-07-06  Hale Wang  <hale.wang@arm.com>
>             Andre Vieira  <andre.simoesdiasvieira@arm.com>
> 
>    * config/arm/lib1funcs.S: Add new wrapper.
> 
I had to rebase the patch due to the ARMv8-M patches. This implied
changing a context line that changed due to the code for ARMv6-M being
reused for ARMv8-M Baseline.

I ran regression tests for both ARMv6-M and ARMv8-M Baseline and
compared the generated libgcc for ARMv7-A in ARM mode and ARMv7-M in
Thumb mode, observing no changes.

Applying patch, as it was previously OK'ed.

Cheers,
Andre

Comments

Andre Vieira (lists) July 12, 2016, 10:41 a.m. UTC | #1
On 11/07/16 18:09, Andre Vieira (lists) wrote:
> On 06/07/16 11:52, Andre Vieira (lists) wrote:
>> On 01/07/16 14:40, Ramana Radhakrishnan wrote:
>>>
>>>
>>> On 13/10/15 18:01, Andre Vieira wrote:
>>>> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings (https://git.linaro.org/toolchain/cortex-strings.git), which was contributed by ARM under Free BSD license.
>>>>
>>>> The new aeabi_idiv routine is used to replace the one in libgcc/config/arm/lib1funcs.S. This replacement happens within the Thumb1 wrapper. The new routine is under LGPLv3 license.
>>>
>>> This is not under LGPLv3 . It is under GPLv3 with the runtime library exception license, there's a difference. Assuming your licensing expectation is ok .... read on for more of a review.
>>>
>>>>
>>>> The main advantage of this version is that it can improve the performance of the aeabi_idiv function for Thumb1. This solution will also increase the code size. So it will only be used if __OPTIMIZE_SIZE__ is not defined.
>>>>
>>>> Make check passed for armv6-m.
>>>>
>>>> libgcc/ChangeLog:
>>>> 2015-08-10  Hale Wang  <hale.wang@arm.com>
>>>>             Andre Vieira  <andre.simoesdiasvieira@arm.com>
>>>>
>>>>   * config/arm/lib1funcs.S: Add new wrapper.
>>>>
>>>> 0001-integer-division.patch
>>>>
>>>>
>>>> From 832a3d6af6f06399f70b5a4ac3727d55960c93b7 Mon Sep 17 00:00:00 2001
>>>> From: Andre Simoes Dias Vieira <andsim01@arm.com>
>>>> Date: Fri, 21 Aug 2015 14:23:28 +0100
>>>> Subject: [PATCH] new wrapper idivmod
>>>>
>>>> ---
>>>>  libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
>>>>  1 file changed, 217 insertions(+), 33 deletions(-)
>>>>
>>>> diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
>>>> index 252efcbd5385cc58a5ce1e48c6816d36a6f4c797..c9e544114590da8cde88382bea0f67206e593816 100644
>>>> --- a/libgcc/config/arm/lib1funcs.S
>>>> +++ b/libgcc/config/arm/lib1funcs.S
>>>> @@ -306,34 +306,12 @@ LSYM(Lend_fde):
>>>>  #ifdef __ARM_EABI__
>>>>  .macro THUMB_LDIV0 name signed
>>>>  #if defined(__ARM_ARCH_6M__)
>>>> -	.ifc \signed, unsigned
>>>> -	cmp	r0, #0
>>>> -	beq	1f
>>>> -	mov	r0, #0
>>>> -	mvn	r0, r0		@ 0xffffffff
>>>> -1:
>>>> -	.else
>>>> -	cmp	r0, #0
>>>> -	beq	2f
>>>> -	blt	3f
>>>> +
>>>> +	push	{r0, lr}
>>>>  	mov	r0, #0
>>>> -	mvn	r0, r0
>>>> -	lsr	r0, r0, #1	@ 0x7fffffff
>>>> -	b	2f
>>>> -3:	mov	r0, #0x80
>>>> -	lsl	r0, r0, #24	@ 0x80000000
>>>> -2:
>>>> -	.endif
>>>> -	push	{r0, r1, r2}
>>>> -	ldr	r0, 4f
>>>> -	adr	r1, 4f
>>>> -	add	r0, r1
>>>> -	str	r0, [sp, #8]
>>>> -	@ We know we are not on armv4t, so pop pc is safe.
>>>> -	pop	{r0, r1, pc}
>>>> -	.align	2
>>>> -4:
>>>> -	.word	__aeabi_idiv0 - 4b
>>>> +	bl	SYM(__aeabi_idiv0)
>>>> +	pop	{r1, pc}
>>>> +
>>>
>>> I'd still retain the comment about pop pc here because there's often a misconception of merging armv4t and armv6m code.
>>>
>>>>  #elif defined(__thumb2__)
>>>>  	.syntax unified
>>>>  	.ifc \signed, unsigned
>>>> @@ -945,7 +923,170 @@ LSYM(Lover7):
>>>>  	add	dividend, work
>>>>    .endif
>>>>  LSYM(Lgot_result):
>>>> -.endm	
>>>> +.endm
>>>> +
>>>> +#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
>>>> +/* If performance is preferred, the following functions are provided.  */
>>>> +
>>>
>>> Comment above #if please and also check elsewhere in patch.
>>>
>>>> +/* Branch to div(n), and jump to label if curbit is less than divisor.  */
>>>> +.macro BranchToDiv n, label
>>>> +	lsr	curbit, dividend, \n
>>>> +	cmp	curbit, divisor
>>>> +	blo	\label
>>>> +.endm
>>>> +
>>>> +/* Body of div(n).  Shift the divisor in n bits and compare the divisor
>>>> +   and dividend.  Update the dividend as the subtraction result.  */
>>>> +.macro DoDiv n
>>>> +	lsr	curbit, dividend, \n
>>>> +	cmp	curbit, divisor
>>>> +	bcc	1f
>>>> +	lsl	curbit, divisor, \n
>>>> +	sub	dividend, dividend, curbit
>>>> +
>>>> +1:	adc	result, result
>>>> +.endm
>>>> +
>>>> +/* The body of division with positive divisor.  Unless the divisor is very
>>>> +   big, shift it up in multiples of four bits, since this is the amount of
>>>> +   unwinding in the main division loop.  Continue shifting until the divisor
>>>> +   is larger than the dividend.  */
>>>> +.macro THUMB1_Div_Positive
>>>> +	mov	result, #0
>>>> +	BranchToDiv #1, LSYM(Lthumb1_div1)
>>>> +	BranchToDiv #4, LSYM(Lthumb1_div4)
>>>> +	BranchToDiv #8, LSYM(Lthumb1_div8)
>>>> +	BranchToDiv #12, LSYM(Lthumb1_div12)
>>>> +	BranchToDiv #16, LSYM(Lthumb1_div16)
>>>> +LSYM(Lthumb1_div_large_positive):
>>>> +	mov	result, #0xff
>>>> +	lsl	divisor, divisor, #8
>>>> +	rev	result, result
>>>> +	lsr	curbit, dividend, #16
>>>> +	cmp	curbit, divisor
>>>> +	blo	1f
>>>> +	asr	result, #8
>>>> +	lsl	divisor, divisor, #8
>>>> +	beq	LSYM(Ldivbyzero_waypoint)
>>>> +
>>>> +1:	lsr	curbit, dividend, #12
>>>> +	cmp	curbit, divisor
>>>> +	blo	LSYM(Lthumb1_div12)
>>>> +	b	LSYM(Lthumb1_div16)
>>>> +LSYM(Lthumb1_div_loop):
>>>> +	lsr	divisor, divisor, #8
>>>> +LSYM(Lthumb1_div16):
>>>> +	Dodiv	#15
>>>> +	Dodiv	#14
>>>> +	Dodiv	#13
>>>> +	Dodiv	#12
>>>> +LSYM(Lthumb1_div12):
>>>> +	Dodiv	#11
>>>> +	Dodiv	#10
>>>> +	Dodiv	#9
>>>> +	Dodiv	#8
>>>> +	bcs	LSYM(Lthumb1_div_loop)
>>>> +LSYM(Lthumb1_div8):
>>>> +	Dodiv	#7
>>>> +	Dodiv	#6
>>>> +	Dodiv	#5
>>>> +LSYM(Lthumb1_div5):
>>>> +	Dodiv	#4
>>>> +LSYM(Lthumb1_div4):
>>>> +	Dodiv	#3
>>>> +LSYM(Lthumb1_div3):
>>>> +	Dodiv	#2
>>>> +LSYM(Lthumb1_div2):
>>>> +	Dodiv	#1
>>>> +LSYM(Lthumb1_div1):
>>>> +	sub	divisor, dividend, divisor
>>>> +	bcs	1f
>>>> +	cpy	divisor, dividend
>>>> +
>>>> +1:	adc	result, result
>>>> +	cpy	dividend, result
>>>> +	RET
>>>> +
>>>> +LSYM(Ldivbyzero_waypoint):
>>>> +	b	LSYM(Ldiv0)
>>>> +.endm
>>>> +
>>>> +/* The body of division with negative divisor.  Similar to
>>>> +   THUMB1_Div_Positive except that the shift steps are in multiples
>>>> +   of six bits.  */
>>>> +.macro THUMB1_Div_Negative
>>>> +	lsr	result, divisor, #31
>>>> +	beq	1f
>>>> +	neg	divisor, divisor
>>>> +
>>>> +1:	asr	curbit, dividend, #32
>>>> +	bcc	2f
>>>> +	neg	dividend, dividend
>>>> +
>>>> +2:	eor	curbit, result
>>>> +	mov	result, #0
>>>> +	cpy	ip, curbit
>>>> +	BranchToDiv #4, LSYM(Lthumb1_div_negative4)
>>>> +	BranchToDiv #8, LSYM(Lthumb1_div_negative8)
>>>> +LSYM(Lthumb1_div_large):
>>>> +	mov	result, #0xfc
>>>> +	lsl	divisor, divisor, #6
>>>> +	rev	result, result
>>>> +	lsr	curbit, dividend, #8
>>>> +	cmp	curbit, divisor
>>>> +	blo	LSYM(Lthumb1_div_negative8)
>>>> +
>>>> +	lsl	divisor, divisor, #6
>>>> +	asr	result, result, #6
>>>> +	cmp	curbit, divisor
>>>> +	blo	LSYM(Lthumb1_div_negative8)
>>>> +
>>>> +	lsl	divisor, divisor, #6
>>>> +	asr	result, result, #6
>>>> +	cmp	curbit, divisor
>>>> +	blo	LSYM(Lthumb1_div_negative8)
>>>> +
>>>> +	lsl	divisor, divisor, #6
>>>> +	beq	LSYM(Ldivbyzero_negative)
>>>> +	asr	result, result, #6
>>>> +	b	LSYM(Lthumb1_div_negative8)
>>>> +LSYM(Lthumb1_div_negative_loop):
>>>> +	lsr	divisor, divisor, #6
>>>> +LSYM(Lthumb1_div_negative8):
>>>> +	DoDiv	#7
>>>> +	DoDiv	#6
>>>> +	DoDiv	#5
>>>> +	DoDiv	#4
>>>> +LSYM(Lthumb1_div_negative4):
>>>> +	DoDiv	#3
>>>> +	DoDiv	#2
>>>> +	bcs	LSYM(Lthumb1_div_negative_loop)
>>>> +	DoDiv	#1
>>>> +	sub	divisor, dividend, divisor
>>>> +	bcs	1f
>>>> +	cpy	divisor, dividend
>>>> +
>>>> +1:	cpy	curbit, ip
>>>> +	adc	result, result
>>>> +	asr	curbit, curbit, #1
>>>> +	cpy	dividend, result
>>>> +	bcc	2f
>>>> +	neg	dividend, dividend
>>>> +	cmp	curbit, #0
>>>> +
>>>> +2:	bpl	3f
>>>> +	neg	divisor, divisor
>>>> +
>>>> +3:	RET
>>>> +
>>>> +LSYM(Ldivbyzero_negative):
>>>> +	cpy	curbit, ip
>>>> +	asr	curbit, curbit, #1
>>>> +	bcc	LSYM(Ldiv0)
>>>> +	neg	dividend, dividend
>>>> +.endm
>>>> +#endif /* ARM Thumb version.  */
>>>> +
>>>>  /* ------------------------------------------------------------------------ */
>>>>  /*		Start of the Real Functions				    */
>>>>  /* ------------------------------------------------------------------------ */
>>>> @@ -955,6 +1096,7 @@ LSYM(Lgot_result):
>>>>  
>>>>  	FUNC_START udivsi3
>>>>  	FUNC_ALIAS aeabi_uidiv udivsi3
>>>> +#if defined(__OPTIMIZE_SIZE__)
>>>>  
>>>>  	cmp	divisor, #0
>>>>  	beq	LSYM(Ldiv0)
>>>> @@ -972,6 +1114,14 @@ LSYM(udivsi3_skip_div0_test):
>>>>  	pop	{ work }
>>>>  	RET
>>>>  
>>>> +#else
>>>> +	/* Implementation of aeabi_uidiv for ARMv6m.  This version is only
>>>> +	   used in ARMv6-M when we need an efficient implementation.  */
>>>> +LSYM(udivsi3_skip_div0_test):
>>>> +	THUMB1_Div_Positive
>>>> +
>>>> +#endif /* __OPTIMIZE_SIZE__ */
>>>> +
>>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>>  
>>>>  	ARM_FUNC_START udivsi3
>>>> @@ -1023,12 +1173,21 @@ LSYM(udivsi3_skip_div0_test):
>>>>  FUNC_START aeabi_uidivmod
>>>>  	cmp	r1, #0
>>>>  	beq	LSYM(Ldiv0)
>>>> +# if defined(__OPTIMIZE_SIZE__)
>>>>  	push	{r0, r1, lr}
>>>>  	bl	LSYM(udivsi3_skip_div0_test)
>>>>  	POP	{r1, r2, r3}
>>>>  	mul	r2, r0
>>>>  	sub	r1, r1, r2
>>>>  	bx	r3
>>>> +# else
>>>> +	/* Both the quotient and remainder are calculated simultaneously
>>>> +	   in THUMB1_Div_Positive.  There is no need to calculate the
>>>> +	   remainder again here.  */
>>>> +	b	LSYM(udivsi3_skip_div0_test)
>>>> +	RET
>>>> +# endif /* __OPTIMIZE_SIZE__ */
>>>> +
>>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>>  ARM_FUNC_START aeabi_uidivmod
>>>>  	cmp	r1, #0
>>>> @@ -1084,7 +1243,7 @@ LSYM(Lover10):
>>>>  	RET
>>>>  	
>>>>  #else  /* ARM version.  */
>>>> -	
>>>> +
>>>>  	FUNC_START umodsi3
>>>>  
>>>>  	subs	r2, r1, #1			@ compare divisor with 1
>>>> @@ -1109,8 +1268,9 @@ LSYM(Lover10):
>>>>  
>>>>  #if defined(__prefer_thumb__)
>>>>  
>>>> -	FUNC_START divsi3	
>>>> +	FUNC_START divsi3
>>>>  	FUNC_ALIAS aeabi_idiv divsi3
>>>> +#if defined(__OPTIMIZE_SIZE__)
>>>>  
>>>>  	cmp	divisor, #0
>>>>  	beq	LSYM(Ldiv0)
>>>> @@ -1133,7 +1293,7 @@ LSYM(Lover11):
>>>>  	blo	LSYM(Lgot_result)
>>>>  
>>>>  	THUMB_DIV_MOD_BODY 0
>>>> -	
>>>> +
>>>>  	mov	r0, result
>>>>  	mov	work, ip
>>>>  	cmp	work, #0
>>>> @@ -1142,6 +1302,21 @@ LSYM(Lover11):
>>>>  LSYM(Lover12):
>>>>  	pop	{ work }
>>>>  	RET
>>>> +#else
>>>> +	/* Implementation of aeabi_idiv for ARMv6m.  This version is only
>>>> +	   used in ARMv6-M when we need an efficient implementation.  */
>>>> +LSYM(divsi3_skip_div0_test):
>>>> +	cpy	curbit, dividend
>>>> +	orr	curbit, divisor
>>>> +	bmi	LSYM(Lthumb1_div_negative)
>>>> +
>>>> +LSYM(Lthumb1_div_positive):
>>>> +	THUMB1_Div_Positive
>>>> +
>>>> +LSYM(Lthumb1_div_negative):
>>>> +	THUMB1_Div_Negative
>>>> +
>>>> +#endif /* __OPTIMIZE_SIZE__ */
>>>>  
>>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>>  
>>>> @@ -1154,8 +1329,8 @@ LSYM(Lover12):
>>>>  	RET
>>>>  
>>>>  #else /* ARM/Thumb-2 version.  */
>>>> -	
>>>> -	ARM_FUNC_START divsi3	
>>>> +
>>>> +	ARM_FUNC_START divsi3
>>>>  	ARM_FUNC_ALIAS aeabi_idiv divsi3
>>>>  
>>>>  	cmp	r1, #0
>>>> @@ -1209,12 +1384,21 @@ LSYM(divsi3_skip_div0_test):
>>>>  FUNC_START aeabi_idivmod
>>>>  	cmp	r1, #0
>>>>  	beq	LSYM(Ldiv0)
>>>> +# if defined(__OPTIMIZE_SIZE__)
>>>>  	push	{r0, r1, lr}
>>>>  	bl	LSYM(divsi3_skip_div0_test)
>>>>  	POP	{r1, r2, r3}
>>>>  	mul	r2, r0
>>>>  	sub	r1, r1, r2
>>>>  	bx	r3
>>>> +# else
>>>> +	/* Both the quotient and remainder are calculated simultaneously
>>>> +	   in THUMB1_Div_Positive and THUMB1_Div_Negative.  There is no
>>>> +	   need to calculate the remainder again here.  */
>>>> +	b	LSYM(divsi3_skip_div0_test)
>>>> +	RET
>>>> +# endif /* __OPTIMIZE_SIZE__ */
>>>> +
>>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>>  ARM_FUNC_START aeabi_idivmod
>>>>  	cmp 	r1, #0
>>>> -- 1.9.1
>>>>
>>>
>>> Otherwise OK if no regressions and the following request passes.
>>>
>>> Can you ensure that libgcc for one ARM state and one Thumb2 state non-v6m configuration should give identical binaries with and without your patch, no ? 
>>>
>>> regards
>>> Ramana
>>>
>> Hi Ramana,
>>
>> Thank you for the comments. Sorry about the license, must have been a
>> mixup somewhere.
>>
>> I put back the 'pop pc is safe' assembly comment and I moved some
>> comments before the #if and #else as requested. I left some in place
>> because they did not apply to the whole block but simply to the first
>> assembly instruction after the #if/else.
>>
>> I checked that the assembly generated for libgcc was the same with and
>> without the patch for armv7-a in arm mode and armv7-m in thumb mode.
>>
>> Is this OK?
>>
>> Cheers,
>> Andre
>>
>> libgcc/ChangeLog:
>> 2016-07-06  Hale Wang  <hale.wang@arm.com>
>>             Andre Vieira  <andre.simoesdiasvieira@arm.com>
>>
>>    * config/arm/lib1funcs.S: Add new wrapper.
>>
> I had to rebase the patch due to the ARMv8-M patches. This implied
> changing a context line that changed due to the code for ARMv6-M being
> reused for ARMv8-M Baseline.
> 
> I ran regression tests for both ARMv6-M and ARMv8-M Baseline and
> compared the generated libgcc for ARMv7-A in ARM mode and ARMv7-M in
> Thumb mode, observing no changes.
> 
> Applying patch, as it was previously OK'ed.
> 
> Cheers,
> Andre
> 
Backported to embedded-6-branch at revision r238240.

Cheers,
Andre
diff mbox

Patch

From ad757144fad2d9608ed840153071bb5d470193ef Mon Sep 17 00:00:00 2001
From: Andre Simoes Dias Vieira <andsim01@arm.com>
Date: Thu, 7 Jul 2016 10:49:19 +0100
Subject: [PATCH] integer division

---
 libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 218 insertions(+), 32 deletions(-)

diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 96e206ee542126c5d68091087446afe9f01aa51f..ba52e7b762f5573445349a574a3878859a992f13 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -311,34 +311,13 @@  LSYM(Lend_fde):
 #ifdef __ARM_EABI__
 .macro THUMB_LDIV0 name signed
 #ifdef NOT_ISA_TARGET_32BIT
-	.ifc \signed, unsigned
-	cmp	r0, #0
-	beq	1f
-	mov	r0, #0
-	mvn	r0, r0		@ 0xffffffff
-1:
-	.else
-	cmp	r0, #0
-	beq	2f
-	blt	3f
+
+	push	{r0, lr}
 	mov	r0, #0
-	mvn	r0, r0
-	lsr	r0, r0, #1	@ 0x7fffffff
-	b	2f
-3:	mov	r0, #0x80
-	lsl	r0, r0, #24	@ 0x80000000
-2:
-	.endif
-	push	{r0, r1, r2}
-	ldr	r0, 4f
-	adr	r1, 4f
-	add	r0, r1
-	str	r0, [sp, #8]
+	bl	SYM(__aeabi_idiv0)
 	@ We know we are not on armv4t, so pop pc is safe.
-	pop	{r0, r1, pc}
-	.align	2
-4:
-	.word	__aeabi_idiv0 - 4b
+	pop	{r1, pc}
+
 #elif defined(__thumb2__)
 	.syntax unified
 	.ifc \signed, unsigned
@@ -950,7 +929,170 @@  LSYM(Lover7):
 	add	dividend, work
   .endif
 LSYM(Lgot_result):
-.endm	
+.endm
+
+/* If performance is preferred, the following functions are provided.  */
+#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
+
+/* Branch to div(n), and jump to label if curbit is less than divisor.  */
+.macro BranchToDiv n, label
+	lsr	curbit, dividend, \n
+	cmp	curbit, divisor
+	blo	\label
+.endm
+
+/* Body of div(n).  Shift the divisor in n bits and compare the divisor
+   and dividend.  Update the dividend as the subtraction result.  */
+.macro DoDiv n
+	lsr	curbit, dividend, \n
+	cmp	curbit, divisor
+	bcc	1f
+	lsl	curbit, divisor, \n
+	sub	dividend, dividend, curbit
+
+1:	adc	result, result
+.endm
+
+/* The body of division with positive divisor.  Unless the divisor is very
+   big, shift it up in multiples of four bits, since this is the amount of
+   unwinding in the main division loop.  Continue shifting until the divisor
+   is larger than the dividend.  */
+.macro THUMB1_Div_Positive
+	mov	result, #0
+	BranchToDiv #1, LSYM(Lthumb1_div1)
+	BranchToDiv #4, LSYM(Lthumb1_div4)
+	BranchToDiv #8, LSYM(Lthumb1_div8)
+	BranchToDiv #12, LSYM(Lthumb1_div12)
+	BranchToDiv #16, LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_large_positive):
+	mov	result, #0xff
+	lsl	divisor, divisor, #8
+	rev	result, result
+	lsr	curbit, dividend, #16
+	cmp	curbit, divisor
+	blo	1f
+	asr	result, #8
+	lsl	divisor, divisor, #8
+	beq	LSYM(Ldivbyzero_waypoint)
+
+1:	lsr	curbit, dividend, #12
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div12)
+	b	LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_loop):
+	lsr	divisor, divisor, #8
+LSYM(Lthumb1_div16):
+	Dodiv	#15
+	Dodiv	#14
+	Dodiv	#13
+	Dodiv	#12
+LSYM(Lthumb1_div12):
+	Dodiv	#11
+	Dodiv	#10
+	Dodiv	#9
+	Dodiv	#8
+	bcs	LSYM(Lthumb1_div_loop)
+LSYM(Lthumb1_div8):
+	Dodiv	#7
+	Dodiv	#6
+	Dodiv	#5
+LSYM(Lthumb1_div5):
+	Dodiv	#4
+LSYM(Lthumb1_div4):
+	Dodiv	#3
+LSYM(Lthumb1_div3):
+	Dodiv	#2
+LSYM(Lthumb1_div2):
+	Dodiv	#1
+LSYM(Lthumb1_div1):
+	sub	divisor, dividend, divisor
+	bcs	1f
+	cpy	divisor, dividend
+
+1:	adc	result, result
+	cpy	dividend, result
+	RET
+
+LSYM(Ldivbyzero_waypoint):
+	b	LSYM(Ldiv0)
+.endm
+
+/* The body of division with negative divisor.  Similar to
+   THUMB1_Div_Positive except that the shift steps are in multiples
+   of six bits.  */
+.macro THUMB1_Div_Negative
+	lsr	result, divisor, #31
+	beq	1f
+	neg	divisor, divisor
+
+1:	asr	curbit, dividend, #32
+	bcc	2f
+	neg	dividend, dividend
+
+2:	eor	curbit, result
+	mov	result, #0
+	cpy	ip, curbit
+	BranchToDiv #4, LSYM(Lthumb1_div_negative4)
+	BranchToDiv #8, LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_large):
+	mov	result, #0xfc
+	lsl	divisor, divisor, #6
+	rev	result, result
+	lsr	curbit, dividend, #8
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	asr	result, result, #6
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	asr	result, result, #6
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	beq	LSYM(Ldivbyzero_negative)
+	asr	result, result, #6
+	b	LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_negative_loop):
+	lsr	divisor, divisor, #6
+LSYM(Lthumb1_div_negative8):
+	DoDiv	#7
+	DoDiv	#6
+	DoDiv	#5
+	DoDiv	#4
+LSYM(Lthumb1_div_negative4):
+	DoDiv	#3
+	DoDiv	#2
+	bcs	LSYM(Lthumb1_div_negative_loop)
+	DoDiv	#1
+	sub	divisor, dividend, divisor
+	bcs	1f
+	cpy	divisor, dividend
+
+1:	cpy	curbit, ip
+	adc	result, result
+	asr	curbit, curbit, #1
+	cpy	dividend, result
+	bcc	2f
+	neg	dividend, dividend
+	cmp	curbit, #0
+
+2:	bpl	3f
+	neg	divisor, divisor
+
+3:	RET
+
+LSYM(Ldivbyzero_negative):
+	cpy	curbit, ip
+	asr	curbit, curbit, #1
+	bcc	LSYM(Ldiv0)
+	neg	dividend, dividend
+.endm
+#endif /* ARM Thumb version.  */
+
 /* ------------------------------------------------------------------------ */
 /*		Start of the Real Functions				    */
 /* ------------------------------------------------------------------------ */
@@ -960,6 +1102,7 @@  LSYM(Lgot_result):
 
 	FUNC_START udivsi3
 	FUNC_ALIAS aeabi_uidiv udivsi3
+#if defined(__OPTIMIZE_SIZE__)
 
 	cmp	divisor, #0
 	beq	LSYM(Ldiv0)
@@ -977,6 +1120,14 @@  LSYM(udivsi3_skip_div0_test):
 	pop	{ work }
 	RET
 
+/* Implementation of aeabi_uidiv for ARMv6m.  This version is only
+   used in ARMv6-M when we need an efficient implementation.  */
+#else
+LSYM(udivsi3_skip_div0_test):
+	THUMB1_Div_Positive
+
+#endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 
 	ARM_FUNC_START udivsi3
@@ -1028,12 +1179,21 @@  LSYM(udivsi3_skip_div0_test):
 FUNC_START aeabi_uidivmod
 	cmp	r1, #0
 	beq	LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
 	push	{r0, r1, lr}
 	bl	LSYM(udivsi3_skip_div0_test)
 	POP	{r1, r2, r3}
 	mul	r2, r0
 	sub	r1, r1, r2
 	bx	r3
+# else
+	/* Both the quotient and remainder are calculated simultaneously
+	   in THUMB1_Div_Positive.  There is no need to calculate the
+	   remainder again here.  */
+	b	LSYM(udivsi3_skip_div0_test)
+	RET
+# endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 ARM_FUNC_START aeabi_uidivmod
 	cmp	r1, #0
@@ -1089,7 +1249,7 @@  LSYM(Lover10):
 	RET
 	
 #else  /* ARM version.  */
-	
+
 	FUNC_START umodsi3
 
 	subs	r2, r1, #1			@ compare divisor with 1
@@ -1114,8 +1274,9 @@  LSYM(Lover10):
 
 #if defined(__prefer_thumb__)
 
-	FUNC_START divsi3	
+	FUNC_START divsi3
 	FUNC_ALIAS aeabi_idiv divsi3
+#if defined(__OPTIMIZE_SIZE__)
 
 	cmp	divisor, #0
 	beq	LSYM(Ldiv0)
@@ -1138,7 +1299,7 @@  LSYM(Lover11):
 	blo	LSYM(Lgot_result)
 
 	THUMB_DIV_MOD_BODY 0
-	
+
 	mov	r0, result
 	mov	work, ip
 	cmp	work, #0
@@ -1148,6 +1309,22 @@  LSYM(Lover12):
 	pop	{ work }
 	RET
 
+/* Implementation of aeabi_idiv for ARMv6m.  This version is only
+   used in ARMv6-M when we need an efficient implementation.  */
+#else
+LSYM(divsi3_skip_div0_test):
+	cpy	curbit, dividend
+	orr	curbit, divisor
+	bmi	LSYM(Lthumb1_div_negative)
+
+LSYM(Lthumb1_div_positive):
+	THUMB1_Div_Positive
+
+LSYM(Lthumb1_div_negative):
+	THUMB1_Div_Negative
+
+#endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 
 	ARM_FUNC_START divsi3
@@ -1159,8 +1336,8 @@  LSYM(Lover12):
 	RET
 
 #else /* ARM/Thumb-2 version.  */
-	
-	ARM_FUNC_START divsi3	
+
+	ARM_FUNC_START divsi3
 	ARM_FUNC_ALIAS aeabi_idiv divsi3
 
 	cmp	r1, #0
@@ -1214,12 +1391,21 @@  LSYM(divsi3_skip_div0_test):
 FUNC_START aeabi_idivmod
 	cmp	r1, #0
 	beq	LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
 	push	{r0, r1, lr}
 	bl	LSYM(divsi3_skip_div0_test)
 	POP	{r1, r2, r3}
 	mul	r2, r0
 	sub	r1, r1, r2
 	bx	r3
+# else
+	/* Both the quotient and remainder are calculated simultaneously
+	   in THUMB1_Div_Positive and THUMB1_Div_Negative.  There is no
+	   need to calculate the remainder again here.  */
+	b	LSYM(divsi3_skip_div0_test)
+	RET
+# endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 ARM_FUNC_START aeabi_idivmod
 	cmp 	r1, #0
-- 
1.9.1