From b5b129e698b9e7446907d3da1fbce0236b09b67c Mon Sep 17 00:00:00 2001
From: Andre Simoes Dias Vieira <andsim01@arm.com>
Date: Mon, 16 May 2016 18:34:52 +0100
Subject: [PATCH 1/2] integer division
---
libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
1 file changed, 218 insertions(+), 32 deletions(-)
@@ -306,34 +306,13 @@ LSYM(Lend_fde):
#ifdef __ARM_EABI__
.macro THUMB_LDIV0 name signed
#if defined(__ARM_ARCH_6M__)
- .ifc \signed, unsigned
- cmp r0, #0
- beq 1f
- mov r0, #0
- mvn r0, r0 @ 0xffffffff
-1:
- .else
- cmp r0, #0
- beq 2f
- blt 3f
+
+ push {r0, lr}
mov r0, #0
- mvn r0, r0
- lsr r0, r0, #1 @ 0x7fffffff
- b 2f
-3: mov r0, #0x80
- lsl r0, r0, #24 @ 0x80000000
-2:
- .endif
- push {r0, r1, r2}
- ldr r0, 4f
- adr r1, 4f
- add r0, r1
- str r0, [sp, #8]
+ bl SYM(__aeabi_idiv0)
@ We know we are not on armv4t, so pop pc is safe.
- pop {r0, r1, pc}
- .align 2
-4:
- .word __aeabi_idiv0 - 4b
+ pop {r1, pc}
+
#elif defined(__thumb2__)
.syntax unified
.ifc \signed, unsigned
@@ -945,7 +924,170 @@ LSYM(Lover7):
add dividend, work
.endif
LSYM(Lgot_result):
-.endm
+.endm
+
+/* If performance is preferred, the following functions are provided. */
+#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
+
+/* Branch to div(n), and jump to label if curbit is lo than divisior. */
+.macro BranchToDiv n, label
+ lsr curbit, dividend, \n
+ cmp curbit, divisor
+ blo \label
+.endm
+
+/* Body of div(n). Shift the divisor in n bits and compare the divisor
+ and dividend. Update the dividend as the substruction result. */
+.macro DoDiv n
+ lsr curbit, dividend, \n
+ cmp curbit, divisor
+ bcc 1f
+ lsl curbit, divisor, \n
+ sub dividend, dividend, curbit
+
+1: adc result, result
+.endm
+
+/* The body of division with positive divisor. Unless the divisor is very
+ big, shift it up in multiples of four bits, since this is the amount of
+ unwinding in the main division loop. Continue shifting until the divisor
+ is larger than the dividend. */
+.macro THUMB1_Div_Positive
+ mov result, #0
+ BranchToDiv #1, LSYM(Lthumb1_div1)
+ BranchToDiv #4, LSYM(Lthumb1_div4)
+ BranchToDiv #8, LSYM(Lthumb1_div8)
+ BranchToDiv #12, LSYM(Lthumb1_div12)
+ BranchToDiv #16, LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_large_positive):
+ mov result, #0xff
+ lsl divisor, divisor, #8
+ rev result, result
+ lsr curbit, dividend, #16
+ cmp curbit, divisor
+ blo 1f
+ asr result, #8
+ lsl divisor, divisor, #8
+ beq LSYM(Ldivbyzero_waypoint)
+
+1: lsr curbit, dividend, #12
+ cmp curbit, divisor
+ blo LSYM(Lthumb1_div12)
+ b LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_loop):
+ lsr divisor, divisor, #8
+LSYM(Lthumb1_div16):
+ Dodiv #15
+ Dodiv #14
+ Dodiv #13
+ Dodiv #12
+LSYM(Lthumb1_div12):
+ Dodiv #11
+ Dodiv #10
+ Dodiv #9
+ Dodiv #8
+ bcs LSYM(Lthumb1_div_loop)
+LSYM(Lthumb1_div8):
+ Dodiv #7
+ Dodiv #6
+ Dodiv #5
+LSYM(Lthumb1_div5):
+ Dodiv #4
+LSYM(Lthumb1_div4):
+ Dodiv #3
+LSYM(Lthumb1_div3):
+ Dodiv #2
+LSYM(Lthumb1_div2):
+ Dodiv #1
+LSYM(Lthumb1_div1):
+ sub divisor, dividend, divisor
+ bcs 1f
+ cpy divisor, dividend
+
+1: adc result, result
+ cpy dividend, result
+ RET
+
+LSYM(Ldivbyzero_waypoint):
+ b LSYM(Ldiv0)
+.endm
+
+/* The body of division with negative divisor. Similar with
+ THUMB1_Div_Positive except that the shift steps are in multiples
+ of six bits. */
+.macro THUMB1_Div_Negative
+ lsr result, divisor, #31
+ beq 1f
+ neg divisor, divisor
+
+1: asr curbit, dividend, #32
+ bcc 2f
+ neg dividend, dividend
+
+2: eor curbit, result
+ mov result, #0
+ cpy ip, curbit
+ BranchToDiv #4, LSYM(Lthumb1_div_negative4)
+ BranchToDiv #8, LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_large):
+ mov result, #0xfc
+ lsl divisor, divisor, #6
+ rev result, result
+ lsr curbit, dividend, #8
+ cmp curbit, divisor
+ blo LSYM(Lthumb1_div_negative8)
+
+ lsl divisor, divisor, #6
+ asr result, result, #6
+ cmp curbit, divisor
+ blo LSYM(Lthumb1_div_negative8)
+
+ lsl divisor, divisor, #6
+ asr result, result, #6
+ cmp curbit, divisor
+ blo LSYM(Lthumb1_div_negative8)
+
+ lsl divisor, divisor, #6
+ beq LSYM(Ldivbyzero_negative)
+ asr result, result, #6
+ b LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_negative_loop):
+ lsr divisor, divisor, #6
+LSYM(Lthumb1_div_negative8):
+ DoDiv #7
+ DoDiv #6
+ DoDiv #5
+ DoDiv #4
+LSYM(Lthumb1_div_negative4):
+ DoDiv #3
+ DoDiv #2
+ bcs LSYM(Lthumb1_div_negative_loop)
+ DoDiv #1
+ sub divisor, dividend, divisor
+ bcs 1f
+ cpy divisor, dividend
+
+1: cpy curbit, ip
+ adc result, result
+ asr curbit, curbit, #1
+ cpy dividend, result
+ bcc 2f
+ neg dividend, dividend
+ cmp curbit, #0
+
+2: bpl 3f
+ neg divisor, divisor
+
+3: RET
+
+LSYM(Ldivbyzero_negative):
+ cpy curbit, ip
+ asr curbit, curbit, #1
+ bcc LSYM(Ldiv0)
+ neg dividend, dividend
+.endm
+#endif /* ARM Thumb version. */
+
/* ------------------------------------------------------------------------ */
/* Start of the Real Functions */
/* ------------------------------------------------------------------------ */
@@ -955,6 +1097,7 @@ LSYM(Lgot_result):
FUNC_START udivsi3
FUNC_ALIAS aeabi_uidiv udivsi3
+#if defined(__OPTIMIZE_SIZE__)
cmp divisor, #0
beq LSYM(Ldiv0)
@@ -972,6 +1115,14 @@ LSYM(udivsi3_skip_div0_test):
pop { work }
RET
+/* Implementation of aeabi_uidiv for ARMv6m. This version is only
+ used in ARMv6-M when we need an efficient implementation. */
+#else
+LSYM(udivsi3_skip_div0_test):
+ THUMB1_Div_Positive
+
+#endif /* __OPTIMIZE_SIZE__ */
+
#elif defined(__ARM_ARCH_EXT_IDIV__)
ARM_FUNC_START udivsi3
@@ -1023,12 +1174,21 @@ LSYM(udivsi3_skip_div0_test):
FUNC_START aeabi_uidivmod
cmp r1, #0
beq LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
push {r0, r1, lr}
bl LSYM(udivsi3_skip_div0_test)
POP {r1, r2, r3}
mul r2, r0
sub r1, r1, r2
bx r3
+# else
+ /* Both the quotient and remainder are calculated simultaneously
+ in THUMB1_Div_Positive. There is no need to calculate the
+ remainder again here. */
+ b LSYM(udivsi3_skip_div0_test)
+ RET
+# endif /* __OPTIMIZE_SIZE__ */
+
#elif defined(__ARM_ARCH_EXT_IDIV__)
ARM_FUNC_START aeabi_uidivmod
cmp r1, #0
@@ -1084,7 +1244,7 @@ LSYM(Lover10):
RET
#else /* ARM version. */
-
+
FUNC_START umodsi3
subs r2, r1, #1 @ compare divisor with 1
@@ -1109,8 +1269,9 @@ LSYM(Lover10):
#if defined(__prefer_thumb__)
- FUNC_START divsi3
+ FUNC_START divsi3
FUNC_ALIAS aeabi_idiv divsi3
+#if defined(__OPTIMIZE_SIZE__)
cmp divisor, #0
beq LSYM(Ldiv0)
@@ -1133,7 +1294,7 @@ LSYM(Lover11):
blo LSYM(Lgot_result)
THUMB_DIV_MOD_BODY 0
-
+
mov r0, result
mov work, ip
cmp work, #0
@@ -1143,6 +1304,22 @@ LSYM(Lover12):
pop { work }
RET
+/* Implementation of aeabi_idiv for ARMv6m. This version is only
+ used in ARMv6-M when we need an efficient implementation. */
+#else
+LSYM(divsi3_skip_div0_test):
+ cpy curbit, dividend
+ orr curbit, divisor
+ bmi LSYM(Lthumb1_div_negative)
+
+LSYM(Lthumb1_div_positive):
+ THUMB1_Div_Positive
+
+LSYM(Lthumb1_div_negative):
+ THUMB1_Div_Negative
+
+#endif /* __OPTIMIZE_SIZE__ */
+
#elif defined(__ARM_ARCH_EXT_IDIV__)
ARM_FUNC_START divsi3
@@ -1154,8 +1331,8 @@ LSYM(Lover12):
RET
#else /* ARM/Thumb-2 version. */
-
- ARM_FUNC_START divsi3
+
+ ARM_FUNC_START divsi3
ARM_FUNC_ALIAS aeabi_idiv divsi3
cmp r1, #0
@@ -1209,12 +1386,21 @@ LSYM(divsi3_skip_div0_test):
FUNC_START aeabi_idivmod
cmp r1, #0
beq LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
push {r0, r1, lr}
bl LSYM(divsi3_skip_div0_test)
POP {r1, r2, r3}
mul r2, r0
sub r1, r1, r2
bx r3
+# else
+ /* Both the quotient and remainder are calculated simultaneously
+ in THUMB1_Div_Positive and THUMB1_Div_Negative. There is no
+ need to calculate the remainder again here. */
+ b LSYM(divsi3_skip_div0_test)
+ RET
+# endif /* __OPTIMIZE_SIZE__ */
+
#elif defined(__ARM_ARCH_EXT_IDIV__)
ARM_FUNC_START aeabi_idivmod
cmp r1, #0
--
1.9.1