From patchwork Tue Oct 13 17:01:43 2015
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Andre Vieira (lists)" <Andre.SimoesDiasVieira@arm.com>
X-Patchwork-Id: 529865
Return-Path: 
 <gcc-patches-return-410061-incoming=patchwork.ozlabs.org@gcc.gnu.org>
X-Original-To: incoming@patchwork.ozlabs.org
Delivered-To: patchwork-incoming@bilbo.ozlabs.org
Received: from sourceware.org (server1.sourceware.org [209.132.180.131])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256
	bits)) (No client certificate requested)
	by ozlabs.org (Postfix) with ESMTPS id 03DA4140187
	for <incoming@patchwork.ozlabs.org>;
	Wed, 14 Oct 2015 04:02:03 +1100 (AEDT)
Authentication-Results: ozlabs.org; dkim=pass (1024-bit key;
	unprotected) header.d=gcc.gnu.org header.i=@gcc.gnu.org
	header.b=QVJ8Cxip; dkim-atps=neutral
DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id
	:list-unsubscribe:list-archive:list-post:list-help:sender:to
	:from:subject:message-id:date:mime-version:content-type; q=dns;
	s=default; b=PGU440Mm6qjczDuiQIXkahXT7QlRnsvRKeUgMoYyLvhzUelVHy
	4Ii3kiKsGacj+dAxI90oVphaxM0rfja9vPrFxiGRkVTWjjuU/3m3Fe0YxXhl73lh
	B199w32dT3uLxoBLHefgQ/rfQiyo/hw8CTcehzJ5QwFc+TRHiv/9gRQzQ=
DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id
	:list-unsubscribe:list-archive:list-post:list-help:sender:to
	:from:subject:message-id:date:mime-version:content-type; s=
	default; bh=Fha4HrCoEfpPkQwo5YFY2b7xYBY=; b=QVJ8Cxipuo4v5FiFA2DN
	rJrgRdlO7+LMkg4cww+hwcccw8eWqKqDpm0bJNBJALvwHaqzB08Owvq2B5wRqpRi
	N8dljMsK2/IAeWAcD+/BhcOSRlWaL9zhFmVoDoR/zGcKvIaOSuHTZFpIIp3Mi1/d
	UQLWAsYdGZTBheQQS9nOAFg=
Received: (qmail 89583 invoked by alias); 13 Oct 2015 17:01:56 -0000
Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-patches.gcc.gnu.org>
List-Unsubscribe: 
 <mailto:gcc-patches-unsubscribe-incoming=patchwork.ozlabs.org@gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-patches/>
List-Post: <mailto:gcc-patches@gcc.gnu.org>
List-Help: <mailto:gcc-patches-help@gcc.gnu.org>
Sender: gcc-patches-owner@gcc.gnu.org
Delivered-To: mailing list gcc-patches@gcc.gnu.org
Received: (qmail 89568 invoked by uid 89); 13 Oct 2015 17:01:55 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=-1.1 required=5.0 tests=AWL, BAYES_00,
	KAM_ASCII_DIVIDERS, SPF_PASS autolearn=no version=3.3.2
X-HELO: eu-smtp-delivery-143.mimecast.com
Received: from eu-smtp-delivery-143.mimecast.com (HELO
	eu-smtp-delivery-143.mimecast.com) (146.101.78.143) by
	sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP;
	Tue, 13 Oct 2015 17:01:48 +0000
Received: from cam-owa2.Emea.Arm.com (fw-tnat.cambridge.arm.com
	[217.140.96.140]) by eu-smtp-1.mimecast.com with ESMTP id
	uk-mta-15-nlSJ7uXgThubEj1jVW8Qlw-1; Tue, 13 Oct 2015 18:01:43 +0100
Received: from [10.2.207.14] ([10.1.2.79]) by cam-owa2.Emea.Arm.com with
	Microsoft SMTPSVC(6.0.3790.3959); Tue, 13 Oct 2015 18:01:43 +0100
To: GCC Patches <gcc-patches@gcc.gnu.org>
From: Andre Vieira <Andre.SimoesDiasVieira@arm.com>
Subject: [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
Message-ID: <561D38F7.809@arm.com>
Date: Tue, 13 Oct 2015 18:01:43 +0100
User-Agent: Mozilla/5.0 (X11; Linux x86_64;
	rv:38.0) Gecko/20100101 Thunderbird/38.2.0
MIME-Version: 1.0
X-MC-Unique: nlSJ7uXgThubEj1jVW8Qlw-1
X-IsSubscribed: yes

This patch ports the aeabi_idiv routine from Linaro Cortex-Strings 
(https://git.linaro.org/toolchain/cortex-strings.git), which was 
contributed by ARM under Free BSD license.

The new aeabi_idiv routine is used to replace the one in 
libgcc/config/arm/lib1funcs.S. This replacement happens within the 
Thumb1 wrapper. The new routine is under LGPLv3 license.

The main advantage of this version is that it can improve the 
performance of the aeabi_idiv function for Thumb1. This solution will 
also increase the code size. So it will only be used if 
__OPTIMIZE_SIZE__ is not defined.

Make check passed for armv6-m.

libgcc/ChangeLog:
2015-08-10  Hale Wang  <hale.wang@arm.com>
             Andre Vieira  <andre.simoesdiasvieira@arm.com>

   * config/arm/lib1funcs.S: Add new wrapper.

From 832a3d6af6f06399f70b5a4ac3727d55960c93b7 Mon Sep 17 00:00:00 2001
From: Andre Simoes Dias Vieira <andsim01@arm.com>
Date: Fri, 21 Aug 2015 14:23:28 +0100
Subject: [PATCH] new wrapper idivmod

---
 libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 217 insertions(+), 33 deletions(-)

diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 252efcbd5385cc58a5ce1e48c6816d36a6f4c797..c9e544114590da8cde88382bea0f67206e593816 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -306,34 +306,12 @@ LSYM(Lend_fde):
 #ifdef __ARM_EABI__
 .macro THUMB_LDIV0 name signed
 #if defined(__ARM_ARCH_6M__)
-	.ifc \signed, unsigned
-	cmp	r0, #0
-	beq	1f
-	mov	r0, #0
-	mvn	r0, r0		@ 0xffffffff
-1:
-	.else
-	cmp	r0, #0
-	beq	2f
-	blt	3f
+
+	push	{r0, lr}
 	mov	r0, #0
-	mvn	r0, r0
-	lsr	r0, r0, #1	@ 0x7fffffff
-	b	2f
-3:	mov	r0, #0x80
-	lsl	r0, r0, #24	@ 0x80000000
-2:
-	.endif
-	push	{r0, r1, r2}
-	ldr	r0, 4f
-	adr	r1, 4f
-	add	r0, r1
-	str	r0, [sp, #8]
-	@ We know we are not on armv4t, so pop pc is safe.
-	pop	{r0, r1, pc}
-	.align	2
-4:
-	.word	__aeabi_idiv0 - 4b
+	bl	SYM(__aeabi_idiv0)
+	pop	{r1, pc}
+
 #elif defined(__thumb2__)
 	.syntax unified
 	.ifc \signed, unsigned
@@ -945,7 +923,170 @@ LSYM(Lover7):
 	add	dividend, work
   .endif
 LSYM(Lgot_result):
-.endm	
+.endm
+
+#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
+/* If performance is preferred, the following functions are provided.  */
+
+/* Branch to div(n), and jump to label if curbit is lo than divisior.  */
+.macro BranchToDiv n, label
+	lsr	curbit, dividend, \n
+	cmp	curbit, divisor
+	blo	\label
+.endm
+
+/* Body of div(n).  Shift the divisor in n bits and compare the divisor
+   and dividend.  Update the dividend as the substruction result.  */
+.macro DoDiv n
+	lsr	curbit, dividend, \n
+	cmp	curbit, divisor
+	bcc	1f
+	lsl	curbit, divisor, \n
+	sub	dividend, dividend, curbit
+
+1:	adc	result, result
+.endm
+
+/* The body of division with positive divisor.  Unless the divisor is very
+   big, shift it up in multiples of four bits, since this is the amount of
+   unwinding in the main division loop.  Continue shifting until the divisor
+   is larger than the dividend.  */
+.macro THUMB1_Div_Positive
+	mov	result, #0
+	BranchToDiv #1, LSYM(Lthumb1_div1)
+	BranchToDiv #4, LSYM(Lthumb1_div4)
+	BranchToDiv #8, LSYM(Lthumb1_div8)
+	BranchToDiv #12, LSYM(Lthumb1_div12)
+	BranchToDiv #16, LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_large_positive):
+	mov	result, #0xff
+	lsl	divisor, divisor, #8
+	rev	result, result
+	lsr	curbit, dividend, #16
+	cmp	curbit, divisor
+	blo	1f
+	asr	result, #8
+	lsl	divisor, divisor, #8
+	beq	LSYM(Ldivbyzero_waypoint)
+
+1:	lsr	curbit, dividend, #12
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div12)
+	b	LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_loop):
+	lsr	divisor, divisor, #8
+LSYM(Lthumb1_div16):
+	Dodiv	#15
+	Dodiv	#14
+	Dodiv	#13
+	Dodiv	#12
+LSYM(Lthumb1_div12):
+	Dodiv	#11
+	Dodiv	#10
+	Dodiv	#9
+	Dodiv	#8
+	bcs	LSYM(Lthumb1_div_loop)
+LSYM(Lthumb1_div8):
+	Dodiv	#7
+	Dodiv	#6
+	Dodiv	#5
+LSYM(Lthumb1_div5):
+	Dodiv	#4
+LSYM(Lthumb1_div4):
+	Dodiv	#3
+LSYM(Lthumb1_div3):
+	Dodiv	#2
+LSYM(Lthumb1_div2):
+	Dodiv	#1
+LSYM(Lthumb1_div1):
+	sub	divisor, dividend, divisor
+	bcs	1f
+	cpy	divisor, dividend
+
+1:	adc	result, result
+	cpy	dividend, result
+	RET
+
+LSYM(Ldivbyzero_waypoint):
+	b	LSYM(Ldiv0)
+.endm
+
+/* The body of division with negative divisor.  Similar with
+   THUMB1_Div_Positive except that the shift steps are in multiples
+   of six bits.  */
+.macro THUMB1_Div_Negative
+	lsr	result, divisor, #31
+	beq	1f
+	neg	divisor, divisor
+
+1:	asr	curbit, dividend, #32
+	bcc	2f
+	neg	dividend, dividend
+
+2:	eor	curbit, result
+	mov	result, #0
+	cpy	ip, curbit
+	BranchToDiv #4, LSYM(Lthumb1_div_negative4)
+	BranchToDiv #8, LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_large):
+	mov	result, #0xfc
+	lsl	divisor, divisor, #6
+	rev	result, result
+	lsr	curbit, dividend, #8
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	asr	result, result, #6
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	asr	result, result, #6
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	beq	LSYM(Ldivbyzero_negative)
+	asr	result, result, #6
+	b	LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_negative_loop):
+	lsr	divisor, divisor, #6
+LSYM(Lthumb1_div_negative8):
+	DoDiv	#7
+	DoDiv	#6
+	DoDiv	#5
+	DoDiv	#4
+LSYM(Lthumb1_div_negative4):
+	DoDiv	#3
+	DoDiv	#2
+	bcs	LSYM(Lthumb1_div_negative_loop)
+	DoDiv	#1
+	sub	divisor, dividend, divisor
+	bcs	1f
+	cpy	divisor, dividend
+
+1:	cpy	curbit, ip
+	adc	result, result
+	asr	curbit, curbit, #1
+	cpy	dividend, result
+	bcc	2f
+	neg	dividend, dividend
+	cmp	curbit, #0
+
+2:	bpl	3f
+	neg	divisor, divisor
+
+3:	RET
+
+LSYM(Ldivbyzero_negative):
+	cpy	curbit, ip
+	asr	curbit, curbit, #1
+	bcc	LSYM(Ldiv0)
+	neg	dividend, dividend
+.endm
+#endif /* ARM Thumb version.  */
+
 /* ------------------------------------------------------------------------ */
 /*		Start of the Real Functions				    */
 /* ------------------------------------------------------------------------ */
@@ -955,6 +1096,7 @@ LSYM(Lgot_result):
 
 	FUNC_START udivsi3
 	FUNC_ALIAS aeabi_uidiv udivsi3
+#if defined(__OPTIMIZE_SIZE__)
 
 	cmp	divisor, #0
 	beq	LSYM(Ldiv0)
@@ -972,6 +1114,14 @@ LSYM(udivsi3_skip_div0_test):
 	pop	{ work }
 	RET
 
+#else
+	/* Implementation of aeabi_uidiv for ARMv6m.  This version is only
+	   used in ARMv6-M when we need an efficient implementation.  */
+LSYM(udivsi3_skip_div0_test):
+	THUMB1_Div_Positive
+
+#endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 
 	ARM_FUNC_START udivsi3
@@ -1023,12 +1173,21 @@ LSYM(udivsi3_skip_div0_test):
 FUNC_START aeabi_uidivmod
 	cmp	r1, #0
 	beq	LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
 	push	{r0, r1, lr}
 	bl	LSYM(udivsi3_skip_div0_test)
 	POP	{r1, r2, r3}
 	mul	r2, r0
 	sub	r1, r1, r2
 	bx	r3
+# else
+	/* Both the quotient and remainder are calculated simultaneously
+	   in THUMB1_Div_Positive.  There is no need to calculate the
+	   remainder again here.  */
+	b	LSYM(udivsi3_skip_div0_test)
+	RET
+# endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 ARM_FUNC_START aeabi_uidivmod
 	cmp	r1, #0
@@ -1084,7 +1243,7 @@ LSYM(Lover10):
 	RET
 	
 #else  /* ARM version.  */
-	
+
 	FUNC_START umodsi3
 
 	subs	r2, r1, #1			@ compare divisor with 1
@@ -1109,8 +1268,9 @@ LSYM(Lover10):
 
 #if defined(__prefer_thumb__)
 
-	FUNC_START divsi3	
+	FUNC_START divsi3
 	FUNC_ALIAS aeabi_idiv divsi3
+#if defined(__OPTIMIZE_SIZE__)
 
 	cmp	divisor, #0
 	beq	LSYM(Ldiv0)
@@ -1133,7 +1293,7 @@ LSYM(Lover11):
 	blo	LSYM(Lgot_result)
 
 	THUMB_DIV_MOD_BODY 0
-	
+
 	mov	r0, result
 	mov	work, ip
 	cmp	work, #0
@@ -1142,6 +1302,21 @@ LSYM(Lover11):
 LSYM(Lover12):
 	pop	{ work }
 	RET
+#else
+	/* Implementation of aeabi_idiv for ARMv6m.  This version is only
+	   used in ARMv6-M when we need an efficient implementation.  */
+LSYM(divsi3_skip_div0_test):
+	cpy	curbit, dividend
+	orr	curbit, divisor
+	bmi	LSYM(Lthumb1_div_negative)
+
+LSYM(Lthumb1_div_positive):
+	THUMB1_Div_Positive
+
+LSYM(Lthumb1_div_negative):
+	THUMB1_Div_Negative
+
+#endif /* __OPTIMIZE_SIZE__ */
 
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 
@@ -1154,8 +1329,8 @@ LSYM(Lover12):
 	RET
 
 #else /* ARM/Thumb-2 version.  */
-	
-	ARM_FUNC_START divsi3	
+
+	ARM_FUNC_START divsi3
 	ARM_FUNC_ALIAS aeabi_idiv divsi3
 
 	cmp	r1, #0
@@ -1209,12 +1384,21 @@ LSYM(divsi3_skip_div0_test):
 FUNC_START aeabi_idivmod
 	cmp	r1, #0
 	beq	LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
 	push	{r0, r1, lr}
 	bl	LSYM(divsi3_skip_div0_test)
 	POP	{r1, r2, r3}
 	mul	r2, r0
 	sub	r1, r1, r2
 	bx	r3
+# else
+	/* Both the quotient and remainder are calculated simultaneously
+	   in THUMB1_Div_Positive and THUMB1_Div_Negative.  There is no
+	   need to calculate the remainder again here.  */
+	b	LSYM(divsi3_skip_div0_test)
+	RET
+# endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 ARM_FUNC_START aeabi_idivmod
 	cmp 	r1, #0
-- 
1.9.1