
[v2] powerpc/math-emu: Update macros from gmp-6.1.2

Message ID 20181114024412.19368-1-joel@jms.id.au (mailing list archive)
State Changes Requested
Series [v2] powerpc/math-emu: Update macros from gmp-6.1.2

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success next/apply_patch Successfully applied
snowpatch_ozlabs/build-ppc64le success build succeeded & removed 0 sparse warning(s)
snowpatch_ozlabs/build-ppc64be success build succeeded & removed 0 sparse warning(s)
snowpatch_ozlabs/build-ppc64e success build succeeded & removed 0 sparse warning(s)
snowpatch_ozlabs/build-pmac32 success build succeeded & removed 0 sparse warning(s)
snowpatch_ozlabs/checkpatch warning total: 0 errors, 30 warnings, 17 checks, 154 lines checked

Commit Message

Joel Stanley Nov. 14, 2018, 2:44 a.m. UTC
The add_ssaaaa, sub_ddmmss, umul_ppmm and udiv_qrnnd macros originate
from GMP's longlong.h.
This was found when compiling with clang:

   arch/powerpc/math-emu/fnmsub.c:46:2: error: invalid use of a cast in a
   inline asm context requiring an l-value: remove the cast or build with
   -fheinous-gnu-extensions
           FP_ADD_D(R, T, B);
           ^~~~~~~~~~~~~~~~~
   ...

   ./arch/powerpc/include/asm/sfp-machine.h:283:27: note: expanded from
   macro 'sub_ddmmss'
                  : "=r" ((USItype)(sh)),                                  \
                          ~~~~~~~~~~^~~

Segher points out: this was fixed in GCC over 16 years ago
( https://gcc.gnu.org/r56600 ), and in GMP (where it comes from)
presumably before that.
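
For illustration, a minimal sketch of the pattern clang rejects and the
form it accepts (example added for clarity, not code from the patch; the
function names are placeholders):

    typedef unsigned int USItype;

    static inline USItype carry_old_style(USItype a)
    {
            USItype sh = a;
            /* Old style: the output operand is a cast expression, which
             * is not an lvalue, so clang errors out unless
             * -fheinous-gnu-extensions is passed. */
            __asm__ ("addze %0,%1" : "=r" ((USItype)(sh)) : "r" (a));
            return sh;
    }

    static inline USItype carry_new_style(USItype a)
    {
            USItype sh = a;
            /* New style: the output operand is a plain lvalue, accepted
             * by both GCC and clang. */
            __asm__ ("addze %0,%1" : "=r" (sh) : "r" (a));
            return sh;
    }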

Update to the latest version in order to get rid of the invalid casts.

The only functional change I noticed was this in udiv_qrnnd.

    __r1 = (n1) % __d1;
    __q1 = (n1) / __d1;

Becomes this:

    __q1 = (n1) / __d1;
    __r1 = (n1) - __q1 * __d1;

This is equivalent: instead of computing the remainder with the modulo
operator, it subtracts the 'whole' multiples of __d1 (as counted by the
quotient __q1) from n1, which leaves the same remainder.
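
Added for illustration only (not part of the commit), a quick standalone
check that the two forms compute the same remainder for unsigned operands:

    #include <assert.h>

    int main(void)
    {
            unsigned int n, d;

            for (n = 0; n < 1024; n++)
                    for (d = 1; d < 1024; d++) {
                            unsigned int q = n / d;

                            /* n % d equals n - q * d */
                            assert(n % d == n - q * d);
                    }
            return 0;
    }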

Link: https://github.com/ClangBuiltLinux/linux/issues/260
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
v1:
 https://lore.kernel.org/linuxppc-dev/20181102033713.31916-1-joel@jms.id.au/
v2:
 Instead of setting the -fheinous-gnu-extensions for clang, fix the code.

 arch/powerpc/include/asm/sfp-machine.h | 99 +++++++++-----------------
 1 file changed, 32 insertions(+), 67 deletions(-)

Comments

Nick Desaulniers Nov. 19, 2018, 6:53 p.m. UTC | #1
On Tue, Nov 13, 2018 at 6:44 PM Joel Stanley <joel@jms.id.au> wrote:
>
> The add_ssaaaa, sub_ddmmss, umul_ppmm and udiv_qrnnd macros originate
> from GMP's longlong.h.
> This was found when compiling with clang:
>
>    arch/powerpc/math-emu/fnmsub.c:46:2: error: invalid use of a cast in a
>    inline asm context requiring an l-value: remove the cast or build with
>    -fheinous-gnu-extensions
>            FP_ADD_D(R, T, B);
>            ^~~~~~~~~~~~~~~~~
>    ...
>
>    ./arch/powerpc/include/asm/sfp-machine.h:283:27: note: expanded from
>    macro 'sub_ddmmss'
>                   : "=r" ((USItype)(sh)),                                  \
>                           ~~~~~~~~~~^~~
>
> Segher points out: this was fixed in GCC over 16 years ago
> ( https://gcc.gnu.org/r56600 ), and in GMP (where it comes from)
> presumably before that.
>
> Update to the latest version in order to get rid of the invalid casts.

Hi Joel, thanks for this patch.  It's a much better fix IMO than v1.
One nit below.

>
> The only functional change I noticed was this in udiv_qrnnd.
>
>     __r1 = (n1) % __d1;
>     __q1 = (n1) / __d1;
>
> Becomes this:
>
>     __q1 = (n1) / __d1;
>     __r1 = (n1) - __q1 * __d1;
>
> This is equivalent: instead of computing the remainder with the modulo
> operator, it subtracts the 'whole' multiples of __d1 (as counted by the
> quotient __q1) from n1, which leaves the same remainder.

I don't understand this; why was this functional change made?

>
> Link: https://github.com/ClangBuiltLinux/linux/issues/260
> Signed-off-by: Joel Stanley <joel@jms.id.au>
> ---
> v1:
>  https://lore.kernel.org/linuxppc-dev/20181102033713.31916-1-joel@jms.id.au/
> v2:
>  Instead of setting the -fheinous-gnu-extensions for clang, fix the code.
>
>  arch/powerpc/include/asm/sfp-machine.h | 99 +++++++++-----------------
>  1 file changed, 32 insertions(+), 67 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/sfp-machine.h b/arch/powerpc/include/asm/sfp-machine.h
> index d89beaba26ff..a6353d9dd5ba 100644
> --- a/arch/powerpc/include/asm/sfp-machine.h
> +++ b/arch/powerpc/include/asm/sfp-machine.h
> @@ -213,30 +213,18 @@
>   * respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
>   * (i.e. carry out) is not stored anywhere, and is lost.
>   */
> -#define add_ssaaaa(sh, sl, ah, al, bh, bl)                             \
> +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
>    do {                                                                 \
>      if (__builtin_constant_p (bh) && (bh) == 0)                                \
> -      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"          \
> -            : "=r" ((USItype)(sh)),                                    \
> -              "=&r" ((USItype)(sl))                                    \
> -            : "%r" ((USItype)(ah)),                                    \
> -              "%r" ((USItype)(al)),                                    \
> -              "rI" ((USItype)(bl)));                                   \
> -    else if (__builtin_constant_p (bh) && (bh) ==~(USItype) 0)         \
> -      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"          \
> -            : "=r" ((USItype)(sh)),                                    \
> -              "=&r" ((USItype)(sl))                                    \
> -            : "%r" ((USItype)(ah)),                                    \
> -              "%r" ((USItype)(al)),                                    \
> -              "rI" ((USItype)(bl)));                                   \
> +      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"                       \
> +            : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
> +    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)                \
> +      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"                       \
> +            : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
>      else                                                               \
> -      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"         \
> -            : "=r" ((USItype)(sh)),                                    \
> -              "=&r" ((USItype)(sl))                                    \
> -            : "%r" ((USItype)(ah)),                                    \
> -              "r" ((USItype)(bh)),                                     \
> -              "%r" ((USItype)(al)),                                    \
> -              "rI" ((USItype)(bl)));                                   \
> +      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"                     \
> +            : "=r" (sh), "=&r" (sl)                                    \
> +            : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
>    } while (0)
>
>  /* sub_ddmmss is used in op-2.h and udivmodti4.c and should be equivalent to
> @@ -248,44 +236,24 @@
>   * and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
>   * and is lost.
>   */
> -#define sub_ddmmss(sh, sl, ah, al, bh, bl)                             \
> +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
>    do {                                                                 \
>      if (__builtin_constant_p (ah) && (ah) == 0)                                \
> -      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"      \
> -              : "=r" ((USItype)(sh)),                                  \
> -                "=&r" ((USItype)(sl))                                  \
> -              : "r" ((USItype)(bh)),                                   \
> -                "rI" ((USItype)(al)),                                  \
> -                "r" ((USItype)(bl)));                                  \
> -    else if (__builtin_constant_p (ah) && (ah) ==~(USItype) 0)         \
> -      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"      \
> -              : "=r" ((USItype)(sh)),                                  \
> -                "=&r" ((USItype)(sl))                                  \
> -              : "r" ((USItype)(bh)),                                   \
> -                "rI" ((USItype)(al)),                                  \
> -                "r" ((USItype)(bl)));                                  \
> +      __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"                     \
> +              : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
> +    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)                \
> +      __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"                     \
> +              : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
>      else if (__builtin_constant_p (bh) && (bh) == 0)                   \
> -      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"                \
> -              : "=r" ((USItype)(sh)),                                  \
> -                "=&r" ((USItype)(sl))                                  \
> -              : "r" ((USItype)(ah)),                                   \
> -                "rI" ((USItype)(al)),                                  \
> -                "r" ((USItype)(bl)));                                  \
> -    else if (__builtin_constant_p (bh) && (bh) ==~(USItype) 0)         \
> -      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"                \
> -              : "=r" ((USItype)(sh)),                                  \
> -                "=&r" ((USItype)(sl))                                  \
> -              : "r" ((USItype)(ah)),                                   \
> -                "rI" ((USItype)(al)),                                  \
> -                "r" ((USItype)(bl)));                                  \
> +      __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"                      \
> +              : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
> +    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)                \
> +      __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"                      \
> +              : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
>      else                                                               \
> -      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"     \
> -              : "=r" ((USItype)(sh)),                                  \
> -                "=&r" ((USItype)(sl))                                  \
> -              : "r" ((USItype)(ah)),                                   \
> -                "r" ((USItype)(bh)),                                   \
> -                "rI" ((USItype)(al)),                                  \
> -                "r" ((USItype)(bl)));                                  \
> +      __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"                   \
> +              : "=r" (sh), "=&r" (sl)                                  \
> +              : "r" (ah), "r" (bh), "rI" (al), "r" (bl));              \
>    } while (0)
>
>  /* asm fragments for mul and div */
> @@ -294,13 +262,10 @@
>   * UWtype integers MULTIPLER and MULTIPLICAND, and generates a two UWtype
>   * word product in HIGH_PROD and LOW_PROD.
>   */
> -#define umul_ppmm(ph, pl, m0, m1)                                      \
> +#define umul_ppmm(ph, pl, m0, m1) \
>    do {                                                                 \
>      USItype __m0 = (m0), __m1 = (m1);                                  \
> -    __asm__ ("mulhwu %0,%1,%2"                                         \
> -            : "=r" ((USItype)(ph))                                     \
> -            : "%r" (__m0),                                             \
> -               "r" (__m1));                                            \
> +    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));     \
>      (pl) = __m0 * __m1;                                                        \
>    } while (0)
>
> @@ -312,28 +277,28 @@
>   * significant bit of DENOMINATOR must be 1, then the pre-processor symbol
>   * UDIV_NEEDS_NORMALIZATION is defined to 1.
>   */
> -#define udiv_qrnnd(q, r, n1, n0, d)                                    \
> +#define udiv_qrnnd(q, r, n1, n0, d) \
>    do {                                                                 \
>      UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;                    \
>      __d1 = __ll_highpart (d);                                          \
>      __d0 = __ll_lowpart (d);                                           \
>                                                                         \
> -    __r1 = (n1) % __d1;                                                        \
>      __q1 = (n1) / __d1;                                                        \
> -    __m = (UWtype) __q1 * __d0;                                                \
> +    __r1 = (n1) - __q1 * __d1;                                         \
> +    __m = __q1 * __d0;                                                 \
>      __r1 = __r1 * __ll_B | __ll_highpart (n0);                         \
>      if (__r1 < __m)                                                    \
>        {                                                                        \
>         __q1--, __r1 += (d);                                            \
> -       if (__r1 >= (d)) /* we didn't get carry when adding to __r1 */  \
> +       if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
>           if (__r1 < __m)                                               \
>             __q1--, __r1 += (d);                                        \
>        }                                                                        \
>      __r1 -= __m;                                                       \
>                                                                         \
> -    __r0 = __r1 % __d1;                                                        \
>      __q0 = __r1 / __d1;                                                        \
> -    __m = (UWtype) __q0 * __d0;                                                \
> +    __r0 = __r1  - __q0 * __d1;                                                \
> +    __m = __q0 * __d0;                                                 \
>      __r0 = __r0 * __ll_B | __ll_lowpart (n0);                          \
>      if (__r0 < __m)                                                    \
>        {                                                                        \
> @@ -344,7 +309,7 @@
>        }                                                                        \
>      __r0 -= __m;                                                       \
>                                                                         \
> -    (q) = (UWtype) __q1 * __ll_B | __q0;                               \
> +    (q) = __q1 * __ll_B | __q0;                                                \
>      (r) = __r0;                                                                \
>    } while (0)


This appears to now differ from the upstream source:
https://github.com/gcc-mirror/gcc/blob/f7289f563a5e447ac21a5901ed75aad0dbd37732/include/longlong.h#L1679
If we're going to borrow implementations from GCC, let's borrow the
same implementation. Otherwise it's hard to have confidence in this
part of the patch.
Joel Stanley Nov. 19, 2018, 11:15 p.m. UTC | #2
On Tue, 20 Nov 2018 at 05:24, Nick Desaulniers <ndesaulniers@google.com> wrote:
> >
> > The only functional change I noticed was this in udiv_qrnnd.
> >
> >     __r1 = (n1) % __d1;
> >     __q1 = (n1) / __d1;
> >
> > Becomes this:
> >
> >     __q1 = (n1) / __d1;
> >     __r1 = (n1) - __q1 * __d1;
> >
> > This is equivalent: instead of computing the remainder with the modulo
> > operator, it subtracts the 'whole' multiples of __d1 (as counted by the
> > quotient __q1) from n1, which leaves the same remainder.
>
> I don't understand this; why was this functional change made?

I couldn't see why; it pre-dates GMP's mercurial history.
> > -    (q) = (UWtype) __q1 * __ll_B | __q0;                               \
> > +    (q) = __q1 * __ll_B | __q0;                                                \
> >      (r) = __r0;                                                                \
> >    } while (0)
>
> This appears to now differ from the upstream source:
> https://github.com/gcc-mirror/gcc/blob/f7289f563a5e447ac21a5901ed75aad0dbd37732/include/longlong.h#L1679

AIUI the upstream source is GMP:

 https://gmplib.org/repo/gmp/file/tip/longlong.h

> If we're going to borrow implementations from GCC, let's borrow the
> same implementation. Otherwise it's hard to have confidence in this
> part of the patch.

I agree we should use the upstream source.

Segher, which tree contains the One True Upstream for longlong.h?

Cheers,

Joel
Segher Boessenkool Nov. 20, 2018, 6:20 p.m. UTC | #3
On Tue, Nov 20, 2018 at 09:45:33AM +1030, Joel Stanley wrote:
> On Tue, 20 Nov 2018 at 05:24, Nick Desaulniers <ndesaulniers@google.com> wrote:
> > >
> > > The only functional change I noticed was this in udiv_qrnnd.
> > >
> > >     __r1 = (n1) % __d1;
> > >     __q1 = (n1) / __d1;
> > >
> > > Becomes this:
> > >
> > >     __q1 = (n1) / __d1;
> > >     __r1 = (n1) - __q1 * __d1;
> > >
> > > This is equivalent: instead of computing the remainder with the modulo
> > > operator, it subtracts the 'whole' multiples of __d1 (as counted by the
> > > quotient __q1) from n1, which leaves the same remainder.
> >
> > I don't understand this; why was this functional change made?
> 
> I couldn't see why; it pre-dates GMP's mercurial history.

Perhaps it is to make it more likely only a single divide machine
instruction results.  Maybe for -O0, maybe for older GCC, maybe for GCC
targets that do not get this right.  It of course means exactly the same.
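
A sketch of that reading (illustration added for clarity, not from the
thread; the helper names are made up):

    /* Old form: two division operators sharing the same operands.  A
     * compiler that does not combine them (say, at -O0) may emit two
     * divide instructions. */
    unsigned int rem_two_divs(unsigned int n1, unsigned int d1, unsigned int *q1)
    {
            *q1 = n1 / d1;
            return n1 % d1;
    }

    /* New form: a single division; the remainder is recovered with a
     * multiply and a subtract, so only one divide can result. */
    unsigned int rem_one_div(unsigned int n1, unsigned int d1, unsigned int *q1)
    {
            *q1 = n1 / d1;
            return n1 - *q1 * d1;
    }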

> AIUI the upstream source is GMP:
> 
>  https://gmplib.org/repo/gmp/file/tip/longlong.h
> 
> > If we're going to borrow implementations from GCC, let's borrow the
> > same implementation. Otherwise it's hard to have confidence in this
> > part of the patch.
> 
> I agree we should use the upstream source.
> 
> Segher, which tree contains the One True Upstream for longlong.h?

You should probably get your updates from the same place as was used to
get the file in the first place.  Or, don't worry so much about it, how
often is this updated?  Once in ten years, or twenty?

If you do not want to include bigger (maybe irrelevant) changes from
upstream, you have already forked, effectively.


Segher
Nick Desaulniers Nov. 20, 2018, 6:45 p.m. UTC | #4
On Tue, Nov 20, 2018 at 10:20 AM Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> On Tue, Nov 20, 2018 at 09:45:33AM +1030, Joel Stanley wrote:
> > On Tue, 20 Nov 2018 at 05:24, Nick Desaulniers <ndesaulniers@google.com> wrote:
> > > >
> > > > The only functional change I noticed was this in udiv_qrnnd.
> > AIUI the upstream source is GMP:
> >
> >  https://gmplib.org/repo/gmp/file/tip/longlong.h
> >
> > > If we're going to borrow implementations from GCC, let's borrow the
> > > same implementation. Otherwise it's hard to have confidence in this
> > > part of the patch.
> >
> > I agree we should use the upstream source.
> >
> > Segher, which tree contains the One True Upstream for longlong.h?
>
> You should probably get your updates from the same place as was used to
> get the file in the first place.

So sounds like GMP is the source to take then.  Joel, would you mind
sending a v3 with just the commit message updated to reflect the fact
that GCC and GMP differ in implementations, and this is taken from
GMP, maybe with some links?  I think that will provide more context
for future travelers.  I'd be happy to add my reviewed-by tag to that.
Segher Boessenkool Nov. 20, 2018, 8:15 p.m. UTC | #5
On Tue, Nov 20, 2018 at 10:45:19AM -0800, Nick Desaulniers wrote:
> On Tue, Nov 20, 2018 at 10:20 AM Segher Boessenkool
> <segher@kernel.crashing.org> wrote:
> > > Segher, which tree contains the One True Upstream for longlong.h?
> >
> > You should probably get your updates from the same place as was used to
> > get the file in the first place.
> 
> So sounds like GMP is the source to take then.

I'm pretty sure the kernel got it from GCC.  How much does the kernel
one differ from current GCC?

FWIW, the GCC version still uses this "abhorrent GCC extension" (or what
were those words) for most targets, just not for powerpc.


Segher
Nick Desaulniers Nov. 20, 2018, 9:28 p.m. UTC | #6
On Tue, Nov 20, 2018 at 12:15 PM Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> On Tue, Nov 20, 2018 at 10:45:19AM -0800, Nick Desaulniers wrote:
> > On Tue, Nov 20, 2018 at 10:20 AM Segher Boessenkool
> > <segher@kernel.crashing.org> wrote:
> > > > Segher, which tree contains the One True Upstream for longlong.h?
> > >
> > > You should probably get your updates from the same place as was used to
> > > get the file in the first place.
> >
> > So sounds like GMP is the source to take then.
>
> I'm pretty sure the kernel got it from GCC.  How much does the kernel
> one differ from current GCC?

Compare udiv_qrnnd implementations from:
gcc: https://github.com/gcc-mirror/gcc/blob/f7289f563a5e447ac21a5901ed75aad0dbd37732/include/longlong.h#L1679
gmp: https://gmplib.org/repo/gmp/file/tip/longlong.h#l2063

The kernel is currently using the GCC impl, which Joel's patch here
changes to the GMP impl.

If you prefer the kernel to keep the GCC derivative,
then Joel can drop the change to udiv_qrnnd() as it does not contain
inline assembly relevant to the original issue.

Patch

diff --git a/arch/powerpc/include/asm/sfp-machine.h b/arch/powerpc/include/asm/sfp-machine.h
index d89beaba26ff..a6353d9dd5ba 100644
--- a/arch/powerpc/include/asm/sfp-machine.h
+++ b/arch/powerpc/include/asm/sfp-machine.h
@@ -213,30 +213,18 @@ 
  * respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
  * (i.e. carry out) is not stored anywhere, and is lost.
  */
-#define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   do {									\
     if (__builtin_constant_p (bh) && (bh) == 0)				\
-      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"		\
-	     : "=r" ((USItype)(sh)),					\
-	       "=&r" ((USItype)(sl))					\
-	     : "%r" ((USItype)(ah)),					\
-	       "%r" ((USItype)(al)),					\
-	       "rI" ((USItype)(bl)));					\
-    else if (__builtin_constant_p (bh) && (bh) ==~(USItype) 0)		\
-      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"		\
-	     : "=r" ((USItype)(sh)),					\
-	       "=&r" ((USItype)(sl))					\
-	     : "%r" ((USItype)(ah)),					\
-	       "%r" ((USItype)(al)),					\
-	       "rI" ((USItype)(bl)));					\
+      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
+	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));	\
+    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
+      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
+	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));	\
     else								\
-      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"		\
-	     : "=r" ((USItype)(sh)),					\
-	       "=&r" ((USItype)(sl))					\
-	     : "%r" ((USItype)(ah)),					\
-	       "r" ((USItype)(bh)),					\
-	       "%r" ((USItype)(al)),					\
-	       "rI" ((USItype)(bl)));					\
+      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
+	     : "=r" (sh), "=&r" (sl)					\
+	     : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
   } while (0)
 
 /* sub_ddmmss is used in op-2.h and udivmodti4.c and should be equivalent to
@@ -248,44 +236,24 @@ 
  * and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
  * and is lost.
  */
-#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   do {									\
     if (__builtin_constant_p (ah) && (ah) == 0)				\
-      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"	\
-	       : "=r" ((USItype)(sh)),					\
-		 "=&r" ((USItype)(sl))					\
-	       : "r" ((USItype)(bh)),					\
-		 "rI" ((USItype)(al)),					\
-		 "r" ((USItype)(bl)));					\
-    else if (__builtin_constant_p (ah) && (ah) ==~(USItype) 0)		\
-      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"	\
-	       : "=r" ((USItype)(sh)),					\
-		 "=&r" ((USItype)(sl))					\
-	       : "r" ((USItype)(bh)),					\
-		 "rI" ((USItype)(al)),					\
-		 "r" ((USItype)(bl)));					\
+      __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
+	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
+      __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
+	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
     else if (__builtin_constant_p (bh) && (bh) == 0)			\
-      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"		\
-	       : "=r" ((USItype)(sh)),					\
-		 "=&r" ((USItype)(sl))					\
-	       : "r" ((USItype)(ah)),					\
-		 "rI" ((USItype)(al)),					\
-		 "r" ((USItype)(bl)));					\
-    else if (__builtin_constant_p (bh) && (bh) ==~(USItype) 0)		\
-      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"		\
-	       : "=r" ((USItype)(sh)),					\
-		 "=&r" ((USItype)(sl))					\
-	       : "r" ((USItype)(ah)),					\
-		 "rI" ((USItype)(al)),					\
-		 "r" ((USItype)(bl)));					\
+      __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
+	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
+      __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
+	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
     else								\
-      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"	\
-	       : "=r" ((USItype)(sh)),					\
-		 "=&r" ((USItype)(sl))					\
-	       : "r" ((USItype)(ah)),					\
-		 "r" ((USItype)(bh)),					\
-		 "rI" ((USItype)(al)),					\
-		 "r" ((USItype)(bl)));					\
+      __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"			\
+	       : "=r" (sh), "=&r" (sl)					\
+	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
   } while (0)
 
 /* asm fragments for mul and div */
@@ -294,13 +262,10 @@ 
  * UWtype integers MULTIPLER and MULTIPLICAND, and generates a two UWtype
  * word product in HIGH_PROD and LOW_PROD.
  */
-#define umul_ppmm(ph, pl, m0, m1)					\
+#define umul_ppmm(ph, pl, m0, m1) \
   do {									\
     USItype __m0 = (m0), __m1 = (m1);					\
-    __asm__ ("mulhwu %0,%1,%2"						\
-	     : "=r" ((USItype)(ph))					\
-	     : "%r" (__m0),						\
-               "r" (__m1));						\
+    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
     (pl) = __m0 * __m1;							\
   } while (0)
 
@@ -312,28 +277,28 @@ 
  * significant bit of DENOMINATOR must be 1, then the pre-processor symbol
  * UDIV_NEEDS_NORMALIZATION is defined to 1.
  */
-#define udiv_qrnnd(q, r, n1, n0, d)					\
+#define udiv_qrnnd(q, r, n1, n0, d) \
   do {									\
     UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;			\
     __d1 = __ll_highpart (d);						\
     __d0 = __ll_lowpart (d);						\
 									\
-    __r1 = (n1) % __d1;							\
     __q1 = (n1) / __d1;							\
-    __m = (UWtype) __q1 * __d0;						\
+    __r1 = (n1) - __q1 * __d1;						\
+    __m = __q1 * __d0;							\
     __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
     if (__r1 < __m)							\
       {									\
 	__q1--, __r1 += (d);						\
-	if (__r1 >= (d)) /* we didn't get carry when adding to __r1 */	\
+	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
 	  if (__r1 < __m)						\
 	    __q1--, __r1 += (d);					\
       }									\
     __r1 -= __m;							\
 									\
-    __r0 = __r1 % __d1;							\
     __q0 = __r1 / __d1;							\
-    __m = (UWtype) __q0 * __d0;						\
+    __r0 = __r1  - __q0 * __d1;						\
+    __m = __q0 * __d0;							\
     __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
     if (__r0 < __m)							\
       {									\
@@ -344,7 +309,7 @@ 
       }									\
     __r0 -= __m;							\
 									\
-    (q) = (UWtype) __q1 * __ll_B | __q0;				\
+    (q) = __q1 * __ll_B | __q0;						\
     (r) = __r0;								\
   } while (0)