
[V3] powerpc: Implement {cmp}xchg for u8 and u16

Message ID 571782F0.2020201@linux.vnet.ibm.com (mailing list archive)
State Superseded, archived

Commit Message

xinhui April 20, 2016, 1:24 p.m. UTC
From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>

Implement xchg{u8,u16}{local,relaxed}, and
cmpxchg{u8,u16}{,local,acquire,relaxed}.

They work on all ppc variants.
The basic idea comes from commit 3226aad81aa6 ("sh: support 1 and 2 byte xchg").

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
Changes from v2:
	In the do {} while() loop, we save one load and use the corresponding cmpxchg suffix.
	Also add the corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN macro.
Changes from v1:
	Complete rework.
---
 arch/powerpc/include/asm/cmpxchg.h | 83 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)
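
For readers following the review below, here is a minimal, de-macroized sketch of what the patch's __XCHG_GEN macro generates for a byte-sized cmpxchg: the byte is updated by retrying a 32-bit cmpxchg on the aligned word that contains it. The helper name is hypothetical; only the shape mirrors the macro (the xchg flavour is the same loop with the early-out comparison dropped).

/*
 * Editorial sketch, not part of the patch: a de-macroized u8 cmpxchg
 * emulated on top of the existing 32-bit cmpxchg.
 */
static inline u8 cmpxchg_u8_sketch(volatile u8 *ptr, u8 old, u8 new)
{
	int off = (unsigned long)ptr % sizeof(u32);
#ifdef __BIG_ENDIAN
	int bitoff = (sizeof(u32) - sizeof(u8) - off) * BITS_PER_BYTE;
#else
	int bitoff = off * BITS_PER_BYTE;
#endif
	volatile u32 *p = (volatile u32 *)(ptr - off);	/* aligned containing word */
	u32 bitmask = 0xffU << bitoff;			/* bits of the target byte */
	u32 oldv, newv, prev;
	u8 ret;

	oldv = READ_ONCE(*p);
	do {
		ret = (oldv & bitmask) >> bitoff;	/* current value of the byte */
		if (ret != old)				/* comparison failed, give up */
			break;
		newv = (oldv & ~bitmask) | ((u32)new << bitoff);
		prev = oldv;
		oldv = cmpxchg(p, oldv, newv);		/* 32-bit cmpxchg on the word */
	} while (prev != oldv);				/* word changed underneath: retry */
	return ret;
}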

Comments

Peter Zijlstra April 20, 2016, 2:24 p.m. UTC | #1
On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:

> +#define __XCHG_GEN(cmp, type, sfx, skip, v)				\
> +static __always_inline unsigned long					\
> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,		\
> +			 unsigned long new);				\
> +static __always_inline u32						\
> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)		\
> +{									\
> +	int size = sizeof (type);					\
> +	int off = (unsigned long)ptr % sizeof(u32);			\
> +	volatile u32 *p = ptr - off;					\
> +	int bitoff = BITOFF_CAL(size, off);				\
> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
> +	u32 oldv, newv, tmp;						\
> +	u32 ret;							\
> +	oldv = READ_ONCE(*p);						\
> +	do {								\
> +		ret = (oldv & bitmask) >> bitoff;			\
> +		if (skip && ret != old)					\
> +			break;						\
> +		newv = (oldv & ~bitmask) | (new << bitoff);		\
> +		tmp = oldv;						\
> +		oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);	\
> +	} while (tmp != oldv);						\
> +	return ret;							\
> +}

So for an LL/SC based arch using cmpxchg() like that is sub-optimal.

Why did you choose to write it entirely in C?
xinhui April 21, 2016, 3:35 p.m. UTC | #2
On 2016-04-20 22:24, Peter Zijlstra wrote:
> On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
> 
>> +#define __XCHG_GEN(cmp, type, sfx, skip, v)				\
>> +static __always_inline unsigned long					\
>> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,		\
>> +			 unsigned long new);				\
>> +static __always_inline u32						\
>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)		\
>> +{									\
>> +	int size = sizeof (type);					\
>> +	int off = (unsigned long)ptr % sizeof(u32);			\
>> +	volatile u32 *p = ptr - off;					\
>> +	int bitoff = BITOFF_CAL(size, off);				\
>> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
>> +	u32 oldv, newv, tmp;						\
>> +	u32 ret;							\
>> +	oldv = READ_ONCE(*p);						\
>> +	do {								\
>> +		ret = (oldv & bitmask) >> bitoff;			\
>> +		if (skip && ret != old)					\
>> +			break;						\
>> +		newv = (oldv & ~bitmask) | (new << bitoff);		\
>> +		tmp = oldv;						\
>> +		oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);	\
>> +	} while (tmp != oldv);						\
>> +	return ret;							\
>> +}
> 
> So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
> 
> Why did you choose to write it entirely in C?
> 
Yes, you are right; more loads/stores will be done in the C code.
However, such xchg_u8/u16 is only used by qspinlock now, and I did not see any performance regression.
So I just wrote it in C, for simplicity. :)

Of course, I have done xchg tests.
We run code just like xchg((u8*)&v, j++); in several threads,
and the result is:
[  768.374264] use time[1550072]ns in xchg_u8_asm
[  768.377102] use time[2826802]ns in xchg_u8_c

I think this is because there is one more load in the C version.
If possible, we can move such code into asm-generic/.

thanks
xinhui
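
(For reference, a hypothetical reconstruction of the micro-benchmark loop described above; the thread body, iteration count, and names are assumptions, not code taken from this posting.)

#include <linux/atomic.h>
#include <linux/kthread.h>
#include <linux/ktime.h>
#include <linux/printk.h>

static u8 bench_v;	/* shared byte hammered by all benchmark threads */

/* kthread body: several of these run concurrently against bench_v */
static int xchg_u8_bench(void *unused)
{
	u64 start = ktime_get_ns();
	int j;

	for (j = 0; j < 1000000; j++)
		xchg((u8 *)&bench_v, (u8)j);	/* byte-wide exchange under contention */

	pr_info("use time[%llu]ns in xchg_u8_c\n", ktime_get_ns() - start);
	return 0;
}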
Boqun Feng April 21, 2016, 3:52 p.m. UTC | #3
On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
> On 2016-04-20 22:24, Peter Zijlstra wrote:
> > On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
> > 
> >> +#define __XCHG_GEN(cmp, type, sfx, skip, v)				\
> >> +static __always_inline unsigned long					\
> >> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,		\
> >> +			 unsigned long new);				\
> >> +static __always_inline u32						\
> >> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)		\
> >> +{									\
> >> +	int size = sizeof (type);					\
> >> +	int off = (unsigned long)ptr % sizeof(u32);			\
> >> +	volatile u32 *p = ptr - off;					\
> >> +	int bitoff = BITOFF_CAL(size, off);				\
> >> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
> >> +	u32 oldv, newv, tmp;						\
> >> +	u32 ret;							\
> >> +	oldv = READ_ONCE(*p);						\
> >> +	do {								\
> >> +		ret = (oldv & bitmask) >> bitoff;			\
> >> +		if (skip && ret != old)					\
> >> +			break;						\
> >> +		newv = (oldv & ~bitmask) | (new << bitoff);		\
> >> +		tmp = oldv;						\
> >> +		oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);	\
> >> +	} while (tmp != oldv);						\
> >> +	return ret;							\
> >> +}
> > 
> > So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
> > 
> > Why did you choose to write it entirely in C?
> > 
> yes, you are right. more load/store will be done in C code.
> However such xchg_u8/u16 is just used by qspinlock now. and I did not see any performance regression.
> So just wrote in C, for simple. :)
> 
> Of course I have done xchg tests.
> we run code just like xchg((u8*)&v, j++); in several threads.
> and the result is,
> [  768.374264] use time[1550072]ns in xchg_u8_asm

How was xchg_u8_asm() implemented, using lbarx or using a 32bit ll/sc
loop with shifting and masking in it?

Regards,
Boqun

> [  768.377102] use time[2826802]ns in xchg_u8_c
> 
> I think this is because there is one more load in C.
> If possible, we can move such code in asm-generic/.
> 
> thanks
> xinhui
>
Peter Zijlstra April 21, 2016, 4:13 p.m. UTC | #4
On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
> yes, you are right. more load/store will be done in C code.
> However such xchg_u8/u16 is just used by qspinlock now. and I did not see any performance regression.
> So just wrote in C, for simple. :)

Which is fine; but worthy of a note in your Changelog.

> Of course I have done xchg tests.
> we run code just like xchg((u8*)&v, j++); in several threads.
> and the result is,
> [  768.374264] use time[1550072]ns in xchg_u8_asm
> [  768.377102] use time[2826802]ns in xchg_u8_c
> 
> I think this is because there is one more load in C.
> If possible, we can move such code in asm-generic/.

So I'm not actually _that_ familiar with the PPC LL/SC implementation;
but there are things a CPU can do to optimize these loops.

For example, a CPU might choose to not release the exclusive hold of the
line for a number of cycles, except when it passes SC or an interrupt
happens. This way there's a smaller chance the SC fails and inhibits
forward progress.

By doing the modification outside of the LL/SC you lose such
advantages.

And yes, doing a !exclusive load prior to the exclusive load leads to an
even bigger window where the data can get changed out from under you.
xinhui April 22, 2016, 1:59 a.m. UTC | #5
On 2016-04-21 23:52, Boqun Feng wrote:
> On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
>> On 2016-04-20 22:24, Peter Zijlstra wrote:
>>> On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
>>>
>>>> +#define __XCHG_GEN(cmp, type, sfx, skip, v)				\
>>>> +static __always_inline unsigned long					\
>>>> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,		\
>>>> +			 unsigned long new);				\
>>>> +static __always_inline u32						\
>>>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)		\
>>>> +{									\
>>>> +	int size = sizeof (type);					\
>>>> +	int off = (unsigned long)ptr % sizeof(u32);			\
>>>> +	volatile u32 *p = ptr - off;					\
>>>> +	int bitoff = BITOFF_CAL(size, off);				\
>>>> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
>>>> +	u32 oldv, newv, tmp;						\
>>>> +	u32 ret;							\
>>>> +	oldv = READ_ONCE(*p);						\
>>>> +	do {								\
>>>> +		ret = (oldv & bitmask) >> bitoff;			\
>>>> +		if (skip && ret != old)					\
>>>> +			break;						\
>>>> +		newv = (oldv & ~bitmask) | (new << bitoff);		\
>>>> +		tmp = oldv;						\
>>>> +		oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);	\
>>>> +	} while (tmp != oldv);						\
>>>> +	return ret;							\
>>>> +}
>>>
>>> So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
>>>
>>> Why did you choose to write it entirely in C?
>>>
>> yes, you are right. more load/store will be done in C code.
>> However such xchg_u8/u16 is just used by qspinlock now. and I did not see any performance regression.
>> So just wrote in C, for simple. :)
>>
>> Of course I have done xchg tests.
>> we run code just like xchg((u8*)&v, j++); in several threads.
>> and the result is,
>> [  768.374264] use time[1550072]ns in xchg_u8_asm
> 
> How was xchg_u8_asm() implemented, using lbarx or using a 32bit ll/sc
> loop with shifting and masking in it?
> 
Yes, using a 32-bit ll/sc loop.

It looks like:
        __asm__ __volatile__(
"1:     lwarx   %0,0,%3\n"
"       and %1,%0,%5\n"
"       or %1,%1,%4\n"
       PPC405_ERR77(0,%2)
"       stwcx.  %1,0,%3\n"
"       bne-    1b"
        : "=&r" (_oldv), "=&r" (tmp), "+m" (*(volatile unsigned int *)_p)
        : "r" (_p), "r" (_newv), "r" (_oldv_mask)
        : "cc", "memory");
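
(For context, a self-contained sketch of how a fragment like this might be wrapped into a byte-wide exchange; the surrounding declarations, the use of the patch's BITOFF_CAL helper, and the barrier-free form are assumptions for illustration, not the exact code that was benchmarked.)

static __always_inline u8 __xchg_u8_asm_sketch(volatile u8 *ptr, u8 new)
{
	int off = (unsigned long)ptr % sizeof(u32);
	int bitoff = BITOFF_CAL(sizeof(u8), off);
	volatile u32 *_p = (volatile u32 *)(ptr - off);	/* containing word */
	u32 _newv = (u32)new << bitoff;			/* new byte, shifted into place */
	u32 _oldv_mask = ~(0xffU << bitoff);		/* keep every bit but that byte */
	u32 _oldv, tmp;

	__asm__ __volatile__(
"1:	lwarx	%0,0,%3\n"		/* load-reserve the containing word */
"	and	%1,%0,%5\n"		/* clear the target byte */
"	or	%1,%1,%4\n"		/* merge in the new byte */
	PPC405_ERR77(0,%2)
"	stwcx.	%1,0,%3\n"		/* store-conditional */
"	bne-	1b"			/* lost the reservation, retry */
	: "=&r" (_oldv), "=&r" (tmp), "+m" (*(volatile unsigned int *)_p)
	: "r" (_p), "r" (_newv), "r" (_oldv_mask)
	: "cc", "memory");

	return (_oldv >> bitoff) & 0xff;	/* previous value of the byte */
}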


> Regards,
> Boqun
> 
>> [  768.377102] use time[2826802]ns in xchg_u8_c
>>
>> I think this is because there is one more load in C.
>> If possible, we can move such code in asm-generic/.
>>
>> thanks
>> xinhui
>>
Boqun Feng April 22, 2016, 3:16 a.m. UTC | #6
On Fri, Apr 22, 2016 at 09:59:22AM +0800, Pan Xinhui wrote:
> On 2016-04-21 23:52, Boqun Feng wrote:
> > On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
> >> On 2016-04-20 22:24, Peter Zijlstra wrote:
> >>> On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
> >>>
> >>>> +#define __XCHG_GEN(cmp, type, sfx, skip, v)				\
> >>>> +static __always_inline unsigned long					\
> >>>> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,		\
> >>>> +			 unsigned long new);				\
> >>>> +static __always_inline u32						\
> >>>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)		\
> >>>> +{									\
> >>>> +	int size = sizeof (type);					\
> >>>> +	int off = (unsigned long)ptr % sizeof(u32);			\
> >>>> +	volatile u32 *p = ptr - off;					\
> >>>> +	int bitoff = BITOFF_CAL(size, off);				\
> >>>> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
> >>>> +	u32 oldv, newv, tmp;						\
> >>>> +	u32 ret;							\
> >>>> +	oldv = READ_ONCE(*p);						\
> >>>> +	do {								\
> >>>> +		ret = (oldv & bitmask) >> bitoff;			\
> >>>> +		if (skip && ret != old)					\
> >>>> +			break;						\
> >>>> +		newv = (oldv & ~bitmask) | (new << bitoff);		\
> >>>> +		tmp = oldv;						\
> >>>> +		oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);	\
> >>>> +	} while (tmp != oldv);						\
> >>>> +	return ret;							\
> >>>> +}
> >>>
> >>> So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
> >>>
> >>> Why did you choose to write it entirely in C?
> >>>
> >> yes, you are right. more load/store will be done in C code.
> >> However such xchg_u8/u16 is just used by qspinlock now. and I did not see any performance regression.
> >> So just wrote in C, for simple. :)
> >>
> >> Of course I have done xchg tests.
> >> we run code just like xchg((u8*)&v, j++); in several threads.
> >> and the result is,
> >> [  768.374264] use time[1550072]ns in xchg_u8_asm
> > 
> > How was xchg_u8_asm() implemented, using lbarx or using a 32bit ll/sc
> > loop with shifting and masking in it?
> > 
> yes, using 32bit ll/sc loops.
> 
> looks like:
>         __asm__ __volatile__(
> "1:     lwarx   %0,0,%3\n"
> "       and %1,%0,%5\n"
> "       or %1,%1,%4\n"
>        PPC405_ERR77(0,%2)
> "       stwcx.  %1,0,%3\n"
> "       bne-    1b"
>         : "=&r" (_oldv), "=&r" (tmp), "+m" (*(volatile unsigned int *)_p)
>         : "r" (_p), "r" (_newv), "r" (_oldv_mask)
>         : "cc", "memory");
> 

Good, so this works for all ppc ISAs too.

Given the performance benefit (maybe caused by the reason Peter
mentioned), I think we should use this as the implementation of u8/u16
{cmp}xchg for now. For Power7 and later, we can always switch to the
lbarx/lharx version if an observable performance benefit can be achieved.

But the choice is left to you. After all, as you said, qspinlock is the
only user ;-)
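
(For concreteness, a hypothetical sketch of the lbarx-based byte exchange mentioned above, usable only on ISA 2.06+ parts such as Power7; this is not part of the posted patch, and memory-barrier variants are omitted.)

static __always_inline u8 __xchg_u8_lbarx_sketch(volatile u8 *p, u8 val)
{
	u8 prev;

	__asm__ __volatile__(
"1:	lbarx	%0,0,%2\n"	/* load-reserve the byte itself */
"	stbcx.	%3,0,%2\n"	/* store-conditional the new byte */
"	bne-	1b"		/* lost the reservation, retry */
	: "=&r" (prev), "+m" (*p)
	: "r" (p), "r" (val)
	: "cc", "memory");

	return prev;
}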

Regards,
Boqun

> 
> > Regards,
> > Boqun
> > 
> >> [  768.377102] use time[2826802]ns in xchg_u8_c
> >>
> >> I think this is because there is one more load in C.
> >> If possible, we can move such code in asm-generic/.
> >>
> >> thanks
> >> xinhui
> >>
>
xinhui April 25, 2016, 10:10 a.m. UTC | #7
On 2016-04-22 00:13, Peter Zijlstra wrote:
> On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
>> yes, you are right. more load/store will be done in C code.
>> However such xchg_u8/u16 is just used by qspinlock now. and I did not see any performance regression.
>> So just wrote in C, for simple. :)
> 
> Which is fine; but worthy of a note in your Changelog.
> 
Will do that.

>> Of course I have done xchg tests.
>> we run code just like xchg((u8*)&v, j++); in several threads.
>> and the result is,
>> [  768.374264] use time[1550072]ns in xchg_u8_asm
>> [  768.377102] use time[2826802]ns in xchg_u8_c
>>
>> I think this is because there is one more load in C.
>> If possible, we can move such code in asm-generic/.
> 
> So I'm not actually _that_ familiar with the PPC LL/SC implementation;
> but there are things a CPU can do to optimize these loops.
> 
> For example, a CPU might choose to not release the exclusive hold of the
> line for a number of cycles, except when it passes SC or an interrupt
> happens. This way there's a smaller chance the SC fails and inhibits
> forward progress.
I am not sure whether there is such a hardware optimization.

> 
> By doing the modification outside of the LL/SC you lose such
> advantages.
> 
> And yes, doing a !exclusive load prior to the exclusive load leads to an
> even bigger window where the data can get changed out from under you.
> 
You are right.
We have observed such data changes between the two different loads.
Peter Zijlstra April 25, 2016, 3:37 p.m. UTC | #8
On Mon, Apr 25, 2016 at 06:10:51PM +0800, Pan Xinhui wrote:
> > So I'm not actually _that_ familiar with the PPC LL/SC implementation;
> > but there are things a CPU can do to optimize these loops.
> > 
> > For example, a CPU might choose to not release the exclusive hold of the
> > line for a number of cycles, except when it passes SC or an interrupt
> > happens. This way there's a smaller chance the SC fails and inhibits
> > forward progress.

> I am not sure if there is such hardware optimization.

So I think the hardware must do _something_, otherwise competing cores
doing load-exclusive could live-lock a system, each one endlessly
breaking the exclusive ownership of the other and the store-conditional
always failing.

Of course, there are such implementations, and they tend to have to put
in explicit backoff loops; however, IIRC, PPC doesn't need that. (See
ARC for an example that needs to do this.)
xinhui April 26, 2016, 11:35 a.m. UTC | #9
On 2016-04-25 23:37, Peter Zijlstra wrote:
> On Mon, Apr 25, 2016 at 06:10:51PM +0800, Pan Xinhui wrote:
>>> So I'm not actually _that_ familiar with the PPC LL/SC implementation;
>>> but there are things a CPU can do to optimize these loops.
>>>
>>> For example, a CPU might choose to not release the exclusive hold of the
>>> line for a number of cycles, except when it passes SC or an interrupt
>>> happens. This way there's a smaller chance the SC fails and inhibits
>>> forward progress.
> 
>> I am not sure if there is such hardware optimization.
> 
> So I think the hardware must do _something_, otherwise competing cores
> doing load-exclusive could live-lock a system, each one endlessly
> breaking the exclusive ownership of the other and the store-conditional
> always failing.
> 
It seems there is no such optimization.

We have observed SC failing almost all the time in a contention test, and then getting stuck in the loop. :(
One thread modifies val with LL/SC, while the other threads just modify val without any regard for LL/SC.
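
(A hypothetical reconstruction of the contention test just described: one thread updates the variable through an LL/SC-based primitive while the others store to it directly, so the reservation is constantly broken; names and loop structure are assumptions.)

#include <linux/atomic.h>
#include <linux/kthread.h>

static u32 contended_val;

static int llsc_updater(void *unused)
{
	u32 old;

	while (!kthread_should_stop()) {
		old = READ_ONCE(contended_val);
		/* cmpxchg is an lwarx/stwcx. loop on ppc */
		cmpxchg(&contended_val, old, old + 1);
	}
	return 0;
}

static int plain_writer(void *unused)
{
	while (!kthread_should_stop())
		/* plain store, takes no part in the reservation protocol */
		WRITE_ONCE(contended_val, contended_val + 1);
	return 0;
}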

So in the end, I chose to rewrite this patch in asm. :)

> Of course, there are such implementations, and they tend to have to put
> in explicit backoff loops; however, IIRC, PPC doesn't need that. (See
> ARC for an example that needs to do this.)
>

Patch

diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index 44efe73..2aec04e 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -7,6 +7,38 @@ 
 #include <asm/asm-compat.h>
 #include <linux/bug.h>
 
+#ifdef __BIG_ENDIAN
+#define BITOFF_CAL(size, off)	((sizeof(u32) - size - off) * BITS_PER_BYTE)
+#else
+#define BITOFF_CAL(size, off)	(off * BITS_PER_BYTE)
+#endif
+
+#define __XCHG_GEN(cmp, type, sfx, skip, v)				\
+static __always_inline unsigned long					\
+__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,		\
+			 unsigned long new);				\
+static __always_inline u32						\
+__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)		\
+{									\
+	int size = sizeof (type);					\
+	int off = (unsigned long)ptr % sizeof(u32);			\
+	volatile u32 *p = ptr - off;					\
+	int bitoff = BITOFF_CAL(size, off);				\
+	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
+	u32 oldv, newv, tmp;						\
+	u32 ret;							\
+	oldv = READ_ONCE(*p);						\
+	do {								\
+		ret = (oldv & bitmask) >> bitoff;			\
+		if (skip && ret != old)					\
+			break;						\
+		newv = (oldv & ~bitmask) | (new << bitoff);		\
+		tmp = oldv;						\
+		oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);	\
+	} while (tmp != oldv);						\
+	return ret;							\
+}
+
 /*
  * Atomic exchange
  *
@@ -14,6 +46,19 @@ 
  * the previous value stored there.
  */
 
+#define XCHG_GEN(type, sfx, v)						\
+		__XCHG_GEN(_, type, sfx, 0, v)				\
+static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n)		\
+{									\
+	return ___xchg_##type##sfx(p, 0, n);				\
+}
+
+XCHG_GEN(u8, _local, volatile);
+XCHG_GEN(u8, _relaxed, );
+XCHG_GEN(u16, _local, volatile);
+XCHG_GEN(u16, _relaxed, );
+#undef XCHG_GEN
+
 static __always_inline unsigned long
 __xchg_u32_local(volatile void *p, unsigned long val)
 {
@@ -88,6 +133,10 @@  static __always_inline unsigned long
 __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __xchg_u8_local(ptr, x);
+	case 2:
+		return __xchg_u16_local(ptr, x);
 	case 4:
 		return __xchg_u32_local(ptr, x);
 #ifdef CONFIG_PPC64
@@ -103,6 +152,10 @@  static __always_inline unsigned long
 __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __xchg_u8_relaxed(ptr, x);
+	case 2:
+		return __xchg_u16_relaxed(ptr, x);
 	case 4:
 		return __xchg_u32_relaxed(ptr, x);
 #ifdef CONFIG_PPC64
@@ -131,6 +184,20 @@  __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
  * and return the old value of *p.
  */
 
+#define CMPXCHG_GEN(type, sfx, v)				\
+	__XCHG_GEN(cmp, type, sfx, 1, v)
+
+CMPXCHG_GEN(u8, , volatile);
+CMPXCHG_GEN(u8, _local,	volatile);
+CMPXCHG_GEN(u8, _relaxed, );
+CMPXCHG_GEN(u8, _acquire, );
+CMPXCHG_GEN(u16, , volatile);
+CMPXCHG_GEN(u16, _local, volatile);
+CMPXCHG_GEN(u16, _relaxed, );
+CMPXCHG_GEN(u16, _acquire, );
+#undef CMPXCHG_GEN
+#undef __XCHG_GEN
+
 static __always_inline unsigned long
 __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
 {
@@ -316,6 +383,10 @@  __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
 	  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -332,6 +403,10 @@  __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
 	  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_local(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_local(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_local(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -348,6 +423,10 @@  __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
 		  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_relaxed(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_relaxed(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_relaxed(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -364,6 +443,10 @@  __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
 		  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_acquire(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_acquire(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_acquire(ptr, old, new);
 #ifdef CONFIG_PPC64