diff mbox series

[11/17] powerpc/qspinlock: allow propagation of yield CPU down the queue

Message ID 20220728063120.2867508-13-npiggin@gmail.com (mailing list archive)
State Changes Requested
Headers show
Series powerpc: alternate queued spinlock implementation | expand

Commit Message

Nicholas Piggin July 28, 2022, 6:31 a.m. UTC
Having all CPUs poll the lock word for the owner CPU that should be
yielded to defeats most of the purpose of using MCS queueing for
scalability. Yet it may be desirable for queued waiters to to yield
to a preempted owner.

s390 addreses this problem by having queued waiters sample the lock
word to find the owner much less frequently. In this approach, the
waiters never sample it directly, but the queue head propagates the
owner CPU back to the next waiter if it ever finds the owner has
been preempted. Queued waiters then subsequently propagate the owner
CPU back to the next waiter, and so on.

Disable this option by default for now, i.e., no logical change.
---
 arch/powerpc/lib/qspinlock.c | 85 +++++++++++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 1 deletion(-)

Comments

Jordan Niethe Aug. 12, 2022, 4:17 a.m. UTC | #1
On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> Having all CPUs poll the lock word for the owner CPU that should be
> yielded to defeats most of the purpose of using MCS queueing for
> scalability. Yet it may be desirable for queued waiters to to yield
> to a preempted owner.
> 
> s390 addreses this problem by having queued waiters sample the lock
> word to find the owner much less frequently. In this approach, the
> waiters never sample it directly, but the queue head propagates the
> owner CPU back to the next waiter if it ever finds the owner has
> been preempted. Queued waiters then subsequently propagate the owner
> CPU back to the next waiter, and so on.
> 
> Disable this option by default for now, i.e., no logical change.
> ---
>  arch/powerpc/lib/qspinlock.c | 85 +++++++++++++++++++++++++++++++++++-
>  1 file changed, 84 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index 94f007f66942..28c85a2d5635 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -12,6 +12,7 @@
>  struct qnode {
>  	struct qnode	*next;
>  	struct qspinlock *lock;
> +	int		yield_cpu;
>  	u8		locked; /* 1 if lock acquired */
>  };
>  
> @@ -28,6 +29,7 @@ static int HEAD_SPINS __read_mostly = (1<<8);
>  static bool pv_yield_owner __read_mostly = true;
>  static bool pv_yield_allow_steal __read_mostly = false;
>  static bool pv_yield_prev __read_mostly = true;
> +static bool pv_yield_propagate_owner __read_mostly = true;

This also seems to be enabled by default.

>  
>  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
>  
> @@ -257,13 +259,66 @@ static __always_inline void yield_head_to_locked_owner(struct qspinlock *lock, u
>  	__yield_to_locked_owner(lock, val, paravirt, clear_mustq);
>  }
>  
> +static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int *set_yield_cpu, bool paravirt)
> +{
> +	struct qnode *next;
> +	int owner;
> +
> +	if (!paravirt)
> +		return;
> +	if (!pv_yield_propagate_owner)
> +		return;
> +
> +	owner = get_owner_cpu(val);
> +	if (*set_yield_cpu == owner)
> +		return;
> +
> +	next = READ_ONCE(node->next);
> +	if (!next)
> +		return;
> +
> +	if (vcpu_is_preempted(owner)) {

Is there a difference about using vcpu_is_preempted() here
vs checking bit 0 in other places?


> +		next->yield_cpu = owner;
> +		*set_yield_cpu = owner;
> +	} else if (*set_yield_cpu != -1) {

It might be worth giving the -1 CPU a #define.

> +		next->yield_cpu = owner;
> +		*set_yield_cpu = owner;
> +	}
> +}

Does this need to pass set_yield_cpu by reference? Couldn't it's new value be
returned? To me it makes it more clear the function is used to change
set_yield_cpu. I think this would work:

int set_yield_cpu = -1;

static __always_inline int propagate_yield_cpu(struct qnode *node, u32 val, int set_yield_cpu, bool paravirt)
{
	struct qnode *next;
	int owner;

	if (!paravirt)
		goto out;
	if (!pv_yield_propagate_owner)
		goto out;

	owner = get_owner_cpu(val);
	if (set_yield_cpu == owner)
		goto out;

	next = READ_ONCE(node->next);
	if (!next)
		goto out;

	if (vcpu_is_preempted(owner)) {
		next->yield_cpu = owner;
		return owner;
	} else if (set_yield_cpu != -1) {
		next->yield_cpu = owner;
		return owner;
	}

out:
	return set_yield_cpu;
}

set_yield_cpu = propagate_yield_cpu(...  set_yield_cpu ...);



> +
>  static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, int prev_cpu, bool paravirt)
>  {
>  	u32 yield_count;
> +	int yield_cpu;
>  
>  	if (!paravirt)
>  		goto relax;
>  
> +	if (!pv_yield_propagate_owner)
> +		goto yield_prev;
> +
> +	yield_cpu = READ_ONCE(node->yield_cpu);
> +	if (yield_cpu == -1) {
> +		/* Propagate back the -1 CPU */
> +		if (node->next && node->next->yield_cpu != -1)
> +			node->next->yield_cpu = yield_cpu;
> +		goto yield_prev;
> +	}
> +
> +	yield_count = yield_count_of(yield_cpu);
> +	if ((yield_count & 1) == 0)
> +		goto yield_prev; /* owner vcpu is running */
> +
> +	smp_rmb();
> +
> +	if (yield_cpu == node->yield_cpu) {
> +		if (node->next && node->next->yield_cpu != yield_cpu)
> +			node->next->yield_cpu = yield_cpu;
> +		yield_to_preempted(yield_cpu, yield_count);
> +		return;
> +	}
> +
> +yield_prev:
>  	if (!pv_yield_prev)
>  		goto relax;
>  
> @@ -337,6 +392,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>  	node = &qnodesp->nodes[idx];
>  	node->next = NULL;
>  	node->lock = lock;
> +	node->yield_cpu = -1;
>  	node->locked = 0;
>  
>  	tail = encode_tail_cpu();
> @@ -358,13 +414,21 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>  		while (!node->locked)
>  			yield_to_prev(lock, node, prev_cpu, paravirt);
>  
> +		/* Clear out stale propagated yield_cpu */
> +		if (paravirt && pv_yield_propagate_owner && node->yield_cpu != -1)
> +			node->yield_cpu = -1;
> +
>  		smp_rmb(); /* acquire barrier for the mcs lock */
>  	}
>  
>  	if (!MAYBE_STEALERS) {
> +		int set_yield_cpu = -1;
> +
>  		/* We're at the head of the waitqueue, wait for the lock. */
> -		while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL)
> +		while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
> +			propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
>  			yield_head_to_locked_owner(lock, val, paravirt, false);
> +		}
>  
>  		/* If we're the last queued, must clean up the tail. */
>  		if ((val & _Q_TAIL_CPU_MASK) == tail) {
> @@ -376,12 +440,14 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>  		/* We must be the owner, just set the lock bit and acquire */
>  		lock_set_locked(lock);
>  	} else {
> +		int set_yield_cpu = -1;
>  		int iters = 0;
>  		bool set_mustq = false;
>  
>  again:
>  		/* We're at the head of the waitqueue, wait for the lock. */
>  		while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
> +			propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
>  			yield_head_to_locked_owner(lock, val, paravirt,
>  					pv_yield_allow_steal && set_mustq);
>  
> @@ -540,6 +606,22 @@ static int pv_yield_prev_get(void *data, u64 *val)
>  
>  DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n");
>  
> +static int pv_yield_propagate_owner_set(void *data, u64 val)
> +{
> +	pv_yield_propagate_owner = !!val;
> +
> +	return 0;
> +}
> +
> +static int pv_yield_propagate_owner_get(void *data, u64 *val)
> +{
> +	*val = pv_yield_propagate_owner;
> +
> +	return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n");
> +
>  static __init int spinlock_debugfs_init(void)
>  {
>  	debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins);
> @@ -548,6 +630,7 @@ static __init int spinlock_debugfs_init(void)
>  		debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner);
>  		debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal);
>  		debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev);
> +		debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner);
>  	}
>  
>  	return 0;
Laurent Dufour Oct. 6, 2022, 5:27 p.m. UTC | #2
On 28/07/2022 08:31:14, Nicholas Piggin wrote:
> Having all CPUs poll the lock word for the owner CPU that should be
> yielded to defeats most of the purpose of using MCS queueing for
> scalability. Yet it may be desirable for queued waiters to to yield
> to a preempted owner.
> 
> s390 addreses this problem by having queued waiters sample the lock
> word to find the owner much less frequently. In this approach, the
> waiters never sample it directly, but the queue head propagates the
> owner CPU back to the next waiter if it ever finds the owner has
> been preempted. Queued waiters then subsequently propagate the owner
> CPU back to the next waiter, and so on.
> 
> Disable this option by default for now, i.e., no logical change.
> ---
>  arch/powerpc/lib/qspinlock.c | 85 +++++++++++++++++++++++++++++++++++-
>  1 file changed, 84 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index 94f007f66942..28c85a2d5635 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -12,6 +12,7 @@
>  struct qnode {
>  	struct qnode	*next;
>  	struct qspinlock *lock;
> +	int		yield_cpu;
>  	u8		locked; /* 1 if lock acquired */
>  };
>  
> @@ -28,6 +29,7 @@ static int HEAD_SPINS __read_mostly = (1<<8);
>  static bool pv_yield_owner __read_mostly = true;
>  static bool pv_yield_allow_steal __read_mostly = false;
>  static bool pv_yield_prev __read_mostly = true;
> +static bool pv_yield_propagate_owner __read_mostly = true;
>  
>  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
>  
> @@ -257,13 +259,66 @@ static __always_inline void yield_head_to_locked_owner(struct qspinlock *lock, u
>  	__yield_to_locked_owner(lock, val, paravirt, clear_mustq);
>  }
>  
> +static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int *set_yield_cpu, bool paravirt)
> +{
> +	struct qnode *next;
> +	int owner;
> +
> +	if (!paravirt)
> +		return;
> +	if (!pv_yield_propagate_owner)
> +		return;
> +
> +	owner = get_owner_cpu(val);
> +	if (*set_yield_cpu == owner)
> +		return;
> +
> +	next = READ_ONCE(node->next);
> +	if (!next)
> +		return;
> +
> +	if (vcpu_is_preempted(owner)) {
> +		next->yield_cpu = owner;
> +		*set_yield_cpu = owner;
> +	} else if (*set_yield_cpu != -1) {
> +		next->yield_cpu = owner;
> +		*set_yield_cpu = owner;
> +	}

This is bit confusing, the else branch is the same as the true one.
This might be written like this:

	if (vcpu_is_preempted(owner) || *set_yield_cpu != -1) {
		next->yield_cpu = owner;
		*set_yield_cpu = owner;
	}

> +}
> +
>  static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, int prev_cpu, bool paravirt)
>  {
>  	u32 yield_count;
> +	int yield_cpu;
>  
>  	if (!paravirt)
>  		goto relax;
>  
> +	if (!pv_yield_propagate_owner)
> +		goto yield_prev;
> +
> +	yield_cpu = READ_ONCE(node->yield_cpu);
> +	if (yield_cpu == -1) {
> +		/* Propagate back the -1 CPU */
> +		if (node->next && node->next->yield_cpu != -1)
> +			node->next->yield_cpu = yield_cpu;
> +		goto yield_prev;
> +	}
> +
> +	yield_count = yield_count_of(yield_cpu);
> +	if ((yield_count & 1) == 0)
> +		goto yield_prev; /* owner vcpu is running */
> +
> +	smp_rmb();
> +
> +	if (yield_cpu == node->yield_cpu) {
> +		if (node->next && node->next->yield_cpu != yield_cpu)
> +			node->next->yield_cpu = yield_cpu;
> +		yield_to_preempted(yield_cpu, yield_count);
> +		return;
> +	}
> +

In the case that test is false, this means that the lock owner has probably
changed, why are we yeilding to the previous node instead of reading again
the node->yield_cpu, checking against -1 value etc..?
Yielding to the previous node is valid, but it might be better to yield to
the owner, isn't it?

> +yield_prev:
>  	if (!pv_yield_prev)
>  		goto relax;
>  
> @@ -337,6 +392,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>  	node = &qnodesp->nodes[idx];
>  	node->next = NULL;
>  	node->lock = lock;
> +	node->yield_cpu = -1;
>  	node->locked = 0;
>  
>  	tail = encode_tail_cpu();
> @@ -358,13 +414,21 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>  		while (!node->locked)
>  			yield_to_prev(lock, node, prev_cpu, paravirt);
>  
> +		/* Clear out stale propagated yield_cpu */
> +		if (paravirt && pv_yield_propagate_owner && node->yield_cpu != -1)
> +			node->yield_cpu = -1;

Why doing tests and not directly setting node->yield_cpu to -1?
Is the write operation more costly than the 3 tests?

> +
>  		smp_rmb(); /* acquire barrier for the mcs lock */
>  	}
>  
>  	if (!MAYBE_STEALERS) {
> +		int set_yield_cpu = -1;
> +
>  		/* We're at the head of the waitqueue, wait for the lock. */
> -		while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL)
> +		while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
> +			propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
>  			yield_head_to_locked_owner(lock, val, paravirt, false);
> +		}
>  
>  		/* If we're the last queued, must clean up the tail. */
>  		if ((val & _Q_TAIL_CPU_MASK) == tail) {
> @@ -376,12 +440,14 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>  		/* We must be the owner, just set the lock bit and acquire */
>  		lock_set_locked(lock);
>  	} else {
> +		int set_yield_cpu = -1;
>  		int iters = 0;
>  		bool set_mustq = false;
>  
>  again:
>  		/* We're at the head of the waitqueue, wait for the lock. */
>  		while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
> +			propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
>  			yield_head_to_locked_owner(lock, val, paravirt,
>  					pv_yield_allow_steal && set_mustq);
>  
> @@ -540,6 +606,22 @@ static int pv_yield_prev_get(void *data, u64 *val)
>  
>  DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n");
>  
> +static int pv_yield_propagate_owner_set(void *data, u64 val)
> +{
> +	pv_yield_propagate_owner = !!val;
> +
> +	return 0;
> +}
> +
> +static int pv_yield_propagate_owner_get(void *data, u64 *val)
> +{
> +	*val = pv_yield_propagate_owner;
> +
> +	return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n");
> +
>  static __init int spinlock_debugfs_init(void)
>  {
>  	debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins);
> @@ -548,6 +630,7 @@ static __init int spinlock_debugfs_init(void)
>  		debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner);
>  		debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal);
>  		debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev);
> +		debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner);
>  	}
>  
>  	return 0;
Jordan Niethe Nov. 10, 2022, 12:42 a.m. UTC | #3
On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
[resend as utf-8, not utf-7]
> Having all CPUs poll the lock word for the owner CPU that should be
> yielded to defeats most of the purpose of using MCS queueing for
> scalability. Yet it may be desirable for queued waiters to to yield
> to a preempted owner.
> 
> s390 addreses this problem by having queued waiters sample the lock
> word to find the owner much less frequently. In this approach, the
> waiters never sample it directly, but the queue head propagates the
> owner CPU back to the next waiter if it ever finds the owner has
> been preempted. Queued waiters then subsequently propagate the owner
> CPU back to the next waiter, and so on.
> 
> Disable this option by default for now, i.e., no logical change.
> ---
>  arch/powerpc/lib/qspinlock.c | 85 +++++++++++++++++++++++++++++++++++-
>  1 file changed, 84 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index 94f007f66942..28c85a2d5635 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -12,6 +12,7 @@
>  struct qnode {
>  	struct qnode	*next;
>  	struct qspinlock *lock;
> +	int		yield_cpu;
>  	u8		locked; /* 1 if lock acquired */
>  };
>  
> @@ -28,6 +29,7 @@ static int HEAD_SPINS __read_mostly = (1<<8);
>  static bool pv_yield_owner __read_mostly = true;
>  static bool pv_yield_allow_steal __read_mostly = false;
>  static bool pv_yield_prev __read_mostly = true;
> +static bool pv_yield_propagate_owner __read_mostly = true;

This also seems to be enabled by default.

>  
>  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
>  
> @@ -257,13 +259,66 @@ static __always_inline void yield_head_to_locked_owner(struct qspinlock *lock, u
>  	__yield_to_locked_owner(lock, val, paravirt, clear_mustq);
>  }
>  
> +static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int *set_yield_cpu, bool paravirt)
> +{
> +	struct qnode *next;
> +	int owner;
> +
> +	if (!paravirt)
> +		return;
> +	if (!pv_yield_propagate_owner)
> +		return;
> +
> +	owner = get_owner_cpu(val);
> +	if (*set_yield_cpu == owner)
> +		return;
> +
> +	next = READ_ONCE(node->next);
> +	if (!next)
> +		return;
> +
> +	if (vcpu_is_preempted(owner)) {

Is there a difference about using vcpu_is_preempted() here
vs checking bit 0 in other places?


> +		next->yield_cpu = owner;
> +		*set_yield_cpu = owner;
> +	} else if (*set_yield_cpu != -1) {

It might be worth giving the -1 CPU a #define.

> +		next->yield_cpu = owner;
> +		*set_yield_cpu = owner;
> +	}
> +}

Does this need to pass set_yield_cpu by reference? Couldn't it's new value be
returned? To me it makes it more clear the function is used to change
set_yield_cpu. I think this would work:

int set_yield_cpu = -1;

static __always_inline int propagate_yield_cpu(struct qnode *node, u32 val, int set_yield_cpu, bool paravirt)
{
	struct qnode *next;
	int owner;

	if (!paravirt)
		goto out;
	if (!pv_yield_propagate_owner)
		goto out;

	owner = get_owner_cpu(val);
	if (set_yield_cpu == owner)
		goto out;

	next = READ_ONCE(node->next);
	if (!next)
		goto out;

	if (vcpu_is_preempted(owner)) {
		next->yield_cpu = owner;
		return owner;
	} else if (set_yield_cpu != -1) {
		next->yield_cpu = owner;
		return owner;
	}

out:
	return set_yield_cpu;
}

set_yield_cpu = propagate_yield_cpu(...  set_yield_cpu ...);



> +
>  static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, int prev_cpu, bool paravirt)
>  {
>  	u32 yield_count;
> +	int yield_cpu;
>  
>  	if (!paravirt)
>  		goto relax;
>  
> +	if (!pv_yield_propagate_owner)
> +		goto yield_prev;
> +
> +	yield_cpu = READ_ONCE(node->yield_cpu);
> +	if (yield_cpu == -1) {
> +		/* Propagate back the -1 CPU */
> +		if (node->next && node->next->yield_cpu != -1)
> +			node->next->yield_cpu = yield_cpu;
> +		goto yield_prev;
> +	}
> +
> +	yield_count = yield_count_of(yield_cpu);
> +	if ((yield_count & 1) == 0)
> +		goto yield_prev; /* owner vcpu is running */
> +
> +	smp_rmb();
> +
> +	if (yield_cpu == node->yield_cpu) {
> +		if (node->next && node->next->yield_cpu != yield_cpu)
> +			node->next->yield_cpu = yield_cpu;
> +		yield_to_preempted(yield_cpu, yield_count);
> +		return;
> +	}
> +
> +yield_prev:
>  	if (!pv_yield_prev)
>  		goto relax;
>  
> @@ -337,6 +392,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>  	node = &qnodesp->nodes[idx];
>  	node->next = NULL;
>  	node->lock = lock;
> +	node->yield_cpu = -1;
>  	node->locked = 0;
>  
>  	tail = encode_tail_cpu();
> @@ -358,13 +414,21 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>  		while (!node->locked)
>  			yield_to_prev(lock, node, prev_cpu, paravirt);
>  
> +		/* Clear out stale propagated yield_cpu */
> +		if (paravirt && pv_yield_propagate_owner && node->yield_cpu != -1)
> +			node->yield_cpu = -1;
> +
>  		smp_rmb(); /* acquire barrier for the mcs lock */
>  	}
>  
>  	if (!MAYBE_STEALERS) {
> +		int set_yield_cpu = -1;
> +
>  		/* We're at the head of the waitqueue, wait for the lock. */
> -		while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL)
> +		while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
> +			propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
>  			yield_head_to_locked_owner(lock, val, paravirt, false);
> +		}
>  
>  		/* If we're the last queued, must clean up the tail. */
>  		if ((val & _Q_TAIL_CPU_MASK) == tail) {
> @@ -376,12 +440,14 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>  		/* We must be the owner, just set the lock bit and acquire */
>  		lock_set_locked(lock);
>  	} else {
> +		int set_yield_cpu = -1;
>  		int iters = 0;
>  		bool set_mustq = false;
>  
>  again:
>  		/* We're at the head of the waitqueue, wait for the lock. */
>  		while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
> +			propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
>  			yield_head_to_locked_owner(lock, val, paravirt,
>  					pv_yield_allow_steal && set_mustq);
>  
> @@ -540,6 +606,22 @@ static int pv_yield_prev_get(void *data, u64 *val)
>  
>  DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n");
>  
> +static int pv_yield_propagate_owner_set(void *data, u64 val)
> +{
> +	pv_yield_propagate_owner = !!val;
> +
> +	return 0;
> +}
> +
> +static int pv_yield_propagate_owner_get(void *data, u64 *val)
> +{
> +	*val = pv_yield_propagate_owner;
> +
> +	return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n");
> +
>  static __init int spinlock_debugfs_init(void)
>  {
>  	debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins);
> @@ -548,6 +630,7 @@ static __init int spinlock_debugfs_init(void)
>  		debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner);
>  		debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal);
>  		debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev);
> +		debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner);
>  	}
>  
>  	return 0;
Nicholas Piggin Nov. 10, 2022, 11:25 a.m. UTC | #4
On Thu Nov 10, 2022 at 10:42 AM AEST, Jordan Niethe wrote:
> On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> [resend as utf-8, not utf-7]
> > Having all CPUs poll the lock word for the owner CPU that should be
> > yielded to defeats most of the purpose of using MCS queueing for
> > scalability. Yet it may be desirable for queued waiters to to yield
> > to a preempted owner.
> > 
> > s390 addreses this problem by having queued waiters sample the lock
> > word to find the owner much less frequently. In this approach, the
> > waiters never sample it directly, but the queue head propagates the
> > owner CPU back to the next waiter if it ever finds the owner has
> > been preempted. Queued waiters then subsequently propagate the owner
> > CPU back to the next waiter, and so on.
> > 
> > Disable this option by default for now, i.e., no logical change.
> > ---
> >  arch/powerpc/lib/qspinlock.c | 85 +++++++++++++++++++++++++++++++++++-
> >  1 file changed, 84 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> > index 94f007f66942..28c85a2d5635 100644
> > --- a/arch/powerpc/lib/qspinlock.c
> > +++ b/arch/powerpc/lib/qspinlock.c
> > @@ -12,6 +12,7 @@
> >  struct qnode {
> >  	struct qnode	*next;
> >  	struct qspinlock *lock;
> > +	int		yield_cpu;
> >  	u8		locked; /* 1 if lock acquired */
> >  };
> >  
> > @@ -28,6 +29,7 @@ static int HEAD_SPINS __read_mostly = (1<<8);
> >  static bool pv_yield_owner __read_mostly = true;
> >  static bool pv_yield_allow_steal __read_mostly = false;
> >  static bool pv_yield_prev __read_mostly = true;
> > +static bool pv_yield_propagate_owner __read_mostly = true;
>
> This also seems to be enabled by default.
>
> >  
> >  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
> >  
> > @@ -257,13 +259,66 @@ static __always_inline void yield_head_to_locked_owner(struct qspinlock *lock, u
> >  	__yield_to_locked_owner(lock, val, paravirt, clear_mustq);
> >  }
> >  
> > +static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int *set_yield_cpu, bool paravirt)
> > +{
> > +	struct qnode *next;
> > +	int owner;
> > +
> > +	if (!paravirt)
> > +		return;
> > +	if (!pv_yield_propagate_owner)
> > +		return;
> > +
> > +	owner = get_owner_cpu(val);
> > +	if (*set_yield_cpu == owner)
> > +		return;
> > +
> > +	next = READ_ONCE(node->next);
> > +	if (!next)
> > +		return;
> > +
> > +	if (vcpu_is_preempted(owner)) {
>
> Is there a difference about using vcpu_is_preempted() here
> vs checking bit 0 in other places?

Yeah we weren't yielding here so didn't have to load the yield count
explicitly.

> > +		next->yield_cpu = owner;
> > +		*set_yield_cpu = owner;
> > +	} else if (*set_yield_cpu != -1) {
>
> It might be worth giving the -1 CPU a #define.

It's fairly standard to use -1 as a null value for cpu.

>
> > +		next->yield_cpu = owner;
> > +		*set_yield_cpu = owner;
> > +	}
> > +}
>
> Does this need to pass set_yield_cpu by reference? Couldn't it's new value be
> returned? To me it makes it more clear the function is used to change
> set_yield_cpu. I think this would work:
>
> int set_yield_cpu = -1;
>
> static __always_inline int propagate_yield_cpu(struct qnode *node, u32 val, int set_yield_cpu, bool paravirt)
> {
> 	struct qnode *next;
> 	int owner;
>
> 	if (!paravirt)
> 		goto out;
> 	if (!pv_yield_propagate_owner)
> 		goto out;
>
> 	owner = get_owner_cpu(val);
> 	if (set_yield_cpu == owner)
> 		goto out;
>
> 	next = READ_ONCE(node->next);
> 	if (!next)
> 		goto out;
>
> 	if (vcpu_is_preempted(owner)) {
> 		next->yield_cpu = owner;
> 		return owner;
> 	} else if (set_yield_cpu != -1) {
> 		next->yield_cpu = owner;
> 		return owner;
> 	}
>
> out:
> 	return set_yield_cpu;
> }
>
> set_yield_cpu = propagate_yield_cpu(...  set_yield_cpu ...);

I think I prefer as is, because the caller doesn't use it anywhere. It
looks more like some temporary storage to be used by the function over
multiple calls this way.

Thanks,
Nick
diff mbox series

Patch

diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index 94f007f66942..28c85a2d5635 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -12,6 +12,7 @@ 
 struct qnode {
 	struct qnode	*next;
 	struct qspinlock *lock;
+	int		yield_cpu;
 	u8		locked; /* 1 if lock acquired */
 };
 
@@ -28,6 +29,7 @@  static int HEAD_SPINS __read_mostly = (1<<8);
 static bool pv_yield_owner __read_mostly = true;
 static bool pv_yield_allow_steal __read_mostly = false;
 static bool pv_yield_prev __read_mostly = true;
+static bool pv_yield_propagate_owner __read_mostly = true;
 
 static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
 
@@ -257,13 +259,66 @@  static __always_inline void yield_head_to_locked_owner(struct qspinlock *lock, u
 	__yield_to_locked_owner(lock, val, paravirt, clear_mustq);
 }
 
+static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int *set_yield_cpu, bool paravirt)
+{
+	struct qnode *next;
+	int owner;
+
+	if (!paravirt)
+		return;
+	if (!pv_yield_propagate_owner)
+		return;
+
+	owner = get_owner_cpu(val);
+	if (*set_yield_cpu == owner)
+		return;
+
+	next = READ_ONCE(node->next);
+	if (!next)
+		return;
+
+	if (vcpu_is_preempted(owner)) {
+		next->yield_cpu = owner;
+		*set_yield_cpu = owner;
+	} else if (*set_yield_cpu != -1) {
+		next->yield_cpu = owner;
+		*set_yield_cpu = owner;
+	}
+}
+
 static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, int prev_cpu, bool paravirt)
 {
 	u32 yield_count;
+	int yield_cpu;
 
 	if (!paravirt)
 		goto relax;
 
+	if (!pv_yield_propagate_owner)
+		goto yield_prev;
+
+	yield_cpu = READ_ONCE(node->yield_cpu);
+	if (yield_cpu == -1) {
+		/* Propagate back the -1 CPU */
+		if (node->next && node->next->yield_cpu != -1)
+			node->next->yield_cpu = yield_cpu;
+		goto yield_prev;
+	}
+
+	yield_count = yield_count_of(yield_cpu);
+	if ((yield_count & 1) == 0)
+		goto yield_prev; /* owner vcpu is running */
+
+	smp_rmb();
+
+	if (yield_cpu == node->yield_cpu) {
+		if (node->next && node->next->yield_cpu != yield_cpu)
+			node->next->yield_cpu = yield_cpu;
+		yield_to_preempted(yield_cpu, yield_count);
+		return;
+	}
+
+yield_prev:
 	if (!pv_yield_prev)
 		goto relax;
 
@@ -337,6 +392,7 @@  static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
 	node = &qnodesp->nodes[idx];
 	node->next = NULL;
 	node->lock = lock;
+	node->yield_cpu = -1;
 	node->locked = 0;
 
 	tail = encode_tail_cpu();
@@ -358,13 +414,21 @@  static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
 		while (!node->locked)
 			yield_to_prev(lock, node, prev_cpu, paravirt);
 
+		/* Clear out stale propagated yield_cpu */
+		if (paravirt && pv_yield_propagate_owner && node->yield_cpu != -1)
+			node->yield_cpu = -1;
+
 		smp_rmb(); /* acquire barrier for the mcs lock */
 	}
 
 	if (!MAYBE_STEALERS) {
+		int set_yield_cpu = -1;
+
 		/* We're at the head of the waitqueue, wait for the lock. */
-		while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL)
+		while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
+			propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
 			yield_head_to_locked_owner(lock, val, paravirt, false);
+		}
 
 		/* If we're the last queued, must clean up the tail. */
 		if ((val & _Q_TAIL_CPU_MASK) == tail) {
@@ -376,12 +440,14 @@  static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
 		/* We must be the owner, just set the lock bit and acquire */
 		lock_set_locked(lock);
 	} else {
+		int set_yield_cpu = -1;
 		int iters = 0;
 		bool set_mustq = false;
 
 again:
 		/* We're at the head of the waitqueue, wait for the lock. */
 		while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
+			propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
 			yield_head_to_locked_owner(lock, val, paravirt,
 					pv_yield_allow_steal && set_mustq);
 
@@ -540,6 +606,22 @@  static int pv_yield_prev_get(void *data, u64 *val)
 
 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n");
 
+static int pv_yield_propagate_owner_set(void *data, u64 val)
+{
+	pv_yield_propagate_owner = !!val;
+
+	return 0;
+}
+
+static int pv_yield_propagate_owner_get(void *data, u64 *val)
+{
+	*val = pv_yield_propagate_owner;
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n");
+
 static __init int spinlock_debugfs_init(void)
 {
 	debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins);
@@ -548,6 +630,7 @@  static __init int spinlock_debugfs_init(void)
 		debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner);
 		debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal);
 		debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev);
+		debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner);
 	}
 
 	return 0;