
[v2,1/6] powerpc/smp: Cache CPU has Asymmetric SMP

Message ID 20231018163751.2423181-2-srikar@linux.vnet.ibm.com
State Superseded
Series powerpc/smp: Shared processor sched optimizations

Commit Message

Srikar Dronamraju Oct. 18, 2023, 4:37 p.m. UTC
Currently the CPU feature flag is checked whenever powerpc_smt_flags()
gets called. This is unnecessary overhead: CPU_FTR_ASYM_SMT is set based
on the processor, so all processors will either have it set or all will
have it unset.

Hence check the feature flag only once and cache the result for
subsequent use. This commit helps avoid a branch in powerpc_smt_flags().

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
Changelog:
v1->v2: Using a static key instead of a variable.
Using pr_info_once() instead of printk_once().

 arch/powerpc/kernel/smp.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)
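
For context, the static-key pattern this patch relies on works roughly as
follows (a minimal sketch, not code from the patch; example_key,
example_flags() and example_init() are illustrative names):

#include <linux/jump_label.h>

/* The key defaults to false, so each static_branch_unlikely() site
 * compiles to a plain nop until the key is enabled. */
DEFINE_STATIC_KEY_FALSE(example_key);

static int example_flags(void)
{
	/* No flag load or compare here: the taken/not-taken decision
	 * is patched directly into the instruction stream. */
	if (static_branch_unlikely(&example_key))
		return 1;
	return 0;
}

static void __init example_init(void)
{
	/* Decide once at boot and patch all branch sites. */
	if (cpu_has_feature(CPU_FTR_ASYM_SMT))
		static_branch_enable(&example_key);
}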

Comments

Michael Ellerman Oct. 19, 2023, 4:33 a.m. UTC | #1
Srikar Dronamraju <srikar@linux.vnet.ibm.com> writes:
> Currently the CPU feature flag is checked whenever powerpc_smt_flags()
> gets called. This is unnecessary overhead: CPU_FTR_ASYM_SMT is set
> based on the processor, so all processors will either have it set or
> all will have it unset.

The cpu_has_feature() test is implemented with a static key.

So AFAICS this is just replacing one static key with another?

I see that you use the new static key in subsequent patches. But
couldn't those just use the existing cpu feature test?
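
For reference, cpu_has_feature() on powerpc boils down to roughly this
(a simplified sketch of arch/powerpc/include/asm/cpu_has_feature.h, with
the debug checks elided):

static __always_inline bool cpu_has_feature(unsigned long feature)
{
	int i;

	/* Compile-time known features need no runtime test */
	if (CPU_FTRS_ALWAYS & feature)
		return true;
	if (!(CPU_FTRS_POSSIBLE & feature))
		return false;

	/* Each feature bit maps to its own static key */
	i = __builtin_ctzl(feature);
	return static_branch_likely(&cpu_feature_keys[i]);
}

i.e. each call site is already a patched nop/branch, not a flag load.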

Anyway I'd be interested to see how the generated code differs
before/after this.

cheers

> Hence check the feature flag only once and cache the result for
> subsequent use. This commit helps avoid a branch in powerpc_smt_flags().
>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> ---
> Changelog:
> v1->v2: Using static keys instead of a variable.
> Using pr_info_once instead of printk
>
>  arch/powerpc/kernel/smp.c | 15 +++++++++------
>  1 file changed, 9 insertions(+), 6 deletions(-)
>
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 5826f5108a12..37c41297c9ce 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -988,18 +988,16 @@ static int __init init_thread_group_cache_map(int cpu, int cache_property)
>  }
>  
>  static bool shared_caches;
> +DEFINE_STATIC_KEY_FALSE(powerpc_asym_packing);
>  
>  #ifdef CONFIG_SCHED_SMT
>  /* cpumask of CPUs with asymmetric SMT dependency */
>  static int powerpc_smt_flags(void)
>  {
> -	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
> +	if (static_branch_unlikely(&powerpc_asym_packing))
> +		return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING;
>  
> -	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
> -		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
> -		flags |= SD_ASYM_PACKING;
> -	}
> -	return flags;
> +	return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
>  }
>  #endif
>  
> @@ -1686,6 +1684,11 @@ static void __init fixup_topology(void)
>  {
>  	int i;
>  
> +	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
> +		pr_info_once("Enabling Asymmetric SMT scheduling\n");
> +		static_branch_enable(&powerpc_asym_packing);
> +	}
> +
>  #ifdef CONFIG_SCHED_SMT
>  	if (has_big_cores) {
>  		pr_info("Big cores detected but using small core scheduling\n");
> -- 
> 2.31.1
Srikar Dronamraju Oct. 20, 2023, 9:09 a.m. UTC | #2
* Michael Ellerman <mpe@ellerman.id.au> [2023-10-19 15:33:16]:

> Srikar Dronamraju <srikar@linux.vnet.ibm.com> writes:
> > Currently the CPU feature flag is checked whenever powerpc_smt_flags()
> > gets called. This is unnecessary overhead: CPU_FTR_ASYM_SMT is set
> > based on the processor, so all processors will either have it set or
> > all will have it unset.
> 
> The cpu_has_feature() test is implemented with a static key.
> 
> So AFAICS this is just replacing one static key with another?
> 

> I see that you use the new static key in subsequent patches. But
> couldn't those just use the existing cpu feature test?
> 

Yes, we can just use the existing cpu feature test.

> Anyway I'd be interested to see how the generated code differs
> before/after this.
> 
---------------------------->8----------------------------------------------8<------------
Before this change:
0000000000000500 <powerpc_smt_flags>:
{
     500:	00 00 4c 3c 	addis   r2,r12,0
     504:	00 00 42 38 	addi    r2,r2,0
     508:	a6 02 08 7c 	mflr    r0
     50c:	01 00 00 48 	bl      50c <powerpc_smt_flags+0xc>
     510:	f8 ff e1 fb 	std     r31,-8(r1)
     514:	91 ff 21 f8 	stdu    r1,-112(r1)
#define JUMP_ENTRY_TYPE		stringify_in_c(FTR_ENTRY_LONG)
#define JUMP_LABEL_NOP_SIZE	4

static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
	asm_volatile_goto("1:\n\t"
     518:	00 00 00 60 	nop
		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
     51c:	00 00 22 3d 	addis   r9,r2,0
		flags |= SD_ASYM_PACKING;
     520:	80 05 e0 3b 	li      r31,1408
		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
     524:	00 00 29 89 	lbz     r9,0(r9)
     528:	00 00 09 2c 	cmpwi   r9,0
     52c:	28 00 82 41 	beq     554 <powerpc_smt_flags+0x54>
}
     530:	70 00 21 38 	addi    r1,r1,112
     534:	b4 07 e3 7f 	extsw   r3,r31
     538:	f8 ff e1 eb 	ld      r31,-8(r1)
     53c:	20 00 80 4e 	blr
	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
     540:	80 01 e0 3b 	li      r31,384
}
     544:	70 00 21 38 	addi    r1,r1,112
     548:	b4 07 e3 7f 	extsw   r3,r31
     54c:	f8 ff e1 eb 	ld      r31,-8(r1)
     550:	20 00 80 4e 	blr
		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
     554:	a6 02 08 7c 	mflr    r0
     558:	00 00 62 3c 	addis   r3,r2,0
     55c:	01 00 20 39 	li      r9,1
     560:	00 00 42 3d 	addis   r10,r2,0
     564:	00 00 63 38 	addi    r3,r3,0
     568:	00 00 2a 99 	stb     r9,0(r10)
     56c:	80 00 01 f8 	std     r0,128(r1)
     570:	01 00 00 48 	bl      570 <powerpc_smt_flags+0x70>
     574:	00 00 00 60 	nop
     578:	80 00 01 e8 	ld      r0,128(r1)
     57c:	a6 03 08 7c 	mtlr    r0
     580:	b0 ff ff 4b 	b       530 <powerpc_smt_flags+0x30>
     584:	00 00 00 60 	nop
     588:	00 00 00 60 	nop
     58c:	00 00 00 60 	nop


Post this change:
0000000000000340 <powerpc_smt_flags>:
{
     340:	a6 02 08 7c 	mflr    r0
     344:	01 00 00 48 	bl      344 <powerpc_smt_flags+0x4>
#define JUMP_ENTRY_TYPE		stringify_in_c(FTR_ENTRY_LONG)
#define JUMP_LABEL_NOP_SIZE	4

static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
	asm_volatile_goto("1:\n\t"
     348:	00 00 00 60 	nop
	return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
     34c:	80 01 60 38 	li      r3,384
}
     350:	b4 07 63 7c 	extsw   r3,r3
     354:	20 00 80 4e 	blr
     358:	00 00 00 60 	nop
     35c:	00 00 00 60 	nop
		return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING;
     360:	80 05 60 38 	li      r3,1408
}
     364:	b4 07 63 7c 	extsw   r3,r3
     368:	20 00 80 4e 	blr
     36c:	00 00 00 60 	nop

---------------------------->8----------------------------------------------8<------------

I think most of the difference is due to moving the pr_info_once to
fixup_topology. Does it make sense to move the pr_info_once from
powerpc_smt_flags to fixup_topology (which is called less often)?

Even though the pr_info_once would probably translate to a load + cmp +
branch, we could avoid that cost on each powerpc_smt_flags() call.
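
The once-guard behind printk_once()/pr_info_once() is roughly this (a
simplified sketch of the DO_ONCE_LITE() logic in
include/linux/once_lite.h; printk_once_sketch is an illustrative name):

#define printk_once_sketch(fmt, ...)			\
({							\
	static bool __already_done;			\
							\
	/* every call loads and tests the flag */	\
	if (unlikely(!__already_done)) {		\
		__already_done = true;			\
		printk(fmt, ##__VA_ARGS__);		\
	}						\
})

so each call pays the load + cmp + branch even after the message has
been printed once.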

So something like:

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5826f5108a12..bc22f775425b 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -993,13 +993,10 @@ static bool shared_caches;
 /* cpumask of CPUs with asymmetric SMT dependency */
 static int powerpc_smt_flags(void)
 {
-	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+	if (cpu_has_feature(CPU_FTR_ASYM_SMT))
+		return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING;
 
-	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
-		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
-		flags |= SD_ASYM_PACKING;
-	}
-	return flags;
+	return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
 }
 #endif
 
@@ -1687,6 +1684,9 @@ static void __init fixup_topology(void)
 	int i;
 
 #ifdef CONFIG_SCHED_SMT
+	if (cpu_has_feature(CPU_FTR_ASYM_SMT))
+		pr_info_once("Enabling Asymmetric SMT scheduling\n");
+
 	if (has_big_cores) {
 		pr_info("Big cores detected but using small core scheduling\n");
 		powerpc_topology[smt_idx].mask = smallcore_smt_mask;

Patch

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5826f5108a12..37c41297c9ce 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -988,18 +988,16 @@  static int __init init_thread_group_cache_map(int cpu, int cache_property)
 }
 
 static bool shared_caches;
+DEFINE_STATIC_KEY_FALSE(powerpc_asym_packing);
 
 #ifdef CONFIG_SCHED_SMT
 /* cpumask of CPUs with asymmetric SMT dependency */
 static int powerpc_smt_flags(void)
 {
-	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+	if (static_branch_unlikely(&powerpc_asym_packing))
+		return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING;
 
-	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
-		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
-		flags |= SD_ASYM_PACKING;
-	}
-	return flags;
+	return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
 }
 #endif
 
@@ -1686,6 +1684,11 @@  static void __init fixup_topology(void)
 {
 	int i;
 
+	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
+		pr_info_once("Enabling Asymmetric SMT scheduling\n");
+		static_branch_enable(&powerpc_asym_packing);
+	}
+
 #ifdef CONFIG_SCHED_SMT
 	if (has_big_cores) {
 		pr_info("Big cores detected but using small core scheduling\n");