diff mbox series

[2/2] armv8: generic_timer: Use event stream for udelay

Message ID 20240423081005.23218-3-peter.hoyes@arm.com
State Superseded
Delegated to: Tom Rini
Headers show
Series Optimize udelay for aarch64 boards | expand

Commit Message

Peter Hoyes April 23, 2024, 8:10 a.m. UTC
From: Peter Hoyes <Peter.Hoyes@arm.com>

Polling cntpct_el0 in a tight loop for delays is inefficient.
This is particularly apparent on Arm FVPs, which do not simulate
real time, meaning that a 1s sleep can take a couple of orders
of magnitude longer to execute in wall time.

If running at EL2 or above (where CNTHCTL_EL2 is available), enable
the cntpct_el0 event stream temporarily and use wfe to implement
the delay more efficiently. The event period is chosen as a
trade-off between efficiency and the fact that Arm FVPs do not
typically simulate real time.

This is only implemented for Armv8 boards, where an architectural
timer exists.

Some mach-socfpga AArch64 boards already override __udelay to make
it always inline, so guard the functionality with a new
ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.

Signed-off-by: Peter Hoyes <Peter.Hoyes@arm.com>
---
 arch/arm/cpu/armv8/Kconfig         |  8 ++++++++
 arch/arm/cpu/armv8/generic_timer.c | 27 +++++++++++++++++++++++++++
 arch/arm/include/asm/system.h      |  6 ++++--
 3 files changed, 39 insertions(+), 2 deletions(-)

Comments

Quentin Schulz April 23, 2024, 10:55 a.m. UTC | #1
Hi Peter,

On 4/23/24 10:10, Peter Hoyes wrote:
> From: Peter Hoyes <Peter.Hoyes@arm.com>
> 
> Polling cntpct_el0 in a tight loop for delays is inefficient.
> This is particularly apparent on Arm FVPs, which do not simulate
> real time, meaning that a 1s sleep can take a couple of orders
> of magnitude longer to execute in wall time.
> 
> If running at EL2 or above (where CNTHCTL_EL2 is available), enable
> the cntpct_el0 event stream temporarily and use wfe to implement
> the delay more efficiently. The event period is chosen as a
> trade-off between efficiency and the fact that Arm FVPs do not
> typically simulate real time.
> 
> This is only implemented for Armv8 boards, where an architectural
> timer exists.
> 
> Some mach-socfpga AArch64 boards already override __udelay to make
> it always inline, so guard the functionality with a new
> ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.
> 
> Signed-off-by: Peter Hoyes <Peter.Hoyes@arm.com>
> ---
>   arch/arm/cpu/armv8/Kconfig         |  8 ++++++++
>   arch/arm/cpu/armv8/generic_timer.c | 27 +++++++++++++++++++++++++++
>   arch/arm/include/asm/system.h      |  6 ++++--
>   3 files changed, 39 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig
> index 9f0fb369f7..544c5e2d74 100644
> --- a/arch/arm/cpu/armv8/Kconfig
> +++ b/arch/arm/cpu/armv8/Kconfig
> @@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST
>   	  Exception handling at all exception levels for External Abort and
>   	  SError interrupt exception are taken in EL3.
>   
> +config ARMV8_UDELAY_EVENT_STREAM
> +	bool "Use the event stream for udelay"
> +	default y if !ARCH_SOCFPGA
> +	help
> +	  Use the event stream provided by the AArch64 architectural timer for
> +	  delays. This is more efficient than the default polling
> +	  implementation.
> +
>   menuconfig ARMV8_CRYPTO
>   	bool "ARM64 Accelerated Cryptographic Algorithms"
>   
> diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c
> index 8f83372cbc..e18b5c8187 100644
> --- a/arch/arm/cpu/armv8/generic_timer.c
> +++ b/arch/arm/cpu/armv8/generic_timer.c
> @@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
>   
>   	return val / get_tbclk();
>   }
> +
> +#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM)
> +void __udelay(unsigned long usec)
> +{
> +	u64 target = get_ticks() + usec_to_tick(usec);
> +

This can theoretically overflow, do we have any guarantee this cannot 
happen in real life, like... we would need U-Boot to be running for 100 
years without being powered-down/reset or something like that? Can we 
document this assumption? Does this make sense?

> +	/* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so often */
> +	if (current_el() >= 2) {
> +		u32 cnthctl_val;
> +		const u8 event_period = 0x7;
> +
> +		asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
> +		asm volatile("msr cnthctl_el2, %0" : : "r"
> +			(cnthctl_val | CNTHCTL_EL2_EVNT_EN | CNTHCTL_EL2_EVNT_I(event_period)));
> +
> +		while (get_ticks() + (1ULL << event_period) <= target)

This could be an overflow as well.

> +			wfe();
> +
> +		/* Reset the event stream */
> +		asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
> +	}
> +
> +	/* Fall back to polling CNTPCT_EL0 */
> +	while (get_ticks() <= target)

get_ticks() could wrap around here maybe?

Cheers,
Quentin
Peter Hoyes April 24, 2024, 10:25 a.m. UTC | #2
Hi Quentin,

On 4/23/24 11:55, Quentin Schulz wrote:
> Hi Peter,
>
> On 4/23/24 10:10, Peter Hoyes wrote:
>> From: Peter Hoyes <Peter.Hoyes@arm.com>
>>
>> Polling cntpct_el0 in a tight loop for delays is inefficient.
>> This is particularly apparent on Arm FVPs, which do not simulate
>> real time, meaning that a 1s sleep can take a couple of orders
>> of magnitude longer to execute in wall time.
>>
>> If running at EL2 or above (where CNTHCTL_EL2 is available), enable
>> the cntpct_el0 event stream temporarily and use wfe to implement
>> the delay more efficiently. The event period is chosen as a
>> trade-off between efficiency and the fact that Arm FVPs do not
>> typically simulate real time.
>>
>> This is only implemented for Armv8 boards, where an architectural
>> timer exists.
>>
>> Some mach-socfpga AArch64 boards already override __udelay to make
>> it always inline, so guard the functionality with a new
>> ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.
>>
>> Signed-off-by: Peter Hoyes <Peter.Hoyes@arm.com>
>> ---
>>   arch/arm/cpu/armv8/Kconfig         |  8 ++++++++
>>   arch/arm/cpu/armv8/generic_timer.c | 27 +++++++++++++++++++++++++++
>>   arch/arm/include/asm/system.h      |  6 ++++--
>>   3 files changed, 39 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig
>> index 9f0fb369f7..544c5e2d74 100644
>> --- a/arch/arm/cpu/armv8/Kconfig
>> +++ b/arch/arm/cpu/armv8/Kconfig
>> @@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST
>>         Exception handling at all exception levels for External Abort 
>> and
>>         SError interrupt exception are taken in EL3.
>>   +config ARMV8_UDELAY_EVENT_STREAM
>> +    bool "Use the event stream for udelay"
>> +    default y if !ARCH_SOCFPGA
>> +    help
>> +      Use the event stream provided by the AArch64 architectural 
>> timer for
>> +      delays. This is more efficient than the default polling
>> +      implementation.
>> +
>>   menuconfig ARMV8_CRYPTO
>>       bool "ARM64 Accelerated Cryptographic Algorithms"
>>   diff --git a/arch/arm/cpu/armv8/generic_timer.c 
>> b/arch/arm/cpu/armv8/generic_timer.c
>> index 8f83372cbc..e18b5c8187 100644
>> --- a/arch/arm/cpu/armv8/generic_timer.c
>> +++ b/arch/arm/cpu/armv8/generic_timer.c
>> @@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
>>         return val / get_tbclk();
>>   }
>> +
>> +#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM)
>> +void __udelay(unsigned long usec)
>> +{
>> +    u64 target = get_ticks() + usec_to_tick(usec);
>> +
>
> This can theoretically overflow, do we have any guarantee this cannot 
> happen in real life, like... we would need U-Boot to be running for 
> 100 years without being powered-down/reset or something like that? Can 
> we document this assumption? Does this make sense?
>
>> +    /* At EL2 or above, use the event stream to avoid polling 
>> CNTPCT_EL0 so often */
>> +    if (current_el() >= 2) {
>> +        u32 cnthctl_val;
>> +        const u8 event_period = 0x7;
>> +
>> +        asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
>> +        asm volatile("msr cnthctl_el2, %0" : : "r"
>> +            (cnthctl_val | CNTHCTL_EL2_EVNT_EN | 
>> CNTHCTL_EL2_EVNT_I(event_period)));
>> +
>> +        while (get_ticks() + (1ULL << event_period) <= target)
>
> This could be an overflow as well.
>
>> +            wfe();
>> +
>> +        /* Reset the event stream */
>> +        asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
>> +    }
>> +
>> +    /* Fall back to polling CNTPCT_EL0 */
>> +    while (get_ticks() <= target)
>
> get_ticks() could wrap around here maybe?
I don't see a problem here. It's using u64s and the maximum 
COUNTER_FREQUENCY in the U-Boot source is 200MHz, which gives enough 
ticks for about 3 millennia.

Additionally, the underlying "while" condition is the same as the 
existing weak __udelay implementation
(https://source.denx.de/u-boot/u-boot/-/blob/master/lib/time.c?ref_type=heads#L181), 
so this change doesn't affect the overflow limit.

Thanks,
Peter
Andre Przywara April 24, 2024, 10:29 a.m. UTC | #3
On Tue, 23 Apr 2024 12:55:55 +0200
Quentin Schulz <quentin.schulz@theobroma-systems.com> wrote:

> Hi Peter,
> 
> On 4/23/24 10:10, Peter Hoyes wrote:
> > From: Peter Hoyes <Peter.Hoyes@arm.com>
> > 
> > Polling cntpct_el0 in a tight loop for delays is inefficient.
> > This is particularly apparent on Arm FVPs, which do not simulate
> > real time, meaning that a 1s sleep can take a couple of orders
> > of magnitude longer to execute in wall time.
> > 
> > If running at EL2 or above (where CNTHCTL_EL2 is available), enable
> > the cntpct_el0 event stream temporarily and use wfe to implement
> > the delay more efficiently. The event period is chosen as a
> > trade-off between efficiency and the fact that Arm FVPs do not
> > typically simulate real time.
> > 
> > This is only implemented for Armv8 boards, where an architectural
> > timer exists.
> > 
> > Some mach-socfpga AArch64 boards already override __udelay to make
> > it always inline, so guard the functionality with a new
> > ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.
> > 
> > Signed-off-by: Peter Hoyes <Peter.Hoyes@arm.com>
> > ---
> >   arch/arm/cpu/armv8/Kconfig         |  8 ++++++++
> >   arch/arm/cpu/armv8/generic_timer.c | 27 +++++++++++++++++++++++++++
> >   arch/arm/include/asm/system.h      |  6 ++++--
> >   3 files changed, 39 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig
> > index 9f0fb369f7..544c5e2d74 100644
> > --- a/arch/arm/cpu/armv8/Kconfig
> > +++ b/arch/arm/cpu/armv8/Kconfig
> > @@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST
> >   	  Exception handling at all exception levels for External Abort and
> >   	  SError interrupt exception are taken in EL3.
> >   
> > +config ARMV8_UDELAY_EVENT_STREAM
> > +	bool "Use the event stream for udelay"
> > +	default y if !ARCH_SOCFPGA
> > +	help
> > +	  Use the event stream provided by the AArch64 architectural timer for
> > +	  delays. This is more efficient than the default polling
> > +	  implementation.
> > +
> >   menuconfig ARMV8_CRYPTO
> >   	bool "ARM64 Accelerated Cryptographic Algorithms"
> >   
> > diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c
> > index 8f83372cbc..e18b5c8187 100644
> > --- a/arch/arm/cpu/armv8/generic_timer.c
> > +++ b/arch/arm/cpu/armv8/generic_timer.c
> > @@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
> >   
> >   	return val / get_tbclk();
> >   }
> > +
> > +#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM)
> > +void __udelay(unsigned long usec)
> > +{
> > +	u64 target = get_ticks() + usec_to_tick(usec);
> > +  
> 
> This can theoretically overflow, do we have any guarantee this cannot 
> happen in real life, like... we would need U-Boot to be running for 100 
> years without being powered-down/reset or something like that? Can we 
> document this assumption? Does this make sense?

The Arm ARM guarantees a "Roll-over time of not less than 40 years."
(Armv8 ARM 0487K.a D12.1.2 "The system counter").
So that's not the 100 years you are asking for, but I guess still good
enough?

Cheers,
Andre

> 
> > +	/* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so often */
> > +	if (current_el() >= 2) {
> > +		u32 cnthctl_val;
> > +		const u8 event_period = 0x7;
> > +
> > +		asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
> > +		asm volatile("msr cnthctl_el2, %0" : : "r"
> > +			(cnthctl_val | CNTHCTL_EL2_EVNT_EN | CNTHCTL_EL2_EVNT_I(event_period)));
> > +
> > +		while (get_ticks() + (1ULL << event_period) <= target)  
> 
> This could be an overflow as well.
> 
> > +			wfe();
> > +
> > +		/* Reset the event stream */
> > +		asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
> > +	}
> > +
> > +	/* Fall back to polling CNTPCT_EL0 */
> > +	while (get_ticks() <= target)  
> 
> get_ticks() could wrap around here maybe?
> 
> Cheers,
> Quentin
Quentin Schulz April 24, 2024, 10:56 a.m. UTC | #4
Hi Peter, Andre,

On 4/24/24 12:29, Andre Przywara wrote:
> On Tue, 23 Apr 2024 12:55:55 +0200
> Quentin Schulz <quentin.schulz@theobroma-systems.com> wrote:
> 
>> Hi Peter,
>>
>> On 4/23/24 10:10, Peter Hoyes wrote:
>>> From: Peter Hoyes <Peter.Hoyes@arm.com>
>>>
>>> Polling cntpct_el0 in a tight loop for delays is inefficient.
>>> This is particularly apparent on Arm FVPs, which do not simulate
>>> real time, meaning that a 1s sleep can take a couple of orders
>>> of magnitude longer to execute in wall time.
>>>
>>> If running at EL2 or above (where CNTHCTL_EL2 is available), enable
>>> the cntpct_el0 event stream temporarily and use wfe to implement
>>> the delay more efficiently. The event period is chosen as a
>>> trade-off between efficiency and the fact that Arm FVPs do not
>>> typically simulate real time.
>>>
>>> This is only implemented for Armv8 boards, where an architectural
>>> timer exists.
>>>
>>> Some mach-socfpga AArch64 boards already override __udelay to make
>>> it always inline, so guard the functionality with a new
>>> ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.
>>>
>>> Signed-off-by: Peter Hoyes <Peter.Hoyes@arm.com>
>>> ---
>>>    arch/arm/cpu/armv8/Kconfig         |  8 ++++++++
>>>    arch/arm/cpu/armv8/generic_timer.c | 27 +++++++++++++++++++++++++++
>>>    arch/arm/include/asm/system.h      |  6 ++++--
>>>    3 files changed, 39 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig
>>> index 9f0fb369f7..544c5e2d74 100644
>>> --- a/arch/arm/cpu/armv8/Kconfig
>>> +++ b/arch/arm/cpu/armv8/Kconfig
>>> @@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST
>>>    	  Exception handling at all exception levels for External Abort and
>>>    	  SError interrupt exception are taken in EL3.
>>>    
>>> +config ARMV8_UDELAY_EVENT_STREAM
>>> +	bool "Use the event stream for udelay"
>>> +	default y if !ARCH_SOCFPGA
>>> +	help
>>> +	  Use the event stream provided by the AArch64 architectural timer for
>>> +	  delays. This is more efficient than the default polling
>>> +	  implementation.
>>> +
>>>    menuconfig ARMV8_CRYPTO
>>>    	bool "ARM64 Accelerated Cryptographic Algorithms"
>>>    
>>> diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c
>>> index 8f83372cbc..e18b5c8187 100644
>>> --- a/arch/arm/cpu/armv8/generic_timer.c
>>> +++ b/arch/arm/cpu/armv8/generic_timer.c
>>> @@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
>>>    
>>>    	return val / get_tbclk();
>>>    }
>>> +
>>> +#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM)
>>> +void __udelay(unsigned long usec)
>>> +{
>>> +	u64 target = get_ticks() + usec_to_tick(usec);
>>> +
>>
>> This can theoretically overflow, do we have any guarantee this cannot
>> happen in real life, like... we would need U-Boot to be running for 100
>> years without being powered-down/reset or something like that? Can we
>> document this assumption? Does this make sense?
> 
> The Arm ARM guarantees a "Roll-over time of not less than 40 years."
> (Armv8 ARM 0487K.a D12.1.2 "The system counter").
> So that's not the 100 years you are asking for, but I guess still good
> enough?
> 

I guess it is, since it is also stored in a u64 and is reset to 0 upon 
start-up according to the ARM. I also assume nobody is going to add a 
udelay of years in their code :) (and if they do, they would probably 
figure out something's wrong before it reaches the final products :) ).

Thanks all for the pointers to reference manuals and current 
implementations.

Cheers,
Quentin
Andre Przywara April 29, 2024, 3:16 p.m. UTC | #5
On Tue, 23 Apr 2024 09:10:05 +0100
Peter Hoyes <peter.hoyes@arm.com> wrote:

Hi,

> From: Peter Hoyes <Peter.Hoyes@arm.com>
> 
> Polling cntpct_el0 in a tight loop for delays is inefficient.
> This is particularly apparent on Arm FVPs, which do not simulate
> real time, meaning that a 1s sleep can take a couple of orders
> of magnitude longer to execute in wall time.
> 
> If running at EL2 or above (where CNTHCTL_EL2 is available), enable
> the cntpct_el0 event stream temporarily and use wfe to implement
> the delay more efficiently. The event period is chosen as a
> trade-off between efficiency and the fact that Arm FVPs do not
> typically simulate real time.
> 
> This is only implemented for Armv8 boards, where an architectural
> timer exists.
> 
> Some mach-socfpga AArch64 boards already override __udelay to make
> it always inline, so guard the functionality with a new
> ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.

So the code looks alright, and it works for me, tested on some hardware I
just had at hand.
However I am a bit worried about the scope of this patch. While it's
indeed purely architectural and should work on all systems, I think we
need to be careful with messing with the arch timer *while the OS is
running*. U-Boot code will run for UEFI runtime services and for serving
PSCI on some platforms, and I guess messing with CNTHCTL_EL2 is not a good
idea then. And udelay sounds like a basic function that this code might
use.
So I wonder if we should limit this to the Arm FVPs for now? To not create
subtle and hard-to-diagnose problems for a lot of boards?

Or maybe there is some mechanism to limit this to U-Boot boot time/UEFI
boot services?

> Signed-off-by: Peter Hoyes <Peter.Hoyes@arm.com>
> ---
>  arch/arm/cpu/armv8/Kconfig         |  8 ++++++++
>  arch/arm/cpu/armv8/generic_timer.c | 27 +++++++++++++++++++++++++++
>  arch/arm/include/asm/system.h      |  6 ++++--
>  3 files changed, 39 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig
> index 9f0fb369f7..544c5e2d74 100644
> --- a/arch/arm/cpu/armv8/Kconfig
> +++ b/arch/arm/cpu/armv8/Kconfig
> @@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST
>  	  Exception handling at all exception levels for External Abort and
>  	  SError interrupt exception are taken in EL3.
>  
> +config ARMV8_UDELAY_EVENT_STREAM
> +	bool "Use the event stream for udelay"
> +	default y if !ARCH_SOCFPGA

As described above, change this to something like "... if ARCH_VEXPRESS64".

Cheers,
Andre

> +	help
> +	  Use the event stream provided by the AArch64 architectural timer for
> +	  delays. This is more efficient than the default polling
> +	  implementation.
> +
>  menuconfig ARMV8_CRYPTO
>  	bool "ARM64 Accelerated Cryptographic Algorithms"
>  
> diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c
> index 8f83372cbc..e18b5c8187 100644
> --- a/arch/arm/cpu/armv8/generic_timer.c
> +++ b/arch/arm/cpu/armv8/generic_timer.c
> @@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
>  
>  	return val / get_tbclk();
>  }
> +
> +#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM)
> +void __udelay(unsigned long usec)
> +{
> +	u64 target = get_ticks() + usec_to_tick(usec);
> +
> +	/* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so often */
> +	if (current_el() >= 2) {
> +		u32 cnthctl_val;
> +		const u8 event_period = 0x7;
> +
> +		asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
> +		asm volatile("msr cnthctl_el2, %0" : : "r"
> +			(cnthctl_val | CNTHCTL_EL2_EVNT_EN | CNTHCTL_EL2_EVNT_I(event_period)));
> +
> +		while (get_ticks() + (1ULL << event_period) <= target)
> +			wfe();
> +
> +		/* Reset the event stream */
> +		asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
> +	}
> +
> +	/* Fall back to polling CNTPCT_EL0 */
> +	while (get_ticks() <= target)
> +		;
> +}
> +#endif
> diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h
> index 51123c2968..7e30cac32a 100644
> --- a/arch/arm/include/asm/system.h
> +++ b/arch/arm/include/asm/system.h
> @@ -69,8 +69,10 @@
>  /*
>   * CNTHCTL_EL2 bits definitions
>   */
> -#define CNTHCTL_EL2_EL1PCEN_EN	(1 << 1)  /* Physical timer regs accessible   */
> -#define CNTHCTL_EL2_EL1PCTEN_EN	(1 << 0)  /* Physical counter accessible      */
> +#define CNTHCTL_EL2_EVNT_EN	BIT(2)	     /* Enable the event stream       */
> +#define CNTHCTL_EL2_EVNT_I(val)	((val) << 4) /* Event stream trigger bits     */
> +#define CNTHCTL_EL2_EL1PCEN_EN	(1 << 1)     /* Physical timer regs accessible */
> +#define CNTHCTL_EL2_EL1PCTEN_EN	(1 << 0)     /* Physical counter accessible   */
>  
>  /*
>   * HCR_EL2 bits definitions
Peter Hoyes May 1, 2024, 8:24 a.m. UTC | #6
On 4/29/24 16:16, Andre Przywara wrote:
> On Tue, 23 Apr 2024 09:10:05 +0100
> Peter Hoyes <peter.hoyes@arm.com> wrote:
>
> Hi,
>
>> From: Peter Hoyes <Peter.Hoyes@arm.com>
>>
>> Polling cntpct_el0 in a tight loop for delays is inefficient.
>> This is particularly apparent on Arm FVPs, which do not simulate
>> real time, meaning that a 1s sleep can take a couple of orders
>> of magnitude longer to execute in wall time.
>>
>> If running at EL2 or above (where CNTHCTL_EL2 is available), enable
>> the cntpct_el0 event stream temporarily and use wfe to implement
>> the delay more efficiently. The event period is chosen as a
>> trade-off between efficiency and the fact that Arm FVPs do not
>> typically simulate real time.
>>
>> This is only implemented for Armv8 boards, where an architectural
>> timer exists.
>>
>> Some mach-socfpga AArch64 boards already override __udelay to make
>> it always inline, so guard the functionality with a new
>> ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.
> So the code looks alright, and it works for me, tested on some hardware I
> just had at hand.
> However I am a bit worried about the scope of this patch. While it's
> indeed purely architectural and should work on all systems, I think we
> need to be careful with messing with the arch timer *while the OS is
> running*. U-Boot code will run for UEFI runtime services and for serving
> PSCI on some platforms, and I guess messing with CNTHCTL_EL2 is not a good
> idea then. And udelay sounds like a basic function that this code might
> use.
> So I wonder if we should limit this to the Arm FVPs for now? To not create
> subtle and hard-to-diagnose problems for a lot of boards?
>
> Or maybe there is some mechanism to limit this to U-Boot boot time/UEFI
> boot services?
>
>> Signed-off-by: Peter Hoyes <Peter.Hoyes@arm.com>
>> ---
>>   arch/arm/cpu/armv8/Kconfig         |  8 ++++++++
>>   arch/arm/cpu/armv8/generic_timer.c | 27 +++++++++++++++++++++++++++
>>   arch/arm/include/asm/system.h      |  6 ++++--
>>   3 files changed, 39 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig
>> index 9f0fb369f7..544c5e2d74 100644
>> --- a/arch/arm/cpu/armv8/Kconfig
>> +++ b/arch/arm/cpu/armv8/Kconfig
>> @@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST
>>   	  Exception handling at all exception levels for External Abort and
>>   	  SError interrupt exception are taken in EL3.
>>   
>> +config ARMV8_UDELAY_EVENT_STREAM
>> +	bool "Use the event stream for udelay"
>> +	default y if !ARCH_SOCFPGA
> As described above, change this to something like "... if ARCH_VEXPRESS64".
>
> Cheers,
> Andre
>
Agreed, this is limited to ARCH_VEXPRESS64 in v2.

Peter

>> +	help
>> +	  Use the event stream provided by the AArch64 architectural timer for
>> +	  delays. This is more efficient than the default polling
>> +	  implementation.
>> +
>>   menuconfig ARMV8_CRYPTO
>>   	bool "ARM64 Accelerated Cryptographic Algorithms"
>>   
>> diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c
>> index 8f83372cbc..e18b5c8187 100644
>> --- a/arch/arm/cpu/armv8/generic_timer.c
>> +++ b/arch/arm/cpu/armv8/generic_timer.c
>> @@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
>>   
>>   	return val / get_tbclk();
>>   }
>> +
>> +#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM)
>> +void __udelay(unsigned long usec)
>> +{
>> +	u64 target = get_ticks() + usec_to_tick(usec);
>> +
>> +	/* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so often */
>> +	if (current_el() >= 2) {
>> +		u32 cnthctl_val;
>> +		const u8 event_period = 0x7;
>> +
>> +		asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
>> +		asm volatile("msr cnthctl_el2, %0" : : "r"
>> +			(cnthctl_val | CNTHCTL_EL2_EVNT_EN | CNTHCTL_EL2_EVNT_I(event_period)));
>> +
>> +		while (get_ticks() + (1ULL << event_period) <= target)
>> +			wfe();
>> +
>> +		/* Reset the event stream */
>> +		asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
>> +	}
>> +
>> +	/* Fall back to polling CNTPCT_EL0 */
>> +	while (get_ticks() <= target)
>> +		;
>> +}
>> +#endif
>> diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h
>> index 51123c2968..7e30cac32a 100644
>> --- a/arch/arm/include/asm/system.h
>> +++ b/arch/arm/include/asm/system.h
>> @@ -69,8 +69,10 @@
>>   /*
>>    * CNTHCTL_EL2 bits definitions
>>    */
>> -#define CNTHCTL_EL2_EL1PCEN_EN	(1 << 1)  /* Physical timer regs accessible   */
>> -#define CNTHCTL_EL2_EL1PCTEN_EN	(1 << 0)  /* Physical counter accessible      */
>> +#define CNTHCTL_EL2_EVNT_EN	BIT(2)	     /* Enable the event stream       */
>> +#define CNTHCTL_EL2_EVNT_I(val)	((val) << 4) /* Event stream trigger bits     */
>> +#define CNTHCTL_EL2_EL1PCEN_EN	(1 << 1)     /* Physical timer regs accessible */
>> +#define CNTHCTL_EL2_EL1PCTEN_EN	(1 << 0)     /* Physical counter accessible   */
>>   
>>   /*
>>    * HCR_EL2 bits definitions
diff mbox series

Patch

diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig
index 9f0fb369f7..544c5e2d74 100644
--- a/arch/arm/cpu/armv8/Kconfig
+++ b/arch/arm/cpu/armv8/Kconfig
@@ -191,6 +191,14 @@  config ARMV8_EA_EL3_FIRST
 	  Exception handling at all exception levels for External Abort and
 	  SError interrupt exception are taken in EL3.
 
+config ARMV8_UDELAY_EVENT_STREAM
+	bool "Use the event stream for udelay"
+	default y if !ARCH_SOCFPGA
+	help
+	  Use the event stream provided by the AArch64 architectural timer for
+	  delays. This is more efficient than the default polling
+	  implementation.
+
 menuconfig ARMV8_CRYPTO
 	bool "ARM64 Accelerated Cryptographic Algorithms"
 
diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c
index 8f83372cbc..e18b5c8187 100644
--- a/arch/arm/cpu/armv8/generic_timer.c
+++ b/arch/arm/cpu/armv8/generic_timer.c
@@ -115,3 +115,30 @@  ulong timer_get_boot_us(void)
 
 	return val / get_tbclk();
 }
+
+#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM)
+void __udelay(unsigned long usec)
+{
+	u64 target = get_ticks() + usec_to_tick(usec);
+
+	/* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so often */
+	if (current_el() >= 2) {
+		u32 cnthctl_val;
+		const u8 event_period = 0x7;
+
+		asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
+		asm volatile("msr cnthctl_el2, %0" : : "r"
+			(cnthctl_val | CNTHCTL_EL2_EVNT_EN | CNTHCTL_EL2_EVNT_I(event_period)));
+
+		while (get_ticks() + (1ULL << event_period) <= target)
+			wfe();
+
+		/* Reset the event stream */
+		asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
+	}
+
+	/* Fall back to polling CNTPCT_EL0 */
+	while (get_ticks() <= target)
+		;
+}
+#endif
diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h
index 51123c2968..7e30cac32a 100644
--- a/arch/arm/include/asm/system.h
+++ b/arch/arm/include/asm/system.h
@@ -69,8 +69,10 @@ 
 /*
  * CNTHCTL_EL2 bits definitions
  */
-#define CNTHCTL_EL2_EL1PCEN_EN	(1 << 1)  /* Physical timer regs accessible   */
-#define CNTHCTL_EL2_EL1PCTEN_EN	(1 << 0)  /* Physical counter accessible      */
+#define CNTHCTL_EL2_EVNT_EN	BIT(2)	     /* Enable the event stream       */
+#define CNTHCTL_EL2_EVNT_I(val)	((val) << 4) /* Event stream trigger bits     */
+#define CNTHCTL_EL2_EL1PCEN_EN	(1 << 1)     /* Physical timer regs accessible */
+#define CNTHCTL_EL2_EL1PCTEN_EN	(1 << 0)     /* Physical counter accessible   */
 
 /*
  * HCR_EL2 bits definitions