Message ID | 20210726035036.739609-12-npiggin@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v1,01/55] KVM: PPC: Book3S HV: Remove TM emulation from POWER7/8 path | expand |
Le 26/07/2021 à 05:49, Nicholas Piggin a écrit : > Rather than have KVM look up the host timer and fiddle with the > irq-work internal details, have the powerpc/time.c code provide a > function for KVM to re-arm the Linux timer code when exiting a > guest. > > This is implementation has an improvement over existing code of > marking a decrementer interrupt as soft-pending if a timer has > expired, rather than setting DEC to a -ve value, which tended to > cause host timers to take two interrupts (first hdec to exit the > guest, then the immediate dec). > > Signed-off-by: Nicholas Piggin <npiggin@gmail.com> > --- > arch/powerpc/include/asm/time.h | 16 +++------- > arch/powerpc/kernel/time.c | 52 +++++++++++++++++++++++++++------ > arch/powerpc/kvm/book3s_hv.c | 7 ++--- > 3 files changed, 49 insertions(+), 26 deletions(-) > > diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h > index 69b6be617772..924b2157882f 100644 > --- a/arch/powerpc/include/asm/time.h > +++ b/arch/powerpc/include/asm/time.h > @@ -99,18 +99,6 @@ extern void div128_by_32(u64 dividend_high, u64 dividend_low, > extern void secondary_cpu_time_init(void); > extern void __init time_init(void); > > -#ifdef CONFIG_PPC64 > -static inline unsigned long test_irq_work_pending(void) > -{ > - unsigned long x; > - > - asm volatile("lbz %0,%1(13)" > - : "=r" (x) > - : "i" (offsetof(struct paca_struct, irq_work_pending))); > - return x; > -} > -#endif > - > DECLARE_PER_CPU(u64, decrementers_next_tb); > > static inline u64 timer_get_next_tb(void) > @@ -118,6 +106,10 @@ static inline u64 timer_get_next_tb(void) > return __this_cpu_read(decrementers_next_tb); > } > > +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE > +void timer_rearm_host_dec(u64 now); > +#endif > + > /* Convert timebase ticks to nanoseconds */ > unsigned long long tb_to_ns(unsigned long long tb_ticks); > > diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c > index 72d872b49167..016828b7401b 100644 > --- 
a/arch/powerpc/kernel/time.c > +++ b/arch/powerpc/kernel/time.c > @@ -499,6 +499,16 @@ EXPORT_SYMBOL(profile_pc); > * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable... > */ > #ifdef CONFIG_PPC64 > +static inline unsigned long test_irq_work_pending(void) > +{ > + unsigned long x; > + > + asm volatile("lbz %0,%1(13)" > + : "=r" (x) > + : "i" (offsetof(struct paca_struct, irq_work_pending))); Can we just use READ_ONCE() instead of hard coding the read ? > + return x; > +} > + > static inline void set_irq_work_pending_flag(void) > { > asm volatile("stb %0,%1(13)" : : > @@ -542,13 +552,44 @@ void arch_irq_work_raise(void) > preempt_enable(); > } > > +static void set_dec_or_work(u64 val) > +{ > + set_dec(val); > + /* We may have raced with new irq work */ > + if (unlikely(test_irq_work_pending())) > + set_dec(1); > +} > + > #else /* CONFIG_IRQ_WORK */ > > #define test_irq_work_pending() 0 > #define clear_irq_work_pending() > > +static void set_dec_or_work(u64 val) > +{ > + set_dec(val); > +} > #endif /* CONFIG_IRQ_WORK */ > > +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE > +void timer_rearm_host_dec(u64 now) > +{ > + u64 *next_tb = this_cpu_ptr(&decrementers_next_tb); > + > + WARN_ON_ONCE(!arch_irqs_disabled()); > + WARN_ON_ONCE(mfmsr() & MSR_EE); > + > + if (now >= *next_tb) { > + local_paca->irq_happened |= PACA_IRQ_DEC; > + } else { > + now = *next_tb - now; > + if (now <= decrementer_max) > + set_dec_or_work(now); > + } > +} > +EXPORT_SYMBOL_GPL(timer_rearm_host_dec); > +#endif > + > /* > * timer_interrupt - gets called when the decrementer overflows, > * with interrupts disabled. 
> @@ -609,10 +650,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt) > } else { > now = *next_tb - now; > if (now <= decrementer_max) > - set_dec(now); > - /* We may have raced with new irq work */ > - if (test_irq_work_pending()) > - set_dec(1); > + set_dec_or_work(now); > __this_cpu_inc(irq_stat.timer_irqs_others); > } > > @@ -854,11 +892,7 @@ static int decrementer_set_next_event(unsigned long evt, > struct clock_event_device *dev) > { > __this_cpu_write(decrementers_next_tb, get_tb() + evt); > - set_dec(evt); > - > - /* We may have raced with new irq work */ > - if (test_irq_work_pending()) > - set_dec(1); > + set_dec_or_work(evt); > > return 0; > } > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index 6e6cfb10e9bb..0cef578930f9 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -4018,11 +4018,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, > vc->entry_exit_map = 0x101; > vc->in_guest = 0; > > - next_timer = timer_get_next_tb(); > - set_dec(next_timer - tb); > - /* We may have raced with new irq work */ > - if (test_irq_work_pending()) > - set_dec(1); > + timer_rearm_host_dec(tb); > + > mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso); > > kvmhv_load_host_pmu(); >
Excerpts from Christophe Leroy's message of August 5, 2021 5:22 pm: > > > Le 26/07/2021 à 05:49, Nicholas Piggin a écrit : >> Rather than have KVM look up the host timer and fiddle with the >> irq-work internal details, have the powerpc/time.c code provide a >> function for KVM to re-arm the Linux timer code when exiting a >> guest. >> >> This is implementation has an improvement over existing code of >> marking a decrementer interrupt as soft-pending if a timer has >> expired, rather than setting DEC to a -ve value, which tended to >> cause host timers to take two interrupts (first hdec to exit the >> guest, then the immediate dec). >> >> Signed-off-by: Nicholas Piggin <npiggin@gmail.com> >> --- >> arch/powerpc/include/asm/time.h | 16 +++------- >> arch/powerpc/kernel/time.c | 52 +++++++++++++++++++++++++++------ >> arch/powerpc/kvm/book3s_hv.c | 7 ++--- >> 3 files changed, 49 insertions(+), 26 deletions(-) >> >> diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h >> index 69b6be617772..924b2157882f 100644 >> --- a/arch/powerpc/include/asm/time.h >> +++ b/arch/powerpc/include/asm/time.h >> @@ -99,18 +99,6 @@ extern void div128_by_32(u64 dividend_high, u64 dividend_low, >> extern void secondary_cpu_time_init(void); >> extern void __init time_init(void); >> >> -#ifdef CONFIG_PPC64 >> -static inline unsigned long test_irq_work_pending(void) >> -{ >> - unsigned long x; >> - >> - asm volatile("lbz %0,%1(13)" >> - : "=r" (x) >> - : "i" (offsetof(struct paca_struct, irq_work_pending))); >> - return x; >> -} >> -#endif >> - >> DECLARE_PER_CPU(u64, decrementers_next_tb); >> >> static inline u64 timer_get_next_tb(void) >> @@ -118,6 +106,10 @@ static inline u64 timer_get_next_tb(void) >> return __this_cpu_read(decrementers_next_tb); >> } >> >> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE >> +void timer_rearm_host_dec(u64 now); >> +#endif >> + >> /* Convert timebase ticks to nanoseconds */ >> unsigned long long tb_to_ns(unsigned long long tb_ticks); >> >> 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c >> index 72d872b49167..016828b7401b 100644 >> --- a/arch/powerpc/kernel/time.c >> +++ b/arch/powerpc/kernel/time.c >> @@ -499,6 +499,16 @@ EXPORT_SYMBOL(profile_pc); >> * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable... >> */ >> #ifdef CONFIG_PPC64 >> +static inline unsigned long test_irq_work_pending(void) >> +{ >> + unsigned long x; >> + >> + asm volatile("lbz %0,%1(13)" >> + : "=r" (x) >> + : "i" (offsetof(struct paca_struct, irq_work_pending))); > > Can we just use READ_ONCE() instead of hard coding the read ? Good question, probably yes. Probably calls for its own patch series, e.g., hw_irq.h has all similar paca accesses. Thanks, Nick
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 69b6be617772..924b2157882f 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -99,18 +99,6 @@ extern void div128_by_32(u64 dividend_high, u64 dividend_low, extern void secondary_cpu_time_init(void); extern void __init time_init(void); -#ifdef CONFIG_PPC64 -static inline unsigned long test_irq_work_pending(void) -{ - unsigned long x; - - asm volatile("lbz %0,%1(13)" - : "=r" (x) - : "i" (offsetof(struct paca_struct, irq_work_pending))); - return x; -} -#endif - DECLARE_PER_CPU(u64, decrementers_next_tb); static inline u64 timer_get_next_tb(void) @@ -118,6 +106,10 @@ static inline u64 timer_get_next_tb(void) return __this_cpu_read(decrementers_next_tb); } +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +void timer_rearm_host_dec(u64 now); +#endif + /* Convert timebase ticks to nanoseconds */ unsigned long long tb_to_ns(unsigned long long tb_ticks); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 72d872b49167..016828b7401b 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -499,6 +499,16 @@ EXPORT_SYMBOL(profile_pc); * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable... 
*/ #ifdef CONFIG_PPC64 +static inline unsigned long test_irq_work_pending(void) +{ + unsigned long x; + + asm volatile("lbz %0,%1(13)" + : "=r" (x) + : "i" (offsetof(struct paca_struct, irq_work_pending))); + return x; +} + static inline void set_irq_work_pending_flag(void) { asm volatile("stb %0,%1(13)" : : @@ -542,13 +552,44 @@ void arch_irq_work_raise(void) preempt_enable(); } +static void set_dec_or_work(u64 val) +{ + set_dec(val); + /* We may have raced with new irq work */ + if (unlikely(test_irq_work_pending())) + set_dec(1); +} + #else /* CONFIG_IRQ_WORK */ #define test_irq_work_pending() 0 #define clear_irq_work_pending() +static void set_dec_or_work(u64 val) +{ + set_dec(val); +} #endif /* CONFIG_IRQ_WORK */ +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +void timer_rearm_host_dec(u64 now) +{ + u64 *next_tb = this_cpu_ptr(&decrementers_next_tb); + + WARN_ON_ONCE(!arch_irqs_disabled()); + WARN_ON_ONCE(mfmsr() & MSR_EE); + + if (now >= *next_tb) { + local_paca->irq_happened |= PACA_IRQ_DEC; + } else { + now = *next_tb - now; + if (now <= decrementer_max) + set_dec_or_work(now); + } +} +EXPORT_SYMBOL_GPL(timer_rearm_host_dec); +#endif + /* * timer_interrupt - gets called when the decrementer overflows, * with interrupts disabled. 
@@ -609,10 +650,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt) } else { now = *next_tb - now; if (now <= decrementer_max) - set_dec(now); - /* We may have raced with new irq work */ - if (test_irq_work_pending()) - set_dec(1); + set_dec_or_work(now); __this_cpu_inc(irq_stat.timer_irqs_others); } @@ -854,11 +892,7 @@ static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev) { __this_cpu_write(decrementers_next_tb, get_tb() + evt); - set_dec(evt); - - /* We may have raced with new irq work */ - if (test_irq_work_pending()) - set_dec(1); + set_dec_or_work(evt); return 0; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6e6cfb10e9bb..0cef578930f9 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4018,11 +4018,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vc->entry_exit_map = 0x101; vc->in_guest = 0; - next_timer = timer_get_next_tb(); - set_dec(next_timer - tb); - /* We may have raced with new irq work */ - if (test_irq_work_pending()) - set_dec(1); + timer_rearm_host_dec(tb); + mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso); kvmhv_load_host_pmu();
Rather than have KVM look up the host timer and fiddle with the irq-work internal details, have the powerpc/time.c code provide a function for KVM to re-arm the Linux timer code when exiting a guest. This implementation has an improvement over the existing code: it marks a decrementer interrupt as soft-pending if a timer has expired, rather than setting DEC to a -ve value, which tended to cause host timers to take two interrupts (first hdec to exit the guest, then the immediate dec). Signed-off-by: Nicholas Piggin <npiggin@gmail.com> --- arch/powerpc/include/asm/time.h | 16 +++------- arch/powerpc/kernel/time.c | 52 +++++++++++++++++++++++++++------ arch/powerpc/kvm/book3s_hv.c | 7 ++--- 3 files changed, 49 insertions(+), 26 deletions(-)