Message ID | 20111221013443.GN8378@schlenkerla.am.freescale.net (mailing list archive) |
---|---|
State | Not Applicable |
Headers | show |
On 21.12.2011, at 02:34, Scott Wood wrote:

> Chips such as e500mc that implement category E.HV in Power ISA 2.06
> provide hardware virtualization features, including a new MSR mode for
> guest state. The guest OS can perform many operations without trapping
> into the hypervisor, including transitions to and from guest userspace.
>
> Since we can use SRR1[GS] to reliably tell whether an exception came from
> guest state, instead of messing around with IVPR, we use DO_KVM similarly
> to book3s.

Is there any benefit of using DO_KVM? I would assume that messing with IVPR is faster.

>
> Current issues include:
> - Machine checks from guest state are not routed to the host handler.
> - The guest can cause a host oops by executing an emulated instruction
> in a page that lacks read permission. Existing e500/4xx support has
> the same problem.

We solve that in book3s pr by doing

    LAST_INST = <known bad value>;
    PACA->kvm_mode = <recover at next inst>;
    lwz(guest pc);
    do_more_stuff();

That way when an exception occurs at lwz() the DO_KVM handler checks that we're in kvm mode "recover" which does basically srr0+=4; rfi;.

>
> Includes work by Ashish Kalra <Ashish.Kalra@freescale.com>,
> Varun Sethi <Varun.Sethi@freescale.com>, and
> Liu Yu <yu.liu@freescale.com>.
> > Signed-off-by: Scott Wood <scottwood@freescale.com> > --- > arch/powerpc/include/asm/dbell.h | 1 + > arch/powerpc/include/asm/kvm_asm.h | 8 + > arch/powerpc/include/asm/kvm_booke_hv_asm.h | 49 +++ > arch/powerpc/include/asm/kvm_host.h | 19 +- > arch/powerpc/include/asm/kvm_ppc.h | 3 + > arch/powerpc/include/asm/mmu-book3e.h | 6 + > arch/powerpc/include/asm/processor.h | 3 + > arch/powerpc/include/asm/reg.h | 2 + > arch/powerpc/include/asm/reg_booke.h | 34 ++ > arch/powerpc/kernel/asm-offsets.c | 15 +- > arch/powerpc/kernel/head_booke.h | 28 ++- > arch/powerpc/kvm/Kconfig | 3 + > arch/powerpc/kvm/booke.c | 398 ++++++++++++++----- > arch/powerpc/kvm/booke.h | 24 +- > arch/powerpc/kvm/booke_emulate.c | 23 +- > arch/powerpc/kvm/bookehv_interrupts.S | 587 +++++++++++++++++++++++++++ > arch/powerpc/kvm/powerpc.c | 5 + > arch/powerpc/kvm/timing.h | 6 + > 18 files changed, 1107 insertions(+), 107 deletions(-) > create mode 100644 arch/powerpc/include/asm/kvm_booke_hv_asm.h > create mode 100644 arch/powerpc/kvm/bookehv_interrupts.S > > diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h > index efa74ac..d7365b0 100644 > --- a/arch/powerpc/include/asm/dbell.h > +++ b/arch/powerpc/include/asm/dbell.h > @@ -19,6 +19,7 @@ > > #define PPC_DBELL_MSG_BRDCAST (0x04000000) > #define PPC_DBELL_TYPE(x) (((x) & 0xf) << (63-36)) > +#define PPC_DBELL_LPID(x) ((x) << (63 - 49)) > enum ppc_dbell { > PPC_DBELL = 0, /* doorbell */ > PPC_DBELL_CRIT = 1, /* critical doorbell */ > diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h > index 7b1f0e0..0978152 100644 > --- a/arch/powerpc/include/asm/kvm_asm.h > +++ b/arch/powerpc/include/asm/kvm_asm.h > @@ -48,6 +48,14 @@ > #define BOOKE_INTERRUPT_SPE_FP_DATA 33 > #define BOOKE_INTERRUPT_SPE_FP_ROUND 34 > #define BOOKE_INTERRUPT_PERFORMANCE_MONITOR 35 > +#define BOOKE_INTERRUPT_DOORBELL 36 > +#define BOOKE_INTERRUPT_DOORBELL_CRITICAL 37 > + > +/* booke_hv */ > +#define 
BOOKE_INTERRUPT_GUEST_DBELL 38 > +#define BOOKE_INTERRUPT_GUEST_DBELL_CRIT 39 > +#define BOOKE_INTERRUPT_HV_SYSCALL 40 > +#define BOOKE_INTERRUPT_HV_PRIV 41 > > /* book3s */ > > diff --git a/arch/powerpc/include/asm/kvm_booke_hv_asm.h b/arch/powerpc/include/asm/kvm_booke_hv_asm.h > new file mode 100644 > index 0000000..30a600f > --- /dev/null > +++ b/arch/powerpc/include/asm/kvm_booke_hv_asm.h > @@ -0,0 +1,49 @@ > +/* > + * Copyright 2010-2011 Freescale Semiconductor, Inc. > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License, version 2, as > + * published by the Free Software Foundation. > + */ > + > +#ifndef ASM_KVM_BOOKE_HV_ASM_H > +#define ASM_KVM_BOOKE_HV_ASM_H > + > +#ifdef __ASSEMBLY__ > + > +/* > + * All exceptions from guest state must go through KVM > + * (except for those which are delivered directly to the guest) -- > + * there are no exceptions for which we fall through directly to > + * the normal host handler. 
> + * > + * Expected inputs (normal exceptions): > + * SCRATCH0 = saved r10 > + * r10 = thread struct > + * r11 = appropriate SRR1 variant (currently used as scratch) > + * r13 = saved CR > + * *(r10 + THREAD_NORMSAVE(0)) = saved r11 > + * *(r10 + THREAD_NORMSAVE(2)) = saved r13 > + * > + * Expected inputs (crit/mcheck/debug exceptions): > + * appropriate SCRATCH = saved r8 > + * r8 = exception level stack frame > + * r9 = *(r8 + _CCR) = saved CR > + * r11 = appropriate SRR1 variant (currently used as scratch) > + * *(r8 + GPR9) = saved r9 > + * *(r8 + GPR10) = saved r10 (r10 not yet clobbered) > + * *(r8 + GPR11) = saved r11 > + */ > +.macro DO_KVM intno srr1 > +#ifdef CONFIG_KVM_BOOKE_HV > +BEGIN_FTR_SECTION > + mtocrf 0x80, r11 /* check MSR[GS] without clobbering reg */ > + bf 3, kvmppc_resume_\intno\()_\srr1 > + b kvmppc_handler_\intno\()_\srr1 > +kvmppc_resume_\intno\()_\srr1: > +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) > +#endif > +.endm > + > +#endif /*__ASSEMBLY__ */ > +#endif /* ASM_KVM_BOOKE_HV_ASM_H */ > diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h > index ad4d671..d603513 100644 > --- a/arch/powerpc/include/asm/kvm_host.h > +++ b/arch/powerpc/include/asm/kvm_host.h > @@ -107,6 +107,8 @@ struct kvm_vcpu_stat { > u32 dec_exits; > u32 ext_intr_exits; > u32 halt_wakeup; > + u32 dbell_exits; > + u32 gdbell_exits; > #ifdef CONFIG_PPC_BOOK3S > u32 pf_storage; > u32 pf_instruc; > @@ -141,6 +143,7 @@ enum kvm_exit_types { > EMULATED_TLBSX_EXITS, > EMULATED_TLBWE_EXITS, > EMULATED_RFI_EXITS, > + EMULATED_RFCI_EXITS, > DEC_EXITS, > EXT_INTR_EXITS, > HALT_WAKEUP, > @@ -148,6 +151,8 @@ enum kvm_exit_types { > FP_UNAVAIL, > DEBUG_EXITS, > TIMEINGUEST, > + DBELL_EXITS, > + GDBELL_EXITS, > __NUMBER_OF_KVM_EXIT_TYPES > }; > > @@ -213,10 +218,10 @@ struct revmap_entry { > #define KVMPPC_GOT_PAGE 0x80 > > struct kvm_arch { > + unsigned int lpid; > #ifdef CONFIG_KVM_BOOK3S_64_HV > unsigned long hpt_virt; > struct revmap_entry *revmap; 
> - unsigned int lpid; > unsigned int host_lpid; > unsigned long host_lpcr; > unsigned long sdr1; > @@ -346,6 +351,17 @@ struct kvm_vcpu_arch { > u32 qpr[32]; > #endif > > +#ifdef CONFIG_KVM_BOOKE_HV > + u32 host_mas4; > + u32 host_mas6; > + u32 shadow_epcr; > + u32 epcr; > + u32 shadow_msrp; > + u32 eplc; > + u32 epsc; > + u32 oldpir; > +#endif > + > #ifdef CONFIG_PPC_BOOK3S > ulong hflags; > ulong guest_owned_ext; > @@ -417,6 +433,7 @@ struct kvm_vcpu_arch { > ulong queued_esr; > u32 tlbcfg[4]; > u32 mmucfg; > + u32 epr; > #endif > gpa_t paddr_accessed; > > diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h > index 5524f88..247b920 100644 > --- a/arch/powerpc/include/asm/kvm_ppc.h > +++ b/arch/powerpc/include/asm/kvm_ppc.h > @@ -137,6 +137,9 @@ extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, > extern void kvmppc_core_commit_memory_region(struct kvm *kvm, > struct kvm_userspace_memory_region *mem); > > +extern int kvmppc_bookehv_init(void); > +extern void kvmppc_bookehv_exit(void); > + > /* > * Cuts out inst bits with ordering according to spec. > * That means the leftmost bit is zero. All given bits are included. 
> diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h > index 36a6eaa..b8e303c 100644 > --- a/arch/powerpc/include/asm/mmu-book3e.h > +++ b/arch/powerpc/include/asm/mmu-book3e.h > @@ -104,6 +104,8 @@ > #define MAS4_TSIZED_MASK 0x00000f80 /* Default TSIZE */ > #define MAS4_TSIZED_SHIFT 7 > > +#define MAS5_SGS 0x80000000 > + > #define MAS6_SPID0 0x3FFF0000 > #define MAS6_SPID1 0x00007FFE > #define MAS6_ISIZE(x) MAS1_TSIZE(x) > @@ -118,6 +120,10 @@ > > #define MAS7_RPN 0xFFFFFFFF > > +#define MAS8_TGS 0x80000000 /* Guest space */ > +#define MAS8_VF 0x40000000 /* Virtualization Fault */ > +#define MAS8_TLPID 0x000000ff > + > /* Bit definitions for MMUCFG */ > #define MMUCFG_MAVN 0x00000003 /* MMU Architecture Version Number */ > #define MMUCFG_MAVN_V1 0x00000000 /* v1.0 */ > diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h > index eb11a44..032a984 100644 > --- a/arch/powerpc/include/asm/processor.h > +++ b/arch/powerpc/include/asm/processor.h > @@ -243,6 +243,9 @@ struct thread_struct { > #ifdef CONFIG_KVM_BOOK3S_32_HANDLER > void* kvm_shadow_vcpu; /* KVM internal data */ > #endif /* CONFIG_KVM_BOOK3S_32_HANDLER */ > +#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE) > + struct kvm_vcpu *kvm_vcpu; > +#endif > #ifdef CONFIG_PPC64 > unsigned long dscr; > int dscr_inherit; > diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h > index 209dc74..5993770 100644 > --- a/arch/powerpc/include/asm/reg.h > +++ b/arch/powerpc/include/asm/reg.h > @@ -257,7 +257,9 @@ > #define LPCR_LPES_SH 2 > #define LPCR_RMI 0x00000002 /* real mode is cache inhibit */ > #define LPCR_HDICE 0x00000001 /* Hyp Decr enable (HV,PR,EE) */ > +#ifndef SPRN_LPID > #define SPRN_LPID 0x13F /* Logical Partition Identifier */ > +#endif > #define LPID_RSVD 0x3ff /* Reserved LPID for partn switching */ > #define SPRN_HMER 0x150 /* Hardware m? error recovery */ > #define SPRN_HMEER 0x151 /* Hardware m? 
enable error recovery */ > diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h > index 03c48e8..bd80b8d 100644 > --- a/arch/powerpc/include/asm/reg_booke.h > +++ b/arch/powerpc/include/asm/reg_booke.h > @@ -56,17 +56,29 @@ > #define SPRN_SPRG7W 0x117 /* Special Purpose Register General 7 Write */ > #define SPRN_EPCR 0x133 /* Embedded Processor Control Register */ > #define SPRN_DBCR2 0x136 /* Debug Control Register 2 */ > +#define SPRN_MSRP 0x137 /* MSR Protect Register */ > #define SPRN_IAC3 0x13A /* Instruction Address Compare 3 */ > #define SPRN_IAC4 0x13B /* Instruction Address Compare 4 */ > #define SPRN_DVC1 0x13E /* Data Value Compare Register 1 */ > #define SPRN_DVC2 0x13F /* Data Value Compare Register 2 */ > +#define SPRN_LPID 0x152 /* Logical Partition ID */ > #define SPRN_MAS8 0x155 /* MMU Assist Register 8 */ > #define SPRN_TLB0PS 0x158 /* TLB 0 Page Size Register */ > #define SPRN_MAS5_MAS6 0x15c /* MMU Assist Register 5 || 6 */ > #define SPRN_MAS8_MAS1 0x15d /* MMU Assist Register 8 || 1 */ > #define SPRN_EPTCFG 0x15e /* Embedded Page Table Config */ > +#define SPRN_GSPRG0 0x170 /* Guest SPRG0 */ > +#define SPRN_GSPRG1 0x171 /* Guest SPRG1 */ > +#define SPRN_GSPRG2 0x172 /* Guest SPRG2 */ > +#define SPRN_GSPRG3 0x173 /* Guest SPRG3 */ > #define SPRN_MAS7_MAS3 0x174 /* MMU Assist Register 7 || 3 */ > #define SPRN_MAS0_MAS1 0x175 /* MMU Assist Register 0 || 1 */ > +#define SPRN_GSRR0 0x17A /* Guest SRR0 */ > +#define SPRN_GSRR1 0x17B /* Guest SRR1 */ > +#define SPRN_GEPR 0x17C /* Guest EPR */ > +#define SPRN_GDEAR 0x17D /* Guest DEAR */ > +#define SPRN_GPIR 0x17E /* Guest PIR */ > +#define SPRN_GESR 0x17F /* Guest Exception Syndrome Register */ > #define SPRN_IVOR0 0x190 /* Interrupt Vector Offset Register 0 */ > #define SPRN_IVOR1 0x191 /* Interrupt Vector Offset Register 1 */ > #define SPRN_IVOR2 0x192 /* Interrupt Vector Offset Register 2 */ > @@ -87,6 +99,13 @@ > #define SPRN_IVOR39 0x1B1 /* Interrupt Vector 
Offset Register 39 */ > #define SPRN_IVOR40 0x1B2 /* Interrupt Vector Offset Register 40 */ > #define SPRN_IVOR41 0x1B3 /* Interrupt Vector Offset Register 41 */ > +#define SPRN_GIVOR2 0x1B8 /* Guest IVOR2 */ > +#define SPRN_GIVOR3 0x1B9 /* Guest IVOR3 */ > +#define SPRN_GIVOR4 0x1BA /* Guest IVOR4 */ > +#define SPRN_GIVOR8 0x1BB /* Guest IVOR8 */ > +#define SPRN_GIVOR13 0x1BC /* Guest IVOR13 */ > +#define SPRN_GIVOR14 0x1BD /* Guest IVOR14 */ > +#define SPRN_GIVPR 0x1BF /* Guest IVPR */ > #define SPRN_SPEFSCR 0x200 /* SPE & Embedded FP Status & Control */ > #define SPRN_BBEAR 0x201 /* Branch Buffer Entry Address Register */ > #define SPRN_BBTAR 0x202 /* Branch Buffer Target Address Register */ > @@ -235,6 +254,10 @@ > #define MCSR_LDG 0x00002000UL /* Guarded Load */ > #define MCSR_TLBSYNC 0x00000002UL /* Multiple tlbsyncs detected */ > #define MCSR_BSL2_ERR 0x00000001UL /* Backside L2 cache error */ > + > +#define MSRP_UCLEP 0x04000000 /* Protect MSR[UCLE] */ > +#define MSRP_DEP 0x00000200 /* Protect MSR[DE] */ > +#define MSRP_PMMP 0x00000004 /* Protect MSR[PMM] */ > #endif > > #ifdef CONFIG_E200 > @@ -589,6 +612,17 @@ > #define SPRN_EPCR_DMIUH 0x00400000 /* Disable MAS Interrupt updates > * for hypervisor */ > > +/* Bit definitions for EPLC/EPSC */ > +#define EPC_EPR 0x80000000 /* 1 = user, 0 = kernel */ > +#define EPC_EPR_SHIFT 31 > +#define EPC_EAS 0x40000000 /* Address Space */ > +#define EPC_EAS_SHIFT 30 > +#define EPC_EGS 0x20000000 /* 1 = guest, 0 = hypervisor */ > +#define EPC_EGS_SHIFT 29 > +#define EPC_ELPID 0x00ff0000 > +#define EPC_ELPID_SHIFT 16 > +#define EPC_EPID 0x00003fff > +#define EPC_EPID_SHIFT 0 > > /* > * The IBM-403 is an even more odd special case, as it is much > diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c > index c80bdd1..e179f09 100644 > --- a/arch/powerpc/kernel/asm-offsets.c > +++ b/arch/powerpc/kernel/asm-offsets.c > @@ -119,6 +119,9 @@ int main(void) > #ifdef CONFIG_KVM_BOOK3S_32_HANDLER > 
DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu)); > #endif > +#ifdef CONFIG_KVM_BOOKE_HV > + DEFINE(THREAD_KVM_VCPU, offsetof(struct thread_struct, kvm_vcpu)); > +#endif > > DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); > DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags)); > @@ -400,6 +403,7 @@ int main(void) > #ifdef CONFIG_KVM > DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack)); > DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); > + DEFINE(VCPU_GUEST_PID, offsetof(struct kvm_vcpu, arch.pid)); > DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.regs.gpr)); > DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave)); > DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr)); > @@ -442,9 +446,11 @@ int main(void) > DEFINE(VCPU_SHARED_MAS4, offsetof(struct kvm_vcpu_arch_shared, mas4)); > DEFINE(VCPU_SHARED_MAS6, offsetof(struct kvm_vcpu_arch_shared, mas6)); > > + DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm)); > + DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid)); > + > /* book3s */ > #ifdef CONFIG_KVM_BOOK3S_64_HV > - DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid)); > DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1)); > DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid)); > DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); > @@ -459,7 +465,6 @@ int main(void) > DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); > #endif > #ifdef CONFIG_PPC_BOOK3S > - DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm)); > DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id)); > DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr)); > DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr)); > @@ -605,6 +610,12 @@ int main(void) > DEFINE(VCPU_HOST_SPEFSCR, offsetof(struct kvm_vcpu, arch.host_spefscr)); > #endif > > +#ifdef CONFIG_KVM_BOOKE_HV > + DEFINE(VCPU_HOST_MAS4, offsetof(struct kvm_vcpu, arch.host_mas4)); > + DEFINE(VCPU_HOST_MAS6, offsetof(struct kvm_vcpu, 
arch.host_mas6)); > + DEFINE(VCPU_EPLC, offsetof(struct kvm_vcpu, arch.eplc)); > +#endif > + > #ifdef CONFIG_KVM_EXIT_TIMING > DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu, > arch.timing_exit.tv32.tbu)); > diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h > index 06ab353..b87c335 100644 > --- a/arch/powerpc/kernel/head_booke.h > +++ b/arch/powerpc/kernel/head_booke.h > @@ -3,6 +3,7 @@ > > #include <asm/ptrace.h> /* for STACK_FRAME_REGS_MARKER */ > #include <asm/kvm_asm.h> > +#include <asm/kvm_booke_hv_asm.h> > > /* > * Macros used for common Book-e exception handling > @@ -36,8 +37,9 @@ > stw r11, THREAD_NORMSAVE(0)(r10); \ > stw r13, THREAD_NORMSAVE(2)(r10); \ > mfcr r13; /* save CR in r13 for now */\ > - mfspr r11,SPRN_SRR1; /* check whether user or kernel */\ > - andi. r11,r11,MSR_PR; \ > + mfspr r11, SPRN_SRR1; \ > + DO_KVM BOOKE_INTERRUPT_##intno SPRN_SRR1; \ > + andi. r11, r11, MSR_PR; /* check whether user or kernel */\ > mr r11, r1; \ > beq 1f; \ > /* if from user, start at top of this thread's kernel stack */ \ > @@ -123,8 +125,9 @@ > stw r10,GPR10(r8); \ > stw r11,GPR11(r8); \ > stw r9,_CCR(r8); /* save CR on stack */\ > - mfspr r10,exc_level_srr1; /* check whether user or kernel */\ > - andi. r10,r10,MSR_PR; \ > + mfspr r11,exc_level_srr1; /* check whether user or kernel */\ > + DO_KVM BOOKE_INTERRUPT_##intno exc_level_srr1; \ > + andi. r11,r11,MSR_PR; \ > mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ > lwz r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\ > addi r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame */\ > @@ -173,6 +176,23 @@ > SPRN_MCSRR0, SPRN_MCSRR1) > > /* > + * Guest Doorbell -- this is a bit odd in that uses GSRR0/1 despite > + * being delivered to the host. This exception can only happen > + * inside a KVM guest -- so we just handle up to the DO_KVM rather > + * than try to fit this into one of the existing prolog macros. 
> + */ > +#define GUEST_DOORBELL_EXCEPTION \ > + START_EXCEPTION(GuestDoorbell); \ > + mtspr SPRN_SPRG_WSCRATCH0, r10; /* save one register */ \ > + mfspr r10, SPRN_SPRG_THREAD; \ > + stw r11, THREAD_NORMSAVE(0)(r10); \ > + mfspr r11, SPRN_SRR1; \ > + stw r13, THREAD_NORMSAVE(2)(r10); \ > + mfcr r13; /* save CR in r13 for now */\ > + DO_KVM BOOKE_INTERRUPT_GUEST_DBELL SPRN_GSRR1; \ > + trap > + > +/* > * Exception vectors. > */ > #define START_EXCEPTION(label) \ > diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig > index 8f64709..2c33cd3 100644 > --- a/arch/powerpc/kvm/Kconfig > +++ b/arch/powerpc/kvm/Kconfig > @@ -90,6 +90,9 @@ config KVM_BOOK3S_64_PR > depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV > select KVM_BOOK3S_PR > > +config KVM_BOOKE_HV > + bool > + > config KVM_440 > bool "KVM support for PowerPC 440 processors" > depends on EXPERIMENTAL && 44x > diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c > index f66e741..cf63b93 100644 > --- a/arch/powerpc/kvm/booke.c > +++ b/arch/powerpc/kvm/booke.c > @@ -17,6 +17,8 @@ > * > * Authors: Hollis Blanchard <hollisb@us.ibm.com> > * Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com> > + * Scott Wood <scottwood@freescale.com> > + * Varun Sethi <varun.sethi@freescale.com> > */ > > #include <linux/errno.h> > @@ -30,9 +32,12 @@ > #include <asm/cputable.h> > #include <asm/uaccess.h> > #include <asm/kvm_ppc.h> > -#include "timing.h" > #include <asm/cacheflush.h> > +#include <asm/dbell.h> > +#include <asm/hw_irq.h> > +#include <asm/irq.h> > > +#include "timing.h" > #include "booke.h" > > unsigned long kvmppc_booke_handlers; > @@ -55,6 +60,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { > { "dec", VCPU_STAT(dec_exits) }, > { "ext_intr", VCPU_STAT(ext_intr_exits) }, > { "halt_wakeup", VCPU_STAT(halt_wakeup) }, > + { "doorbell", VCPU_STAT(dbell_exits) }, > + { "guest doorbell", VCPU_STAT(gdbell_exits) }, > { NULL } > }; > > @@ -123,6 +130,10 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 
new_msr) > { > u32 old_msr = vcpu->arch.shared->msr; > > +#ifdef CONFIG_KVM_BOOKE_HV > + new_msr |= MSR_GS; > +#endif > + > vcpu->arch.shared->msr = new_msr; > > kvmppc_mmu_msr_notify(vcpu, old_msr); > @@ -197,6 +208,75 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, > clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); > } > > +static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) > +{ > +#ifdef CONFIG_KVM_BOOKE_HV > + mtspr(SPRN_GSRR0, srr0); > + mtspr(SPRN_GSRR1, srr1); > +#else > + vcpu->arch.shared->srr0 = srr0; > + vcpu->arch.shared->srr1 = srr1; > +#endif > +} > + > +static void set_guest_csrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) > +{ > + vcpu->arch.csrr0 = srr0; > + vcpu->arch.csrr1 = srr1; > +} > + > +static void set_guest_dsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) > +{ > + if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC)) { > + vcpu->arch.dsrr0 = srr0; > + vcpu->arch.dsrr1 = srr1; > + } else { > + set_guest_csrr(vcpu, srr0, srr1); > + } > +} > + > +static void set_guest_mcsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) > +{ > + vcpu->arch.mcsrr0 = srr0; > + vcpu->arch.mcsrr1 = srr1; > +} > + > +static unsigned long get_guest_dear(struct kvm_vcpu *vcpu) > +{ > +#ifdef CONFIG_KVM_BOOKE_HV > + return mfspr(SPRN_GDEAR); > +#else > + return vcpu->arch.shared->dar; > +#endif > +} > + > +static void set_guest_dear(struct kvm_vcpu *vcpu, unsigned long dear) > +{ > +#ifdef CONFIG_KVM_BOOKE_HV > + mtspr(SPRN_GDEAR, dear); > +#else > + vcpu->arch.shared->dar = dear; > +#endif > +} > + > +static unsigned long get_guest_esr(struct kvm_vcpu *vcpu) > +{ > +#ifdef CONFIG_KVM_BOOKE_HV > + return mfspr(SPRN_ESR); > +#else > + return vcpu->arch.shared->esr; > +#endif > +} > + > +static void set_guest_esr(struct kvm_vcpu *vcpu, u32 esr) > +{ > +#ifdef CONFIG_KVM_BOOKE_HV > + mtspr(SPRN_GESR, esr); > +#else > + vcpu->arch.shared->esr = esr; > +#endif > +} > + > /* Deliver the interrupt 
of the corresponding priority, if possible. */ > static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, > unsigned int priority) > @@ -208,6 +288,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, > ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); > bool crit; > bool keep_irq = false; > + enum int_class int_class; > > /* Truncate crit indicators in 32 bit mode */ > if (!(vcpu->arch.shared->msr & MSR_SF)) { > @@ -243,16 +324,20 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, > case BOOKE_IRQPRIO_AP_UNAVAIL: > case BOOKE_IRQPRIO_ALIGNMENT: > allowed = 1; > - msr_mask = MSR_CE|MSR_ME|MSR_DE; > + msr_mask = MSR_GS | MSR_CE | MSR_ME | MSR_DE; No need to do this. You already force MSR_GS in set_msr(); > + int_class = INT_CLASS_NONCRIT; > break; > case BOOKE_IRQPRIO_CRITICAL: > - case BOOKE_IRQPRIO_WATCHDOG: > allowed = vcpu->arch.shared->msr & MSR_CE; > - msr_mask = MSR_ME; > + allowed = allowed && !crit; > + msr_mask = MSR_GS | MSR_ME; > + int_class = INT_CLASS_CRIT; > break; > case BOOKE_IRQPRIO_MACHINE_CHECK: > allowed = vcpu->arch.shared->msr & MSR_ME; > - msr_mask = 0; > + allowed = allowed && !crit; > + msr_mask = MSR_GS; > + int_class = INT_CLASS_MC; > break; > case BOOKE_IRQPRIO_DECREMENTER: > case BOOKE_IRQPRIO_FIT: > @@ -261,29 +346,63 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, > case BOOKE_IRQPRIO_EXTERNAL: > allowed = vcpu->arch.shared->msr & MSR_EE; > allowed = allowed && !crit; > - msr_mask = MSR_CE|MSR_ME|MSR_DE; > + msr_mask = MSR_GS | MSR_CE | MSR_ME | MSR_DE; > + int_class = INT_CLASS_NONCRIT; > break; > case BOOKE_IRQPRIO_DEBUG: > allowed = vcpu->arch.shared->msr & MSR_DE; > - msr_mask = MSR_ME; > + allowed = allowed && !crit; > + msr_mask = MSR_GS | MSR_ME; > + int_class = INT_CLASS_CRIT; > break; > } > > if (allowed) { > - vcpu->arch.shared->srr0 = vcpu->arch.regs.nip; > - vcpu->arch.shared->srr1 = vcpu->arch.shared->msr; > + switch (int_class) { > + case INT_CLASS_NONCRIT: > + 
set_guest_srr(vcpu, vcpu->arch.regs.nip, > + vcpu->arch.shared->msr); > + break; > + case INT_CLASS_CRIT: > + set_guest_csrr(vcpu, vcpu->arch.regs.nip, > + vcpu->arch.shared->msr); > + break; > + case INT_CLASS_DBG: > + set_guest_dsrr(vcpu, vcpu->arch.regs.nip, > + vcpu->arch.shared->msr); > + break; > + case INT_CLASS_MC: > + set_guest_mcsrr(vcpu, vcpu->arch.regs.nip, > + vcpu->arch.shared->msr); > + break; > + } > + > vcpu->arch.regs.nip = vcpu->arch.ivpr | > vcpu->arch.ivor[priority]; > if (update_esr == true) > - vcpu->arch.shared->esr = vcpu->arch.queued_esr; > + set_guest_esr(vcpu, vcpu->arch.queued_esr); > if (update_dear == true) > - vcpu->arch.shared->dar = vcpu->arch.queued_dear; > + set_guest_dear(vcpu, vcpu->arch.queued_dear); > kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask); > > if (!keep_irq) > clear_bit(priority, &vcpu->arch.pending_exceptions); > } > > +#ifdef CONFIG_KVM_BOOKE_HV > + /* > + * If an interrupt is pending but masked, raise a guest doorbell > + * so that we are notified when the guest enables the relevant > + * MSR bit. > + */ > + if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_EE) > + kvmppc_set_pending_interrupt(vcpu, INT_CLASS_NONCRIT); > + if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_CE) > + kvmppc_set_pending_interrupt(vcpu, INT_CLASS_CRIT); > + if (vcpu->arch.pending_exceptions & BOOKE_IRQPRIO_MACHINE_CHECK) > + kvmppc_set_pending_interrupt(vcpu, INT_CLASS_MC); > +#endif > + > return allowed; > } > > @@ -347,6 +466,11 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) > return -EINVAL; > } > > + if (!current->thread.kvm_vcpu) { > + WARN(1, "no vcpu\n"); > + return -EPERM; > + } Huh? 
> + > local_irq_disable(); > > kvmppc_core_prepare_to_enter(vcpu); > @@ -366,6 +490,38 @@ out: > return ret; > } > > +static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) > +{ > + enum emulation_result er; > + > + er = kvmppc_emulate_instruction(run, vcpu); > + switch (er) { > + case EMULATE_DONE: > + /* don't overwrite subtypes, just account kvm_stats */ > + kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS); > + /* Future optimization: only reload non-volatiles if > + * they were actually modified by emulation. */ > + return RESUME_GUEST_NV; > + > + case EMULATE_DO_DCR: > + run->exit_reason = KVM_EXIT_DCR; > + return RESUME_HOST; > + > + case EMULATE_FAIL: > + /* XXX Deliver Program interrupt to guest. */ > + printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", > + __func__, vcpu->arch.regs.nip, vcpu->arch.last_inst); This should be throttled, otherwise the guest can spam our logs. > + /* For debugging, encode the failing instruction and > + * report it to userspace. */ > + run->hw.hardware_exit_reason = ~0ULL << 32; > + run->hw.hardware_exit_reason |= vcpu->arch.last_inst; I'm fairly sure you want to fix this :) > + return RESUME_HOST; > + > + default: > + BUG(); > + } > +} > + > /** > * kvmppc_handle_exit > * > @@ -374,12 +530,39 @@ out: > int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, > unsigned int exit_nr) > { > - enum emulation_result er; > int r = RESUME_HOST; > > /* update before a new last_exit_type is rewritten */ > kvmppc_update_timing_stats(vcpu); > > + /* > + * If we actually care, we could copy MSR, DEAR, and ESR to regs, > + * insert an appropriate trap number, etc. > + * > + * Seems like a waste of cycles for something that should only matter > + * to someone using sysrq-t/p or similar host kernel debug facility. > + * We have other debug facilities to get that information from a > + * guest through userspace. 
> + */ > + switch (exit_nr) { > + case BOOKE_INTERRUPT_EXTERNAL: > + do_IRQ(&vcpu->arch.regs); Ah, so that's what you want to use regs for. So is having a pt_regs struct that only contains useful register values in half its fields any useful here? Or could we keep control of the registers ourselves, enabling us to maybe one day optimize things more. > + break; > + > + case BOOKE_INTERRUPT_DECREMENTER: > + timer_interrupt(&vcpu->arch.regs); > + break; > + > +#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64) > + case BOOKE_INTERRUPT_DOORBELL: > + doorbell_exception(&vcpu->arch.regs); > + break; > +#endif > + case BOOKE_INTERRUPT_MACHINE_CHECK: > + /* FIXME */ > + break; > + } > + > local_irq_enable(); > > run->exit_reason = KVM_EXIT_UNKNOWN; > @@ -387,30 +570,56 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, > > switch (exit_nr) { > case BOOKE_INTERRUPT_MACHINE_CHECK: > - printk("MACHINE CHECK: %lx\n", mfspr(SPRN_MCSR)); > - kvmppc_dump_vcpu(vcpu); > - r = RESUME_HOST; > + kvm_resched(vcpu); > + r = RESUME_GUEST; huh? > break; > > case BOOKE_INTERRUPT_EXTERNAL: > kvmppc_account_exit(vcpu, EXT_INTR_EXITS); > - if (need_resched()) > - cond_resched(); > + kvm_resched(vcpu); Why are we explicit about the resched? On book3s I just call kvm_resched(vcpu) before the switch(). > r = RESUME_GUEST; > break; > > case BOOKE_INTERRUPT_DECREMENTER: > - /* Since we switched IVPR back to the host's value, the host > - * handled this interrupt the moment we enabled interrupts. > - * Now we just offer it a chance to reschedule the guest. 
*/ > kvmppc_account_exit(vcpu, DEC_EXITS); > - if (need_resched()) > - cond_resched(); > + kvm_resched(vcpu); > + r = RESUME_GUEST; > + break; > + > + case BOOKE_INTERRUPT_DOORBELL: > + kvmppc_account_exit(vcpu, DBELL_EXITS); > + kvm_resched(vcpu); > + r = RESUME_GUEST; > + break; > + > + case BOOKE_INTERRUPT_GUEST_DBELL_CRIT: > + kvmppc_account_exit(vcpu, GDBELL_EXITS); > + > + /* > + * We are here because there is a pending guest interrupt > + * which could not be delivered as MSR_CE or MSR_ME was not > + * set. Once we break from here we will retry delivery. > + */ > r = RESUME_GUEST; > break; > > + case BOOKE_INTERRUPT_GUEST_DBELL: > + kvmppc_account_exit(vcpu, GDBELL_EXITS); > + > + /* > + * We are here because there is a pending guest interrupt > + * which could not be delivered as MSR_EE was not set. Once > + * we break from here we will retry delivery. > + */ > + r = RESUME_GUEST; > + break; > + > + case BOOKE_INTERRUPT_HV_PRIV: > + r = emulation_exit(run, vcpu); > + break; > + > case BOOKE_INTERRUPT_PROGRAM: > - if (vcpu->arch.shared->msr & MSR_PR) { > + if (vcpu->arch.shared->msr & (MSR_PR | MSR_GS)) { > /* Program traps generated by user-level software must be handled > * by the guest kernel. */ > kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr); > @@ -419,33 +628,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, > break; > } > > - er = kvmppc_emulate_instruction(run, vcpu); > - switch (er) { > - case EMULATE_DONE: > - /* don't overwrite subtypes, just account kvm_stats */ > - kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS); > - /* Future optimization: only reload non-volatiles if > - * they were actually modified by emulation. */ > - r = RESUME_GUEST_NV; > - break; > - case EMULATE_DO_DCR: > - run->exit_reason = KVM_EXIT_DCR; > - r = RESUME_HOST; > - break; > - case EMULATE_FAIL: > - /* XXX Deliver Program interrupt to guest. 
*/ > - printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", > - __func__, vcpu->arch.regs.nip, > - vcpu->arch.last_inst); > - /* For debugging, encode the failing instruction and > - * report it to userspace. */ > - run->hw.hardware_exit_reason = ~0ULL << 32; > - run->hw.hardware_exit_reason |= vcpu->arch.last_inst; > - r = RESUME_HOST; > - break; > - default: > - BUG(); > - } > + r = emulation_exit(run, vcpu); > break; > > case BOOKE_INTERRUPT_FP_UNAVAIL: > @@ -510,6 +693,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, > r = RESUME_GUEST; > break; > > +#ifdef CONFIG_KVM_BOOKE_HV > + case BOOKE_INTERRUPT_HV_SYSCALL: > + if (!(vcpu->arch.shared->msr & MSR_PR)) { > + kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); > + } else { > + /* > + * hcall from guest userspace -- send privileged > + * instruction program check. > + */ > + kvmppc_core_queue_program(vcpu, ESR_PPR); > + } > + > + r = RESUME_GUEST; > + break; > +#else > case BOOKE_INTERRUPT_SYSCALL: > if (!(vcpu->arch.shared->msr & MSR_PR) && > (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { > @@ -523,6 +721,47 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, > kvmppc_account_exit(vcpu, SYSCALL_EXITS); > r = RESUME_GUEST; > break; > +#endif > + > + case BOOKE_INTERRUPT_ITLB_MISS: { > + unsigned long eaddr = vcpu->arch.regs.nip; > + gpa_t gpaddr; > + gfn_t gfn; > + int gtlb_index; > + > + r = RESUME_GUEST; > + > + /* Check the guest TLB. */ > + gtlb_index = kvmppc_mmu_itlb_index(vcpu, eaddr); > + if (gtlb_index < 0) { > + /* The guest didn't have a mapping for it. 
*/ > + kvmppc_booke_queue_irqprio(vcpu, > + BOOKE_IRQPRIO_ITLB_MISS); > + kvmppc_mmu_itlb_miss(vcpu); > + kvmppc_account_exit(vcpu, ITLB_REAL_MISS_EXITS); > + break; > + } > + > + kvmppc_account_exit(vcpu, ITLB_VIRT_MISS_EXITS); > + > + gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr); > + gfn = gpaddr >> PAGE_SHIFT; > + > + if (kvm_is_visible_gfn(vcpu->kvm, gfn)) { > + /* The guest TLB had a mapping, but the shadow TLB > + * didn't. This could be because: > + * a) the entry is mapping the host kernel, or > + * b) the guest used a large mapping which we're faking > + * Either way, we need to satisfy the fault without > + * invoking the guest. */ > + kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index); > + } else { > + /* Guest mapped and leaped at non-RAM! */ > + kvmppc_booke_queue_irqprio(vcpu, > + BOOKE_IRQPRIO_MACHINE_CHECK); Are you sure? Couldn't this also be MMIO? That doesn't really improve the situation as executing from MMIO is tricky with the KVM model, but it's not necessarily bad. Oh well, I guess we'll have to do something and throwing an #MC isn't all that ugly. > + } > + break; > + } > > case BOOKE_INTERRUPT_DTLB_MISS: { > unsigned long eaddr = vcpu->arch.fault_dear; > @@ -578,45 +817,6 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, > break; > } > > - case BOOKE_INTERRUPT_ITLB_MISS: { > - unsigned long eaddr = vcpu->arch.regs.nip; > - gpa_t gpaddr; > - gfn_t gfn; > - int gtlb_index; > - > - r = RESUME_GUEST; > - > - /* Check the guest TLB. */ > - gtlb_index = kvmppc_mmu_itlb_index(vcpu, eaddr); > - if (gtlb_index < 0) { > - /* The guest didn't have a mapping for it. 
*/ > - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS); > - kvmppc_mmu_itlb_miss(vcpu); > - kvmppc_account_exit(vcpu, ITLB_REAL_MISS_EXITS); > - break; > - } > - > - kvmppc_account_exit(vcpu, ITLB_VIRT_MISS_EXITS); > - > - gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr); > - gfn = gpaddr >> PAGE_SHIFT; > - > - if (kvm_is_visible_gfn(vcpu->kvm, gfn)) { > - /* The guest TLB had a mapping, but the shadow TLB > - * didn't. This could be because: > - * a) the entry is mapping the host kernel, or > - * b) the guest used a large mapping which we're faking > - * Either way, we need to satisfy the fault without > - * invoking the guest. */ > - kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index); > - } else { > - /* Guest mapped and leaped at non-RAM! */ > - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_MACHINE_CHECK); Ah, you just shoved the code around :) > - } > - > - break; > - } > - > case BOOKE_INTERRUPT_DEBUG: { > u32 dbsr; > > @@ -663,12 +863,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) > int r; > > vcpu->arch.regs.nip = 0; > - vcpu->arch.shared->msr = 0; > - vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; > vcpu->arch.shared->pir = vcpu->vcpu_id; > kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ > + kvmppc_set_msr(vcpu, 0); > > +#ifndef CONFIG_KVM_BOOKE_HV > + vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; > vcpu->arch.shadow_pid = 1; > + vcpu->arch.shared->msr = 0; > +#endif > > /* Eye-catching numbers so we know if the guest takes an interrupt > * before it's programmed its own IVPR/IVORs. 
*/ > @@ -749,8 +952,8 @@ static void get_sregs_base(struct kvm_vcpu *vcpu, > sregs->u.e.csrr0 = vcpu->arch.csrr0; > sregs->u.e.csrr1 = vcpu->arch.csrr1; > sregs->u.e.mcsr = vcpu->arch.mcsr; > - sregs->u.e.esr = vcpu->arch.shared->esr; > - sregs->u.e.dear = vcpu->arch.shared->dar; > + sregs->u.e.esr = get_guest_esr(vcpu); > + sregs->u.e.dear = get_guest_dear(vcpu); > sregs->u.e.tsr = vcpu->arch.tsr; > sregs->u.e.tcr = vcpu->arch.tcr; > sregs->u.e.dec = kvmppc_get_dec(vcpu, tb); > @@ -767,8 +970,8 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, > vcpu->arch.csrr0 = sregs->u.e.csrr0; > vcpu->arch.csrr1 = sregs->u.e.csrr1; > vcpu->arch.mcsr = sregs->u.e.mcsr; > - vcpu->arch.shared->esr = sregs->u.e.esr; > - vcpu->arch.shared->dar = sregs->u.e.dear; > + set_guest_esr(vcpu, sregs->u.e.esr); > + set_guest_dear(vcpu, sregs->u.e.dear); > vcpu->arch.vrsave = sregs->u.e.vrsave; > kvmppc_set_tcr(vcpu, sregs->u.e.tcr); > > @@ -965,14 +1168,17 @@ void kvmppc_decrementer_func(unsigned long data) > > void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu) > { > + current->thread.kvm_vcpu = vcpu; > } > > void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu) > { > + current->thread.kvm_vcpu = NULL; > } > > int __init kvmppc_booke_init(void) > { > +#ifndef CONFIG_KVM_BOOKE_HV > unsigned long ivor[16]; > unsigned long max_ivor = 0; > int i; > @@ -1015,7 +1221,7 @@ int __init kvmppc_booke_init(void) > } > flush_icache_range(kvmppc_booke_handlers, > kvmppc_booke_handlers + max_ivor + kvmppc_handler_len); > - > +#endif /* !BOOKE_HV */ > return 0; > } > > diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h > index 05d1d99..d53bcf2 100644 > --- a/arch/powerpc/kvm/booke.h > +++ b/arch/powerpc/kvm/booke.h > @@ -48,7 +48,20 @@ > #define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19 > /* Internal pseudo-irqprio for level triggered externals */ > #define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20 > -#define BOOKE_IRQPRIO_MAX 20 > +#define BOOKE_IRQPRIO_DBELL 21 > +#define BOOKE_IRQPRIO_DBELL_CRIT 
22 > +#define BOOKE_IRQPRIO_MAX 23 So was MAX wrong before or is it too big now? > + > +#define BOOKE_IRQMASK_EE ((1 << BOOKE_IRQPRIO_EXTERNAL_LEVEL) | \ > + (1 << BOOKE_IRQPRIO_PERFORMANCE_MONITOR) | \ > + (1 << BOOKE_IRQPRIO_DBELL) | \ > + (1 << BOOKE_IRQPRIO_DECREMENTER) | \ > + (1 << BOOKE_IRQPRIO_FIT) | \ > + (1 << BOOKE_IRQPRIO_EXTERNAL)) > + > +#define BOOKE_IRQMASK_CE ((1 << BOOKE_IRQPRIO_DBELL_CRIT) | \ > + (1 << BOOKE_IRQPRIO_WATCHDOG) | \ > + (1 << BOOKE_IRQPRIO_CRITICAL)) > > extern unsigned long kvmppc_booke_handlers; > > @@ -74,4 +87,13 @@ void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu); > void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu); > void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu); > > +enum int_class { > + INT_CLASS_NONCRIT, > + INT_CLASS_CRIT, > + INT_CLASS_MC, > + INT_CLASS_DBG, > +}; > + > +void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type); > + > #endif /* __KVM_BOOKE_H__ */ > diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c > index a4af03b..3eb7fc6 100644 > --- a/arch/powerpc/kvm/booke_emulate.c > +++ b/arch/powerpc/kvm/booke_emulate.c > @@ -99,6 +99,12 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, > return emulated; > } > > +/* > + * NOTE: some of these registers are not emulated on BOOKE_HV (GS-mode). > + * Their backing store is in real registers, and these functions > + * will return the wrong result if called for them in another context > + * (such as debugging). > + */ > int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) > { > int emulated = EMULATE_DONE; > @@ -122,9 +128,11 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) > kvmppc_set_tcr(vcpu, spr_val); > break; > > - /* Note: SPRG4-7 are user-readable. These values are > - * loaded into the real SPRGs when resuming the > - * guest. */ > + /* > + * Note: SPRG4-7 are user-readable. 
> + * These values are loaded into the real SPRGs when resuming the > + * guest (PR-mode only). > + */ > case SPRN_SPRG4: > vcpu->arch.shared->sprg4 = spr_val; break; > case SPRN_SPRG5: > @@ -136,6 +144,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) > > case SPRN_IVPR: > vcpu->arch.ivpr = spr_val; > +#ifdef CONFIG_KVM_BOOKE_HV > + mtspr(SPRN_GIVPR, spr_val); > +#endif > break; > case SPRN_IVOR0: > vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = spr_val; > @@ -145,6 +156,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) > break; > case SPRN_IVOR2: > vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = spr_val; > +#ifdef CONFIG_KVM_BOOKE_HV > + mtspr(SPRN_GIVOR2, spr_val); > +#endif > break; > case SPRN_IVOR3: > vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = spr_val; > @@ -163,6 +177,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) > break; > case SPRN_IVOR8: > vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = spr_val; > +#ifdef CONFIG_KVM_BOOKE_HV > + mtspr(SPRN_GIVOR8, spr_val); > +#endif > break; > case SPRN_IVOR9: > vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = spr_val; > diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S > new file mode 100644 > index 0000000..9eaeebd > --- /dev/null > +++ b/arch/powerpc/kvm/bookehv_interrupts.S > @@ -0,0 +1,587 @@ > +/* > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License, version 2, as > + * published by the Free Software Foundation. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. 
> + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. > + * > + * Copyright (C) 2010-2011 Freescale Semiconductor, Inc. > + * > + * Author: Varun Sethi <varun.sethi@freescale.com> > + * Author: Scott Wood <scottwood@freescale.com> > + * > + * This file is derived from arch/powerpc/kvm/booke_interrupts.S > + */ > + > +#include <asm/ppc_asm.h> > +#include <asm/kvm_asm.h> > +#include <asm/reg.h> > +#include <asm/mmu-44x.h> > +#include <asm/page.h> > +#include <asm/asm-compat.h> > +#include <asm/asm-offsets.h> > +#include <asm/bitsperlong.h> > + > +#include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */ > + > +#define GET_VCPU(vcpu, thread) \ > + PPC_LL vcpu, THREAD_KVM_VCPU(thread) > + > +#define SET_VCPU(vcpu) \ > + PPC_STL vcpu, (THREAD + THREAD_KVM_VCPU)(r2) > + > +#define LONGBYTES (BITS_PER_LONG / 8) > + > +#define VCPU_GPR(n) (VCPU_GPRS + (n * LONGBYTES)) > +#define VCPU_GUEST_SPRG(n) (VCPU_GUEST_SPRGS + (n * LONGBYTES)) > + > +/* The host stack layout: */ > +#define HOST_R1 (0 * LONGBYTES) /* Implied by stwu. */ > +#define HOST_CALLEE_LR (1 * LONGBYTES) > +#define HOST_RUN (2 * LONGBYTES) /* struct kvm_run */ > +/* > + * r2 is special: it holds 'current', and it is made nonvolatile in the > + * kernel with the -ffixed-r2 gcc option. > + */ > +#define HOST_R2 (3 * LONGBYTES) > +#define HOST_NV_GPRS (4 * LONGBYTES) > +#define HOST_NV_GPR(n) (HOST_NV_GPRS + ((n - 14) * LONGBYTES)) > +#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(31) + LONGBYTES) > +#define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */ > +#define HOST_STACK_LR (HOST_STACK_SIZE + LONGBYTES) /* In caller stack frame.
*/ > + > +#define NEED_EMU 0x00000001 /* emulation -- save nv regs */ > +#define NEED_DEAR 0x00000002 /* save faulting DEAR */ > +#define NEED_ESR 0x00000004 /* save faulting ESR */ > + > +/* > + * On entry: > + * r4 = vcpu, r5 = srr0, r6 = srr1 > + * saved in vcpu: cr, ctr, r3-r13 > + */ > +.macro kvm_handler_common intno, srr0, flags > + mfspr r10, SPRN_PID > + lwz r8, VCPU_HOST_PID(r4) > + PPC_LL r11, VCPU_SHARED(r4) > + PPC_STL r14, VCPU_GPR(r14)(r4) /* We need a non-volatile GPR. */ > + li r14, \intno > + > + stw r10, VCPU_GUEST_PID(r4) > + mtspr SPRN_PID, r8 > + > + .if \flags & NEED_EMU > + lwz r9, VCPU_KVM(r4) writing r9 > + .endif > + > +#ifdef CONFIG_KVM_EXIT_TIMING > + /* save exit time */ > +1: mfspr r7, SPRN_TBRU > + mfspr r8, SPRN_TBRL > + mfspr r9, SPRN_TBRU overwriting r9 again? > + cmpw r9, r7 > + PPC_STL r8, VCPU_TIMING_EXIT_TBL(r4) > + bne- 1b > + PPC_STL r9, VCPU_TIMING_EXIT_TBU(r4) > +#endif > + > + oris r8, r6, MSR_CE@h > +#ifndef CONFIG_64BIT Double negation is always hard to read. Please reverse the ifdef :) > + stw r6, (VCPU_SHARED_MSR + 4)(r11) > +#else > + std r6, (VCPU_SHARED_MSR)(r11) > +#endif > + ori r8, r8, MSR_ME | MSR_RI > + PPC_STL r5, VCPU_PC(r4) > + > + /* > + * Make sure CE/ME/RI are set (if appropriate for exception type) > + * whether or not the guest had it set. Since mfmsr/mtmsr are > + * somewhat expensive, skip in the common case where the guest > + * had all these bits set (and thus they're still set if > + * appropriate for the exception type). > + */ > + cmpw r6, r8 > + .if \flags & NEED_EMU > + lwz r9, KVM_LPID(r9) And here r9 is already clobbered > + .endif > + beq 1f > + mfmsr r7 > + .if \srr0 != SPRN_MCSRR0 && \srr0 != SPRN_CSRR0 > + oris r7, r7, MSR_CE@h > + .endif > + .if \srr0 != SPRN_MCSRR0 > + ori r7, r7, MSR_ME | MSR_RI > + .endif > + mtmsr r7 > +1: > + > + .if \flags & NEED_EMU > + /* > + * This assumes you have external PID support. 
> + * To support a bookehv CPU without external PID, you'll > + * need to look up the TLB entry and create a temporary mapping. > + * > + * FIXME: we don't currently handle if the lwepx faults. PR-mode > + * booke doesn't handle it either. Since Linux doesn't use > + * broadcast tlbivax anymore, the only way this should happen is > + * if the guest maps its memory execute-but-not-read, or if we > + * somehow take a TLB miss in the middle of this entry code and > + * evict the relevant entry. On e500mc, all kernel lowmem is > + * bolted into TLB1 large page mappings, and we don't use > + * broadcast invalidates, so we should not take a TLB miss here. > + * > + * Later we'll need to deal with faults here. Disallowing guest > + * mappings that are execute-but-not-read could be an option on > + * e500mc, but not on chips with an LRAT if it is used. > + */ > + > + mfspr r3, SPRN_EPLC /* will already have correct ELPID and EGS */ > + PPC_STL r15, VCPU_GPR(r15)(r4) > + PPC_STL r16, VCPU_GPR(r16)(r4) > + PPC_STL r17, VCPU_GPR(r17)(r4) > + PPC_STL r18, VCPU_GPR(r18)(r4) > + PPC_STL r19, VCPU_GPR(r19)(r4) > + mr r8, r3 > + PPC_STL r20, VCPU_GPR(r20)(r4) > + rlwimi r8, r6, EPC_EAS_SHIFT - MSR_IR_LG, EPC_EAS > + PPC_STL r21, VCPU_GPR(r21)(r4) > + rlwimi r8, r6, EPC_EPR_SHIFT - MSR_PR_LG, EPC_EPR > + PPC_STL r22, VCPU_GPR(r22)(r4) > + rlwimi r8, r10, EPC_EPID_SHIFT, EPC_EPID > + PPC_STL r23, VCPU_GPR(r23)(r4) > + PPC_STL r24, VCPU_GPR(r24)(r4) > + PPC_STL r25, VCPU_GPR(r25)(r4) > + PPC_STL r26, VCPU_GPR(r26)(r4) > + PPC_STL r27, VCPU_GPR(r27)(r4) > + PPC_STL r28, VCPU_GPR(r28)(r4) > + PPC_STL r29, VCPU_GPR(r29)(r4) > + PPC_STL r30, VCPU_GPR(r30)(r4) > + PPC_STL r31, VCPU_GPR(r31)(r4) > + mtspr SPRN_EPLC, r8 > + isync > + lwepx r9, 0, r5 > + mtspr SPRN_EPLC, r3 > + stw r9, VCPU_LAST_INST(r4) > + .endif > + > + .if \flags & NEED_ESR > + mfspr r8, SPRN_ESR > + PPC_STL r8, VCPU_FAULT_ESR(r4) > + .endif > + > + .if \flags & NEED_DEAR > + mfspr r9, SPRN_DEAR > + PPC_STL r9, 
VCPU_FAULT_DEAR(r4) > + .endif > + > + b kvmppc_resume_host > +.endm > + > +/* > + * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h > + */ > +.macro kvm_handler intno srr0, srr1, flags > +_GLOBAL(kvmppc_handler_\intno\()_\srr1) > + GET_VCPU(r11, r10) > + PPC_STL r3, VCPU_GPR(r3)(r11) > + mfspr r3, SPRN_SPRG_RSCRATCH0 > + PPC_STL r4, VCPU_GPR(r4)(r11) > + PPC_LL r4, THREAD_NORMSAVE(0)(r10) > + PPC_STL r5, VCPU_GPR(r5)(r11) > + PPC_STL r13, VCPU_CR(r11) > + mfspr r5, \srr0 > + PPC_STL r3, VCPU_GPR(r10)(r11) > + PPC_LL r3, THREAD_NORMSAVE(2)(r10) > + PPC_STL r6, VCPU_GPR(r6)(r11) > + PPC_STL r4, VCPU_GPR(r11)(r11) > + mfspr r6, \srr1 > + PPC_STL r7, VCPU_GPR(r7)(r11) > + PPC_STL r8, VCPU_GPR(r8)(r11) > + PPC_STL r9, VCPU_GPR(r9)(r11) > + PPC_STL r3, VCPU_GPR(r13)(r11) > + mfctr r7 > + PPC_STL r12, VCPU_GPR(r12)(r11) > + PPC_STL r7, VCPU_CTR(r11) > + mr r4, r11 > + kvm_handler_common \intno, \srr0, \flags > +.endm > + > +.macro kvm_lvl_handler intno scratch srr0, srr1, flags > +_GLOBAL(kvmppc_handler_\intno\()_\srr1) > + mfspr r10, SPRN_SPRG_THREAD > + GET_VCPU(r11, r10) > + PPC_STL r3, VCPU_GPR(r3)(r11) > + mfspr r3, \scratch > + PPC_STL r4, VCPU_GPR(r4)(r11) > + PPC_LL r4, GPR9(r8) > + PPC_STL r5, VCPU_GPR(r5)(r11) > + PPC_STL r9, VCPU_CR(r11) > + mfspr r5, \srr0 > + PPC_STL r3, VCPU_GPR(r8)(r11) > + PPC_LL r3, GPR10(r8) > + PPC_STL r6, VCPU_GPR(r6)(r11) > + PPC_STL r4, VCPU_GPR(r9)(r11) > + mfspr r6, \srr1 > + PPC_LL r4, GPR11(r8) > + PPC_STL r7, VCPU_GPR(r7)(r11) > + PPC_STL r8, VCPU_GPR(r8)(r11) > + PPC_STL r3, VCPU_GPR(r10)(r11) > + mfctr r7 > + PPC_STL r12, VCPU_GPR(r12)(r11) > + PPC_STL r4, VCPU_GPR(r11)(r11) > + PPC_STL r7, VCPU_CTR(r11) > + mr r4, r11 > + kvm_handler_common \intno, \srr0, \flags > +.endm > + > +kvm_lvl_handler BOOKE_INTERRUPT_CRITICAL, \ > + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 > +kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \ > + SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0 > +kvm_handler 
BOOKE_INTERRUPT_DATA_STORAGE, \ > + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR) > +kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR > +kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0 > +kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \ > + SPRN_SRR0, SPRN_SRR1, (NEED_DEAR | NEED_ESR) > +kvm_handler BOOKE_INTERRUPT_PROGRAM, SPRN_SRR0, SPRN_SRR1, NEED_ESR > +kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 > +kvm_handler BOOKE_INTERRUPT_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0 > +kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 > +kvm_handler BOOKE_INTERRUPT_DECREMENTER, SPRN_SRR0, SPRN_SRR1, 0 > +kvm_handler BOOKE_INTERRUPT_FIT, SPRN_SRR0, SPRN_SRR1, 0 > +kvm_lvl_handler BOOKE_INTERRUPT_WATCHDOG, \ > + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 > +kvm_handler BOOKE_INTERRUPT_DTLB_MISS, \ > + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) > +kvm_handler BOOKE_INTERRUPT_ITLB_MISS, SPRN_SRR0, SPRN_SRR1, 0 > +kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 > +kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, SPRN_SRR0, SPRN_SRR1, 0 > +kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, SPRN_SRR0, SPRN_SRR1, 0 > +kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, SPRN_SRR0, SPRN_SRR1, 0 > +kvm_handler BOOKE_INTERRUPT_DOORBELL, SPRN_SRR0, SPRN_SRR1, 0 > +kvm_lvl_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, \ > + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 > +kvm_handler BOOKE_INTERRUPT_HV_PRIV, SPRN_SRR0, SPRN_SRR1, NEED_EMU > +kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0 > +kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, SPRN_GSRR0, SPRN_GSRR1, 0 > +kvm_lvl_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, \ > + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 > +kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ > + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 > +kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ > + SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0 > + > + > +/* Registers: > + * 
SPRG_SCRATCH0: guest r10 > + * r4: vcpu pointer > + * r11: vcpu->arch.shared > + * r14: KVM exit number > + */ > +_GLOBAL(kvmppc_resume_host) > + /* Save remaining volatile guest register state to vcpu. */ > + mfspr r3, SPRN_VRSAVE > + PPC_STL r0, VCPU_GPR(r0)(r4) > + PPC_STL r1, VCPU_GPR(r1)(r4) > + mflr r5 > + mfspr r6, SPRN_SPRG4 > + PPC_STL r2, VCPU_GPR(r2)(r4) > + PPC_STL r5, VCPU_LR(r4) > + mfspr r7, SPRN_SPRG5 > + PPC_STL r3, VCPU_VRSAVE(r4) > + PPC_STL r6, VCPU_SHARED_SPRG4(r11) > + mfspr r8, SPRN_SPRG6 > + PPC_STL r7, VCPU_SHARED_SPRG5(r11) > + mfspr r9, SPRN_SPRG7 > + PPC_STL r8, VCPU_SHARED_SPRG6(r11) > + mfxer r3 > + PPC_STL r9, VCPU_SHARED_SPRG7(r11) > + > + /* save guest MAS registers and restore host mas4 & mas6 */ > + mfspr r5, SPRN_MAS0 > + PPC_STL r3, VCPU_XER(r4) > + mfspr r6, SPRN_MAS1 > + stw r5, VCPU_SHARED_MAS0(r11) > + mfspr r7, SPRN_MAS2 > + stw r6, VCPU_SHARED_MAS1(r11) > +#ifndef CONFIG_64BIT see above > + stw r7, (VCPU_SHARED_MAS2 + 4)(r11) > +#else > + std r7, (VCPU_SHARED_MAS2)(r11) > +#endif > + mfspr r5, SPRN_MAS3 > + mfspr r6, SPRN_MAS4 > + stw r5, VCPU_SHARED_MAS7_3+4(r11) > + mfspr r7, SPRN_MAS6 > + stw r6, VCPU_SHARED_MAS4(r11) > + mfspr r5, SPRN_MAS7 > + lwz r6, VCPU_HOST_MAS4(r4) > + stw r7, VCPU_SHARED_MAS6(r11) > + lwz r8, VCPU_HOST_MAS6(r4) > + mtspr SPRN_MAS4, r6 > + stw r5, VCPU_SHARED_MAS7_3+0(r11) > + mtspr SPRN_MAS6, r8 > + mfspr r3, SPRN_EPCR > + rlwinm r3, r3, 0, ~SPRN_EPCR_DMIUH > + mtspr SPRN_EPCR, r3 > + isync > + > + /* Restore host stack pointer */ > + PPC_LL r1, VCPU_HOST_STACK(r4) > + PPC_LL r2, HOST_R2(r1) > + > + /* Switch to kernel stack and jump to handler. */ > + PPC_LL r3, HOST_RUN(r1) > + mr r5, r14 /* intno */ > + mr r14, r4 /* Save vcpu pointer. */ > + bl kvmppc_handle_exit > + > + /* Restore vcpu pointer and the nonvolatiles we used. */ > + mr r4, r14 > + PPC_LL r14, VCPU_GPR(r14)(r4) > + > + andi. 
r5, r3, RESUME_FLAG_NV > + beq skip_nv_load > + PPC_LL r15, VCPU_GPR(r15)(r4) > + PPC_LL r16, VCPU_GPR(r16)(r4) > + PPC_LL r17, VCPU_GPR(r17)(r4) > + PPC_LL r18, VCPU_GPR(r18)(r4) > + PPC_LL r19, VCPU_GPR(r19)(r4) > + PPC_LL r20, VCPU_GPR(r20)(r4) > + PPC_LL r21, VCPU_GPR(r21)(r4) > + PPC_LL r22, VCPU_GPR(r22)(r4) > + PPC_LL r23, VCPU_GPR(r23)(r4) > + PPC_LL r24, VCPU_GPR(r24)(r4) > + PPC_LL r25, VCPU_GPR(r25)(r4) > + PPC_LL r26, VCPU_GPR(r26)(r4) > + PPC_LL r27, VCPU_GPR(r27)(r4) > + PPC_LL r28, VCPU_GPR(r28)(r4) > + PPC_LL r29, VCPU_GPR(r29)(r4) > + PPC_LL r30, VCPU_GPR(r30)(r4) > + PPC_LL r31, VCPU_GPR(r31)(r4) > +skip_nv_load: > + /* Should we return to the guest? */ > + andi. r5, r3, RESUME_FLAG_HOST > + beq lightweight_exit > + > + srawi r3, r3, 2 /* Shift -ERR back down. */ > + > +heavyweight_exit: > + /* Not returning to guest. */ > + PPC_LL r5, HOST_STACK_LR(r1) > + > + /* > + * We already saved guest volatile register state; now save the > + * non-volatiles. > + */ > + > + PPC_STL r15, VCPU_GPR(r15)(r4) > + PPC_STL r16, VCPU_GPR(r16)(r4) > + PPC_STL r17, VCPU_GPR(r17)(r4) > + PPC_STL r18, VCPU_GPR(r18)(r4) > + PPC_STL r19, VCPU_GPR(r19)(r4) > + PPC_STL r20, VCPU_GPR(r20)(r4) > + PPC_STL r21, VCPU_GPR(r21)(r4) > + PPC_STL r22, VCPU_GPR(r22)(r4) > + PPC_STL r23, VCPU_GPR(r23)(r4) > + PPC_STL r24, VCPU_GPR(r24)(r4) > + PPC_STL r25, VCPU_GPR(r25)(r4) > + PPC_STL r26, VCPU_GPR(r26)(r4) > + PPC_STL r27, VCPU_GPR(r27)(r4) > + PPC_STL r28, VCPU_GPR(r28)(r4) > + PPC_STL r29, VCPU_GPR(r29)(r4) > + PPC_STL r30, VCPU_GPR(r30)(r4) > + PPC_STL r31, VCPU_GPR(r31)(r4) > + > + /* Load host non-volatile register state from host stack. 
*/ > + PPC_LL r14, HOST_NV_GPR(r14)(r1) > + PPC_LL r15, HOST_NV_GPR(r15)(r1) > + PPC_LL r16, HOST_NV_GPR(r16)(r1) > + PPC_LL r17, HOST_NV_GPR(r17)(r1) > + PPC_LL r18, HOST_NV_GPR(r18)(r1) > + PPC_LL r19, HOST_NV_GPR(r19)(r1) > + PPC_LL r20, HOST_NV_GPR(r20)(r1) > + PPC_LL r21, HOST_NV_GPR(r21)(r1) > + PPC_LL r22, HOST_NV_GPR(r22)(r1) > + PPC_LL r23, HOST_NV_GPR(r23)(r1) > + PPC_LL r24, HOST_NV_GPR(r24)(r1) > + PPC_LL r25, HOST_NV_GPR(r25)(r1) > + PPC_LL r26, HOST_NV_GPR(r26)(r1) > + PPC_LL r27, HOST_NV_GPR(r27)(r1) > + PPC_LL r28, HOST_NV_GPR(r28)(r1) > + PPC_LL r29, HOST_NV_GPR(r29)(r1) > + PPC_LL r30, HOST_NV_GPR(r30)(r1) > + PPC_LL r31, HOST_NV_GPR(r31)(r1) > + > + /* Return to kvm_vcpu_run(). */ > + mtlr r5 > + addi r1, r1, HOST_STACK_SIZE > + /* r3 still contains the return code from kvmppc_handle_exit(). */ > + blr > + > +/* Registers: > + * r3: kvm_run pointer > + * r4: vcpu pointer > + */ > +_GLOBAL(__kvmppc_vcpu_run) > + stwu r1, -HOST_STACK_SIZE(r1) > + PPC_STL r1, VCPU_HOST_STACK(r4) /* Save stack pointer to vcpu. */ > + > + /* Save host state to stack. */ > + PPC_STL r3, HOST_RUN(r1) > + mflr r3 > + PPC_STL r3, HOST_STACK_LR(r1) > + > + /* Save host non-volatile register state to stack. */ > + PPC_STL r14, HOST_NV_GPR(r14)(r1) > + PPC_STL r15, HOST_NV_GPR(r15)(r1) > + PPC_STL r16, HOST_NV_GPR(r16)(r1) > + PPC_STL r17, HOST_NV_GPR(r17)(r1) > + PPC_STL r18, HOST_NV_GPR(r18)(r1) > + PPC_STL r19, HOST_NV_GPR(r19)(r1) > + PPC_STL r20, HOST_NV_GPR(r20)(r1) > + PPC_STL r21, HOST_NV_GPR(r21)(r1) > + PPC_STL r22, HOST_NV_GPR(r22)(r1) > + PPC_STL r23, HOST_NV_GPR(r23)(r1) > + PPC_STL r24, HOST_NV_GPR(r24)(r1) > + PPC_STL r25, HOST_NV_GPR(r25)(r1) > + PPC_STL r26, HOST_NV_GPR(r26)(r1) > + PPC_STL r27, HOST_NV_GPR(r27)(r1) > + PPC_STL r28, HOST_NV_GPR(r28)(r1) > + PPC_STL r29, HOST_NV_GPR(r29)(r1) > + PPC_STL r30, HOST_NV_GPR(r30)(r1) > + PPC_STL r31, HOST_NV_GPR(r31)(r1) > + > + /* Load guest non-volatiles. 
*/ > + PPC_LL r14, VCPU_GPR(r14)(r4) > + PPC_LL r15, VCPU_GPR(r15)(r4) > + PPC_LL r16, VCPU_GPR(r16)(r4) > + PPC_LL r17, VCPU_GPR(r17)(r4) > + PPC_LL r18, VCPU_GPR(r18)(r4) > + PPC_LL r19, VCPU_GPR(r19)(r4) > + PPC_LL r20, VCPU_GPR(r20)(r4) > + PPC_LL r21, VCPU_GPR(r21)(r4) > + PPC_LL r22, VCPU_GPR(r22)(r4) > + PPC_LL r23, VCPU_GPR(r23)(r4) > + PPC_LL r24, VCPU_GPR(r24)(r4) > + PPC_LL r25, VCPU_GPR(r25)(r4) > + PPC_LL r26, VCPU_GPR(r26)(r4) > + PPC_LL r27, VCPU_GPR(r27)(r4) > + PPC_LL r28, VCPU_GPR(r28)(r4) > + PPC_LL r29, VCPU_GPR(r29)(r4) > + PPC_LL r30, VCPU_GPR(r30)(r4) > + PPC_LL r31, VCPU_GPR(r31)(r4) > + > + > +lightweight_exit: > + PPC_STL r2, HOST_R2(r1) > + > + mfspr r3, SPRN_PID > + stw r3, VCPU_HOST_PID(r4) > + lwz r3, VCPU_GUEST_PID(r4) > + mtspr SPRN_PID, r3 > + > + /* Save vcpu pointer for the exception handlers > + * must be done before loading guest r2. > + */ > +// SET_VCPU(r4) hm? > + > + PPC_LL r11, VCPU_SHARED(r4) > + /* Save host mas4 and mas6 and load guest MAS registers */ > + mfspr r3, SPRN_MAS4 > + stw r3, VCPU_HOST_MAS4(r4) > + mfspr r3, SPRN_MAS6 > + stw r3, VCPU_HOST_MAS6(r4) > + lwz r3, VCPU_SHARED_MAS0(r11) > + lwz r5, VCPU_SHARED_MAS1(r11) > +#ifndef CONFIG_64BIT see above > + lwz r6, (VCPU_SHARED_MAS2 + 4)(r11) > +#else > + ld r6, (VCPU_SHARED_MAS2)(r11) > +#endif > + lwz r7, VCPU_SHARED_MAS7_3+4(r11) > + lwz r8, VCPU_SHARED_MAS4(r11) > + mtspr SPRN_MAS0, r3 > + mtspr SPRN_MAS1, r5 > + mtspr SPRN_MAS2, r6 > + mtspr SPRN_MAS3, r7 > + mtspr SPRN_MAS4, r8 > + lwz r3, VCPU_SHARED_MAS6(r11) > + lwz r5, VCPU_SHARED_MAS7_3+0(r11) > + mtspr SPRN_MAS6, r3 > + mtspr SPRN_MAS7, r5 > + /* Disable MAS register updates via exception */ > + mfspr r3, SPRN_EPCR > + oris r3, r3, SPRN_EPCR_DMIUH@h > + mtspr SPRN_EPCR, r3 Shouldn't this happen before you set the MAS registers? :) > + > + /* > + * Host interrupt handlers may have clobbered these guest-readable > + * SPRGs, so we need to reload them here with the guest's values. 
> + */ > + lwz r3, VCPU_VRSAVE(r4) > + lwz r5, VCPU_SHARED_SPRG4(r11) > + mtspr SPRN_VRSAVE, r3 > + lwz r6, VCPU_SHARED_SPRG5(r11) > + mtspr SPRN_SPRG4W, r5 > + lwz r7, VCPU_SHARED_SPRG6(r11) > + mtspr SPRN_SPRG5W, r6 > + lwz r8, VCPU_SHARED_SPRG7(r11) > + mtspr SPRN_SPRG6W, r7 > + mtspr SPRN_SPRG7W, r8 > + > + /* Load some guest volatiles. */ > + PPC_LL r3, VCPU_LR(r4) > + PPC_LL r5, VCPU_XER(r4) > + PPC_LL r6, VCPU_CTR(r4) > + PPC_LL r7, VCPU_CR(r4) > + PPC_LL r8, VCPU_PC(r4) > +#ifndef CONFIG_64BIT > + lwz r9, (VCPU_SHARED_MSR + 4)(r11) > +#else > + ld r9, (VCPU_SHARED_MSR)(r11) > +#endif > + PPC_LL r0, VCPU_GPR(r0)(r4) > + PPC_LL r1, VCPU_GPR(r1)(r4) > + PPC_LL r2, VCPU_GPR(r2)(r4) > + PPC_LL r10, VCPU_GPR(r10)(r4) > + PPC_LL r11, VCPU_GPR(r11)(r4) > + PPC_LL r12, VCPU_GPR(r12)(r4) > + PPC_LL r13, VCPU_GPR(r13)(r4) > + mtlr r3 > + mtxer r5 > + mtctr r6 > + mtcr r7 > + mtsrr0 r8 > + mtsrr1 r9 Are you sure this should be shared->msr, not shadow_msr? > + > +#ifdef CONFIG_KVM_EXIT_TIMING > + /* save enter time */ > +1: > + mfspr r6, SPRN_TBRU > + mfspr r7, SPRN_TBRL > + mfspr r8, SPRN_TBRU > + cmpw r8, r6 > + PPC_STL r7, VCPU_TIMING_LAST_ENTER_TBL(r4) > + bne 1b > + PPC_STL r8, VCPU_TIMING_LAST_ENTER_TBU(r4) > +#endif > + > + /* Finish loading guest volatiles and jump to guest. 
*/ > + PPC_LL r5, VCPU_GPR(r5)(r4) > + PPC_LL r6, VCPU_GPR(r6)(r4) > + PPC_LL r7, VCPU_GPR(r7)(r4) > + PPC_LL r8, VCPU_GPR(r8)(r4) > + PPC_LL r9, VCPU_GPR(r9)(r4) > + > + PPC_LL r3, VCPU_GPR(r3)(r4) > + PPC_LL r4, VCPU_GPR(r4)(r4) > + rfi > diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c > index 42701e5..f9c62dd 100644 > --- a/arch/powerpc/kvm/powerpc.c > +++ b/arch/powerpc/kvm/powerpc.c > @@ -109,6 +109,11 @@ int kvmppc_sanity_check(struct kvm_vcpu *vcpu) > goto out; > #endif > > +#ifdef CONFIG_KVM_BOOKE_HV > + if (!cpu_has_feature(CPU_FTR_EMB_HV)) > + goto out; > +#endif > + > r = true; > > out: > diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h > index 8167d42..bf191e7 100644 > --- a/arch/powerpc/kvm/timing.h > +++ b/arch/powerpc/kvm/timing.h > @@ -93,6 +93,12 @@ static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type) > case SIGNAL_EXITS: > vcpu->stat.signal_exits++; > break; > + case DBELL_EXITS: > + vcpu->stat.dbell_exits++; > + break; > + case GDBELL_EXITS: > + vcpu->stat.gdbell_exits++; > + break; > } > } > > -- > 1.7.7.rc3.4.g8d714 > >
On 01/09/2012 11:46 AM, Alexander Graf wrote: > > On 21.12.2011, at 02:34, Scott Wood wrote: > >> Chips such as e500mc that implement category E.HV in Power ISA 2.06 >> provide hardware virtualization features, including a new MSR mode for >> guest state. The guest OS can perform many operations without trapping >> into the hypervisor, including transitions to and from guest userspace. >> >> Since we can use SRR1[GS] to reliably tell whether an exception came from >> guest state, instead of messing around with IVPR, we use DO_KVM similarly >> to book3s. > > Is there any benefit of using DO_KVM? I would assume that messing with IVPR is faster. Using the GS bit to decide which handler to run means we won't get confused if a machine check or critical interrupt happens between entering/exiting the guest and updating IVPR (we could use the IS bit similarly in PR-mode). This could be supplemented with IVPR (though that will add a few cycles to guest entry/exit) or some sort of runtime patching (would be more coarse-grained, active when any KVM guest exists) to avoid adding overhead to traps when KVM is not used, but I'd like to quantify that overhead first. It should be much lower than what happens on book3s. >> Current issues include: >> - Machine checks from guest state are not routed to the host handler. >> - The guest can cause a host oops by executing an emulated instruction >> in a page that lacks read permission. Existing e500/4xx support has >> the same problem. > > We solve that in book3s pr by doing > > LAST_INST = <known bad value>; > PACA->kvm_mode = <recover at next inst>; > lwz(guest pc); > do_more_stuff(); > > That way when an exception occurs at lwz() the DO_KVM handler checks that we're in kvm mode "recover" which does basically srr0+=4; rfi;. 
I was thinking we'd check ESR[EPID] or SRR1[IS] as appropriate, and treat it as a kernel fault (search exception table) -- but this works too and is a bit cleaner (could be other uses of external pid), at the expense of a couple extra instructions in the emulation path (but probably a slightly faster host TLB handler). The check wouldn't go in DO_KVM, though, since on bookehv that only deals with diverting flow when xSRR1[GS] is set, which wouldn't be the case here. >> @@ -243,16 +324,20 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, >> case BOOKE_IRQPRIO_AP_UNAVAIL: >> case BOOKE_IRQPRIO_ALIGNMENT: >> allowed = 1; >> - msr_mask = MSR_CE|MSR_ME|MSR_DE; >> + msr_mask = MSR_GS | MSR_CE | MSR_ME | MSR_DE; > > No need to do this. You already force MSR_GS in set_msr(); OK. This was here since before set_msr() started doing that. :-) >> + if (!current->thread.kvm_vcpu) { >> + WARN(1, "no vcpu\n"); >> + return -EPERM; >> + } > > Huh? Oops, leftover debugging. >> +static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) >> +{ >> + enum emulation_result er; >> + >> + er = kvmppc_emulate_instruction(run, vcpu); >> + switch (er) { >> + case EMULATE_DONE: >> + /* don't overwrite subtypes, just account kvm_stats */ >> + kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS); >> + /* Future optimization: only reload non-volatiles if >> + * they were actually modified by emulation. */ >> + return RESUME_GUEST_NV; >> + >> + case EMULATE_DO_DCR: >> + run->exit_reason = KVM_EXIT_DCR; >> + return RESUME_HOST; >> + >> + case EMULATE_FAIL: >> + /* XXX Deliver Program interrupt to guest. */ >> + printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", >> + __func__, vcpu->arch.regs.nip, vcpu->arch.last_inst); > > This should be throttled, otherwise the guest can spam our logs. Yes it should, but I'm just moving the code here. >> + /* For debugging, encode the failing instruction and >> + * report it to userspace. 
*/ >> + run->hw.hardware_exit_reason = ~0ULL << 32; >> + run->hw.hardware_exit_reason |= vcpu->arch.last_inst; > > > I'm fairly sure you want to fix this :) Likewise, that's what booke.c already does. What should it do instead? > /** >> * kvmppc_handle_exit >> * >> @@ -374,12 +530,39 @@ out: >> int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, >> unsigned int exit_nr) >> { >> - enum emulation_result er; >> int r = RESUME_HOST; >> >> /* update before a new last_exit_type is rewritten */ >> kvmppc_update_timing_stats(vcpu); >> >> + /* >> + * If we actually care, we could copy MSR, DEAR, and ESR to regs, >> + * insert an appropriate trap number, etc. >> + * >> + * Seems like a waste of cycles for something that should only matter >> + * to someone using sysrq-t/p or similar host kernel debug facility. >> + * We have other debug facilities to get that information from a >> + * guest through userspace. >> + */ >> + switch (exit_nr) { >> + case BOOKE_INTERRUPT_EXTERNAL: >> + do_IRQ(&vcpu->arch.regs); > > Ah, so that's what you want to use regs for. So is having a pt_regs > struct that only contains useful register values in half its fields > any useful here? Or could we keep control of the registers ourselves, > enabling us to maybe one day optimize things more. I think it contains enough to be useful for debugging code such as sysrq and tracers, and as noted in the comment we could copy the rest if we care enough. MSR might be worth copying. It will eventually be used for machine checks as well, which I'd like to hand reasonable register state to, at least for GPRs, LR, and PC. If there's a good enough performance reason, we could just copy everything over for machine checks and pass NULL to do_IRQ (I think it can take this -- a dummy regs struct if not), but it seems premature at the moment unless the switch already causes measured performance loss (cache utilization?). 
>> @@ -387,30 +570,56 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, >> >> switch (exit_nr) { >> case BOOKE_INTERRUPT_MACHINE_CHECK: >> - printk("MACHINE CHECK: %lx\n", mfspr(SPRN_MCSR)); >> - kvmppc_dump_vcpu(vcpu); >> - r = RESUME_HOST; >> + kvm_resched(vcpu); >> + r = RESUME_GUEST; > > huh? Patch shuffling accident -- this belongs with a later patch that invokes the host machine check handler similar to what is done with do_IRQ(). The host machine check handler needs some work first, though. >> break; >> >> case BOOKE_INTERRUPT_EXTERNAL: >> kvmppc_account_exit(vcpu, EXT_INTR_EXITS); >> - if (need_resched()) >> - cond_resched(); >> + kvm_resched(vcpu); > > Why are we explicit about the resched? On book3s I just call kvm_resched(vcpu) before the switch(). There are a few exit types where we don't currently do the resched -- if they're all bugs or don't-cares, we could move it out of the switch. We probably should defer the check until after we've disabled interrupts, similar to signals -- even if we didn't exit for an interrupt, we could have received one after enabling them. >> + if (kvm_is_visible_gfn(vcpu->kvm, gfn)) { >> + /* The guest TLB had a mapping, but the shadow TLB >> + * didn't. This could be because: >> + * a) the entry is mapping the host kernel, or >> + * b) the guest used a large mapping which we're faking >> + * Either way, we need to satisfy the fault without >> + * invoking the guest. */ >> + kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index); >> + } else { >> + /* Guest mapped and leaped at non-RAM! */ >> + kvmppc_booke_queue_irqprio(vcpu, >> + BOOKE_IRQPRIO_MACHINE_CHECK); > > Are you sure? Couldn't this also be MMIO? That doesn't really improve the situation as executing from MMIO is tricky with the KVM model, but it's not necessarily bad. Oh well, I guess we'll have to do something and throwing an #MC isn't all that ugly. 
I think I asked you about executing from MMIO once, and you said it wasn't supported even in straight QEMU. Have things changed? >> diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h >> index 05d1d99..d53bcf2 100644 >> --- a/arch/powerpc/kvm/booke.h >> +++ b/arch/powerpc/kvm/booke.h >> @@ -48,7 +48,20 @@ >> #define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19 >> /* Internal pseudo-irqprio for level triggered externals */ >> #define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20 >> -#define BOOKE_IRQPRIO_MAX 20 >> +#define BOOKE_IRQPRIO_DBELL 21 >> +#define BOOKE_IRQPRIO_DBELL_CRIT 22 >> +#define BOOKE_IRQPRIO_MAX 23 > > So was MAX wrong before or is it too big now? MAX is just a marker for how many IRQPRIOs we have, not any sort of external limit. This patch adds new IRQPRIOs, so MAX goes up. The actual limit is the number of bits in a long. >> + .if \flags & NEED_EMU >> + lwz r9, VCPU_KVM(r4) > > writing r9 > >> + .endif >> + >> +#ifdef CONFIG_KVM_EXIT_TIMING >> + /* save exit time */ >> +1: mfspr r7, SPRN_TBRU >> + mfspr r8, SPRN_TBRL >> + mfspr r9, SPRN_TBRU > > overwriting r9 again? Oops. It's RFC for a reason. :-) >> +#ifndef CONFIG_64BIT > > Double negation is always hard to read. Please reverse the ifdef :) OK. >> +lightweight_exit: >> + PPC_STL r2, HOST_R2(r1) >> + >> + mfspr r3, SPRN_PID >> + stw r3, VCPU_HOST_PID(r4) >> + lwz r3, VCPU_GUEST_PID(r4) >> + mtspr SPRN_PID, r3 >> + >> + /* Save vcpu pointer for the exception handlers >> + * must be done before loading guest r2. >> + */ >> +// SET_VCPU(r4) > > hm? Can just be removed, it's handled in booke's vcpu load/put. 
>> + lwz r6, (VCPU_SHARED_MAS2 + 4)(r11) >> +#else >> + ld r6, (VCPU_SHARED_MAS2)(r11) >> +#endif >> + lwz r7, VCPU_SHARED_MAS7_3+4(r11) >> + lwz r8, VCPU_SHARED_MAS4(r11) >> + mtspr SPRN_MAS0, r3 >> + mtspr SPRN_MAS1, r5 >> + mtspr SPRN_MAS2, r6 >> + mtspr SPRN_MAS3, r7 >> + mtspr SPRN_MAS4, r8 >> + lwz r3, VCPU_SHARED_MAS6(r11) >> + lwz r5, VCPU_SHARED_MAS7_3+0(r11) >> + mtspr SPRN_MAS6, r3 >> + mtspr SPRN_MAS7, r5 >> + /* Disable MAS register updates via exception */ >> + mfspr r3, SPRN_EPCR >> + oris r3, r3, SPRN_EPCR_DMIUH@h >> + mtspr SPRN_EPCR, r3 > > Shouldn't this happen before you set the MAS registers? :) Yes (though we really shouldn't be getting a TLB miss here, at least on e500mc). >> + /* Load some guest volatiles. */ >> + PPC_LL r3, VCPU_LR(r4) >> + PPC_LL r5, VCPU_XER(r4) >> + PPC_LL r6, VCPU_CTR(r4) >> + PPC_LL r7, VCPU_CR(r4) >> + PPC_LL r8, VCPU_PC(r4) >> +#ifndef CONFIG_64BIT >> + lwz r9, (VCPU_SHARED_MSR + 4)(r11) >> +#else >> + ld r9, (VCPU_SHARED_MSR)(r11) >> +#endif >> + PPC_LL r0, VCPU_GPR(r0)(r4) >> + PPC_LL r1, VCPU_GPR(r1)(r4) >> + PPC_LL r2, VCPU_GPR(r2)(r4) >> + PPC_LL r10, VCPU_GPR(r10)(r4) >> + PPC_LL r11, VCPU_GPR(r11)(r4) >> + PPC_LL r12, VCPU_GPR(r12)(r4) >> + PPC_LL r13, VCPU_GPR(r13)(r4) >> + mtlr r3 >> + mtxer r5 >> + mtctr r6 >> + mtcr r7 >> + mtsrr0 r8 >> + mtsrr1 r9 > > Are you sure this should be shared->msr, not shadow_msr? Yes, we don't use shadow_msr on bookehv. I'll add a comment in the struct definition as discussed in the other thread, as well as other areas where there are subtle differences between PR-mode and GS-mode. -Scott
On 10.01.2012, at 01:51, Scott Wood wrote: > On 01/09/2012 11:46 AM, Alexander Graf wrote: >> >> On 21.12.2011, at 02:34, Scott Wood wrote: >> >>> Chips such as e500mc that implement category E.HV in Power ISA 2.06 >>> provide hardware virtualization features, including a new MSR mode for >>> guest state. The guest OS can perform many operations without trapping >>> into the hypervisor, including transitions to and from guest userspace. >>> >>> Since we can use SRR1[GS] to reliably tell whether an exception came from >>> guest state, instead of messing around with IVPR, we use DO_KVM similarly >>> to book3s. >> >> Is there any benefit of using DO_KVM? I would assume that messing with IVPR is faster. > > Using the GS bit to decide which handler to run means we won't get > confused if a machine check or critical interrupt happens between > entering/exiting the guest and updating IVPR (we could use the IS bit > similarly in PR-mode). > > This could be supplemented with IVPR (though that will add a few cycles > to guest entry/exit) or some sort of runtime patching (would be more > coarse-grained, active when any KVM guest exists) to avoid adding > overhead to traps when KVM is not used, but I'd like to quantify that > overhead first. It should be much lower than what happens on book3s. Hrm. Yeah, given that your DO_KVM handler is so much simpler, it might make sense to stick with that method. Benchmarks would be useful in the long run though. > >>> Current issues include: >>> - Machine checks from guest state are not routed to the host handler. >>> - The guest can cause a host oops by executing an emulated instruction >>> in a page that lacks read permission. Existing e500/4xx support has >>> the same problem. 
>> >> We solve that in book3s pr by doing >> >> LAST_INST = <known bad value>; >> PACA->kvm_mode = <recover at next inst>; >> lwz(guest pc); >> do_more_stuff(); >> >> That way when an exception occurs at lwz() the DO_KVM handler checks that we're in kvm mode "recover" which does basically srr0+=4; rfi;. > > I was thinking we'd check ESR[EPID] or SRR1[IS] as appropriate, and > treat it as a kernel fault (search exception table) -- but this works > too and is a bit cleaner (could be other uses of external pid), at the > expense of a couple extra instructions in the emulation path (but > probably a slightly faster host TLB handler). > > The check wouldn't go in DO_KVM, though, since on bookehv that only > deals with diverting flow when xSRR1[GS] is set, which wouldn't be the > case here. Yup, not sure where you'd put the check, as it'd slow down normal operation too. Hrm. > >>> @@ -243,16 +324,20 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, >>> case BOOKE_IRQPRIO_AP_UNAVAIL: >>> case BOOKE_IRQPRIO_ALIGNMENT: >>> allowed = 1; >>> - msr_mask = MSR_CE|MSR_ME|MSR_DE; >>> + msr_mask = MSR_GS | MSR_CE | MSR_ME | MSR_DE; >> >> No need to do this. You already force MSR_GS in set_msr(); > > OK. This was here since before set_msr() started doing that. :-) > >>> + if (!current->thread.kvm_vcpu) { >>> + WARN(1, "no vcpu\n"); >>> + return -EPERM; >>> + } >> >> Huh? > > Oops, leftover debugging. > >>> +static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) >>> +{ >>> + enum emulation_result er; >>> + >>> + er = kvmppc_emulate_instruction(run, vcpu); >>> + switch (er) { >>> + case EMULATE_DONE: >>> + /* don't overwrite subtypes, just account kvm_stats */ >>> + kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS); >>> + /* Future optimization: only reload non-volatiles if >>> + * they were actually modified by emulation. 
*/ >>> + return RESUME_GUEST_NV; >>> + >>> + case EMULATE_DO_DCR: >>> + run->exit_reason = KVM_EXIT_DCR; >>> + return RESUME_HOST; >>> + >>> + case EMULATE_FAIL: >>> + /* XXX Deliver Program interrupt to guest. */ >>> + printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", >>> + __func__, vcpu->arch.regs.nip, vcpu->arch.last_inst); >> >> This should be throttled, otherwise the guest can spam our logs. > > Yes it should, but I'm just moving the code here. Yeah, only realized this later. Maybe next time (not for this patch set, next time you're sending something) just extract these mechanical parts, so it's easier to review the pieces where code actually changes :). > >>> + /* For debugging, encode the failing instruction and >>> + * report it to userspace. */ >>> + run->hw.hardware_exit_reason = ~0ULL << 32; >>> + run->hw.hardware_exit_reason |= vcpu->arch.last_inst; >> >> >> I'm fairly sure you want to fix this :) > > Likewise, that's what booke.c already does. What should it do instead? This is what book3s does: case EMULATE_FAIL: printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); kvmppc_core_queue_program(vcpu, flags); r = RESUME_GUEST; which also doesn't throttle the printk, but I think injecting a program fault into the guest is the most sensible thing to do if we don't know what the instruction is supposed to do. Best case we get an oops inside the guest telling us what broke :). > >> /** >>> * kvmppc_handle_exit >>> * >>> @@ -374,12 +530,39 @@ out: >>> int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, >>> unsigned int exit_nr) >>> { >>> - enum emulation_result er; >>> int r = RESUME_HOST; >>> >>> /* update before a new last_exit_type is rewritten */ >>> kvmppc_update_timing_stats(vcpu); >>> >>> + /* >>> + * If we actually care, we could copy MSR, DEAR, and ESR to regs, >>> + * insert an appropriate trap number, etc. 
>>> + * >>> + * Seems like a waste of cycles for something that should only matter >>> + * to someone using sysrq-t/p or similar host kernel debug facility. >>> + * We have other debug facilities to get that information from a >>> + * guest through userspace. >>> + */ >>> + switch (exit_nr) { >>> + case BOOKE_INTERRUPT_EXTERNAL: >>> + do_IRQ(&vcpu->arch.regs); >> >> Ah, so that's what you want to use regs for. So is having a pt_regs >> struct that only contains useful register values in half its fields >> any useful here? Or could we keep control of the registers ourselves, >> enabling us to maybe one day optimize things more. > > I think it contains enough to be useful for debugging code such as sysrq > and tracers, and as noted in the comment we could copy the rest if we > care enough. MSR might be worth copying. > > It will eventually be used for machine checks as well, which I'd like to > hand reasonable register state to, at least for GPRs, LR, and PC. > > If there's a good enough performance reason, we could just copy > everything over for machine checks and pass NULL to do_IRQ (I think it > can take this -- a dummy regs struct if not), but it seems premature at > the moment unless the switch already causes measured performance loss > (cache utilization?). I'm definitely not concerned about performance, but complexity and uniqueness. With the pt_regs struct, we have a bunch of fields in the vcpu that are there, but unused. I find that situation pretty confusing. So yes, I would definitely prefer to copy registers during MC and keep the registers where they are today - unless there are SPRs for them of course. Imagine we'd one day want to share GPRs with user space through the kvm_run structure (see the s390 patches on the ML for this). I really wouldn't want to make pt_regs part of our userspace ABI. 
> >>> @@ -387,30 +570,56 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, >>> >>> switch (exit_nr) { >>> case BOOKE_INTERRUPT_MACHINE_CHECK: >>> - printk("MACHINE CHECK: %lx\n", mfspr(SPRN_MCSR)); >>> - kvmppc_dump_vcpu(vcpu); >>> - r = RESUME_HOST; >>> + kvm_resched(vcpu); >>> + r = RESUME_GUEST; >> >> huh? > > Patch shuffling accident -- this belongs with a later patch that invokes > the host machine check handler similar to what is done with do_IRQ(). > The host machine check handler needs some work first, though. > >>> break; >>> >>> case BOOKE_INTERRUPT_EXTERNAL: >>> kvmppc_account_exit(vcpu, EXT_INTR_EXITS); >>> - if (need_resched()) >>> - cond_resched(); >>> + kvm_resched(vcpu); >> >> Why are we explicit about the resched? On book3s I just call kvm_resched(vcpu) before the switch(). > > There are a few exit types where we don't currently do the resched -- if > they're all bugs or don't-cares, we could move it out of the switch. > > We probably should defer the check until after we've disabled > interrupts, similar to signals -- even if we didn't exit for an > interrupt, we could have received one after enabling them. Yup. I just don't think you can call resched() with interrupts disabled, so a bit cleverness is probably required here. > >>> + if (kvm_is_visible_gfn(vcpu->kvm, gfn)) { >>> + /* The guest TLB had a mapping, but the shadow TLB >>> + * didn't. This could be because: >>> + * a) the entry is mapping the host kernel, or >>> + * b) the guest used a large mapping which we're faking >>> + * Either way, we need to satisfy the fault without >>> + * invoking the guest. */ >>> + kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index); >>> + } else { >>> + /* Guest mapped and leaped at non-RAM! */ >>> + kvmppc_booke_queue_irqprio(vcpu, >>> + BOOKE_IRQPRIO_MACHINE_CHECK); >> >> Are you sure? Couldn't this also be MMIO? That doesn't really improve the situation as executing from MMIO is tricky with the KVM model, but it's not necessarily bad. 
Oh well, I guess we'll have to do something and throwing an #MC isn't all that ugly. > > I think I asked you about executing from MMIO once, and you said it > wasn't supported even in straight QEMU. Have things changed? Yeah, I talked to Anthony about that part and apparently the QEMU design does support execution from MMIO. But don't worry about it for now. I don't think we'll really have guest OSs doing this. And if they do, we can worry about it then. > >>> diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h >>> index 05d1d99..d53bcf2 100644 >>> --- a/arch/powerpc/kvm/booke.h >>> +++ b/arch/powerpc/kvm/booke.h >>> @@ -48,7 +48,20 @@ >>> #define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19 >>> /* Internal pseudo-irqprio for level triggered externals */ >>> #define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20 >>> -#define BOOKE_IRQPRIO_MAX 20 >>> +#define BOOKE_IRQPRIO_DBELL 21 >>> +#define BOOKE_IRQPRIO_DBELL_CRIT 22 >>> +#define BOOKE_IRQPRIO_MAX 23 >> >> So was MAX wrong before or is it too big now? > > MAX is just a marker for how many IRQPRIOs we have, not any sort of > external limit. This patch adds new IRQPRIOs, so MAX goes up. > > The actual limit is the number of bits in a long. Yes, and before the highest value was 20 with MAX being 20, now the highest value is 22 with MAX being 23. Either MAX == highest number or MAX == highest number + 1, but you're changing the semantics of MAX here. Maybe it was wrong before, I don't know, hence I'm asking :). > >>> + .if \flags & NEED_EMU >>> + lwz r9, VCPU_KVM(r4) >> >> writing r9 >> >>> + .endif >>> + >>> +#ifdef CONFIG_KVM_EXIT_TIMING >>> + /* save exit time */ >>> +1: mfspr r7, SPRN_TBRU >>> + mfspr r8, SPRN_TBRL >>> + mfspr r9, SPRN_TBRU >> >> overwriting r9 again? > > Oops. It's RFC for a reason. :-) > >>> +#ifndef CONFIG_64BIT >> >> Double negation is always hard to read. Please reverse the ifdef :) > > OK. 
> >>> +lightweight_exit: >>> + PPC_STL r2, HOST_R2(r1) >>> + >>> + mfspr r3, SPRN_PID >>> + stw r3, VCPU_HOST_PID(r4) >>> + lwz r3, VCPU_GUEST_PID(r4) >>> + mtspr SPRN_PID, r3 >>> + >>> + /* Save vcpu pointer for the exception handlers >>> + * must be done before loading guest r2. >>> + */ >>> +// SET_VCPU(r4) >> >> hm? > > Can just be removed, it's handled in booke's vcpu load/put. > >>> + lwz r6, (VCPU_SHARED_MAS2 + 4)(r11) >>> +#else >>> + ld r6, (VCPU_SHARED_MAS2)(r11) >>> +#endif >>> + lwz r7, VCPU_SHARED_MAS7_3+4(r11) >>> + lwz r8, VCPU_SHARED_MAS4(r11) >>> + mtspr SPRN_MAS0, r3 >>> + mtspr SPRN_MAS1, r5 >>> + mtspr SPRN_MAS2, r6 >>> + mtspr SPRN_MAS3, r7 >>> + mtspr SPRN_MAS4, r8 >>> + lwz r3, VCPU_SHARED_MAS6(r11) >>> + lwz r5, VCPU_SHARED_MAS7_3+0(r11) >>> + mtspr SPRN_MAS6, r3 >>> + mtspr SPRN_MAS7, r5 >>> + /* Disable MAS register updates via exception */ >>> + mfspr r3, SPRN_EPCR >>> + oris r3, r3, SPRN_EPCR_DMIUH@h >>> + mtspr SPRN_EPCR, r3 >> >> Shouldn't this happen before you set the MAS registers? :) > > Yes (though we really shouldn't be getting a TLB miss here, at least on > e500mc). Yeah, but the way it's now it gives you a false feeling of security :) > >>> + /* Load some guest volatiles. */ >>> + PPC_LL r3, VCPU_LR(r4) >>> + PPC_LL r5, VCPU_XER(r4) >>> + PPC_LL r6, VCPU_CTR(r4) >>> + PPC_LL r7, VCPU_CR(r4) >>> + PPC_LL r8, VCPU_PC(r4) >>> +#ifndef CONFIG_64BIT >>> + lwz r9, (VCPU_SHARED_MSR + 4)(r11) >>> +#else >>> + ld r9, (VCPU_SHARED_MSR)(r11) >>> +#endif >>> + PPC_LL r0, VCPU_GPR(r0)(r4) >>> + PPC_LL r1, VCPU_GPR(r1)(r4) >>> + PPC_LL r2, VCPU_GPR(r2)(r4) >>> + PPC_LL r10, VCPU_GPR(r10)(r4) >>> + PPC_LL r11, VCPU_GPR(r11)(r4) >>> + PPC_LL r12, VCPU_GPR(r12)(r4) >>> + PPC_LL r13, VCPU_GPR(r13)(r4) >>> + mtlr r3 >>> + mtxer r5 >>> + mtctr r6 >>> + mtcr r7 >>> + mtsrr0 r8 >>> + mtsrr1 r9 >> >> Are you sure this should be shared->msr, not shadow_msr? > > Yes, we don't use shadow_msr on bookehv. 
I'll add a comment in the > struct definition as discussed in the other thread, as well as other > areas where there are subtle differences between PR-mode and GS-mode. Thanks! Alex
On 01/09/2012 09:11 PM, Alexander Graf wrote: > On 10.01.2012, at 01:51, Scott Wood wrote: >> On 01/09/2012 11:46 AM, Alexander Graf wrote: >>> On 21.12.2011, at 02:34, Scott Wood wrote: >>>> + /* For debugging, encode the failing instruction and >>>> + * report it to userspace. */ >>>> + run->hw.hardware_exit_reason = ~0ULL << 32; >>>> + run->hw.hardware_exit_reason |= vcpu->arch.last_inst; >>> >>> >>> I'm fairly sure you want to fix this :) >> >> Likewise, that's what booke.c already does. What should it do instead? > > This is what book3s does: > > case EMULATE_FAIL: > printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", > __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); > kvmppc_core_queue_program(vcpu, flags); > r = RESUME_GUEST; > > which also doesn't throttle the printk, but I think injecting a > program fault into the guest is the most sensible thing to do if we > don't know what the instruction is supposed to do. Best case we get > an oops inside the guest telling us what broke :). Ah, yes, it should send a program check. >>> Ah, so that's what you want to use regs for. So is having a pt_regs >>> struct that only contains useful register values in half its fields >>> any useful here? Or could we keep control of the registers ourselves, >>> enabling us to maybe one day optimize things more. >> >> I think it contains enough to be useful for debugging code such as sysrq >> and tracers, and as noted in the comment we could copy the rest if we >> care enough. MSR might be worth copying. >> >> It will eventually be used for machine checks as well, which I'd like to >> hand reasonable register state to, at least for GPRs, LR, and PC. >> >> If there's a good enough performance reason, we could just copy >> everything over for machine checks and pass NULL to do_IRQ (I think it >> can take this -- a dummy regs struct if not), but it seems premature at >> the moment unless the switch already causes measured performance loss >> (cache utilization?). 
> > I'm definitely not concerned about performance, but complexity and uniqueness. > > With the pt_regs struct, we have a bunch of fields in the vcpu that are there, but unused. I find that situation pretty confusing. I removed the registers from the vcpu, that are to be used in regs instead. There are a few fields in regs that are not valid, though it is explicitly pointed out via a comment. > So yes, I would definitely prefer to copy registers during MC and keep the registers where they are today - unless there are SPRs for them of course. > > Imagine we'd one day want to share GPRs with user space through the > kvm_run structure (see the s390 patches on the ML for this). I really > wouldn't want to make pt_regs part of our userspace ABI. Neither would I. If that's something that's reasonably likely to happen, I guess that's a good enough reason to avoid this. We could always add later a debug option to copy regs even on normal interrupts, if needed. >> We probably should defer the check until after we've disabled >> interrupts, similar to signals -- even if we didn't exit for an >> interrupt, we could have received one after enabling them. > > Yup. I just don't think you can call resched() with interrupts disabled, so a bit cleverness is probably required here. I think it is actually allowed, but interrupts will be enabled on return. We'll need to repeat prepare_to_enter if we do schedule. Since we already need special handling for that, we might as well add a local_irq_enable() once we know we are going to schedule, just in case. 
>>>> diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h >>>> index 05d1d99..d53bcf2 100644 >>>> --- a/arch/powerpc/kvm/booke.h >>>> +++ b/arch/powerpc/kvm/booke.h >>>> @@ -48,7 +48,20 @@ >>>> #define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19 >>>> /* Internal pseudo-irqprio for level triggered externals */ >>>> #define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20 >>>> -#define BOOKE_IRQPRIO_MAX 20 >>>> +#define BOOKE_IRQPRIO_DBELL 21 >>>> +#define BOOKE_IRQPRIO_DBELL_CRIT 22 >>>> +#define BOOKE_IRQPRIO_MAX 23 >>> >>> So was MAX wrong before or is it too big now? >> >> MAX is just a marker for how many IRQPRIOs we have, not any sort of >> external limit. This patch adds new IRQPRIOs, so MAX goes up. >> >> The actual limit is the number of bits in a long. > > Yes, and before the highest value was 20 with MAX being 20, now the > highest value is 22 with MAX being 23. Either MAX == highest number > or MAX == highest number + 1, but you're changing the semantics of > MAX here. Maybe it was wrong before, I don't know, hence I'm asking > :). Oh, didn't notice that. Actually, it looks like the two places that reference BOOKE_IRQPRIO_MAX don't agree on what they're expecting. book3s uses "one greater than the highest irqprio", so I guess we should resolve it that way (even though I'd normally expect that to be phrased "num" rather than "max") -- as a separate patch, of course. -Scott
On 10.01.2012, at 23:03, Scott Wood wrote: > On 01/09/2012 09:11 PM, Alexander Graf wrote: >> On 10.01.2012, at 01:51, Scott Wood wrote: >>> On 01/09/2012 11:46 AM, Alexander Graf wrote: >>>> On 21.12.2011, at 02:34, Scott Wood wrote: >>>>> + /* For debugging, encode the failing instruction and >>>>> + * report it to userspace. */ >>>>> + run->hw.hardware_exit_reason = ~0ULL << 32; >>>>> + run->hw.hardware_exit_reason |= vcpu->arch.last_inst; >>>> >>>> >>>> I'm fairly sure you want to fix this :) >>> >>> Likewise, that's what booke.c already does. What should it do instead? >> >> This is what book3s does: >> >> case EMULATE_FAIL: >> printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", >> __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); >> kvmppc_core_queue_program(vcpu, flags); >> r = RESUME_GUEST; >> >> which also doesn't throttle the printk, but I think injecting a >> program fault into the guest is the most sensible thing to do if we >> don't know what the instruction is supposed to do. Best case we get >> an oops inside the guest telling us what broke :). > > Ah, yes, it should send a program check. > >>>> Ah, so that's what you want to use regs for. So is having a pt_regs >>>> struct that only contains useful register values in half its fields >>>> any useful here? Or could we keep control of the registers ourselves, >>>> enabling us to maybe one day optimize things more. >>> >>> I think it contains enough to be useful for debugging code such as sysrq >>> and tracers, and as noted in the comment we could copy the rest if we >>> care enough. MSR might be worth copying. >>> >>> It will eventually be used for machine checks as well, which I'd like to >>> hand reasonable register state to, at least for GPRs, LR, and PC. 
>>> >>> If there's a good enough performance reason, we could just copy >>> everything over for machine checks and pass NULL to do_IRQ (I think it >>> can take this -- a dummy regs struct if not), but it seems premature at >>> the moment unless the switch already causes measured performance loss >>> (cache utilization?). >> >> I'm definitely not concerned about performance, but complexity and uniqueness. >> >> With the pt_regs struct, we have a bunch of fields in the vcpu that are there, but unused. I find that situation pretty confusing. > > I removed the registers from the vcpu, that are to be used in regs instead. > > There are a few fields in regs that are not valid, though it is > explicitly pointed out via a comment. Yes, and if there was real technical reason to do it this way I'd agree. But there isn't. > >> So yes, I would definitely prefer to copy registers during MC and keep the registers where they are today - unless there are SPRs for them of course. >> >> Imagine we'd one day want to share GPRs with user space through the >> kvm_run structure (see the s390 patches on the ML for this). I really >> wouldn't want to make pt_regs part of our userspace ABI. > > Neither would I. If that's something that's reasonably likely to > happen, I guess that's a good enough reason to avoid this. We could > always add later a debug option to copy regs even on normal interrupts, > if needed. Yup. I don't want to walk in the wrong direction basically. The overhead of copying a couple fields to the stack on machine checks doesn't sound too bad compared to the flexibility we maintain by keeping fields under our control. Another imaginary case. I experimented with putting the GPRs into the PACA back in the day. I don't remember why anymore, but it was for some speedup of something. That wouldn't be possible if we mandate everyone to use pt_regs. 
> >>> We probably should defer the check until after we've disabled >>> interrupts, similar to signals -- even if we didn't exit for an >>> interrupt, we could have received one after enabling them. >> >> Yup. I just don't think you can call resched() with interrupts disabled, so a bit cleverness is probably required here. > > I think it is actually allowed, but interrupts will be enabled on > return. We'll need to repeat prepare_to_enter if we do schedule. Since > we already need special handling for that, we might as well add a > local_irq_enable() once we know we are going to schedule, just in case. Yup :). And then check again. > >>>>> diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h >>>>> index 05d1d99..d53bcf2 100644 >>>>> --- a/arch/powerpc/kvm/booke.h >>>>> +++ b/arch/powerpc/kvm/booke.h >>>>> @@ -48,7 +48,20 @@ >>>>> #define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19 >>>>> /* Internal pseudo-irqprio for level triggered externals */ >>>>> #define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20 >>>>> -#define BOOKE_IRQPRIO_MAX 20 >>>>> +#define BOOKE_IRQPRIO_DBELL 21 >>>>> +#define BOOKE_IRQPRIO_DBELL_CRIT 22 >>>>> +#define BOOKE_IRQPRIO_MAX 23 >>>> >>>> So was MAX wrong before or is it too big now? >>> >>> MAX is just a marker for how many IRQPRIOs we have, not any sort of >>> external limit. This patch adds new IRQPRIOs, so MAX goes up. >>> >>> The actual limit is the number of bits in a long. >> > >> Yes, and before the highest value was 20 with MAX being 20, now the >> highest value is 22 with MAX being 23. Either MAX == highest number >> or MAX == highest number + 1, but you're changing the semantics of >> MAX here. Maybe it was wrong before, I don't know, hence I'm asking >> :). > > Oh, didn't notice that. > > Actually, it looks like the two places that reference BOOKE_IRQPRIO_MAX > don't agree on what they're expecting. 
book3s uses "one greater than > the highest irqprio", so I guess we should resolve it that way (even > though I'd normally expect that to be phrased "num" rather than "max") > -- as a separate patch, of course. Yup. As long as it's consistent it's fine. I just really stumbled over this since the semantics of the define changed. Alex
On Tue, 2012-01-10 at 04:11 +0100, Alexander Graf wrote: > This is what book3s does: > > case EMULATE_FAIL: > printk(KERN_CRIT "%s: emulation at %lx failed > (%08x)\n", > __func__, kvmppc_get_pc(vcpu), > kvmppc_get_last_inst(vcpu)); > kvmppc_core_queue_program(vcpu, flags); > r = RESUME_GUEST; > > which also doesn't throttle the printk, but I think injecting a > program fault into the guest is the most sensible thing to do if we > don't know what the instruction is supposed to do. Best case we get an > oops inside the guest telling us what broke :). You can also fall back to a slow path that reads the guest TLB, translates, then reads the instruction. Of course, you have to be careful, as such a manual translate + read + execute needs to be somewhat synchronized with a possible TLB invalidation :-) (MMIO emulation is broken in this regard too, btw.) Cheers, Ben.
On 12.01.2012, at 07:44, Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote: > On Tue, 2012-01-10 at 04:11 +0100, Alexander Graf wrote: >> This is what book3s does: >> >> case EMULATE_FAIL: >> printk(KERN_CRIT "%s: emulation at %lx failed >> (%08x)\n", >> __func__, kvmppc_get_pc(vcpu), >> kvmppc_get_last_inst(vcpu)); >> kvmppc_core_queue_program(vcpu, flags); >> r = RESUME_GUEST; >> >> which also doesn't throttle the printk, but I think injecting a >> program fault into the guest is the most sensible thing to do if we >> don't know what the instruction is supposed to do. Best case we get an >> oops inside the guest telling us what broke :). > > You can also fallback to a slow path that reads the guest TLB, > translates then reads the instruction. Of course you have to be careful > as such a manual translate + read + execute needs to be somewhat > synchronized with a possible TLB invalidation :-) Well we do want to be fast on the default path though. So yes, what you're saying is what book3s does, but as a fallback in case the fast path didn't work. The problem here however is that we don't know if the fast path failed; we oops. > > (MMIO emulation is broken in this regard too btw) Huh? Alex > > Cheers, > Ben. > >
On Thu, Jan 12, 2012 at 05:44:26PM +1100, Benjamin Herrenschmidt wrote: > On Tue, 2012-01-10 at 04:11 +0100, Alexander Graf wrote: > > This is what book3s does: > > > > case EMULATE_FAIL: > > printk(KERN_CRIT "%s: emulation at %lx failed > > (%08x)\n", > > __func__, kvmppc_get_pc(vcpu), > > kvmppc_get_last_inst(vcpu)); > > kvmppc_core_queue_program(vcpu, flags); > > r = RESUME_GUEST; > > > > which also doesn't throttle the printk, but I think injecting a > > program fault into the guest is the most sensible thing to do if we > > don't know what the instruction is supposed to do. Best case we get an > > oops inside the guest telling us what broke :). > > You can also fallback to a slow path that reads the guest TLB, > translates then reads the instruction. Of course you have to be careful > as such a manual translate + read + execute needs to be somewhat > synchronized with a possible TLB invalidation :-) That's how we should deal with a failure to read the instruction due to it being execute-only (once we add the ability to fix up a fault on a booke KVM instruction fetch) -- but the above code is dealing with the case where we read the instruction successfully, but don't have an emulation handler for it. -Scott
On 10.01.2012, at 01:51, Scott Wood wrote: > On 01/09/2012 11:46 AM, Alexander Graf wrote: >> >> On 21.12.2011, at 02:34, Scott Wood wrote: > [...] >>> Current issues include: >>> - Machine checks from guest state are not routed to the host handler. >>> - The guest can cause a host oops by executing an emulated instruction >>> in a page that lacks read permission. Existing e500/4xx support has >>> the same problem. >> >> We solve that in book3s pr by doing >> >> LAST_INST = <known bad value>; >> PACA->kvm_mode = <recover at next inst>; >> lwz(guest pc); >> do_more_stuff(); >> >> That way when an exception occurs at lwz() the DO_KVM handler checks that we're in kvm mode "recover" which does basically srr0+=4; rfi;. > > I was thinking we'd check ESR[EPID] or SRR1[IS] as appropriate, and > treat it as a kernel fault (search exception table) -- but this works > too and is a bit cleaner (could be other uses of external pid), at the > expense of a couple extra instructions in the emulation path (but > probably a slightly faster host TLB handler). > > The check wouldn't go in DO_KVM, though, since on bookehv that only > deals with diverting flow when xSRR1[GS] is set, which wouldn't be the > case here. Thinking about it a bit more, how is this different from a failed get_user()? We can just use the same fixup mechanism as there, right? Alex
On 02/15/2012 01:36 PM, Alexander Graf wrote: > > On 10.01.2012, at 01:51, Scott Wood wrote: >> I was thinking we'd check ESR[EPID] or SRR1[IS] as appropriate, and >> treat it as a kernel fault (search exception table) -- but this works >> too and is a bit cleaner (could be other uses of external pid), at the >> expense of a couple extra instructions in the emulation path (but >> probably a slightly faster host TLB handler). >> >> The check wouldn't go in DO_KVM, though, since on bookehv that only >> deals with diverting flow when xSRR1[GS] is set, which wouldn't be the >> case here. > > Thinking about it a bit more, how is this different from a failed get_user()? We can just use the same fixup mechanism as there, right? The fixup mechanism can be the same (we'd like to know whether it failed due to TLB miss or DSI, so we know which to reflect -- but if necessary I think we can figure that out with a tlbsx). What's different is that the page fault handler needs to know that any external pid (or AS1) fault is bad, same as if the address were in the kernel area, and it should go directly to searching the exception tables instead of trying to page something in. -Scott
On 15.02.2012, at 20:40, Scott Wood wrote: > On 02/15/2012 01:36 PM, Alexander Graf wrote: >> >> On 10.01.2012, at 01:51, Scott Wood wrote: >>> I was thinking we'd check ESR[EPID] or SRR1[IS] as appropriate, and >>> treat it as a kernel fault (search exception table) -- but this works >>> too and is a bit cleaner (could be other uses of external pid), at the >>> expense of a couple extra instructions in the emulation path (but >>> probably a slightly faster host TLB handler). >>> >>> The check wouldn't go in DO_KVM, though, since on bookehv that only >>> deals with diverting flow when xSRR1[GS] is set, which wouldn't be the >>> case here. >> >> Thinking about it a bit more, how is this different from a failed get_user()? We can just use the same fixup mechanism as there, right? > > The fixup mechanism can be the same (we'd like to know whether it failed > due to TLB miss or DSI, so we know which to reflect No, we only want to know "fast path failed". The reason is a different pair of shoes and should be evaluated in the slow path. We shouldn't ever fault here during normal operation btw. We already executed a guest instruction, so there's almost no reason it can't be read. > -- but if necessary > I think we can figure that out with a tlbsx). What's different is that > the page fault handler needs to know that any external pid (or AS1) > fault is bad, same as if the address were in the kernel area, and it > should go directly to searching the exception tables instead of trying > to page something in. Yes and no. We need to force it to search the exception tables. We don't care if the page fault handler knows anything about external pids. Either way, we discussed the further stuff on IRC and came to a working solution :). Stay tuned. Alex
diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h index efa74ac..d7365b0 100644 --- a/arch/powerpc/include/asm/dbell.h +++ b/arch/powerpc/include/asm/dbell.h @@ -19,6 +19,7 @@ #define PPC_DBELL_MSG_BRDCAST (0x04000000) #define PPC_DBELL_TYPE(x) (((x) & 0xf) << (63-36)) +#define PPC_DBELL_LPID(x) ((x) << (63 - 49)) enum ppc_dbell { PPC_DBELL = 0, /* doorbell */ PPC_DBELL_CRIT = 1, /* critical doorbell */ diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 7b1f0e0..0978152 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -48,6 +48,14 @@ #define BOOKE_INTERRUPT_SPE_FP_DATA 33 #define BOOKE_INTERRUPT_SPE_FP_ROUND 34 #define BOOKE_INTERRUPT_PERFORMANCE_MONITOR 35 +#define BOOKE_INTERRUPT_DOORBELL 36 +#define BOOKE_INTERRUPT_DOORBELL_CRITICAL 37 + +/* booke_hv */ +#define BOOKE_INTERRUPT_GUEST_DBELL 38 +#define BOOKE_INTERRUPT_GUEST_DBELL_CRIT 39 +#define BOOKE_INTERRUPT_HV_SYSCALL 40 +#define BOOKE_INTERRUPT_HV_PRIV 41 /* book3s */ diff --git a/arch/powerpc/include/asm/kvm_booke_hv_asm.h b/arch/powerpc/include/asm/kvm_booke_hv_asm.h new file mode 100644 index 0000000..30a600f --- /dev/null +++ b/arch/powerpc/include/asm/kvm_booke_hv_asm.h @@ -0,0 +1,49 @@ +/* + * Copyright 2010-2011 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef ASM_KVM_BOOKE_HV_ASM_H +#define ASM_KVM_BOOKE_HV_ASM_H + +#ifdef __ASSEMBLY__ + +/* + * All exceptions from guest state must go through KVM + * (except for those which are delivered directly to the guest) -- + * there are no exceptions for which we fall through directly to + * the normal host handler. 
+ * + * Expected inputs (normal exceptions): + * SCRATCH0 = saved r10 + * r10 = thread struct + * r11 = appropriate SRR1 variant (currently used as scratch) + * r13 = saved CR + * *(r10 + THREAD_NORMSAVE(0)) = saved r11 + * *(r10 + THREAD_NORMSAVE(2)) = saved r13 + * + * Expected inputs (crit/mcheck/debug exceptions): + * appropriate SCRATCH = saved r8 + * r8 = exception level stack frame + * r9 = *(r8 + _CCR) = saved CR + * r11 = appropriate SRR1 variant (currently used as scratch) + * *(r8 + GPR9) = saved r9 + * *(r8 + GPR10) = saved r10 (r10 not yet clobbered) + * *(r8 + GPR11) = saved r11 + */ +.macro DO_KVM intno srr1 +#ifdef CONFIG_KVM_BOOKE_HV +BEGIN_FTR_SECTION + mtocrf 0x80, r11 /* check MSR[GS] without clobbering reg */ + bf 3, kvmppc_resume_\intno\()_\srr1 + b kvmppc_handler_\intno\()_\srr1 +kvmppc_resume_\intno\()_\srr1: +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) +#endif +.endm + +#endif /*__ASSEMBLY__ */ +#endif /* ASM_KVM_BOOKE_HV_ASM_H */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index ad4d671..d603513 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -107,6 +107,8 @@ struct kvm_vcpu_stat { u32 dec_exits; u32 ext_intr_exits; u32 halt_wakeup; + u32 dbell_exits; + u32 gdbell_exits; #ifdef CONFIG_PPC_BOOK3S u32 pf_storage; u32 pf_instruc; @@ -141,6 +143,7 @@ enum kvm_exit_types { EMULATED_TLBSX_EXITS, EMULATED_TLBWE_EXITS, EMULATED_RFI_EXITS, + EMULATED_RFCI_EXITS, DEC_EXITS, EXT_INTR_EXITS, HALT_WAKEUP, @@ -148,6 +151,8 @@ enum kvm_exit_types { FP_UNAVAIL, DEBUG_EXITS, TIMEINGUEST, + DBELL_EXITS, + GDBELL_EXITS, __NUMBER_OF_KVM_EXIT_TYPES }; @@ -213,10 +218,10 @@ struct revmap_entry { #define KVMPPC_GOT_PAGE 0x80 struct kvm_arch { + unsigned int lpid; #ifdef CONFIG_KVM_BOOK3S_64_HV unsigned long hpt_virt; struct revmap_entry *revmap; - unsigned int lpid; unsigned int host_lpid; unsigned long host_lpcr; unsigned long sdr1; @@ -346,6 +351,17 @@ struct kvm_vcpu_arch { u32 
qpr[32]; #endif +#ifdef CONFIG_KVM_BOOKE_HV + u32 host_mas4; + u32 host_mas6; + u32 shadow_epcr; + u32 epcr; + u32 shadow_msrp; + u32 eplc; + u32 epsc; + u32 oldpir; +#endif + #ifdef CONFIG_PPC_BOOK3S ulong hflags; ulong guest_owned_ext; @@ -417,6 +433,7 @@ struct kvm_vcpu_arch { ulong queued_esr; u32 tlbcfg[4]; u32 mmucfg; + u32 epr; #endif gpa_t paddr_accessed; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 5524f88..247b920 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -137,6 +137,9 @@ extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, extern void kvmppc_core_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem); +extern int kvmppc_bookehv_init(void); +extern void kvmppc_bookehv_exit(void); + /* * Cuts out inst bits with ordering according to spec. * That means the leftmost bit is zero. All given bits are included. diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index 36a6eaa..b8e303c 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -104,6 +104,8 @@ #define MAS4_TSIZED_MASK 0x00000f80 /* Default TSIZE */ #define MAS4_TSIZED_SHIFT 7 +#define MAS5_SGS 0x80000000 + #define MAS6_SPID0 0x3FFF0000 #define MAS6_SPID1 0x00007FFE #define MAS6_ISIZE(x) MAS1_TSIZE(x) @@ -118,6 +120,10 @@ #define MAS7_RPN 0xFFFFFFFF +#define MAS8_TGS 0x80000000 /* Guest space */ +#define MAS8_VF 0x40000000 /* Virtualization Fault */ +#define MAS8_TLPID 0x000000ff + /* Bit definitions for MMUCFG */ #define MMUCFG_MAVN 0x00000003 /* MMU Architecture Version Number */ #define MMUCFG_MAVN_V1 0x00000000 /* v1.0 */ diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index eb11a44..032a984 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -243,6 +243,9 @@ struct thread_struct { #ifdef 
CONFIG_KVM_BOOK3S_32_HANDLER void* kvm_shadow_vcpu; /* KVM internal data */ #endif /* CONFIG_KVM_BOOK3S_32_HANDLER */ +#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE) + struct kvm_vcpu *kvm_vcpu; +#endif #ifdef CONFIG_PPC64 unsigned long dscr; int dscr_inherit; diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 209dc74..5993770 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -257,7 +257,9 @@ #define LPCR_LPES_SH 2 #define LPCR_RMI 0x00000002 /* real mode is cache inhibit */ #define LPCR_HDICE 0x00000001 /* Hyp Decr enable (HV,PR,EE) */ +#ifndef SPRN_LPID #define SPRN_LPID 0x13F /* Logical Partition Identifier */ +#endif #define LPID_RSVD 0x3ff /* Reserved LPID for partn switching */ #define SPRN_HMER 0x150 /* Hardware m? error recovery */ #define SPRN_HMEER 0x151 /* Hardware m? enable error recovery */ diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 03c48e8..bd80b8d 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -56,17 +56,29 @@ #define SPRN_SPRG7W 0x117 /* Special Purpose Register General 7 Write */ #define SPRN_EPCR 0x133 /* Embedded Processor Control Register */ #define SPRN_DBCR2 0x136 /* Debug Control Register 2 */ +#define SPRN_MSRP 0x137 /* MSR Protect Register */ #define SPRN_IAC3 0x13A /* Instruction Address Compare 3 */ #define SPRN_IAC4 0x13B /* Instruction Address Compare 4 */ #define SPRN_DVC1 0x13E /* Data Value Compare Register 1 */ #define SPRN_DVC2 0x13F /* Data Value Compare Register 2 */ +#define SPRN_LPID 0x152 /* Logical Partition ID */ #define SPRN_MAS8 0x155 /* MMU Assist Register 8 */ #define SPRN_TLB0PS 0x158 /* TLB 0 Page Size Register */ #define SPRN_MAS5_MAS6 0x15c /* MMU Assist Register 5 || 6 */ #define SPRN_MAS8_MAS1 0x15d /* MMU Assist Register 8 || 1 */ #define SPRN_EPTCFG 0x15e /* Embedded Page Table Config */ +#define SPRN_GSPRG0 0x170 /* Guest SPRG0 */ +#define 
SPRN_GSPRG1 0x171 /* Guest SPRG1 */ +#define SPRN_GSPRG2 0x172 /* Guest SPRG2 */ +#define SPRN_GSPRG3 0x173 /* Guest SPRG3 */ #define SPRN_MAS7_MAS3 0x174 /* MMU Assist Register 7 || 3 */ #define SPRN_MAS0_MAS1 0x175 /* MMU Assist Register 0 || 1 */ +#define SPRN_GSRR0 0x17A /* Guest SRR0 */ +#define SPRN_GSRR1 0x17B /* Guest SRR1 */ +#define SPRN_GEPR 0x17C /* Guest EPR */ +#define SPRN_GDEAR 0x17D /* Guest DEAR */ +#define SPRN_GPIR 0x17E /* Guest PIR */ +#define SPRN_GESR 0x17F /* Guest Exception Syndrome Register */ #define SPRN_IVOR0 0x190 /* Interrupt Vector Offset Register 0 */ #define SPRN_IVOR1 0x191 /* Interrupt Vector Offset Register 1 */ #define SPRN_IVOR2 0x192 /* Interrupt Vector Offset Register 2 */ @@ -87,6 +99,13 @@ #define SPRN_IVOR39 0x1B1 /* Interrupt Vector Offset Register 39 */ #define SPRN_IVOR40 0x1B2 /* Interrupt Vector Offset Register 40 */ #define SPRN_IVOR41 0x1B3 /* Interrupt Vector Offset Register 41 */ +#define SPRN_GIVOR2 0x1B8 /* Guest IVOR2 */ +#define SPRN_GIVOR3 0x1B9 /* Guest IVOR3 */ +#define SPRN_GIVOR4 0x1BA /* Guest IVOR4 */ +#define SPRN_GIVOR8 0x1BB /* Guest IVOR8 */ +#define SPRN_GIVOR13 0x1BC /* Guest IVOR13 */ +#define SPRN_GIVOR14 0x1BD /* Guest IVOR14 */ +#define SPRN_GIVPR 0x1BF /* Guest IVPR */ #define SPRN_SPEFSCR 0x200 /* SPE & Embedded FP Status & Control */ #define SPRN_BBEAR 0x201 /* Branch Buffer Entry Address Register */ #define SPRN_BBTAR 0x202 /* Branch Buffer Target Address Register */ @@ -235,6 +254,10 @@ #define MCSR_LDG 0x00002000UL /* Guarded Load */ #define MCSR_TLBSYNC 0x00000002UL /* Multiple tlbsyncs detected */ #define MCSR_BSL2_ERR 0x00000001UL /* Backside L2 cache error */ + +#define MSRP_UCLEP 0x04000000 /* Protect MSR[UCLE] */ +#define MSRP_DEP 0x00000200 /* Protect MSR[DE] */ +#define MSRP_PMMP 0x00000004 /* Protect MSR[PMM] */ #endif #ifdef CONFIG_E200 @@ -589,6 +612,17 @@ #define SPRN_EPCR_DMIUH 0x00400000 /* Disable MAS Interrupt updates * for hypervisor */ +/* Bit definitions for 
EPLC/EPSC */ +#define EPC_EPR 0x80000000 /* 1 = user, 0 = kernel */ +#define EPC_EPR_SHIFT 31 +#define EPC_EAS 0x40000000 /* Address Space */ +#define EPC_EAS_SHIFT 30 +#define EPC_EGS 0x20000000 /* 1 = guest, 0 = hypervisor */ +#define EPC_EGS_SHIFT 29 +#define EPC_ELPID 0x00ff0000 +#define EPC_ELPID_SHIFT 16 +#define EPC_EPID 0x00003fff +#define EPC_EPID_SHIFT 0 /* * The IBM-403 is an even more odd special case, as it is much diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index c80bdd1..e179f09 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -119,6 +119,9 @@ int main(void) #ifdef CONFIG_KVM_BOOK3S_32_HANDLER DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu)); #endif +#ifdef CONFIG_KVM_BOOKE_HV + DEFINE(THREAD_KVM_VCPU, offsetof(struct thread_struct, kvm_vcpu)); +#endif DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags)); @@ -400,6 +403,7 @@ int main(void) #ifdef CONFIG_KVM DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack)); DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); + DEFINE(VCPU_GUEST_PID, offsetof(struct kvm_vcpu, arch.pid)); DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.regs.gpr)); DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave)); DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr)); @@ -442,9 +446,11 @@ int main(void) DEFINE(VCPU_SHARED_MAS4, offsetof(struct kvm_vcpu_arch_shared, mas4)); DEFINE(VCPU_SHARED_MAS6, offsetof(struct kvm_vcpu_arch_shared, mas6)); + DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm)); + DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid)); + /* book3s */ #ifdef CONFIG_KVM_BOOK3S_64_HV - DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid)); DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1)); DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid)); DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); @@ -459,7 
+465,6 @@ int main(void) DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); #endif #ifdef CONFIG_PPC_BOOK3S - DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm)); DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id)); DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr)); DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr)); @@ -605,6 +610,12 @@ int main(void) DEFINE(VCPU_HOST_SPEFSCR, offsetof(struct kvm_vcpu, arch.host_spefscr)); #endif +#ifdef CONFIG_KVM_BOOKE_HV + DEFINE(VCPU_HOST_MAS4, offsetof(struct kvm_vcpu, arch.host_mas4)); + DEFINE(VCPU_HOST_MAS6, offsetof(struct kvm_vcpu, arch.host_mas6)); + DEFINE(VCPU_EPLC, offsetof(struct kvm_vcpu, arch.eplc)); +#endif + #ifdef CONFIG_KVM_EXIT_TIMING DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu, arch.timing_exit.tv32.tbu)); diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 06ab353..b87c335 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -3,6 +3,7 @@ #include <asm/ptrace.h> /* for STACK_FRAME_REGS_MARKER */ #include <asm/kvm_asm.h> +#include <asm/kvm_booke_hv_asm.h> /* * Macros used for common Book-e exception handling @@ -36,8 +37,9 @@ stw r11, THREAD_NORMSAVE(0)(r10); \ stw r13, THREAD_NORMSAVE(2)(r10); \ mfcr r13; /* save CR in r13 for now */\ - mfspr r11,SPRN_SRR1; /* check whether user or kernel */\ - andi. r11,r11,MSR_PR; \ + mfspr r11, SPRN_SRR1; \ + DO_KVM BOOKE_INTERRUPT_##intno SPRN_SRR1; \ + andi. r11, r11, MSR_PR; /* check whether user or kernel */\ mr r11, r1; \ beq 1f; \ /* if from user, start at top of this thread's kernel stack */ \ @@ -123,8 +125,9 @@ stw r10,GPR10(r8); \ stw r11,GPR11(r8); \ stw r9,_CCR(r8); /* save CR on stack */\ - mfspr r10,exc_level_srr1; /* check whether user or kernel */\ - andi. r10,r10,MSR_PR; \ + mfspr r11,exc_level_srr1; /* check whether user or kernel */\ + DO_KVM BOOKE_INTERRUPT_##intno exc_level_srr1; \ + andi. 
r11,r11,MSR_PR; \ mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ lwz r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\ addi r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame */\ @@ -173,6 +176,23 @@ SPRN_MCSRR0, SPRN_MCSRR1) /* + * Guest Doorbell -- this is a bit odd in that uses GSRR0/1 despite + * being delivered to the host. This exception can only happen + * inside a KVM guest -- so we just handle up to the DO_KVM rather + * than try to fit this into one of the existing prolog macros. + */ +#define GUEST_DOORBELL_EXCEPTION \ + START_EXCEPTION(GuestDoorbell); \ + mtspr SPRN_SPRG_WSCRATCH0, r10; /* save one register */ \ + mfspr r10, SPRN_SPRG_THREAD; \ + stw r11, THREAD_NORMSAVE(0)(r10); \ + mfspr r11, SPRN_SRR1; \ + stw r13, THREAD_NORMSAVE(2)(r10); \ + mfcr r13; /* save CR in r13 for now */\ + DO_KVM BOOKE_INTERRUPT_GUEST_DBELL SPRN_GSRR1; \ + trap + +/* * Exception vectors. */ #define START_EXCEPTION(label) \ diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 8f64709..2c33cd3 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -90,6 +90,9 @@ config KVM_BOOK3S_64_PR depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV select KVM_BOOK3S_PR +config KVM_BOOKE_HV + bool + config KVM_440 bool "KVM support for PowerPC 440 processors" depends on EXPERIMENTAL && 44x diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index f66e741..cf63b93 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -17,6 +17,8 @@ * * Authors: Hollis Blanchard <hollisb@us.ibm.com> * Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com> + * Scott Wood <scottwood@freescale.com> + * Varun Sethi <varun.sethi@freescale.com> */ #include <linux/errno.h> @@ -30,9 +32,12 @@ #include <asm/cputable.h> #include <asm/uaccess.h> #include <asm/kvm_ppc.h> -#include "timing.h" #include <asm/cacheflush.h> +#include <asm/dbell.h> +#include <asm/hw_irq.h> +#include <asm/irq.h> +#include "timing.h" #include "booke.h" 
unsigned long kvmppc_booke_handlers; @@ -55,6 +60,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "dec", VCPU_STAT(dec_exits) }, { "ext_intr", VCPU_STAT(ext_intr_exits) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, + { "doorbell", VCPU_STAT(dbell_exits) }, + { "guest doorbell", VCPU_STAT(gdbell_exits) }, { NULL } }; @@ -123,6 +130,10 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) { u32 old_msr = vcpu->arch.shared->msr; +#ifdef CONFIG_KVM_BOOKE_HV + new_msr |= MSR_GS; +#endif + vcpu->arch.shared->msr = new_msr; kvmppc_mmu_msr_notify(vcpu, old_msr); @@ -197,6 +208,75 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); } +static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GSRR0, srr0); + mtspr(SPRN_GSRR1, srr1); +#else + vcpu->arch.shared->srr0 = srr0; + vcpu->arch.shared->srr1 = srr1; +#endif +} + +static void set_guest_csrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ + vcpu->arch.csrr0 = srr0; + vcpu->arch.csrr1 = srr1; +} + +static void set_guest_dsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ + if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC)) { + vcpu->arch.dsrr0 = srr0; + vcpu->arch.dsrr1 = srr1; + } else { + set_guest_csrr(vcpu, srr0, srr1); + } +} + +static void set_guest_mcsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ + vcpu->arch.mcsrr0 = srr0; + vcpu->arch.mcsrr1 = srr1; +} + +static unsigned long get_guest_dear(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_KVM_BOOKE_HV + return mfspr(SPRN_GDEAR); +#else + return vcpu->arch.shared->dar; +#endif +} + +static void set_guest_dear(struct kvm_vcpu *vcpu, unsigned long dear) +{ +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GDEAR, dear); +#else + vcpu->arch.shared->dar = dear; +#endif +} + +static unsigned long get_guest_esr(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_KVM_BOOKE_HV + return mfspr(SPRN_ESR); +#else + return 
vcpu->arch.shared->esr; +#endif +} + +static void set_guest_esr(struct kvm_vcpu *vcpu, u32 esr) +{ +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GESR, esr); +#else + vcpu->arch.shared->esr = esr; +#endif +} + /* Deliver the interrupt of the corresponding priority, if possible. */ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) @@ -208,6 +288,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); bool crit; bool keep_irq = false; + enum int_class int_class; /* Truncate crit indicators in 32 bit mode */ if (!(vcpu->arch.shared->msr & MSR_SF)) { @@ -243,16 +324,20 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, case BOOKE_IRQPRIO_AP_UNAVAIL: case BOOKE_IRQPRIO_ALIGNMENT: allowed = 1; - msr_mask = MSR_CE|MSR_ME|MSR_DE; + msr_mask = MSR_GS | MSR_CE | MSR_ME | MSR_DE; + int_class = INT_CLASS_NONCRIT; break; case BOOKE_IRQPRIO_CRITICAL: - case BOOKE_IRQPRIO_WATCHDOG: allowed = vcpu->arch.shared->msr & MSR_CE; - msr_mask = MSR_ME; + allowed = allowed && !crit; + msr_mask = MSR_GS | MSR_ME; + int_class = INT_CLASS_CRIT; break; case BOOKE_IRQPRIO_MACHINE_CHECK: allowed = vcpu->arch.shared->msr & MSR_ME; - msr_mask = 0; + allowed = allowed && !crit; + msr_mask = MSR_GS; + int_class = INT_CLASS_MC; break; case BOOKE_IRQPRIO_DECREMENTER: case BOOKE_IRQPRIO_FIT: @@ -261,29 +346,63 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, case BOOKE_IRQPRIO_EXTERNAL: allowed = vcpu->arch.shared->msr & MSR_EE; allowed = allowed && !crit; - msr_mask = MSR_CE|MSR_ME|MSR_DE; + msr_mask = MSR_GS | MSR_CE | MSR_ME | MSR_DE; + int_class = INT_CLASS_NONCRIT; break; case BOOKE_IRQPRIO_DEBUG: allowed = vcpu->arch.shared->msr & MSR_DE; - msr_mask = MSR_ME; + allowed = allowed && !crit; + msr_mask = MSR_GS | MSR_ME; + int_class = INT_CLASS_CRIT; break; } if (allowed) { - vcpu->arch.shared->srr0 = vcpu->arch.regs.nip; - vcpu->arch.shared->srr1 = vcpu->arch.shared->msr; + switch 
(int_class) { + case INT_CLASS_NONCRIT: + set_guest_srr(vcpu, vcpu->arch.regs.nip, + vcpu->arch.shared->msr); + break; + case INT_CLASS_CRIT: + set_guest_csrr(vcpu, vcpu->arch.regs.nip, + vcpu->arch.shared->msr); + break; + case INT_CLASS_DBG: + set_guest_dsrr(vcpu, vcpu->arch.regs.nip, + vcpu->arch.shared->msr); + break; + case INT_CLASS_MC: + set_guest_mcsrr(vcpu, vcpu->arch.regs.nip, + vcpu->arch.shared->msr); + break; + } + vcpu->arch.regs.nip = vcpu->arch.ivpr | vcpu->arch.ivor[priority]; if (update_esr == true) - vcpu->arch.shared->esr = vcpu->arch.queued_esr; + set_guest_esr(vcpu, vcpu->arch.queued_esr); if (update_dear == true) - vcpu->arch.shared->dar = vcpu->arch.queued_dear; + set_guest_dear(vcpu, vcpu->arch.queued_dear); kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask); if (!keep_irq) clear_bit(priority, &vcpu->arch.pending_exceptions); } +#ifdef CONFIG_KVM_BOOKE_HV + /* + * If an interrupt is pending but masked, raise a guest doorbell + * so that we are notified when the guest enables the relevant + * MSR bit. 
+ */ + if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_EE) + kvmppc_set_pending_interrupt(vcpu, INT_CLASS_NONCRIT); + if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_CE) + kvmppc_set_pending_interrupt(vcpu, INT_CLASS_CRIT); + if (vcpu->arch.pending_exceptions & BOOKE_IRQPRIO_MACHINE_CHECK) + kvmppc_set_pending_interrupt(vcpu, INT_CLASS_MC); +#endif + return allowed; } @@ -347,6 +466,11 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return -EINVAL; } + if (!current->thread.kvm_vcpu) { + WARN(1, "no vcpu\n"); + return -EPERM; + } + local_irq_disable(); kvmppc_core_prepare_to_enter(vcpu); @@ -366,6 +490,38 @@ out: return ret; } +static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + enum emulation_result er; + + er = kvmppc_emulate_instruction(run, vcpu); + switch (er) { + case EMULATE_DONE: + /* don't overwrite subtypes, just account kvm_stats */ + kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS); + /* Future optimization: only reload non-volatiles if + * they were actually modified by emulation. */ + return RESUME_GUEST_NV; + + case EMULATE_DO_DCR: + run->exit_reason = KVM_EXIT_DCR; + return RESUME_HOST; + + case EMULATE_FAIL: + /* XXX Deliver Program interrupt to guest. */ + printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", + __func__, vcpu->arch.regs.nip, vcpu->arch.last_inst); + /* For debugging, encode the failing instruction and + * report it to userspace. */ + run->hw.hardware_exit_reason = ~0ULL << 32; + run->hw.hardware_exit_reason |= vcpu->arch.last_inst; + return RESUME_HOST; + + default: + BUG(); + } +} + /** * kvmppc_handle_exit * @@ -374,12 +530,39 @@ out: int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int exit_nr) { - enum emulation_result er; int r = RESUME_HOST; /* update before a new last_exit_type is rewritten */ kvmppc_update_timing_stats(vcpu); + /* + * If we actually care, we could copy MSR, DEAR, and ESR to regs, + * insert an appropriate trap number, etc. 
+ * + * Seems like a waste of cycles for something that should only matter + * to someone using sysrq-t/p or similar host kernel debug facility. + * We have other debug facilities to get that information from a + * guest through userspace. + */ + switch (exit_nr) { + case BOOKE_INTERRUPT_EXTERNAL: + do_IRQ(&vcpu->arch.regs); + break; + + case BOOKE_INTERRUPT_DECREMENTER: + timer_interrupt(&vcpu->arch.regs); + break; + +#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64) + case BOOKE_INTERRUPT_DOORBELL: + doorbell_exception(&vcpu->arch.regs); + break; +#endif + case BOOKE_INTERRUPT_MACHINE_CHECK: + /* FIXME */ + break; + } + local_irq_enable(); run->exit_reason = KVM_EXIT_UNKNOWN; @@ -387,30 +570,56 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, switch (exit_nr) { case BOOKE_INTERRUPT_MACHINE_CHECK: - printk("MACHINE CHECK: %lx\n", mfspr(SPRN_MCSR)); - kvmppc_dump_vcpu(vcpu); - r = RESUME_HOST; + kvm_resched(vcpu); + r = RESUME_GUEST; break; case BOOKE_INTERRUPT_EXTERNAL: kvmppc_account_exit(vcpu, EXT_INTR_EXITS); - if (need_resched()) - cond_resched(); + kvm_resched(vcpu); r = RESUME_GUEST; break; case BOOKE_INTERRUPT_DECREMENTER: - /* Since we switched IVPR back to the host's value, the host - * handled this interrupt the moment we enabled interrupts. - * Now we just offer it a chance to reschedule the guest. */ kvmppc_account_exit(vcpu, DEC_EXITS); - if (need_resched()) - cond_resched(); + kvm_resched(vcpu); + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_DOORBELL: + kvmppc_account_exit(vcpu, DBELL_EXITS); + kvm_resched(vcpu); + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_GUEST_DBELL_CRIT: + kvmppc_account_exit(vcpu, GDBELL_EXITS); + + /* + * We are here because there is a pending guest interrupt + * which could not be delivered as MSR_CE or MSR_ME was not + * set. Once we break from here we will retry delivery. 
+ */ r = RESUME_GUEST; break; + case BOOKE_INTERRUPT_GUEST_DBELL: + kvmppc_account_exit(vcpu, GDBELL_EXITS); + + /* + * We are here because there is a pending guest interrupt + * which could not be delivered as MSR_EE was not set. Once + * we break from here we will retry delivery. + */ + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_HV_PRIV: + r = emulation_exit(run, vcpu); + break; + case BOOKE_INTERRUPT_PROGRAM: - if (vcpu->arch.shared->msr & MSR_PR) { + if (vcpu->arch.shared->msr & (MSR_PR | MSR_GS)) { /* Program traps generated by user-level software must be handled * by the guest kernel. */ kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr); @@ -419,33 +628,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } - er = kvmppc_emulate_instruction(run, vcpu); - switch (er) { - case EMULATE_DONE: - /* don't overwrite subtypes, just account kvm_stats */ - kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS); - /* Future optimization: only reload non-volatiles if - * they were actually modified by emulation. */ - r = RESUME_GUEST_NV; - break; - case EMULATE_DO_DCR: - run->exit_reason = KVM_EXIT_DCR; - r = RESUME_HOST; - break; - case EMULATE_FAIL: - /* XXX Deliver Program interrupt to guest. */ - printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", - __func__, vcpu->arch.regs.nip, - vcpu->arch.last_inst); - /* For debugging, encode the failing instruction and - * report it to userspace. 
*/ - run->hw.hardware_exit_reason = ~0ULL << 32; - run->hw.hardware_exit_reason |= vcpu->arch.last_inst; - r = RESUME_HOST; - break; - default: - BUG(); - } + r = emulation_exit(run, vcpu); break; case BOOKE_INTERRUPT_FP_UNAVAIL: @@ -510,6 +693,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; +#ifdef CONFIG_KVM_BOOKE_HV + case BOOKE_INTERRUPT_HV_SYSCALL: + if (!(vcpu->arch.shared->msr & MSR_PR)) { + kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); + } else { + /* + * hcall from guest userspace -- send privileged + * instruction program check. + */ + kvmppc_core_queue_program(vcpu, ESR_PPR); + } + + r = RESUME_GUEST; + break; +#else case BOOKE_INTERRUPT_SYSCALL: if (!(vcpu->arch.shared->msr & MSR_PR) && (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { @@ -523,6 +721,47 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_account_exit(vcpu, SYSCALL_EXITS); r = RESUME_GUEST; break; +#endif + + case BOOKE_INTERRUPT_ITLB_MISS: { + unsigned long eaddr = vcpu->arch.regs.nip; + gpa_t gpaddr; + gfn_t gfn; + int gtlb_index; + + r = RESUME_GUEST; + + /* Check the guest TLB. */ + gtlb_index = kvmppc_mmu_itlb_index(vcpu, eaddr); + if (gtlb_index < 0) { + /* The guest didn't have a mapping for it. */ + kvmppc_booke_queue_irqprio(vcpu, + BOOKE_IRQPRIO_ITLB_MISS); + kvmppc_mmu_itlb_miss(vcpu); + kvmppc_account_exit(vcpu, ITLB_REAL_MISS_EXITS); + break; + } + + kvmppc_account_exit(vcpu, ITLB_VIRT_MISS_EXITS); + + gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr); + gfn = gpaddr >> PAGE_SHIFT; + + if (kvm_is_visible_gfn(vcpu->kvm, gfn)) { + /* The guest TLB had a mapping, but the shadow TLB + * didn't. This could be because: + * a) the entry is mapping the host kernel, or + * b) the guest used a large mapping which we're faking + * Either way, we need to satisfy the fault without + * invoking the guest. */ + kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index); + } else { + /* Guest mapped and leaped at non-RAM! 
*/ + kvmppc_booke_queue_irqprio(vcpu, + BOOKE_IRQPRIO_MACHINE_CHECK); + } + break; + } case BOOKE_INTERRUPT_DTLB_MISS: { unsigned long eaddr = vcpu->arch.fault_dear; @@ -578,45 +817,6 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } - case BOOKE_INTERRUPT_ITLB_MISS: { - unsigned long eaddr = vcpu->arch.regs.nip; - gpa_t gpaddr; - gfn_t gfn; - int gtlb_index; - - r = RESUME_GUEST; - - /* Check the guest TLB. */ - gtlb_index = kvmppc_mmu_itlb_index(vcpu, eaddr); - if (gtlb_index < 0) { - /* The guest didn't have a mapping for it. */ - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS); - kvmppc_mmu_itlb_miss(vcpu); - kvmppc_account_exit(vcpu, ITLB_REAL_MISS_EXITS); - break; - } - - kvmppc_account_exit(vcpu, ITLB_VIRT_MISS_EXITS); - - gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr); - gfn = gpaddr >> PAGE_SHIFT; - - if (kvm_is_visible_gfn(vcpu->kvm, gfn)) { - /* The guest TLB had a mapping, but the shadow TLB - * didn't. This could be because: - * a) the entry is mapping the host kernel, or - * b) the guest used a large mapping which we're faking - * Either way, we need to satisfy the fault without - * invoking the guest. */ - kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index); - } else { - /* Guest mapped and leaped at non-RAM! 
*/ - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_MACHINE_CHECK); - } - - break; - } - case BOOKE_INTERRUPT_DEBUG: { u32 dbsr; @@ -663,12 +863,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) int r; vcpu->arch.regs.nip = 0; - vcpu->arch.shared->msr = 0; - vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; vcpu->arch.shared->pir = vcpu->vcpu_id; kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ + kvmppc_set_msr(vcpu, 0); +#ifndef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; vcpu->arch.shadow_pid = 1; + vcpu->arch.shared->msr = 0; +#endif /* Eye-catching numbers so we know if the guest takes an interrupt * before it's programmed its own IVPR/IVORs. */ @@ -749,8 +952,8 @@ static void get_sregs_base(struct kvm_vcpu *vcpu, sregs->u.e.csrr0 = vcpu->arch.csrr0; sregs->u.e.csrr1 = vcpu->arch.csrr1; sregs->u.e.mcsr = vcpu->arch.mcsr; - sregs->u.e.esr = vcpu->arch.shared->esr; - sregs->u.e.dear = vcpu->arch.shared->dar; + sregs->u.e.esr = get_guest_esr(vcpu); + sregs->u.e.dear = get_guest_dear(vcpu); sregs->u.e.tsr = vcpu->arch.tsr; sregs->u.e.tcr = vcpu->arch.tcr; sregs->u.e.dec = kvmppc_get_dec(vcpu, tb); @@ -767,8 +970,8 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, vcpu->arch.csrr0 = sregs->u.e.csrr0; vcpu->arch.csrr1 = sregs->u.e.csrr1; vcpu->arch.mcsr = sregs->u.e.mcsr; - vcpu->arch.shared->esr = sregs->u.e.esr; - vcpu->arch.shared->dar = sregs->u.e.dear; + set_guest_esr(vcpu, sregs->u.e.esr); + set_guest_dear(vcpu, sregs->u.e.dear); vcpu->arch.vrsave = sregs->u.e.vrsave; kvmppc_set_tcr(vcpu, sregs->u.e.tcr); @@ -965,14 +1168,17 @@ void kvmppc_decrementer_func(unsigned long data) void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + current->thread.kvm_vcpu = vcpu; } void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu) { + current->thread.kvm_vcpu = NULL; } int __init kvmppc_booke_init(void) { +#ifndef CONFIG_KVM_BOOKE_HV unsigned long ivor[16]; unsigned long max_ivor = 0; int 
i; @@ -1015,7 +1221,7 @@ int __init kvmppc_booke_init(void) } flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers + max_ivor + kvmppc_handler_len); - +#endif /* !BOOKE_HV */ return 0; } diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index 05d1d99..d53bcf2 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -48,7 +48,20 @@ #define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19 /* Internal pseudo-irqprio for level triggered externals */ #define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20 -#define BOOKE_IRQPRIO_MAX 20 +#define BOOKE_IRQPRIO_DBELL 21 +#define BOOKE_IRQPRIO_DBELL_CRIT 22 +#define BOOKE_IRQPRIO_MAX 23 + +#define BOOKE_IRQMASK_EE ((1 << BOOKE_IRQPRIO_EXTERNAL_LEVEL) | \ + (1 << BOOKE_IRQPRIO_PERFORMANCE_MONITOR) | \ + (1 << BOOKE_IRQPRIO_DBELL) | \ + (1 << BOOKE_IRQPRIO_DECREMENTER) | \ + (1 << BOOKE_IRQPRIO_FIT) | \ + (1 << BOOKE_IRQPRIO_EXTERNAL)) + +#define BOOKE_IRQMASK_CE ((1 << BOOKE_IRQPRIO_DBELL_CRIT) | \ + (1 << BOOKE_IRQPRIO_WATCHDOG) | \ + (1 << BOOKE_IRQPRIO_CRITICAL)) extern unsigned long kvmppc_booke_handlers; @@ -74,4 +87,13 @@ void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu); void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu); +enum int_class { + INT_CLASS_NONCRIT, + INT_CLASS_CRIT, + INT_CLASS_MC, + INT_CLASS_DBG, +}; + +void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type); + #endif /* __KVM_BOOKE_H__ */ diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index a4af03b..3eb7fc6 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -99,6 +99,12 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, return emulated; } +/* + * NOTE: some of these registers are not emulated on BOOKE_HV (GS-mode). 
+ * Their backing store is in real registers, and these functions + * will return the wrong result if called for them in another context + * (such as debugging). + */ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) { int emulated = EMULATE_DONE; @@ -122,9 +128,11 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) kvmppc_set_tcr(vcpu, spr_val); break; - /* Note: SPRG4-7 are user-readable. These values are - * loaded into the real SPRGs when resuming the - * guest. */ + /* + * Note: SPRG4-7 are user-readable. + * These values are loaded into the real SPRGs when resuming the + * guest (PR-mode only). + */ case SPRN_SPRG4: vcpu->arch.shared->sprg4 = spr_val; break; case SPRN_SPRG5: @@ -136,6 +144,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_IVPR: vcpu->arch.ivpr = spr_val; +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GIVPR, spr_val); +#endif break; case SPRN_IVOR0: vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = spr_val; @@ -145,6 +156,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) break; case SPRN_IVOR2: vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = spr_val; +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GIVOR2, spr_val); +#endif break; case SPRN_IVOR3: vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = spr_val; @@ -163,6 +177,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) break; case SPRN_IVOR8: vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = spr_val; +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GIVOR8, spr_val); +#endif break; case SPRN_IVOR9: vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = spr_val; diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S new file mode 100644 index 0000000..9eaeebd --- /dev/null +++ b/arch/powerpc/kvm/bookehv_interrupts.S @@ -0,0 +1,587 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + 
* published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright (C) 2010-2011 Freescale Semiconductor, Inc. + * + * Author: Varun Sethi <varun.sethi@freescale.com> + * Author: Scott Wood <scottwood@freescale.com> + * + * This file is derived from arch/powerpc/kvm/booke_interrupts.S + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/mmu-44x.h> +#include <asm/page.h> +#include <asm/asm-compat.h> +#include <asm/asm-offsets.h> +#include <asm/bitsperlong.h> + +#include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */ + +#define GET_VCPU(vcpu, thread) \ + PPC_LL vcpu, THREAD_KVM_VCPU(thread) + +#define SET_VCPU(vcpu) \ + PPC_STL vcpu, (THREAD + THREAD_KVM_VCPU)(r2) + +#define LONGBYTES (BITS_PER_LONG / 8) + +#define VCPU_GPR(n) (VCPU_GPRS + (n * LONGBYTES)) +#define VCPU_GUEST_SPRG(n) (VCPU_GUEST_SPRGS + (n * LONGBYTES)) + +/* The host stack layout: */ +#define HOST_R1 (0 * LONGBYTES) /* Implied by stwu. */ +#define HOST_CALLEE_LR (1 * LONGBYTES) +#define HOST_RUN (2 * LONGBYTES) /* struct kvm_run */ +/* + * r2 is special: it holds 'current', and it is made nonvolatile in the + * kernel with the -ffixed-r2 gcc option. + */ +#define HOST_R2 (3 * LONGBYTES) +#define HOST_NV_GPRS (4 * LONGBYTES) +#define HOST_NV_GPR(n) (HOST_NV_GPRS + ((n - 14) * LONGBYTES)) +#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(31) + LONGBYTES) +#define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */ +#define HOST_STACK_LR (HOST_STACK_SIZE + LONGBYTES) /* In caller stack frame.
*/ + +#define NEED_EMU 0x00000001 /* emulation -- save nv regs */ +#define NEED_DEAR 0x00000002 /* save faulting DEAR */ +#define NEED_ESR 0x00000004 /* save faulting ESR */ + +/* + * On entry: + * r4 = vcpu, r5 = srr0, r6 = srr1 + * saved in vcpu: cr, ctr, r3-r13 + */ +.macro kvm_handler_common intno, srr0, flags + mfspr r10, SPRN_PID + lwz r8, VCPU_HOST_PID(r4) + PPC_LL r11, VCPU_SHARED(r4) + PPC_STL r14, VCPU_GPR(r14)(r4) /* We need a non-volatile GPR. */ + li r14, \intno + + stw r10, VCPU_GUEST_PID(r4) + mtspr SPRN_PID, r8 + + .if \flags & NEED_EMU + lwz r9, VCPU_KVM(r4) + .endif + +#ifdef CONFIG_KVM_EXIT_TIMING + /* save exit time */ +1: mfspr r7, SPRN_TBRU + mfspr r8, SPRN_TBRL + mfspr r9, SPRN_TBRU + cmpw r9, r7 + PPC_STL r8, VCPU_TIMING_EXIT_TBL(r4) + bne- 1b + PPC_STL r9, VCPU_TIMING_EXIT_TBU(r4) +#endif + + oris r8, r6, MSR_CE@h +#ifndef CONFIG_64BIT + stw r6, (VCPU_SHARED_MSR + 4)(r11) +#else + std r6, (VCPU_SHARED_MSR)(r11) +#endif + ori r8, r8, MSR_ME | MSR_RI + PPC_STL r5, VCPU_PC(r4) + + /* + * Make sure CE/ME/RI are set (if appropriate for exception type) + * whether or not the guest had it set. Since mfmsr/mtmsr are + * somewhat expensive, skip in the common case where the guest + * had all these bits set (and thus they're still set if + * appropriate for the exception type). + */ + cmpw r6, r8 + .if \flags & NEED_EMU + lwz r9, KVM_LPID(r9) + .endif + beq 1f + mfmsr r7 + .if \srr0 != SPRN_MCSRR0 && \srr0 != SPRN_CSRR0 + oris r7, r7, MSR_CE@h + .endif + .if \srr0 != SPRN_MCSRR0 + ori r7, r7, MSR_ME | MSR_RI + .endif + mtmsr r7 +1: + + .if \flags & NEED_EMU + /* + * This assumes you have external PID support. + * To support a bookehv CPU without external PID, you'll + * need to look up the TLB entry and create a temporary mapping. + * + * FIXME: we don't currently handle if the lwepx faults. PR-mode + * booke doesn't handle it either. 
Since Linux doesn't use + * broadcast tlbivax anymore, the only way this should happen is + * if the guest maps its memory execute-but-not-read, or if we + * somehow take a TLB miss in the middle of this entry code and + * evict the relevant entry. On e500mc, all kernel lowmem is + * bolted into TLB1 large page mappings, and we don't use + * broadcast invalidates, so we should not take a TLB miss here. + * + * Later we'll need to deal with faults here. Disallowing guest + * mappings that are execute-but-not-read could be an option on + * e500mc, but not on chips with an LRAT if it is used. + */ + + mfspr r3, SPRN_EPLC /* will already have correct ELPID and EGS */ + PPC_STL r15, VCPU_GPR(r15)(r4) + PPC_STL r16, VCPU_GPR(r16)(r4) + PPC_STL r17, VCPU_GPR(r17)(r4) + PPC_STL r18, VCPU_GPR(r18)(r4) + PPC_STL r19, VCPU_GPR(r19)(r4) + mr r8, r3 + PPC_STL r20, VCPU_GPR(r20)(r4) + rlwimi r8, r6, EPC_EAS_SHIFT - MSR_IR_LG, EPC_EAS + PPC_STL r21, VCPU_GPR(r21)(r4) + rlwimi r8, r6, EPC_EPR_SHIFT - MSR_PR_LG, EPC_EPR + PPC_STL r22, VCPU_GPR(r22)(r4) + rlwimi r8, r10, EPC_EPID_SHIFT, EPC_EPID + PPC_STL r23, VCPU_GPR(r23)(r4) + PPC_STL r24, VCPU_GPR(r24)(r4) + PPC_STL r25, VCPU_GPR(r25)(r4) + PPC_STL r26, VCPU_GPR(r26)(r4) + PPC_STL r27, VCPU_GPR(r27)(r4) + PPC_STL r28, VCPU_GPR(r28)(r4) + PPC_STL r29, VCPU_GPR(r29)(r4) + PPC_STL r30, VCPU_GPR(r30)(r4) + PPC_STL r31, VCPU_GPR(r31)(r4) + mtspr SPRN_EPLC, r8 + isync + lwepx r9, 0, r5 + mtspr SPRN_EPLC, r3 + stw r9, VCPU_LAST_INST(r4) + .endif + + .if \flags & NEED_ESR + mfspr r8, SPRN_ESR + PPC_STL r8, VCPU_FAULT_ESR(r4) + .endif + + .if \flags & NEED_DEAR + mfspr r9, SPRN_DEAR + PPC_STL r9, VCPU_FAULT_DEAR(r4) + .endif + + b kvmppc_resume_host +.endm + +/* + * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h + */ +.macro kvm_handler intno srr0, srr1, flags +_GLOBAL(kvmppc_handler_\intno\()_\srr1) + GET_VCPU(r11, r10) + PPC_STL r3, VCPU_GPR(r3)(r11) + mfspr r3, SPRN_SPRG_RSCRATCH0 + PPC_STL r4, 
VCPU_GPR(r4)(r11) + PPC_LL r4, THREAD_NORMSAVE(0)(r10) + PPC_STL r5, VCPU_GPR(r5)(r11) + PPC_STL r13, VCPU_CR(r11) + mfspr r5, \srr0 + PPC_STL r3, VCPU_GPR(r10)(r11) + PPC_LL r3, THREAD_NORMSAVE(2)(r10) + PPC_STL r6, VCPU_GPR(r6)(r11) + PPC_STL r4, VCPU_GPR(r11)(r11) + mfspr r6, \srr1 + PPC_STL r7, VCPU_GPR(r7)(r11) + PPC_STL r8, VCPU_GPR(r8)(r11) + PPC_STL r9, VCPU_GPR(r9)(r11) + PPC_STL r3, VCPU_GPR(r13)(r11) + mfctr r7 + PPC_STL r12, VCPU_GPR(r12)(r11) + PPC_STL r7, VCPU_CTR(r11) + mr r4, r11 + kvm_handler_common \intno, \srr0, \flags +.endm + +.macro kvm_lvl_handler intno scratch srr0, srr1, flags +_GLOBAL(kvmppc_handler_\intno\()_\srr1) + mfspr r10, SPRN_SPRG_THREAD + GET_VCPU(r11, r10) + PPC_STL r3, VCPU_GPR(r3)(r11) + mfspr r3, \scratch + PPC_STL r4, VCPU_GPR(r4)(r11) + PPC_LL r4, GPR9(r8) + PPC_STL r5, VCPU_GPR(r5)(r11) + PPC_STL r9, VCPU_CR(r11) + mfspr r5, \srr0 + PPC_STL r3, VCPU_GPR(r8)(r11) + PPC_LL r3, GPR10(r8) + PPC_STL r6, VCPU_GPR(r6)(r11) + PPC_STL r4, VCPU_GPR(r9)(r11) + mfspr r6, \srr1 + PPC_LL r4, GPR11(r8) + PPC_STL r7, VCPU_GPR(r7)(r11) + PPC_STL r8, VCPU_GPR(r8)(r11) + PPC_STL r3, VCPU_GPR(r10)(r11) + mfctr r7 + PPC_STL r12, VCPU_GPR(r12)(r11) + PPC_STL r4, VCPU_GPR(r11)(r11) + PPC_STL r7, VCPU_CTR(r11) + mr r4, r11 + kvm_handler_common \intno, \srr0, \flags +.endm + +kvm_lvl_handler BOOKE_INTERRUPT_CRITICAL, \ + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \ + SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, \ + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR) +kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR +kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \ + SPRN_SRR0, SPRN_SRR1, (NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_PROGRAM, SPRN_SRR0, SPRN_SRR1, NEED_ESR +kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler 
BOOKE_INTERRUPT_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DECREMENTER, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_FIT, SPRN_SRR0, SPRN_SRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_WATCHDOG, \ + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DTLB_MISS, \ + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_ITLB_MISS, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DOORBELL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, \ + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_HV_PRIV, SPRN_SRR0, SPRN_SRR1, NEED_EMU +kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, SPRN_GSRR0, SPRN_GSRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, \ + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ + SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0 + + +/* Registers: + * SPRG_SCRATCH0: guest r10 + * r4: vcpu pointer + * r11: vcpu->arch.shared + * r14: KVM exit number + */ +_GLOBAL(kvmppc_resume_host) + /* Save remaining volatile guest register state to vcpu. 
*/ + mfspr r3, SPRN_VRSAVE + PPC_STL r0, VCPU_GPR(r0)(r4) + PPC_STL r1, VCPU_GPR(r1)(r4) + mflr r5 + mfspr r6, SPRN_SPRG4 + PPC_STL r2, VCPU_GPR(r2)(r4) + PPC_STL r5, VCPU_LR(r4) + mfspr r7, SPRN_SPRG5 + PPC_STL r3, VCPU_VRSAVE(r4) + PPC_STL r6, VCPU_SHARED_SPRG4(r11) + mfspr r8, SPRN_SPRG6 + PPC_STL r7, VCPU_SHARED_SPRG5(r11) + mfspr r9, SPRN_SPRG7 + PPC_STL r8, VCPU_SHARED_SPRG6(r11) + mfxer r3 + PPC_STL r9, VCPU_SHARED_SPRG7(r11) + + /* save guest MAS registers and restore host mas4 & mas6 */ + mfspr r5, SPRN_MAS0 + PPC_STL r3, VCPU_XER(r4) + mfspr r6, SPRN_MAS1 + stw r5, VCPU_SHARED_MAS0(r11) + mfspr r7, SPRN_MAS2 + stw r6, VCPU_SHARED_MAS1(r11) +#ifndef CONFIG_64BIT + stw r7, (VCPU_SHARED_MAS2 + 4)(r11) +#else + std r7, (VCPU_SHARED_MAS2)(r11) +#endif + mfspr r5, SPRN_MAS3 + mfspr r6, SPRN_MAS4 + stw r5, VCPU_SHARED_MAS7_3+4(r11) + mfspr r7, SPRN_MAS6 + stw r6, VCPU_SHARED_MAS4(r11) + mfspr r5, SPRN_MAS7 + lwz r6, VCPU_HOST_MAS4(r4) + stw r7, VCPU_SHARED_MAS6(r11) + lwz r8, VCPU_HOST_MAS6(r4) + mtspr SPRN_MAS4, r6 + stw r5, VCPU_SHARED_MAS7_3+0(r11) + mtspr SPRN_MAS6, r8 + mfspr r3, SPRN_EPCR + rlwinm r3, r3, 0, ~SPRN_EPCR_DMIUH + mtspr SPRN_EPCR, r3 + isync + + /* Restore host stack pointer */ + PPC_LL r1, VCPU_HOST_STACK(r4) + PPC_LL r2, HOST_R2(r1) + + /* Switch to kernel stack and jump to handler. */ + PPC_LL r3, HOST_RUN(r1) + mr r5, r14 /* intno */ + mr r14, r4 /* Save vcpu pointer. */ + bl kvmppc_handle_exit + + /* Restore vcpu pointer and the nonvolatiles we used. */ + mr r4, r14 + PPC_LL r14, VCPU_GPR(r14)(r4) + + andi. 
r5, r3, RESUME_FLAG_NV + beq skip_nv_load + PPC_LL r15, VCPU_GPR(r15)(r4) + PPC_LL r16, VCPU_GPR(r16)(r4) + PPC_LL r17, VCPU_GPR(r17)(r4) + PPC_LL r18, VCPU_GPR(r18)(r4) + PPC_LL r19, VCPU_GPR(r19)(r4) + PPC_LL r20, VCPU_GPR(r20)(r4) + PPC_LL r21, VCPU_GPR(r21)(r4) + PPC_LL r22, VCPU_GPR(r22)(r4) + PPC_LL r23, VCPU_GPR(r23)(r4) + PPC_LL r24, VCPU_GPR(r24)(r4) + PPC_LL r25, VCPU_GPR(r25)(r4) + PPC_LL r26, VCPU_GPR(r26)(r4) + PPC_LL r27, VCPU_GPR(r27)(r4) + PPC_LL r28, VCPU_GPR(r28)(r4) + PPC_LL r29, VCPU_GPR(r29)(r4) + PPC_LL r30, VCPU_GPR(r30)(r4) + PPC_LL r31, VCPU_GPR(r31)(r4) +skip_nv_load: + /* Should we return to the guest? */ + andi. r5, r3, RESUME_FLAG_HOST + beq lightweight_exit + + srawi r3, r3, 2 /* Shift -ERR back down. */ + +heavyweight_exit: + /* Not returning to guest. */ + PPC_LL r5, HOST_STACK_LR(r1) + + /* + * We already saved guest volatile register state; now save the + * non-volatiles. + */ + + PPC_STL r15, VCPU_GPR(r15)(r4) + PPC_STL r16, VCPU_GPR(r16)(r4) + PPC_STL r17, VCPU_GPR(r17)(r4) + PPC_STL r18, VCPU_GPR(r18)(r4) + PPC_STL r19, VCPU_GPR(r19)(r4) + PPC_STL r20, VCPU_GPR(r20)(r4) + PPC_STL r21, VCPU_GPR(r21)(r4) + PPC_STL r22, VCPU_GPR(r22)(r4) + PPC_STL r23, VCPU_GPR(r23)(r4) + PPC_STL r24, VCPU_GPR(r24)(r4) + PPC_STL r25, VCPU_GPR(r25)(r4) + PPC_STL r26, VCPU_GPR(r26)(r4) + PPC_STL r27, VCPU_GPR(r27)(r4) + PPC_STL r28, VCPU_GPR(r28)(r4) + PPC_STL r29, VCPU_GPR(r29)(r4) + PPC_STL r30, VCPU_GPR(r30)(r4) + PPC_STL r31, VCPU_GPR(r31)(r4) + + /* Load host non-volatile register state from host stack. 
*/ + PPC_LL r14, HOST_NV_GPR(r14)(r1) + PPC_LL r15, HOST_NV_GPR(r15)(r1) + PPC_LL r16, HOST_NV_GPR(r16)(r1) + PPC_LL r17, HOST_NV_GPR(r17)(r1) + PPC_LL r18, HOST_NV_GPR(r18)(r1) + PPC_LL r19, HOST_NV_GPR(r19)(r1) + PPC_LL r20, HOST_NV_GPR(r20)(r1) + PPC_LL r21, HOST_NV_GPR(r21)(r1) + PPC_LL r22, HOST_NV_GPR(r22)(r1) + PPC_LL r23, HOST_NV_GPR(r23)(r1) + PPC_LL r24, HOST_NV_GPR(r24)(r1) + PPC_LL r25, HOST_NV_GPR(r25)(r1) + PPC_LL r26, HOST_NV_GPR(r26)(r1) + PPC_LL r27, HOST_NV_GPR(r27)(r1) + PPC_LL r28, HOST_NV_GPR(r28)(r1) + PPC_LL r29, HOST_NV_GPR(r29)(r1) + PPC_LL r30, HOST_NV_GPR(r30)(r1) + PPC_LL r31, HOST_NV_GPR(r31)(r1) + + /* Return to kvm_vcpu_run(). */ + mtlr r5 + addi r1, r1, HOST_STACK_SIZE + /* r3 still contains the return code from kvmppc_handle_exit(). */ + blr + +/* Registers: + * r3: kvm_run pointer + * r4: vcpu pointer + */ +_GLOBAL(__kvmppc_vcpu_run) + stwu r1, -HOST_STACK_SIZE(r1) + PPC_STL r1, VCPU_HOST_STACK(r4) /* Save stack pointer to vcpu. */ + + /* Save host state to stack. */ + PPC_STL r3, HOST_RUN(r1) + mflr r3 + PPC_STL r3, HOST_STACK_LR(r1) + + /* Save host non-volatile register state to stack. */ + PPC_STL r14, HOST_NV_GPR(r14)(r1) + PPC_STL r15, HOST_NV_GPR(r15)(r1) + PPC_STL r16, HOST_NV_GPR(r16)(r1) + PPC_STL r17, HOST_NV_GPR(r17)(r1) + PPC_STL r18, HOST_NV_GPR(r18)(r1) + PPC_STL r19, HOST_NV_GPR(r19)(r1) + PPC_STL r20, HOST_NV_GPR(r20)(r1) + PPC_STL r21, HOST_NV_GPR(r21)(r1) + PPC_STL r22, HOST_NV_GPR(r22)(r1) + PPC_STL r23, HOST_NV_GPR(r23)(r1) + PPC_STL r24, HOST_NV_GPR(r24)(r1) + PPC_STL r25, HOST_NV_GPR(r25)(r1) + PPC_STL r26, HOST_NV_GPR(r26)(r1) + PPC_STL r27, HOST_NV_GPR(r27)(r1) + PPC_STL r28, HOST_NV_GPR(r28)(r1) + PPC_STL r29, HOST_NV_GPR(r29)(r1) + PPC_STL r30, HOST_NV_GPR(r30)(r1) + PPC_STL r31, HOST_NV_GPR(r31)(r1) + + /* Load guest non-volatiles. 
*/ + PPC_LL r14, VCPU_GPR(r14)(r4) + PPC_LL r15, VCPU_GPR(r15)(r4) + PPC_LL r16, VCPU_GPR(r16)(r4) + PPC_LL r17, VCPU_GPR(r17)(r4) + PPC_LL r18, VCPU_GPR(r18)(r4) + PPC_LL r19, VCPU_GPR(r19)(r4) + PPC_LL r20, VCPU_GPR(r20)(r4) + PPC_LL r21, VCPU_GPR(r21)(r4) + PPC_LL r22, VCPU_GPR(r22)(r4) + PPC_LL r23, VCPU_GPR(r23)(r4) + PPC_LL r24, VCPU_GPR(r24)(r4) + PPC_LL r25, VCPU_GPR(r25)(r4) + PPC_LL r26, VCPU_GPR(r26)(r4) + PPC_LL r27, VCPU_GPR(r27)(r4) + PPC_LL r28, VCPU_GPR(r28)(r4) + PPC_LL r29, VCPU_GPR(r29)(r4) + PPC_LL r30, VCPU_GPR(r30)(r4) + PPC_LL r31, VCPU_GPR(r31)(r4) + + +lightweight_exit: + PPC_STL r2, HOST_R2(r1) + + mfspr r3, SPRN_PID + stw r3, VCPU_HOST_PID(r4) + lwz r3, VCPU_GUEST_PID(r4) + mtspr SPRN_PID, r3 + + /* Save vcpu pointer for the exception handlers + * must be done before loading guest r2. + */ +// SET_VCPU(r4) + + PPC_LL r11, VCPU_SHARED(r4) + /* Save host mas4 and mas6 and load guest MAS registers */ + mfspr r3, SPRN_MAS4 + stw r3, VCPU_HOST_MAS4(r4) + mfspr r3, SPRN_MAS6 + stw r3, VCPU_HOST_MAS6(r4) + lwz r3, VCPU_SHARED_MAS0(r11) + lwz r5, VCPU_SHARED_MAS1(r11) +#ifndef CONFIG_64BIT + lwz r6, (VCPU_SHARED_MAS2 + 4)(r11) +#else + ld r6, (VCPU_SHARED_MAS2)(r11) +#endif + lwz r7, VCPU_SHARED_MAS7_3+4(r11) + lwz r8, VCPU_SHARED_MAS4(r11) + mtspr SPRN_MAS0, r3 + mtspr SPRN_MAS1, r5 + mtspr SPRN_MAS2, r6 + mtspr SPRN_MAS3, r7 + mtspr SPRN_MAS4, r8 + lwz r3, VCPU_SHARED_MAS6(r11) + lwz r5, VCPU_SHARED_MAS7_3+0(r11) + mtspr SPRN_MAS6, r3 + mtspr SPRN_MAS7, r5 + /* Disable MAS register updates via exception */ + mfspr r3, SPRN_EPCR + oris r3, r3, SPRN_EPCR_DMIUH@h + mtspr SPRN_EPCR, r3 + + /* + * Host interrupt handlers may have clobbered these guest-readable + * SPRGs, so we need to reload them here with the guest's values. 
+ */ + lwz r3, VCPU_VRSAVE(r4) + lwz r5, VCPU_SHARED_SPRG4(r11) + mtspr SPRN_VRSAVE, r3 + lwz r6, VCPU_SHARED_SPRG5(r11) + mtspr SPRN_SPRG4W, r5 + lwz r7, VCPU_SHARED_SPRG6(r11) + mtspr SPRN_SPRG5W, r6 + lwz r8, VCPU_SHARED_SPRG7(r11) + mtspr SPRN_SPRG6W, r7 + mtspr SPRN_SPRG7W, r8 + + /* Load some guest volatiles. */ + PPC_LL r3, VCPU_LR(r4) + PPC_LL r5, VCPU_XER(r4) + PPC_LL r6, VCPU_CTR(r4) + PPC_LL r7, VCPU_CR(r4) + PPC_LL r8, VCPU_PC(r4) +#ifndef CONFIG_64BIT + lwz r9, (VCPU_SHARED_MSR + 4)(r11) +#else + ld r9, (VCPU_SHARED_MSR)(r11) +#endif + PPC_LL r0, VCPU_GPR(r0)(r4) + PPC_LL r1, VCPU_GPR(r1)(r4) + PPC_LL r2, VCPU_GPR(r2)(r4) + PPC_LL r10, VCPU_GPR(r10)(r4) + PPC_LL r11, VCPU_GPR(r11)(r4) + PPC_LL r12, VCPU_GPR(r12)(r4) + PPC_LL r13, VCPU_GPR(r13)(r4) + mtlr r3 + mtxer r5 + mtctr r6 + mtcr r7 + mtsrr0 r8 + mtsrr1 r9 + +#ifdef CONFIG_KVM_EXIT_TIMING + /* save enter time */ +1: + mfspr r6, SPRN_TBRU + mfspr r7, SPRN_TBRL + mfspr r8, SPRN_TBRU + cmpw r8, r6 + PPC_STL r7, VCPU_TIMING_LAST_ENTER_TBL(r4) + bne 1b + PPC_STL r8, VCPU_TIMING_LAST_ENTER_TBU(r4) +#endif + + /* Finish loading guest volatiles and jump to guest. 
*/ + PPC_LL r5, VCPU_GPR(r5)(r4) + PPC_LL r6, VCPU_GPR(r6)(r4) + PPC_LL r7, VCPU_GPR(r7)(r4) + PPC_LL r8, VCPU_GPR(r8)(r4) + PPC_LL r9, VCPU_GPR(r9)(r4) + + PPC_LL r3, VCPU_GPR(r3)(r4) + PPC_LL r4, VCPU_GPR(r4)(r4) + rfi diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 42701e5..f9c62dd 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -109,6 +109,11 @@ int kvmppc_sanity_check(struct kvm_vcpu *vcpu) goto out; #endif +#ifdef CONFIG_KVM_BOOKE_HV + if (!cpu_has_feature(CPU_FTR_EMB_HV)) + goto out; +#endif + r = true; out: diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h index 8167d42..bf191e7 100644 --- a/arch/powerpc/kvm/timing.h +++ b/arch/powerpc/kvm/timing.h @@ -93,6 +93,12 @@ static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type) case SIGNAL_EXITS: vcpu->stat.signal_exits++; break; + case DBELL_EXITS: + vcpu->stat.dbell_exits++; + break; + case GDBELL_EXITS: + vcpu->stat.gdbell_exits++; + break; } }
Chips such as e500mc that implement category E.HV in Power ISA 2.06 provide hardware virtualization features, including a new MSR mode for guest state. The guest OS can perform many operations without trapping into the hypervisor, including transitions to and from guest userspace. Since we can use SRR1[GS] to reliably tell whether an exception came from guest state, instead of messing around with IVPR, we use DO_KVM similarly to book3s. Current issues include: - Machine checks from guest state are not routed to the host handler. - The guest can cause a host oops by executing an emulated instruction in a page that lacks read permission. Existing e500/4xx support has the same problem. Includes work by Ashish Kalra <Ashish.Kalra@freescale.com>, Varun Sethi <Varun.Sethi@freescale.com>, and Liu Yu <yu.liu@freescale.com>. Signed-off-by: Scott Wood <scottwood@freescale.com> --- arch/powerpc/include/asm/dbell.h | 1 + arch/powerpc/include/asm/kvm_asm.h | 8 + arch/powerpc/include/asm/kvm_booke_hv_asm.h | 49 +++ arch/powerpc/include/asm/kvm_host.h | 19 +- arch/powerpc/include/asm/kvm_ppc.h | 3 + arch/powerpc/include/asm/mmu-book3e.h | 6 + arch/powerpc/include/asm/processor.h | 3 + arch/powerpc/include/asm/reg.h | 2 + arch/powerpc/include/asm/reg_booke.h | 34 ++ arch/powerpc/kernel/asm-offsets.c | 15 +- arch/powerpc/kernel/head_booke.h | 28 ++- arch/powerpc/kvm/Kconfig | 3 + arch/powerpc/kvm/booke.c | 398 ++++++++++++++----- arch/powerpc/kvm/booke.h | 24 +- arch/powerpc/kvm/booke_emulate.c | 23 +- arch/powerpc/kvm/bookehv_interrupts.S | 587 +++++++++++++++++++++++++++ arch/powerpc/kvm/powerpc.c | 5 + arch/powerpc/kvm/timing.h | 6 + 18 files changed, 1107 insertions(+), 107 deletions(-) create mode 100644 arch/powerpc/include/asm/kvm_booke_hv_asm.h create mode 100644 arch/powerpc/kvm/bookehv_interrupts.S