Message ID | 567473a0-6005-5843-4c73-951f476085ca@web.de |
---|---|
State | New |
Series | [v2] target-i386: Add NPT support |
On 30/06/2018 08:08, Jan Kiszka wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
>
> This implements NPT support for SVM by hooking into
> x86_cpu_handle_mmu_fault where it reads the stage-1 page table. Whether
> we need to perform this 2nd stage translation, and how, is decided
> during vmrun and stored in hflags2, along with nested_cr3 and
> nested_pg_mode.
>
> As get_hphys performs a direct cpu_vmexit in case of NPT faults, we need
> retaddr in that function. To avoid changing the signature of
> cpu_handle_mmu_fault, this passes the value from tlb_fill to get_hphys
> via the CPU state.
>
> This was tested successfully via the Jailhouse hypervisor.
>
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
>
> Changes in v2:
>  - use hflags2 instead of hflags
>  - add conditional vmstate subsection
>
>  target/i386/cpu.c         |   2 +-
>  target/i386/cpu.h         |   6 ++
>  target/i386/excp_helper.c | 216 +++++++++++++++++++++++++++++++++++++++++++++-
>  target/i386/machine.c     |  21 +++
>  target/i386/mem_helper.c  |   6 +-
>  target/i386/svm.h         |  14 +++
>  target/i386/svm_helper.c  |  22 +++
>  7 files changed, 281 insertions(+), 6 deletions(-)
>
> [...]

Queued, thanks.

Paolo
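The mechanism the commit message describes composes two translation stages: the existing guest (stage-1) walk, and a nested (stage-2) walk applied both to every page-table entry the guest walk reads and to the final translated address. As a conceptual aid before the patch itself, here is a minimal C sketch of that composition. It is not QEMU code: `stage2()` and `walk_one_level()` are hypothetical stand-ins for `get_hphys()` and the walk in `x86_cpu_handle_mmu_fault()`.

```c
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical stand-ins, not QEMU API: a minimal sketch of how the
 * patch composes the two translation stages. */
typedef uint64_t gpa_t;   /* guest-physical address */
typedef uint64_t hpa_t;   /* host-physical address  */

static bool npt_enabled;  /* mirrors env->hflags2 & HF2_NPT_MASK, set at VMRUN */

/* Stage 2, corresponding to get_hphys(): the identity when NPT is off,
 * so the common case costs a single flag test. */
static hpa_t stage2(gpa_t gpa)
{
    if (!npt_enabled) {
        return gpa;
    }
    /* ... walk the nested table rooted at nested_cr3; a fault here
     * raises SVM_EXIT_NPF instead of returning ... */
    return gpa;  /* placeholder for the nested-walk result */
}

/* Stage 1, corresponding to x86_cpu_handle_mmu_fault(): every table
 * entry read during the guest walk lives at a guest-physical address,
 * so each entry address is passed through stage2() before the load,
 * exactly what the new get_hphys() calls below do for the pml5e, pml4e,
 * pdpe, pde and pte addresses; the final translation goes through
 * stage2() as well. */
static hpa_t walk_one_level(gpa_t entry_gpa)
{
    hpa_t entry_hpa = stage2(entry_gpa);
    /* ... load the entry from entry_hpa and continue the walk ... */
    return entry_hpa;
}
```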
```diff
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 1e6a7d0a75..6e1f180249 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -751,7 +751,7 @@ static void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
 #define TCG_EXT3_FEATURES (CPUID_EXT3_LAHF_LM | CPUID_EXT3_SVM | \
           CPUID_EXT3_CR8LEG | CPUID_EXT3_ABM | CPUID_EXT3_SSE4A)
 #define TCG_EXT4_FEATURES 0
-#define TCG_SVM_FEATURES 0
+#define TCG_SVM_FEATURES CPUID_SVM_NPT
 #define TCG_KVM_FEATURES 0
 #define TCG_7_0_EBX_FEATURES (CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_SMAP | \
           CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ADX | \
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 8eaefeee3e..7f33755bf5 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -211,6 +211,7 @@ typedef enum X86Seg {
 #define HF2_VINTR_SHIFT          3 /* value of V_INTR_MASKING bit */
 #define HF2_SMM_INSIDE_NMI_SHIFT 4 /* CPU serving SMI nested inside NMI */
 #define HF2_MPX_PR_SHIFT         5 /* BNDCFGx.BNDPRESERVE */
+#define HF2_NPT_SHIFT            6 /* Nested Paging enabled */
 
 #define HF2_GIF_MASK            (1 << HF2_GIF_SHIFT)
 #define HF2_HIF_MASK            (1 << HF2_HIF_SHIFT)
@@ -218,6 +219,7 @@ typedef enum X86Seg {
 #define HF2_VINTR_MASK          (1 << HF2_VINTR_SHIFT)
 #define HF2_SMM_INSIDE_NMI_MASK (1 << HF2_SMM_INSIDE_NMI_SHIFT)
 #define HF2_MPX_PR_MASK         (1 << HF2_MPX_PR_SHIFT)
+#define HF2_NPT_MASK            (1 << HF2_NPT_SHIFT)
 
 #define CR0_PE_SHIFT 0
 #define CR0_MP_SHIFT 1
@@ -1265,12 +1267,16 @@ typedef struct CPUX86State {
     uint16_t intercept_dr_read;
     uint16_t intercept_dr_write;
     uint32_t intercept_exceptions;
+    uint64_t nested_cr3;
+    uint32_t nested_pg_mode;
     uint8_t v_tpr;
 
     /* KVM states, automatically cleared on reset */
     uint8_t nmi_injected;
     uint8_t nmi_pending;
 
+    uintptr_t retaddr;
+
     /* Fields up to this point are cleared by a CPU reset */
     struct {} end_reset_fields;
 
diff --git a/target/i386/excp_helper.c b/target/i386/excp_helper.c
index cb4d1b7d33..37a33d5ae0 100644
--- a/target/i386/excp_helper.c
+++ b/target/i386/excp_helper.c
@@ -157,6 +157,209 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
 
 #else
 
+static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
+                        int *prot)
+{
+    CPUX86State *env = &X86_CPU(cs)->env;
+    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
+    uint64_t ptep, pte;
+    uint64_t exit_info_1 = 0;
+    target_ulong pde_addr, pte_addr;
+    uint32_t page_offset;
+    int page_size;
+
+    if (likely(!(env->hflags2 & HF2_NPT_MASK))) {
+        return gphys;
+    }
+
+    if (!(env->nested_pg_mode & SVM_NPT_NXE)) {
+        rsvd_mask |= PG_NX_MASK;
+    }
+
+    if (env->nested_pg_mode & SVM_NPT_PAE) {
+        uint64_t pde, pdpe;
+        target_ulong pdpe_addr;
+
+#ifdef TARGET_X86_64
+        if (env->nested_pg_mode & SVM_NPT_LMA) {
+            uint64_t pml5e;
+            uint64_t pml4e_addr, pml4e;
+
+            pml5e = env->nested_cr3;
+            ptep = PG_NX_MASK | PG_USER_MASK | PG_RW_MASK;
+
+            pml4e_addr = (pml5e & PG_ADDRESS_MASK) +
+                    (((gphys >> 39) & 0x1ff) << 3);
+            pml4e = x86_ldq_phys(cs, pml4e_addr);
+            if (!(pml4e & PG_PRESENT_MASK)) {
+                goto do_fault;
+            }
+            if (pml4e & (rsvd_mask | PG_PSE_MASK)) {
+                goto do_fault_rsvd;
+            }
+            if (!(pml4e & PG_ACCESSED_MASK)) {
+                pml4e |= PG_ACCESSED_MASK;
+                x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
+            }
+            ptep &= pml4e ^ PG_NX_MASK;
+            pdpe_addr = (pml4e & PG_ADDRESS_MASK) +
+                    (((gphys >> 30) & 0x1ff) << 3);
+            pdpe = x86_ldq_phys(cs, pdpe_addr);
+            if (!(pdpe & PG_PRESENT_MASK)) {
+                goto do_fault;
+            }
+            if (pdpe & rsvd_mask) {
+                goto do_fault_rsvd;
+            }
+            ptep &= pdpe ^ PG_NX_MASK;
+            if (!(pdpe & PG_ACCESSED_MASK)) {
+                pdpe |= PG_ACCESSED_MASK;
+                x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
+            }
+            if (pdpe & PG_PSE_MASK) {
+                /* 1 GB page */
+                page_size = 1024 * 1024 * 1024;
+                pte_addr = pdpe_addr;
+                pte = pdpe;
+                goto do_check_protect;
+            }
+        } else
+#endif
+        {
+            pdpe_addr = (env->nested_cr3 & ~0x1f) + ((gphys >> 27) & 0x18);
+            pdpe = x86_ldq_phys(cs, pdpe_addr);
+            if (!(pdpe & PG_PRESENT_MASK)) {
+                goto do_fault;
+            }
+            rsvd_mask |= PG_HI_USER_MASK;
+            if (pdpe & (rsvd_mask | PG_NX_MASK)) {
+                goto do_fault_rsvd;
+            }
+            ptep = PG_NX_MASK | PG_USER_MASK | PG_RW_MASK;
+        }
+
+        pde_addr = (pdpe & PG_ADDRESS_MASK) + (((gphys >> 21) & 0x1ff) << 3);
+        pde = x86_ldq_phys(cs, pde_addr);
+        if (!(pde & PG_PRESENT_MASK)) {
+            goto do_fault;
+        }
+        if (pde & rsvd_mask) {
+            goto do_fault_rsvd;
+        }
+        ptep &= pde ^ PG_NX_MASK;
+        if (pde & PG_PSE_MASK) {
+            /* 2 MB page */
+            page_size = 2048 * 1024;
+            pte_addr = pde_addr;
+            pte = pde;
+            goto do_check_protect;
+        }
+        /* 4 KB page */
+        if (!(pde & PG_ACCESSED_MASK)) {
+            pde |= PG_ACCESSED_MASK;
+            x86_stl_phys_notdirty(cs, pde_addr, pde);
+        }
+        pte_addr = (pde & PG_ADDRESS_MASK) + (((gphys >> 12) & 0x1ff) << 3);
+        pte = x86_ldq_phys(cs, pte_addr);
+        if (!(pte & PG_PRESENT_MASK)) {
+            goto do_fault;
+        }
+        if (pte & rsvd_mask) {
+            goto do_fault_rsvd;
+        }
+        /* combine pde and pte nx, user and rw protections */
+        ptep &= pte ^ PG_NX_MASK;
+        page_size = 4096;
+    } else {
+        uint32_t pde;
+
+        /* page directory entry */
+        pde_addr = (env->nested_cr3 & ~0xfff) + ((gphys >> 20) & 0xffc);
+        pde = x86_ldl_phys(cs, pde_addr);
+        if (!(pde & PG_PRESENT_MASK)) {
+            goto do_fault;
+        }
+        ptep = pde | PG_NX_MASK;
+
+        /* if PSE bit is set, then we use a 4MB page */
+        if ((pde & PG_PSE_MASK) && (env->cr[4] & CR4_PSE_MASK)) {
+            page_size = 4096 * 1024;
+            pte_addr = pde_addr;
+
+            /* Bits 20-13 provide bits 39-32 of the address, bit 21 is reserved.
+             * Leave bits 20-13 in place for setting accessed/dirty bits below.
+             */
+            pte = pde | ((pde & 0x1fe000LL) << (32 - 13));
+            rsvd_mask = 0x200000;
+            goto do_check_protect_pse36;
+        }
+
+        if (!(pde & PG_ACCESSED_MASK)) {
+            pde |= PG_ACCESSED_MASK;
+            x86_stl_phys_notdirty(cs, pde_addr, pde);
+        }
+
+        /* page directory entry */
+        pte_addr = (pde & ~0xfff) + ((gphys >> 10) & 0xffc);
+        pte = x86_ldl_phys(cs, pte_addr);
+        if (!(pte & PG_PRESENT_MASK)) {
+            goto do_fault;
+        }
+        /* combine pde and pte user and rw protections */
+        ptep &= pte | PG_NX_MASK;
+        page_size = 4096;
+        rsvd_mask = 0;
+    }
+
+ do_check_protect:
+    rsvd_mask |= (page_size - 1) & PG_ADDRESS_MASK & ~PG_PSE_PAT_MASK;
+ do_check_protect_pse36:
+    if (pte & rsvd_mask) {
+        goto do_fault_rsvd;
+    }
+    ptep ^= PG_NX_MASK;
+
+    if (!(ptep & PG_USER_MASK)) {
+        goto do_fault_protect;
+    }
+    if (ptep & PG_NX_MASK) {
+        if (access_type == MMU_INST_FETCH) {
+            goto do_fault_protect;
+        }
+        *prot &= ~PAGE_EXEC;
+    }
+    if (!(ptep & PG_RW_MASK)) {
+        if (access_type == MMU_DATA_STORE) {
+            goto do_fault_protect;
+        }
+        *prot &= ~PAGE_WRITE;
+    }
+
+    pte &= PG_ADDRESS_MASK & ~(page_size - 1);
+    page_offset = gphys & (page_size - 1);
+    return pte + page_offset;
+
+ do_fault_rsvd:
+    exit_info_1 |= SVM_NPTEXIT_RSVD;
+ do_fault_protect:
+    exit_info_1 |= SVM_NPTEXIT_P;
+ do_fault:
+    x86_stq_phys(cs, env->vm_vmcb + offsetof(struct vmcb, control.exit_info_2),
+                 gphys);
+    exit_info_1 |= SVM_NPTEXIT_US;
+    if (access_type == MMU_DATA_STORE) {
+        exit_info_1 |= SVM_NPTEXIT_RW;
+    } else if (access_type == MMU_INST_FETCH) {
+        exit_info_1 |= SVM_NPTEXIT_ID;
+    }
+    if (prot) {
+        exit_info_1 |= SVM_NPTEXIT_GPA;
+    } else { /* page table access */
+        exit_info_1 |= SVM_NPTEXIT_GPT;
+    }
+    cpu_vmexit(env, SVM_EXIT_NPF, exit_info_1, env->retaddr);
+}
+
 /* return value:
  * -1 = cannot handle fault
  * 0  = nothing more to do
@@ -224,6 +427,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
             if (la57) {
                 pml5e_addr = ((env->cr[3] & ~0xfff) +
                         (((addr >> 48) & 0x1ff) << 3)) & a20_mask;
+                pml5e_addr = get_hphys(cs, pml5e_addr, MMU_DATA_STORE, NULL);
                 pml5e = x86_ldq_phys(cs, pml5e_addr);
                 if (!(pml5e & PG_PRESENT_MASK)) {
                     goto do_fault;
@@ -243,6 +447,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
 
             pml4e_addr = ((pml5e & PG_ADDRESS_MASK) +
                     (((addr >> 39) & 0x1ff) << 3)) & a20_mask;
+            pml4e_addr = get_hphys(cs, pml4e_addr, MMU_DATA_STORE, NULL);
            pml4e = x86_ldq_phys(cs, pml4e_addr);
             if (!(pml4e & PG_PRESENT_MASK)) {
                 goto do_fault;
@@ -257,6 +462,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
             ptep &= pml4e ^ PG_NX_MASK;
             pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + (((addr >> 30) & 0x1ff) << 3)) &
                 a20_mask;
+            pdpe_addr = get_hphys(cs, pdpe_addr, MMU_DATA_STORE, NULL);
             pdpe = x86_ldq_phys(cs, pdpe_addr);
             if (!(pdpe & PG_PRESENT_MASK)) {
                 goto do_fault;
@@ -282,6 +488,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
             /* XXX: load them when cr3 is loaded ? */
             pdpe_addr = ((env->cr[3] & ~0x1f) + ((addr >> 27) & 0x18)) &
                 a20_mask;
+            pdpe_addr = get_hphys(cs, pdpe_addr, MMU_DATA_STORE, NULL);
             pdpe = x86_ldq_phys(cs, pdpe_addr);
             if (!(pdpe & PG_PRESENT_MASK)) {
                 goto do_fault;
@@ -295,6 +502,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
 
         pde_addr = ((pdpe & PG_ADDRESS_MASK) + (((addr >> 21) & 0x1ff) << 3)) &
             a20_mask;
+        pde_addr = get_hphys(cs, pde_addr, MMU_DATA_STORE, NULL);
        pde = x86_ldq_phys(cs, pde_addr);
         if (!(pde & PG_PRESENT_MASK)) {
             goto do_fault;
@@ -317,6 +525,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
         }
         pte_addr = ((pde & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << 3)) &
             a20_mask;
+        pte_addr = get_hphys(cs, pte_addr, MMU_DATA_STORE, NULL);
         pte = x86_ldq_phys(cs, pte_addr);
         if (!(pte & PG_PRESENT_MASK)) {
             goto do_fault;
@@ -333,6 +542,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
         /* page directory entry */
         pde_addr = ((env->cr[3] & ~0xfff) + ((addr >> 20) & 0xffc)) &
             a20_mask;
+        pde_addr = get_hphys(cs, pde_addr, MMU_DATA_STORE, NULL);
         pde = x86_ldl_phys(cs, pde_addr);
         if (!(pde & PG_PRESENT_MASK)) {
             goto do_fault;
@@ -360,6 +570,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
         /* page directory entry */
         pte_addr = ((pde & ~0xfff) + ((addr >> 10) & 0xffc)) &
             a20_mask;
+        pte_addr = get_hphys(cs, pte_addr, MMU_DATA_STORE, NULL);
         pte = x86_ldl_phys(cs, pte_addr);
         if (!(pte & PG_PRESENT_MASK)) {
             goto do_fault;
@@ -442,12 +653,13 @@ do_check_protect_pse36:
 
     /* align to page_size */
     pte &= PG_ADDRESS_MASK & ~(page_size - 1);
+    page_offset = addr & (page_size - 1);
+    paddr = get_hphys(cs, pte + page_offset, is_write1, &prot);
 
     /* Even if 4MB pages, we map only one 4KB page in the cache to
        avoid filling it too fast */
     vaddr = addr & TARGET_PAGE_MASK;
-    page_offset = vaddr & (page_size - 1);
-    paddr = pte + page_offset;
+    paddr &= TARGET_PAGE_MASK;
 
     assert(prot & (1 << is_write1));
     tlb_set_page_with_attrs(cs, vaddr, paddr, cpu_get_mem_attrs(env),
diff --git a/target/i386/machine.c b/target/i386/machine.c
index 4d98d367c1..8b64dff487 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -935,6 +935,26 @@ static const VMStateDescription vmstate_msr_virt_ssbd = {
     }
 };
 
+static bool svm_npt_needed(void *opaque)
+{
+    X86CPU *cpu = opaque;
+    CPUX86State *env = &cpu->env;
+
+    return !!(env->hflags2 & HF2_NPT_MASK);
+}
+
+static const VMStateDescription vmstate_svm_npt = {
+    .name = "cpu/svm_npt",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = svm_npt_needed,
+    .fields = (VMStateField[]){
+        VMSTATE_UINT64(env.nested_cr3, X86CPU),
+        VMSTATE_UINT32(env.nested_pg_mode, X86CPU),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 VMStateDescription vmstate_x86_cpu = {
     .name = "cpu",
     .version_id = 12,
@@ -1059,6 +1079,7 @@ VMStateDescription vmstate_x86_cpu = {
         &vmstate_mcg_ext_ctl,
         &vmstate_msr_intel_pt,
         &vmstate_msr_virt_ssbd,
+        &vmstate_svm_npt,
         NULL
     }
 };
diff --git a/target/i386/mem_helper.c b/target/i386/mem_helper.c
index a8ae694a9c..30c26b9d9c 100644
--- a/target/i386/mem_helper.c
+++ b/target/i386/mem_helper.c
@@ -202,13 +202,13 @@ void helper_boundl(CPUX86State *env, target_ulong a0, int v)
 void tlb_fill(CPUState *cs, target_ulong addr, int size,
               MMUAccessType access_type, int mmu_idx, uintptr_t retaddr)
 {
+    X86CPU *cpu = X86_CPU(cs);
+    CPUX86State *env = &cpu->env;
     int ret;
 
+    env->retaddr = retaddr;
     ret = x86_cpu_handle_mmu_fault(cs, addr, size, access_type, mmu_idx);
     if (ret) {
-        X86CPU *cpu = X86_CPU(cs);
-        CPUX86State *env = &cpu->env;
-
         raise_exception_err_ra(env, cs->exception_index, env->error_code, retaddr);
     }
 }
diff --git a/target/i386/svm.h b/target/i386/svm.h
index 922c8fd39c..23a3a040b8 100644
--- a/target/i386/svm.h
+++ b/target/i386/svm.h
@@ -130,6 +130,20 @@
 
 #define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
 
+#define SVM_NPT_ENABLED (1 << 0)
+
+#define SVM_NPT_PAE  (1 << 0)
+#define SVM_NPT_LMA  (1 << 1)
+#define SVM_NPT_NXE  (1 << 2)
+
+#define SVM_NPTEXIT_P    (1ULL << 0)
+#define SVM_NPTEXIT_RW   (1ULL << 1)
+#define SVM_NPTEXIT_US   (1ULL << 2)
+#define SVM_NPTEXIT_RSVD (1ULL << 3)
+#define SVM_NPTEXIT_ID   (1ULL << 4)
+#define SVM_NPTEXIT_GPA  (1ULL << 32)
+#define SVM_NPTEXIT_GPT  (1ULL << 33)
+
 struct QEMU_PACKED vmcb_control_area {
     uint16_t intercept_cr_read;
     uint16_t intercept_cr_write;
diff --git a/target/i386/svm_helper.c b/target/i386/svm_helper.c
index f245aec310..342ece082f 100644
--- a/target/i386/svm_helper.c
+++ b/target/i386/svm_helper.c
@@ -124,6 +124,7 @@ void helper_vmrun(CPUX86State *env, int aflag, int next_eip_addend)
 {
     CPUState *cs = CPU(x86_env_get_cpu(env));
     target_ulong addr;
+    uint64_t nested_ctl;
     uint32_t event_inj;
     uint32_t int_ctl;
 
@@ -206,6 +207,26 @@ void helper_vmrun(CPUX86State *env, int aflag, int next_eip_addend)
                                                   control.intercept_exceptions
                                                   ));
 
+    nested_ctl = x86_ldq_phys(cs, env->vm_vmcb + offsetof(struct vmcb,
+                                                          control.nested_ctl));
+    if (nested_ctl & SVM_NPT_ENABLED) {
+        env->nested_cr3 = x86_ldq_phys(cs,
+                                env->vm_vmcb + offsetof(struct vmcb,
+                                                        control.nested_cr3));
+        env->hflags2 |= HF2_NPT_MASK;
+
+        env->nested_pg_mode = 0;
+        if (env->cr[4] & CR4_PAE_MASK) {
+            env->nested_pg_mode |= SVM_NPT_PAE;
+        }
+        if (env->hflags & HF_LMA_MASK) {
+            env->nested_pg_mode |= SVM_NPT_LMA;
+        }
+        if (env->efer & MSR_EFER_NXE) {
+            env->nested_pg_mode |= SVM_NPT_NXE;
+        }
+    }
+
     /* enable intercepts */
     env->hflags |= HF_SVMI_MASK;
 
@@ -616,6 +637,7 @@ void do_vmexit(CPUX86State *env, uint32_t exit_code, uint64_t exit_info_1)
         x86_stl_phys(cs,
                  env->vm_vmcb + offsetof(struct vmcb, control.int_state), 0);
     }
+    env->hflags2 &= ~HF2_NPT_MASK;
 
     /* Save the VM state in the vmcb */
     svm_save_seg(env, env->vm_vmcb + offsetof(struct vmcb, save.es),
```
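To make the new `SVM_NPTEXIT_*` encoding concrete, here is a small, self-contained sketch of how a hypervisor might decode `EXITINFO1` after an `SVM_EXIT_NPF`. The decoder itself is hypothetical; only the bit positions come from the constants added to `target/i386/svm.h` above.

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Bit positions as defined in target/i386/svm.h by this patch. */
#define SVM_NPTEXIT_P    (1ULL << 0)  /* protection fault on a present entry  */
#define SVM_NPTEXIT_RW   (1ULL << 1)  /* faulting access was a write          */
#define SVM_NPTEXIT_US   (1ULL << 2)  /* access was in user context           */
#define SVM_NPTEXIT_RSVD (1ULL << 3)  /* reserved bit set in an NPT entry     */
#define SVM_NPTEXIT_ID   (1ULL << 4)  /* access was an instruction fetch      */
#define SVM_NPTEXIT_GPA  (1ULL << 32) /* fault on a final guest-phys access   */
#define SVM_NPTEXIT_GPT  (1ULL << 33) /* fault while walking guest page tables */

/* Hypothetical decoder: exit_info_1/exit_info_2 are EXITINFO1/EXITINFO2
 * as a hypervisor would read them from its VMCB after SVM_EXIT_NPF. */
static void decode_npf(uint64_t exit_info_1, uint64_t exit_info_2)
{
    printf("#NPF at guest-physical 0x%" PRIx64 "\n", exit_info_2);
    printf("  access: %s\n",
           (exit_info_1 & SVM_NPTEXIT_ID) ? "instruction fetch" :
           (exit_info_1 & SVM_NPTEXIT_RW) ? "write" : "read");
    printf("  during: %s\n",
           (exit_info_1 & SVM_NPTEXIT_GPT) ? "guest page-table walk"
                                           : "final guest-physical access");
    if (exit_info_1 & SVM_NPTEXIT_RSVD) {
        printf("  reserved bit set in a nested page-table entry\n");
    }
}

int main(void)
{
    /* Example: the exit get_hphys() would raise for a write that hits a
     * non-writable nested PTE during the guest's own page-table walk. */
    decode_npf(SVM_NPTEXIT_P | SVM_NPTEXIT_US | SVM_NPTEXIT_RW |
               SVM_NPTEXIT_GPT, 0x1234000ULL);
    return 0;
}
```

The GPA/GPT distinction mirrors the `prot != NULL` test at the end of `get_hphys()`: a NULL `prot` means the nested fault happened while translating one of the guest's own page-table entries rather than the final access.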