diff mbox

[v2,4/9] provide in-kernel apic

Message ID 1254953315-5761-5-git-send-email-glommer@redhat.com
State Changes Requested
Headers show

Commit Message

Glauber Costa Oct. 7, 2009, 10:08 p.m. UTC
This patch provides kvm with an in-kernel apic. We are currently not enabling it.
The code is heavily based on what's in qemu-kvm.git.

Signed-off-by: Glauber Costa <glommer@redhat.com>
---
 hw/apic.c         |  135 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 kvm.h             |    3 +
 target-i386/kvm.c |   18 +++++++
 3 files changed, 152 insertions(+), 4 deletions(-)

Comments

Anthony Liguori Oct. 8, 2009, 1:55 p.m. UTC | #1
Glauber Costa wrote:
> This patch provides kvm with an in-kernel apic. We are currently not enabling it.
> The code is heavily based on what's in qemu-kvm.git.
>
> Signed-off-by: Glauber Costa <glommer@redhat.com>
> ---
>  hw/apic.c         |  135 +++++++++++++++++++++++++++++++++++++++++++++++++++--
>  kvm.h             |    3 +
>  target-i386/kvm.c |   18 +++++++
>  3 files changed, 152 insertions(+), 4 deletions(-)
>
> diff --git a/hw/apic.c b/hw/apic.c
> index c89008e..5635607 100644
> --- a/hw/apic.c
> +++ b/hw/apic.c
> @@ -299,7 +299,11 @@ void cpu_set_apic_base(CPUState *env, uint64_t val)
>  #endif
>      if (!s)
>          return;
> -    s->apicbase = (val & 0xfffff000) |
> +
> +    if (kvm_enabled() && kvm_irqchip_in_kernel())
> +        s->apicbase = val;
> +    else
> +        s->apicbase = (val & 0xfffff000) |
>          (s->apicbase & (MSR_IA32_APICBASE_BSP | MSR_IA32_APICBASE_ENABLE));
>      /* if disabled, cannot be enabled again */
>      if (!(val & MSR_IA32_APICBASE_ENABLE)) {
> @@ -497,6 +501,13 @@ void apic_init_reset(CPUState *env)
>      s->wait_for_sipi = 1;
>
>      env->halted = !(s->apicbase & MSR_IA32_APICBASE_BSP);
> +
> +#ifdef KVM_CAP_MP_STATE
> +    if (kvm_enabled() && kvm_irqchip_in_kernel())
> +        env->mp_state
> +            = env->halted ? KVM_MP_STATE_UNINITIALIZED : KVM_MP_STATE_RUNNABLE;
> +#endif
>   

I don't think CAP_MP_STATE should be treated as an optional feature.

> +static int kvm_kernel_lapic_load_from_user(APICState *s)
> +{
> +    int r = 0;
> +#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386)
> +    struct kvm_lapic_state apic;
> +    struct kvm_lapic_state *klapic = &apic;
> +    int i;
> +
> +    if (!(kvm_enabled() && kvm_irqchip_in_kernel()))
> +        return 0;
> +
> +    memset(klapic, 0, sizeof apic);
> +    kapic_set_reg(klapic, 0x2, s->id << 24);
> +    kapic_set_reg(klapic, 0x8, s->tpr);
> +    kapic_set_reg(klapic, 0xd, s->log_dest << 24);
> +    kapic_set_reg(klapic, 0xe, s->dest_mode << 28 | 0x0fffffff);
> +    kapic_set_reg(klapic, 0xf, s->spurious_vec);
> +    for (i = 0; i < 8; i++) {
> +        kapic_set_reg(klapic, 0x10 + i, s->isr[i]);
> +        kapic_set_reg(klapic, 0x18 + i, s->tmr[i]);
> +        kapic_set_reg(klapic, 0x20 + i, s->irr[i]);
> +    }
> +    kapic_set_reg(klapic, 0x28, s->esr);
> +    kapic_set_reg(klapic, 0x30, s->icr[0]);
> +    kapic_set_reg(klapic, 0x31, s->icr[1]);
> +    for (i = 0; i < APIC_LVT_NB; i++)
> +        kapic_set_reg(klapic, 0x32 + i, s->lvt[i]);
> +    kapic_set_reg(klapic, 0x38, s->initial_count);
> +    kapic_set_reg(klapic, 0x3e, s->divide_conf);
> +
> +    r = kvm_set_lapic(s->cpu_env, klapic);
> +#endif
> +    return r;
> +}
>   

You should probably just setup VMState such that it directly saves 
kvm_lapic_state and then have the pre/post functions call the kernel 
ioctls to sync it.  There's not a whole lot of point switching the state 
between two different structures.

>  static const VMStateDescription vmstate_apic = {
>      .name = "apic",
>      .version_id = 3,
>      .minimum_version_id = 3,
>      .minimum_version_id_old = 1,
>      .load_state_old = apic_load_old,
> +    .pre_save = apic_pre_save,
> +    .post_load = apic_post_load,
>      .fields      = (VMStateField []) {
>          VMSTATE_UINT32(apicbase, APICState),
>          VMSTATE_UINT8(id, APICState),
> @@ -933,9 +1052,8 @@ static const VMStateDescription vmstate_apic = {
>      }
>  };
>   

Same applies here as ioapic.  Should be a separate device.
Avi Kivity Oct. 8, 2009, 2:09 p.m. UTC | #2
On 10/08/2009 03:55 PM, Anthony Liguori wrote:
> Glauber Costa wrote:
>> This patch provides kvm with an in-kernel apic. We are currently not 
>> enabling it.
>> The code is heavily based on what's in qemu-kvm.git.
>>
>> Signed-off-by: Glauber Costa <glommer@redhat.com>
>> ---
>>  hw/apic.c         |  135 
>> +++++++++++++++++++++++++++++++++++++++++++++++++++--
>>  kvm.h             |    3 +
>>  target-i386/kvm.c |   18 +++++++
>>  3 files changed, 152 insertions(+), 4 deletions(-)
>>
>> diff --git a/hw/apic.c b/hw/apic.c
>> index c89008e..5635607 100644
>> --- a/hw/apic.c
>> +++ b/hw/apic.c
>> @@ -299,7 +299,11 @@ void cpu_set_apic_base(CPUState *env, uint64_t val)
>>  #endif
>>      if (!s)
>>          return;
>> -    s->apicbase = (val & 0xfffff000) |
>> +
>> +    if (kvm_enabled() && kvm_irqchip_in_kernel())
>> +        s->apicbase = val;
>> +    else
>> +        s->apicbase = (val & 0xfffff000) |
>>          (s->apicbase & (MSR_IA32_APICBASE_BSP | 
>> MSR_IA32_APICBASE_ENABLE));
>>      /* if disabled, cannot be enabled again */
>>      if (!(val & MSR_IA32_APICBASE_ENABLE)) {
>> @@ -497,6 +501,13 @@ void apic_init_reset(CPUState *env)
>>      s->wait_for_sipi = 1;
>>
>>      env->halted = !(s->apicbase & MSR_IA32_APICBASE_BSP);
>> +
>> +#ifdef KVM_CAP_MP_STATE
>> +    if (kvm_enabled() && kvm_irqchip_in_kernel())
>> +        env->mp_state
>> +            = env->halted ? KVM_MP_STATE_UNINITIALIZED : 
>> KVM_MP_STATE_RUNNABLE;
>> +#endif
>
> I don't think CAP_MP_STATE should be treated as an optional feature.
>
>> +static int kvm_kernel_lapic_load_from_user(APICState *s)
>> +{
>> +    int r = 0;
>> +#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386)
>> +    struct kvm_lapic_state apic;
>> +    struct kvm_lapic_state *klapic = &apic;
>> +    int i;
>> +
>> +    if (!(kvm_enabled() && kvm_irqchip_in_kernel()))
>> +        return 0;
>> +
>> +    memset(klapic, 0, sizeof apic);
>> +    kapic_set_reg(klapic, 0x2, s->id << 24);
>> +    kapic_set_reg(klapic, 0x8, s->tpr);
>> +    kapic_set_reg(klapic, 0xd, s->log_dest << 24);
>> +    kapic_set_reg(klapic, 0xe, s->dest_mode << 28 | 0x0fffffff);
>> +    kapic_set_reg(klapic, 0xf, s->spurious_vec);
>> +    for (i = 0; i < 8; i++) {
>> +        kapic_set_reg(klapic, 0x10 + i, s->isr[i]);
>> +        kapic_set_reg(klapic, 0x18 + i, s->tmr[i]);
>> +        kapic_set_reg(klapic, 0x20 + i, s->irr[i]);
>> +    }
>> +    kapic_set_reg(klapic, 0x28, s->esr);
>> +    kapic_set_reg(klapic, 0x30, s->icr[0]);
>> +    kapic_set_reg(klapic, 0x31, s->icr[1]);
>> +    for (i = 0; i < APIC_LVT_NB; i++)
>> +        kapic_set_reg(klapic, 0x32 + i, s->lvt[i]);
>> +    kapic_set_reg(klapic, 0x38, s->initial_count);
>> +    kapic_set_reg(klapic, 0x3e, s->divide_conf);
>> +
>> +    r = kvm_set_lapic(s->cpu_env, klapic);
>> +#endif
>> +    return r;
>> +}
>
> You should probably just setup VMState such that it directly saves 
> kvm_lapic_state and then have the pre/post functions call the kernel 
> ioctls to sync it.  There's not a whole lot of point switching the 
> state between two different structures.

It ensures the two models are compatible.  Since they're the same device 
from the point of view of the guest, there's no reason for them to have 
different representations or to be incompatible.
Glauber Costa Oct. 8, 2009, 2:22 p.m. UTC | #3
On Thu, Oct 08, 2009 at 04:09:27PM +0200, Avi Kivity wrote:
> On 10/08/2009 03:55 PM, Anthony Liguori wrote:
>> Glauber Costa wrote:
>>> This patch provides kvm with an in-kernel apic. We are currently not  
>>> enabling it.
>>> The code is heavily based on what's in qemu-kvm.git.
>>>
>>> Signed-off-by: Glauber Costa <glommer@redhat.com>
>>> ---
>>>  hw/apic.c         |  135  
>>> +++++++++++++++++++++++++++++++++++++++++++++++++++--
>>>  kvm.h             |    3 +
>>>  target-i386/kvm.c |   18 +++++++
>>>  3 files changed, 152 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/hw/apic.c b/hw/apic.c
>>> index c89008e..5635607 100644
>>> --- a/hw/apic.c
>>> +++ b/hw/apic.c
>>> @@ -299,7 +299,11 @@ void cpu_set_apic_base(CPUState *env, uint64_t val)
>>>  #endif
>>>      if (!s)
>>>          return;
>>> -    s->apicbase = (val & 0xfffff000) |
>>> +
>>> +    if (kvm_enabled() && kvm_irqchip_in_kernel())
>>> +        s->apicbase = val;
>>> +    else
>>> +        s->apicbase = (val & 0xfffff000) |
>>>          (s->apicbase & (MSR_IA32_APICBASE_BSP |  
>>> MSR_IA32_APICBASE_ENABLE));
>>>      /* if disabled, cannot be enabled again */
>>>      if (!(val & MSR_IA32_APICBASE_ENABLE)) {
>>> @@ -497,6 +501,13 @@ void apic_init_reset(CPUState *env)
>>>      s->wait_for_sipi = 1;
>>>
>>>      env->halted = !(s->apicbase & MSR_IA32_APICBASE_BSP);
>>> +
>>> +#ifdef KVM_CAP_MP_STATE
>>> +    if (kvm_enabled() && kvm_irqchip_in_kernel())
>>> +        env->mp_state
>>> +            = env->halted ? KVM_MP_STATE_UNINITIALIZED :  
>>> KVM_MP_STATE_RUNNABLE;
>>> +#endif
>>
>> I don't think CAP_MP_STATE should be treated as an optional feature.
>>
>>> +static int kvm_kernel_lapic_load_from_user(APICState *s)
>>> +{
>>> +    int r = 0;
>>> +#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386)
>>> +    struct kvm_lapic_state apic;
>>> +    struct kvm_lapic_state *klapic = &apic;
>>> +    int i;
>>> +
>>> +    if (!(kvm_enabled() && kvm_irqchip_in_kernel()))
>>> +        return 0;
>>> +
>>> +    memset(klapic, 0, sizeof apic);
>>> +    kapic_set_reg(klapic, 0x2, s->id << 24);
>>> +    kapic_set_reg(klapic, 0x8, s->tpr);
>>> +    kapic_set_reg(klapic, 0xd, s->log_dest << 24);
>>> +    kapic_set_reg(klapic, 0xe, s->dest_mode << 28 | 0x0fffffff);
>>> +    kapic_set_reg(klapic, 0xf, s->spurious_vec);
>>> +    for (i = 0; i < 8; i++) {
>>> +        kapic_set_reg(klapic, 0x10 + i, s->isr[i]);
>>> +        kapic_set_reg(klapic, 0x18 + i, s->tmr[i]);
>>> +        kapic_set_reg(klapic, 0x20 + i, s->irr[i]);
>>> +    }
>>> +    kapic_set_reg(klapic, 0x28, s->esr);
>>> +    kapic_set_reg(klapic, 0x30, s->icr[0]);
>>> +    kapic_set_reg(klapic, 0x31, s->icr[1]);
>>> +    for (i = 0; i < APIC_LVT_NB; i++)
>>> +        kapic_set_reg(klapic, 0x32 + i, s->lvt[i]);
>>> +    kapic_set_reg(klapic, 0x38, s->initial_count);
>>> +    kapic_set_reg(klapic, 0x3e, s->divide_conf);
>>> +
>>> +    r = kvm_set_lapic(s->cpu_env, klapic);
>>> +#endif
>>> +    return r;
>>> +}
>>
>> You should probably just setup VMState such that it directly saves  
>> kvm_lapic_state and then have the pre/post functions call the kernel  
>> ioctls to sync it.  There's not a whole lot of point switching the  
>> state between two different structures.
>
> It ensures the two models are compatible.  Since they're the same device  
> from the point of view of the guest, there's no reason for them to have  
> different representations or to be incompatible.

live migration between something that has in-kernel irqchip and something that
has not is likely to be a completely untested thing. And this is the only reason
we might think of it as the same device. I don't see any value in supporting this
combination
Anthony Liguori Oct. 8, 2009, 2:26 p.m. UTC | #4
Avi Kivity wrote:
> On 10/08/2009 03:55 PM, Anthony Liguori wrote:
>>
>> You should probably just setup VMState such that it directly saves 
>> kvm_lapic_state and then have the pre/post functions call the kernel 
>> ioctls to sync it.  There's not a whole lot of point switching the 
>> state between two different structures.
>
> It ensures the two models are compatible.  Since they're the same 
> device from the point of view of the guest, there's no reason for them 
> to have different representations or to be incompatible.

The problem is, the in-kernel apic is not part of the qemu source tree.  
If we add a field to the qemu apic, then we would break the in-kernel 
apic and vice-versa.  It's far easier to just have the in-kernel apic as 
a separate model.

Most importantly though, the code should reflect how things behave.  
It's extremely difficult to look at the apic code and figure out what 
bits are used and what bits aren't used when the in-kernel apic is enabled.

Regards,

Anthony Liguori
Avi Kivity Oct. 8, 2009, 2:31 p.m. UTC | #5
On 10/08/2009 04:26 PM, Anthony Liguori wrote:
> Avi Kivity wrote:
>> On 10/08/2009 03:55 PM, Anthony Liguori wrote:
>>>
>>> You should probably just setup VMState such that it directly saves 
>>> kvm_lapic_state and then have the pre/post functions call the kernel 
>>> ioctls to sync it.  There's not a whole lot of point switching the 
>>> state between two different structures.
>>
>> It ensures the two models are compatible.  Since they're the same 
>> device from the point of view of the guest, there's no reason for 
>> them to have different representations or to be incompatible.
>
> The problem is, the in-kernel apic is not part of the qemu source 
> tree.  If we add a field to the qemu apic, then we would break the 
> in-kernel apic and vice-versa.  It's far easier to just have the 
> in-kernel apic as a separate model.
>

You need to handle it anyway due to save/restore; that is the new field 
and whatever functionality it has must be optional.

> Most importantly though, the code should reflect how things behave.  
> It's extremely difficult to look at the apic code and figure out what 
> bits are used and what bits aren't used when the in-kernel apic is 
> enabled.

That doesn't mean they need to be separate devices.
Anthony Liguori Oct. 8, 2009, 2:39 p.m. UTC | #6
Avi Kivity wrote:
>> The problem is, the in-kernel apic is not part of the qemu source 
>> tree.  If we add a field to the qemu apic, then we would break the 
>> in-kernel apic and vice-versa.  It's far easier to just have the 
>> in-kernel apic as a separate model.
>>
>
> You need to handle it anyway due to save/restore; that is the new 
> field and whatever functionality it has must be optional.

Not necessarily.  If it's a bug fix, we may disable the older versions 
of savevm (which we have to do from time to time).

> That doesn't mean they need to be separate devices.

But they are.  The in-kernel device models are not mere accelerations.  
There are features that are only enabled with in-kernel device models 
(like PCI passthrough).  In fact, in-kernel PIT is not at all an 
acceleration, it's there because it's functionally different.

The sync stuff is really ugly too.  It would be much cleaner to have a 
separate state for the in-kernel device models that saved the structures 
from the kernel directly instead of having to translate between 
formats.  More straight forward code means it's all easier to 
understand, easier to debug, etc.

Regards,

Anthony Liguori
Glauber Costa Oct. 8, 2009, 2:44 p.m. UTC | #7
On Thu, Oct 08, 2009 at 04:31:57PM +0200, Avi Kivity wrote:
> On 10/08/2009 04:26 PM, Anthony Liguori wrote:
>> Avi Kivity wrote:
>>> On 10/08/2009 03:55 PM, Anthony Liguori wrote:
>>>>
>>>> You should probably just setup VMState such that it directly saves  
>>>> kvm_lapic_state and then have the pre/post functions call the 
>>>> kernel ioctls to sync it.  There's not a whole lot of point 
>>>> switching the state between two different structures.
>>>
>>> It ensures the two models are compatible.  Since they're the same  
>>> device from the point of view of the guest, there's no reason for  
>>> them to have different representations or to be incompatible.
>>
>> The problem is, the in-kernel apic is not part of the qemu source  
>> tree.  If we add a field to the qemu apic, then we would break the  
>> in-kernel apic and vice-versa.  It's far easier to just have the  
>> in-kernel apic as a separate model.
>>
>
> You need to handle it anyway due to save/restore; that is the new field  
> and whatever functionality it has must be optional.
Not necessarily. You can grab the structures directly from the kernel definition
, copy that over, issue the ioctl, and just make sure the source and destination
have compatible kernels.
Glauber Costa Oct. 8, 2009, 2:46 p.m. UTC | #8
On Thu, Oct 08, 2009 at 09:39:13AM -0500, Anthony Liguori wrote:
> The sync stuff is really ugly too.  It would be much cleaner to have a  
> separate state for the in-kernel device models that saved the structures  
> from the kernel directly instead of having to translate between formats.  
> More straight forward code means it's all easier to understand, easier to 
> debug, etc.
>
One may arguee that I am stupid, but I have caught myself reading code that
was totally unused in the quest for irqchip related bugs...
Jamie Lokier Oct. 9, 2009, 10:06 a.m. UTC | #9
Glauber Costa wrote:
> > It ensures the two models are compatible.  Since they're the same device  
> > from the point of view of the guest, there's no reason for them to have  
> > different representations or to be incompatible.
> 
> live migration between something that has in-kernel irqchip and
> something that has not is likely to be a completely untested
> thing. And this is the only reason we might think of it as the same
> device. I don't see any value in supporting this combination

Not just live migration.  ACPI sleep + savevm + loadvm + ACPI resume,
for example.

-- Jamie
Glauber Costa Oct. 9, 2009, 2:30 p.m. UTC | #10
On Fri, Oct 09, 2009 at 11:06:41AM +0100, Jamie Lokier wrote:
> Glauber Costa wrote:
> > > It ensures the two models are compatible.  Since they're the same device  
> > > from the point of view of the guest, there's no reason for them to have  
> > > different representations or to be incompatible.
> > 
> > live migration between something that has in-kernel irqchip and
> > something that has not is likely to be a completely untested
> > thing. And this is the only reason we might think of it as the same
> > device. I don't see any value in supporting this combination
> 
> Not just live migration.  ACPI sleep + savevm + loadvm + ACPI resume,
> for example.
Yes, but in this case too, I'd expect the irqchipness of qemu not to change.
Jamie Lokier Oct. 9, 2009, 4:48 p.m. UTC | #11
Glauber Costa wrote:
> On Fri, Oct 09, 2009 at 11:06:41AM +0100, Jamie Lokier wrote:
> > Glauber Costa wrote:
> > > > It ensures the two models are compatible.  Since they're the same device  
> > > > from the point of view of the guest, there's no reason for them to have  
> > > > different representations or to be incompatible.
> > > 
> > > live migration between something that has in-kernel irqchip and
> > > something that has not is likely to be a completely untested
> > > thing. And this is the only reason we might think of it as the same
> > > device. I don't see any value in supporting this combination
> > 
> > Not just live migration.  ACPI sleep + savevm + loadvm + ACPI resume,
> > for example.
> Yes, but in this case too, I'd expect the irqchipness of qemu not to change.

If I've just been sent an image produced by someone running KVM, and
my machine is not KVM-capable, or I cannot upgrade the KVM kernel
module because it's in use by other VMs (had this problem a few
times), there's no choice but to change the irqchipness.

-- Jamie
Glauber Costa Oct. 9, 2009, 6:06 p.m. UTC | #12
On Fri, Oct 09, 2009 at 05:48:31PM +0100, Jamie Lokier wrote:
> Glauber Costa wrote:
> > On Fri, Oct 09, 2009 at 11:06:41AM +0100, Jamie Lokier wrote:
> > > Glauber Costa wrote:
> > > > > It ensures the two models are compatible.  Since they're the same device  
> > > > > from the point of view of the guest, there's no reason for them to have  
> > > > > different representations or to be incompatible.
> > > > 
> > > > live migration between something that has in-kernel irqchip and
> > > > something that has not is likely to be a completely untested
> > > > thing. And this is the only reason we might think of it as the same
> > > > device. I don't see any value in supporting this combination
> > > 
> > > Not just live migration.  ACPI sleep + savevm + loadvm + ACPI resume,
> > > for example.
> > Yes, but in this case too, I'd expect the irqchipness of qemu not to change.
> 
> If I've just been sent an image produced by someone running KVM, and
> my machine is not KVM-capable, or I cannot upgrade the KVM kernel
> module because it's in use by other VMs (had this problem a few
> times), there's no choice but to change the irqchipness.
As gleb mentioned, requiring such a change to happen offline (across a reboot)
is not that much of a pain.

There are thousands of scenarios in which it will have to happen anyway,
including major bumps in qemu version.
Anthony Liguori Oct. 9, 2009, 7:49 p.m. UTC | #13
Jamie Lokier wrote:
> Glauber Costa wrote:
>   
>> On Fri, Oct 09, 2009 at 11:06:41AM +0100, Jamie Lokier wrote:
>>     
>>> Glauber Costa wrote:
>>>       
>>>>> It ensures the two models are compatible.  Since they're the same device  
>>>>> from the point of view of the guest, there's no reason for them to have  
>>>>> different representations or to be incompatible.
>>>>>           
>>>> live migration between something that has in-kernel irqchip and
>>>> something that has not is likely to be a completely untested
>>>> thing. And this is the only reason we might think of it as the same
>>>> device. I don't see any value in supporting this combination
>>>>         
>>> Not just live migration.  ACPI sleep + savevm + loadvm + ACPI resume,
>>> for example.
>>>       
>> Yes, but in this case too, I'd expect the irqchipness of qemu not to change.
>>     
>
> If I've just been sent an image produced by someone running KVM, and
> my machine is not KVM-capable, or I cannot upgrade the KVM kernel
> module because it's in use by other VMs (had this problem a few
> times), there's no choice but to change the irqchipness.
>   

You cannot migrate from KVM to TCG so this use-case is irrelevant.
Avi Kivity Oct. 11, 2009, 9:10 a.m. UTC | #14
On 10/09/2009 09:49 PM, Anthony Liguori wrote:
>> If I've just been sent an image produced by someone running KVM, and
>> my machine is not KVM-capable, or I cannot upgrade the KVM kernel
>> module because it's in use by other VMs (had this problem a few
>> times), there's no choice but to change the irqchipness.
>
>
> You cannot migrate from KVM to TCG so this use-case is irrelevant.

I agree it's mostly irrelevant, however nothing in principle prevents 
such a migration, as long as the cpuid feature bits are implemented on 
both sides.
Anthony Liguori Oct. 12, 2009, 1:41 p.m. UTC | #15
Avi Kivity wrote:
> On 10/09/2009 09:49 PM, Anthony Liguori wrote:
>>> If I've just been sent an image produced by someone running KVM, and
>>> my machine is not KVM-capable, or I cannot upgrade the KVM kernel
>>> module because it's in use by other VMs (had this problem a few
>>> times), there's no choice but to change the irqchipness.
>>
>>
>> You cannot migrate from KVM to TCG so this use-case is irrelevant.
>
> I agree it's mostly irrelevant, however nothing in principle prevents 
> such a migration, as long as the cpuid feature bits are implemented on 
> both sides.

Sure, in principle it's certainly possible but in practice, it isn't 
today and I don't see anything that's likely to happen in the near term 
future that would make it.
diff mbox

Patch

diff --git a/hw/apic.c b/hw/apic.c
index c89008e..5635607 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -299,7 +299,11 @@  void cpu_set_apic_base(CPUState *env, uint64_t val)
 #endif
     if (!s)
         return;
-    s->apicbase = (val & 0xfffff000) |
+
+    if (kvm_enabled() && kvm_irqchip_in_kernel())
+        s->apicbase = val;
+    else
+        s->apicbase = (val & 0xfffff000) |
         (s->apicbase & (MSR_IA32_APICBASE_BSP | MSR_IA32_APICBASE_ENABLE));
     /* if disabled, cannot be enabled again */
     if (!(val & MSR_IA32_APICBASE_ENABLE)) {
@@ -497,6 +501,13 @@  void apic_init_reset(CPUState *env)
     s->wait_for_sipi = 1;
 
     env->halted = !(s->apicbase & MSR_IA32_APICBASE_BSP);
+
+#ifdef KVM_CAP_MP_STATE
+    if (kvm_enabled() && kvm_irqchip_in_kernel())
+        env->mp_state
+            = env->halted ? KVM_MP_STATE_UNINITIALIZED : KVM_MP_STATE_RUNNABLE;
+#endif
+
 }
 
 static void apic_startup(APICState *s, int vector_num)
@@ -903,12 +914,120 @@  static int apic_load_old(QEMUFile *f, void *opaque, int version_id)
     return 0;
 }
 
+#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386)
+static inline uint32_t kapic_reg(struct kvm_lapic_state *kapic, int reg_id)
+{
+    return *((uint32_t *) (kapic->regs + (reg_id << 4)));
+}
+
+static inline void kapic_set_reg(struct kvm_lapic_state *kapic,
+                                 int reg_id, uint32_t val)
+{
+    *((uint32_t *) (kapic->regs + (reg_id << 4))) = val;
+}
+#endif
+
+static int kvm_kernel_lapic_load_from_user(APICState *s)
+{
+    int r = 0;
+#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386)
+    struct kvm_lapic_state apic;
+    struct kvm_lapic_state *klapic = &apic;
+    int i;
+
+    if (!(kvm_enabled() && kvm_irqchip_in_kernel()))
+        return 0;
+
+    memset(klapic, 0, sizeof apic);
+    kapic_set_reg(klapic, 0x2, s->id << 24);
+    kapic_set_reg(klapic, 0x8, s->tpr);
+    kapic_set_reg(klapic, 0xd, s->log_dest << 24);
+    kapic_set_reg(klapic, 0xe, s->dest_mode << 28 | 0x0fffffff);
+    kapic_set_reg(klapic, 0xf, s->spurious_vec);
+    for (i = 0; i < 8; i++) {
+        kapic_set_reg(klapic, 0x10 + i, s->isr[i]);
+        kapic_set_reg(klapic, 0x18 + i, s->tmr[i]);
+        kapic_set_reg(klapic, 0x20 + i, s->irr[i]);
+    }
+    kapic_set_reg(klapic, 0x28, s->esr);
+    kapic_set_reg(klapic, 0x30, s->icr[0]);
+    kapic_set_reg(klapic, 0x31, s->icr[1]);
+    for (i = 0; i < APIC_LVT_NB; i++)
+        kapic_set_reg(klapic, 0x32 + i, s->lvt[i]);
+    kapic_set_reg(klapic, 0x38, s->initial_count);
+    kapic_set_reg(klapic, 0x3e, s->divide_conf);
+
+    r = kvm_set_lapic(s->cpu_env, klapic);
+#endif
+    return r;
+}
+
+static void kvm_kernel_lapic_save_to_user(APICState *s)
+{
+#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386)
+    struct kvm_lapic_state apic;
+    struct kvm_lapic_state *kapic = &apic;
+    int i, v;
+
+    if (!(kvm_enabled() && kvm_irqchip_in_kernel()))
+        return;
+
+    kvm_get_lapic(s->cpu_env, kapic);
+
+    s->id = kapic_reg(kapic, 0x2) >> 24;
+    s->tpr = kapic_reg(kapic, 0x8);
+    s->arb_id = kapic_reg(kapic, 0x9);
+    s->log_dest = kapic_reg(kapic, 0xd) >> 24;
+    s->dest_mode = kapic_reg(kapic, 0xe) >> 28;
+    s->spurious_vec = kapic_reg(kapic, 0xf);
+    for (i = 0; i < 8; i++) {
+        s->isr[i] = kapic_reg(kapic, 0x10 + i);
+        s->tmr[i] = kapic_reg(kapic, 0x18 + i);
+        s->irr[i] = kapic_reg(kapic, 0x20 + i);
+    }
+    s->esr = kapic_reg(kapic, 0x28);
+    s->icr[0] = kapic_reg(kapic, 0x30);
+    s->icr[1] = kapic_reg(kapic, 0x31);
+    for (i = 0; i < APIC_LVT_NB; i++)
+    s->lvt[i] = kapic_reg(kapic, 0x32 + i);
+    s->initial_count = kapic_reg(kapic, 0x38);
+    s->divide_conf = kapic_reg(kapic, 0x3e);
+
+    v = (s->divide_conf & 3) | ((s->divide_conf >> 1) & 4);
+    s->count_shift = (v + 1) & 7;
+
+    s->initial_count_load_time = qemu_get_clock(vm_clock);
+    apic_timer_update(s, s->initial_count_load_time);
+#endif
+}
+
+static void qemu_kvm_load_lapic(CPUState *env)
+{
+    kvm_kernel_lapic_load_from_user(env->apic_state);
+}
+
+static void apic_pre_save(void *opaque)
+{
+    APICState *s = (void *)opaque;
+
+    kvm_kernel_lapic_save_to_user(s);
+}
+
+static int apic_post_load(void *opaque, int version_id)
+{
+    APICState *s = opaque;
+
+    return kvm_kernel_lapic_load_from_user(s);
+}
+
 static const VMStateDescription vmstate_apic = {
     .name = "apic",
     .version_id = 3,
     .minimum_version_id = 3,
     .minimum_version_id_old = 1,
     .load_state_old = apic_load_old,
+    .pre_save = apic_pre_save,
+    .post_load = apic_post_load,
     .fields      = (VMStateField []) {
         VMSTATE_UINT32(apicbase, APICState),
         VMSTATE_UINT8(id, APICState),
@@ -933,9 +1052,8 @@  static const VMStateDescription vmstate_apic = {
     }
 };
 
-static void apic_reset(void *opaque)
+static void apic_do_reset(APICState *s)
 {
-    APICState *s = opaque;
     int bsp;
 
     cpu_synchronize_state(s->cpu_env);
@@ -957,6 +1075,15 @@  static void apic_reset(void *opaque)
     }
 }
 
+static void apic_reset(void *opaque)
+{
+    APICState *s = opaque;
+
+    apic_do_reset(s);
+
+    qemu_kvm_load_lapic(s->cpu_env);
+}
+
 static CPUReadMemoryFunc * const apic_mem_read[3] = {
     apic_mem_readb,
     apic_mem_readw,
@@ -981,7 +1108,7 @@  int apic_init(CPUState *env)
     s->id = env->cpuid_apic_id;
     s->cpu_env = env;
 
-    apic_reset(s);
+    apic_do_reset(s);
     msix_supported = 1;
 
     /* XXX: mapping more APICs at the same memory location */
diff --git a/kvm.h b/kvm.h
index 8d4afa0..099b55e 100644
--- a/kvm.h
+++ b/kvm.h
@@ -97,6 +97,9 @@  int kvm_arch_init(KVMState *s, int smp_cpus);
 
 int kvm_arch_init_vcpu(CPUState *env);
 
+int kvm_set_lapic(CPUState *env, struct kvm_lapic_state *s);
+int kvm_get_lapic(CPUState *env, struct kvm_lapic_state *s);
+
 struct kvm_guest_debug;
 struct kvm_debug_exit_arch;
 
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 7010999..d485d4c 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -957,3 +957,21 @@  void kvm_arch_update_guest_debug(CPUState *env, struct kvm_guest_debug *dbg)
     }
 }
 #endif /* KVM_CAP_SET_GUEST_DEBUG */
+
+#ifdef KVM_CAP_IRQCHIP
+int kvm_set_lapic(CPUState *env, struct kvm_lapic_state *s)
+{
+    if (!kvm_irqchip_in_kernel())
+        return 0;
+
+    return kvm_vcpu_ioctl(env, KVM_SET_LAPIC, s);
+}
+
+int kvm_get_lapic(CPUState *env, struct kvm_lapic_state *s)
+{
+    if (!kvm_irqchip_in_kernel())
+        return 0;
+
+    return kvm_vcpu_ioctl(env, KVM_GET_LAPIC, s);
+}
+#endif