mem/x86: add processor address space check for VM memory

Message ID 20230908095024.270946-1-anisinha@redhat.com
State New
Series mem/x86: add processor address space check for VM memory

Commit Message

Ani Sinha Sept. 8, 2023, 9:50 a.m. UTC
Depending on the number of available address bits of the current processor, a
VM can only use a certain maximum amount of memory and no more. This change
makes sure that a VM is not configured to have more memory than what it can use
with the current processor settings when started. Additionally, the change adds
checks during memory hotplug to ensure that the VM does not end up getting more
memory than what it can actually use after hotplug.
Currently, both of the above checks apply only to the pc (x86) platform.

Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1235403
CC: imammedo@redhat.com
Signed-off-by: Ani Sinha <anisinha@redhat.com>
---
 hw/i386/pc.c           | 45 ++++++++++++++++++++++++++++++++++++++++++
 hw/mem/memory-device.c |  6 ++++++
 include/hw/boards.h    |  9 +++++++++
 3 files changed, 60 insertions(+)

Comments

David Hildenbrand Sept. 8, 2023, 10:28 a.m. UTC | #1
On 08.09.23 11:50, Ani Sinha wrote:
> Depending on the number of available address bits of the current processor, a
> VM can only use a certain maximum amount of memory and no more. This change
> makes sure that a VM is not configured to have more memory than what it can use
> with the current processor settings when started. Additionally, the change adds
> checks during memory hotplug to ensure that the VM does not end up getting more
> memory than what it can actually use after hotplug.
> Currently, both of the above checks apply only to the pc (x86) platform.
> 
> Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1235403
> CC: imammedo@redhat.com
> Signed-off-by: Ani Sinha <anisinha@redhat.com>
> ---
>   hw/i386/pc.c           | 45 ++++++++++++++++++++++++++++++++++++++++++
>   hw/mem/memory-device.c |  6 ++++++
>   include/hw/boards.h    |  9 +++++++++
>   3 files changed, 60 insertions(+)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 54838c0c41..f84e4c4916 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -31,6 +31,7 @@
>   #include "hw/i386/topology.h"
>   #include "hw/i386/fw_cfg.h"
>   #include "hw/i386/vmport.h"
> +#include "hw/mem/memory-device.h"
>   #include "sysemu/cpus.h"
>   #include "hw/block/fdc.h"
>   #include "hw/ide/internal.h"
> @@ -1006,6 +1007,17 @@ void pc_memory_init(PCMachineState *pcms,
>           exit(EXIT_FAILURE);
>       }
>   
> +    /*
> +     * check if the VM started with more ram configured than max physical
> +     * address available with the current processor.
> +     */
> +    if (machine->ram_size > maxphysaddr + 1) {
> +        error_report("Address space limit 0x%"PRIx64" < 0x%"PRIx64
> +                     " (max configured memory), phys-bits too low (%u)",
> +                     maxphysaddr, machine->ram_size, cpu->phys_bits);
> +        exit(EXIT_FAILURE);
> +    }

... I know that this used to be a problem in the past, but nowadays we 
already do have similar checks in place?

$ ./build/qemu-system-x86_64 -m 4T -machine q35,memory-backend=mem0 -object memory-backend-ram,id=mem0,size=4T,reserve=off
qemu-system-x86_64: Address space limit 0xffffffffff < 0x5077fffffff phys-bits too low (40)

Why is that not sufficient or why can't that be extended?

> +
>       /*
>        * Split single memory region and use aliases to address portions of it,
>        * done for backwards compatibility with older qemus.
> @@ -1845,6 +1857,38 @@ static bool pc_hotplug_allowed(MachineState *ms, DeviceState *dev, Error **errp)
>       return true;
>   }
>   
> +static bool pc_mem_hotplug_allowed(MachineState *ms,
> +                                   MemoryRegion *mr, Error **errp)
> +{
> +    hwaddr maxphysaddr;
> +    uint64_t dimm_size, size, ram_size, total_mem_size;
> +    X86CPU *cpu = X86_CPU(first_cpu);
> +
> +    if (!mr) {
> +        return true;
> +    }
> +
> +    dimm_size = ms->device_memory->dimm_size;
> +    size = memory_region_size(mr);
> +    ram_size = (uint64_t) ms->ram_size;
> +    total_mem_size = ram_size + dimm_size + size;

That's wrong. The sizes do not tell you where the devices are actually 
located in the address space.
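
For illustration only (a rough sketch; "addr" stands for the base address 
the plug path actually assigns, see memory_device_get_free_addr() in 
hw/mem/memory-device.c): what has to be compared against the processor 
limit is the end address of the region the DIMM really gets, not a sum of 
sizes, because devices can be placed at aligned addresses with gaps in 
between:

    hwaddr dimm_end = addr + memory_region_size(mr) - 1;
    if (dimm_end > maxphysaddr) {
        /* part of the DIMM would not be addressable by the CPU */
    }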

> +
> +    maxphysaddr = ((hwaddr)1 << cpu->phys_bits) - 1;
> +
> +    /*
> +     * total memory after hotplug will exceed the maximum physical
> +     * address limit of the processor. So hotplug cannot be allowed.
> +     */
> +    if ((total_mem_size > (uint64_t)maxphysaddr + 1) &&
> +        (total_mem_size > ram_size + dimm_size)) {
> +        error_setg(errp, "Address space limit 0x%"PRIx64" < 0x%"PRIx64
> +                   " phys-bits too low (%u)",
> +                   maxphysaddr, total_mem_size, cpu->phys_bits);
> +        return false;
> +    }
> +    return true;
> +}
> +
>   static void pc_machine_class_init(ObjectClass *oc, void *data)
>   {
>       MachineClass *mc = MACHINE_CLASS(oc);
> @@ -1870,6 +1914,7 @@ static void pc_machine_class_init(ObjectClass *oc, void *data)
>       assert(!mc->get_hotplug_handler);
>       mc->get_hotplug_handler = pc_get_hotplug_handler;
>       mc->hotplug_allowed = pc_hotplug_allowed;
> +    mc->mem_hotplug_allowed = pc_mem_hotplug_allowed;
>       mc->cpu_index_to_instance_props = x86_cpu_index_to_props;
>       mc->get_default_cpu_node_id = x86_get_default_cpu_node_id;
>       mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids;
> diff --git a/hw/mem/memory-device.c b/hw/mem/memory-device.c
> index 667d56bd29..825bc593ae 100644
> --- a/hw/mem/memory-device.c
> +++ b/hw/mem/memory-device.c
> @@ -57,6 +57,7 @@ static void memory_device_check_addable(MachineState *ms, MemoryRegion *mr,
>   {
>       const uint64_t used_region_size = ms->device_memory->used_region_size;
>       const uint64_t size = memory_region_size(mr);
> +    MachineClass *mc = MACHINE_GET_CLASS(ms);
>   
>       /* we will need a new memory slot for kvm and vhost */
>       if (kvm_enabled() && !kvm_has_free_slot(ms)) {
> @@ -68,6 +69,11 @@ static void memory_device_check_addable(MachineState *ms, MemoryRegion *mr,
>           return;
>       }
>   
> +    if (mc->mem_hotplug_allowed &&
> +        (!(mc->mem_hotplug_allowed(ms, mr, errp)))) {
> +        return;
> +    }
> +
>       /* will we exceed the total amount of memory specified */
>       if (used_region_size + size < used_region_size ||
>           used_region_size + size > ms->maxram_size - ms->ram_size) {
> diff --git a/include/hw/boards.h b/include/hw/boards.h
> index 3b541ffd24..84b199ee51 100644
> --- a/include/hw/boards.h
> +++ b/include/hw/boards.h
> @@ -210,6 +210,13 @@ typedef struct {
>    *    false is returned, an error must be set to show the reason of
>    *    the rejection.  If the hook is not provided, all hotplug will be
>    *    allowed.
> + * @mem_hotplug_allowed:
> + *    If the hook is provided, then it'll be called for each memory device
> + *    hotplug to check whether the mem device hotplug is allowed.  Return
> + *    true to grant allowance or false to reject the hotplug.  When
> + *    false is returned, an error must be set to show the reason of
> + *    the rejection.  If the hook is not provided, all mem hotplug will be
> + *    allowed.

That's nasty.

1) The machine hotplug handler already is in charge of plugging such 
devices. It could perform such checks there but,

2) Why even allow the device memory region to exceed maxphysaddr?


Instead, we should probably fail creating the device managed region if 
it would end up exceeding maxphysaddr.

pc_memory_init()-> ... -> machine_memory_devices_init()

Can't we make sure in pc_memory_init() that we can never have memory 
devices being plugged into inaccessible regions? Or check back later 
once we know the limit (if not already known)?
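
Something along these lines, as a rough sketch (device_mem_base and 
device_mem_size are placeholder names for whatever pc_memory_init() has 
at hand at that point):

    /* reject an inaccessible device-memory area at startup, not at plug time */
    hwaddr dm_end = device_mem_base + device_mem_size - 1;
    if (dm_end > maxphysaddr) {
        error_report("device memory region end 0x%" PRIx64 " exceeds "
                     "address space limit 0x%" PRIx64 ", phys-bits too low",
                     dm_end, maxphysaddr);
        exit(EXIT_FAILURE);
    }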
Ani Sinha Sept. 8, 2023, 2:12 p.m. UTC | #2
> On 08-Sep-2023, at 3:58 PM, David Hildenbrand <david@redhat.com> wrote:
> 
> On 08.09.23 11:50, Ani Sinha wrote:
>> [...]
>> @@ -1006,6 +1007,17 @@ void pc_memory_init(PCMachineState *pcms,
>>          exit(EXIT_FAILURE);
>>      }
>>  +    /*
>> +     * check if the VM started with more ram configured than max physical
>> +     * address available with the current processor.
>> +     */
>> +    if (machine->ram_size > maxphysaddr + 1) {
>> +        error_report("Address space limit 0x%"PRIx64" < 0x%"PRIx64
>> +                     " (max configured memory), phys-bits too low (%u)",
>> +                     maxphysaddr, machine->ram_size, cpu->phys_bits);
>> +        exit(EXIT_FAILURE);
>> +    }
> 
> ... I know that this used to be a problem in the past, but nowadays we already do have similar checks in place?
> 
> $ ./build/qemu-system-x86_64 -m 4T -machine q35,memory-backend=mem0 -object memory-backend-ram,id=mem0,size=4T,reserve=off
> qemu-system-x86_64: Address space limit 0xffffffffff < 0x5077fffffff phys-bits too low (40)

So you are saying that this is OK and should be allowed? On a 32 bit processor that can access only 4G memory, I am spinning up a 10G VM.

$ ./qemu-system-x86_64 -cpu pentium -m size=10G, -monitor stdio -qmp tcp:0:5555,server,nowait
QEMU 8.1.50 monitor - type 'help' for more information
VNC server running on ::1:5900
(qemu) 



> [...]
David Hildenbrand Sept. 8, 2023, 2:16 p.m. UTC | #3
On 08.09.23 16:12, Ani Sinha wrote:
> 
> 
>> On 08-Sep-2023, at 3:58 PM, David Hildenbrand <david@redhat.com> wrote:
>>
>> On 08.09.23 11:50, Ani Sinha wrote:
>>> [...]
>>
>> ... I know that this used to be a problem in the past, but nowadays we already do have similar checks in place?
>>
>> $ ./build/qemu-system-x86_64 -m 4T -machine q35,memory-backend=mem0 -object memory-backend-ram,id=mem0,size=4T,reserve=off
>> qemu-system-x86_64: Address space limit 0xffffffffff < 0x5077fffffff phys-bits too low (40)
> 
> So you are saying that this is OK and should be allowed? On a 32 bit processor that can access only 4G memory, I am spinning up a 10G VM.

Would that 32-bit processor have PAE (Physical Address Extension) and still 
be able to access that memory?
Ani Sinha Sept. 8, 2023, 3:13 p.m. UTC | #4
> On 08-Sep-2023, at 7:46 PM, David Hildenbrand <david@redhat.com> wrote:
> 
> On 08.09.23 16:12, Ani Sinha wrote:
>>> On 08-Sep-2023, at 3:58 PM, David Hildenbrand <david@redhat.com> wrote:
>>> 
>>> On 08.09.23 11:50, Ani Sinha wrote:
>>>> [...]
>>> 
>>> ... I know that this used to be a problem in the past, but nowadays we already do have similar checks in place?
>>> 
>>> $ ./build/qemu-system-x86_64 -m 4T -machine q35,memory-backend=mem0 -object memory-backend-ram,id=mem0,size=4T,reserve=off
>>> qemu-system-x86_64: Address space limit 0xffffffffff < 0x5077fffffff phys-bits too low (40)
>> So you are saying that this is OK and should be allowed? On a 32 bit processor that can access only 4G memory, I am spinning up a 10G VM.
> 
> Would that 32-bit processor have PAE (Physical Address Extension) and still be able to access that memory?


You are sidestepping my point. Sure, we can improve the condition check by checking for the PAE CPUID bit etc., but that is not the issue I am trying to point out. What if the processor did not have PAE? Would we allow a VM to have a memory size which the processor can’t access? There is no such check today, it would seem.
David Hildenbrand Sept. 8, 2023, 4:02 p.m. UTC | #5
On 08.09.23 17:13, Ani Sinha wrote:
> 
> 
>> On 08-Sep-2023, at 7:46 PM, David Hildenbrand <david@redhat.com> wrote:
>>
>> On 08.09.23 16:12, Ani Sinha wrote:
>>>> On 08-Sep-2023, at 3:58 PM, David Hildenbrand <david@redhat.com> wrote:
>>>>
>>>> On 08.09.23 11:50, Ani Sinha wrote:
>>>>> [...]
>>>>
>>>> ... I know that this used to be a problem in the past, but nowadays we already do have similar checks in place?
>>>>
>>>> $ ./build/qemu-system-x86_64 -m 4T -machine q35,memory-backend=mem0 -object memory-backend-ram,id=mem0,size=4T,reserve=off
>>>> qemu-system-x86_64: Address space limit 0xffffffffff < 0x5077fffffff phys-bits too low (40)
>>> So you are saying that this is OK and should be allowed? On a 32 bit processor that can access only 4G memory, I am spinning up a 10G VM.
>>
>> Would that 32-bit processor have PAE (Physical Address Extension) and still be able to access that memory?
> 
> 
> You are sidestepping my point. Sure, we can improve the condition check by checking for the PAE CPUID bit etc., but that is not the issue I am trying to point out. What if the processor did not have PAE? Would we allow a VM to have a memory size which the processor can’t access? There is no such check today, it would seem.
> 

Indeed, because the implementation for 32bit in pc_max_used_gpa() is wrong.

Note that for 64bit it does the right thing, even with memory hotplug, 
because the PCI64 hole is placed above the memory device region.

So I think we should tackle that via pc_max_used_gpa().

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 54838c0c41..d187890675 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -908,9 +908,12 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size)
  {
      X86CPU *cpu = X86_CPU(first_cpu);

-    /* 32-bit systems don't have hole64 thus return max CPU address */
-    if (cpu->phys_bits <= 32) {
-        return ((hwaddr)1 << cpu->phys_bits) - 1;
+    /*
+     * 32-bit systems don't have hole64, but we might have a region for
+     * memory hotplug.
+     */
+    if (!(cpu->env.features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM)) {
+        return pc_pci_hole64_start() - 1;
      }

      return pc_pci_hole64_start() + pci_hole64_size - 1;


That implies:

./build/qemu-system-x86_64 -cpu pentium -m size=4G -nodefaults -nographic
qemu-system-x86_64: Address space limit 0xffffffff < 0x13fffffff phys-bits too low (32)

As we have memory over 4G (due to PCI hole), that would now correctly fail.

However, what works is:

./build/qemu-system-x86_64 -cpu pentium -m size=3G -nodefaults -nographic


Weirdly enough, when setting cpu->phys_bits, we take care of PSE36 and 
allow for 36bits in the address space.


So what works:

./build/qemu-system-x86_64 -cpu pentium,pse36=on -m size=32G -nodefaults -nographic

And what doesn't:

./build/qemu-system-x86_64 -cpu pentium,pse36=on -m size=64G -nodefaults -nographic -S
qemu-system-x86_64: Address space limit 0xfffffffff < 0x103fffffff phys-bits too low (36)


However, we don't seem to have such handling in place for PAE (do we 
have to extend that handling in x86_cpu_realizefn()?). Maybe pae should 
always imply pse36, not sure ...
David Hildenbrand Sept. 8, 2023, 4:04 p.m. UTC | #6
On 08.09.23 18:02, David Hildenbrand wrote:
> On 08.09.23 17:13, Ani Sinha wrote:
> [...]
>
> However, we don't seem to have such handling in place for PAE (do we
> have to extend that handling in x86_cpu_realizefn()?). Maybe pae should
> always imply pse36, not sure ...
> 

Reading trustworthy wikipedia:

"Physical Address Extension (PAE) is an alternative to PSE-36 which also 
allows 36-bit addressing."

So maybe we have to consider PAE as well.
Philippe Mathieu-Daudé Sept. 8, 2023, 4:04 p.m. UTC | #7
On 8/9/23 17:13, Ani Sinha wrote:
> 
> 
>> On 08-Sep-2023, at 7:46 PM, David Hildenbrand <david@redhat.com> wrote:
>>
>> On 08.09.23 16:12, Ani Sinha wrote:
>>>> On 08-Sep-2023, at 3:58 PM, David Hildenbrand <david@redhat.com> wrote:
>>>>
>>>> On 08.09.23 11:50, Ani Sinha wrote:
>>>>> [...]
>>>>
>>>> ... I know that this used to be a problem in the past, but nowadays we already do have similar checks in place?
>>>>
>>>> $ ./build/qemu-system-x86_64 -m 4T -machine q35,memory-backend=mem0 -object memory-backend-ram,id=mem0,size=4T,reserve=off
>>>> qemu-system-x86_64: Address space limit 0xffffffffff < 0x5077fffffff phys-bits too low (40)
>>> So you are saying that this is OK and should be allowed? On a 32 bit processor that can access only 4G memory, I am spinning up a 10G VM.
>>
>> Would that 32-bit processor have PAE (Physical Address Extension) and still be able to access that memory?
> 
> 
> You are sidestepping my point. Sure, we can improve the condition check by checking for the PAE CPUID bit etc., but that is not the issue I am trying to point out. What if the processor did not have PAE? Would we allow a VM to have a memory size which the processor can’t access? There is no such check today, it would seem.

Which processor, the host or the guest?

Even if the guest CPU can't access all the VM memory, this memory can be
used by devices without interaction with the CPU (e.g. PCI devices doing DMA).
Ani Sinha Sept. 12, 2023, 10:41 a.m. UTC | #8
> On 08-Sep-2023, at 9:32 PM, David Hildenbrand <david@redhat.com> wrote:
> 
> On 08.09.23 17:13, Ani Sinha wrote:
>>> On 08-Sep-2023, at 7:46 PM, David Hildenbrand <david@redhat.com> wrote:
>>> 
>>> On 08.09.23 16:12, Ani Sinha wrote:
>>>>> On 08-Sep-2023, at 3:58 PM, David Hildenbrand <david@redhat.com> wrote:
>>>>> 
>>>>> On 08.09.23 11:50, Ani Sinha wrote:
>>>>>> [...]
>>>>> 
>>>>> ... I know that this used to be a problem in the past, but nowadays we already do have similar checks in place?
>>>>> 
>>>>> $ ./build/qemu-system-x86_64 -m 4T -machine q35,memory-backend=mem0 -object memory-backend-ram,id=mem0,size=4T,reserve=off
>>>>> qemu-system-x86_64: Address space limit 0xffffffffff < 0x5077fffffff phys-bits too low (40)
>>>> So you are saying that this is OK and should be allowed? On a 32 bit processor that can access only 4G memory, I am spinning up a 10G VM.
>>> 
>>> Would that 32-bit processor have PAE (Physical Address Extension) and still be able to access that memory?
>> You are sidestepping my point. Sure, we can improve the condition check by checking for the PAE CPUID bit etc., but that is not the issue I am trying to point out. What if the processor did not have PAE? Would we allow a VM to have a memory size which the processor can’t access? There is no such check today, it would seem.
> 
> Indeed, because the implementation for 32bit in pc_max_used_gpa() is wrong.
> 
> Note that for 64bit it does the right thing, even with memory hotplug, because the PCI64 hole is placed above the memory device region.
> 
> So I think we should tackle that via pc_max_used_gpa().
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 54838c0c41..d187890675 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -908,9 +908,12 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size)
> {
>     X86CPU *cpu = X86_CPU(first_cpu);
> 
> -    /* 32-bit systems don't have hole64 thus return max CPU address */
> -    if (cpu->phys_bits <= 32) {
> -        return ((hwaddr)1 << cpu->phys_bits) - 1;
> +    /*
> +     * 32-bit systems don't have hole64, but we might have a region for
> +     * memory hotplug.
> +     */
> +    if (!(cpu->env.features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM)) {
> +        return pc_pci_hole64_start() - 1;

Ok, this is very confusing! I am looking at the pc_pci_hole64_start() function. I have a few questions …
(a) pc_get_device_memory_range() returns the size of the device memory as the difference between ram_size and maxram_size. But from what I understand, ram_size is the actual size of the ram present and maxram_size is the max size of ram *after* hot plugging additional memory. How can we assume that the additional available space is already occupied by hot plugged memory?
(b) Another question is, in pc_pci_hole64_start(), why are we adding this size to the start address?

    } else if (pcmc->has_reserved_memory && (ms->ram_size < ms->maxram_size)) {
        pc_get_device_memory_range(pcms, &hole64_start, &size);
        if (!pcmc->broken_reserved_end) {
            hole64_start += size;

I think this is trying to put the hole after the device memory. But if the ram size is <=maxram_size then the hole is after the above_4G memory? Why?

(c) In your above change, what does long mode have to do with all of this?

>     }
> 
>     return pc_pci_hole64_start() + pci_hole64_size - 1;
> 
> 
> That implies:
> 
> ./build/qemu-system-x86_64 -cpu pentium -m size=4G -nodefaults -nographic
> qemu-system-x86_64: Address space limit 0xffffffff < 0x13fffffff phys-bits too low (32)
> 
> As we have memory over 4G (due to PCI hole), that would now correctly fail.
> 
> However, what works is:
> 
> ./build/qemu-system-x86_64 -cpu pentium -m size=3G -nodefaults -nographic
> 
> 
> Weirdly enough, when setting cpu->phys_bits, we take care of PSE36 and allow for 36bits in the address space.

Hmm, I see this

        if (env->features[FEAT_1_EDX] & CPUID_PSE36) {
            cpu->phys_bits = 36;
        } else {
            cpu->phys_bits = 32;
        }

I will send a small patch to add PAE as well to this. 
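
Presumably something along these lines (an untested sketch, assuming PAE 
should be treated like PSE36 when sizing phys_bits):

        if (env->features[FEAT_1_EDX] & (CPUID_PSE36 | CPUID_PAE)) {
            cpu->phys_bits = 36;
        } else {
            cpu->phys_bits = 32;
        }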

> [...]
David Hildenbrand Sept. 12, 2023, 3:34 p.m. UTC | #9
[...]

>> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
>> index 54838c0c41..d187890675 100644
>> --- a/hw/i386/pc.c
>> +++ b/hw/i386/pc.c
>> @@ -908,9 +908,12 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size)
>> {
>>      X86CPU *cpu = X86_CPU(first_cpu);
>>
>> -    /* 32-bit systems don't have hole64 thus return max CPU address */
>> -    if (cpu->phys_bits <= 32) {
>> -        return ((hwaddr)1 << cpu->phys_bits) - 1;
>> +    /*
>> +     * 32-bit systems don't have hole64, but we might have a region for
>> +     * memory hotplug.
>> +     */
>> +    if (!(cpu->env.features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM)) {
>> +        return pc_pci_hole64_start() - 1;
> 
> Ok, this is very confusing! I am looking at the pc_pci_hole64_start() function. I have a few questions …
> (a) pc_get_device_memory_range() returns the size of the device memory as the difference between ram_size and maxram_size. But from what I understand, ram_size is the actual size of the ram present and maxram_size is the max size of ram *after* hot plugging additional memory. How can we assume that the additional available space is already occupied by hot plugged memory?

Let's take a look at an example:

$ ./build/qemu-system-x86_64 -m 8g,maxmem=16g,slots=1 \
   -object memory-backend-ram,id=mem0,size=1g \
   -device pc-dimm,memdev=mem0 \
   -nodefaults -nographic -S -monitor stdio

(qemu) info mtree
...
memory-region: system
   0000000000000000-ffffffffffffffff (prio 0, i/o): system
     0000000000000000-00000000bfffffff (prio 0, ram): alias ram-below-4g @pc.ram 0000000000000000-00000000bfffffff
     0000000000000000-ffffffffffffffff (prio -1, i/o): pci
       00000000000c0000-00000000000dffff (prio 1, rom): pc.rom
       00000000000e0000-00000000000fffff (prio 1, rom): alias isa-bios @pc.bios 0000000000020000-000000000003ffff
       00000000fffc0000-00000000ffffffff (prio 0, rom): pc.bios
     00000000000a0000-00000000000bffff (prio 1, i/o): alias smram-region @pci 00000000000a0000-00000000000bffff
     00000000000c0000-00000000000c3fff (prio 1, i/o): alias pam-pci @pci 00000000000c0000-00000000000c3fff
     00000000000c4000-00000000000c7fff (prio 1, i/o): alias pam-pci @pci 00000000000c4000-00000000000c7fff
     00000000000c8000-00000000000cbfff (prio 1, i/o): alias pam-pci @pci 00000000000c8000-00000000000cbfff
     00000000000cc000-00000000000cffff (prio 1, i/o): alias pam-pci @pci 00000000000cc000-00000000000cffff
     00000000000d0000-00000000000d3fff (prio 1, i/o): alias pam-pci @pci 00000000000d0000-00000000000d3fff
     00000000000d4000-00000000000d7fff (prio 1, i/o): alias pam-pci @pci 00000000000d4000-00000000000d7fff
     00000000000d8000-00000000000dbfff (prio 1, i/o): alias pam-pci @pci 00000000000d8000-00000000000dbfff
     00000000000dc000-00000000000dffff (prio 1, i/o): alias pam-pci @pci 00000000000dc000-00000000000dffff
     00000000000e0000-00000000000e3fff (prio 1, i/o): alias pam-pci @pci 00000000000e0000-00000000000e3fff
     00000000000e4000-00000000000e7fff (prio 1, i/o): alias pam-pci @pci 00000000000e4000-00000000000e7fff
     00000000000e8000-00000000000ebfff (prio 1, i/o): alias pam-pci @pci 00000000000e8000-00000000000ebfff
     00000000000ec000-00000000000effff (prio 1, i/o): alias pam-pci @pci 00000000000ec000-00000000000effff
     00000000000f0000-00000000000fffff (prio 1, i/o): alias pam-pci @pci 00000000000f0000-00000000000fffff
     00000000fec00000-00000000fec00fff (prio 0, i/o): ioapic
     00000000fed00000-00000000fed003ff (prio 0, i/o): hpet
     00000000fee00000-00000000feefffff (prio 4096, i/o): apic-msi
     0000000100000000-000000023fffffff (prio 0, ram): alias ram-above-4g @pc.ram 00000000c0000000-00000001ffffffff
     0000000240000000-000000047fffffff (prio 0, i/o): device-memory
       0000000240000000-000000027fffffff (prio 0, ram): mem0


We requested 8G of boot memory, which is split between "<4G" memory and ">=4G" memory.

We only place exactly 3G (0x0->0xbfffffff) under 4G, starting at address 0.

We leave the remainder (1G) of the <4G addresses available for I/O devices (32bit PCI hole).

So we end up with 5G (0x100000000->0x23fffffff) of memory starting exactly at address 4G.

"maxram_size - ram_size"=8G is the maximum amount of memory you can hotplug. We use it to size the
"device-memory" region:

0x47fffffff - 0x240000000+1 = 0x240000000
-> 9 GiB

We requested to hotplug a maximum of "8 GiB", and sized the area slightly larger to allow for some flexibility
when it comes to placing DIMMs in that "device-memory" area.

We place that area for memory devices after the RAM. So it starts after the 5G of ">=4G" boot memory.


Long story short, based on the initial RAM size and the maximum RAM size, you
can construct the layout above and exactly know
a) How much memory is below 4G, starting at address 0 -> leaving 1G for the 32bit PCI hole
b) How much memory is above 4G, starting at address 4g.
c) Where the region for memory devices starts (aligned after b) ) and how big it is.
d) Where the 64bit PCI hole is (after c) )
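
Purely as an illustration (pseudo-C; the variable names, the 1 GiB 
alignment and the exact slack are assumptions, the real logic lives in 
pc_memory_init() and the functions it calls), the layout above falls out 
of the two sizes like this:

    lowmem            = 0xc0000000;              /* 3 GiB of boot RAM below 4G  */
    above_4g_mem_size = ram_size - lowmem;       /* 8G - 3G = 5 GiB             */
    device_mem_base   = ROUND_UP(4 * GiB + above_4g_mem_size, 1 * GiB);
                                                 /* = 0x240000000               */
    device_mem_size   = maxram_size - ram_size + slack;
                                                 /* 8 GiB + 1 GiB slack = 9 GiB */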

> (b) Another question is, in pc_pci_hole64_start(), why are we adding this size to the start address?
> 
>     } else if (pcmc->has_reserved_memory && (ms->ram_size < ms->maxram_size)) {
>         pc_get_device_memory_range(pcms, &hole64_start, &size);
>         if (!pcmc->broken_reserved_end) {
>             hole64_start += size;

The 64bit PCI hole starts after "device-memory" above.

Apparently, we have to take care of some layout issues before QEMU 2.5. You can assume that nowadays,
"pcmc->broken_reserved_end" is never set. So the PCI64 hole is always after the device-memory region.

> 
> I think this is trying to put the hole after the device memory. But if the ram size is <=maxram_size then the hole is after the above_4G memory? Why?

I didn't quite get what the concern is, can you elaborate?

> 
> (c) In your above change, what does long mode have to do with all of this?

According to my understanding, 32bit (i386) doesn't have a 64bit hole. And 32bit vs.
64bit (i386 vs. x86_64) is decided based on LM, not on the address bits (as we learned, PSE36, and PAE).

But really, I just did what x86_cpu_realizefn() does to decide 32bit vs. 64bit ;)

     /* For 64bit systems think about the number of physical bits to present.
      * ideally this should be the same as the host; anything other than matching
      * the host can cause incorrect guest behaviour.
      * QEMU used to pick the magic value of 40 bits that corresponds to
      * consumer AMD devices but nothing else.
      *
      * Note that this code assumes features expansion has already been done
      * (as it checks for CPUID_EXT2_LM), and also assumes that potential
      * phys_bits adjustments to match the host have been already done in
      * accel-specific code in cpu_exec_realizefn.
      */
     if (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) {
     ...
     } else {
         /* For 32 bit systems don't use the user set value, but keep
          * phys_bits consistent with what we tell the guest.
          */
     ...


But that was just my quick attempt at fixing pc_max_used_gpa().

*Maybe* there is a 64bit PCI hole on 32bit i386 with 36bit addresses?

I'm the wrong person to ask, but I kind-of doubt it. :)
Ani Sinha Sept. 14, 2023, 5:53 a.m. UTC | #10
> On 12-Sep-2023, at 9:04 PM, David Hildenbrand <david@redhat.com> wrote:
> 
> [...]
> 
>>> [...]
>> Ok, this is very confusing! I am looking at the pc_pci_hole64_start() function. I have a few questions …
>> (a) pc_get_device_memory_range() returns the size of the device memory as the difference between ram_size and maxram_size. But from what I understand, ram_size is the actual size of the ram present and maxram_size is the max size of ram *after* hot plugging additional memory. How can we assume that the additional available space is already occupied by hot plugged memory?
> 
> Let's take a look at an example:
> 
> $ ./build/qemu-system-x86_64 -m 8g,maxmem=16g,slots=1 \
>  -object memory-backend-ram,id=mem0,size=1g \
>  -device pc-dimm,memdev=mem0 \
>  -nodefaults -nographic -S -monitor stdio
> 
> (qemu) info mtree
> ...
> [...]
> 
> 
> We requested 8G of boot memory, which is split between "<4G" memory and ">=4G" memory.
> 
> We only place exactly 3G (0x0->0xbfffffff) under 4G, starting at address 0.

I can’t reconcile this with this code for q35:

    if (machine->ram_size >= 0xb0000000) {
        lowmem = 0x80000000; /* low RAM then tops out at 0x7fffffff (2 GiB) */
    } else {
        lowmem = 0xb0000000; /* low RAM then tops out at 0xafffffff (2.75 GiB) */
    }

You assigned 8 GiB to RAM, which is > 0xb0000000 (2.75 GiB).


> 
> We leave the remainder (1G) of the <4G addresses available for I/O devices (32bit PCI hole).
> 
> So we end up with 5G (0x100000000->0x23fffffff) of memory starting exactly at address 4G.
> 
> "maxram_size - ram_size"=8G is the maximum amount of memory you can hotplug. We use it to size the
> "device-memory" region:
> 
> 0x47fffffff - 0x240000000+1 = 0x240000000
> -> 9 GiB
> 
> We requested to hotplug a maximum of "8 GiB", and sized the area slightly larger to allow for some flexibility
> when it comes to placing DIMMs in that "device-memory" area.

Right, but in this example you do not hot plug memory while the VM is running. We can hot plug 8G, yes, but the memory may not physically exist yet (and may never exist). How can we use this math to provision device-memory when the memory may not exist physically?

> 
> We place that area for memory devices after the RAM. So it starts after the 5G of ">=4G" boot memory.
> 
> 
> Long story short, based on the initial RAM size and the maximum RAM size, you
> can construct the layout above and exactly know
> a) How much memory is below 4G, starting at address 0 -> leaving 1G for the 32bit PCI hole
> b) How much memory is above 4G, starting at address 4g.
> c) Where the region for memory devices starts (aligned after b) ) and how big it is.
> d) Where the 64bit PCI hole is (after c) )
> 
>> (b) Another question is, in pc_pci_hole64_start(), why are we adding this size to the start address?
>>     } else if (pcmc->has_reserved_memory && (ms->ram_size < ms->maxram_size)) {
>>         pc_get_device_memory_range(pcms, &hole64_start, &size);
>>         if (!pcmc->broken_reserved_end) {
>>             hole64_start += size;
> 
> The 64bit PCI hole starts after "device-memory" above.
> 
> Apparently, we have to take care of some layout issues before QEMU 2.5. You can assume that nowadays,
> "pcmc->broken_reserved_end" is never set. So the PCI64 hole is always after the device-memory region.
> 
>> I think this is trying to put the hole after the device memory. But if the ram size is <=maxram_size then the hole is after the above_4G memory? Why?
> 
> I didn't quite get what the concern is, can you elaborate?

Oh, I meant the else part here and made a typo; the else implies ram size == maxram_size:

    } else {
        hole64_start = pc_above_4g_end(pcms);
    }

So in this case, there is no device_memory region?!
Another thing I do not understand is: for 32-bit, above_4g_mem_start is
4 GiB and above_4g_mem_size = ram_size - lowmem.
So we are allocating “above-4G” RAM above the address space of the processor?!

> 
>> (c) In your above change, what does long mode have to do with all of this?
> 
> According to my understanding, 32bit (i386) doesn't have a 64bit hole. And 32bit vs.
> 64bit (i386 vs. x86_64) is decided based on LM, not on the address bits (as we learned, PSE36, and PAE).
> 
> But really, I just did what x86_cpu_realizefn() does to decide 32bit vs. 64bit ;)
> 
>    /* For 64bit systems think about the number of physical bits to present.
>     * ideally this should be the same as the host; anything other than matching
>     * the host can cause incorrect guest behaviour.
>     * QEMU used to pick the magic value of 40 bits that corresponds to
>     * consumer AMD devices but nothing else.
>     *
>     * Note that this code assumes features expansion has already been done
>     * (as it checks for CPUID_EXT2_LM), and also assumes that potential
>     * phys_bits adjustments to match the host have been already done in
>     * accel-specific code in cpu_exec_realizefn.
>     */
>    if (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) {
>    ...
>    } else {
>        /* For 32 bit systems don't use the user set value, but keep
>         * phys_bits consistent with what we tell the guest.
>         */

Ah I see. I missed this. But I still can’t understand why for 32 bit, pc_pci_hole64_start() would be the right address for max gpa?

>    ...
> 
> 
> But that was just my quick attempt at fixing pc_max_used_gpa().
> 
> *Maybe* there is a 64bit PCI hole on 32bit i386 with 36bit addresses?
> 
> I'm the wrong person to ask, but I kind-of doubt it. :)
> 
> 
> -- 
> Cheers,
> 
> David / dhildenb
>
David Hildenbrand Sept. 14, 2023, 8:37 a.m. UTC | #11
On 14.09.23 07:53, Ani Sinha wrote:
> 
> 
>> On 12-Sep-2023, at 9:04 PM, David Hildenbrand <david@redhat.com> wrote:
>>
>> [...]
>>
>>>> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
>>>> index 54838c0c41..d187890675 100644
>>>> --- a/hw/i386/pc.c
>>>> +++ b/hw/i386/pc.c
>>>> @@ -908,9 +908,12 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size)
>>>> {
>>>>      X86CPU *cpu = X86_CPU(first_cpu);
>>>>
>>>> -    /* 32-bit systems don't have hole64 thus return max CPU address */
>>>> -    if (cpu->phys_bits <= 32) {
>>>> -        return ((hwaddr)1 << cpu->phys_bits) - 1;
>>>> +    /*
>>>> +     * 32-bit systems don't have hole64, but we might have a region for
>>>> +     * memory hotplug.
>>>> +     */
>>>> +    if (!(cpu->env.features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM)) {
>>>> +        return pc_pci_hole64_start() - 1;
>>> Ok this is very confusing! I am looking at pc_pci_hole64_start() function. I have a few questions …
>>> (a) pc_get_device_memory_range() returns the size of the device memory as the difference between ram_size and maxram_size. But from what I understand, ram_size is the actual size of the ram present and maxram_size is the max size of ram *after* hot plugging additional memory. How can we assume that the additional available space is already occupied by hot plugged memory?
>>
>> Let's take a look at an example:
>>
>> $ ./build/qemu-system-x86_64 -m 8g,maxmem=16g,slots=1 \
>>   -object memory-backend-ram,id=mem0,size=1g \
>>   -device pc-dimm,memdev=mem0 \
>>   -nodefaults -nographic -S -monitor stdio
>>
>> (qemu) info mtree
>> ...
>> memory-region: system
>>   0000000000000000-ffffffffffffffff (prio 0, i/o): system
>>     0000000000000000-00000000bfffffff (prio 0, ram): alias ram-below-4g @pc.ram 0000000000000000-00000000bfffffff
>>     0000000000000000-ffffffffffffffff (prio -1, i/o): pci
>>       00000000000c0000-00000000000dffff (prio 1, rom): pc.rom
>>       00000000000e0000-00000000000fffff (prio 1, rom): alias isa-bios @pc.bios 0000000000020000-000000000003ffff
>>       00000000fffc0000-00000000ffffffff (prio 0, rom): pc.bios
>>     00000000000a0000-00000000000bffff (prio 1, i/o): alias smram-region @pci 00000000000a0000-00000000000bffff
>>     00000000000c0000-00000000000c3fff (prio 1, i/o): alias pam-pci @pci 00000000000c0000-00000000000c3fff
>>     00000000000c4000-00000000000c7fff (prio 1, i/o): alias pam-pci @pci 00000000000c4000-00000000000c7fff
>>     00000000000c8000-00000000000cbfff (prio 1, i/o): alias pam-pci @pci 00000000000c8000-00000000000cbfff
>>     00000000000cc000-00000000000cffff (prio 1, i/o): alias pam-pci @pci 00000000000cc000-00000000000cffff
>>     00000000000d0000-00000000000d3fff (prio 1, i/o): alias pam-pci @pci 00000000000d0000-00000000000d3fff
>>     00000000000d4000-00000000000d7fff (prio 1, i/o): alias pam-pci @pci 00000000000d4000-00000000000d7fff
>>     00000000000d8000-00000000000dbfff (prio 1, i/o): alias pam-pci @pci 00000000000d8000-00000000000dbfff
>>     00000000000dc000-00000000000dffff (prio 1, i/o): alias pam-pci @pci 00000000000dc000-00000000000dffff
>>     00000000000e0000-00000000000e3fff (prio 1, i/o): alias pam-pci @pci 00000000000e0000-00000000000e3fff
>>     00000000000e4000-00000000000e7fff (prio 1, i/o): alias pam-pci @pci 00000000000e4000-00000000000e7fff
>>     00000000000e8000-00000000000ebfff (prio 1, i/o): alias pam-pci @pci 00000000000e8000-00000000000ebfff
>>     00000000000ec000-00000000000effff (prio 1, i/o): alias pam-pci @pci 00000000000ec000-00000000000effff
>>     00000000000f0000-00000000000fffff (prio 1, i/o): alias pam-pci @pci 00000000000f0000-00000000000fffff
>>     00000000fec00000-00000000fec00fff (prio 0, i/o): ioapic
>>     00000000fed00000-00000000fed003ff (prio 0, i/o): hpet
>>     00000000fee00000-00000000feefffff (prio 4096, i/o): apic-msi
>>     0000000100000000-000000023fffffff (prio 0, ram): alias ram-above-4g @pc.ram 00000000c0000000-00000001ffffffff
>>     0000000240000000-000000047fffffff (prio 0, i/o): device-memory
>>       0000000240000000-000000027fffffff (prio 0, ram): mem0
>>
>>
>> We requested 8G of boot memory, which is split between "<4G" memory and ">=4G" memory.
>>
>> We only place exactly 3G (0x0->0xbfffffff) under 4G, starting at address 0.
> 
> I can’t reconcile this with this code for q35:
> 
>     if (machine->ram_size >= 0xb0000000) {
>          lowmem = 0x80000000; // RAM below 4G capped at 2 GiB (0x0-0x7fffffff)
>      } else {
>          lowmem = 0xb0000000; // RAM below 4G capped at 2.75 GiB (0x0-0xafffffff)
>      }
> 
> You assigned 8 GiB to RAM, which is >= 0xb0000000 (2.75 GiB).
> 

QEMU defaults to the "pc" machine. If you add "-M q35" you get:

address-space: memory
   0000000000000000-ffffffffffffffff (prio 0, i/o): system
     0000000000000000-000000007fffffff (prio 0, ram): alias ram-below-4g @pc.ram 0000000000000000-000000007fffffff
[...]
     0000000100000000-000000027fffffff (prio 0, ram): alias ram-above-4g @pc.ram 0000000080000000-00000001ffffffff
     0000000280000000-00000004bfffffff (prio 0, i/o): device-memory
       0000000280000000-00000002bfffffff (prio 0, ram): mem0
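(Note how ram-below-4g now ends at 0x7fffffff, i.e. the 2 GiB q35 cap from the code you quoted, and the remaining 6 GiB of boot RAM sit above 4G.)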


> 
>>
>> We leave the remainder (1G) of the <4G addresses available for I/O devices (32bit PCI hole).
>>
>> So we end up with 5G (0x100000000->0x23fffffff) of memory starting exactly at address 4G.
>>
>> "maxram_size - ram_size"=8G is the maximum amount of memory you can hotplug. We use it to size the
>> "device-memory" region:
>>
>> 0x47fffffff - 0x240000000+1 = 0x240000000
>> -> 9 GiB
>>
> We requested to hotplug a maximum of "8 GiB", and sized the area slightly larger to allow for some flexibility
>> when it comes to placing DIMMs in that "device-memory" area.
> 
> Right, but in this example you do not hotplug memory while the VM is running. We can hotplug 8G, yes, but the memory may not physically exist yet (and may never exist). How can we use this math to provision device-memory when the memory may not exist physically?

We simply reserve a region in GPA space where we can coldplug and hotplug a
predefined maximum amount of memory we can hotplug.

What do you think is wrong with that?

> 
>>
>> We place that area for memory devices after the RAM. So it starts after the 5G of ">=4G" boot memory.
>>
>>
>> Long story short, based on the initial RAM size and the maximum RAM size, you
>> can construct the layout above and exactly know
>> a) How much memory is below 4G, starting at address 0 -> leaving 1G for the 32bit PCI hole
>> b) How much memory is above 4G, starting at address 4g.
>> c) Where the region for memory devices starts (aligned after b) ) and how big it is.
>> d) Where the 64bit PCI hole is (after c) )
>>
>>> (b) Another question is, in pc_pci_hole64_start(), why are we adding this size to the start address?
>>> } else if (pcmc->has_reserved_memory && (ms->ram_size < ms->maxram_size)) {
>>> 	pc_get_device_memory_range(pcms, &hole64_start, &size);
>>>          if (!pcmc->broken_reserved_end) {
>>>              hole64_start += size;
>>
>> The 64bit PCI hole starts after "device-memory" above.
>>
>> Apparently, we have to take care of some layout issues before QEMU 2.5. You can assume that nowadays,
>> "pcmc->broken_reserved_end" is never set. So the PCI64 hole is always after the device-memory region.
>>
>>> I think this is trying to put the hole after the device memory. But if the ram size is <=maxram_size then the hole is after the above_4G memory? Why?
>>
>> I didn't quite get what the concern is, can you elaborate?
> 
> Oh I meant the else part here and made a typo, the else implies ram size == maxram_size
> 
>    } else {
>          hole64_start = pc_above_4g_end(pcms);
>      }
> 
> So in this case, there is no device_memory region?!

Yes. In this case ms->ram_size == ms->maxram_size and you cannot cold/hotplug any memory devices.

See how pc_memory_init() doesn't call machine_memory_devices_init() in that case.

That's what the QEMU user asked for when *not* specifying maxmem (e.g., -m 4g).

In order to cold/hotplug any memory devices, you have to tell QEMU ahead of time how much memory
you are intending to provide using memory devices (DIMM, NVDIMM, virtio-pmem, virtio-mem).

So when specifying, say -m 4g,maxmem=20g, we can have memory devices of a total of 16g (20 - 4).
We reserve a GPA space for device_memory that is at least 16g, into which we can either coldplug
(QEMU cmdline) or hotplug (qmp/hmp) memory later.
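
To make the sizing rule concrete, here is a minimal sketch of the math as a standalone program. This is an illustration only, not the actual QEMU code: pc_memory_init() additionally aligns the region and adds per-DIMM-slot slack, which is why the earlier example reserved 9 GiB for 8 GiB of maxmem headroom, and the real lowmem rule has more cases than the single cap assumed here.

/* Minimal sketch of the device-memory sizing math; not the actual QEMU code. */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define GiB (1ULL << 30)

int main(void)
{
    uint64_t ram_size    = 4 * GiB;   /* -m 4g      */
    uint64_t maxram_size = 20 * GiB;  /* maxmem=20g */
    uint64_t lowmem_max  = 3 * GiB;   /* "pc" machine cap in the example; q35 used 2 GiB */

    uint64_t below_4g = ram_size < lowmem_max ? ram_size : lowmem_max;
    uint64_t above_4g = ram_size - below_4g;

    /* device-memory starts right after the ">=4G" boot RAM ...      */
    uint64_t devmem_start = 4 * GiB + above_4g;
    /* ... and must hold at least maxram_size - ram_size of devices. */
    uint64_t devmem_size  = maxram_size - ram_size;

    printf("below-4g RAM : 0x0-0x%" PRIx64 "\n", below_4g - 1);
    printf("above-4g RAM : 0x%" PRIx64 "-0x%" PRIx64 "\n",
           (uint64_t)(4 * GiB), devmem_start - 1);
    printf("device-memory: starts 0x%" PRIx64 ", size >= 0x%" PRIx64 "\n",
           devmem_start, devmem_size);
    return 0;
}

For -m 4g,maxmem=20g this prints a device-memory start of 0x140000000 (5 GiB) and a minimum size of 0x400000000 (16 GiB).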

> Another thing I do not understand is, for 32-bit,
> above_4g_mem_start is 4 GiB and above_4g_mem_size = ram_size - lowmem.
> So we are allocating “above-4G” RAM above the address space of the processor?!
> 
>>
>>> (c) in your above change, what does long mode have to do with all of this?
>>
>> According to my understanding, 32bit (i386) doesn't have a 64bit hole. And 32bit vs.
>> 64bit (i386 vs. x86_64) is decided based on LM, not on the address bits (as we learned, PSE36, and PAE).
>>
>> But really, I just did what x86_cpu_realizefn() does to decide 32bit vs. 64bit ;)
>>
>>     /* For 64bit systems think about the number of physical bits to present.
>>      * ideally this should be the same as the host; anything other than matching
>>      * the host can cause incorrect guest behaviour.
>>      * QEMU used to pick the magic value of 40 bits that corresponds to
>>      * consumer AMD devices but nothing else.
>>      *
>>      * Note that this code assumes features expansion has already been done
>>      * (as it checks for CPUID_EXT2_LM), and also assumes that potential
>>      * phys_bits adjustments to match the host have been already done in
>>      * accel-specific code in cpu_exec_realizefn.
>>      */
>>     if (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) {
>>     ...
>>     } else {
>>         /* For 32 bit systems don't use the user set value, but keep
>>          * phys_bits consistent with what we tell the guest.
>>          */
> 
> Ah I see. I missed this. But I still can’t understand why for 32 bit, pc_pci_hole64_start() would be the right address for max gpa?

You want "end of device memory region" if there is one, or
"end of RAM" is there is none.

What pc_pci_hole64_start() does:

/*
  * The 64bit pci hole starts after "above 4G RAM" and
  * potentially the space reserved for memory hotplug.
  */

There is the
	ROUND_UP(hole64_start, 1 * GiB);
in there that is not really required for the !hole64 case. It
shouldn't matter much in practice I think (besides an aligned value
showing up in the error message).

We could factor out most of that calculation into a
separate function, skipping that alignment to make that
clearer.
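
Something along these lines, as an untested sketch (the helper name pc_max_ram_end() is made up; it reuses the existing pc_get_device_memory_range() and pc_above_4g_end() and ignores the pre-2.5 broken_reserved_end quirk):

/* Untested sketch: end of boot RAM or of the device-memory region,
 * without the 1 GiB alignment that pc_pci_hole64_start() applies. */
static hwaddr pc_max_ram_end(PCMachineState *pcms)
{
    PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
    MachineState *ms = MACHINE(pcms);
    hwaddr base, size;

    if (pcmc->has_reserved_memory && ms->ram_size < ms->maxram_size) {
        pc_get_device_memory_range(pcms, &base, &size);
        return base + size;
    }
    return pc_above_4g_end(pcms);
}

pc_max_used_gpa() could then return pc_max_ram_end(pcms) - 1 for the !LM case.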
Ani Sinha Sept. 14, 2023, 11:21 a.m. UTC | #12
> On 14-Sep-2023, at 2:07 PM, David Hildenbrand <david@redhat.com> wrote:
> 
> On 14.09.23 07:53, Ani Sinha wrote:
>>> On 12-Sep-2023, at 9:04 PM, David Hildenbrand <david@redhat.com> wrote:
>>> 
>>> [...]
>>> 
>>> We requested 8G of boot memory, which is split between "<4G" memory and ">=4G" memory.
>>> 
>>> We only place exactly 3G (0x0->0xbfffffff) under 4G, starting at address 0.
>> I can’t reconcile this with this code for q35:
>>    if (machine->ram_size >= 0xb0000000) {
>>         lowmem = 0x80000000; // RAM below 4G capped at 2 GiB (0x0-0x7fffffff)
>>     } else {
>>         lowmem = 0xb0000000; // RAM below 4G capped at 2.75 GiB (0x0-0xafffffff)
>>     }
>> You assigned 8 GiB to RAM, which is >= 0xb0000000 (2.75 GiB).
> 
> QEMU defaults to the "pc" machine. If you add "-M q35" you get:
> 
> address-space: memory
>  0000000000000000-ffffffffffffffff (prio 0, i/o): system
>    0000000000000000-000000007fffffff (prio 0, ram): alias ram-below-4g @pc.ram 0000000000000000-000000007fffffff
> [...]
>    0000000100000000-000000027fffffff (prio 0, ram): alias ram-above-4g @pc.ram 0000000080000000-00000001ffffffff
>    0000000280000000-00000004bfffffff (prio 0, i/o): device-memory
>      0000000280000000-00000002bfffffff (prio 0, ram): mem0
> 
> 
>>> 
>>> We leave the remainder (1G) of the <4G addresses available for I/O devices (32bit PCI hole).
>>> 
>>> So we end up with 5G (0x100000000->0x23fffffff) of memory starting exactly at address 4G.
>>> 
>>> "maxram_size - ram_size"=8G is the maximum amount of memory you can hotplug. We use it to size the
>>> "device-memory" region:
>>> 
>>> 0x47fffffff - 0x240000000+1 = 0x240000000
>>> -> 9 GiB
>>> 
>>> We requested to hotplug a maximum of "8 GiB", and sized the area slightly larger to allow for some flexibility
>>> when it comes to placing DIMMs in that "device-memory" area.
>> Right, but in this example you do not hotplug memory while the VM is running. We can hotplug 8G, yes, but the memory may not physically exist yet (and may never exist). How can we use this math to provision device-memory when the memory may not exist physically?
> 
> We simply reserve a region in GPA space where we can coldplug and hotplug a
> predefined maximum amount of memory we can hotplug.
> 
> What do you think is wrong with that?

The only issue I have is that even though we are accounting for it, the memory actually might not be physically present.

> 
>>> 
>>> We place that area for memory devices after the RAM. So it starts after the 5G of ">=4G" boot memory.
>>> 
>>> 
>>> Long story short, based on the initial RAM size and the maximum RAM size, you
>>> can construct the layout above and exactly know
>>> a) How much memory is below 4G, starting at address 0 -> leaving 1G for the 32bit PCI hole
>>> b) How much memory is above 4G, starting at address 4g.
>>> c) Where the region for memory devices starts (aligned after b) ) and how big it is.
>>> d) Where the 64bit PCI hole is (after c) )
>>> 
>>>> (b) Another question is, in pc_pci_hole64_start(), why are we adding this size to the start address?
>>>> } else if (pcmc->has_reserved_memory && (ms->ram_size < ms->maxram_size)) {
>>>> 	pc_get_device_memory_range(pcms, &hole64_start, &size);
>>>>         if (!pcmc->broken_reserved_end) {
>>>>             hole64_start += size;
>>> 
>>> The 64bit PCI hole starts after "device-memory" above.
>>> 
>>> Apparently, we have to take care of some layout issues before QEMU 2.5. You can assume that nowadays,
>>> "pcmc->broken_reserved_end" is never set. So the PCI64 hole is always after the device-memory region.
>>> 
>>>> I think this is trying to put the hole after the device memory. But if the ram size is <=maxram_size then the hole is after the above_4G memory? Why?
>>> 
>>> I didn't quite get what the concern is, can you elaborate?
>> Oh I meant the else part here and made a typo, the else implies ram size == maxram_size
>>   } else {
>>         hole64_start = pc_above_4g_end(pcms);
>>     }
>> So in this case, there is no device_memory region?!
> 
> Yes. In this case ms->ram_size == ms->maxram_size and you cannot cold/hotplug any memory devices.
> 
> See how pc_memory_init() doesn't call machine_memory_devices_init() in that case.
> 
> That's what the QEMU user asked for when *not* specifying maxmem (e.g., -m 4g).
> 
> In order to cold/hotplug any memory devices, you have to tell QEMU ahead of time how much memory
> you are intending to provide using memory devices (DIMM, NVDIMM, virtio-pmem, virtio-mem).

So that means that when we are actually hotplugging memory, there is no need to perform additional checks; they can be done statically when -m, -maxmem, etc. are provided on the command line.

> 
> So when specifying, say -m 4g,maxmem=20g, we can have memory devices of a total of 16g (20 - 4).
> We use reserve a GPA space for device_memory that is at least 16g, into which we can either coldplug
> (QEMU cmdline) or hotplug (qmp/hmp) memory later.
> 
>> Another thing I do not understand is, for 32-bit,
>> above_4g_mem_start is 4 GiB and above_4g_mem_size = ram_size - lowmem.
>> So we are allocating “above-4G” RAM above the address space of the processor?!
>>> 
>>>> (c) in your above change, what does long mode have to do with all of this?
>>> 
>>> According to my understanding, 32bit (i386) doesn't have a 64bit hole. And 32bit vs.
>>> 64bit (i386 vs. x86_64) is decided based on LM, not on the address bits (as we learned, PSE36, and PAE).
>>> 
>>> But really, I just did what x86_cpu_realizefn() does to decide 32bit vs. 64bit ;)
>>> 
>>>    /* For 64bit systems think about the number of physical bits to present.
>>>     * ideally this should be the same as the host; anything other than matching
>>>     * the host can cause incorrect guest behaviour.
>>>     * QEMU used to pick the magic value of 40 bits that corresponds to
>>>     * consumer AMD devices but nothing else.
>>>     *
>>>     * Note that this code assumes features expansion has already been done
>>>     * (as it checks for CPUID_EXT2_LM), and also assumes that potential
>>>     * phys_bits adjustments to match the host have been already done in
>>>     * accel-specific code in cpu_exec_realizefn.
>>>     */
>>>    if (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) {
>>>    ...
>>>    } else {
>>>        /* For 32 bit systems don't use the user set value, but keep
>>>         * phys_bits consistent with what we tell the guest.
>>>         */
>> Ah I see. I missed this. But I still can’t understand why for 32 bit, pc_pci_hole64_start() would be the right address for max gpa?
> 
> You want "end of device memory region" if there is one, or
> "end of RAM" is there is none.
> 
> What pc_pci_hole64_start() does:
> 
> /*
> * The 64bit pci hole starts after "above 4G RAM" and
> * potentially the space reserved for memory hotplug.
> */
> 
> There is the
> 	ROUND_UP(hole64_start, 1 * GiB);
> in there that is not really required for the !hole64 case. It
> shouldn't matter much in practice I think (besides an aligned value
> showing up in the error message).
> 
> We could factor out most of that calculation into a
> separate function, skipping that alignment to make that
> clearer.

Yeah this whole memory segmentation is quite complicated and might benefit from a qemu doc or a refactoring.
David Hildenbrand Sept. 14, 2023, 11:49 a.m. UTC | #13
>>>> We requested to hotplug a maximum of "8 GiB", and sized the area slightly larger to allow for some flexibility
>>>> when it comes to placing DIMMs in that "device-memory" area.
>>> Right, but in this example you do not hotplug memory while the VM is running. We can hotplug 8G, yes, but the memory may not physically exist yet (and may never exist). How can we use this math to provision device-memory when the memory may not exist physically?
>>
>> We simply reserve a region in GPA space where we can coldplug and hotplug a
>> predefined maximum amount of memory we can hotplug.
>>
>> What do you think is wrong with that?
> 
> The only issue I have is that even though we are accounting for it, the memory actually might not be physically present.

Not sure if "accounting" is the right word; the memory is not present 
and nowhere indicated as present. It's just a reservation of GPA space, 
like the PCI hole is as well.

[...]

>>
>> Yes. In this case ms->ram_size == ms->maxram_size and you cannot cold/hotplug any memory devices.
>>
>> See how pc_memory_init() doesn't call machine_memory_devices_init() in that case.
>>
>> That's what the QEMU user asked for when *not* specifying maxmem (e.g., -m 4g).
>>
>> In order to cold/hotplug any memory devices, you have to tell QEMU ahead of time how much memory
>> you are intending to provide using memory devices (DIMM, NVDIMM, virtio-pmem, virtio-mem).
> 
> So that means that when we are actually hotplugging memory, there is no need to perform additional checks; they can be done statically when -m, -maxmem, etc. are provided on the command line.

What the memory device code does is find a free location inside the
reserved GPA space for memory devices. Then it maps the device at
that location.
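
As a stripped-down, hypothetical illustration of that search (a first-fit walk over the reserved region; the real memory_device_get_free_addr() additionally honors alignment, user-requested addresses and the slot limit):

#include <stdint.h>

typedef struct { uint64_t start, size; } DevRange;

/* 'used' must be sorted by start address and lie inside the region;
 * returns UINT64_MAX if no gap of 'size' bytes is left. */
static uint64_t find_free_addr(uint64_t region_start, uint64_t region_size,
                               const DevRange *used, int n_used,
                               uint64_t size)
{
    uint64_t candidate = region_start;
    int i;

    for (i = 0; i < n_used; i++) {
        if (candidate + size <= used[i].start) {
            break;              /* the gap before device i is big enough */
        }
        candidate = used[i].start + used[i].size;
    }
    if (candidate + size <= region_start + region_size) {
        return candidate;
    }
    return UINT64_MAX;          /* no free range left in the region */
}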

[...]

>> /*
>> * The 64bit pci hole starts after "above 4G RAM" and
>> * potentially the space reserved for memory hotplug.
>> */
>>
>> There is the
>> 	ROUND_UP(hole64_start, 1 * GiB);
>> in there that is not really required for the !hole64 case. It
>> shouldn't matter much in practice I think (besides an aligned value
>> showing up in the error message).
>>
>> We could factor out most of that calculation into a
>> separate function, skipping that alignment to make that
>> clearer.
> 
> Yeah this whole memory segmentation is quite complicated and might benefit from a qemu doc or a refactoring.

Absolutely. Do you have time to work on that (including the updated fix)?
Ani Sinha Sept. 14, 2023, 5:11 p.m. UTC | #14
> On 14-Sep-2023, at 2:07 PM, David Hildenbrand <david@redhat.com> wrote:
> 
> On 14.09.23 07:53, Ani Sinha wrote:
>>> On 12-Sep-2023, at 9:04 PM, David Hildenbrand <david@redhat.com> wrote:
>>> 
>>> [...]
>>> 
>>> We requested 8G of boot memory, which is split between "<4G" memory and ">=4G" memory.
>>> 
>>> We only place exactly 3G (0x0->0xbfffffff) under 4G, starting at address 0.
>> I can’t reconcile this with this code for q35:
>>    if (machine->ram_size >= 0xb0000000) {
>>         lowmem = 0x80000000; // RAM below 4G capped at 2 GiB (0x0-0x7fffffff)
>>     } else {
>>         lowmem = 0xb0000000; // RAM below 4G capped at 2.75 GiB (0x0-0xafffffff)
>>     }
>> You assigned 8 GiB to RAM, which is >= 0xb0000000 (2.75 GiB).
> 
> QEMU defaults to the "pc" machine. If you add "-M q35" you get:

Indeed. For some reason I thought we moved away from i440fx both upstream and downstream.

static void pc_i440fx_8_2_machine_options(MachineClass *m)
{
    pc_i440fx_machine_options(m);
    m->alias = "pc";
    m->is_default = true;
}


And

static MachineClass *find_default_machine(GSList *machines)
{
    GSList *el;
    MachineClass *default_machineclass = NULL;

    for (el = machines; el; el = el->next) {
        MachineClass *mc = el->data;

        if (mc->is_default) {
            assert(default_machineclass == NULL && "Multiple default machines");
            default_machineclass = mc;
        }
    }

    return default_machineclass;
}


But even on downstream it's i440fx:

$ /usr/libexec/qemu-kvm -M ? | grep default
pc-i440fx-rhel7.6.0  RHEL 7.6.0 PC (i440FX + PIIX, 1996) (default) (deprecated)



> [...]
Ani Sinha Sept. 15, 2023, 10:38 a.m. UTC | #15
> On 14-Sep-2023, at 5:19 PM, David Hildenbrand <david@redhat.com> wrote:
> 
>>>>> We requested to hotplug a maximum of "8 GiB", and sized the area slightly larger to allow for some flexibility
>>>>> when it comes to placing DIMMs in that "device-memory" area.
>>>> Right, but in this example you do not hotplug memory while the VM is running. We can hotplug 8G, yes, but the memory may not physically exist yet (and may never exist). How can we use this math to provision device-memory when the memory may not exist physically?
>>> 
>>> We simply reserve a region in GPA space where we can coldplug and hotplug a
>>> predefined maximum amount of memory we can hotplug.
>>> 
>>> What do you think is wrong with that?
>> The only issue I have is that even though we are accounting for it, the memory actually might not be physically present.
> 
> Not sure if "accounting" is the right word; the memory is not present and nowhere indicated as present. It's just a reservation of GPA space, like the PCI hole is as well.
> 
> [...]
> 
>>> 
>>> Yes. In this case ms->ram_size == ms->maxram_size and you cannot cold/hotplug any memory devices.
>>> 
>>> See how pc_memory_init() doesn't call machine_memory_devices_init() in that case.
>>> 
>>> That's what the QEMU user asked for when *not* specifying maxmem (e.g., -m 4g).
>>> 
>>> In order to cold/hotplug any memory devices, you have to tell QEMU ahead of time how much memory
>>> you are intending to provide using memory devices (DIMM, NVDIMM, virtio-pmem, virtio-mem).
>> So that means that when we are actually hotplugging memory, there is no need to perform additional checks; they can be done statically when -m, -maxmem, etc. are provided on the command line.
> 
> What the memory device code does is find a free location inside the reserved GPA space for memory devices. Then it maps the
> device at that location.

Yes, I see the function memory_device_get_free_addr(), which does all the range checks and validations; it gets called from the pre_plug() handler.

> 
> [...]
> 
>>> /*
>>> * The 64bit pci hole starts after "above 4G RAM" and
>>> * potentially the space reserved for memory hotplug.
>>> */
>>> 
>>> There is the
>>> 	ROUND_UP(hole64_start, 1 * GiB);
>>> in there that is not really required for the !hole64 case. It
>>> shouldn't matter much in practice I think (besides an aligned value
>>> showing up in the error message).
>>> 
>>> We could factor out most of that calculation into a
>>> separate function, skipping that alignment to make that
>>> clearer.
>> Yeah this whole memory segmentation is quite complicated and might benefit from a qemu doc or a refactoring.
> 
> Absolutely. Do you have time to work on that (including the updated fix)?

Other than the fix you proposed, I am not sure we need to fix anything else at the moment. It seems the physical address space bounds checks are already in place.
Re: doc, maybe. I will add it to my TODO list.

> 
> -- 
> Cheers,
> 
> David / dhildenb
Ani Sinha Sept. 16, 2023, 5:17 a.m. UTC | #16
> On 14-Sep-2023, at 2:07 PM, David Hildenbrand <david@redhat.com> wrote:
> 
> On 14.09.23 07:53, Ani Sinha wrote:
>>> On 12-Sep-2023, at 9:04 PM, David Hildenbrand <david@redhat.com> wrote:
>>> 
>>> [...]
>>> 
>> Ah I see. I missed this. But I still can’t understand why for 32 bit, pc_pci_hole64_start() would be the right address for max gpa?
> 
> You want "end of device memory region" if there is one, or
> "end of RAM" is there is none.
> 
> What pc_pci_hole64_start() does:
> 
> /*
> * The 64bit pci hole starts after "above 4G RAM" and
> * potentially the space reserved for memory hotplug.
> */

OK, hopefully my last question. I am still confused about something. Does the above mean that hole64 will actually start at an address beyond maxram? Basically, if you add up ram_below_4g, ram_above_4g, the hotplug (device-memory) region and pci_hole64, can the sum exceed maxram? I think it will. Is this not an issue?
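(Concretely, with the earlier -m 8g,maxmem=16g example: 3 GiB below 4G plus 5 GiB above 4G plus the 9 GiB device-memory region already end at 0x47fffffff, so hole64 would start at 0x480000000, i.e. at 18 GiB, well past the 16 GiB of maxmem.)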

> [...]
David Hildenbrand Sept. 18, 2023, 9:33 a.m. UTC | #17
>>
>>>> /*
>>>> * The 64bit pci hole starts after "above 4G RAM" and
>>>> * potentially the space reserved for memory hotplug.
>>>> */
>>>>
>>>> There is the
>>>> 	ROUND_UP(hole64_start, 1 * GiB);
>>>> in there that is not really required for the !hole64 case. It
>>>> shouldn't matter much in practice I think (besides an aligned value
>>>> showing up in the error message).
>>>>
>>>> We could factor out most of that calculation into a
>>>> separate function, skipping that alignment to make that
>>>> clearer.
>>> Yeah this whole memory segmentation is quite complicated and might benefit from a qemu doc or a refactoring.
>>
>> Absolutely. Do you have time to work on that (including the updated fix)?
> 
> Other than the fix you proposed, I am not sure we need to fix anything else at the moment. It seems the physical address space bounds checks are already in place.
> Re: doc, maybe. I will add it to my TODO list.

Will you send a proper patch, ideally not using pc_pci_hole64_start() 
but instead the same logic without the final alignment to 1 GiB?

I can also do it, but might take a bit longer.
Ani Sinha Sept. 18, 2023, 10:07 a.m. UTC | #18
On Mon, 18 Sept, 2023, 3:03 pm David Hildenbrand, <david@redhat.com> wrote:

> [...]
>
> Will you send a proper patch, ideally not using pc_pci_hole64_start()
> but instead the same logic without the final alignment to 1 GiB?
>

I'll send. No problem. Could you answer my other question, please?


> I can also do it, but might take a bit longer.
>
> --
> Cheers,
>
> David / dhildenb
>
>
David Hildenbrand Sept. 18, 2023, 10:09 a.m. UTC | #19
On 18.09.23 12:07, Ani Sinha wrote:
> 
> 
> On Mon, 18 Sept, 2023, 3:03 pm David Hildenbrand, <david@redhat.com> wrote:
> 
>     [...]
> 
>     Will you send a proper patch, ideally not using pc_pci_hole64_start()
>     but instead the same logic without the final alignment to 1 GiB?
> 
> 
> I'll send. No problem. Could you answer my other question, please?

Sorry, which one did I miss?
Ani Sinha Sept. 18, 2023, 10:11 a.m. UTC | #20
On Mon, 18 Sept, 2023, 3:39 pm David Hildenbrand, <david@redhat.com> wrote:

> On 18.09.23 12:07, Ani Sinha wrote:
> >
> >
> > On Mon, 18 Sept, 2023, 3:03 pm David Hildenbrand, <david@redhat.com
> > <mailto:david@redhat.com>> wrote:
> >
> >      >>
> >      >>>> /*
> >      >>>> * The 64bit pci hole starts after "above 4G RAM" and
> >      >>>> * potentially the space reserved for memory hotplug.
> >      >>>> */
> >      >>>>
> >      >>>> There is the
> >      >>>>    ROUND_UP(hole64_start, 1 * GiB);
> >      >>>> in there that is not really required for the !hole64 case. It
> >      >>>> shouldn't matter much in practice I think (besides an aligned
> >     value
> >      >>>> showing up in the error message).
> >      >>>>
> >      >>>> We could factor out most of that calculation into a
> >      >>>> separate function, skipping that alignment to make that
> >      >>>> clearer.
> >      >>> Yeah this whole memory segmentation is quite complicated and
> >     might benefit from a qemu doc or a refactoring.
> >      >>
> >      >> Absolutely. Do you have time to work on that (including the
> >      >> updated fix?).
> >      >
> >      > Other than the fix you proposed I am not sure if we need to fix
> >      > anything else atm. Seems physical address space bound checks are
> >      > already in place.
> >      > Re: doc, maybe. I will add it to my TODO list.
> >
> >     Will you send a proper patch, ideally not using pc_pci_hole64_start()
> >     but instead the same logic without the final alignment to 1 GiB?
> >
> >
> > I'll send. No problem. Could you answer my other question please?
>
> Sorry, which one did I miss?



Ok hopefully my last question. I am still confused on something. Does the
above mean that the hole64 will actually start from an address that is
beyond maxram? Like basically if you added all of ram_below_4G,
ram_above_4G, hotplug_mem and pci_hole64 then can it exceed maxram? I
think it will. Is this not an issue?




> --
> Cheers,
>
> David / dhildenb
>
>
Ani Sinha Sept. 18, 2023, 10:14 a.m. UTC | #21
On Mon, 18 Sept, 2023, 3:41 pm Ani Sinha, <anisinha@redhat.com> wrote:

>
>
> On Mon, 18 Sept, 2023, 3:39 pm David Hildenbrand, <david@redhat.com>
> wrote:
>
>> On 18.09.23 12:07, Ani Sinha wrote:
>> >
>> >
>> > On Mon, 18 Sept, 2023, 3:03 pm David Hildenbrand, <david@redhat.com> wrote:
>> >
>> >      >>
>> >      >>>> /*
>> >      >>>> * The 64bit pci hole starts after "above 4G RAM" and
>> >      >>>> * potentially the space reserved for memory hotplug.
>> >      >>>> */
>> >      >>>>
>> >      >>>> There is the
>> >      >>>>    ROUND_UP(hole64_start, 1 * GiB);
>> >      >>>> in there that is not really required for the !hole64 case. It
>> >      >>>> shouldn't matter much in practice I think (besides an aligned
>> >      >>>> value showing up in the error message).
>> >      >>>>
>> >      >>>> We could factor out most of that calculation into a
>> >      >>>> separate function, skipping that alignment to make that
>> >      >>>> clearer.
>> >      >>> Yeah this whole memory segmentation is quite complicated and
>> >      >>> might benefit from a qemu doc or a refactoring.
>> >      >>
>> >      >> Absolutely. Do you have time to work on that (including the
>> >      >> updated fix?).
>> >      >
>> >      > Other than the fix you proposed I am not sure if we need to fix
>> >      > anything else atm. Seems physical address space bound checks are
>> >      > already in place.
>> >      > Re: doc, maybe. I will add it to my TODO list.
>> >
>> >     Will you send a proper patch, ideally not using pc_pci_hole64_start()
>> >     but instead the same logic without the final alignment to 1 GiB?
>> >
>> >
>> > I'll send. No problem. Could you answer my other question please?
>>
>> Sorry, which one did I miss?
>
>

Ok hopefully my last question. I am still confused on something. Does the
above mean that the hole64 will actually start from an address that is
beyond maxram? Like basically if you added all of ram_below_4G,
ram_above_4G, hotplug_mem and pci_hole64 then can it exceed maxram? I
think it will. Is this not an issue?



David Hildenbrand Sept. 18, 2023, 10:19 a.m. UTC | #22
On 18.09.23 12:11, Ani Sinha wrote:

> 
>     Ok hopefully my last question. I am still confused on something.
>     Does the above mean that the hole64 will actually start from an
>     address that is beyond maxram? Like basically if you added all of
>     ram_below_4G, ram_above_4G, hotplug_mem and pci_hole64 then can it
>     exceed maxram? I think it will. Is this not an issue?

If you'd have a 2 GiB VM, the device memory region and hole64 would 
always be placed >= 4 GiB address, yes.

As maxram is just a size, and not a PFN, I don't think there is any 
issue with that.

ms->maxram_size is usually only used in combination with ms->ram_size to
detect if memory devices will be enabled, and to size that region for 
memory devices.
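As an illustration of that relationship, a minimal sketch -- the names and
struct shape here are simplified assumptions, not the actual QEMU
MachineState:

/*
 * Sketch: ram_size and maxram_size are plain sizes; their difference
 * is what sizes the region reserved for memory devices (DIMMs etc.).
 */
#include <stdbool.h>
#include <stdint.h>

typedef struct {
    uint64_t ram_size;     /* boot RAM, e.g. -m 4G */
    uint64_t maxram_size;  /* boot RAM + hotplug room, e.g. maxmem=16G */
} MachineSizes;

/* memory devices are only enabled when there is room beyond boot RAM */
static bool memory_devices_enabled(const MachineSizes *ms)
{
    return ms->maxram_size > ms->ram_size;
}

/* the size of the device memory region, independent of where it is placed */
static uint64_t device_memory_region_size(const MachineSizes *ms)
{
    return ms->maxram_size - ms->ram_size;
}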
Ani Sinha Sept. 18, 2023, 10:54 a.m. UTC | #23
On Mon, Sep 18, 2023 at 3:49 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 18.09.23 12:11, Ani Sinha wrote:
>
> >
> >     Ok hopefully my last question. I am still confused on something.
> >     Does the above mean that the hole64 will actually start from an
> >     address that is beyond maxram? Like basically if you added all of
> >     ram_below_4G, ram_above_4G, hotplug_mem and pci_hole64 then can it
> >     exceed maxram? I think it will. Is this not an issue?
>
> If you'd have a 2 GiB VM, the device memory region and hole64 would
> always be placed >= 4 GiB address, yes.
>
> As maxram is just a size, and not a PFN, I don't think there is any
> issue with that.

So this is all just a scheme to decide what to place where with maxram
amount of memory available. When the processor needs to access the
memory mapped PCI device, it's simply dynamically mapped to the
available physical ram. Is my understanding correct here?

>
> ms->maxram_size is usually only used in combination with ms->ram_size to
> detect if memory devices will be enabled, and to size that region for
> memory devices.
>
> --
> Cheers,
>
> David / dhildenb
>
David Hildenbrand Sept. 18, 2023, 10:58 a.m. UTC | #24
On 18.09.23 12:54, Ani Sinha wrote:
> On Mon, Sep 18, 2023 at 3:49 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 18.09.23 12:11, Ani Sinha wrote:
>>
>>>
>>>      Ok hopefully my last question. I am still confused on something.
>>>      Does the above mean that the hole64 will actually start from an
>>>      address that is beyond maxram? Like basically if you added all of
>>>      ram_below_4G, ram_above_4G, hotplug_mem and pci_hole64 then can it
>>>      exceed maxram? I think it will. Is this not an issue?
>>
>> If you'd have a 2 GiB VM, the device memory region and hole64 would
>> always be placed >= 4 GiB address, yes.
>>
>> As maxram is just a size, and not a PFN, I don't think there is any
>> issue with that.
> 
> So this is all just a scheme to decide what to place where with maxram
> amount of memory available. When the processor needs to access the

Yes. ram_size and maxram_size are only used to create the memory layout.

> memory mapped PCI device, it's simply dynamically mapped to the
> available physical ram. Is my understanding correct here?

I'm no expert on that, but from my understanding that's what the 
pci/pci64 hole is for -- mapping PCI BARs into these areas, such that 
they don't conflict with actual guest RAM. That's why we still account 
these "holes" as valid GFN that could be used+accessed by the VM once a 
PCI BAR gets mapped in there.
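To put numbers on the 2 GiB example, here is a small standalone sketch of the
placement arithmetic. The constants and the exact split are illustrative
assumptions, not the precise pc.c logic:

/* Sketch: why device memory and hole64 land at/above 4 GiB for a 2 GiB VM. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define GiB (1ULL << 30)

int main(void)
{
    uint64_t ram_size    = 2 * GiB;   /* -m 2G */
    uint64_t maxram_size = 8 * GiB;   /* maxmem=8G (assumed) */

    /* with only 2 GiB of boot RAM, no RAM is placed above 4 GiB at all */
    uint64_t above_4g_size = ram_size > 4 * GiB ? ram_size - 4 * GiB : 0;

    /* the device memory region still starts at/above the 4 GiB boundary */
    uint64_t device_mem_base = 4 * GiB + above_4g_size;
    uint64_t device_mem_size = maxram_size - ram_size;

    /* hole64 follows the device memory region, rounded up to 1 GiB */
    uint64_t hole64_start = device_mem_base + device_mem_size;
    hole64_start = (hole64_start + GiB - 1) & ~(GiB - 1);

    printf("device memory base: 0x%" PRIx64 "\n", device_mem_base); /* 0x100000000 */
    printf("hole64 start:       0x%" PRIx64 "\n", hole64_start);    /* 0x280000000 */
    return 0;
}

Both addresses sit well past the 2 GiB of configured RAM, which is exactly
the situation the question describes -- and, as noted above, that is fine
because maxram is a size, not an address.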
Ani Sinha Sept. 18, 2023, 11 a.m. UTC | #25
On Mon, Sep 18, 2023 at 4:28 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 18.09.23 12:54, Ani Sinha wrote:
> > On Mon, Sep 18, 2023 at 3:49 PM David Hildenbrand <david@redhat.com> wrote:
> >>
> >> On 18.09.23 12:11, Ani Sinha wrote:
> >>
> >>>
> >>>      Ok hopefully my last question. I am still confused on something.
> >>>      Does the above mean that the hole64 will actually start from an
> >>>      address that is beyond maxram? Like basically if you added all of
> >>>      ram_below_4G, ram_above_4G, hotplug_mem and pci_hole64 then can it
> >>>      exceed maxram? I think it will. Is this not an issue?
> >>
> >> If you'd have a 2 GiB VM, the device memory region and hole64 would
> >> always be placed >= 4 GiB address, yes.
> >>
> >> As maxram is just a size, and not a PFN, I don't think there is any
> >> issue with that.
> >
> > So this is all just a scheme to decide what to place where with maxram
> > amount of memory available. When the processor needs to access the
>
> Yes. ram_size and maxram_size are only used to create the memory layout.
>
> > memory mapped PCI device, it's simply dynamically mapped to the
> > available physical ram. Is my understanding correct here?
>
> I'm no expert on that, but from my understanding that's what the
> pci/pci64 hole is for -- mapping PCI BARs into these areas, such that
> they don't conflict with actual guest RAM. That's why we still account
> these "holes" as valid GFN that could be used+accessed by the VM once a
> PCI BAR gets mapped in there.

Yes that was my understanding too but since device drivers need to
access those BAR addresses, they need to be mapped to the actual
available physical ram.
Ani Sinha Sept. 18, 2023, 11:02 a.m. UTC | #26
On Mon, Sep 18, 2023 at 4:30 PM Ani Sinha <anisinha@redhat.com> wrote:
>
> On Mon, Sep 18, 2023 at 4:28 PM David Hildenbrand <david@redhat.com> wrote:
> >
> > On 18.09.23 12:54, Ani Sinha wrote:
> > > On Mon, Sep 18, 2023 at 3:49 PM David Hildenbrand <david@redhat.com> wrote:
> > >>
> > >> On 18.09.23 12:11, Ani Sinha wrote:
> > >>
> > >>>
> > >>>      Ok hopefully my last question. I am still confused on something.
> > >>>      Does the above mean that the hole64 will actually start from an
> > >>>      address that is beyond maxram? Like basically if you added all of
> > >>>      ram_below_4G, ram_above_4G, hotplug_mem and pci_hole64 then can it
> > >>>      exceed maxram? I think it will. Is this not an issue?
> > >>
> > >> If you'd have a 2 GiB VM, the device memory region and hole64 would
> > >> always be placed >= 4 GiB address, yes.
> > >>
> > >> As maxram is just a size, and not a PFN, I don't think there is any
> > >> issue with that.
> > >
> > > So this is all just a scheme to decide what to place where with maxram
> > > amount of memory available. When the processor needs to access the
> >
> > Yes. ram_size and maxram_size are only used to create the memory layout.
> >
> > > memory mapped PCI device, it's simply dynamically mapped to the
> > > available physical ram. Is my understanding correct here?
> >
> > I'm no expert on that, but from my understanding that's what the
> > pci/pci64 hole is for -- mapping PCI BARs into these areas, such that
> > they don't conflict with actual guest RAM. That's why we still account
> > these "holes" as valid GFN that could be used+accessed by the VM once a
> > PCI BAR gets mapped in there.
>
> Yes that was my understanding too but since device drivers need to
> access those BAR addresses, they need to be mapped to the actual
> available physical ram.

No, sorry, I was confused. They are just register addresses on the
device, unrelated to RAM.
David Hildenbrand Sept. 18, 2023, 11:02 a.m. UTC | #27
On 18.09.23 13:00, Ani Sinha wrote:
> On Mon, Sep 18, 2023 at 4:28 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 18.09.23 12:54, Ani Sinha wrote:
>>> On Mon, Sep 18, 2023 at 3:49 PM David Hildenbrand <david@redhat.com> wrote:
>>>>
>>>> On 18.09.23 12:11, Ani Sinha wrote:
>>>>
>>>>>
>>>>>       Ok hopefully my last question. I am still confused on something.
>>>>>       Does the above mean that the hole64 will actually start from an
>>>>>       address that is beyond maxram? Like basically if you added all of
>>>>>       ram_below_4G, ram_above_4G, hotplug_mem and pci_hole64 then can it
>>>>>       exceed maxram? I think it will. Is this not an issue?
>>>>
>>>> If you'd have a 2 GiB VM, the device memory region and hole64 would
>>>> always be placed >= 4 GiB address, yes.
>>>>
>>>> As maxram is just a size, and not a PFN, I don't think there is any
>>>> issue with that.
>>>
>>> So this is all just a scheme to decide what to place where with maxram
>>> amount of memory available. When the processor needs to access the
>>
>> Yes. ram_size and maxram_size are only used to create the memory layout.
>>
>>> memory mapped PCI device, it's simply dynamically mapped to the
>>> available physical ram. Is my understanding correct here?
>>
>> I'm no expert on that, but from my understanding that's what the
>> pci/pci64 hole is for -- mapping PCI BARs into these areas, such that
>> they don't conflict with actual guest RAM. That's why we still account
>> these "holes" as valid GFN that could be used+accessed by the VM once a
>> PCI BAR gets mapped in there.
> 
> Yes that was my understanding too but since device drivers need to
> access those BAR addresses, they need to be mapped to the actual
> available physical ram.

These devices supply their own RAM memory regions for BARs, which will
get mapped into the holes. They are not using any boot/hotplug memory.
It's confusing.
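For illustration, this is roughly what "supplying their own memory region
for a BAR" looks like in a device model. A minimal sketch with placeholder
names -- MyDevState, MYDEV() and mydev_mmio_ops stand in for a real device
and are not actual QEMU symbols:

/* assumes "qemu/osdep.h", "qemu/units.h" and "hw/pci/pci.h" */
static void mydev_realize(PCIDevice *pdev, Error **errp)
{
    MyDevState *s = MYDEV(pdev);

    /* MMIO region backed by the device model's callbacks, not guest RAM */
    memory_region_init_io(&s->mmio, OBJECT(s), &mydev_mmio_ops, s,
                          "mydev-mmio", 4 * KiB);

    /* expose it as BAR 0; the guest maps it somewhere in the pci/pci64 hole */
    pci_register_bar(pdev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &s->mmio);
}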
Ani Sinha Sept. 18, 2023, 11:04 a.m. UTC | #28
On Mon, Sep 18, 2023 at 4:32 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 18.09.23 13:00, Ani Sinha wrote:
> > On Mon, Sep 18, 2023 at 4:28 PM David Hildenbrand <david@redhat.com> wrote:
> >>
> >> On 18.09.23 12:54, Ani Sinha wrote:
> >>> On Mon, Sep 18, 2023 at 3:49 PM David Hildenbrand <david@redhat.com> wrote:
> >>>>
> >>>> On 18.09.23 12:11, Ani Sinha wrote:
> >>>>
> >>>>>
> >>>>>       Ok hopefully my last question. I am still confused on something.
> >>>>>       Does the above mean that the hole64 will actually start from an
> >>>>>       address that is beyond maxram? Like basically if you added all of
> >>>>>       ram_below_4G, ram_above_4G, hot plug_mem and pci_hole64 then can it
> >>>>>       exceed maxram? I think it will. Does this not an issue?
> >>>>
> >>>> If you'd have a 2 GiB VM, the device memory region and hole64 would
> >>>> always be placed >= 4 GiB address, yes.
> >>>>
> >>>> As maxram is just a size, and not a PFN, I don't think there is any
> >>>> issue with that.
> >>>
> >>> So this is all just a scheme to decide what to place where with maxram
> >>> amount of memory available. When the processor needs to access the
> >>
> >> Yes. ram_size and maxram_size are only used to create the memory layout.
> >>
> >>> memory mapped PCI device, it's simply dynamically mapped to the
> >>> available physical ram. Is my understanding correct here?
> >>
> >> I'm no expert on that, but from my understanding that's what the
> >> pci/pci64 hole is for -- mapping PCI BARs into these areas, such that
> >> they don't conflict with actual guest RAM. That's why we still account
> >> these "holes" as valid GFN that could be used+accessed by the VM once a
> >> PCI BAR gets mapped in there.
> >
> > Yes that was my understanding too but since device drivers need to
> > access those BAR addresses, they need to be mapped to the actual
> > available physical ram.
>
> These devices supply their own RAM memory regions for BARs, which will
> get mapped into the holes. They are not using any boot/hotplug memory.
> It's confusing.

Yeah, exactly. The actual memory is on the device itself; the BARs just
describe where that device memory gets mapped.
diff mbox series

Patch

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 54838c0c41..f84e4c4916 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -31,6 +31,7 @@ 
 #include "hw/i386/topology.h"
 #include "hw/i386/fw_cfg.h"
 #include "hw/i386/vmport.h"
+#include "hw/mem/memory-device.h"
 #include "sysemu/cpus.h"
 #include "hw/block/fdc.h"
 #include "hw/ide/internal.h"
@@ -1006,6 +1007,17 @@  void pc_memory_init(PCMachineState *pcms,
         exit(EXIT_FAILURE);
     }
 
+    /*
+     * check if the VM started with more ram configured than max physical
+     * address available with the current processor.
+     */
+    if (machine->ram_size > maxphysaddr + 1) {
+        error_report("Address space limit 0x%"PRIx64" < 0x%"PRIx64
+                     " (max configured memory), phys-bits too low (%u)",
+                     maxphysaddr, machine->ram_size, cpu->phys_bits);
+        exit(EXIT_FAILURE);
+    }
+
     /*
      * Split single memory region and use aliases to address portions of it,
      * done for backwards compatibility with older qemus.
@@ -1845,6 +1857,38 @@  static bool pc_hotplug_allowed(MachineState *ms, DeviceState *dev, Error **errp)
     return true;
 }
 
+static bool pc_mem_hotplug_allowed(MachineState *ms,
+                                   MemoryRegion *mr, Error **errp)
+{
+    hwaddr maxphysaddr;
+    uint64_t dimm_size, size, ram_size, total_mem_size;
+    X86CPU *cpu = X86_CPU(first_cpu);
+
+    if (!mr) {
+        return true;
+    }
+
+    dimm_size = ms->device_memory->dimm_size;
+    size = memory_region_size(mr);
+    ram_size = (uint64_t) ms->ram_size;
+    total_mem_size = ram_size + dimm_size + size;
+
+    maxphysaddr = ((hwaddr)1 << cpu->phys_bits) - 1;
+
+    /*
+     * total memory after hotplug will exceed the maximum physical
+     * address limit of the processor. So hotplug cannot be allowed.
+     */
+    if ((total_mem_size > (uint64_t)maxphysaddr + 1) &&
+        (total_mem_size > ram_size + dimm_size)) {
+        error_setg(errp, "Address space limit 0x%"PRIx64" < 0x%"PRIx64
+                   " phys-bits too low (%u)",
+                   maxphysaddr, total_mem_size, cpu->phys_bits);
+        return false;
+    }
+    return true;
+}
+
 static void pc_machine_class_init(ObjectClass *oc, void *data)
 {
     MachineClass *mc = MACHINE_CLASS(oc);
@@ -1870,6 +1914,7 @@  static void pc_machine_class_init(ObjectClass *oc, void *data)
     assert(!mc->get_hotplug_handler);
     mc->get_hotplug_handler = pc_get_hotplug_handler;
     mc->hotplug_allowed = pc_hotplug_allowed;
+    mc->mem_hotplug_allowed = pc_mem_hotplug_allowed;
     mc->cpu_index_to_instance_props = x86_cpu_index_to_props;
     mc->get_default_cpu_node_id = x86_get_default_cpu_node_id;
     mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids;
diff --git a/hw/mem/memory-device.c b/hw/mem/memory-device.c
index 667d56bd29..825bc593ae 100644
--- a/hw/mem/memory-device.c
+++ b/hw/mem/memory-device.c
@@ -57,6 +57,7 @@  static void memory_device_check_addable(MachineState *ms, MemoryRegion *mr,
 {
     const uint64_t used_region_size = ms->device_memory->used_region_size;
     const uint64_t size = memory_region_size(mr);
+    MachineClass *mc = MACHINE_GET_CLASS(ms);
 
     /* we will need a new memory slot for kvm and vhost */
     if (kvm_enabled() && !kvm_has_free_slot(ms)) {
@@ -68,6 +69,11 @@  static void memory_device_check_addable(MachineState *ms, MemoryRegion *mr,
         return;
     }
 
+    if (mc->mem_hotplug_allowed &&
+        (!(mc->mem_hotplug_allowed(ms, mr, errp)))) {
+        return;
+    }
+
     /* will we exceed the total amount of memory specified */
     if (used_region_size + size < used_region_size ||
         used_region_size + size > ms->maxram_size - ms->ram_size) {
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 3b541ffd24..84b199ee51 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -210,6 +210,13 @@  typedef struct {
  *    false is returned, an error must be set to show the reason of
  *    the rejection.  If the hook is not provided, all hotplug will be
  *    allowed.
+ * @mem_hotplug_allowed:
+ *    If the hook is provided, then it'll be called for each memory device
+ *    hotplug to check whether the mem device hotplug is allowed.  Return
+ *    true to grant allowance or false to reject the hotplug.  When
+ *    false is returned, an error must be set to show the reason of
+ *    the rejection.  If the hook is not provided, all mem hotplug will be
+ *    allowed.
  * @default_ram_id:
  *    Specifies inital RAM MemoryRegion name to be used for default backend
  *    creation if user explicitly hasn't specified backend with "memory-backend"
@@ -285,6 +292,8 @@  struct MachineClass {
                                            DeviceState *dev);
     bool (*hotplug_allowed)(MachineState *state, DeviceState *dev,
                             Error **errp);
+    bool (*mem_hotplug_allowed)(MachineState *state, MemoryRegion *mr,
+                                Error **errp);
     CpuInstanceProperties (*cpu_index_to_instance_props)(MachineState *machine,
                                                          unsigned cpu_index);
     const CPUArchIdList *(*possible_cpu_arch_ids)(MachineState *machine);
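To make the arithmetic in pc_mem_hotplug_allowed() above concrete, here is a
small self-contained sketch of the same bound check with illustrative
numbers. It is simplified -- the patch's separate dimm_size/size terms are
folded into two plain sizes, and the extra (total_mem_size > ram_size +
dimm_size) guard is omitted:

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define GiB (1ULL << 30)

static bool mem_hotplug_allowed(unsigned phys_bits, uint64_t ram_size,
                                uint64_t plugged_size, uint64_t new_dimm_size)
{
    uint64_t maxphysaddr = (1ULL << phys_bits) - 1;
    uint64_t total = ram_size + plugged_size + new_dimm_size;

    /* reject the hotplug if the total would exceed the addressable range */
    if (total > maxphysaddr + 1) {
        fprintf(stderr,
                "Address space limit 0x%" PRIx64 " < 0x%" PRIx64
                " phys-bits too low (%u)\n",
                maxphysaddr, total, phys_bits);
        return false;
    }
    return true;
}

int main(void)
{
    /* phys-bits=36 -> 64 GiB addressable; 4 GiB boot RAM, 32 GiB already
     * plugged, adding another 32 GiB -> 68 GiB total -> rejected */
    return mem_hotplug_allowed(36, 4 * GiB, 32 * GiB, 32 * GiB) ? 0 : 1;
}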