Message ID | 1348226255-4226-7-git-send-email-vasilis.liaskovitis@profitbricks.com |
---|---|
State | New |
Headers | show |
On Fri, Sep 21, 2012 at 11:17 AM, Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com> wrote: > Example: > "-dimm id=dimm0,size=512M,node=0,populated=off" There should not be a need to introduce a new top level option, instead you should just use -device, like -device dimm,base=0,id=dimm0,size=512M,node=0,populated=off That would also specify the start address. > will define a 512M memory slot belonging to numa node 0. > > When "populated=on", a DimmDevice is created and hot-plugged at system startup. > > Signed-off-by: Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com> > --- > hw/Makefile.objs | 2 +- > qemu-config.c | 25 +++++++++++++++++++++++++ > qemu-options.hx | 5 +++++ > sysemu.h | 1 + > vl.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ > 5 files changed, 82 insertions(+), 1 deletions(-) > > diff --git a/hw/Makefile.objs b/hw/Makefile.objs > index 6dfebd2..8c5c39a 100644 > --- a/hw/Makefile.objs > +++ b/hw/Makefile.objs > @@ -26,7 +26,7 @@ hw-obj-$(CONFIG_I8254) += i8254_common.o i8254.o > hw-obj-$(CONFIG_PCSPK) += pcspk.o > hw-obj-$(CONFIG_PCKBD) += pckbd.o > hw-obj-$(CONFIG_FDC) += fdc.o > -hw-obj-$(CONFIG_ACPI) += acpi.o acpi_piix4.o > +hw-obj-$(CONFIG_ACPI) += acpi.o acpi_piix4.o dimm.o > hw-obj-$(CONFIG_APM) += pm_smbus.o apm.o > hw-obj-$(CONFIG_DMA) += dma.o > hw-obj-$(CONFIG_I82374) += i82374.o > diff --git a/qemu-config.c b/qemu-config.c > index eba977e..4022d64 100644 > --- a/qemu-config.c > +++ b/qemu-config.c > @@ -646,6 +646,30 @@ QemuOptsList qemu_boot_opts = { > }, > }; > > +static QemuOptsList qemu_dimm_opts = { > + .name = "dimm", > + .head = QTAILQ_HEAD_INITIALIZER(qemu_dimm_opts.head), > + .desc = { > + { > + .name = "id", > + .type = QEMU_OPT_STRING, > + .help = "id of this dimm device", > + },{ > + .name = "size", > + .type = QEMU_OPT_SIZE, > + .help = "memory size for this dimm", > + },{ > + .name = "populated", > + .type = QEMU_OPT_BOOL, > + .help = "populated for this dimm", > + },{ > + .name = "node", > + .type 
= QEMU_OPT_NUMBER, > + .help = "NUMA node number (i.e. proximity) for this dimm", > + }, > + { /* end of list */ } > + }, > +}; > static QemuOptsList *vm_config_groups[32] = { > &qemu_drive_opts, > &qemu_chardev_opts, > @@ -662,6 +686,7 @@ static QemuOptsList *vm_config_groups[32] = { > &qemu_boot_opts, > &qemu_iscsi_opts, > &qemu_sandbox_opts, > + &qemu_dimm_opts, > NULL, > }; > > diff --git a/qemu-options.hx b/qemu-options.hx > index 804a2d1..3687722 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -2842,3 +2842,8 @@ HXCOMM This is the last statement. Insert new options before this line! > STEXI > @end table > ETEXI > + > +DEF("dimm", HAS_ARG, QEMU_OPTION_dimm, > + "-dimm id=dimmid,size=sz,node=nd,populated=on|off\n" > + "specify memory dimm device with name dimmid, size sz on node nd", > + QEMU_ARCH_ALL) > diff --git a/sysemu.h b/sysemu.h > index 65552ac..7baf9c9 100644 > --- a/sysemu.h > +++ b/sysemu.h > @@ -139,6 +139,7 @@ extern QEMUClock *rtc_clock; > extern int nb_numa_nodes; > extern uint64_t node_mem[MAX_NODES]; > extern unsigned long *node_cpumask[MAX_NODES]; > +extern int nb_hp_dimms; > > #define MAX_OPTION_ROMS 16 > typedef struct QEMUOptionRom { > diff --git a/vl.c b/vl.c > index 7c577fa..af1745c 100644 > --- a/vl.c > +++ b/vl.c > @@ -126,6 +126,7 @@ int main(int argc, char **argv) > #include "hw/xen.h" > #include "hw/qdev.h" > #include "hw/loader.h" > +#include "hw/dimm.h" > #include "bt-host.h" > #include "net.h" > #include "net/slirp.h" > @@ -248,6 +249,7 @@ QTAILQ_HEAD(, FWBootEntry) fw_boot_order = QTAILQ_HEAD_INITIALIZER(fw_boot_order > int nb_numa_nodes; > uint64_t node_mem[MAX_NODES]; > unsigned long *node_cpumask[MAX_NODES]; > +int nb_hp_dimms; This counter (if needed) should be private to dimm.c. 
> > uint8_t qemu_uuid[16]; > > @@ -530,6 +532,37 @@ static void configure_rtc_date_offset(const char *startdate, int legacy) > } > } > > +static void configure_dimm(QemuOpts *opts) > +{ > + const char *id; > + uint64_t size, node; > + bool populated; > + QemuOpts *devopts; > + char buf[256]; > + if (nb_hp_dimms == MAX_DIMMS) { Why should there be any limit of DIMMS? Please use lists etc. to avoid restrictions. > + fprintf(stderr, "qemu: maximum number of DIMMs (%d) exceeded\n", > + MAX_DIMMS); > + exit(1); > + } > + id = qemu_opts_id(opts); > + size = qemu_opt_get_size(opts, "size", DEFAULT_DIMMSIZE); > + populated = qemu_opt_get_bool(opts, "populated", 0); > + node = qemu_opt_get_number(opts, "node", 0); > + > + dimm_config_create((char*)id, size, node, nb_hp_dimms, 0); > + > + if (populated) { > + devopts = qemu_opts_create(qemu_find_opts("device"), id, 0, NULL); > + qemu_opt_set(devopts, "driver", "dimm"); > + snprintf(buf, sizeof(buf), "%lu", size); > + qemu_opt_set(devopts, "size", buf); > + snprintf(buf, sizeof(buf), "%lu", node); > + qemu_opt_set(devopts, "node", buf); > + qemu_opt_set(devopts, "bus", "membus"); > + } > + nb_hp_dimms++; > +} > + > static void configure_rtc(QemuOpts *opts) > { > const char *value; > @@ -2354,6 +2387,8 @@ int main(int argc, char **argv, char **envp) > DisplayChangeListener *dcl; > int cyls, heads, secs, translation; > QemuOpts *hda_opts = NULL, *opts, *machine_opts; > + QemuOpts *dimm_opts[MAX_DIMMS]; > + int nb_dimm_opts = 0; > QemuOptsList *olist; > int optind; > const char *optarg; > @@ -3288,6 +3323,18 @@ int main(int argc, char **argv, char **envp) > exit(0); > } > break; > + case QEMU_OPTION_dimm: > + if (nb_dimm_opts == MAX_DIMMS) { > + fprintf(stderr, "qemu: maximum number of DIMMs (%d) exceeded\n", > + MAX_DIMMS); > + } > + dimm_opts[nb_dimm_opts] = > + qemu_opts_parse(qemu_find_opts("dimm"), optarg, 0); > + if (!dimm_opts[nb_dimm_opts]) { > + exit(1); > + } > + nb_dimm_opts++; > + break; > default: > 
os_parse_cmd_args(popt->index, optarg); > } > @@ -3611,6 +3658,9 @@ int main(int argc, char **argv, char **envp) > } > qemu_add_globals(); > > + for (i = 0; i < nb_dimm_opts; i++) Missing braces, please read CODING_STYLE. > + configure_dimm(dimm_opts[i]); > + > qdev_machine_init(); > > machine->init(ram_size, boot_devices, > -- > 1.7.9 >
On Sat, Sep 22, 2012 at 01:46:57PM +0000, Blue Swirl wrote: > On Fri, Sep 21, 2012 at 11:17 AM, Vasilis Liaskovitis > <vasilis.liaskovitis@profitbricks.com> wrote: > > Example: > > "-dimm id=dimm0,size=512M,node=0,populated=off" > > There should not be a need to introduce a new top level option, > instead you should just use -device, like > -device dimm,base=0,id=dimm0,size=512M,node=0,populated=off > > That would also specify the start address. What is "base"? The start address? I think the start address should be calculated by the chipset / board, not by the user. The "-dimm" option is supposed to specify the dimm/memory layout, and not create any devices. If we don't want this new option, I have a question: A "-device/device_add" means we create a new qdev device at startup or as a hotplug operation respectively. So, the semantics of "-device dimm,id=dimm0,size=512M,node=0,populated=on" are clear to me. What does "-device dimm,populated=off" mean from a qdev perspective? There are 2 alternatives: - The device is created on the dimmbus, but is not used/populated yet. Then the activation/acpi-hotplug of the dimm may require a separate command (we used to have "dimm_add" in versions < 3). "device_add" handling always hotplugs a new qdev device, so this wouldn't fit this usecase, because the device already exists. In this case, the actual "acpi hotplug" operation is decoupled from qdev device creation. - The dimmdevice is not created when "-device dimm,populated=off" (this would require some ugly checking in normal -device argument handling). Only the dimm layout is saved. The hotplug is triggered from a normal device_add later. So in this case, the "acpi hotplug" happens at the same time as the qdev hotplug. Do you see a simpler alternative without introducing a new option? Using the "-dimm" option follows the second semantic and avoids changing the "-device" semantics. 
Dimm layout description is decoupled from dimmdevice creation, and qdev hotplug coincides with acpi hotplug. thanks, - Vasilis
On Mon, Sep 24, 2012 at 10:42 AM, Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com> wrote: > On Sat, Sep 22, 2012 at 01:46:57PM +0000, Blue Swirl wrote: >> On Fri, Sep 21, 2012 at 11:17 AM, Vasilis Liaskovitis >> <vasilis.liaskovitis@profitbricks.com> wrote: >> > Example: >> > "-dimm id=dimm0,size=512M,node=0,populated=off" >> >> There should not be a need to introduce a new top level option, >> instead you should just use -device, like >> -device dimm,base=0,id=dimm0,size=512M,node=0,populated=off >> >> That would also specify the start address. > > What is "base"? the start address? I think the start address should be calculated by the > chipset / board, not by the user. Yes. > > The "-dimm" option is supposed to specify the dimm/memory layout, and not create > any devices. > > If we don't want this new option, I have a question: > > A "-device/device_add" means we create a new qdev device at startup or as a > hotplug operation respectively. So, the semantics of > "-device dimm,id=dimm0,size=512M,node=0,populated=on" are clear to me. > > What does "-device dimm,populated=off" mean from a qdev perspective? There are 2 > alternatives: > > - The device is created on the dimmbus, but is not used/populated yet. Than the > activation/acpi-hotplug of the dimm may require a separate command (we used to have > "dimm_add" in versions < 3). "device_add" handling always hotplugs a new qdev > device, so this wouldn't fit this usecase, because the device already exists. In > this case, the actual "acpi hotplug" operation is decoupled from qdev device > creation. The bus exists but the devices do not, device_add would add DIMMs to the bus. This matches PCI bus created by the host bridge and PCI device hotplug. A more complex setup would be dimm bus, dimm slot devices and DIMM devices. The intermediate slot device would contain one DIMM device if plugged. 
> > - The dimmdevice is not created when "-device dimm,populated=off" (this would > require some ugly checking in normal -device argument handling). Only the dimm > layout is saved. The hotplug is triggered from a normal device_add later. So in > this case, the "acpi hotplug" happens at the same time as the qdev hotplug. > > Do you see a simpler alternative without introducing a new option? > > Using the "-dimm" option follows the second semantic and avoids changing the "-device" > semantics. Dimm layout description is decoupled from dimmdevice creation, and qdev > hotplug coincides with acpi hotplug. Maybe even the dimmbus device shouldn't exist by itself after all, or it should be pretty much invisible to users. On real HW, the memory controller or south bridge handles the memory. For i440fx, it's part of the same chipset. So I think we should just add qdev properties to i440fx to specify the sizes, nodes etc. Then i440fx should create the dimmbus device unconditionally using the properties. The default properties should create a sane configuration, otherwise -global i440fx.dimm_size=512M etc. could be used. Then the bus would be populated as before or with device_add. > > thanks, > > - Vasilis
Hi, sorry for the delayed answer. On Sat, Sep 29, 2012 at 11:13:04AM +0000, Blue Swirl wrote: > > > > The "-dimm" option is supposed to specify the dimm/memory layout, and not create > > any devices. > > > > If we don't want this new option, I have a question: > > > > A "-device/device_add" means we create a new qdev device at startup or as a > > hotplug operation respectively. So, the semantics of > > "-device dimm,id=dimm0,size=512M,node=0,populated=on" are clear to me. > > > > What does "-device dimm,populated=off" mean from a qdev perspective? There are 2 > > alternatives: > > > > - The device is created on the dimmbus, but is not used/populated yet. Than the > > activation/acpi-hotplug of the dimm may require a separate command (we used to have > > "dimm_add" in versions < 3). "device_add" handling always hotplugs a new qdev > > device, so this wouldn't fit this usecase, because the device already exists. In > > this case, the actual "acpi hotplug" operation is decoupled from qdev device > > creation. > > The bus exists but the devices do not, device_add would add DIMMs to > the bus. This matches PCI bus created by the host bridge and PCI > device hotplug. > > A more complex setup would be dimm bus, dimm slot devices and DIMM > devices. The intermediate slot device would contain one DIMM device if > plugged. Interesting, I haven't thought about this alternative. It does sound overly complex, but a dimmslot / dimmdevice splitup could consolidate hotplug semantic differences between populated=on/off. Something similar to the dimmslot device is already present in v3 (dimmcfg structure), but it's not a qdev visible device. I 'd rather avoid the complication, but I might revisit this idea. > > > > > - The dimmdevice is not created when "-device dimm,populated=off" (this would > > require some ugly checking in normal -device argument handling). Only the dimm > > layout is saved. The hotplug is triggered from a normal device_add later. 
So in > > this case, the "acpi hotplug" happens at the same time as the qdev hotplug. > > > > Do you see a simpler alternative without introducing a new option? > > > > Using the "-dimm" option follows the second semantic and avoids changing the "-device" > > semantics. Dimm layout description is decoupled from dimmdevice creation, and qdev > > hotplug coincides with acpi hotplug. > > Maybe even the dimmbus device shouldn't exist by itself after all, or > it should be pretty much invisible to users. On real HW, the memory > controller or south bridge handles the memory. For i440fx, it's part > of the same chipset. So I think we should just add qdev properties to > i440fx to specify the sizes, nodes etc. Then i440fx should create the > dimmbus device unconditionally using the properties. The default > properties should create a sane configuration, otherwise -global > i440fx.dimm_size=512M etc. could be used. Then the bus would be > populated as before or with device_add. hmm the problem with using only i440fx properties, is that size/nodes look dimm specific to me, not chipset-memcontroller specific. Unless we only allow uniform size dimms. Is it possible to have a dynamic list of sizes/nodes pairs as properties of a qdev device? Also if there is no dimmbus, and instead we have only links<> from i440fx to dimm-devices, would the current qdev hotplug API be enough? I am currently leaning towards this: i440fx unconditionally creates the dimmbus. Users don't have to specify the bus (i assume this is what you mean by "dimmbus should be invisible to the users") We only use "-device dimm" to describe dimms. With "-device dimm,populated=off", only the dimm config layout will be saved in the dimmbus. The hotplug is triggered from a normal device_add later (same as pci hotplug). thanks, - Vasilis
On Tue, Oct 9, 2012 at 5:04 PM, Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com> wrote: > Hi, > > sorry for the delayed answer. > > On Sat, Sep 29, 2012 at 11:13:04AM +0000, Blue Swirl wrote: >> > >> > The "-dimm" option is supposed to specify the dimm/memory layout, and not create >> > any devices. >> > >> > If we don't want this new option, I have a question: >> > >> > A "-device/device_add" means we create a new qdev device at startup or as a >> > hotplug operation respectively. So, the semantics of >> > "-device dimm,id=dimm0,size=512M,node=0,populated=on" are clear to me. >> > >> > What does "-device dimm,populated=off" mean from a qdev perspective? There are 2 >> > alternatives: >> > >> > - The device is created on the dimmbus, but is not used/populated yet. Than the >> > activation/acpi-hotplug of the dimm may require a separate command (we used to have >> > "dimm_add" in versions < 3). "device_add" handling always hotplugs a new qdev >> > device, so this wouldn't fit this usecase, because the device already exists. In >> > this case, the actual "acpi hotplug" operation is decoupled from qdev device >> > creation. >> >> The bus exists but the devices do not, device_add would add DIMMs to >> the bus. This matches PCI bus created by the host bridge and PCI >> device hotplug. >> >> A more complex setup would be dimm bus, dimm slot devices and DIMM >> devices. The intermediate slot device would contain one DIMM device if >> plugged. > > interesting, I haven't thought about this alternative. It does sounds overly > complex, but a dimmslot / dimmdevice splitup could consolidate hotplug semantic > differences between populated=on/off. Something similar to the dimmslot device > is already present in v3 (dimmcfg structure), but it's not a qdev visible device. > I 'd rather avoid the complication, but i might revisit this idea. The memory controller could be able to also enable and disable slots independently to their population state. 
> >> >> > >> > - The dimmdevice is not created when "-device dimm,populated=off" (this would >> > require some ugly checking in normal -device argument handling). Only the dimm >> > layout is saved. The hotplug is triggered from a normal device_add later. So in >> > this case, the "acpi hotplug" happens at the same time as the qdev hotplug. >> > >> > Do you see a simpler alternative without introducing a new option? >> > >> > Using the "-dimm" option follows the second semantic and avoids changing the "-device" >> > semantics. Dimm layout description is decoupled from dimmdevice creation, and qdev >> > hotplug coincides with acpi hotplug. >> >> Maybe even the dimmbus device shouldn't exist by itself after all, or >> it should be pretty much invisible to users. On real HW, the memory >> controller or south bridge handles the memory. For i440fx, it's part >> of the same chipset. So I think we should just add qdev properties to >> i440fx to specify the sizes, nodes etc. Then i440fx should create the >> dimmbus device unconditionally using the properties. The default >> properties should create a sane configuration, otherwise -global >> i440fx.dimm_size=512M etc. could be used. Then the bus would be >> populated as before or with device_add. > > hmm the problem with using only i440fx properties, is that size/nodes look > dimm specific to me, not chipset-memcontroller specific. Unless we only allow > uniform size dimms. Is it possible to have a dynamic list of sizes/nodes pairs as > properties of a qdev device? I don't think so, but probably there's a limit of DIMMs that real controllers have, something like 8 max. > > Also if there is no dimmbus, and instead we have only links<> from i440fx to dimm-devices, > would the current qdev hotplug API be enough? I'd just disable hotplug if there is no dimmbus (ISA PC?). > > I am currently leaning towards this: i440fx unconditionally creates the dimmbus. 
Users > don't have to specify the bus (i assume this is what you mean by "dimmbus should > be invisible to the users") > > We only use "-device dimm" to describe dimms. With "-device dimm,populated=off", only > the dimm config layout will be saved in the dimmbus. The hotplug is triggered from a normal > device_add later (same as pci hotplug). OK. > > thanks, > > - Vasilis
On Sat, Oct 13, 2012 at 08:57:19AM +0000, Blue Swirl wrote: > On Tue, Oct 9, 2012 at 5:04 PM, Vasilis Liaskovitis > <vasilis.liaskovitis@profitbricks.com> wrote: > >> snip > >> Maybe even the dimmbus device shouldn't exist by itself after all, or > >> it should be pretty much invisible to users. On real HW, the memory > >> controller or south bridge handles the memory. For i440fx, it's part > >> of the same chipset. So I think we should just add qdev properties to > >> i440fx to specify the sizes, nodes etc. Then i440fx should create the > >> dimmbus device unconditionally using the properties. The default > >> properties should create a sane configuration, otherwise -global > >> i440fx.dimm_size=512M etc. could be used. Then the bus would be > >> populated as before or with device_add. > > > > hmm the problem with using only i440fx properties, is that size/nodes look > > dimm specific to me, not chipset-memcontroller specific. Unless we only allow > > uniform size dimms. Is it possible to have a dynamic list of sizes/nodes pairs as > > properties of a qdev device? > > I don't think so, but probably there's a limit of DIMMs that real > controllers have, something like 8 max. In the case of i440fx specifically, do you mean that we should model the DRB (Dram row boundary registers in section 3.2.19 of the i440fx spec) ? The i440fx DRB registers only supports up to 8 DRAM rows (let's say 1 row maps 1-1 to a DimmDevice for this discussion) and only supports up to 2GB of memory afaict (bit 31 and above is ignored). I 'd rather not model this part of the i440fx - having only 8 DIMMs seems too restrictive. The rest of the patchset supports up to 255 DIMMs so it would be a waste imho to model an old pc memory controller that only supports 8 DIMMs. 
There was also an old discussion about i440fx modeling here: https://lists.nongnu.org/archive/html/qemu-devel/2011-07/msg02705.html the general direction was that i440fx is too old and we don't want to precisely emulate the DRB registers, since they lack flexibility. Possible solutions: 1) is there a newer and more flexible chipset that we could model? 2) model and document a generic (non-existent) i440fx that would support more and larger DIMMs. E.g. support 255 DIMMs. If we want to use a description similar to the i440fx DRB registers, the registers would take up a lot of space. In i440fx there is one 8-bit DRB register per DIMM, and DRB[i] describes how many 8MB chunks are contained in DIMMs 0...i. So, the register values are cumulative (and total described memory cannot exceed 256x8MB = 2GB) We could for example model: - an 8-bit non-cumulative register for each DIMM, denoting how many 128MB chunks it contains. This allows 32GB for each DIMM, and with 255 DIMMs we describe a bit less than 8TB. These registers require 255 bytes. - a 16-bit cumulative register for each DIMM again for 128MB chunks. This allows us to describe 8TB of memory (but the registers take up double the space, because they describe cumulative memory amounts) 3) let everything be handled/abstracted by dimmbus - the chipset DRB modelling is not done (at least for i440fx, other machines could). This is the least precise in terms of emulation. On the other hand, if we are not really trying to emulate the real (too restrictive) hardware, does it matter? thanks, - Vasilis
On 10/17/2012 11:19 AM, Vasilis Liaskovitis wrote: >> >> I don't think so, but probably there's a limit of DIMMs that real >> controllers have, something like 8 max. > > In the case of i440fx specifically, do you mean that we should model the DRB > (Dram row boundary registers in section 3.2.19 of the i440fx spec) ? > > The i440fx DRB registers only supports up to 8 DRAM rows (let's say 1 row > maps 1-1 to a DimmDevice for this discussion) and only supports up to 2GB of > memory afaict (bit 31 and above is ignored). > > I 'd rather not model this part of the i440fx - having only 8 DIMMs seems too > restrictive. The rest of the patchset supports up to 255 DIMMs so it would be a > waste imho to model an old pc memory controller that only supports 8 DIMMs. > > There was also an old discussion about i440fx modeling here: > https://lists.nongnu.org/archive/html/qemu-devel/2011-07/msg02705.html > the general direction was that i440fx is too old and we don't want to precisely > emulate the DRB registers, since they lack flexibility. > > Possible solutions: > > 1) is there a newer and more flexible chipset that we could model? Look for q35 on this list. > > 2) model and document ^--- the critical bit > a generic (non-existent) i440fx that would support more > and larger DIMMs. E.g. support 255 DIMMs. If we want to use a description > similar to the i440fx DRB registers, the registers would take up a lot of space. > In i440fx there is one 8-bit DRB register per DIMM, and DRB[i] describes how > many 8MB chunks are contained in DIMMs 0...i. So, the register values are > cumulative (and total described memory cannot exceed 256x8MB = 2GB) Our i440fx has already been extended by support for pci and cpu hotplug, and I see no reason not to extend it for memory. We can allocate extra mmio space for registers if needed. Usually I'm against this sort of thing, but in this case we don't have much choice. 
> > We could for example model: > - an 8-bit non-cumulative register for each DIMM, denoting how many > 128MB chunks it contains. This allowes 32GB for each DIMM, and with 255 DIMMs we > describe a bit less than 8TB. These registers require 255 bytes. > - a 16-bit cumulative register for each DIMM again for 128MB chunks. This allows > us to describe 8TB of memory (but the registers take up double the space, because > they describe cumulative memory amounts) There is no reason to save space. Why not have two 64-bit registers per DIMM, one describing the size and the other the base address, both in bytes? Use a few low order bits for control. > > 3) let everything be handled/abstracted by dimmbus - the chipset DRB modelling > is not done (at least for i440fx, other machines could). This is the least precise > in terms of emulation. On the other hand, if we are not really trying to emulate > the real (too restrictive) hardware, does it matter? We could emulate base memory using the chipset, and extra memory using the scheme above. This allows guests that are tied to the chipset to work, and guests that have more awareness (seabios) to use the extra features.
On Wed, Oct 17, 2012 at 12:03:51PM +0200, Avi Kivity wrote: > On 10/17/2012 11:19 AM, Vasilis Liaskovitis wrote: > >> > >> I don't think so, but probably there's a limit of DIMMs that real > >> controllers have, something like 8 max. > > > > In the case of i440fx specifically, do you mean that we should model the DRB > > (Dram row boundary registers in section 3.2.19 of the i440fx spec) ? > > > > The i440fx DRB registers only supports up to 8 DRAM rows (let's say 1 row > > maps 1-1 to a DimmDevice for this discussion) and only supports up to 2GB of > > memory afaict (bit 31 and above is ignored). > > > > I 'd rather not model this part of the i440fx - having only 8 DIMMs seems too > > restrictive. The rest of the patchset supports up to 255 DIMMs so it would be a > > waste imho to model an old pc memory controller that only supports 8 DIMMs. > > > > There was also an old discussion about i440fx modeling here: > > https://lists.nongnu.org/archive/html/qemu-devel/2011-07/msg02705.html > > the general direction was that i440fx is too old and we don't want to precisely > > emulate the DRB registers, since they lack flexibility. > > > > Possible solutions: > > > > 1) is there a newer and more flexible chipset that we could model? > > Look for q35 on this list. thanks, I 'll take a look. It sounds like the other options below are more straightforward now, but let me know if you prefer q35 integration as a priority. > > > > > 2) model and document > ^--- the critical bit > > > a generic (non-existent) i440fx that would support more > > and larger DIMMs. E.g. support 255 DIMMs. If we want to use a description > > similar to the i440fx DRB registers, the registers would take up a lot of space. > > In i440fx there is one 8-bit DRB register per DIMM, and DRB[i] describes how > > many 8MB chunks are contained in DIMMs 0...i. 
So, the register values are > > cumulative (and total described memory cannot exceed 256x8MB = 2GB) > > Our i440fx has already been extended by support for pci and cpu hotplug, > and I see no reason not to extend it for memory. We can allocate extra > mmio space for registers if needed. Usually I'm against this sort of > thing, but in this case we don't have much choice. ok > > > > > We could for example model: > > - an 8-bit non-cumulative register for each DIMM, denoting how many > > 128MB chunks it contains. This allowes 32GB for each DIMM, and with 255 DIMMs we > > describe a bit less than 8TB. These registers require 255 bytes. > > - a 16-bit cumulative register for each DIMM again for 128MB chunks. This allows > > us to describe 8TB of memory (but the registers take up double the space, because > > they describe cumulative memory amounts) > > There is no reason to save space. Why not have two 64-bit registers per > DIMM, one describing the size and the other the base address, both in > bytes? Use a few low order bits for control. Do we want this generic scheme above to be tied into the i440fx/pc machine? Or have it as a separate generic memory bus / pmc usable by others (e.g. in hw/dimm.c)? The 64-bit values you describe are already part of DimmDevice properties, but they are not hardware registers described as part of a chipset. In terms of control bits, did you want to mimic some other chipset registers? - any examples would be useful. > > > > > 3) let everything be handled/abstracted by dimmbus - the chipset DRB modelling > > is not done (at least for i440fx, other machines could). This is the least precise > > in terms of emulation. On the other hand, if we are not really trying to emulate > > the real (too restrictive) hardware, does it matter? > > We could emulate base memory using the chipset, and extra memory using > the scheme above. 
This allows guests that are tied to the chipset to > work, and guests that have more awareness (seabios) to use the extra > features. But if we use the real i440fx pmc DRBs for base memory, this means base memory would be <= 2GB, right? Sounds like we 'd need to change the DRBs anyway to describe useful amounts of base memory (e.g. 512MB chunks and check against address lines [36:29] can describe base memory up to 64GB, though that's still limiting for very large VMs). But we'd be diverting from the real hardware again. Then we can model base memory with "tweaked" i440fx pmc's DRB registers - we could only use DRB[0] (one DIMM describing all of base memory) or more. DIMMs would be allowed to be hotplugged in the generic mem-controller scheme only (unless it makes sense to allow hotplug in the remaining pmc DRBs and start using the generic scheme once we run out of emulated DRBs) thanks, - Vasilis
On 10/18/2012 11:27 AM, Vasilis Liaskovitis wrote: > On Wed, Oct 17, 2012 at 12:03:51PM +0200, Avi Kivity wrote: >> On 10/17/2012 11:19 AM, Vasilis Liaskovitis wrote: >> >> >> >> I don't think so, but probably there's a limit of DIMMs that real >> >> controllers have, something like 8 max. >> > >> > In the case of i440fx specifically, do you mean that we should model the DRB >> > (Dram row boundary registers in section 3.2.19 of the i440fx spec) ? >> > >> > The i440fx DRB registers only supports up to 8 DRAM rows (let's say 1 row >> > maps 1-1 to a DimmDevice for this discussion) and only supports up to 2GB of >> > memory afaict (bit 31 and above is ignored). >> > >> > I 'd rather not model this part of the i440fx - having only 8 DIMMs seems too >> > restrictive. The rest of the patchset supports up to 255 DIMMs so it would be a >> > waste imho to model an old pc memory controller that only supports 8 DIMMs. >> > >> > There was also an old discussion about i440fx modeling here: >> > https://lists.nongnu.org/archive/html/qemu-devel/2011-07/msg02705.html >> > the general direction was that i440fx is too old and we don't want to precisely >> > emulate the DRB registers, since they lack flexibility. >> > >> > Possible solutions: >> > >> > 1) is there a newer and more flexible chipset that we could model? >> >> Look for q35 on this list. > > thanks, I 'll take a look. It sounds like the other options below are more > straightforward now, but let me know if you prefer q35 integration as a priority. At least validate that what you're doing fits with how q35 works. >> >> > >> > We could for example model: >> > - an 8-bit non-cumulative register for each DIMM, denoting how many >> > 128MB chunks it contains. This allowes 32GB for each DIMM, and with 255 DIMMs we >> > describe a bit less than 8TB. These registers require 255 bytes. >> > - a 16-bit cumulative register for each DIMM again for 128MB chunks. 
This allows >> > us to describe 8TB of memory (but the registers take up double the space, because >> > they describe cumulative memory amounts) >> >> There is no reason to save space. Why not have two 64-bit registers per >> DIMM, one describing the size and the other the base address, both in >> bytes? Use a few low order bits for control. > > Do we want this generic scheme above to be tied into the i440fx/pc machine? Yes. q35 should work according to its own specifications. > Or have it as a separate generic memory bus / pmc usable by others (e.g. in > hw/dimm.c)? > The 64-bit values you describe are already part of DimmDevice properties, but > they are not hardware registers described as part of a chipset. > > In terms of control bits, did you want to mimic some other chipset registers? - > any examples would be useful. I don't have any real requirements. Just make it simple and easily accessible to ACPI code. > >> >> > >> > 3) let everything be handled/abstracted by dimmbus - the chipset DRB modelling >> > is not done (at least for i440fx, other machines could). This is the least precise >> > in terms of emulation. On the other hand, if we are not really trying to emulate >> > the real (too restrictive) hardware, does it matter? >> >> We could emulate base memory using the chipset, and extra memory using >> the scheme above. This allows guests that are tied to the chipset to >> work, and guests that have more awareness (seabios) to use the extra >> features. > > But if we use the real i440fx pmc DRBs for base memory, this means base memory > would be <= 2GB, right? > > Sounds like we 'd need to change the DRBs anyway to describe useful amounts of > base memory (e.g. 512MB chunks and check against address lines [36:29] can > describe base memory up to 64GB, though that's still limiting for very large > VMs). But we'd be diverting from the real hardware again. Then there's no point. 
Modelling real hardware allows guests written to work against that hardware to function correctly. If you diverge, they won't. > > Then we can model base memory with "tweaked" i440fx pmc's DRB registers - we > could only use DRB[0] (one DIMM describing all of base memory) or more. > > DIMMs would be allowed to be hotplugged in the generic mem-controller scheme only > (unless it makes sense to allow hotplug in the remaining pmc DRBs and > start using the generic scheme once we run out of emulated DRBs) > 440fx seems a lost cause, so we can go wild and just implement pv dimms. For q35 I'd like to stay within the spec.
On Thu, Oct 18, 2012 at 12:33 PM, Avi Kivity <avi@redhat.com> wrote: > On 10/18/2012 11:27 AM, Vasilis Liaskovitis wrote: >> On Wed, Oct 17, 2012 at 12:03:51PM +0200, Avi Kivity wrote: >>> On 10/17/2012 11:19 AM, Vasilis Liaskovitis wrote: >>> >> >>> >> I don't think so, but probably there's a limit of DIMMs that real >>> >> controllers have, something like 8 max. >>> > >>> > In the case of i440fx specifically, do you mean that we should model the DRB >>> > (Dram row boundary registers in section 3.2.19 of the i440fx spec) ? >>> > >>> > The i440fx DRB registers only supports up to 8 DRAM rows (let's say 1 row >>> > maps 1-1 to a DimmDevice for this discussion) and only supports up to 2GB of >>> > memory afaict (bit 31 and above is ignored). >>> > >>> > I 'd rather not model this part of the i440fx - having only 8 DIMMs seems too >>> > restrictive. The rest of the patchset supports up to 255 DIMMs so it would be a >>> > waste imho to model an old pc memory controller that only supports 8 DIMMs. >>> > >>> > There was also an old discussion about i440fx modeling here: >>> > https://lists.nongnu.org/archive/html/qemu-devel/2011-07/msg02705.html >>> > the general direction was that i440fx is too old and we don't want to precisely >>> > emulate the DRB registers, since they lack flexibility. >>> > >>> > Possible solutions: >>> > >>> > 1) is there a newer and more flexible chipset that we could model? >>> >>> Look for q35 on this list. >> >> thanks, I 'll take a look. It sounds like the other options below are more >> straightforward now, but let me know if you prefer q35 integration as a priority. > > At least validate that what you're doing fits with how q35 works. > >>> >>> > >>> > We could for example model: >>> > - an 8-bit non-cumulative register for each DIMM, denoting how many >>> > 128MB chunks it contains. This allowes 32GB for each DIMM, and with 255 DIMMs we >>> > describe a bit less than 8TB. These registers require 255 bytes. 
>>> > - a 16-bit cumulative register for each DIMM again for 128MB chunks. This allows >>> > us to describe 8TB of memory (but the registers take up double the space, because >>> > they describe cumulative memory amounts) >>> >>> There is no reason to save space. Why not have two 64-bit registers per >>> DIMM, one describing the size and the other the base address, both in >>> bytes? Use a few low order bits for control. >> >> Do we want this generic scheme above to be tied into the i440fx/pc machine? > > Yes. q35 should work according to its own specifications. > >> Or have it as a separate generic memory bus / pmc usable by others (e.g. in >> hw/dimm.c)? >> The 64-bit values you describe are already part of DimmDevice properties, but >> they are not hardware registers described as part of a chipset. >> >> In terms of control bits, did you want to mimic some other chipset registers? - >> any examples would be useful. > > I don't have any real requirements. Just make it simple and easily > accessible to ACPI code. > >> >>> >>> > >>> > 3) let everything be handled/abstracted by dimmbus - the chipset DRB modelling >>> > is not done (at least for i440fx, other machines could). This is the least precise >>> > in terms of emulation. On the other hand, if we are not really trying to emulate >>> > the real (too restrictive) hardware, does it matter? >>> >>> We could emulate base memory using the chipset, and extra memory using >>> the scheme above. This allows guests that are tied to the chipset to >>> work, and guests that have more awareness (seabios) to use the extra >>> features. >> >> But if we use the real i440fx pmc DRBs for base memory, this means base memory >> would be <= 2GB, right? >> >> Sounds like we 'd need to change the DRBs anyway to describe useful amounts of >> base memory (e.g. 512MB chunks and check against address lines [36:29] can >> describe base memory up to 64GB, though that's still limiting for very large >> VMs). 
But we'd be diverting from the real hardware again. > > Then there's no point. Modelling real hardware allows guests written to > work against that hardware to function correctly. If you diverge, they > won't. The guest is also unlikely to want to reprogram the memory controller. > >> >> Then we can model base memory with "tweaked" i440fx pmc's DRB registers - we >> could only use DRB[0] (one DIMM describing all of base memory) or more. >> >> DIMMs would be allowed to be hotplugged in the generic mem-controller scheme only >> (unless it makes sense to allow hotplug in the remaining pmc DRBs and >> start using the generic scheme once we run out of emulated DRBs) >> > > 440fx seems a lost cause, so we can go wild and just implement pv dimms. Maybe. But what would be a PV DIMM? Do we need any DIMM-like granularity at all? Instead, the guest could be told to use a list of RAM regions with arbitrary start and end addresses? Isn't ballooning also related? > For q35 I'd like to stay within the spec. That may not last forever when machines have terabytes of memory. > > -- > error compiling committee.c: too many arguments to function
Hi, On Thu, Oct 18, 2012 at 02:33:02PM +0200, Avi Kivity wrote: > On 10/18/2012 11:27 AM, Vasilis Liaskovitis wrote: > > On Wed, Oct 17, 2012 at 12:03:51PM +0200, Avi Kivity wrote: > >> On 10/17/2012 11:19 AM, Vasilis Liaskovitis wrote: > >> >> > >> >> I don't think so, but probably there's a limit of DIMMs that real > >> >> controllers have, something like 8 max. > >> > > >> > In the case of i440fx specifically, do you mean that we should model the DRB > >> > (Dram row boundary registers in section 3.2.19 of the i440fx spec) ? > >> > > >> > The i440fx DRB registers only supports up to 8 DRAM rows (let's say 1 row > >> > maps 1-1 to a DimmDevice for this discussion) and only supports up to 2GB of > >> > memory afaict (bit 31 and above is ignored). > >> > > >> > I 'd rather not model this part of the i440fx - having only 8 DIMMs seems too > >> > restrictive. The rest of the patchset supports up to 255 DIMMs so it would be a > >> > waste imho to model an old pc memory controller that only supports 8 DIMMs. > >> > > >> > There was also an old discussion about i440fx modeling here: > >> > https://lists.nongnu.org/archive/html/qemu-devel/2011-07/msg02705.html > >> > the general direction was that i440fx is too old and we don't want to precisely > >> > emulate the DRB registers, since they lack flexibility. > >> > > >> > Possible solutions: > >> > > >> > 1) is there a newer and more flexible chipset that we could model? > >> > >> Look for q35 on this list. > > > > thanks, I 'll take a look. It sounds like the other options below are more > > straightforward now, but let me know if you prefer q35 integration as a priority. > > At least validate that what you're doing fits with how q35 works. In terms of pmc modeling, the q35 page http://wiki.qemu.org/Features/Q35 mentions: Refactor i440fx to create i440fx-pmc class ich9: model ICH9 Super I/O chip ich9: make i440fx-pmc a generic PCNorthBridge class and add support for ich9 northbridge is this still the plan? 
There was an old patchset creating i440fx-pmc here: http://lists.gnu.org/archive/html/qemu-devel/2012-01/msg03501.html but I am not sure whether it has been dropped or is still being worked on. v3 of the q35 patchset doesn't include a pmc, I think. It would be good to know the current plan regarding pmc modeling (for both q35 and i440fx). thanks, - Vasilis
On 10/19/2012 07:48 PM, Blue Swirl wrote: >>> >>> DIMMs would be allowed to be hotplugged in the generic mem-controller scheme only >>> (unless it makes sense to allow hotplug in the remaining pmc DRBs and >>> start using the generic scheme once we run out of emulated DRBs) >>> >> >> 440fx seems a lost cause, so we can go wild and just implement pv dimms. > > Maybe. But what would be a PV DIMM? Do we need any DIMM-like > granularity at all, instead the guest could be told to use a list of > RAM regions with arbitrary start and end addresses? Guests are likely to support something that has the same constraints as real hardware. If we allow non-power-of-two DIMMs, we might find that guests don't support them well. > Isn't ballooning > also related? It is related in that it is also a memory hotplug technology. But ballooning is subtractive and fine-grained where classic hotplug is additive and coarse grained. We can use both together, but I don't think any work is needed at the qemu level. > >> For q35 I'd like to stay within the spec. > > That may not last forever when machines have terabytes of memory. At least there's work for chipset implementers. Or we can do PV-DIMMs for q35 too.
diff --git a/hw/Makefile.objs b/hw/Makefile.objs index 6dfebd2..8c5c39a 100644 --- a/hw/Makefile.objs +++ b/hw/Makefile.objs @@ -26,7 +26,7 @@ hw-obj-$(CONFIG_I8254) += i8254_common.o i8254.o hw-obj-$(CONFIG_PCSPK) += pcspk.o hw-obj-$(CONFIG_PCKBD) += pckbd.o hw-obj-$(CONFIG_FDC) += fdc.o -hw-obj-$(CONFIG_ACPI) += acpi.o acpi_piix4.o +hw-obj-$(CONFIG_ACPI) += acpi.o acpi_piix4.o dimm.o hw-obj-$(CONFIG_APM) += pm_smbus.o apm.o hw-obj-$(CONFIG_DMA) += dma.o hw-obj-$(CONFIG_I82374) += i82374.o diff --git a/qemu-config.c b/qemu-config.c index eba977e..4022d64 100644 --- a/qemu-config.c +++ b/qemu-config.c @@ -646,6 +646,30 @@ QemuOptsList qemu_boot_opts = { }, }; +static QemuOptsList qemu_dimm_opts = { + .name = "dimm", + .head = QTAILQ_HEAD_INITIALIZER(qemu_dimm_opts.head), + .desc = { + { + .name = "id", + .type = QEMU_OPT_STRING, + .help = "id of this dimm device", + },{ + .name = "size", + .type = QEMU_OPT_SIZE, + .help = "memory size for this dimm", + },{ + .name = "populated", + .type = QEMU_OPT_BOOL, + .help = "populated for this dimm", + },{ + .name = "node", + .type = QEMU_OPT_NUMBER, + .help = "NUMA node number (i.e. proximity) for this dimm", + }, + { /* end of list */ } + }, +}; static QemuOptsList *vm_config_groups[32] = { &qemu_drive_opts, &qemu_chardev_opts, @@ -662,6 +686,7 @@ static QemuOptsList *vm_config_groups[32] = { &qemu_boot_opts, &qemu_iscsi_opts, &qemu_sandbox_opts, + &qemu_dimm_opts, NULL, }; diff --git a/qemu-options.hx b/qemu-options.hx index 804a2d1..3687722 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -2842,3 +2842,8 @@ HXCOMM This is the last statement. Insert new options before this line! 
STEXI @end table ETEXI + +DEF("dimm", HAS_ARG, QEMU_OPTION_dimm, + "-dimm id=dimmid,size=sz,node=nd,populated=on|off\n" + "specify memory dimm device with name dimmid, size sz on node nd", + QEMU_ARCH_ALL) diff --git a/sysemu.h b/sysemu.h index 65552ac..7baf9c9 100644 --- a/sysemu.h +++ b/sysemu.h @@ -139,6 +139,7 @@ extern QEMUClock *rtc_clock; extern int nb_numa_nodes; extern uint64_t node_mem[MAX_NODES]; extern unsigned long *node_cpumask[MAX_NODES]; +extern int nb_hp_dimms; #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/vl.c b/vl.c index 7c577fa..af1745c 100644 --- a/vl.c +++ b/vl.c @@ -126,6 +126,7 @@ int main(int argc, char **argv) #include "hw/xen.h" #include "hw/qdev.h" #include "hw/loader.h" +#include "hw/dimm.h" #include "bt-host.h" #include "net.h" #include "net/slirp.h" @@ -248,6 +249,7 @@ QTAILQ_HEAD(, FWBootEntry) fw_boot_order = QTAILQ_HEAD_INITIALIZER(fw_boot_order int nb_numa_nodes; uint64_t node_mem[MAX_NODES]; unsigned long *node_cpumask[MAX_NODES]; +int nb_hp_dimms; uint8_t qemu_uuid[16]; @@ -530,6 +532,37 @@ static void configure_rtc_date_offset(const char *startdate, int legacy) } } +static void configure_dimm(QemuOpts *opts) +{ + const char *id; + uint64_t size, node; + bool populated; + QemuOpts *devopts; + char buf[256]; + if (nb_hp_dimms == MAX_DIMMS) { + fprintf(stderr, "qemu: maximum number of DIMMs (%d) exceeded\n", + MAX_DIMMS); + exit(1); + } + id = qemu_opts_id(opts); + size = qemu_opt_get_size(opts, "size", DEFAULT_DIMMSIZE); + populated = qemu_opt_get_bool(opts, "populated", 0); + node = qemu_opt_get_number(opts, "node", 0); + + dimm_config_create((char*)id, size, node, nb_hp_dimms, 0); + + if (populated) { + devopts = qemu_opts_create(qemu_find_opts("device"), id, 0, NULL); + qemu_opt_set(devopts, "driver", "dimm"); + snprintf(buf, sizeof(buf), "%lu", size); + qemu_opt_set(devopts, "size", buf); + snprintf(buf, sizeof(buf), "%lu", node); + qemu_opt_set(devopts, "node", buf); + qemu_opt_set(devopts, "bus", 
"membus"); + } + nb_hp_dimms++; +} + static void configure_rtc(QemuOpts *opts) { const char *value; @@ -2354,6 +2387,8 @@ int main(int argc, char **argv, char **envp) DisplayChangeListener *dcl; int cyls, heads, secs, translation; QemuOpts *hda_opts = NULL, *opts, *machine_opts; + QemuOpts *dimm_opts[MAX_DIMMS]; + int nb_dimm_opts = 0; QemuOptsList *olist; int optind; const char *optarg; @@ -3288,6 +3323,18 @@ int main(int argc, char **argv, char **envp) exit(0); } break; + case QEMU_OPTION_dimm: + if (nb_dimm_opts == MAX_DIMMS) { + fprintf(stderr, "qemu: maximum number of DIMMs (%d) exceeded\n", + MAX_DIMMS); + } + dimm_opts[nb_dimm_opts] = + qemu_opts_parse(qemu_find_opts("dimm"), optarg, 0); + if (!dimm_opts[nb_dimm_opts]) { + exit(1); + } + nb_dimm_opts++; + break; default: os_parse_cmd_args(popt->index, optarg); } @@ -3611,6 +3658,9 @@ int main(int argc, char **argv, char **envp) } qemu_add_globals(); + for (i = 0; i < nb_dimm_opts; i++) + configure_dimm(dimm_opts[i]); + qdev_machine_init(); machine->init(ram_size, boot_devices,
Example: "-dimm id=dimm0,size=512M,node=0,populated=off" will define a 512M memory slot belonging to numa node 0. When "populated=on", a DimmDevice is created and hot-plugged at system startup. Signed-off-by: Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com> --- hw/Makefile.objs | 2 +- qemu-config.c | 25 +++++++++++++++++++++++++ qemu-options.hx | 5 +++++ sysemu.h | 1 + vl.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 82 insertions(+), 1 deletions(-)