Message ID | 6b5ff346b23fba9a8707507fda7f9b71719a55be.1372234719.git.hutao@cn.fujitsu.com |
---|---|
State | New |
Headers | show |
Il 26/06/2013 11:13, Hu Tao ha scritto: > From: Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com> > > Signed-off-by: Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com> > Signed-off-by: Hu Tao <hutao@cn.fujitsu.com> > --- > vl.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 51 insertions(+) > > diff --git a/vl.c b/vl.c > index 767e020..9d88a79 100644 > --- a/vl.c > +++ b/vl.c > @@ -170,6 +170,7 @@ int main(int argc, char **argv) > > #include "ui/qemu-spice.h" > #include "qapi/string-input-visitor.h" > +#include "hw/mem-hotplug/dimm.h" > > //#define DEBUG_NET > //#define DEBUG_SLIRP > @@ -252,6 +253,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order = > int nb_numa_nodes; > uint64_t node_mem[MAX_NODES]; > unsigned long *node_cpumask[MAX_NODES]; > +int nb_hp_dimms; > > uint8_t qemu_uuid[16]; > > @@ -2338,6 +2340,50 @@ static int chardev_init_func(QemuOpts *opts, void *opaque) > return 0; > } > > +static int dimmcfg_init_func(QemuOpts *opts, void *opaque) > +{ > + const char *driver; > + const char *id; > + uint64_t node, size; > + uint32_t populated; > + const char *buf, *busbuf; > + > + /* DimmDevice configuration needs to be known in order to initialize chipset > + * with correct memory and pci ranges. But all devices are created after > + * chipset / machine initialization. In * order to avoid this problem, we > + * parse dimm information earlier into dimmcfg structs. */ > + > + driver = qemu_opt_get(opts, "driver"); > + if (!strcmp(driver, "dimm")) { > + > + id = qemu_opts_id(opts); > + buf = qemu_opt_get(opts, "size"); > + parse_option_size("size", buf, &size, NULL); > + buf = qemu_opt_get(opts, "node"); > + parse_option_number("node", buf, &node, NULL); > + busbuf = qemu_opt_get(opts, "bus"); > + buf = qemu_opt_get(opts, "populated"); > + if (!buf) { > + populated = 0; > + } else { > + populated = strcmp(buf, "on") ? 0 : 1; > + } > + > + dimm_config_create((char *)id, size, busbuf ? 
busbuf : "membus.0", > + node, nb_hp_dimms); > + > + /* if !populated, we just keep the config. The real device > + * will be created in the future with a normal device_add > + * command. */ > + if (!populated) { > + qemu_opts_del(opts); > + } I think you need another option than -device dimm. For example it could be declared together with the NUMA node. This could declare two NUMA nodes, each with 2G of populated and 2G of unpopulated RAM: -numa node,mem-range=0-2G,mem-range-hotplug=4G-6G \ -numa node,mem-range=2G-4G,mem-range-hotplug=6G-8G I'm not sure I like the names particularly though. CCing Eduardo, Bandan and Wanlong Gao. Paolo > + nb_hp_dimms++; > + } > + > + return 0; > +} > + > #ifdef CONFIG_VIRTFS > static int fsdev_init_func(QemuOpts *opts, void *opaque) > { > @@ -4260,6 +4306,11 @@ int main(int argc, char **argv, char **envp) > } > qemu_add_globals(); > > + /* init generic devices */ > + if (qemu_opts_foreach(qemu_find_opts("device"), > + dimmcfg_init_func, NULL, 1) != 0) { > + exit(1); > + } > qdev_machine_init(); > > QEMUMachineInitArgs args = { .ram_size = ram_size, >
On 06/26/2013 05:46 PM, Paolo Bonzini wrote: > Il 26/06/2013 11:13, Hu Tao ha scritto: >> From: Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com> >> >> Signed-off-by: Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com> >> Signed-off-by: Hu Tao <hutao@cn.fujitsu.com> >> --- >> vl.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 51 insertions(+) >> >> diff --git a/vl.c b/vl.c >> index 767e020..9d88a79 100644 >> --- a/vl.c >> +++ b/vl.c >> @@ -170,6 +170,7 @@ int main(int argc, char **argv) >> >> #include "ui/qemu-spice.h" >> #include "qapi/string-input-visitor.h" >> +#include "hw/mem-hotplug/dimm.h" >> >> //#define DEBUG_NET >> //#define DEBUG_SLIRP >> @@ -252,6 +253,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order = >> int nb_numa_nodes; >> uint64_t node_mem[MAX_NODES]; >> unsigned long *node_cpumask[MAX_NODES]; >> +int nb_hp_dimms; >> >> uint8_t qemu_uuid[16]; >> >> @@ -2338,6 +2340,50 @@ static int chardev_init_func(QemuOpts *opts, void *opaque) >> return 0; >> } >> >> +static int dimmcfg_init_func(QemuOpts *opts, void *opaque) >> +{ >> + const char *driver; >> + const char *id; >> + uint64_t node, size; >> + uint32_t populated; >> + const char *buf, *busbuf; >> + >> + /* DimmDevice configuration needs to be known in order to initialize chipset >> + * with correct memory and pci ranges. But all devices are created after >> + * chipset / machine initialization. In * order to avoid this problem, we >> + * parse dimm information earlier into dimmcfg structs. 
*/ >> + >> + driver = qemu_opt_get(opts, "driver"); >> + if (!strcmp(driver, "dimm")) { >> + >> + id = qemu_opts_id(opts); >> + buf = qemu_opt_get(opts, "size"); >> + parse_option_size("size", buf, &size, NULL); >> + buf = qemu_opt_get(opts, "node"); >> + parse_option_number("node", buf, &node, NULL); >> + busbuf = qemu_opt_get(opts, "bus"); >> + buf = qemu_opt_get(opts, "populated"); >> + if (!buf) { >> + populated = 0; >> + } else { >> + populated = strcmp(buf, "on") ? 0 : 1; >> + } >> + >> + dimm_config_create((char *)id, size, busbuf ? busbuf : "membus.0", >> + node, nb_hp_dimms); >> + >> + /* if !populated, we just keep the config. The real device >> + * will be created in the future with a normal device_add >> + * command. */ >> + if (!populated) { >> + qemu_opts_del(opts); >> + } > > I think you need another option than -device dimm. For example it could > be declared together with the NUMA node. This could declare two NUMA > nodes, each with 2G of populated and 2G of unpopulated RAM: > > -numa node,mem-range=0-2G,mem-range-hotplug=4G-6G \ > -numa node,mem-range=2G-4G,mem-range-hotplug=6G-8G > > I'm not sure I like the names particularly though. CCing Eduardo, > Bandan and Wanlong Gao. Do we really need to specify the memory range? I suspect that we can follow current design of normal memory in hot-plug memory. Currently, we just specify the size of normal memory in each node, and the range in normal memory is node by node. Then I think we can just specify the memory size of hot-plug in each node, then the hot-plug memory range is also node by node, and the whole hot-plug memory block is just located after the normal memory block. 
If so, the option can come like: -numa node,nodeid=0,mem=2G,cpus=0-1,mem-hotplug=2G,mem-policy=membind,mem-hostnode=0-1,mem-hotplug-policy=interleave,mem-hotplug-hostnode=1 -numa node,nodeid=1,mem=2G,cpus=2-3,mem-hotplug=2G,mem-policy=preferred,mem-hostnode=1,mem-hotplug-policy=membind,mem-hotplug-hostnode=0-1 And each hot-plug memory device size can be assigned through "-device dimm,size=1G", assume that we specify 4 hot-plug memory devices and each 1G, then first two devices as we ranged belong to node0, and other two belong to node1. Then the hot-plug memory will have no effect on current normal memory design. Thanks, Wanlong Gao > > Paolo > >> + nb_hp_dimms++; >> + } >> + >> + return 0; >> +} >> + >> #ifdef CONFIG_VIRTFS >> static int fsdev_init_func(QemuOpts *opts, void *opaque) >> { >> @@ -4260,6 +4306,11 @@ int main(int argc, char **argv, char **envp) >> } >> qemu_add_globals(); >> >> + /* init generic devices */ >> + if (qemu_opts_foreach(qemu_find_opts("device"), >> + dimmcfg_init_func, NULL, 1) != 0) { >> + exit(1); >> + } >> qdev_machine_init(); >> >> QEMUMachineInitArgs args = { .ram_size = ram_size, >> > >
Il 27/06/2013 07:08, Wanlong Gao ha scritto: > Do we really need to specify the memory range? I suspect that we can > follow current design of normal memory in hot-plug memory. I think we can do both. I'm afraid that the configuration of the VM will not be perfectly reproducible without specifying the range, more so if you allow hotplug. > Currently, > we just specify the size of normal memory in each node, and the range > in normal memory is node by node. Then I think we can just specify > the memory size of hot-plug in each node, then the hot-plug memory > range is also node by node, and the whole hot-plug memory block is > just located after the normal memory block. If so, the option can > come like: > -numa node,nodeid=0,mem=2G,cpus=0-1,mem-hotplug=2G,mem-policy=membind,mem-hostnode=0-1,mem-hotplug-policy=interleave,mem-hotplug-hostnode=1 > -numa node,nodeid=1,mem=2G,cpus=2-3,mem-hotplug=2G,mem-policy=preferred,mem-hostnode=1,mem-hotplug-policy=membind,mem-hotplug-hostnode=0-1 I think specifying different policies and bindings for normal and hotplug memory is too much fine-grained. If you really want that, then you would need something like -numa node,nodeid=0,cpus=0-1 \ -numa mem,nodeid=0,size=2G,policy=membind,hostnode=0-1 \ -numa mem,nodeid=0,size=2G,policy=interleave,hostnode=1,populated=no Hmm... this actually doesn't look too bad, and it is much more future-proof. Eduardo, what do you think about it? Should Wanlong redo his patches to support this "-numa mem" syntax? Parsing it should be easy using the QemuOpts visitor, too. Paolo
On Thu, 27 Jun 2013 08:55:25 +0200 Paolo Bonzini <pbonzini@redhat.com> wrote: > Il 27/06/2013 07:08, Wanlong Gao ha scritto: > > Do we really need to specify the memory range? I suspect that we can > > follow current design of normal memory in hot-plug memory. > > I think we can do both. I'm afraid that the configuration of the VM > will not be perfectly reproducible without specifying the range, more so > if you allow hotplug. > > > Currently, > > we just specify the size of normal memory in each node, and the range > > in normal memory is node by node. Then I think we can just specify > > the memory size of hot-plug in each node, then the hot-plug memory > > range is also node by node, and the whole hot-plug memory block is > > just located after the normal memory block. If so, the option can > > come like: > > -numa node,nodeid=0,mem=2G,cpus=0-1,mem-hotplug=2G,mem-policy=membind,mem-hostnode=0-1,mem-hotplug-policy=interleave,mem-hotplug-hostnode=1 > > -numa node,nodeid=1,mem=2G,cpus=2-3,mem-hotplug=2G,mem-policy=preferred,mem-hostnode=1,mem-hotplug-policy=membind,mem-hotplug-hostnode=0-1 > > I think specifying different policies and bindings for normal and > hotplug memory is too much fine-grained. If you really want that, then > you would need something like > > -numa node,nodeid=0,cpus=0-1 \ > -numa mem,nodeid=0,size=2G,policy=membind,hostnode=0-1 \ > -numa mem,nodeid=0,size=2G,policy=interleave,hostnode=1,populated=no > > Hmm... this actually doesn't look too bad, and it is much more > future-proof. Eduardo, what do you think about it? Should Wanlong redo > his patches to support this "-numa mem" syntax? Parsing it should be > easy using the QemuOpts visitor, too. Is hot-plugged memory and its bindings to numa nodes have to be specified at startup? 
How about extending -m option to support following syntax: -m initial_mem[, number_of_hotplug_dimms[, max_mem]] and dynamically forming MEMXX._CRS/_PXM resources on demand instead of creating static resources in SSDT. Without necessity to specify to be hot-plugged DIMMs at startup, hot-plug could become more flexible since arbitrarily sized DIMMs with required NUMA mapping could be specified during hot-plug time, for example: device_add dimm,id=dimm1,bus=membus.0,size=3G,node=1,... the only limit that would be left is a number of increments(DIMM slots), due to need to pre-generate ACPI memory devices that could supply _CRS/_PXM resources later. > Paolo > >
On Tue, Jul 09, 2013 at 06:53:47PM +0200, Igor Mammedov wrote: > On Thu, 27 Jun 2013 08:55:25 +0200 > Paolo Bonzini <pbonzini@redhat.com> wrote: > > > Il 27/06/2013 07:08, Wanlong Gao ha scritto: > > > Do we really need to specify the memory range? I suspect that we can > > > follow current design of normal memory in hot-plug memory. > > > > I think we can do both. I'm afraid that the configuration of the VM > > will not be perfectly reproducible without specifying the range, more so > > if you allow hotplug. > > > > > Currently, > > > we just specify the size of normal memory in each node, and the range > > > in normal memory is node by node. Then I think we can just specify > > > the memory size of hot-plug in each node, then the hot-plug memory > > > range is also node by node, and the whole hot-plug memory block is > > > just located after the normal memory block. If so, the option can > > > come like: > > > -numa node,nodeid=0,mem=2G,cpus=0-1,mem-hotplug=2G,mem-policy=membind,mem-hostnode=0-1,mem-hotplug-policy=interleave,mem-hotplug-hostnode=1 > > > -numa node,nodeid=1,mem=2G,cpus=2-3,mem-hotplug=2G,mem-policy=preferred,mem-hostnode=1,mem-hotplug-policy=membind,mem-hotplug-hostnode=0-1 > > > > I think specifying different policies and bindings for normal and > > hotplug memory is too much fine-grained. If you really want that, then > > you would need something like > > > > -numa node,nodeid=0,cpus=0-1 \ > > -numa mem,nodeid=0,size=2G,policy=membind,hostnode=0-1 \ > > -numa mem,nodeid=0,size=2G,policy=interleave,hostnode=1,populated=no > > > > Hmm... this actually doesn't look too bad, and it is much more > > future-proof. Eduardo, what do you think about it? Should Wanlong redo > > his patches to support this "-numa mem" syntax? Parsing it should be > > easy using the QemuOpts visitor, too. > > Is hot-plugged memory and its bindings to numa nodes have to be specified > at startup? 
> > How about extending -m option to support following syntax: > > -m initial_mem[, number_of_hotplug_dimms, max_mem]] > > and dynamically forming MEMXX._CRS/_PXM resources on demand instead > of creating static resources in SSDT. > > without necessity to specify to be hot-plugged DIMMs at startup, hot-plug > could become more flexible since arbitrarily sized DIMMs with required NUMA > mapping could be specified during hot-plug time, for example: > > device_add dimm,id=dimm1,bus=membus.0,size=3G,node=1,... Seems impossible as ACPI memory devices' ranges must be specified at startup, but I'll be glad if I'm wrong. > > the only limit would be left is a number of increments(DIMM slots), due to > need to pre-generate ACPI memory devices that could supply _CRS/_PXM > resources later. > > > > Paolo > > > > >
Il 12/07/2013 04:39, Hu Tao ha scritto: > > without necessity to specify to be hot-plugged DIMMs at startup, hot-plug > > could become more flexible since arbitrarily sized DIMMs with required NUMA > > mapping could be specified during hot-plug time, for example: > > > > device_add dimm,id=dimm1,bus=membus.0,size=3G,node=1,... > > Seems impossible as ACPI memory devices' ranges must be specified at startup, > but I'll be glad if I'm wrong. Yeah, I also understood the problem to be the SRAT, not the DSDT... Of course you could reserve a single large part of the address range in "-numa mem", and then add one or more DIMMs covering only part of the address range. Perhaps "populated" could be a size instead of a boolean, to do something like what Igor suggested. Paolo
Hi, On Thu, Jun 27, 2013 at 08:55:25AM +0200, Paolo Bonzini wrote: > Il 27/06/2013 07:08, Wanlong Gao ha scritto: > > Do we really need to specify the memory range? I suspect that we can > > follow current design of normal memory in hot-plug memory. > > I think we can do both. I'm afraid that the configuration of the VM > will not be perfectly reproducible without specifying the range, more so > if you allow hotplug. > > > Currently, > > we just specify the size of normal memory in each node, and the range > > in normal memory is node by node. Then I think we can just specify > > the memory size of hot-plug in each node, then the hot-plug memory > > range is also node by node, and the whole hot-plug memory block is > > just located after the normal memory block. If so, the option can > > come like: > > -numa node,nodeid=0,mem=2G,cpus=0-1,mem-hotplug=2G,mem-policy=membind,mem-hostnode=0-1,mem-hotplug-policy=interleave,mem-hotplug-hostnode=1 > > -numa node,nodeid=1,mem=2G,cpus=2-3,mem-hotplug=2G,mem-policy=preferred,mem-hostnode=1,mem-hotplug-policy=membind,mem-hotplug-hostnode=0-1 > > I think specifying different policies and bindings for normal and > hotplug memory is too much fine-grained. If you really want that, then > you would need something like > > -numa node,nodeid=0,cpus=0-1 \ > -numa mem,nodeid=0,size=2G,policy=membind,hostnode=0-1 \ > -numa mem,nodeid=0,size=2G,policy=interleave,hostnode=1,populated=no > > Hmm... this actually doesn't look too bad, and it is much more > future-proof. Eduardo, what do you think about it? Should Wanlong redo > his patches to support this "-numa mem" syntax? Parsing it should be > easy using the QemuOpts visitor, too. from what i understand, we are currently favoring this numa option? (I saw it mentioned in Gao's numa patchset series as well) There is still the question of "how many hotpluggable dimm devices does this memory range describe?". With the dimm device that was clearly defined, but not so with this option. 
Do we choose a default granularity e.g. 1 GB? Also, as you mentioned, without specifying the memory range, the VM configuration may be ambiguous. Currently, the VM memory map depends on the order of dimms defined on the command line. So: "-device dimm,id=dimm0,size=1G,node=0 -device dimm,id=dimm1,size=2G,node=0" and "-device dimm,id=dimm1,size=2G,node=0 -device dimm,id=dimm1,size=1G,node=0" assign different memory ranges to the dimms. On the other hand, iirc memory ranges were discussed with previous maintainers but was rejected: The user/management library may not want to know or simply does not know architectural details of the guest hardware. What happens if the user specifies memory on the PCI-hole? Do we bail out or adjust their arguments? Adjusting ranges might open another can of worms. In any case, it would be good to get a final consensus on this. thanks, - Vasilis
Il 15/07/2013 19:05, Vasilis Liaskovitis ha scritto: > from what i understand, we are currently favoring this numa option? (I saw it > mentioned in Gao's numa patchset series as well) The two patchsets have some overlap, so it's good to find a design that fits both. > There is still the question of "how many hotpluggable dimm devices does this > memory range describe?". With the dimm device that was clearly defined, but not > so with this option. Do we choose a default granularity e.g. 1 GB? I think it's the same. One "-numa mem" option = one "-device dimm" option; both define one range. Unused memory ranges may remain if you stumble upon a unusable range such as the PCI window. For example two "-numa mem,size=2G" options would allocate memory from 0 to 2G and from 4 to 6G. I'm snipping the rest of the email because I hope this covers all your doubts. Paolo
On Mon, Jul 15, 2013 at 07:10:30PM +0200, Paolo Bonzini wrote: > Il 15/07/2013 19:05, Vasilis Liaskovitis ha scritto: > > from what i understand, we are currently favoring this numa option? (I saw it > > mentioned in Gao's numa patchset series as well) > > The two patchsets have some overlap, so it's good to find a design that > fits both. > > > There is still the question of "how many hotpluggable dimm devices does this > > memory range describe?". With the dimm device that was clearly defined, but not > > so with this option. Do we choose a default granularity e.g. 1 GB? > > I think it's the same. One "-numa mem" option = one "-device dimm" > option; both define one range. Unused memory ranges may remain if you ah ok, I get the proposal now. > stumble upon a unusable range such as the PCI window. For example two > "-numa mem,size=2G" options would allocate memory from 0 to 2G and from > 4 to 6G. ok, that's done already. thanks, - Vasilis
On Sun, Jul 14, 2013 at 06:58:23PM +0200, Paolo Bonzini wrote: > Il 12/07/2013 04:39, Hu Tao ha scritto: > > > without necessity to specify to be hot-plugged DIMMs at startup, hot-plug > > > could become more flexible since arbitrarily sized DIMMs with required NUMA > > > mapping could be specified during hot-plug time, for example: > > > > > > device_add dimm,id=dimm1,bus=membus.0,size=3G,node=1,... > > > > Seems impossible as ACPI memory devices' ranges must be specified at startup, > > but I'll be glad if I'm wrong. > > Yeah, I also understood the problem to be the SRAT, not the DSDT... Of > course you could reserve a single large part of the address range in > "-numa mem", and then add one or more DIMMs covering only part of the > address range. still needs to specify -dimm at startup, right? > > Perhaps "populated" could be a size instead of a boolean, to do > something like what Igor suggested. seems odd that a memory device is partially populated. How can guest OS and firmware cope with that? > > Paolo
On Mon, Jul 15, 2013 at 07:10:30PM +0200, Paolo Bonzini wrote: > Il 15/07/2013 19:05, Vasilis Liaskovitis ha scritto: > > from what i understand, we are currently favoring this numa option? (I saw it > > mentioned in Gao's numa patchset series as well) > > The two patchsets have some overlap, so it's good to find a design that > fits both. > > > There is still the question of "how many hotpluggable dimm devices does this > > memory range describe?". With the dimm device that was clearly defined, but not > > so with this option. Do we choose a default granularity e.g. 1 GB? > > I think it's the same. One "-numa mem" option = one "-device dimm" > option; both define one range. Unused memory ranges may remain if you > stumble upon a unusable range such as the PCI window. For example two > "-numa mem,size=2G" options would allocate memory from 0 to 2G and from > 4 to 6G. So we can drop -dimm if we agree on -numa mem? > > I'm snipping the rest of the email because I hope this covers all your > doubts. > > Paolo
Il 16/07/2013 03:27, Hu Tao ha scritto: > > I think it's the same. One "-numa mem" option = one "-device dimm" > > option; both define one range. Unused memory ranges may remain if you > > stumble upon a unusable range such as the PCI window. For example two > > "-numa mem,size=2G" options would allocate memory from 0 to 2G and from > > 4 to 6G. > > So we can drop -dimm if we agree on -numa mem? Yes, the point of the "-numa mem" proposal was to avoid the concept of a "partially initialized device" that you had for DIMMs. BTW, how do you specify which module you are plugging in? I.e., what if I have three 1G ranges at 0, 1G and 2G, and I want to plug the first and the third? This is especially important with hot-unplug, because then you can have this kind of hole in the address space. If you migrate the VM, you have to reproduce the situation in the destination command line. Paolo
On Tue, Jul 16, 2013 at 08:19:48AM +0200, Paolo Bonzini wrote: > Il 16/07/2013 03:27, Hu Tao ha scritto: > > > I think it's the same. One "-numa mem" option = one "-device dimm" > > > option; both define one range. Unused memory ranges may remain if you > > > stumble upon a unusable range such as the PCI window. For example two > > > "-numa mem,size=2G" options would allocate memory from 0 to 2G and from > > > 4 to 6G. > > > > So we can drop -dimm if we agree on -numa mem? > > Yes, the point of the "-numa mem" proposal was to avoid the concept of a > "partially initialized device" that you had for DIMMs. > > BTW, how do you specify which module you are plugging in? I.e., what if > I have three 1G ranges at 0, 1G and 2G, and I want to plug the first and > the third? I think an id is still needed to identify ranges, which can be shown to user with `info numa' or similar command, along with the corresponding ranges. > > This is especially important with hot-unplug, because then you can have > this kind of hole in the address space. If you migrate the VM, you have > to reproduce the situation in the destination command line. > > Paolo >
On Tue, 16 Jul 2013 08:19:48 +0200 Paolo Bonzini <pbonzini@redhat.com> wrote: > Il 16/07/2013 03:27, Hu Tao ha scritto: > > > I think it's the same. One "-numa mem" option = one "-device dimm" > > > option; both define one range. Unused memory ranges may remain if you > > > stumble upon a unusable range such as the PCI window. For example two > > > "-numa mem,size=2G" options would allocate memory from 0 to 2G and from > > > 4 to 6G. > > > > So we can drop -dimm if we agree on -numa mem? > > Yes, the point of the "-numa mem" proposal was to avoid the concept of a > "partially initialized device" that you had for DIMMs. I've thought -numa mem was for mapping initial memory to numa nodes. It seem wrong to use it for representing dimm device and also limiting possible hotplugged regions to specified at startup ranges. we can leave -numa for initial memory mapping and manage of the mapping of hotpluggable regions with -device dimm,node=X,size=Y. In that case command line -device dimm will provide a fully initialized dimm device usable at startup (but hot-unplugable) and (monitor) device_add dimm,node=X,size=Y would serve hot-plug case. That way arbitrary sized dimm could be hot-pluged without specifying them at startup, like it's done on bare-metal. In addition command line -device would be used in migration case to describe already hot-plugged dimms on target. > BTW, how do you specify which module you are plugging in? I.e., what if > I have three 1G ranges at 0, 1G and 2G, and I want to plug the first and > the third? > This is especially important with hot-unplug, because then you can have > this kind of hole in the address space. If you migrate the VM, you have > to reproduce the situation in the destination command line. Could we add optional advising 'base-addr' property to replicate exact source configuration on target? something like: -device dimm,node=X,size=Y,base-addr=Z where mgmt could get actual X,Y,Z inspecting dimm device via qom-get. 
and if base-addr is not set parent of dimm would auto-allocate address where dimm is mapped. > > Paolo >
On Tue, 16 Jul 2013 15:27:19 +0800 Hu Tao <hutao@cn.fujitsu.com> wrote: > On Tue, Jul 16, 2013 at 08:19:48AM +0200, Paolo Bonzini wrote: > > Il 16/07/2013 03:27, Hu Tao ha scritto: > > > > I think it's the same. One "-numa mem" option = one "-device dimm" > > > > option; both define one range. Unused memory ranges may remain if you > > > > stumble upon a unusable range such as the PCI window. For example two > > > > "-numa mem,size=2G" options would allocate memory from 0 to 2G and from > > > > 4 to 6G. > > > > > > So we can drop -dimm if we agree on -numa mem? > > > > Yes, the point of the "-numa mem" proposal was to avoid the concept of a > > "partially initialized device" that you had for DIMMs. > > > > BTW, how do you specify which module you are plugging in? I.e., what if > > I have three 1G ranges at 0, 1G and 2G, and I want to plug the first and > > the third? > > I think an id is still needed to identify ranges, which can be shown to > user with `info numa' or similar command, along with the corresponding > ranges. "info numa" could get actual ranges enumerating present dimms if we would use plain -device for hotplug memory instead of -numa mem hacking. > > > > > This is especially important with hot-unplug, because then you can have > > this kind of hole in the address space. If you migrate the VM, you have > > to reproduce the situation in the destination command line. > > > > Paolo > > >
Il 16/07/2013 12:19, Igor Mammedov ha scritto: > On Tue, 16 Jul 2013 08:19:48 +0200 > Paolo Bonzini <pbonzini@redhat.com> wrote: > >> Il 16/07/2013 03:27, Hu Tao ha scritto: >>>> I think it's the same. One "-numa mem" option = one "-device dimm" >>>> option; both define one range. Unused memory ranges may remain if you >>>> stumble upon a unusable range such as the PCI window. For example two >>>> "-numa mem,size=2G" options would allocate memory from 0 to 2G and from >>>> 4 to 6G. >>> >>> So we can drop -dimm if we agree on -numa mem? >> >> Yes, the point of the "-numa mem" proposal was to avoid the concept of a >> "partially initialized device" that you had for DIMMs. > I've though -numa mem was for mapping initial memory to numa nodes. > It seem wrong to use it for representing dimm device and also limiting > possible hotplugged regions to specified at startup ranges. It's not for DIMM devices, it is for reserving areas of the address space for hot-plugged RAM. DIMM hotplug is done with "device_add dimm" (and you can also use "-numa mem,populated=no,... -device dimm,..." to start a VM with hot-unpluggable memory). > we can leave -numa for initial memory mapping and manage of the mapping > of hotpluggable regions with -device dimm,node=X,size=Y. > > It that case command line -device dimm will provide a fully initialized > dimm device usable at startup (but hot-unplugable) and > (monitor) device_add dimm,,node=X,size=Y > would serve hot-plug case. > > That way arbitrary sized dimm could be hot-pluged without specifying them > at startup, like it's done on bare-metal. But the memory ranges need to be specified at startup in the ACPI tables, and that's what "-numa mem" is for. > In addition command line -device would be used in migration case to describe > already hot-plugged dimms on target. Yep. Paolo
On Tue, 16 Jul 2013 12:31:46 +0200 Paolo Bonzini <pbonzini@redhat.com> wrote: > Il 16/07/2013 12:19, Igor Mammedov ha scritto: > > On Tue, 16 Jul 2013 08:19:48 +0200 > > Paolo Bonzini <pbonzini@redhat.com> wrote: > > > >> Il 16/07/2013 03:27, Hu Tao ha scritto: > >>>> I think it's the same. One "-numa mem" option = one "-device dimm" > >>>> option; both define one range. Unused memory ranges may remain if you > >>>> stumble upon a unusable range such as the PCI window. For example two > >>>> "-numa mem,size=2G" options would allocate memory from 0 to 2G and from > >>>> 4 to 6G. > >>> > >>> So we can drop -dimm if we agree on -numa mem? > >> > >> Yes, the point of the "-numa mem" proposal was to avoid the concept of a > >> "partially initialized device" that you had for DIMMs. > > I've though -numa mem was for mapping initial memory to numa nodes. > > It seem wrong to use it for representing dimm device and also limiting > > possible hotplugged regions to specified at startup ranges. > > It's not for DIMM devices, it is for reserving areas of the address > space for hot-plugged RAM. DIMM hotplug is done with "device_add dimm" > (and you can also use "-numa mem,populated=no,... -device dimm,..." to > start a VM with hot-unpluggable memory). There isn't a real need to reserve from ACPI pov, memory device in ACPI could provide _PXM() method to return mapping to numa node. And from my testing linux and windows guest are using it, even if there is unnecessary mapping in SRAT table overriding SRAT mapping with dynamic one. It would be better not to use "populated" concept at all. If there is -device dimm on cmd line, then it is populated and for hotplugged dimm all necessary information could be generated dynamically. > > we can leave -numa for initial memory mapping and manage of the mapping > > of hotpluggable regions with -device dimm,node=X,size=Y. 
> > > > It that case command line -device dimm will provide a fully initialized > > dimm device usable at startup (but hot-unplugable) and > > (monitor) device_add dimm,,node=X,size=Y > > would serve hot-plug case. > > > > That way arbitrary sized dimm could be hot-pluged without specifying them > > at startup, like it's done on bare-metal. > > But the memory ranges need to be specified at startup in the ACPI > tables, and that's what "-numa mem" is for. not really, there is caveat with windows, which needs a hotplugable SRAT entry that tells it max possible limit (otherwise windows sees new dimm device but refuses to use it saying "server is not configured for hotplug" or something like this), but as far as such entry exists, windows is happily uses dynamic _CRS() and _PXM() if they are below that limit (even if a new range is not in any range defined by SRAT). And ACPI spec doesn't say that SRAT MUST be populated with hotplug ranges. It's kind of simplier for bare-metal, where they might do it due to limited supported DIMM capacity by reserving static entries with max supported ranges per DIMM and know in advance DIMM count for platform. But actual _CRS() anyway dynamic since plugged in DIMM could have a smaller capacity then supported max for slot. To summarize ACPI + windows limitations: - ACPI needs to pre-allocate memory devices, i.e. number of possible increments OSPM could utilize. It might be possible to overcome limitation be using Load() or LoadTable() in runtime, but I haven't tried it. - Windows needs to know max supported limit, a fake entry in SRAT from RamSize to max_mem works nicely there (tested with ws2008r2DC and ws2012DC). That's why I was proposing to extend "-m" option for "slots" number (i.e. nr of memory devices) and 'max_mem' to make Windows happy and cap mgmt tools from going over initially configured limit. 
then -device dimm could be used for hotpluggable mem available at startup and device_add for adding more dimms with user-defined sizes to desired nodes at runtime. Works nicely without any need for 'populated=xxx' and predefined ranges. PS: I'll be able to post a more or less usable RFC that does it on top of mst's ACPI tables in QEMU by the end of this week. > > > In addition command line -device would be used in migration case to describe > > already hot-plugged dimms on target. > > Yep. > > Paolo
Il 16/07/2013 14:00, Igor Mammedov ha scritto: >>> we can leave -numa for initial memory mapping and manage of the mapping >>> of hotpluggable regions with -device dimm,node=X,size=Y. >>> >>> It that case command line -device dimm will provide a fully initialized >>> dimm device usable at startup (but hot-unplugable) and >>> (monitor) device_add dimm,,node=X,size=Y >>> would serve hot-plug case. >>> >>> That way arbitrary sized dimm could be hot-pluged without specifying them >>> at startup, like it's done on bare-metal. >> >> But the memory ranges need to be specified at startup in the ACPI >> tables, and that's what "-numa mem" is for. > not really, there is caveat with windows, which needs a hotplugable SRAT entry > that tells it max possible limit (otherwise windows sees new dimm device but > refuses to use it saying "server is not configured for hotplug" or something > like this), but as far as such entry exists, windows is happily uses dynamic > _CRS() and _PXM() if they are below that limit (even if a new range is not in > any range defined by SRAT). > > And ACPI spec doesn't say that SRAT MUST be populated with hotplug ranges. Right, what is required in ACPI is only pre-allocation of slots. > It's kind of simplier for bare-metal, where they might do it due to limited > supported DIMM capacity by reserving static entries with max supported ranges > per DIMM and know in advance DIMM count for platform. But actual _CRS() anyway > dynamic since plugged in DIMM could have a smaller capacity then supported > max for slot. > > To summarize ACPI + windows limitations: > - ACPI needs to pre-allocate memory devices, i.e. number of possible increments > OSPM could utilize. It might be possible to overcome limitation be using > Load() or LoadTable() in runtime, but I haven't tried it. > - Windows needs to know max supported limit, a fake entry in SRAT from RamSize > to max_mem works nicely there (tested with ws2008r2DC and ws2012DC). 
> > That's why I was proposing to extend "-m" option for "slots" number (i.e. nr of > memory devices) and 'max_mem' to make Windows happy and cap mgmt tools from > going over initially configured limit. As far as memory hot-plug is concerned, the "-numa mem" proposal is exactly the same thing that you are proposing, minus the ability to specify many slots in one go. The same "-numa mem" can be used for host node binding as well, but that's not really relevant for memory hot-plug. In case you want multiple hotpluggable ranges, bound to different host nodes, it doesn't matter if the SRAT has one or many fake entries. > then -device dimm could be used for hotpluggable mem available at startup > and device_add fir adding more dimms with user defined sizes to desired nodes > at runtime. Yes, no disagreement on this. > Works nice without any need for 'populated=xxx' and predefined ranges. > > PS: > I'll be able to post more or less usable RFC that does it on top of mst's ACPI > tables in QEMU by the end of this week. Good! Paolo >> >>> In addition command line -device would be used in migration case to describe >>> already hot-plugged dimms on target. >> >> Yep. >> >> Paolo >
diff --git a/vl.c b/vl.c index 767e020..9d88a79 100644 --- a/vl.c +++ b/vl.c @@ -170,6 +170,7 @@ int main(int argc, char **argv) #include "ui/qemu-spice.h" #include "qapi/string-input-visitor.h" +#include "hw/mem-hotplug/dimm.h" //#define DEBUG_NET //#define DEBUG_SLIRP @@ -252,6 +253,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order = int nb_numa_nodes; uint64_t node_mem[MAX_NODES]; unsigned long *node_cpumask[MAX_NODES]; +int nb_hp_dimms; uint8_t qemu_uuid[16]; @@ -2338,6 +2340,50 @@ static int chardev_init_func(QemuOpts *opts, void *opaque) return 0; } +static int dimmcfg_init_func(QemuOpts *opts, void *opaque) +{ + const char *driver; + const char *id; + uint64_t node, size; + uint32_t populated; + const char *buf, *busbuf; + + /* DimmDevice configuration needs to be known in order to initialize chipset + * with correct memory and pci ranges. But all devices are created after + * chipset / machine initialization. In * order to avoid this problem, we + * parse dimm information earlier into dimmcfg structs. */ + + driver = qemu_opt_get(opts, "driver"); + if (!strcmp(driver, "dimm")) { + + id = qemu_opts_id(opts); + buf = qemu_opt_get(opts, "size"); + parse_option_size("size", buf, &size, NULL); + buf = qemu_opt_get(opts, "node"); + parse_option_number("node", buf, &node, NULL); + busbuf = qemu_opt_get(opts, "bus"); + buf = qemu_opt_get(opts, "populated"); + if (!buf) { + populated = 0; + } else { + populated = strcmp(buf, "on") ? 0 : 1; + } + + dimm_config_create((char *)id, size, busbuf ? busbuf : "membus.0", + node, nb_hp_dimms); + + /* if !populated, we just keep the config. The real device + * will be created in the future with a normal device_add + * command. 
*/ + if (!populated) { + qemu_opts_del(opts); + } + nb_hp_dimms++; + } + + return 0; +} + #ifdef CONFIG_VIRTFS static int fsdev_init_func(QemuOpts *opts, void *opaque) { @@ -4260,6 +4306,11 @@ int main(int argc, char **argv, char **envp) } qemu_add_globals(); + /* init generic devices */ + if (qemu_opts_foreach(qemu_find_opts("device"), + dimmcfg_init_func, NULL, 1) != 0) { + exit(1); + } qdev_machine_init(); QEMUMachineInitArgs args = { .ram_size = ram_size,