Message ID | 20131024211249.723543071@amt.cnet |
---|---|
State | New |
Headers | show |
On 24 October 2013 22:12, Marcelo Tosatti <mtosatti@redhat.com> wrote: > Align guest physical address and host physical address > beyond guest 4GB on a 1GB boundary, in case hugetlbfs is used. > > Otherwise 1GB TLBs cannot be cached for the range. > + if (hpagesize == (1<<30)) { > + unsigned long holesize = 0x100000000ULL - below_4g_mem_size; > + > + memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram, > + 0x100000000ULL, > + above_4g_mem_size - holesize); > + memory_region_add_subregion(system_memory, 0x100000000ULL, > + ram_above_4g); > + > + ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo)); > + memory_region_init_alias(ram_above_4g_piecetwo, NULL, > + "ram-above-4g-piecetwo", ram, > + 0x100000000ULL - holesize, holesize); > + memory_region_add_subregion(system_memory, > + 0x100000000ULL + > + above_4g_mem_size - holesize, > + ram_above_4g_piecetwo); > + } else { > + memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram, > + below_4g_mem_size, above_4g_mem_size); > + memory_region_add_subregion(system_memory, 0x100000000ULL, > ram_above_4g); This looks pretty weird. Presence or absence of host OS features shouldn't affect how we model the guest hardware and RAM. Conversely, if hugetlbs have performance related requirements then a patch which only touches the x86 pc model seems rather limited. -- PMM
On Thu, Oct 24, 2013 at 10:55:54PM +0100, Peter Maydell wrote: > On 24 October 2013 22:12, Marcelo Tosatti <mtosatti@redhat.com> wrote: > > Align guest physical address and host physical address > > beyond guest 4GB on a 1GB boundary, in case hugetlbfs is used. > > > > Otherwise 1GB TLBs cannot be cached for the range. > > > + if (hpagesize == (1<<30)) { > > + unsigned long holesize = 0x100000000ULL - below_4g_mem_size; > > + > > + memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram, > > + 0x100000000ULL, > > + above_4g_mem_size - holesize); > > + memory_region_add_subregion(system_memory, 0x100000000ULL, > > + ram_above_4g); > > + > > + ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo)); > > + memory_region_init_alias(ram_above_4g_piecetwo, NULL, > > + "ram-above-4g-piecetwo", ram, > > + 0x100000000ULL - holesize, holesize); > > + memory_region_add_subregion(system_memory, > > + 0x100000000ULL + > > + above_4g_mem_size - holesize, > > + ram_above_4g_piecetwo); > > + } else { > > + memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram, > > + below_4g_mem_size, above_4g_mem_size); > > + memory_region_add_subregion(system_memory, 0x100000000ULL, > > ram_above_4g); > > This looks pretty weird. Presence or absence of host OS features > shouldn't affect how we model the guest hardware and RAM. This is not visible to the guest (read the comment in the patch). > Conversely, if hugetlbs have performance related requirements > then a patch which only touches the x86 pc model seems rather > limited. The requirement is that gpas and hpas must be aligned on hugepage boundaries. The memory region API allows gpas to be registered at custom hpas (via the offset parameter). It is not entirely clear what your request is.
Il 24/10/2013 22:12, Marcelo Tosatti ha scritto: > Align guest physical address and host physical address > beyond guest 4GB on a 1GB boundary, in case hugetlbfs is used. > > Otherwise 1GB TLBs cannot be cached for the range. > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com> > > Index: qemu/hw/i386/pc.c > =================================================================== > --- qemu.orig/hw/i386/pc.c > +++ qemu/hw/i386/pc.c > @@ -1116,8 +1116,9 @@ FWCfgState *pc_memory_init(MemoryRegion > { > int linux_boot, i; > MemoryRegion *ram, *option_rom_mr; > - MemoryRegion *ram_below_4g, *ram_above_4g; > + MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo; > FWCfgState *fw_cfg; > + unsigned long hpagesize; > > linux_boot = (kernel_filename != NULL); > > @@ -1129,6 +1130,7 @@ FWCfgState *pc_memory_init(MemoryRegion > memory_region_init_ram(ram, NULL, "pc.ram", > below_4g_mem_size + above_4g_mem_size); > vmstate_register_ram_global(ram); > + hpagesize = qemu_get_ram_hpagesize(ram->ram_addr); > *ram_memory = ram; > ram_below_4g = g_malloc(sizeof(*ram_below_4g)); > memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram, > @@ -1136,10 +1138,46 @@ FWCfgState *pc_memory_init(MemoryRegion > memory_region_add_subregion(system_memory, 0, ram_below_4g); > if (above_4g_mem_size > 0) { > ram_above_4g = g_malloc(sizeof(*ram_above_4g)); > - memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram, > - below_4g_mem_size, above_4g_mem_size); > - memory_region_add_subregion(system_memory, 0x100000000ULL, > + > + /* > + * > + * If 1GB hugepages are used to back guest RAM, map guest address > + * space in the range [ramsize,ramsize+holesize] to the ram block > + * range [holestart, 4GB] > + * > + * 0 h 4G [ramsize,ramsize+holesize] > + * > + * guest-addr-space [ ] [ ][xxx] > + * /----------/ > + * contiguous-ram-block [ ][xxx][ ] > + * > + * So that memory beyond 4GB is aligned on a 1GB boundary, > + * at the host physical address space. 
> + * > + */ > + if (hpagesize == (1<<30)) { > + unsigned long holesize = 0x100000000ULL - below_4g_mem_size; > + > + memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram, > + 0x100000000ULL, > + above_4g_mem_size - holesize); > + memory_region_add_subregion(system_memory, 0x100000000ULL, > + ram_above_4g); > + > + ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo)); > + memory_region_init_alias(ram_above_4g_piecetwo, NULL, > + "ram-above-4g-piecetwo", ram, > + 0x100000000ULL - holesize, holesize); > + memory_region_add_subregion(system_memory, > + 0x100000000ULL + > + above_4g_mem_size - holesize, > + ram_above_4g_piecetwo); Why break it in two? You can just allocate extra holesize bytes in the "ram" MemoryRegion, and not map the part that corresponds to [0x100000000ULL - holesize, 0x100000000ULL). Also, as Peter said this cannot depend on host considerations. Just do it unconditionally, but only for new machine types (pc-1.8 and q35-1.8, since unfortunately we're too close to hard freeze). Paolo > + } else { > + memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram, > + below_4g_mem_size, above_4g_mem_size); > + memory_region_add_subregion(system_memory, 0x100000000ULL, > ram_above_4g); > + } > }
On Fri, Oct 25, 2013 at 12:55:36AM +0100, Paolo Bonzini wrote: > > + if (hpagesize == (1<<30)) { > > + unsigned long holesize = 0x100000000ULL - below_4g_mem_size; > > + > > + memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram, > > + 0x100000000ULL, > > + above_4g_mem_size - holesize); > > + memory_region_add_subregion(system_memory, 0x100000000ULL, > > + ram_above_4g); > > + > > + ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo)); > > + memory_region_init_alias(ram_above_4g_piecetwo, NULL, > > + "ram-above-4g-piecetwo", ram, > > + 0x100000000ULL - holesize, holesize); > > + memory_region_add_subregion(system_memory, > > + 0x100000000ULL + > > + above_4g_mem_size - holesize, > > + ram_above_4g_piecetwo); > > Why break it in two? You can just allocate extra holesize bytes in the > "ram" MemoryRegion, and not map the part that corresponds to > [0x100000000ULL - holesize, 0x100000000ULL). - If the "ram" MemoryRegion is backed with 1GB hugepages, you might not want to allocate extra holesize bytes (which might require an entire 1GB page). - 1GB backed RAM can be mapped with 2MB pages. > Also, as Peter said this cannot depend on host considerations. Just do > it unconditionally, but only for new machine types (pc-1.8 and q35-1.8, > since unfortunately we're too close to hard freeze). Why the description of memory subregions and aliases are part of machine types?
Il 25/10/2013 05:58, Marcelo Tosatti ha scritto: > On Fri, Oct 25, 2013 at 12:55:36AM +0100, Paolo Bonzini wrote: >>> + if (hpagesize == (1<<30)) { >>> + unsigned long holesize = 0x100000000ULL - below_4g_mem_size; >>> + >>> + memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram, >>> + 0x100000000ULL, >>> + above_4g_mem_size - holesize); >>> + memory_region_add_subregion(system_memory, 0x100000000ULL, >>> + ram_above_4g); >>> + >>> + ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo)); >>> + memory_region_init_alias(ram_above_4g_piecetwo, NULL, >>> + "ram-above-4g-piecetwo", ram, >>> + 0x100000000ULL - holesize, holesize); >>> + memory_region_add_subregion(system_memory, >>> + 0x100000000ULL + >>> + above_4g_mem_size - holesize, >>> + ram_above_4g_piecetwo); >> >> Why break it in two? You can just allocate extra holesize bytes in the >> "ram" MemoryRegion, and not map the part that corresponds to >> [0x100000000ULL - holesize, 0x100000000ULL). > > - If the "ram" MemoryRegion is backed with 1GB hugepages, you might not > want to allocate extra holesize bytes (which might require an entire > 1GB page). > > - 1GB backed RAM can be mapped with 2MB pages. > >> Also, as Peter said this cannot depend on host considerations. Just do >> it unconditionally, but only for new machine types (pc-1.8 and q35-1.8, >> since unfortunately we're too close to hard freeze). > > Why the description of memory subregions and aliases are part of machine > types? It affects the migration stream, which stores RAM offsets instead of physical addresses. Let's say you have an 8 GB guest and the hole size is 0.25 GB. 
If the huge page size is 2MB, you have: Physical address Length RAM offsets 0 3.75 GB pc.ram @ 0 4 GB 4.25 GB pc.ram @ 3.75 GB If the huge page size is 1GB, you have: Physical address Length RAM offsets 0 3.75 GB pc.ram @ 0 4 GB 4 GB pc.ram @ 4 GB 8 GB 0.25 GB pc.ram @ 3.75 GB So your memory "rotates" around the 3.75 GB boundary when you migrate from a non-gbpages host to a gbpages host or vice versa. If we're doing it only for new machine types, it's even simpler to just have two RAM regions: Physical address Length RAM offsets 0 3.75 GB pc.ram-below-4g @ 0 4 GB 4.25 GB pc.ram-above-4g @ 0 Because offsets are zero, and lengths match the RAM block lengths, you do not need any complication with aliasing. This still has to be done only for new machine types. Paolo
On Fri, 25 Oct 2013 02:58:05 -0200 Marcelo Tosatti <mtosatti@redhat.com> wrote: > On Fri, Oct 25, 2013 at 12:55:36AM +0100, Paolo Bonzini wrote: > > > + if (hpagesize == (1<<30)) { > > > + unsigned long holesize = 0x100000000ULL - > > > below_4g_mem_size; + > > > + memory_region_init_alias(ram_above_4g, NULL, > > > "ram-above-4g", ram, > > > + 0x100000000ULL, > > > + above_4g_mem_size - > > > holesize); > > > + memory_region_add_subregion(system_memory, > > > 0x100000000ULL, > > > + ram_above_4g); > > > + > > > + ram_above_4g_piecetwo = > > > g_malloc(sizeof(*ram_above_4g_piecetwo)); > > > + memory_region_init_alias(ram_above_4g_piecetwo, NULL, > > > + "ram-above-4g-piecetwo", > > > ram, > > > + 0x100000000ULL - holesize, > > > holesize); > > > + memory_region_add_subregion(system_memory, > > > + 0x100000000ULL + > > > + above_4g_mem_size - > > > holesize, > > > + ram_above_4g_piecetwo); > > > > Why break it in two? You can just allocate extra holesize bytes in > > the "ram" MemoryRegion, and not map the part that corresponds to > > [0x100000000ULL - holesize, 0x100000000ULL). > > - If the "ram" MemoryRegion is backed with 1GB hugepages, you might > not want to allocate extra holesize bytes (which might require an > entire 1GB page). From POV of moddeling current "ram" as dimm devices, aliasing wouldn't work nice. But breaking one block in two or more is fine since then blocks could be represented as several dimm devices. +3Gb backend ram it could be split in blocks like this: [ 3Gb (1Gb pages backed) ] [tail1 (below_4gb - 3Gb) (2mb pages backed) ] [above_4gb whole X Gb pages (1Gb pages backed)] [tail2 (2mb pages backed)] > - 1GB backed RAM can be mapped with 2MB pages. > > > Also, as Peter said this cannot depend on host considerations. > > Just do it unconditionally, but only for new machine types (pc-1.8 > > and q35-1.8, since unfortunately we're too close to hard freeze). > > Why the description of memory subregions and aliases are part of > machine types? > >
On Fri, Oct 25, 2013 at 11:57:18AM +0200, igor Mammedov wrote: > On Fri, 25 Oct 2013 02:58:05 -0200 > Marcelo Tosatti <mtosatti@redhat.com> wrote: > > > On Fri, Oct 25, 2013 at 12:55:36AM +0100, Paolo Bonzini wrote: > > > > + if (hpagesize == (1<<30)) { > > > > + unsigned long holesize = 0x100000000ULL - > > > > below_4g_mem_size; + > > > > + memory_region_init_alias(ram_above_4g, NULL, > > > > "ram-above-4g", ram, > > > > + 0x100000000ULL, > > > > + above_4g_mem_size - > > > > holesize); > > > > + memory_region_add_subregion(system_memory, > > > > 0x100000000ULL, > > > > + ram_above_4g); > > > > + > > > > + ram_above_4g_piecetwo = > > > > g_malloc(sizeof(*ram_above_4g_piecetwo)); > > > > + memory_region_init_alias(ram_above_4g_piecetwo, NULL, > > > > + "ram-above-4g-piecetwo", > > > > ram, > > > > + 0x100000000ULL - holesize, > > > > holesize); > > > > + memory_region_add_subregion(system_memory, > > > > + 0x100000000ULL + > > > > + above_4g_mem_size - > > > > holesize, > > > > + ram_above_4g_piecetwo); > > > > > > Why break it in two? You can just allocate extra holesize bytes in > > > the "ram" MemoryRegion, and not map the part that corresponds to > > > [0x100000000ULL - holesize, 0x100000000ULL). > > > > - If the "ram" MemoryRegion is backed with 1GB hugepages, you might > > not want to allocate extra holesize bytes (which might require an > > entire 1GB page). > From POV of moddeling current "ram" as dimm devices, aliasing > wouldn't work nice. But breaking one block in two or more is fine since > then blocks could be represented as several dimm devices. > > +3Gb backend ram it could be split in blocks like this: > > [ 3Gb (1Gb pages backed) ] > [tail1 (below_4gb - 3Gb) (2mb pages backed) ] > [above_4gb whole X Gb pages (1Gb pages backed)] > [tail2 (2mb pages backed)] Yes, thought of that, unfortunately its cumbersome to add an interface for the user to supply both 2MB and 1GB hugetlbfs pages.
On Fri, Oct 25, 2013 at 09:52:34AM +0100, Paolo Bonzini wrote: > Because offsets are zero, and lengths match the RAM block lengths, you > do not need any complication with aliasing. This still has to be done > only for new machine types. Not possible, because that wastes holesize bytes (if the number of additional bytes due to huge page alignment is smaller than holesize, a new hugepage is required, which is not acceptable). Is there a tree where the new machine types can live until 1.8 opens up? Can you pick up the MAP_POPULATE patch?
Il 25/10/2013 20:50, Marcelo Tosatti ha scritto: > On Fri, Oct 25, 2013 at 09:52:34AM +0100, Paolo Bonzini wrote: >> Because offsets are zero, and lengths match the RAM block lengths, you >> do not need any complication with aliasing. This still has to be done >> only for new machine types. > > Not possible because you just wasted holesize bytes (if number of > additional bytes due to huge page alignment is smaller than holesize, a > new hugepage is required, which is not acceptable). Ok. Thanks for explaining---the patch seems good with the proper compatibility option in the machine type. Please run the guest_memory_dump_analysis test in autotest too. > Is there a tree the new machine types can live until 1.8 opens up? > > Can you pick up the MAP_POPULATE patch? Yes, I can pick that one up next week. Michael is usually gathering hw/i386/pc* patches in his PCI tree, you can Cc him on v2 of this one. Paolo
On Fri, 25 Oct 2013 11:34:22 -0200 Marcelo Tosatti <mtosatti@redhat.com> wrote: > On Fri, Oct 25, 2013 at 11:57:18AM +0200, igor Mammedov wrote: > > On Fri, 25 Oct 2013 02:58:05 -0200 > > Marcelo Tosatti <mtosatti@redhat.com> wrote: > > > > > On Fri, Oct 25, 2013 at 12:55:36AM +0100, Paolo Bonzini wrote: > > > > > + if (hpagesize == (1<<30)) { > > > > > + unsigned long holesize = 0x100000000ULL - > > > > > below_4g_mem_size; + > > > > > + memory_region_init_alias(ram_above_4g, NULL, > > > > > "ram-above-4g", ram, > > > > > + 0x100000000ULL, > > > > > + above_4g_mem_size - > > > > > holesize); > > > > > + memory_region_add_subregion(system_memory, > > > > > 0x100000000ULL, > > > > > + ram_above_4g); > > > > > + > > > > > + ram_above_4g_piecetwo = > > > > > g_malloc(sizeof(*ram_above_4g_piecetwo)); > > > > > + memory_region_init_alias(ram_above_4g_piecetwo, > > > > > NULL, > > > > > + "ram-above-4g-piecetwo", > > > > > ram, > > > > > + 0x100000000ULL - > > > > > holesize, holesize); > > > > > + memory_region_add_subregion(system_memory, > > > > > + 0x100000000ULL + > > > > > + above_4g_mem_size - > > > > > holesize, > > > > > + > > > > > ram_above_4g_piecetwo); > > > > > > > > Why break it in two? You can just allocate extra holesize > > > > bytes in the "ram" MemoryRegion, and not map the part that > > > > corresponds to [0x100000000ULL - holesize, 0x100000000ULL). > > > > > > - If the "ram" MemoryRegion is backed with 1GB hugepages, you > > > might not want to allocate extra holesize bytes (which might > > > require an entire 1GB page). > > From POV of moddeling current "ram" as dimm devices, aliasing > > wouldn't work nice. But breaking one block in two or more is fine > > since then blocks could be represented as several dimm devices. 
> > > > +3Gb backend ram it could be split in blocks like this: > > > > [ 3Gb (1Gb pages backed) ] > > [tail1 (below_4gb - 3Gb) (2mb pages backed) ] > > [above_4gb whole X Gb pages (1Gb pages backed)] > > [tail2 (2mb pages backed)] > > Yes, thought of that, unfortunately its cumbersome to add an interface > for the user to supply both 2MB and 1GB hugetlbfs pages. Could 2Mb tails be automated, meaning if host uses 1Gb hugepages and there is/are tail/s, QEMU should be able to figure out alignment issues and allocate with appropriate pages. Goal is separate host part allocation aspect from guest related one, aliasing 32-bit hole size at the end doesn't help it at all, it's quite opposite, it's making current code more complicated and harder to fix in the future.
On Sun, Oct 27, 2013 at 04:20:44PM +0100, igor Mammedov wrote: > > Yes, thought of that, unfortunately its cumbersome to add an interface > > for the user to supply both 2MB and 1GB hugetlbfs pages. > Could 2Mb tails be automated, meaning if host uses 1Gb hugepages and > there is/are tail/s, QEMU should be able to figure out alignment > issues and allocate with appropriate pages. Yes that would be ideal but the problem with hugetlbfs is that pages are preallocated. So in the end you'd have to expose the split of guest RAM in 2MB/1GB types to the user (it would be necessary for the user to calculate the size of the hole, etc). > Goal is separate host part allocation aspect from guest related one, > aliasing 32-bit hole size at the end doesn't help it at all, it's quite > opposite, it's making current code more complicated and harder to fix > in the future. You can simply back the 1GB areas which the hole reside with 2MB pages. Can't see why having the tail of RAM map to the hole is problematic. Understand your concern, but the complication is necessary: the host virtual/physical address and guest physical addresses must be aligned on largepage boundaries. Do you foresee any problem with memory hotplug? Could add a warning to memory API: if memory region is larger than 1GB and RAM is 1GB backed, and not properly aligned, warn.
On Mon, Oct 28, 2013 at 12:04:06PM -0200, Marcelo Tosatti wrote: > On Sun, Oct 27, 2013 at 04:20:44PM +0100, igor Mammedov wrote: > > > Yes, thought of that, unfortunately its cumbersome to add an interface > > > for the user to supply both 2MB and 1GB hugetlbfs pages. > > Could 2Mb tails be automated, meaning if host uses 1Gb hugepages and > > there is/are tail/s, QEMU should be able to figure out alignment > > issues and allocate with appropriate pages. > > Yes that would be ideal but the problem with hugetlbfs is that pages are > preallocated. > > So in the end you'd have to expose the split of guest RAM in 2MB/1GB types > to the user (it would be necessary for the user to calculate the size of > the hole, etc). Note the assumption here is this: it's easier for the hugetlbfs user to manage "number of 1GB hugepages = size of guest RAM" than to calculate the size of the hole (which depends on machine type) and allocate 1GB/2MB hugepages accordingly. And the benefit of the latter would be to save one 1GB hugepage (which is preallocated during boot, in the first place). So matching the number of 1GB hugepages to guest RAM seems the easier choice. > > Goal is separate host part allocation aspect from guest related one, > > aliasing 32-bit hole size at the end doesn't help it at all, it's quite > > opposite, it's making current code more complicated and harder to fix > > in the future. What is the problem to be fixed, exactly? > You can simply back the 1GB areas which the hole reside with 2MB pages. > Can't see why having the tail of RAM map to the hole is problematic. > > Understand your concern, but the complication is necessary: the host > virtual/physical address and guest physical addresses must be aligned on > largepage boundaries. > > Do you foresee any problem with memory hotplug? > > Could add a warning to memory API: if memory region is larger than 1GB > and RAM is 1GB backed, and not properly aligned, warn.
On Mon, 28 Oct 2013 12:04:06 -0200 Marcelo Tosatti <mtosatti@redhat.com> wrote: > On Sun, Oct 27, 2013 at 04:20:44PM +0100, igor Mammedov wrote: > > > Yes, thought of that, unfortunately its cumbersome to add an interface > > > for the user to supply both 2MB and 1GB hugetlbfs pages. > > Could 2Mb tails be automated, meaning if host uses 1Gb hugepages and > > there is/are tail/s, QEMU should be able to figure out alignment > > issues and allocate with appropriate pages. > > Yes that would be ideal but the problem with hugetlbfs is that pages are > preallocated. > > So in the end you'd have to expose the split of guest RAM in 2MB/1GB types > to the user (it would be necessary for the user to calculate the size of > the hole, etc). exposing it to the user might be not necessary, QEMU could allocate 5Gb+3Mb ram without user intervention: 3Gb low.ram.aligned.region // using huge pages 1mb low.ram.unaligned.region if below_4g_ram_size - 3Gb; // so not to waste precious low ram, using fallback allocation //hypothetically hole starts at 3Gb+1mb 2Gb high.ram.aligned.region // using huge pages 2Mb high.ram.unaligned.region // so that not to waste 1Gb on memory using huge page > > > Goal is separate host part allocation aspect from guest related one, > > aliasing 32-bit hole size at the end doesn't help it at all, it's quite > > opposite, it's making current code more complicated and harder to fix > > in the future. > > You can simply back the 1GB areas which the hole reside with 2MB pages. I'm not getting what do you mean here. > Can't see why having the tail of RAM map to the hole is problematic. Problem I see is that with proposed aliasing there is no one-one mapping to future "memdev" where each Dimm device (guest/model visible memory block) has a corresponding memdev backend (host memory block). 
Moreover, with current hugepages handling in QEMU including this patch and usage of 1Gb hugepages, QEMU might lose ~1Gb if -m "hpagesize*n+1", which by itself is a good reason to use several allocations with different allocator backends. > Understand your concern, but the complication is necessary: the host > virtual/physical address and guest physical addresses must be aligned on > largepage boundaries. I don't argue against it, only about the best way to achieve it. If we assume possible conversion from the ad hoc way of allocating initial RAM to DIMM devices in the future, then changing region layout several times in an incompatible way doesn't seem to be the best approach. If we are going to change it, let's at least minimize compatibility issues and do it right in the first place. I'll post RFC patch as reply to this thread. > > Do you foresee any problem with memory hotplug? I don't see any problem with memory hotplug so far, but as noted above there will be problems with converting initial ram to DIMM devices. > > Could add a warning to memory API: if memory region is larger than 1GB > and RAM is 1GB backed, and not properly aligned, warn. Perhaps it would be better to abort and ask the user to fix the configuration, and on hugepage allocation failure not fall back to malloc but abort and tell the user the number of hugepages needed to run the guest with a hugepage backend.
On Tue, Oct 29, 2013 at 07:00:54PM +0100, Igor Mammedov wrote: > On Mon, 28 Oct 2013 12:04:06 -0200 > Marcelo Tosatti <mtosatti@redhat.com> wrote: > > > On Sun, Oct 27, 2013 at 04:20:44PM +0100, igor Mammedov wrote: > > > > Yes, thought of that, unfortunately its cumbersome to add an interface > > > > for the user to supply both 2MB and 1GB hugetlbfs pages. > > > Could 2Mb tails be automated, meaning if host uses 1Gb hugepages and > > > there is/are tail/s, QEMU should be able to figure out alignment > > > issues and allocate with appropriate pages. > > > > Yes that would be ideal but the problem with hugetlbfs is that pages are > > preallocated. > > > > So in the end you'd have to expose the split of guest RAM in 2MB/1GB types > > to the user (it would be necessary for the user to calculate the size of > > the hole, etc). > exposing it to the user might be not necessary, > QEMU could allocate 5Gb+3Mb ram without user intervention: It is necessary because the user has to allocate hugetlbfs pages (see the end of the email). > 3Gb low.ram.aligned.region // using huge pages > 1mb low.ram.unaligned.region if below_4g_ram_size - 3Gb; // so not to waste precious low ram, using fallback allocation > //hypothetically hole starts at 3Gb+1mb > 2Gb high.ram.aligned.region // using huge pages > 2Mb high.ram.unaligned.region // so that not to waste 1Gb on memory using huge page You want memory areas not backed by 1GB pages to be backed by 2MB pages (so that possibility of creation of TLB entries per physical address range is similar, or matches, physical hardware) (*) > > > Goal is separate host part allocation aspect from guest related one, > > > aliasing 32-bit hole size at the end doesn't help it at all, it's quite > > > opposite, it's making current code more complicated and harder to fix > > > in the future. > > > > You can simply back the 1GB areas which the hole reside with 2MB pages. > I'm not getting what do you mean here. 
The 1GB memory backed areas which can't be mapped with 1GB TLBs, such as the [3GB,4GB] guest physical address range can be mapped with 2MB TLBs. > > Can't see why having the tail of RAM map to the hole is problematic. > Problem I see is that with proposed aliasing there is no one-one > mapping to future "memdev" where each Dimm device (guest/model visible memory block) > has a corresponding memdev backend (host memory block). 1) What is the dependency of memdev on linear host memory block? (that is, i can't see the reasoning behind a one-to-one mapping). 2) Why can't memdev access host memory via mappings? (that is, why does memdev require each DIMM to be mapped linearly in QEMU's virtual address space?). > Moreover with current hugepages handling in QEMU including this patch and usage of > 1Gb hugepages, QEMU might loose ~1Gb if -m "hpagesize*n+1", which is by itself is a > good reason to use several allocations with different allocator backends. > > Understand your concern, but the complication is necessary: the host > > virtual/physical address and guest physical addresses must be aligned on > > largepage boundaries. > I don't argue against it, only about the best way to achieve it. > > If we assume possible conversion from adhoc way of allocating initial RAM to > DIMM devices in the future then changing region layout several times in > incompatible way doesn't seems to be the best approach. If we are going to > change it, let at least minimize compatibility issues and do it right > in the first place. > > I'll post RFC patch as reply to this thread. > > > > > Do you foresee any problem with memory hotplug? > I don't see any problem with memory hotplug so far, but as noted above > there will be problems with converting initial ram to DIMM devices. > > > > > Could add a warning to memory API: if memory region is larger than 1GB > > and RAM is 1GB backed, and not properly aligned, warn. 
> Perhaps it would be better do abort and ask user to fix configuration, > and on hugepage allocation failure not fallback to malloc but abort and > tell user amount of hugepages needed to run guest with hugepage backend. You want to back your guest with 1GB hugepages. You get 1 such page at a time, worst case. You either 1) map the guest physical address space region (1GB sized) where the hole is located with smaller page sizes, which must be 2MB, see *, which requires the user to specify a different hugetlbfs mount path with sufficient 2MB huge pages. 2) move the pieces of memory which can't be 1GB mapped backed into 1GB hugepages, and map the remaining 1GB-aligned regions to individual 1GB pages. I am trying to avoid 1) as it complicates management (and fixes a bug).
On Tue, Oct 29, 2013 at 07:21:59PM -0200, Marcelo Tosatti wrote: > > > Could add a warning to memory API: if memory region is larger than 1GB > > > and RAM is 1GB backed, and not properly aligned, warn. > > Perhaps it would be better do abort and ask user to fix configuration, > > and on hugepage allocation failure not fallback to malloc but abort and > > tell user amount of hugepages needed to run guest with hugepage backend. > > You want to back your guest with 1GB hugepages. You get 1 such page at a > time, worst case. > > You either > > 1) map the guest physical address space region (1GB sized) where > the hole is located with smaller page sizes, which must be 2MB, see *, > which requires the user to specify a different hugetlbfs mount path with > sufficient 2MB huge pages. > Why not rely on THP (transparent huge pages) to do the work? > 2) move the pieces of memory which can't be 1GB mapped backed into > 1GB hugepages, and map the remaining 1GB-aligned regions to individual 1GB > pages. > > I am trying to avoid 1) as it complicates management (and fixes a bug). -- Gleb.
On Fr, 2013-10-25 at 23:53 +0100, Paolo Bonzini wrote: > Il 25/10/2013 20:50, Marcelo Tosatti ha scritto: > > On Fri, Oct 25, 2013 at 09:52:34AM +0100, Paolo Bonzini wrote: > >> Because offsets are zero, and lengths match the RAM block lengths, you > >> do not need any complication with aliasing. This still has to be done > >> only for new machine types. > > > > Not possible because you just wasted holesize bytes (if number of > > additional bytes due to huge page alignment is smaller than holesize, a > > new hugepage is required, which is not acceptable). > > Ok. Thanks for explaining---the patch seems good with the proper > compatibility option in the machine type. Please run the > guest_memory_dump_analysis test in autotest too. As the whole thing must depend on machine type anyway for live migration compatibility we can also simply change the memory split, i.e. map 2GB (-M q35) or 3GB (-M pc) below 4G instead of the odd sizes (2.75 / 3.5) we have now. cheers, Gerd
Il 30/10/2013 12:07, Gerd Hoffmann ha scritto: > On Fr, 2013-10-25 at 23:53 +0100, Paolo Bonzini wrote: >> Il 25/10/2013 20:50, Marcelo Tosatti ha scritto: >>> On Fri, Oct 25, 2013 at 09:52:34AM +0100, Paolo Bonzini wrote: >>>> Because offsets are zero, and lengths match the RAM block lengths, you >>>> do not need any complication with aliasing. This still has to be done >>>> only for new machine types. >>> >>> Not possible because you just wasted holesize bytes (if number of >>> additional bytes due to huge page alignment is smaller than holesize, a >>> new hugepage is required, which is not acceptable). >> >> Ok. Thanks for explaining---the patch seems good with the proper >> compatibility option in the machine type. Please run the >> guest_memory_dump_analysis test in autotest too. > > As the whole thing must depend on machine type anyway for live migration > compatibility we can also simply change the memory split, i.e. map 2GB > (-M q35) or 3GB (-M pc) below 4G instead of the odd sizes (2.75 / 3.5) > we have now. For q35 it could be a possibility. For pc, I'm worried about old Windows systems that ignore all memory outside the first 4GB. They would lose 512 MB of memory. Paolo
On Wed, Oct 30, 2013 at 10:48:13AM +0200, Gleb Natapov wrote: > On Tue, Oct 29, 2013 at 07:21:59PM -0200, Marcelo Tosatti wrote: > > > > Could add a warning to memory API: if memory region is larger than 1GB > > > > and RAM is 1GB backed, and not properly aligned, warn. > > > Perhaps it would be better to abort and ask user to fix configuration, > > > and on hugepage allocation failure not fall back to malloc but abort and > > > tell user amount of hugepages needed to run guest with hugepage backend. > > > > You want to back your guest with 1GB hugepages. You get 1 such page at a > > time, worst case. > > > > You either > > > > 1) map the guest physical address space region (1GB sized) where > > the hole is located with smaller page sizes, which must be 2MB, see *, > > which requires the user to specify a different hugetlbfs mount path with > > sufficient 2MB huge pages. > > > Why not rely on THP to do the work? Because it might be the case that static hugepage assignment is desired (no guarantees of backing with THP).
Index: qemu/hw/i386/pc.c =================================================================== --- qemu.orig/hw/i386/pc.c +++ qemu/hw/i386/pc.c @@ -1116,8 +1116,9 @@ FWCfgState *pc_memory_init(MemoryRegion { int linux_boot, i; MemoryRegion *ram, *option_rom_mr; - MemoryRegion *ram_below_4g, *ram_above_4g; + MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo; FWCfgState *fw_cfg; + unsigned long hpagesize; linux_boot = (kernel_filename != NULL); @@ -1129,6 +1130,7 @@ FWCfgState *pc_memory_init(MemoryRegion memory_region_init_ram(ram, NULL, "pc.ram", below_4g_mem_size + above_4g_mem_size); vmstate_register_ram_global(ram); + hpagesize = qemu_get_ram_hpagesize(ram->ram_addr); *ram_memory = ram; ram_below_4g = g_malloc(sizeof(*ram_below_4g)); memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram, @@ -1136,10 +1138,46 @@ FWCfgState *pc_memory_init(MemoryRegion memory_region_add_subregion(system_memory, 0, ram_below_4g); if (above_4g_mem_size > 0) { ram_above_4g = g_malloc(sizeof(*ram_above_4g)); - memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram, - below_4g_mem_size, above_4g_mem_size); - memory_region_add_subregion(system_memory, 0x100000000ULL, + + /* + * + * If 1GB hugepages are used to back guest RAM, map guest address + * space in the range [ramsize,ramsize+holesize] to the ram block + * range [holestart, 4GB] + * + * 0 h 4G [ramsize,ramsize+holesize] + * + * guest-addr-space [ ] [ ][xxx] + * /----------/ + * contiguous-ram-block [ ][xxx][ ] + * + * So that memory beyond 4GB is aligned on a 1GB boundary, + * at the host physical address space. 
+ * + */ + if (hpagesize == (1<<30)) { + unsigned long holesize = 0x100000000ULL - below_4g_mem_size; + + memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram, + 0x100000000ULL, + above_4g_mem_size - holesize); + memory_region_add_subregion(system_memory, 0x100000000ULL, + ram_above_4g); + + ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo)); + memory_region_init_alias(ram_above_4g_piecetwo, NULL, + "ram-above-4g-piecetwo", ram, + 0x100000000ULL - holesize, holesize); + memory_region_add_subregion(system_memory, + 0x100000000ULL + + above_4g_mem_size - holesize, + ram_above_4g_piecetwo); + } else { + memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram, + below_4g_mem_size, above_4g_mem_size); + memory_region_add_subregion(system_memory, 0x100000000ULL, ram_above_4g); + } }
Align guest physical address and host physical address beyond guest 4GB on a 1GB boundary, in case hugetlbfs is used. Otherwise 1GB TLBs cannot be cached for the range. Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>