Message ID | 20230623125707.323517-5-npiggin@gmail.com |
---|---|
State | New |
Headers | show |
Series | ppc: fix larx migration, fix record-replay | expand |
e500 has the same problem, I think, according to this issue: https://gitlab.com/qemu-project/qemu/-/issues/1634 Btw, ARM virt platform rebuilds fdt only at initialization phase, not when reset. Isn't this behavior correct? Shouldn't PPC platforms do the similar thing? On 23.06.2023 15:57, Nicholas Piggin wrote: > spapr_machine_reset gets a random number to populate the device-tree > rng seed with. When loading a snapshot for record-replay, the machine > is reset again, and that tries to consume the random event record > again, crashing due to inconsistent record > > Fix this by saving the seed to populate the device tree with, and > skipping the rng on snapshot load. > > Signed-off-by: Nicholas Piggin <npiggin@gmail.com> > --- > hw/ppc/spapr.c | 12 +++++++++--- > include/hw/ppc/spapr.h | 1 + > 2 files changed, 10 insertions(+), 3 deletions(-) > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index d290acfa95..55948f233f 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -1017,7 +1017,6 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset) > { > MachineState *machine = MACHINE(spapr); > SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); > - uint8_t rng_seed[32]; > int chosen; > > _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen")); > @@ -1095,8 +1094,7 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset) > spapr_dt_ov5_platform_support(spapr, fdt, chosen); > } > > - qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed)); > - _FDT(fdt_setprop(fdt, chosen, "rng-seed", rng_seed, sizeof(rng_seed))); > + _FDT(fdt_setprop(fdt, chosen, "rng-seed", spapr->fdt_rng_seed, 32)); > > _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5")); > } > @@ -1649,6 +1647,14 @@ static void spapr_machine_reset(MachineState *machine, ShutdownCause reason) > void *fdt; > int rc; > > + if (reason != SHUTDOWN_CAUSE_SNAPSHOT_LOAD) { > + /* > + * Record-replay snapshot load must not consume random, this was > + * already replayed from initial machine reset. > + */ > + qemu_guest_getrandom_nofail(spapr->fdt_rng_seed, 32); > + } > + > pef_kvm_reset(machine->cgs, &error_fatal); > spapr_caps_apply(spapr); > > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index f47e8419a5..f4bd204d86 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -204,6 +204,7 @@ struct SpaprMachineState { > uint32_t fdt_size; > uint32_t fdt_initial_size; > void *fdt_blob; > + uint8_t fdt_rng_seed[32]; > long kernel_size; > bool kernel_le; > uint64_t kernel_addr;
On Mon Jun 26, 2023 at 6:07 PM AEST, Pavel Dovgalyuk wrote: > e500 has the same problem, I think, according to this issue: > https://gitlab.com/qemu-project/qemu/-/issues/1634 Same symptoms. e500 looks like it does the dt build in machine_init_done notifier, though. Maybe I miss something. I'll take a look. > > Btw, ARM virt platform rebuilds fdt only at initialization phase, not > when reset. I was actually wondering about keeping the same rng-seed across resets to make the code even simpler, but decided to keep behaviour unchanged. That seems like one downside. > Isn't this behavior correct? Shouldn't PPC platforms do the similar thing? I believe spapr does this for an spapr "feature" that rebuilds the fdt at runtime ("client-architecture-support"), so reset has to build the original one again. Thanks, Nick > > On 23.06.2023 15:57, Nicholas Piggin wrote: > > spapr_machine_reset gets a random number to populate the device-tree > > rng seed with. When loading a snapshot for record-replay, the machine > > is reset again, and that tries to consume the random event record > > again, crashing due to inconsistent record > > > > Fix this by saving the seed to populate the device tree with, and > > skipping the rng on snapshot load. > > > > Signed-off-by: Nicholas Piggin <npiggin@gmail.com> > > --- > > hw/ppc/spapr.c | 12 +++++++++--- > > include/hw/ppc/spapr.h | 1 + > > 2 files changed, 10 insertions(+), 3 deletions(-) > > > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > > index d290acfa95..55948f233f 100644 > > --- a/hw/ppc/spapr.c > > +++ b/hw/ppc/spapr.c > > @@ -1017,7 +1017,6 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset) > > { > > MachineState *machine = MACHINE(spapr); > > SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); > > - uint8_t rng_seed[32]; > > int chosen; > > > > _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen")); > > @@ -1095,8 +1094,7 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset) > > spapr_dt_ov5_platform_support(spapr, fdt, chosen); > > } > > > > - qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed)); > > - _FDT(fdt_setprop(fdt, chosen, "rng-seed", rng_seed, sizeof(rng_seed))); > > + _FDT(fdt_setprop(fdt, chosen, "rng-seed", spapr->fdt_rng_seed, 32)); > > > > _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5")); > > } > > @@ -1649,6 +1647,14 @@ static void spapr_machine_reset(MachineState *machine, ShutdownCause reason) > > void *fdt; > > int rc; > > > > + if (reason != SHUTDOWN_CAUSE_SNAPSHOT_LOAD) { > > + /* > > + * Record-replay snapshot load must not consume random, this was > > + * already replayed from initial machine reset. > > + */ > > + qemu_guest_getrandom_nofail(spapr->fdt_rng_seed, 32); > > + } > > + > > pef_kvm_reset(machine->cgs, &error_fatal); > > spapr_caps_apply(spapr); > > > > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > > index f47e8419a5..f4bd204d86 100644 > > --- a/include/hw/ppc/spapr.h > > +++ b/include/hw/ppc/spapr.h > > @@ -204,6 +204,7 @@ struct SpaprMachineState { > > uint32_t fdt_size; > > uint32_t fdt_initial_size; > > void *fdt_blob; > > + uint8_t fdt_rng_seed[32]; > > long kernel_size; > > bool kernel_le; > > uint64_t kernel_addr;
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index d290acfa95..55948f233f 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1017,7 +1017,6 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset) { MachineState *machine = MACHINE(spapr); SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); - uint8_t rng_seed[32]; int chosen; _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen")); @@ -1095,8 +1094,7 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset) spapr_dt_ov5_platform_support(spapr, fdt, chosen); } - qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed)); - _FDT(fdt_setprop(fdt, chosen, "rng-seed", rng_seed, sizeof(rng_seed))); + _FDT(fdt_setprop(fdt, chosen, "rng-seed", spapr->fdt_rng_seed, 32)); _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5")); } @@ -1649,6 +1647,14 @@ static void spapr_machine_reset(MachineState *machine, ShutdownCause reason) void *fdt; int rc; + if (reason != SHUTDOWN_CAUSE_SNAPSHOT_LOAD) { + /* + * Record-replay snapshot load must not consume random, this was + * already replayed from initial machine reset. + */ + qemu_guest_getrandom_nofail(spapr->fdt_rng_seed, 32); + } + pef_kvm_reset(machine->cgs, &error_fatal); spapr_caps_apply(spapr); diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index f47e8419a5..f4bd204d86 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -204,6 +204,7 @@ struct SpaprMachineState { uint32_t fdt_size; uint32_t fdt_initial_size; void *fdt_blob; + uint8_t fdt_rng_seed[32]; long kernel_size; bool kernel_le; uint64_t kernel_addr;
spapr_machine_reset gets a random number to populate the device-tree rng seed with. When loading a snapshot for record-replay, the machine is reset again, and that tries to consume the random event record again, crashing due to inconsistent record Fix this by saving the seed to populate the device tree with, and skipping the rng on snapshot load. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> --- hw/ppc/spapr.c | 12 +++++++++--- include/hw/ppc/spapr.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-)