Message ID | 20230726183532.434380-5-npiggin@gmail.com |
---|---|
State | New |
Headers | show |
Series | ppc: record-replay fixes and enablement | expand |
Acked-by: Pavel Dovgalyuk <Pavel.Dovgalyuk@ispras.ru> On 26.07.2023 21:35, Nicholas Piggin wrote: > spapr_machine_reset gets a random number to populate the device-tree > rng seed with. When loading a snapshot for record-replay, the machine > is reset again, and that tries to consume the random event record > again, crashing due to inconsistent record > > Fix this by saving the seed to populate the device tree with, and > skipping the rng on snapshot load. > > Cc: Pavel Dovgalyuk <Pavel.Dovgalyuk@ispras.ru> > Signed-off-by: Nicholas Piggin <npiggin@gmail.com> > --- > hw/ppc/spapr.c | 12 +++++++++--- > include/hw/ppc/spapr.h | 1 + > 2 files changed, 10 insertions(+), 3 deletions(-) > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index 7d84244f03..ecfbdb0030 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -1022,7 +1022,6 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset) > { > MachineState *machine = MACHINE(spapr); > SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); > - uint8_t rng_seed[32]; > int chosen; > > _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen")); > @@ -1100,8 +1099,7 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset) > spapr_dt_ov5_platform_support(spapr, fdt, chosen); > } > > - qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed)); > - _FDT(fdt_setprop(fdt, chosen, "rng-seed", rng_seed, sizeof(rng_seed))); > + _FDT(fdt_setprop(fdt, chosen, "rng-seed", spapr->fdt_rng_seed, 32)); > > _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5")); > } > @@ -1654,6 +1652,14 @@ static void spapr_machine_reset(MachineState *machine, ShutdownCause reason) > void *fdt; > int rc; > > + if (reason != SHUTDOWN_CAUSE_SNAPSHOT_LOAD) { > + /* > + * Record-replay snapshot load must not consume random, this was > + * already replayed from initial machine reset. 
> + */ > + qemu_guest_getrandom_nofail(spapr->fdt_rng_seed, 32); > + } > + > pef_kvm_reset(machine->cgs, &error_fatal); > spapr_caps_apply(spapr); > > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index f47e8419a5..f4bd204d86 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -204,6 +204,7 @@ struct SpaprMachineState { > uint32_t fdt_size; > uint32_t fdt_initial_size; > void *fdt_blob; > + uint8_t fdt_rng_seed[32]; > long kernel_size; > bool kernel_le; > uint64_t kernel_addr;
BTW, there is a function qemu_register_reset_nosnapshotload that can be used in similar cases. Can you just use it without changing the code of the reset handler? On 26.07.2023 21:35, Nicholas Piggin wrote: > spapr_machine_reset gets a random number to populate the device-tree > rng seed with. When loading a snapshot for record-replay, the machine > is reset again, and that tries to consume the random event record > again, crashing due to inconsistent record > > Fix this by saving the seed to populate the device tree with, and > skipping the rng on snapshot load. > > Cc: Pavel Dovgalyuk <Pavel.Dovgalyuk@ispras.ru> > Signed-off-by: Nicholas Piggin <npiggin@gmail.com> > --- > hw/ppc/spapr.c | 12 +++++++++--- > include/hw/ppc/spapr.h | 1 + > 2 files changed, 10 insertions(+), 3 deletions(-) > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index 7d84244f03..ecfbdb0030 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -1022,7 +1022,6 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset) > { > MachineState *machine = MACHINE(spapr); > SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); > - uint8_t rng_seed[32]; > int chosen; > > _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen")); > @@ -1100,8 +1099,7 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset) > spapr_dt_ov5_platform_support(spapr, fdt, chosen); > } > > - qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed)); > - _FDT(fdt_setprop(fdt, chosen, "rng-seed", rng_seed, sizeof(rng_seed))); > + _FDT(fdt_setprop(fdt, chosen, "rng-seed", spapr->fdt_rng_seed, 32)); > > _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5")); > } > @@ -1654,6 +1652,14 @@ static void spapr_machine_reset(MachineState *machine, ShutdownCause reason) > void *fdt; > int rc; > > + if (reason != SHUTDOWN_CAUSE_SNAPSHOT_LOAD) { > + /* > + * Record-replay snapshot load must not consume random, this was > + * already replayed from initial machine reset. 
> + */ > + qemu_guest_getrandom_nofail(spapr->fdt_rng_seed, 32); > + } > + > pef_kvm_reset(machine->cgs, &error_fatal); > spapr_caps_apply(spapr); > > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index f47e8419a5..f4bd204d86 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -204,6 +204,7 @@ struct SpaprMachineState { > uint32_t fdt_size; > uint32_t fdt_initial_size; > void *fdt_blob; > + uint8_t fdt_rng_seed[32]; > long kernel_size; > bool kernel_le; > uint64_t kernel_addr;
On Fri Aug 4, 2023 at 6:50 PM AEST, Pavel Dovgalyuk wrote: > BTW, there is a function qemu_register_reset_nosnapshotload that can be > used in similar cases. > Can you just use it without changing the code of the reset handler? I didn't know that, thanks for pointing it out. I'll take a closer look at it before reposting. Thanks, Nick > > On 26.07.2023 21:35, Nicholas Piggin wrote: > > spapr_machine_reset gets a random number to populate the device-tree > > rng seed with. When loading a snapshot for record-replay, the machine > > is reset again, and that tries to consume the random event record > > again, crashing due to inconsistent record > > > > Fix this by saving the seed to populate the device tree with, and > > skipping the rng on snapshot load. > > > > Cc: Pavel Dovgalyuk <Pavel.Dovgalyuk@ispras.ru> > > Signed-off-by: Nicholas Piggin <npiggin@gmail.com> > > --- > > hw/ppc/spapr.c | 12 +++++++++--- > > include/hw/ppc/spapr.h | 1 + > > 2 files changed, 10 insertions(+), 3 deletions(-) > > > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > > index 7d84244f03..ecfbdb0030 100644 > > --- a/hw/ppc/spapr.c > > +++ b/hw/ppc/spapr.c > > @@ -1022,7 +1022,6 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset) > > { > > MachineState *machine = MACHINE(spapr); > > SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); > > - uint8_t rng_seed[32]; > > int chosen; > > > > _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen")); > > @@ -1100,8 +1099,7 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset) > > spapr_dt_ov5_platform_support(spapr, fdt, chosen); > > } > > > > - qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed)); > > - _FDT(fdt_setprop(fdt, chosen, "rng-seed", rng_seed, sizeof(rng_seed))); > > + _FDT(fdt_setprop(fdt, chosen, "rng-seed", spapr->fdt_rng_seed, 32)); > > > > _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5")); > > } > > @@ -1654,6 +1652,14 @@ static void 
spapr_machine_reset(MachineState *machine, ShutdownCause reason) > > void *fdt; > > int rc; > > > > + if (reason != SHUTDOWN_CAUSE_SNAPSHOT_LOAD) { > > + /* > > + * Record-replay snapshot load must not consume random, this was > > + * already replayed from initial machine reset. > > + */ > > + qemu_guest_getrandom_nofail(spapr->fdt_rng_seed, 32); > > + } > > + > > pef_kvm_reset(machine->cgs, &error_fatal); > > spapr_caps_apply(spapr); > > > > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > > index f47e8419a5..f4bd204d86 100644 > > --- a/include/hw/ppc/spapr.h > > +++ b/include/hw/ppc/spapr.h > > @@ -204,6 +204,7 @@ struct SpaprMachineState { > > uint32_t fdt_size; > > uint32_t fdt_initial_size; > > void *fdt_blob; > > + uint8_t fdt_rng_seed[32]; > > long kernel_size; > > bool kernel_le; > > uint64_t kernel_addr;
On Sun Aug 6, 2023 at 9:46 PM AEST, Nicholas Piggin wrote: > On Fri Aug 4, 2023 at 6:50 PM AEST, Pavel Dovgalyuk wrote: > > BTW, there is a function qemu_register_reset_nosnapshotload that can be > > used in similar cases. > > Can you just use it without changing the code of the reset handler? > > I didn't know that, thanks for pointing it out. I'll take a closer look > at it before reposting. Seems a bit tricky because the device tree has to be rebuilt at reset time (including snapshot load), but it uses the random number. So having a second nosnapshotload reset function might not be called in the correct order, I think? For now I will keep it as is. Thanks, Nick
On 08.08.2023 06:09, Nicholas Piggin wrote: > On Sun Aug 6, 2023 at 9:46 PM AEST, Nicholas Piggin wrote: >> On Fri Aug 4, 2023 at 6:50 PM AEST, Pavel Dovgalyuk wrote: >>> BTW, there is a function qemu_register_reset_nosnapshotload that can be >>> used in similar cases. >>> Can you just use it without changing the code of the reset handler? >> >> I didn't know that, thanks for pointing it out. I'll take a closer look >> at it before reposting. > > Seems a bit tricky because the device tree has to be rebuilt at reset > time (including snapshot load), but it uses the random number. So It seems strange to me, that loading the existing configuration has to randomize the device tree. > having a second nosnapshotload reset function might not be called in > the correct order, I think? For now I will keep it as is. Ok, let's wait for other reviewers. Pavel Dovgalyuk
On Tue Aug 8, 2023 at 1:52 PM AEST, Pavel Dovgalyuk wrote: > On 08.08.2023 06:09, Nicholas Piggin wrote: > > On Sun Aug 6, 2023 at 9:46 PM AEST, Nicholas Piggin wrote: > >> On Fri Aug 4, 2023 at 6:50 PM AEST, Pavel Dovgalyuk wrote: > >>> BTW, there is a function qemu_register_reset_nosnapshotload that can be > >>> used in similar cases. > >>> Can you just use it without changing the code of the reset handler? > >> > >> I didn't know that, thanks for pointing it out. I'll take a closer look > >> at it before reposting. > > > > Seems a bit tricky because the device tree has to be rebuilt at reset > > time (including snapshot load), but it uses the random number. So > > It seems strange to me, that loading the existing configuration has to > randomize the device tree. Building the device tree requires a random number for one of the properties. Other architectures don't have this "cas" firmware call, which changes the device tree and so requires it to be rebuilt at machine reset time; they just build the device tree once at machine creation time, I believe. So spapr is already weird in that way. We could go the way that other archs have and just save that random number once at creation and then reuse it for each reset. I thought that was not so good because for a normal reset I think it is better to get a new random number each time, no? So I think it's natural enough to take a new random number for a regular reset, but keep the existing one for a snapshot reset. I could be misunderstanding something though. Thanks, Nick > > > having a second nosnapshotload reset function might not be called in > > the correct order, I think? For now I will keep it as is. > > Ok, let's wait for other reviewers. > > > Pavel Dovgalyuk
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 7d84244f03..ecfbdb0030 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1022,7 +1022,6 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset) { MachineState *machine = MACHINE(spapr); SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); - uint8_t rng_seed[32]; int chosen; _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen")); @@ -1100,8 +1099,7 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset) spapr_dt_ov5_platform_support(spapr, fdt, chosen); } - qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed)); - _FDT(fdt_setprop(fdt, chosen, "rng-seed", rng_seed, sizeof(rng_seed))); + _FDT(fdt_setprop(fdt, chosen, "rng-seed", spapr->fdt_rng_seed, 32)); _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5")); } @@ -1654,6 +1652,14 @@ static void spapr_machine_reset(MachineState *machine, ShutdownCause reason) void *fdt; int rc; + if (reason != SHUTDOWN_CAUSE_SNAPSHOT_LOAD) { + /* + * Record-replay snapshot load must not consume random, this was + * already replayed from initial machine reset. + */ + qemu_guest_getrandom_nofail(spapr->fdt_rng_seed, 32); + } + pef_kvm_reset(machine->cgs, &error_fatal); spapr_caps_apply(spapr); diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index f47e8419a5..f4bd204d86 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -204,6 +204,7 @@ struct SpaprMachineState { uint32_t fdt_size; uint32_t fdt_initial_size; void *fdt_blob; + uint8_t fdt_rng_seed[32]; long kernel_size; bool kernel_le; uint64_t kernel_addr;
spapr_machine_reset gets a random number to populate the device-tree rng seed with. When loading a snapshot for record-replay, the machine is reset again, and that tries to consume the random event record again, crashing due to an inconsistent record. Fix this by saving the seed to populate the device tree with, and skipping the rng on snapshot load. Cc: Pavel Dovgalyuk <Pavel.Dovgalyuk@ispras.ru> Signed-off-by: Nicholas Piggin <npiggin@gmail.com> --- hw/ppc/spapr.c | 12 +++++++++--- include/hw/ppc/spapr.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-)