diff mbox series

[2/4] reset: Allow multiple stages of system resets

Message ID 20240117091559.144730-3-peterx@redhat.com
State New
Headers show
Series intel_iommu: Reset vIOMMU after all the rest of devices | expand

Commit Message

Peter Xu Jan. 17, 2024, 9:15 a.m. UTC
From: Peter Xu <peterx@redhat.com>

QEMU resets do not have a way to order reset hooks.  Add one coarse grained
reset stage so that some devices can be reset later than some others.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/sysemu/reset.h |  5 ++++
 hw/core/reset.c        | 60 +++++++++++++++++++++++++++++++-----------
 2 files changed, 49 insertions(+), 16 deletions(-)

Comments

Eric Auger Jan. 17, 2024, 10:28 a.m. UTC | #1
Hi Peter,
On 1/17/24 10:15, peterx@redhat.com wrote:
> From: Peter Xu <peterx@redhat.com>
>
> QEMU resets do not have a way to order reset hooks.  Add one coarse grained
> reset stage so that some devices can be reset later than some others.
I would precise that the lowest stage has the highest priority and is
handled first.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  include/sysemu/reset.h |  5 ++++
>  hw/core/reset.c        | 60 +++++++++++++++++++++++++++++++-----------
>  2 files changed, 49 insertions(+), 16 deletions(-)
>
> diff --git a/include/sysemu/reset.h b/include/sysemu/reset.h
> index 609e4d50c2..0de697ce9f 100644
> --- a/include/sysemu/reset.h
> +++ b/include/sysemu/reset.h
> @@ -5,9 +5,14 @@
>  
>  typedef void QEMUResetHandler(void *opaque);
>  
> +#define  QEMU_RESET_STAGES_N  2
> +
>  void qemu_register_reset(QEMUResetHandler *func, void *opaque);
> +void qemu_register_reset_one(QEMUResetHandler *func, void *opaque,
> +                             bool skip_snap, int stage);
>  void qemu_register_reset_nosnapshotload(QEMUResetHandler *func, void *opaque);
>  void qemu_unregister_reset(QEMUResetHandler *func, void *opaque);
> +void qemu_unregister_reset_one(QEMUResetHandler *func, void *opaque, int stage);
>  void qemu_devices_reset(ShutdownCause reason);
>  
>  #endif
> diff --git a/hw/core/reset.c b/hw/core/reset.c
> index 8cf60b2b09..a84c9bee84 100644
> --- a/hw/core/reset.c
> +++ b/hw/core/reset.c
> @@ -36,55 +36,83 @@ typedef struct QEMUResetEntry {
>      bool skip_on_snapshot_load;
>  } QEMUResetEntry;
>  
> -static QTAILQ_HEAD(, QEMUResetEntry) reset_handlers =
> -    QTAILQ_HEAD_INITIALIZER(reset_handlers);
> +typedef QTAILQ_HEAD(QEMUResetList, QEMUResetEntry) QEMUResetList;
> +static QEMUResetList reset_handlers[QEMU_RESET_STAGES_N];
>  
> -static void qemu_register_reset_one(QEMUResetHandler *func, void *opaque,
> -                                    bool skip_snap)
> +static void __attribute__((__constructor__)) qemu_reset_handlers_init(void)
> +{
> +    QEMUResetList *head;
> +    int i = 0;
nit: you may put the declarations within the block
> +
> +    for (i = 0; i < QEMU_RESET_STAGES_N; i++) {
> +        head = &reset_handlers[i];
> +        QTAILQ_INIT(head);
> +    }
> +}
> +
> +void qemu_register_reset_one(QEMUResetHandler *func, void *opaque,
> +                             bool skip_snap, int stage)
>  {
>      QEMUResetEntry *re = g_new0(QEMUResetEntry, 1);
> +    QEMUResetList *head;
> +
> +    assert(stage >= 0 && stage < QEMU_RESET_STAGES_N);
> +    head = &reset_handlers[stage];
>  
>      re->func = func;
>      re->opaque = opaque;
>      re->skip_on_snapshot_load = skip_snap;
> -    QTAILQ_INSERT_TAIL(&reset_handlers, re, entry);
> +    QTAILQ_INSERT_TAIL(head, re, entry);
>  }
>  
>  void qemu_register_reset(QEMUResetHandler *func, void *opaque)
>  {
> -    /* By default, do not skip during load of a snapshot */
Shouldn't the above comment stay since the statement is not affected by
this patch? Or remove it in previous patch?
> -    qemu_register_reset_one(func, opaque, false);
> +    qemu_register_reset_one(func, opaque, false, 0);
>  }
>  
>  void qemu_register_reset_nosnapshotload(QEMUResetHandler *func, void *opaque)
>  {
> -    qemu_register_reset_one(func, opaque, true);
> +    qemu_register_reset_one(func, opaque, true, 0);
>  }
>  
> -void qemu_unregister_reset(QEMUResetHandler *func, void *opaque)
> +void qemu_unregister_reset_one(QEMUResetHandler *func, void *opaque, int stage)
>  {
> +    QEMUResetList *head;
>      QEMUResetEntry *re;
>  
> -    QTAILQ_FOREACH(re, &reset_handlers, entry) {
> +    assert(stage >= 0 && stage < QEMU_RESET_STAGES_N);
> +    head = &reset_handlers[stage];
> +
> +    QTAILQ_FOREACH(re, head, entry) {
>          if (re->func == func && re->opaque == opaque) {
> -            QTAILQ_REMOVE(&reset_handlers, re, entry);
> +            QTAILQ_REMOVE(head, re, entry);
>              g_free(re);
>              return;
>          }
>      }
>  }
>  
> +void qemu_unregister_reset(QEMUResetHandler *func, void *opaque)
> +{
> +    qemu_unregister_reset_one(func, opaque, 0);
> +}
> +
>  void qemu_devices_reset(ShutdownCause reason)
>  {
>      QEMUResetEntry *re, *nre;
> +    QEMUResetList *head;
> +    int stage;
>  
>      /* reset all devices */
> -    QTAILQ_FOREACH_SAFE(re, &reset_handlers, entry, nre) {
> -        if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD &&
> -            re->skip_on_snapshot_load) {
> -            continue;
> +    for (stage = 0; stage < QEMU_RESET_STAGES_N; stage++) {
> +        head = &reset_handlers[stage];
> +        QTAILQ_FOREACH_SAFE(re, head, entry, nre) {
> +            if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD &&
> +                re->skip_on_snapshot_load) {
> +                continue;
> +            }
> +            re->func(re->opaque);
>          }
> -        re->func(re->opaque);
>      }
>  }
>  
Thanks

Eric
Cédric Le Goater Jan. 17, 2024, 1:58 p.m. UTC | #2
On 1/17/24 11:28, Eric Auger wrote:
> Hi Peter,
> On 1/17/24 10:15, peterx@redhat.com wrote:
>> From: Peter Xu <peterx@redhat.com>
>>
>> QEMU resets do not have a way to order reset hooks.  Add one coarse grained
>> reset stage so that some devices can be reset later than some others.
> I would precise that the lowest stage has the highest priority and is
> handled first.

yes. May be add an enum like we have for migration :

typedef enum {
     MIG_PRI_DEFAULT = 0,
     MIG_PRI_IOMMU,              /* Must happen before PCI devices */
     MIG_PRI_PCI_BUS,            /* Must happen before IOMMU */
     MIG_PRI_VIRTIO_MEM,         /* Must happen before IOMMU */
     MIG_PRI_GICV3_ITS,          /* Must happen before PCI devices */
     MIG_PRI_GICV3,              /* Must happen before the ITS */
     MIG_PRI_MAX,
} MigrationPriority;

I think it would help understand the reset ordering and maintenance
when grepping qemu_register_reset_one().

Thanks,

C.



>>
>> Signed-off-by: Peter Xu <peterx@redhat.com>
>> ---
>>   include/sysemu/reset.h |  5 ++++
>>   hw/core/reset.c        | 60 +++++++++++++++++++++++++++++++-----------
>>   2 files changed, 49 insertions(+), 16 deletions(-)
>>
>> diff --git a/include/sysemu/reset.h b/include/sysemu/reset.h
>> index 609e4d50c2..0de697ce9f 100644
>> --- a/include/sysemu/reset.h
>> +++ b/include/sysemu/reset.h
>> @@ -5,9 +5,14 @@
>>   
>>   typedef void QEMUResetHandler(void *opaque);
>>   
>> +#define  QEMU_RESET_STAGES_N  2
>> +
>>   void qemu_register_reset(QEMUResetHandler *func, void *opaque);
>> +void qemu_register_reset_one(QEMUResetHandler *func, void *opaque,
>> +                             bool skip_snap, int stage);
>>   void qemu_register_reset_nosnapshotload(QEMUResetHandler *func, void *opaque);
>>   void qemu_unregister_reset(QEMUResetHandler *func, void *opaque);
>> +void qemu_unregister_reset_one(QEMUResetHandler *func, void *opaque, int stage);
>>   void qemu_devices_reset(ShutdownCause reason);
>>   
>>   #endif
>> diff --git a/hw/core/reset.c b/hw/core/reset.c
>> index 8cf60b2b09..a84c9bee84 100644
>> --- a/hw/core/reset.c
>> +++ b/hw/core/reset.c
>> @@ -36,55 +36,83 @@ typedef struct QEMUResetEntry {
>>       bool skip_on_snapshot_load;
>>   } QEMUResetEntry;
>>   
>> -static QTAILQ_HEAD(, QEMUResetEntry) reset_handlers =
>> -    QTAILQ_HEAD_INITIALIZER(reset_handlers);
>> +typedef QTAILQ_HEAD(QEMUResetList, QEMUResetEntry) QEMUResetList;
>> +static QEMUResetList reset_handlers[QEMU_RESET_STAGES_N];
>>   
>> -static void qemu_register_reset_one(QEMUResetHandler *func, void *opaque,
>> -                                    bool skip_snap)
>> +static void __attribute__((__constructor__)) qemu_reset_handlers_init(void)
>> +{
>> +    QEMUResetList *head;
>> +    int i = 0;
> nit: you may put the declarations within the block
>> +
>> +    for (i = 0; i < QEMU_RESET_STAGES_N; i++) {
>> +        head = &reset_handlers[i];
>> +        QTAILQ_INIT(head);
>> +    }
>> +}
>> +
>> +void qemu_register_reset_one(QEMUResetHandler *func, void *opaque,
>> +                             bool skip_snap, int stage)
>>   {
>>       QEMUResetEntry *re = g_new0(QEMUResetEntry, 1);
>> +    QEMUResetList *head;
>> +
>> +    assert(stage >= 0 && stage < QEMU_RESET_STAGES_N);
>> +    head = &reset_handlers[stage];
>>   
>>       re->func = func;
>>       re->opaque = opaque;
>>       re->skip_on_snapshot_load = skip_snap;
>> -    QTAILQ_INSERT_TAIL(&reset_handlers, re, entry);
>> +    QTAILQ_INSERT_TAIL(head, re, entry);
>>   }
>>   
>>   void qemu_register_reset(QEMUResetHandler *func, void *opaque)
>>   {
>> -    /* By default, do not skip during load of a snapshot */
> Shouldn't the above comment stay since the statement is not affected by
> this patch? Or remove it in previous patch?
>> -    qemu_register_reset_one(func, opaque, false);
>> +    qemu_register_reset_one(func, opaque, false, 0);
>>   }
>>   
>>   void qemu_register_reset_nosnapshotload(QEMUResetHandler *func, void *opaque)
>>   {
>> -    qemu_register_reset_one(func, opaque, true);
>> +    qemu_register_reset_one(func, opaque, true, 0);
>>   }
>>   
>> -void qemu_unregister_reset(QEMUResetHandler *func, void *opaque)
>> +void qemu_unregister_reset_one(QEMUResetHandler *func, void *opaque, int stage)
>>   {
>> +    QEMUResetList *head;
>>       QEMUResetEntry *re;
>>   
>> -    QTAILQ_FOREACH(re, &reset_handlers, entry) {
>> +    assert(stage >= 0 && stage < QEMU_RESET_STAGES_N);
>> +    head = &reset_handlers[stage];
>> +
>> +    QTAILQ_FOREACH(re, head, entry) {
>>           if (re->func == func && re->opaque == opaque) {
>> -            QTAILQ_REMOVE(&reset_handlers, re, entry);
>> +            QTAILQ_REMOVE(head, re, entry);
>>               g_free(re);
>>               return;
>>           }
>>       }
>>   }
>>   
>> +void qemu_unregister_reset(QEMUResetHandler *func, void *opaque)
>> +{
>> +    qemu_unregister_reset_one(func, opaque, 0);
>> +}
>> +
>>   void qemu_devices_reset(ShutdownCause reason)
>>   {
>>       QEMUResetEntry *re, *nre;
>> +    QEMUResetList *head;
>> +    int stage;
>>   
>>       /* reset all devices */
>> -    QTAILQ_FOREACH_SAFE(re, &reset_handlers, entry, nre) {
>> -        if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD &&
>> -            re->skip_on_snapshot_load) {
>> -            continue;
>> +    for (stage = 0; stage < QEMU_RESET_STAGES_N; stage++) {
>> +        head = &reset_handlers[stage];
>> +        QTAILQ_FOREACH_SAFE(re, head, entry, nre) {
>> +            if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD &&
>> +                re->skip_on_snapshot_load) {
>> +                continue;
>> +            }
>> +            re->func(re->opaque);
>>           }
>> -        re->func(re->opaque);
>>       }
>>   }
>>   
> Thanks
> 
> Eric
> 
>
Peter Maydell Jan. 17, 2024, 5:46 p.m. UTC | #3
On Wed, 17 Jan 2024 at 09:16, <peterx@redhat.com> wrote:
>
> From: Peter Xu <peterx@redhat.com>
>
> QEMU resets do not have a way to order reset hooks.  Add one coarse grained
> reset stage so that some devices can be reset later than some others.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  include/sysemu/reset.h |  5 ++++
>  hw/core/reset.c        | 60 +++++++++++++++++++++++++++++++-----------
>  2 files changed, 49 insertions(+), 16 deletions(-)
>
> diff --git a/include/sysemu/reset.h b/include/sysemu/reset.h
> index 609e4d50c2..0de697ce9f 100644
> --- a/include/sysemu/reset.h
> +++ b/include/sysemu/reset.h
> @@ -5,9 +5,14 @@
>
>  typedef void QEMUResetHandler(void *opaque);
>
> +#define  QEMU_RESET_STAGES_N  2
> +

Our reset handling APIs are already pretty complicated, and
raw qemu_register_reset() is kind of a legacy API that I would
prefer that we try to move away from, not add extra complexity to.

Our device reset design already has a multiple-phase system
(see docs/devel/reset.rst), part of the point of which is to
try to give us a way to deal with reset-ordering problems.
I feel like the right way to handle the issue you're trying to
address is to ensure that the thing that needs to happen last is
done in the 'exit' phase rather than the 'hold' phase (which is
where legacy reset methods get called).

There are some annoying wrinkles here, notably that legacy
qemu_register_reset() doesn't support 3-phase reset and so
the phasing only happens for devices reset via the device/bus
tree hierarchy. But I think the way to go is to try to move
forward with that design (i.e. expand 3-phase reset to
qemu_register_reset() and/or move things using qemu_register_reset()
to device reset where that makes sense), not to ignore it and
put a completely different reset-ordering API in a different place.

thanks
-- PMM
Philippe Mathieu-Daudé Jan. 18, 2024, 3:53 p.m. UTC | #4
On 17/1/24 18:46, Peter Maydell wrote:
> On Wed, 17 Jan 2024 at 09:16, <peterx@redhat.com> wrote:
>>
>> From: Peter Xu <peterx@redhat.com>
>>
>> QEMU resets do not have a way to order reset hooks.  Add one coarse grained
>> reset stage so that some devices can be reset later than some others.
>>
>> Signed-off-by: Peter Xu <peterx@redhat.com>
>> ---
>>   include/sysemu/reset.h |  5 ++++
>>   hw/core/reset.c        | 60 +++++++++++++++++++++++++++++++-----------
>>   2 files changed, 49 insertions(+), 16 deletions(-)
>>
>> diff --git a/include/sysemu/reset.h b/include/sysemu/reset.h
>> index 609e4d50c2..0de697ce9f 100644
>> --- a/include/sysemu/reset.h
>> +++ b/include/sysemu/reset.h
>> @@ -5,9 +5,14 @@
>>
>>   typedef void QEMUResetHandler(void *opaque);
>>
>> +#define  QEMU_RESET_STAGES_N  2
>> +
> 
> Our reset handling APIs are already pretty complicated, and
> raw qemu_register_reset() is kind of a legacy API that I would
> prefer that we try to move away from, not add extra complexity to.
> 
> Our device reset design already has a multiple-phase system
> (see docs/devel/reset.rst), part of the point of which is to
> try to give us a way to deal with reset-ordering problems.
> I feel like the right way to handle the issue you're trying to
> address is to ensure that the thing that needs to happen last is
> done in the 'exit' phase rather than the 'hold' phase (which is
> where legacy reset methods get called).

I concur. Devices reset is hard, but bus reset is even harder.
Having a quick look, the issues tracked by Alex & Peter might
come from the PCI bridges using the legacy DeviceReset. In
particular functions like:

- hw/pci-bridge/pcie_root_port.c

  46 static void rp_reset_hold(Object *obj)
  47 {
  48     PCIDevice *d = PCI_DEVICE(obj);
  49     DeviceState *qdev = DEVICE(obj);
  50
  51     rp_aer_vector_update(d);
  52     pcie_cap_root_reset(d);
  53     pcie_cap_deverr_reset(d);
  54     pcie_cap_slot_reset(d);
  55     pcie_cap_arifwd_reset(d);
  56     pcie_acs_reset(d);
  57     pcie_aer_root_reset(d);
  58     pci_bridge_reset(qdev);
  59     pci_bridge_disable_base_limit(d);
  60 }

- hw/pci-bridge/pcie_pci_bridge.c

107 static void pcie_pci_bridge_reset(DeviceState *qdev)
108 {
109     PCIDevice *d = PCI_DEVICE(qdev);
110     pci_bridge_reset(qdev);
111     if (msi_present(d)) {
112         msi_reset(d);
113     }
114     shpc_reset(d);
115 }

- hw/pci-bridge/xio3130_downstream.c

  56 static void xio3130_downstream_reset(DeviceState *qdev)
  57 {
  58     PCIDevice *d = PCI_DEVICE(qdev);
  59
  60     pcie_cap_deverr_reset(d);
  61     pcie_cap_slot_reset(d);
  62     pcie_cap_arifwd_reset(d);
  63     pci_bridge_reset(qdev);
  64 }

should really be split and converted as ResettablePhases.

pci_bridge_reset() is likely a ResettableExitPhase one.

> There are some annoying wrinkles here, notably that legacy
> qemu_register_reset() doesn't support 3-phase reset and so
> the phasing only happens for devices reset via the device/bus
> tree hierarchy. But I think the way to go is to try to move
> forward with that design (i.e. expand 3-phase reset to
> qemu_register_reset() and/or move things using qemu_register_reset()
> to device reset where that makes sense), not to ignore it and
> put a completely different reset-ordering API in a different place.

Unfortunately despite DeviceReset being deprecated since 4 years
in commit c11256aa6f ("hw/core: add Resettable support to BusClass
and DeviceClass"), we keep adding code using this legacy API; for
example in the last 4 months:

- e029bb00a7 ("hw/pci-host: Add Astro system bus adapter found on 
PA-RISC machines")
- 2880e676c0 ("Add virtio-sound device stub")
- 4a58330343 ("hw/cxl: Add a switch mailbox CCI function")
- 95e1019a4a ("vhost-user-scsi: free the inflight area when reset")
- 6f9c3aaa34 ("fsl-imx: add simple RTC emulation for i.MX6 and i.MX7 
boards")

Regards,

Phil.
Peter Maydell Jan. 18, 2024, 4:15 p.m. UTC | #5
On Thu, 18 Jan 2024 at 15:53, Philippe Mathieu-Daudé <philmd@linaro.org> wrote:
> Unfortunately despite DeviceReset being deprecated since 4 years
> in commit c11256aa6f ("hw/core: add Resettable support to BusClass
> and DeviceClass"), we keep adding code using this legacy API; for
> example in the last 4 months:
>
> - e029bb00a7 ("hw/pci-host: Add Astro system bus adapter found on
> PA-RISC machines")
> - 2880e676c0 ("Add virtio-sound device stub")
> - 4a58330343 ("hw/cxl: Add a switch mailbox CCI function")
> - 95e1019a4a ("vhost-user-scsi: free the inflight area when reset")
> - 6f9c3aaa34 ("fsl-imx: add simple RTC emulation for i.MX6 and i.MX7
> boards")

For plain old leaf device reset methods, I'm not too fussed
about this, because a reset method is exactly equivalent
to a single 'hold' phase method, and the relatively
small amount of code to convert from one to the other isn't
introducing a lot of extra complication.

The cleanup I really want to get back to is the bus reset
handling, because we only have a few buses that don't use
3-phase reset, and if we convert those then we can get rid
of the handling of transitional reset from BusClass
completely. Unfortunately I ran into a regression somewhere
in the PCI reset handling in the patchsets I was working on
which I never got back to to track down the problem.

thanks
-- PMM
Peter Xu Jan. 19, 2024, 11:10 a.m. UTC | #6
Hello, Phil, PeterM,

On Thu, Jan 18, 2024 at 04:53:42PM +0100, Philippe Mathieu-Daudé wrote:
> I concur. Devices reset is hard, but bus reset is even harder.
> Having a quick look, the issues tracked by Alex & Peter might
> come from the PCI bridges using the legacy DeviceReset. 

The challenges we're facing with VFIO on vIOMMU are actually listed in
patch 4's large comment I added, here:

https://lore.kernel.org/qemu-devel/20240117091559.144730-5-peterx@redhat.com/

+    /*
+     * vIOMMU reset may require proper ordering with other devices.  There
+     * are two complexities so that normal DeviceState.reset() may not
+     * work properly for vIOMMUs:
+     *
+     * (1) Device depth-first reset hierachy doesn't yet work for vIOMMUs
+     *     (reference: resettable_cold_reset_fn())
+     *
+     *     Currently, vIOMMU devices are created as normal '-device'
+     *     cmdlines.  It means in many ways it has the same attributes with
+     *     most of the rest devices, even if the rest devices should
+     *     logically be under control of the vIOMMU unit.
+     *
+     *     One side effect of it is vIOMMU devices will be currently put
+     *     randomly under qdev tree hierarchy, which is the source of
+     *     device reset ordering in current QEMU (depth-first traversal).
+     *     It means vIOMMU now can be reset before some devices.  For fully
+     *     emulated devices that's not a problem, because the traversal
+     *     holds BQL for the whole process.  However it is a problem if DMA
+     *     can happen without BQL, like VFIO, vDPA or remote device process.
+     *
+     *     TODO: one ideal solution can be that we make vIOMMU the parent
+     *     of the whole pci host bridge.  Hence vIOMMU can be reset after
+     *     all the devices are reset and quiesced.
+     *
+     * (2) Some devices register its own reset functions
+     *
+     *     Even if above issue solved, if devices register its own reset
+     *     functions for some reason via QEMU reset hooks, vIOMMU can still
+     *     be reset before the device. One example is vfio_reset_handler()
+     *     where FLR is not supported on the device.
+     *
+     *     TODO: merge relevant reset functions into the device tree reset
+     *     framework.
+     *
+     * Neither of the above TODO may be trivial.  To make it work for now,
+     * leverage reset stages and reset vIOMMU always at latter stage of the
+     * default.  It means it needs to be reset after at least:
+     *
+     *   - resettable_cold_reset_fn(): machine qdev tree reset
+     *   - vfio_reset_handler():       VFIO reset for !FLR
+     */

What you're asking makes sense to me, because that's also what I would
prefer to consider (and that's why I mentioned my series "slightly ugly" in
the cover letter), even if I don't yet know much on the other reset
challenges QEMU is already facing.

The issue is just like what I mentioned in the cover letter: that complete
solution can be non-trivial and can take a lot of time, and I probably
wouldn't have time at least recently to persue such a solution.

To fix the issue cleanly, I assume we need to start from making sure all
VFIO paths will only use the Resettable interface to reset.

The second part will involve moving vIOMMU device in the QOM tree to be the
parent of whatever it manages.  I didn't follow recent changes in this
area, but currently vIOMMU is probably an anonymous object dangling
somewhere and more or less orphaned, while "-device" is the interface for
now to create the vIOMMU which might be too generic.  I'm not sure whether
we'll need new interface already to create the vIOMMU, e.g., it may ideally
need to be the parent of the root pci bus that it manages, one or multiple.
And we need to make sure the vIOMMU being present when the root pci buses
are created, I think.  IIUC that can be before parsing "-device"s.

For the 2nd issue, I assume the ->hold() phase for VT-d to reset address
spaces might be good enough, as long as the reset is done depth-first, then
VFIO's ->hold()s will be called before that point.  I consider after all
devices' ->hold() kicked off there should have no DMA anymore, so quit()
shouldn't hopefully matter.

However even if hold() works it is still challenging for the hierachy
change.

Considering that this issue so far shouldn't cause any functional breakage,
maybe one option is we gradually fix all above, and before that we declare
it a known issue.

Any other suggestions would also be greatly welcomed.

Thanks,
diff mbox series

Patch

diff --git a/include/sysemu/reset.h b/include/sysemu/reset.h
index 609e4d50c2..0de697ce9f 100644
--- a/include/sysemu/reset.h
+++ b/include/sysemu/reset.h
@@ -5,9 +5,14 @@ 
 
 typedef void QEMUResetHandler(void *opaque);
 
+#define  QEMU_RESET_STAGES_N  2
+
 void qemu_register_reset(QEMUResetHandler *func, void *opaque);
+void qemu_register_reset_one(QEMUResetHandler *func, void *opaque,
+                             bool skip_snap, int stage);
 void qemu_register_reset_nosnapshotload(QEMUResetHandler *func, void *opaque);
 void qemu_unregister_reset(QEMUResetHandler *func, void *opaque);
+void qemu_unregister_reset_one(QEMUResetHandler *func, void *opaque, int stage);
 void qemu_devices_reset(ShutdownCause reason);
 
 #endif
diff --git a/hw/core/reset.c b/hw/core/reset.c
index 8cf60b2b09..a84c9bee84 100644
--- a/hw/core/reset.c
+++ b/hw/core/reset.c
@@ -36,55 +36,83 @@  typedef struct QEMUResetEntry {
     bool skip_on_snapshot_load;
 } QEMUResetEntry;
 
-static QTAILQ_HEAD(, QEMUResetEntry) reset_handlers =
-    QTAILQ_HEAD_INITIALIZER(reset_handlers);
+typedef QTAILQ_HEAD(QEMUResetList, QEMUResetEntry) QEMUResetList;
+static QEMUResetList reset_handlers[QEMU_RESET_STAGES_N];
 
-static void qemu_register_reset_one(QEMUResetHandler *func, void *opaque,
-                                    bool skip_snap)
+static void __attribute__((__constructor__)) qemu_reset_handlers_init(void)
+{
+    QEMUResetList *head;
+    int i = 0;
+
+    for (i = 0; i < QEMU_RESET_STAGES_N; i++) {
+        head = &reset_handlers[i];
+        QTAILQ_INIT(head);
+    }
+}
+
+void qemu_register_reset_one(QEMUResetHandler *func, void *opaque,
+                             bool skip_snap, int stage)
 {
     QEMUResetEntry *re = g_new0(QEMUResetEntry, 1);
+    QEMUResetList *head;
+
+    assert(stage >= 0 && stage < QEMU_RESET_STAGES_N);
+    head = &reset_handlers[stage];
 
     re->func = func;
     re->opaque = opaque;
     re->skip_on_snapshot_load = skip_snap;
-    QTAILQ_INSERT_TAIL(&reset_handlers, re, entry);
+    QTAILQ_INSERT_TAIL(head, re, entry);
 }
 
 void qemu_register_reset(QEMUResetHandler *func, void *opaque)
 {
-    /* By default, do not skip during load of a snapshot */
-    qemu_register_reset_one(func, opaque, false);
+    qemu_register_reset_one(func, opaque, false, 0);
 }
 
 void qemu_register_reset_nosnapshotload(QEMUResetHandler *func, void *opaque)
 {
-    qemu_register_reset_one(func, opaque, true);
+    qemu_register_reset_one(func, opaque, true, 0);
 }
 
-void qemu_unregister_reset(QEMUResetHandler *func, void *opaque)
+void qemu_unregister_reset_one(QEMUResetHandler *func, void *opaque, int stage)
 {
+    QEMUResetList *head;
     QEMUResetEntry *re;
 
-    QTAILQ_FOREACH(re, &reset_handlers, entry) {
+    assert(stage >= 0 && stage < QEMU_RESET_STAGES_N);
+    head = &reset_handlers[stage];
+
+    QTAILQ_FOREACH(re, head, entry) {
         if (re->func == func && re->opaque == opaque) {
-            QTAILQ_REMOVE(&reset_handlers, re, entry);
+            QTAILQ_REMOVE(head, re, entry);
             g_free(re);
             return;
         }
     }
 }
 
+void qemu_unregister_reset(QEMUResetHandler *func, void *opaque)
+{
+    qemu_unregister_reset_one(func, opaque, 0);
+}
+
 void qemu_devices_reset(ShutdownCause reason)
 {
     QEMUResetEntry *re, *nre;
+    QEMUResetList *head;
+    int stage;
 
     /* reset all devices */
-    QTAILQ_FOREACH_SAFE(re, &reset_handlers, entry, nre) {
-        if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD &&
-            re->skip_on_snapshot_load) {
-            continue;
+    for (stage = 0; stage < QEMU_RESET_STAGES_N; stage++) {
+        head = &reset_handlers[stage];
+        QTAILQ_FOREACH_SAFE(re, head, entry, nre) {
+            if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD &&
+                re->skip_on_snapshot_load) {
+                continue;
+            }
+            re->func(re->opaque);
         }
-        re->func(re->opaque);
     }
 }