diff mbox series

e1000: Fix the unexpected assumption that the receive buffer is full

Message ID c7338afab65df208772f215567f323ae9b3c5910.1720210988.git.yong.huang@smartx.com
State New
Headers show
Series e1000: Fix the unexpected assumption that the receive buffer is full | expand

Commit Message

Yong Huang July 5, 2024, 8:30 p.m. UTC
Unexpected work by certain Windows guests equipped with the e1000
interface can cause the network to go down and never come back up
again unless the guest's interface is reset.

To reproduce the failure:
1. Set up two guests with a Windows 2016 or 2019 server operating
   system.
2. Set up the e1000 interface for the guests.
3. Pressurize the network slightly between two guests using the iPerf tool.

The network goes down after a few days (2-5days), and the issue
is the result of not adhering to the e1000 specification. Refer
to the details of the specification at the following link:
https://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf

Chapter 3.2.6 describe the Receive Descriptor Tail register(RDT)
as following:
This register holds a value that is an offset from the base, and
identifies the location beyond the last descriptor hardware can
process. Note that tail should still point to an area in the
descriptor ring (somewhere between RDBA and RDBA + RDLEN).
This is because tail points to the location where software writes
the first new descriptor.

This means that if the provider—in this case, QEMU—has not yet
loaded the packet, RDT should never point to that place. When
implementing the emulation of the e1000 interface, QEMU evaluates
if the receive ring buffer is full once the RDT equals the RDH,
based on the assumption that guest drivers adhere to this
criterion strictly.

We applied the following log patch to assist in analyzing the
issue and eventually obtained the unexpected information.

Log patch:
-----------------------------------------------------------------
|--- a/hw/net/e1000.c
|+++ b/hw/net/e1000.c
|@@ -836,6 +836,9 @@ e1000_set_link_status(NetClientState *nc)
| static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
| {
|     int bufs;
|+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
|+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH], s->mac_reg[RDT]);
|+
|     /* Fast-path short packets */
|     if (total_size <= s->rxbuf_size) {
|         if (s->mac_reg[RDH] == s->mac_reg[RDT] && s->last_overrun)
|@@ -1022,6 +1025,9 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
|         s->rxbuf_min_shift)
|         n |= E1000_ICS_RXDMT0;
|
|+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
|+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH], s->mac_reg[RDT]);
|+
-----------------------------------------------------------------

The last few logs of information when the network is down:

e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 897, s->mac_reg[RDT] = 885
<- the receive ring buffer is checked for fullness in the
e1000_has_rxbufs function, not full.

e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
<- RDT stays the same, RDH updates to 898, and 1 descriptor
utilized after putting the packet to ring buffer.

e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
<- the receive ring buffer is checked for fullness in the
e1000_has_rxbufs function, not full.

e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
<- RDT stays the same, RDH updates to 899, and 1 descriptor
utilized after putting the packet to ring buffer.

e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
<- the receive ring buffer is checked for fullness in the
e1000_has_rxbufs function, not full.

e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 885
<- RDT stays the same, RDH updates to 900 , and 1 descriptor
utilized after putting the packet to ring buffer.

e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 900
<- The ring is full, according to e1000_has_rxbufs, because
of the RDT update to 900 and equals RDH ! But in reality,
the state of the ring buffer is empty because the producer
only used one descriptor the last time, and the ring buffer
was not full after that.

To sum up, QEMU claims that the receive ring buffer is full
in the aforementioned scenario, placing the packet in the
self-maintained queue and unregistering the tap device's
readable fd handler and then waiting for the guest to consume
the receive ring buffer. This brings down the network since
guests have nothing to consume and never update the RDT
location.

In the above scenario, QEMU assert that the ring is full,
put the packet on the queue, unregister the readable fd
handler of the tap device, waiting the guest to consume
the receive ring. While, guest have nothing to consume
on the receive ring and never update the RDT location,
this makes the network down.

To get around this issue, just mark the overrun if RDH
equals RDT at the end of placing the packet on the ring
buffer for the producer.

Signed-off-by: Hyman Huang <yong.huang@smartx.com>
---
 hw/net/e1000.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

Comments

Jason Wang July 8, 2024, 3:21 a.m. UTC | #1
On Sat, Jul 6, 2024 at 4:30 AM Hyman Huang <yong.huang@smartx.com> wrote:
>
> Unexpected work by certain Windows guests equipped with the e1000
> interface can cause the network to go down and never come back up
> again unless the guest's interface is reset.
>
> To reproduce the failure:
> 1. Set up two guests with a Windows 2016 or 2019 server operating
>    system.

I vaguely remember e1000 support for Windows has been deprecated for
several years...

That's why e1000e or igb is implemented in Qemu.

> 2. Set up the e1000 interface for the guests.
> 3. Pressurize the network slightly between two guests using the iPerf tool.
>
> The network goes down after a few days (2-5days), and the issue
> is the result of not adhering to the e1000 specification. Refer
> to the details of the specification at the following link:
> https://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
>
> Chapter 3.2.6 describe the Receive Descriptor Tail register(RDT)
> as following:
> This register holds a value that is an offset from the base, and
> identifies the location beyond the last descriptor hardware can
> process. Note that tail should still point to an area in the
> descriptor ring (somewhere between RDBA and RDBA + RDLEN).
> This is because tail points to the location where software writes
> the first new descriptor.
>
> This means that if the provider—in this case, QEMU—has not yet
> loaded the packet,

What do you mean by "load" here?

> RDT should never point to that place.

And "that place"?

> When
> implementing the emulation of the e1000 interface, QEMU evaluates
> if the receive ring buffer is full once the RDT equals the RDH,
> based on the assumption that guest drivers adhere to this
> criterion strictly.
>
> We applied the following log patch to assist in analyzing the
> issue and eventually obtained the unexpected information.
>
> Log patch:
> -----------------------------------------------------------------
> |--- a/hw/net/e1000.c
> |+++ b/hw/net/e1000.c
> |@@ -836,6 +836,9 @@ e1000_set_link_status(NetClientState *nc)
> | static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
> | {
> |     int bufs;
> |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
> |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH], s->mac_reg[RDT]);
> |+
> |     /* Fast-path short packets */
> |     if (total_size <= s->rxbuf_size) {
> |         if (s->mac_reg[RDH] == s->mac_reg[RDT] && s->last_overrun)
> |@@ -1022,6 +1025,9 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
> |         s->rxbuf_min_shift)
> |         n |= E1000_ICS_RXDMT0;
> |
> |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
> |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH], s->mac_reg[RDT]);
> |+
> -----------------------------------------------------------------
>
> The last few logs of information when the network is down:
>
> e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 897, s->mac_reg[RDT] = 885
> <- the receive ring buffer is checked for fullness in the
> e1000_has_rxbufs function, not full.
>
> e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
> <- RDT stays the same, RDH updates to 898, and 1 descriptor
> utilized after putting the packet to ring buffer.
>
> e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
> <- the receive ring buffer is checked for fullness in the
> e1000_has_rxbufs function, not full.
>
> e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
> <- RDT stays the same, RDH updates to 899, and 1 descriptor
> utilized after putting the packet to ring buffer.
>
> e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
> <- the receive ring buffer is checked for fullness in the
> e1000_has_rxbufs function, not full.
>
> e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 885
> <- RDT stays the same, RDH updates to 900 , and 1 descriptor
> utilized after putting the packet to ring buffer.
>
> e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 900
> <- The ring is full, according to e1000_has_rxbufs, because
> of the RDT update to 900 and equals RDH !

Just to make sure I understand this, RDT==RDH means the ring is empty I think?

See commit:

commit e5b8b0d4ba29fe1268ba049519a1b0cf8552a21a
Author: Dmitry Fleytman <dmitry@daynix.com>
Date:   Fri Oct 19 07:56:55 2012 +0200

    e1000: drop check_rxov, always treat RX ring with RDH == RDT as empty

    Real HW always treats RX ring with RDH == RDT as empty.
    Emulation is supposed to behave the same.

    Reported-by: Chris Webb <chris.webb@elastichosts.com>
    Reported-by: Richard Davies <richard.davies@elastichosts.com>
    Signed-off-by: Dmitry Fleytman <dmitry@daynix.com>
    Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>

Or did you mean we need to revert the above commit in fact?

Thanks
Yong Huang July 8, 2024, 5:17 a.m. UTC | #2
On Mon, Jul 8, 2024 at 11:21 AM Jason Wang <jasowang@redhat.com> wrote:

> On Sat, Jul 6, 2024 at 4:30 AM Hyman Huang <yong.huang@smartx.com> wrote:
> >
> > Unexpected work by certain Windows guests equipped with the e1000
> > interface can cause the network to go down and never come back up
> > again unless the guest's interface is reset.
> >
> > To reproduce the failure:
> > 1. Set up two guests with a Windows 2016 or 2019 server operating
> >    system.
>
> I vaguely remember e1000 support for Windows has been deprecated for
> several years...
>
> That's why e1000e or igb is implemented in Qemu.
>
> > 2. Set up the e1000 interface for the guests.
> > 3. Pressurize the network slightly between two guests using the iPerf
> tool.
> >
> > The network goes down after a few days (2-5days), and the issue
> > is the result of not adhering to the e1000 specification. Refer
> > to the details of the specification at the following link:
> >
> https://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
> >
> > Chapter 3.2.6 describe the Receive Descriptor Tail register(RDT)
> > as following:
> > This register holds a value that is an offset from the base, and
> > identifies the location beyond the last descriptor hardware can
> > process. Note that tail should still point to an area in the
> > descriptor ring (somewhere between RDBA and RDBA + RDLEN).
> > This is because tail points to the location where software writes
> > the first new descriptor.
> >
> > This means that if the provider—in this case, QEMU—has not yet
> > loaded the packet,
>
> What do you mean by "load" here?
>

Sorry for failing to describe the details.

The guest driver retrieves the packet from the receive ring buffer
after QEMU forwards it from the tun/tap interface in the e1000
emulation.

I used "load" to express "putting packets into the receive ring buffer."


>
> > RDT should never point to that place.
>
> And "that place"?
>
If a descriptor in the receive ring buffer has not been filled with a
packet address by QEMU, the descriptor therefore doesn't have any
available packets. The location of the descriptor should not be referred
to by RDT because the location is in the range that "hardware" handles.

"that place" means the location of the descriptor in the ring buffer
that QEMU hasn't set any available packets related to.


>
> > When
> > implementing the emulation of the e1000 interface, QEMU evaluates
> > if the receive ring buffer is full once the RDT equals the RDH,
> > based on the assumption that guest drivers adhere to this
> > criterion strictly.
> >
> > We applied the following log patch to assist in analyzing the
> > issue and eventually obtained the unexpected information.
> >
> > Log patch:
> > -----------------------------------------------------------------
> > |--- a/hw/net/e1000.c
> > |+++ b/hw/net/e1000.c
> > |@@ -836,6 +836,9 @@ e1000_set_link_status(NetClientState *nc)
> > | static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
> > | {
> > |     int bufs;
> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u,
> s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH],
> s->mac_reg[RDT]);
> > |+
> > |     /* Fast-path short packets */
> > |     if (total_size <= s->rxbuf_size) {
> > |         if (s->mac_reg[RDH] == s->mac_reg[RDT] && s->last_overrun)
> > |@@ -1022,6 +1025,9 @@ e1000_receive_iov(NetClientState *nc, const
> struct iovec *iov, int iovcnt)
> > |         s->rxbuf_min_shift)
> > |         n |= E1000_ICS_RXDMT0;
> > |
> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u,
> s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH],
> s->mac_reg[RDT]);
> > |+
> > -----------------------------------------------------------------
> >
> > The last few logs of information when the network is down:
> >
> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 897, s->mac_reg[RDT] = 885
> > <- the receive ring buffer is checked for fullness in the
> > e1000_has_rxbufs function, not full.
> >
> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
> > <- RDT stays the same, RDH updates to 898, and 1 descriptor
> > utilized after putting the packet to ring buffer.
> >
> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
> > <- the receive ring buffer is checked for fullness in the
> > e1000_has_rxbufs function, not full.
> >
> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
> > <- RDT stays the same, RDH updates to 899, and 1 descriptor
> > utilized after putting the packet to ring buffer.
> >
> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
> > <- the receive ring buffer is checked for fullness in the
> > e1000_has_rxbufs function, not full.
> >
> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 885
> > <- RDT stays the same, RDH updates to 900 , and 1 descriptor
> > utilized after putting the packet to ring buffer.
> >
> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 900
> > <- The ring is full, according to e1000_has_rxbufs, because
> > of the RDT update to 900 and equals RDH !
>
> Just to make sure I understand this, RDT==RDH means the ring is empty I
> think?


> See commit:
>
> commit e5b8b0d4ba29fe1268ba049519a1b0cf8552a21a
> Author: Dmitry Fleytman <dmitry@daynix.com>
> Date:   Fri Oct 19 07:56:55 2012 +0200
>
>     e1000: drop check_rxov, always treat RX ring with RDH == RDT as empty
>
>     Real HW always treats RX ring with RDH == RDT as empty.
>     Emulation is supposed to behave the same.
>

Indeed, I'm confused :(,  the description in the comment claims that RX
rings with RDH == RDT as empty, but in implementation, it treats that as
overrun.

See the following 2 contexts:

1. e1000_can_receive:
static bool e1000_can_receive(NetClientState *nc)
{
    E1000State *s = qemu_get_nic_opaque(nc);
    // e1000_has_rxbufs return true means ring buffer has
    // available descriptors to use for QEMU.
    // false means ring buffer overrun and QEMU should queue the packet
    // and wait for the RDT update and available descriptors can be used.

    return e1000x_rx_ready(&s->parent_obj, s->mac_reg) &&
        e1000_has_rxbufs(s, 1) && !timer_pending(s->flush_queue_timer);
}

2. e1000_receive_iov:
    if (!e1000_has_rxbufs(s, total_size)) {
        // e1000_has_rxbufs return false means overrun and QEMU should
        // inject RXO interrupt to guest
        e1000_receiver_overrun(s, total_size);
        return -1;
    }


>
>     Reported-by: Chris Webb <chris.webb@elastichosts.com>
>     Reported-by: Richard Davies <richard.davies@elastichosts.com>
>     Signed-off-by: Dmitry Fleytman <dmitry@daynix.com>
>     Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
>
> Or did you mean we need to revert the above commit in fact?
>

Yes, this commit was overlooked by me but it is almost what I want to do
in this patch.


>
> Thanks
>
>
Thanks for the comment,

Yong
Yong Huang July 8, 2024, 5:25 a.m. UTC | #3
On Sat, Jul 6, 2024 at 4:30 AM Hyman Huang <yong.huang@smartx.com> wrote:

> Unexpected work by certain Windows guests equipped with the e1000
> interface can cause the network to go down and never come back up
> again unless the guest's interface is reset.
>
> To reproduce the failure:
> 1. Set up two guests with a Windows 2016 or 2019 server operating
>    system.
> 2. Set up the e1000 interface for the guests.
> 3. Pressurize the network slightly between two guests using the iPerf tool.
>
> The network goes down after a few days (2-5days), and the issue
> is the result of not adhering to the e1000 specification. Refer
> to the details of the specification at the following link:
>
> https://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
>
> Chapter 3.2.6 describe the Receive Descriptor Tail register(RDT)
> as following:
> This register holds a value that is an offset from the base, and
> identifies the location beyond the last descriptor hardware can
> process. Note that tail should still point to an area in the
> descriptor ring (somewhere between RDBA and RDBA + RDLEN).
> This is because tail points to the location where software writes
> the first new descriptor.
>
> This means that if the provider—in this case, QEMU—has not yet
> loaded the packet, RDT should never point to that place. When
> implementing the emulation of the e1000 interface, QEMU evaluates
> if the receive ring buffer is full once the RDT equals the RDH,
> based on the assumption that guest drivers adhere to this
> criterion strictly.
>
> We applied the following log patch to assist in analyzing the
> issue and eventually obtained the unexpected information.
>
> Log patch:
> -----------------------------------------------------------------
> |--- a/hw/net/e1000.c
> |+++ b/hw/net/e1000.c
> |@@ -836,6 +836,9 @@ e1000_set_link_status(NetClientState *nc)
> | static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
> | {
> |     int bufs;
> |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH]
> = %u, s->mac_reg[RDT] = %u\n",
> |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH],
> s->mac_reg[RDT]);
> |+
> |     /* Fast-path short packets */
> |     if (total_size <= s->rxbuf_size) {
> |         if (s->mac_reg[RDH] == s->mac_reg[RDT] && s->last_overrun)
> |@@ -1022,6 +1025,9 @@ e1000_receive_iov(NetClientState *nc, const struct
> iovec *iov, int iovcnt)
> |         s->rxbuf_min_shift)
> |         n |= E1000_ICS_RXDMT0;
> |
> |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH]
> = %u, s->mac_reg[RDT] = %u\n",
> |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH],
> s->mac_reg[RDT]);
> |+
> -----------------------------------------------------------------
>
> The last few logs of information when the network is down:
>
> e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 897, s->mac_reg[RDT] = 885
> <- the receive ring buffer is checked for fullness in the
> e1000_has_rxbufs function, not full.
>
> e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
> <- RDT stays the same, RDH updates to 898, and 1 descriptor
> utilized after putting the packet to ring buffer.
>
> e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
> <- the receive ring buffer is checked for fullness in the
> e1000_has_rxbufs function, not full.
>
> e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
> <- RDT stays the same, RDH updates to 899, and 1 descriptor
> utilized after putting the packet to ring buffer.
>
> e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
> <- the receive ring buffer is checked for fullness in the
> e1000_has_rxbufs function, not full.
>
> e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 885
> <- RDT stays the same, RDH updates to 900 , and 1 descriptor
> utilized after putting the packet to ring buffer.
>
> e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 900
> <- The ring is full, according to e1000_has_rxbufs, because
> of the RDT update to 900 and equals RDH ! But in reality,
> the state of the ring buffer is empty because the producer
> only used one descriptor the last time, and the ring buffer
> was not full after that.
>
> To sum up, QEMU claims that the receive ring buffer is full
> in the aforementioned scenario, placing the packet in the
> self-maintained queue and unregistering the tap device's
> readable fd handler and then waiting for the guest to consume
> the receive ring buffer. This brings down the network since
> guests have nothing to consume and never update the RDT
> location.
>
> In the above scenario, QEMU assert that the ring is full,
> put the packet on the queue, unregister the readable fd
> handler of the tap device, waiting the guest to consume
> the receive ring. While, guest have nothing to consume
> on the receive ring and never update the RDT location,
> this makes the network down.
>
> To get around this issue, just mark the overrun if RDH
> equals RDT at the end of placing the packet on the ring
> buffer for the producer.
>
> Signed-off-by: Hyman Huang <yong.huang@smartx.com>
> ---
>  hw/net/e1000.c | 22 ++++++++++++++++++++--
>  1 file changed, 20 insertions(+), 2 deletions(-)
>
> diff --git a/hw/net/e1000.c b/hw/net/e1000.c
> index 5012b96464..f80cb70283 100644
> --- a/hw/net/e1000.c
> +++ b/hw/net/e1000.c
> @@ -126,6 +126,12 @@ struct E1000State_st {
>
>      QEMUTimer *flush_queue_timer;
>
> +    /*
> +     * Indicate that the receive circular buffer queue overrun
> +     * the last time hardware produced packets.
> +     */
> +    bool last_overrun;
> +
>  /* Compatibility flags for migration to/from qemu 1.3.0 and older */
>  #define E1000_FLAG_MAC_BIT 2
>  #define E1000_FLAG_TSO_BIT 3
> @@ -832,7 +838,12 @@ static bool e1000_has_rxbufs(E1000State *s, size_t
> total_size)
>      int bufs;
>      /* Fast-path short packets */
>      if (total_size <= s->rxbuf_size) {
> -        return s->mac_reg[RDH] != s->mac_reg[RDT];
> +        if (s->mac_reg[RDH] == s->mac_reg[RDT] && s->last_overrun) {
> +            return false;
> +        }
> +
> +        DBGOUT(RX, "Receive ring buffer is not full unexpectedly!\n");
>

I'll drop this DBGOUT in the next version now that "treat RX ring with RDH
== RDT as empty"
is the expected and original conclusion in commit:

commit e5b8b0d4ba29fe1268ba049519a1b0cf8552a21a
Author: Dmitry Fleytman <dmitry@daynix.com>
Date:   Fri Oct 19 07:56:55 2012 +0200

    e1000: drop check_rxov, always treat RX ring with RDH == RDT as empty

    Real HW always treats RX ring with RDH == RDT as empty.
    Emulation is supposed to behave the same.

    Reported-by: Chris Webb <chris.webb@elastichosts.com>
    Reported-by: Richard Davies <richard.davies@elastichosts.com>
    Signed-off-by: Dmitry Fleytman <dmitry@daynix.com>
    Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>


> +        return true;
>      }
>      if (s->mac_reg[RDH] < s->mac_reg[RDT]) {
>          bufs = s->mac_reg[RDT] - s->mac_reg[RDH];
> @@ -840,7 +851,12 @@ static bool e1000_has_rxbufs(E1000State *s, size_t
> total_size)
>          bufs = s->mac_reg[RDLEN] /  sizeof(struct e1000_rx_desc) +
>              s->mac_reg[RDT] - s->mac_reg[RDH];
>      } else {
> -        return false;
> +        if (s->last_overrun) {
> +            return false;
> +        }
> +
> +        DBGOUT(RX, "Receive ring buffer is not full unexpectedly!\n");
> +        return true;
>      }
>      return total_size <= bufs * s->rxbuf_size;
>  }
> @@ -999,6 +1015,8 @@ e1000_receive_iov(NetClientState *nc, const struct
> iovec *iov, int iovcnt)
>
>      e1000x_update_rx_total_stats(s->mac_reg, pkt_type, size, total_size);
>
> +    s->last_overrun = (s->mac_reg[RDH] == s->mac_reg[RDT]) ? true : false;
> +
>      n = E1000_ICS_RXT0;
>      if ((rdt = s->mac_reg[RDT]) < s->mac_reg[RDH])
>          rdt += s->mac_reg[RDLEN] / sizeof(desc);
> --
> 2.39.1
>
>
Jason Wang July 9, 2024, 2:41 a.m. UTC | #4
On Mon, Jul 8, 2024 at 1:17 PM Yong Huang <yong.huang@smartx.com> wrote:
>
>
>
> On Mon, Jul 8, 2024 at 11:21 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On Sat, Jul 6, 2024 at 4:30 AM Hyman Huang <yong.huang@smartx.com> wrote:
>> >
>> > Unexpected work by certain Windows guests equipped with the e1000
>> > interface can cause the network to go down and never come back up
>> > again unless the guest's interface is reset.
>> >
>> > To reproduce the failure:
>> > 1. Set up two guests with a Windows 2016 or 2019 server operating
>> >    system.
>>
>> I vaguely remember e1000 support for Windows has been deprecated for
>> several years...
>>
>> That's why e1000e or igb is implemented in Qemu.
>>
>> > 2. Set up the e1000 interface for the guests.
>> > 3. Pressurize the network slightly between two guests using the iPerf tool.
>> >
>> > The network goes down after a few days (2-5days), and the issue
>> > is the result of not adhering to the e1000 specification. Refer
>> > to the details of the specification at the following link:
>> > https://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
>> >
>> > Chapter 3.2.6 describe the Receive Descriptor Tail register(RDT)
>> > as following:
>> > This register holds a value that is an offset from the base, and
>> > identifies the location beyond the last descriptor hardware can
>> > process. Note that tail should still point to an area in the
>> > descriptor ring (somewhere between RDBA and RDBA + RDLEN).
>> > This is because tail points to the location where software writes
>> > the first new descriptor.
>> >
>> > This means that if the provider—in this case, QEMU—has not yet
>> > loaded the packet,
>>
>> What do you mean by "load" here?
>
>
> Sorry for failing to describe the details.
>
> The guest driver retrieves the packet from the receive ring buffer
> after QEMU forwards it from the tun/tap interface in the e1000
> emulation.
>
> I used "load" to express "putting packets into the receive ring buffer."
>
>>
>>
>> > RDT should never point to that place.
>>
>> And "that place"?
>
> If a descriptor in the receive ring buffer has not been filled with a
> packet address by QEMU, the descriptor therefore doesn't have any
> available packets. The location of the descriptor should not be referred
> to by RDT because the location is in the range that "hardware" handles.
>
> "that place" means the location of the descriptor in the ring buffer
> that QEMU hasn't set any available packets related to.
>
>>
>>
>> > When
>> > implementing the emulation of the e1000 interface, QEMU evaluates
>> > if the receive ring buffer is full once the RDT equals the RDH,
>> > based on the assumption that guest drivers adhere to this
>> > criterion strictly.
>> >
>> > We applied the following log patch to assist in analyzing the
>> > issue and eventually obtained the unexpected information.
>> >
>> > Log patch:
>> > -----------------------------------------------------------------
>> > |--- a/hw/net/e1000.c
>> > |+++ b/hw/net/e1000.c
>> > |@@ -836,6 +836,9 @@ e1000_set_link_status(NetClientState *nc)
>> > | static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
>> > | {
>> > |     int bufs;
>> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
>> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH], s->mac_reg[RDT]);
>> > |+
>> > |     /* Fast-path short packets */
>> > |     if (total_size <= s->rxbuf_size) {
>> > |         if (s->mac_reg[RDH] == s->mac_reg[RDT] && s->last_overrun)
>> > |@@ -1022,6 +1025,9 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
>> > |         s->rxbuf_min_shift)
>> > |         n |= E1000_ICS_RXDMT0;
>> > |
>> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
>> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH], s->mac_reg[RDT]);
>> > |+
>> > -----------------------------------------------------------------
>> >
>> > The last few logs of information when the network is down:
>> >
>> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 897, s->mac_reg[RDT] = 885
>> > <- the receive ring buffer is checked for fullness in the
>> > e1000_has_rxbufs function, not full.
>> >
>> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
>> > <- RDT stays the same, RDH updates to 898, and 1 descriptor
>> > utilized after putting the packet to ring buffer.
>> >
>> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
>> > <- the receive ring buffer is checked for fullness in the
>> > e1000_has_rxbufs function, not full.
>> >
>> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
>> > <- RDT stays the same, RDH updates to 899, and 1 descriptor
>> > utilized after putting the packet to ring buffer.
>> >
>> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
>> > <- the receive ring buffer is checked for fullness in the
>> > e1000_has_rxbufs function, not full.
>> >
>> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 885
>> > <- RDT stays the same, RDH updates to 900 , and 1 descriptor
>> > utilized after putting the packet to ring buffer.
>> >
>> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 900
>> > <- The ring is full, according to e1000_has_rxbufs, because
>> > of the RDT update to 900 and equals RDH !
>>
>> Just to make sure I understand this, RDT==RDH means the ring is empty I think?
>>
>>
>> See commit:
>>
>> commit e5b8b0d4ba29fe1268ba049519a1b0cf8552a21a
>> Author: Dmitry Fleytman <dmitry@daynix.com>
>> Date:   Fri Oct 19 07:56:55 2012 +0200
>>
>>     e1000: drop check_rxov, always treat RX ring with RDH == RDT as empty
>>
>>     Real HW always treats RX ring with RDH == RDT as empty.
>>     Emulation is supposed to behave the same.
>
>
> Indeed, I'm confused :(,  the description in the comment claims that RX
> rings with RDH == RDT as empty, but in implementation, it treats that as
> overrun.
>
> See the following 2 contexts:
>
> 1. e1000_can_receive:
> static bool e1000_can_receive(NetClientState *nc)
> {
>     E1000State *s = qemu_get_nic_opaque(nc);
>     // e1000_has_rxbufs return true means ring buffer has
>     // available descriptors to use for QEMU.
>     // false means ring buffer overrun and QEMU should queue the packet
>     // and wait for the RDT update and available descriptors can be used.
>
>     return e1000x_rx_ready(&s->parent_obj, s->mac_reg) &&
>         e1000_has_rxbufs(s, 1) && !timer_pending(s->flush_queue_timer);
> }

Well we had in e1000_has_rx_bufs

    if (total_size <= s->rxbuf_size) {
        return s->mac_reg[RDH] != s->mac_reg[RDT];
    }

RDT!=RDH means RX ring has available descriptors for hardware?

Adding more people.

Thanks
Yong Huang July 9, 2024, 2:56 a.m. UTC | #5
On Tue, Jul 9, 2024 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:

> On Mon, Jul 8, 2024 at 1:17 PM Yong Huang <yong.huang@smartx.com> wrote:
> >
> >
> >
> > On Mon, Jul 8, 2024 at 11:21 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On Sat, Jul 6, 2024 at 4:30 AM Hyman Huang <yong.huang@smartx.com>
> wrote:
> >> >
> >> > Unexpected work by certain Windows guests equipped with the e1000
> >> > interface can cause the network to go down and never come back up
> >> > again unless the guest's interface is reset.
> >> >
> >> > To reproduce the failure:
> >> > 1. Set up two guests with a Windows 2016 or 2019 server operating
> >> >    system.
> >>
> >> I vaguely remember e1000 support for Windows has been deprecated for
> >> several years...
> >>
> >> That's why e1000e or igb is implemented in Qemu.
> >>
> >> > 2. Set up the e1000 interface for the guests.
> >> > 3. Pressurize the network slightly between two guests using the iPerf
> tool.
> >> >
> >> > The network goes down after a few days (2-5days), and the issue
> >> > is the result of not adhering to the e1000 specification. Refer
> >> > to the details of the specification at the following link:
> >> >
> https://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
> >> >
> >> > Chapter 3.2.6 describe the Receive Descriptor Tail register(RDT)
> >> > as following:
> >> > This register holds a value that is an offset from the base, and
> >> > identifies the location beyond the last descriptor hardware can
> >> > process. Note that tail should still point to an area in the
> >> > descriptor ring (somewhere between RDBA and RDBA + RDLEN).
> >> > This is because tail points to the location where software writes
> >> > the first new descriptor.
> >> >
> >> > This means that if the provider—in this case, QEMU—has not yet
> >> > loaded the packet,
> >>
> >> What do you mean by "load" here?
> >
> >
> > Sorry for failing to describe the details.
> >
> > The guest driver retrieves the packet from the receive ring buffer
> > after QEMU forwards it from the tun/tap interface in the e1000
> > emulation.
> >
> > I used "load" to express "putting packets into the receive ring buffer."
> >
> >>
> >>
> >> > RDT should never point to that place.
> >>
> >> And "that place"?
> >
> > If a descriptor in the receive ring buffer has not been filled with a
> > packet address by QEMU, the descriptor therefore doesn't have any
> > available packets. The location of the descriptor should not be referred
> > to by RDT because the location is in the range that "hardware" handles.
> >
> > "that place" means the location of the descriptor in the ring buffer
> > that QEMU hasn't set any available packets related to.
> >
> >>
> >>
> >> > When
> >> > implementing the emulation of the e1000 interface, QEMU evaluates
> >> > if the receive ring buffer is full once the RDT equals the RDH,
> >> > based on the assumption that guest drivers adhere to this
> >> > criterion strictly.
> >> >
> >> > We applied the following log patch to assist in analyzing the
> >> > issue and eventually obtained the unexpected information.
> >> >
> >> > Log patch:
> >> > -----------------------------------------------------------------
> >> > |--- a/hw/net/e1000.c
> >> > |+++ b/hw/net/e1000.c
> >> > |@@ -836,6 +836,9 @@ e1000_set_link_status(NetClientState *nc)
> >> > | static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
> >> > | {
> >> > |     int bufs;
> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u,
> s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH],
> s->mac_reg[RDT]);
> >> > |+
> >> > |     /* Fast-path short packets */
> >> > |     if (total_size <= s->rxbuf_size) {
> >> > |         if (s->mac_reg[RDH] == s->mac_reg[RDT] && s->last_overrun)
> >> > |@@ -1022,6 +1025,9 @@ e1000_receive_iov(NetClientState *nc, const
> struct iovec *iov, int iovcnt)
> >> > |         s->rxbuf_min_shift)
> >> > |         n |= E1000_ICS_RXDMT0;
> >> > |
> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u,
> s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH],
> s->mac_reg[RDT]);
> >> > |+
> >> > -----------------------------------------------------------------
> >> >
> >> > The last few logs of information when the network is down:
> >> >
> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 897, s->mac_reg[RDT] = 885
> >> > <- the receive ring buffer is checked for fullness in the
> >> > e1000_has_rxbufs function, not full.
> >> >
> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
> >> > <- RDT stays the same, RDH updates to 898, and 1 descriptor
> >> > utilized after putting the packet to ring buffer.
> >> >
> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
> >> > <- the receive ring buffer is checked for fullness in the
> >> > e1000_has_rxbufs function, not full.
> >> >
> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
> >> > <- RDT stays the same, RDH updates to 899, and 1 descriptor
> >> > utilized after putting the packet to ring buffer.
> >> >
> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
> >> > <- the receive ring buffer is checked for fullness in the
> >> > e1000_has_rxbufs function, not full.
> >> >
> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 885
> >> > <- RDT stays the same, RDH updates to 900 , and 1 descriptor
> >> > utilized after putting the packet to ring buffer.
> >> >
> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384,
> s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 900
> >> > <- The ring is full, according to e1000_has_rxbufs, because
> >> > of the RDT update to 900 and equals RDH !
> >>
> >> Just to make sure I understand this, RDT==RDH means the ring is empty I
> think?
> >>
> >>
> >> See commit:
> >>
> >> commit e5b8b0d4ba29fe1268ba049519a1b0cf8552a21a
> >> Author: Dmitry Fleytman <dmitry@daynix.com>
> >> Date:   Fri Oct 19 07:56:55 2012 +0200
> >>
> >>     e1000: drop check_rxov, always treat RX ring with RDH == RDT as
> empty
> >>
> >>     Real HW always treats RX ring with RDH == RDT as empty.
> >>     Emulation is supposed to behave the same.
> >
> >
> > Indeed, I'm confused :(,  the description in the comment claims that RX
> > rings with RDH == RDT as empty, but in implementation, it treats that as
> > overrun.
> >
> > See the following 2 contexts:
> >
> > 1. e1000_can_receive:
> > static bool e1000_can_receive(NetClientState *nc)
> > {
> >     E1000State *s = qemu_get_nic_opaque(nc);
> >     // e1000_has_rxbufs return true means ring buffer has
> >     // available descriptors to use for QEMU.
> >     // false means ring buffer overrun and QEMU should queue the packet
> >     // and wait for the RDT update and available descriptors can be used.
> >
> >     return e1000x_rx_ready(&s->parent_obj, s->mac_reg) &&
> >         e1000_has_rxbufs(s, 1) && !timer_pending(s->flush_queue_timer);
> > }
>
> Well we had in e1000_has_rx_bufs
>
>     if (total_size <= s->rxbuf_size) {
>         return s->mac_reg[RDH] != s->mac_reg[RDT];
>     }
>
> RDT!=RDH means RX ring has available descriptors for hardware?
>

IMHO, Yes.


> Adding more people.
>
> Thanks
>
>
Jason Wang July 10, 2024, 3:43 a.m. UTC | #6
On Tue, Jul 9, 2024 at 10:56 AM Yong Huang <yong.huang@smartx.com> wrote:
>
>
>
> On Tue, Jul 9, 2024 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On Mon, Jul 8, 2024 at 1:17 PM Yong Huang <yong.huang@smartx.com> wrote:
>> >
>> >
>> >
>> > On Mon, Jul 8, 2024 at 11:21 AM Jason Wang <jasowang@redhat.com> wrote:
>> >>
>> >> On Sat, Jul 6, 2024 at 4:30 AM Hyman Huang <yong.huang@smartx.com> wrote:
>> >> >
>> >> > Unexpected work by certain Windows guests equipped with the e1000
>> >> > interface can cause the network to go down and never come back up
>> >> > again unless the guest's interface is reset.
>> >> >
>> >> > To reproduce the failure:
>> >> > 1. Set up two guests with a Windows 2016 or 2019 server operating
>> >> >    system.
>> >>
>> >> I vaguely remember e1000 support for Windows has been deprecated for
>> >> several years...
>> >>
>> >> That's why e1000e or igb is implemented in Qemu.
>> >>
>> >> > 2. Set up the e1000 interface for the guests.
>> >> > 3. Pressurize the network slightly between two guests using the iPerf tool.
>> >> >
>> >> > The network goes down after a few days (2-5days), and the issue
>> >> > is the result of not adhering to the e1000 specification. Refer
>> >> > to the details of the specification at the following link:
>> >> > https://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
>> >> >
>> >> > Chapter 3.2.6 describe the Receive Descriptor Tail register(RDT)
>> >> > as following:
>> >> > This register holds a value that is an offset from the base, and
>> >> > identifies the location beyond the last descriptor hardware can
>> >> > process. Note that tail should still point to an area in the
>> >> > descriptor ring (somewhere between RDBA and RDBA + RDLEN).
>> >> > This is because tail points to the location where software writes
>> >> > the first new descriptor.
>> >> >
>> >> > This means that if the provider—in this case, QEMU—has not yet
>> >> > loaded the packet,
>> >>
>> >> What do you mean by "load" here?
>> >
>> >
>> > Sorry for failing to describe the details.
>> >
>> > The guest driver retrieves the packet from the receive ring buffer
>> > after QEMU forwards it from the tun/tap interface in the e1000
>> > emulation.
>> >
>> > I used "load" to express "putting packets into the receive ring buffer."
>> >
>> >>
>> >>
>> >> > RDT should never point to that place.
>> >>
>> >> And "that place"?
>> >
>> > If a descriptor in the receive ring buffer has not been filled with a
>> > packet address by QEMU, the descriptor therefore doesn't have any
>> > available packets. The location of the descriptor should not be referred
>> > to by RDT because the location is in the range that "hardware" handles.
>> >
>> > "that place" means the location of the descriptor in the ring buffer
>> > that QEMU hasn't set any available packets related to.
>> >
>> >>
>> >>
>> >> > When
>> >> > implementing the emulation of the e1000 interface, QEMU evaluates
>> >> > if the receive ring buffer is full once the RDT equals the RDH,
>> >> > based on the assumption that guest drivers adhere to this
>> >> > criterion strictly.
>> >> >
>> >> > We applied the following log patch to assist in analyzing the
>> >> > issue and eventually obtained the unexpected information.
>> >> >
>> >> > Log patch:
>> >> > -----------------------------------------------------------------
>> >> > |--- a/hw/net/e1000.c
>> >> > |+++ b/hw/net/e1000.c
>> >> > |@@ -836,6 +836,9 @@ e1000_set_link_status(NetClientState *nc)
>> >> > | static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
>> >> > | {
>> >> > |     int bufs;
>> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
>> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH], s->mac_reg[RDT]);
>> >> > |+
>> >> > |     /* Fast-path short packets */
>> >> > |     if (total_size <= s->rxbuf_size) {
>> >> > |         if (s->mac_reg[RDH] == s->mac_reg[RDT] && s->last_overrun)
>> >> > |@@ -1022,6 +1025,9 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
>> >> > |         s->rxbuf_min_shift)
>> >> > |         n |= E1000_ICS_RXDMT0;
>> >> > |
>> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
>> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH], s->mac_reg[RDT]);
>> >> > |+
>> >> > -----------------------------------------------------------------
>> >> >
>> >> > The last few logs of information when the network is down:
>> >> >
>> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 897, s->mac_reg[RDT] = 885
>> >> > <- the receive ring buffer is checked for fullness in the
>> >> > e1000_has_rxbufs function, not full.
>> >> >
>> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
>> >> > <- RDT stays the same, RDH updates to 898, and 1 descriptor
>> >> > utilized after putting the packet to ring buffer.
>> >> >
>> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
>> >> > <- the receive ring buffer is checked for fullness in the
>> >> > e1000_has_rxbufs function, not full.
>> >> >
>> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
>> >> > <- RDT stays the same, RDH updates to 899, and 1 descriptor
>> >> > utilized after putting the packet to ring buffer.
>> >> >
>> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
>> >> > <- the receive ring buffer is checked for fullness in the
>> >> > e1000_has_rxbufs function, not full.
>> >> >
>> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 885
>> >> > <- RDT stays the same, RDH updates to 900 , and 1 descriptor
>> >> > utilized after putting the packet to ring buffer.
>> >> >
>> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 900
>> >> > <- The ring is full, according to e1000_has_rxbufs, because
>> >> > of the RDT update to 900 and equals RDH !
>> >>
>> >> Just to make sure I understand this, RDT==RDH means the ring is empty I think?
>> >>
>> >>
>> >> See commit:
>> >>
>> >> commit e5b8b0d4ba29fe1268ba049519a1b0cf8552a21a
>> >> Author: Dmitry Fleytman <dmitry@daynix.com>
>> >> Date:   Fri Oct 19 07:56:55 2012 +0200
>> >>
>> >>     e1000: drop check_rxov, always treat RX ring with RDH == RDT as empty
>> >>
>> >>     Real HW always treats RX ring with RDH == RDT as empty.
>> >>     Emulation is supposed to behave the same.
>> >
>> >
>> > Indeed, I'm confused :(,  the description in the comment claims that RX
>> > rings with RDH == RDT as empty, but in implementation, it treats that as
>> > overrun.
>> >
>> > See the following 2 contexts:
>> >
>> > 1. e1000_can_receive:
>> > static bool e1000_can_receive(NetClientState *nc)
>> > {
>> >     E1000State *s = qemu_get_nic_opaque(nc);
>> >     // e1000_has_rxbufs return true means ring buffer has
>> >     // available descriptors to use for QEMU.
>> >     // false means ring buffer overrun and QEMU should queue the packet
>> >     // and wait for the RDT update and available descriptors can be used.
>> >
>> >     return e1000x_rx_ready(&s->parent_obj, s->mac_reg) &&
>> >         e1000_has_rxbufs(s, 1) && !timer_pending(s->flush_queue_timer);
>> > }
>>
>> Well we had in e1000_has_rx_bufs
>>
>>     if (total_size <= s->rxbuf_size) {
>>         return s->mac_reg[RDH] != s->mac_reg[RDT];
>>     }
>>
>> RDT!=RDH means RX ring has available descriptors for hardware?
>
>
> IMHO, Yes.

Just to make sure we are on the same page, so

RDT!=RDH, descriptors available for hardware
RDT==RDH, descriptor ring is empty for hardware

That is currently what the code did. Seems nothing wrong, or anything
I missed here?

Thanks

>
>>
>> Adding more people.
>>
>> Thanks
>>
>
>
> --
> Best regards
Yong Huang July 10, 2024, 6:26 a.m. UTC | #7
On Wed, Jul 10, 2024 at 11:44 AM Jason Wang <jasowang@redhat.com> wrote:

> On Tue, Jul 9, 2024 at 10:56 AM Yong Huang <yong.huang@smartx.com> wrote:
> >
> >
> >
> > On Tue, Jul 9, 2024 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On Mon, Jul 8, 2024 at 1:17 PM Yong Huang <yong.huang@smartx.com>
> wrote:
> >> >
> >> >
> >> >
> >> > On Mon, Jul 8, 2024 at 11:21 AM Jason Wang <jasowang@redhat.com>
> wrote:
> >> >>
> >> >> On Sat, Jul 6, 2024 at 4:30 AM Hyman Huang <yong.huang@smartx.com>
> wrote:
> >> >> >
> >> >> > Unexpected work by certain Windows guests equipped with the e1000
> >> >> > interface can cause the network to go down and never come back up
> >> >> > again unless the guest's interface is reset.
> >> >> >
> >> >> > To reproduce the failure:
> >> >> > 1. Set up two guests with a Windows 2016 or 2019 server operating
> >> >> >    system.
> >> >>
> >> >> I vaguely remember e1000 support for Windows has been deprecated for
> >> >> several years...
> >> >>
> >> >> That's why e1000e or igb is implemented in Qemu.
> >> >>
> >> >> > 2. Set up the e1000 interface for the guests.
> >> >> > 3. Pressurize the network slightly between two guests using the
> iPerf tool.
> >> >> >
> >> >> > The network goes down after a few days (2-5days), and the issue
> >> >> > is the result of not adhering to the e1000 specification. Refer
> >> >> > to the details of the specification at the following link:
> >> >> >
> https://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
> >> >> >
> >> >> > Chapter 3.2.6 describe the Receive Descriptor Tail register(RDT)
> >> >> > as following:
> >> >> > This register holds a value that is an offset from the base, and
> >> >> > identifies the location beyond the last descriptor hardware can
> >> >> > process. Note that tail should still point to an area in the
> >> >> > descriptor ring (somewhere between RDBA and RDBA + RDLEN).
> >> >> > This is because tail points to the location where software writes
> >> >> > the first new descriptor.
> >> >> >
> >> >> > This means that if the provider—in this case, QEMU—has not yet
> >> >> > loaded the packet,
> >> >>
> >> >> What do you mean by "load" here?
> >> >
> >> >
> >> > Sorry for failing to describe the details.
> >> >
> >> > The guest driver retrieves the packet from the receive ring buffer
> >> > after QEMU forwards it from the tun/tap interface in the e1000
> >> > emulation.
> >> >
> >> > I used "load" to express "putting packets into the receive ring
> buffer."
> >> >
> >> >>
> >> >>
> >> >> > RDT should never point to that place.
> >> >>
> >> >> And "that place"?
> >> >
> >> > If a descriptor in the receive ring buffer has not been filled with a
> >> > packet address by QEMU, the descriptor therefore doesn't have any
> >> > available packets. The location of the descriptor should not be
> referred
> >> > to by RDT because the location is in the range that "hardware"
> handles.
> >> >
> >> > "that place" means the location of the descriptor in the ring buffer
> >> > that QEMU hasn't set any available packets related to.
> >> >
> >> >>
> >> >>
> >> >> > When
> >> >> > implementing the emulation of the e1000 interface, QEMU evaluates
> >> >> > if the receive ring buffer is full once the RDT equals the RDH,
> >> >> > based on the assumption that guest drivers adhere to this
> >> >> > criterion strictly.
> >> >> >
> >> >> > We applied the following log patch to assist in analyzing the
> >> >> > issue and eventually obtained the unexpected information.
> >> >> >
> >> >> > Log patch:
> >> >> > -----------------------------------------------------------------
> >> >> > |--- a/hw/net/e1000.c
> >> >> > |+++ b/hw/net/e1000.c
> >> >> > |@@ -836,6 +836,9 @@ e1000_set_link_status(NetClientState *nc)
> >> >> > | static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
> >> >> > | {
> >> >> > |     int bufs;
> >> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u,
> s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
> >> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH],
> s->mac_reg[RDT]);
> >> >> > |+
> >> >> > |     /* Fast-path short packets */
> >> >> > |     if (total_size <= s->rxbuf_size) {
> >> >> > |         if (s->mac_reg[RDH] == s->mac_reg[RDT] &&
> s->last_overrun)
> >> >> > |@@ -1022,6 +1025,9 @@ e1000_receive_iov(NetClientState *nc, const
> struct iovec *iov, int iovcnt)
> >> >> > |         s->rxbuf_min_shift)
> >> >> > |         n |= E1000_ICS_RXDMT0;
> >> >> > |
> >> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u,
> s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
> >> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH],
> s->mac_reg[RDT]);
> >> >> > |+
> >> >> > -----------------------------------------------------------------
> >> >> >
> >> >> > The last few logs of information when the network is down:
> >> >> >
> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] =
> 16384, s->mac_reg[RDH] = 897, s->mac_reg[RDT] = 885
> >> >> > <- the receive ring buffer is checked for fullness in the
> >> >> > e1000_has_rxbufs function, not full.
> >> >> >
> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] =
> 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
> >> >> > <- RDT stays the same, RDH updates to 898, and 1 descriptor
> >> >> > utilized after putting the packet to ring buffer.
> >> >> >
> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] =
> 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
> >> >> > <- the receive ring buffer is checked for fullness in the
> >> >> > e1000_has_rxbufs function, not full.
> >> >> >
> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] =
> 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
> >> >> > <- RDT stays the same, RDH updates to 899, and 1 descriptor
> >> >> > utilized after putting the packet to ring buffer.
> >> >> >
> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] =
> 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
> >> >> > <- the receive ring buffer is checked for fullness in the
> >> >> > e1000_has_rxbufs function, not full.
> >> >> >
> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] =
> 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 885
> >> >> > <- RDT stays the same, RDH updates to 900 , and 1 descriptor
> >> >> > utilized after putting the packet to ring buffer.
> >> >> >
> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] =
> 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 900
> >> >> > <- The ring is full, according to e1000_has_rxbufs, because
> >> >> > of the RDT update to 900 and equals RDH !
> >> >>
> >> >> Just to make sure I understand this, RDT==RDH means the ring is
> empty I think?
> >> >>
> >> >>
> >> >> See commit:
> >> >>
> >> >> commit e5b8b0d4ba29fe1268ba049519a1b0cf8552a21a
> >> >> Author: Dmitry Fleytman <dmitry@daynix.com>
> >> >> Date:   Fri Oct 19 07:56:55 2012 +0200
> >> >>
> >> >>     e1000: drop check_rxov, always treat RX ring with RDH == RDT as
> empty
> >> >>
> >> >>     Real HW always treats RX ring with RDH == RDT as empty.
> >> >>     Emulation is supposed to behave the same.
> >> >
> >> >
> >> > Indeed, I'm confused :(,  the description in the comment claims that
> RX
> >> > rings with RDH == RDT as empty, but in implementation, it treats that
> as
> >> > overrun.
> >> >
> >> > See the following 2 contexts:
> >> >
> >> > 1. e1000_can_receive:
> >> > static bool e1000_can_receive(NetClientState *nc)
> >> > {
> >> >     E1000State *s = qemu_get_nic_opaque(nc);
> >> >     // e1000_has_rxbufs return true means ring buffer has
> >> >     // available descriptors to use for QEMU.
> >> >     // false means ring buffer overrun and QEMU should queue the
> packet
> >> >     // and wait for the RDT update and available descriptors can be
> used.
> >> >
> >> >     return e1000x_rx_ready(&s->parent_obj, s->mac_reg) &&
> >> >         e1000_has_rxbufs(s, 1) &&
> !timer_pending(s->flush_queue_timer);
> >> > }
> >>
> >> Well we had in e1000_has_rx_bufs
> >>
> >>     if (total_size <= s->rxbuf_size) {
> >>         return s->mac_reg[RDH] != s->mac_reg[RDT];
> >>     }
> >>
> >> RDT!=RDH means RX ring has available descriptors for hardware?
> >
> >
> > IMHO, Yes.
>
> Just to make sure we are on the same page, so
>
> RDT!=RDH, descriptors available for hardware
> RDT==RDH, descriptor ring is empty for hardware


> That is currently what the code did. Seems nothing wrong, or anything
> I missed here?
>

There are two cases for RDT == RDH.

1. Hardware has filled all available descriptors and overrun.
   In this case, hardware cannot add any new packets to the ring.

2. Software has consumed all descriptors, and all the descriptors
    on the ring can be used by hardware. (Let's name this case "empty.")
   In this case, hardware should keep putting new packets to the ring

But at the moment, the logic of e1000_has_rx_bufs acts exactly like it was
the first case, unable to differentiate between the two scenarios.


>
> Thanks
>
> >
> >>
> >> Adding more people.
> >>
> >> Thanks
> >>
> >
> >
> > --
> > Best regards
>
>
Yong
Jason Wang July 10, 2024, 7:36 a.m. UTC | #8
On Wed, Jul 10, 2024 at 2:26 PM Yong Huang <yong.huang@smartx.com> wrote:
>
>
>
> On Wed, Jul 10, 2024 at 11:44 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On Tue, Jul 9, 2024 at 10:56 AM Yong Huang <yong.huang@smartx.com> wrote:
>> >
>> >
>> >
>> > On Tue, Jul 9, 2024 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:
>> >>
>> >> On Mon, Jul 8, 2024 at 1:17 PM Yong Huang <yong.huang@smartx.com> wrote:
>> >> >
>> >> >
>> >> >
>> >> > On Mon, Jul 8, 2024 at 11:21 AM Jason Wang <jasowang@redhat.com> wrote:
>> >> >>
>> >> >> On Sat, Jul 6, 2024 at 4:30 AM Hyman Huang <yong.huang@smartx.com> wrote:
>> >> >> >
>> >> >> > Unexpected work by certain Windows guests equipped with the e1000
>> >> >> > interface can cause the network to go down and never come back up
>> >> >> > again unless the guest's interface is reset.
>> >> >> >
>> >> >> > To reproduce the failure:
>> >> >> > 1. Set up two guests with a Windows 2016 or 2019 server operating
>> >> >> >    system.
>> >> >>
>> >> >> I vaguely remember e1000 support for Windows has been deprecated for
>> >> >> several years...
>> >> >>
>> >> >> That's why e1000e or igb is implemented in Qemu.
>> >> >>
>> >> >> > 2. Set up the e1000 interface for the guests.
>> >> >> > 3. Pressurize the network slightly between two guests using the iPerf tool.
>> >> >> >
>> >> >> > The network goes down after a few days (2-5days), and the issue
>> >> >> > is the result of not adhering to the e1000 specification. Refer
>> >> >> > to the details of the specification at the following link:
>> >> >> > https://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
>> >> >> >
>> >> >> > Chapter 3.2.6 describe the Receive Descriptor Tail register(RDT)
>> >> >> > as following:
>> >> >> > This register holds a value that is an offset from the base, and
>> >> >> > identifies the location beyond the last descriptor hardware can
>> >> >> > process. Note that tail should still point to an area in the
>> >> >> > descriptor ring (somewhere between RDBA and RDBA + RDLEN).
>> >> >> > This is because tail points to the location where software writes
>> >> >> > the first new descriptor.
>> >> >> >
>> >> >> > This means that if the provider—in this case, QEMU—has not yet
>> >> >> > loaded the packet,
>> >> >>
>> >> >> What do you mean by "load" here?
>> >> >
>> >> >
>> >> > Sorry for failing to describe the details.
>> >> >
>> >> > The guest driver retrieves the packet from the receive ring buffer
>> >> > after QEMU forwards it from the tun/tap interface in the e1000
>> >> > emulation.
>> >> >
>> >> > I used "load" to express "putting packets into the receive ring buffer."
>> >> >
>> >> >>
>> >> >>
>> >> >> > RDT should never point to that place.
>> >> >>
>> >> >> And "that place"?
>> >> >
>> >> > If a descriptor in the receive ring buffer has not been filled with a
>> >> > packet address by QEMU, the descriptor therefore doesn't have any
>> >> > available packets. The location of the descriptor should not be referred
>> >> > to by RDT because the location is in the range that "hardware" handles.
>> >> >
>> >> > "that place" means the location of the descriptor in the ring buffer
>> >> > that QEMU hasn't set any available packets related to.
>> >> >
>> >> >>
>> >> >>
>> >> >> > When
>> >> >> > implementing the emulation of the e1000 interface, QEMU evaluates
>> >> >> > if the receive ring buffer is full once the RDT equals the RDH,
>> >> >> > based on the assumption that guest drivers adhere to this
>> >> >> > criterion strictly.
>> >> >> >
>> >> >> > We applied the following log patch to assist in analyzing the
>> >> >> > issue and eventually obtained the unexpected information.
>> >> >> >
>> >> >> > Log patch:
>> >> >> > -----------------------------------------------------------------
>> >> >> > |--- a/hw/net/e1000.c
>> >> >> > |+++ b/hw/net/e1000.c
>> >> >> > |@@ -836,6 +836,9 @@ e1000_set_link_status(NetClientState *nc)
>> >> >> > | static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
>> >> >> > | {
>> >> >> > |     int bufs;
>> >> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
>> >> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH], s->mac_reg[RDT]);
>> >> >> > |+
>> >> >> > |     /* Fast-path short packets */
>> >> >> > |     if (total_size <= s->rxbuf_size) {
>> >> >> > |         if (s->mac_reg[RDH] == s->mac_reg[RDT] && s->last_overrun)
>> >> >> > |@@ -1022,6 +1025,9 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
>> >> >> > |         s->rxbuf_min_shift)
>> >> >> > |         n |= E1000_ICS_RXDMT0;
>> >> >> > |
>> >> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
>> >> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH], s->mac_reg[RDT]);
>> >> >> > |+
>> >> >> > -----------------------------------------------------------------
>> >> >> >
>> >> >> > The last few logs of information when the network is down:
>> >> >> >
>> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 897, s->mac_reg[RDT] = 885
>> >> >> > <- the receive ring buffer is checked for fullness in the
>> >> >> > e1000_has_rxbufs function, not full.
>> >> >> >
>> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
>> >> >> > <- RDT stays the same, RDH updates to 898, and 1 descriptor
>> >> >> > utilized after putting the packet to ring buffer.
>> >> >> >
>> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
>> >> >> > <- the receive ring buffer is checked for fullness in the
>> >> >> > e1000_has_rxbufs function, not full.
>> >> >> >
>> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
>> >> >> > <- RDT stays the same, RDH updates to 899, and 1 descriptor
>> >> >> > utilized after putting the packet to ring buffer.
>> >> >> >
>> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
>> >> >> > <- the receive ring buffer is checked for fullness in the
>> >> >> > e1000_has_rxbufs function, not full.
>> >> >> >
>> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 885
>> >> >> > <- RDT stays the same, RDH updates to 900 , and 1 descriptor
>> >> >> > utilized after putting the packet to ring buffer.
>> >> >> >
>> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 900
>> >> >> > <- The ring is full, according to e1000_has_rxbufs, because
>> >> >> > of the RDT update to 900 and equals RDH !
>> >> >>
>> >> >> Just to make sure I understand this, RDT==RDH means the ring is empty I think?
>> >> >>
>> >> >>
>> >> >> See commit:
>> >> >>
>> >> >> commit e5b8b0d4ba29fe1268ba049519a1b0cf8552a21a
>> >> >> Author: Dmitry Fleytman <dmitry@daynix.com>
>> >> >> Date:   Fri Oct 19 07:56:55 2012 +0200
>> >> >>
>> >> >>     e1000: drop check_rxov, always treat RX ring with RDH == RDT as empty
>> >> >>
>> >> >>     Real HW always treats RX ring with RDH == RDT as empty.
>> >> >>     Emulation is supposed to behave the same.
>> >> >
>> >> >
>> >> > Indeed, I'm confused :(,  the description in the comment claims that RX
>> >> > rings with RDH == RDT as empty, but in implementation, it treats that as
>> >> > overrun.
>> >> >
>> >> > See the following 2 contexts:
>> >> >
>> >> > 1. e1000_can_receive:
>> >> > static bool e1000_can_receive(NetClientState *nc)
>> >> > {
>> >> >     E1000State *s = qemu_get_nic_opaque(nc);
>> >> >     // e1000_has_rxbufs return true means ring buffer has
>> >> >     // available descriptors to use for QEMU.
>> >> >     // false means ring buffer overrun and QEMU should queue the packet
>> >> >     // and wait for the RDT update and available descriptors can be used.
>> >> >
>> >> >     return e1000x_rx_ready(&s->parent_obj, s->mac_reg) &&
>> >> >         e1000_has_rxbufs(s, 1) && !timer_pending(s->flush_queue_timer);
>> >> > }
>> >>
>> >> Well we had in e1000_has_rx_bufs
>> >>
>> >>     if (total_size <= s->rxbuf_size) {
>> >>         return s->mac_reg[RDH] != s->mac_reg[RDT];
>> >>     }
>> >>
>> >> RDT!=RDH means RX ring has available descriptors for hardware?
>> >
>> >
>> > IMHO, Yes.
>>
>> Just to make sure we are on the same page, so
>>
>> RDT!=RDH, descriptors available for hardware
>> RDT==RDH, descriptor ring is empty for hardware
>>
>>
>> That is currently what the code did. Seems nothing wrong, or anything
>> I missed here?
>
>
> There are two cases for RDT == RDH.
>
> 1. Hardware has filled all available descriptors and overrun.
>    In this case, hardware cannot add any new packets to the ring.
>
> 2. Software has consumed all descriptors, and all the descriptors
>     on the ring can be used by hardware. (Let's name this case "empty.")
>    In this case, hardware should keep putting new packets to the ring

Well this seems not what spec said. See Figure 3-2, when RDT==RDH,
nothing is owned by hardware. And this is what Dmitry said in the
commit mentioned above.

Which version of the driver did you use in the guest? (Or have you
tried to download the one from Intel website) I'm asking since e1000
support has been deprecated by Microsoft for years.

Thanks

>
> But at the moment, the logic of e1000_has_rx_bufs acts exactly like it was
> the first case, unable to differentiate between the two scenarios.
>
>>
>>
>> Thanks
>>
>> >
>> >>
>> >> Adding more people.
>> >>
>> >> Thanks
>> >>
>> >
>> >
>> > --
>> > Best regards
>>
>
> Yong
>
> --
> Best regards
Yong Huang July 10, 2024, 9:04 a.m. UTC | #9
On Wed, Jul 10, 2024 at 3:36 PM Jason Wang <jasowang@redhat.com> wrote:

> On Wed, Jul 10, 2024 at 2:26 PM Yong Huang <yong.huang@smartx.com> wrote:
> >
> >
> >
> > On Wed, Jul 10, 2024 at 11:44 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On Tue, Jul 9, 2024 at 10:56 AM Yong Huang <yong.huang@smartx.com>
> wrote:
> >> >
> >> >
> >> >
> >> > On Tue, Jul 9, 2024 at 10:41 AM Jason Wang <jasowang@redhat.com>
> wrote:
> >> >>
> >> >> On Mon, Jul 8, 2024 at 1:17 PM Yong Huang <yong.huang@smartx.com>
> wrote:
> >> >> >
> >> >> >
> >> >> >
> >> >> > On Mon, Jul 8, 2024 at 11:21 AM Jason Wang <jasowang@redhat.com>
> wrote:
> >> >> >>
> >> >> >> On Sat, Jul 6, 2024 at 4:30 AM Hyman Huang <yong.huang@smartx.com>
> wrote:
> >> >> >> >
> >> >> >> > Unexpected work by certain Windows guests equipped with the
> e1000
> >> >> >> > interface can cause the network to go down and never come back
> up
> >> >> >> > again unless the guest's interface is reset.
> >> >> >> >
> >> >> >> > To reproduce the failure:
> >> >> >> > 1. Set up two guests with a Windows 2016 or 2019 server
> operating
> >> >> >> >    system.
> >> >> >>
> >> >> >> I vaguely remember e1000 support for Windows has been deprecated
> for
> >> >> >> several years...
> >> >> >>
> >> >> >> That's why e1000e or igb is implemented in Qemu.
> >> >> >>
> >> >> >> > 2. Set up the e1000 interface for the guests.
> >> >> >> > 3. Pressurize the network slightly between two guests using the
> iPerf tool.
> >> >> >> >
> >> >> >> > The network goes down after a few days (2-5days), and the issue
> >> >> >> > is the result of not adhering to the e1000 specification. Refer
> >> >> >> > to the details of the specification at the following link:
> >> >> >> >
> https://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
> >> >> >> >
> >> >> >> > Chapter 3.2.6 describe the Receive Descriptor Tail register(RDT)
> >> >> >> > as following:
> >> >> >> > This register holds a value that is an offset from the base, and
> >> >> >> > identifies the location beyond the last descriptor hardware can
> >> >> >> > process. Note that tail should still point to an area in the
> >> >> >> > descriptor ring (somewhere between RDBA and RDBA + RDLEN).
> >> >> >> > This is because tail points to the location where software
> writes
> >> >> >> > the first new descriptor.
> >> >> >> >
> >> >> >> > This means that if the provider—in this case, QEMU—has not yet
> >> >> >> > loaded the packet,
> >> >> >>
> >> >> >> What do you mean by "load" here?
> >> >> >
> >> >> >
> >> >> > Sorry for failing to describe the details.
> >> >> >
> >> >> > The guest driver retrieves the packet from the receive ring buffer
> >> >> > after QEMU forwards it from the tun/tap interface in the e1000
> >> >> > emulation.
> >> >> >
> >> >> > I used "load" to express "putting packets into the receive ring
> buffer."
> >> >> >
> >> >> >>
> >> >> >>
> >> >> >> > RDT should never point to that place.
> >> >> >>
> >> >> >> And "that place"?
> >> >> >
> >> >> > If a descriptor in the receive ring buffer has not been filled
> with a
> >> >> > packet address by QEMU, the descriptor therefore doesn't have any
> >> >> > available packets. The location of the descriptor should not be
> referred
> >> >> > to by RDT because the location is in the range that "hardware"
> handles.
> >> >> >
> >> >> > "that place" means the location of the descriptor in the ring
> buffer
> >> >> > that QEMU hasn't set any available packets related to.
> >> >> >
> >> >> >>
> >> >> >>
> >> >> >> > When
> >> >> >> > implementing the emulation of the e1000 interface, QEMU
> evaluates
> >> >> >> > if the receive ring buffer is full once the RDT equals the RDH,
> >> >> >> > based on the assumption that guest drivers adhere to this
> >> >> >> > criterion strictly.
> >> >> >> >
> >> >> >> > We applied the following log patch to assist in analyzing the
> >> >> >> > issue and eventually obtained the unexpected information.
> >> >> >> >
> >> >> >> > Log patch:
> >> >> >> >
> -----------------------------------------------------------------
> >> >> >> > |--- a/hw/net/e1000.c
> >> >> >> > |+++ b/hw/net/e1000.c
> >> >> >> > |@@ -836,6 +836,9 @@ e1000_set_link_status(NetClientState *nc)
> >> >> >> > | static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
> >> >> >> > | {
> >> >> >> > |     int bufs;
> >> >> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u,
> s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
> >> >> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH],
> s->mac_reg[RDT]);
> >> >> >> > |+
> >> >> >> > |     /* Fast-path short packets */
> >> >> >> > |     if (total_size <= s->rxbuf_size) {
> >> >> >> > |         if (s->mac_reg[RDH] == s->mac_reg[RDT] &&
> s->last_overrun)
> >> >> >> > |@@ -1022,6 +1025,9 @@ e1000_receive_iov(NetClientState *nc,
> const struct iovec *iov, int iovcnt)
> >> >> >> > |         s->rxbuf_min_shift)
> >> >> >> > |         n |= E1000_ICS_RXDMT0;
> >> >> >> > |
> >> >> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u,
> s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
> >> >> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH],
> s->mac_reg[RDT]);
> >> >> >> > |+
> >> >> >> >
> -----------------------------------------------------------------
> >> >> >> >
> >> >> >> > The last few logs of information when the network is down:
> >> >> >> >
> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] =
> 16384, s->mac_reg[RDH] = 897, s->mac_reg[RDT] = 885
> >> >> >> > <- the receive ring buffer is checked for fullness in the
> >> >> >> > e1000_has_rxbufs function, not full.
> >> >> >> >
> >> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] =
> 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
> >> >> >> > <- RDT stays the same, RDH updates to 898, and 1 descriptor
> >> >> >> > utilized after putting the packet to ring buffer.
> >> >> >> >
> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] =
> 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
> >> >> >> > <- the receive ring buffer is checked for fullness in the
> >> >> >> > e1000_has_rxbufs function, not full.
> >> >> >> >
> >> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] =
> 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
> >> >> >> > <- RDT stays the same, RDH updates to 899, and 1 descriptor
> >> >> >> > utilized after putting the packet to ring buffer.
> >> >> >> >
> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] =
> 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
> >> >> >> > <- the receive ring buffer is checked for fullness in the
> >> >> >> > e1000_has_rxbufs function, not full.
> >> >> >> >
> >> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] =
> 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 885
> >> >> >> > <- RDT stays the same, RDH updates to 900 , and 1 descriptor
> >> >> >> > utilized after putting the packet to ring buffer.
> >> >> >> >
> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] =
> 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 900
> >> >> >> > <- The ring is full, according to e1000_has_rxbufs, because
> >> >> >> > of the RDT update to 900 and equals RDH !
> >> >> >>
> >> >> >> Just to make sure I understand this, RDT==RDH means the ring is
> empty I think?
> >> >> >>
> >> >> >>
> >> >> >> See commit:
> >> >> >>
> >> >> >> commit e5b8b0d4ba29fe1268ba049519a1b0cf8552a21a
> >> >> >> Author: Dmitry Fleytman <dmitry@daynix.com>
> >> >> >> Date:   Fri Oct 19 07:56:55 2012 +0200
> >> >> >>
> >> >> >>     e1000: drop check_rxov, always treat RX ring with RDH == RDT
> as empty
> >> >> >>
> >> >> >>     Real HW always treats RX ring with RDH == RDT as empty.
> >> >> >>     Emulation is supposed to behave the same.
> >> >> >
> >> >> >
> >> >> > Indeed, I'm confused :(,  the description in the comment claims
> that RX
> >> >> > rings with RDH == RDT as empty, but in implementation, it treats
> that as
> >> >> > overrun.
> >> >> >
> >> >> > See the following 2 contexts:
> >> >> >
> >> >> > 1. e1000_can_receive:
> >> >> > static bool e1000_can_receive(NetClientState *nc)
> >> >> > {
> >> >> >     E1000State *s = qemu_get_nic_opaque(nc);
> >> >> >     // e1000_has_rxbufs return true means ring buffer has
> >> >> >     // available descriptors to use for QEMU.
> >> >> >     // false means ring buffer overrun and QEMU should queue the
> packet
> >> >> >     // and wait for the RDT update and available descriptors can
> be used.
> >> >> >
> >> >> >     return e1000x_rx_ready(&s->parent_obj, s->mac_reg) &&
> >> >> >         e1000_has_rxbufs(s, 1) &&
> !timer_pending(s->flush_queue_timer);
> >> >> > }
> >> >>
> >> >> Well we had in e1000_has_rx_bufs
> >> >>
> >> >>     if (total_size <= s->rxbuf_size) {
> >> >>         return s->mac_reg[RDH] != s->mac_reg[RDT];
> >> >>     }
> >> >>
> >> >> RDT!=RDH means RX ring has available descriptors for hardware?
> >> >
> >> >
> >> > IMHO, Yes.
> >>
> >> Just to make sure we are on the same page, so
> >>
> >> RDT!=RDH, descriptors available for hardware
> >> RDT==RDH, descriptor ring is empty for hardware
> >>
> >>
> >> That is currently what the code did. Seems nothing wrong, or anything
> >> I missed here?
> >
> >
> > There are two cases for RDT == RDH.
> >
> > 1. Hardware has filled all available descriptors and overrun.
> >    In this case, hardware cannot add any new packets to the ring.
> >
> > 2. Software has consumed all descriptors, and all the descriptors
> >     on the ring can be used by hardware. (Let's name this case "empty.")
> >    In this case, hardware should keep putting new packets to the ring
>
> Well this seems not what spec said. See Figure 3-2, when RDT==RDH,
> nothing is owned by hardware. And this is what Dmitry said in the
> commit mentioned above.
>

Yes, this is the main cause of network interruptions. Hardware should
never touch the location of the descriptor that RDT points to. This is
what the specification declares. IMHO, it also implies that the descriptor
referred to by the RDT has available packets.

In our case, hardware has not added any new packets to the descriptor,
and the expected behavior is that software never updates the RDT to
the location of that descriptor. But now, it does.

If hardware and software both work as expected under the e1000
specification, the issue does not exist. I have no objection that the root
cause is the Windows driver bug in e1000, and I'm not insisting that
QEMU should take the responsibility for fixing that.


>
> Which version of the driver did you use in the guest? (Or have you
> tried to download the one from Intel website) I'm asking since e1000
> support has been deprecated by Microsoft for years.
>

I use Windows Server 2019 and the driver version is 8.4.13.0.

Since Microsoft no longer supports e1000, as you already mentioned,
this patch merely offers a workaround. Since some users still use
e1000 in production environments, it doesn't seem to have any side
effects.

It would be really appreciated if the patch was given some thought.

Thanks,
Yong


>
> Thanks
>
> >
> > But at the moment, the logic of e1000_has_rx_bufs acts exactly like it
> was
> > the first case, unable to differentiate between the two scenarios.
> >
> >>
> >>
> >> Thanks
> >>
> >> >
> >> >>
> >> >> Adding more people.
> >> >>
> >> >> Thanks
> >> >>
> >> >
> >> >
> >> > --
> >> > Best regards
> >>
> >
> > Yong
> >
> > --
> > Best regards
>
>
Jason Wang July 12, 2024, 2 a.m. UTC | #10
On Wed, Jul 10, 2024 at 5:05 PM Yong Huang <yong.huang@smartx.com> wrote:
>
>
>
> On Wed, Jul 10, 2024 at 3:36 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On Wed, Jul 10, 2024 at 2:26 PM Yong Huang <yong.huang@smartx.com> wrote:
>> >
>> >
>> >
>> > On Wed, Jul 10, 2024 at 11:44 AM Jason Wang <jasowang@redhat.com> wrote:
>> >>
>> >> On Tue, Jul 9, 2024 at 10:56 AM Yong Huang <yong.huang@smartx.com> wrote:
>> >> >
>> >> >
>> >> >
>> >> > On Tue, Jul 9, 2024 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:
>> >> >>
>> >> >> On Mon, Jul 8, 2024 at 1:17 PM Yong Huang <yong.huang@smartx.com> wrote:
>> >> >> >
>> >> >> >
>> >> >> >
>> >> >> > On Mon, Jul 8, 2024 at 11:21 AM Jason Wang <jasowang@redhat.com> wrote:
>> >> >> >>
>> >> >> >> On Sat, Jul 6, 2024 at 4:30 AM Hyman Huang <yong.huang@smartx.com> wrote:
>> >> >> >> >
>> >> >> >> > Unexpected work by certain Windows guests equipped with the e1000
>> >> >> >> > interface can cause the network to go down and never come back up
>> >> >> >> > again unless the guest's interface is reset.
>> >> >> >> >
>> >> >> >> > To reproduce the failure:
>> >> >> >> > 1. Set up two guests with a Windows 2016 or 2019 server operating
>> >> >> >> >    system.
>> >> >> >>
>> >> >> >> I vaguely remember e1000 support for Windows has been deprecated for
>> >> >> >> several years...
>> >> >> >>
>> >> >> >> That's why e1000e or igb is implemented in Qemu.
>> >> >> >>
>> >> >> >> > 2. Set up the e1000 interface for the guests.
>> >> >> >> > 3. Pressurize the network slightly between two guests using the iPerf tool.
>> >> >> >> >
>> >> >> >> > The network goes down after a few days (2-5days), and the issue
>> >> >> >> > is the result of not adhering to the e1000 specification. Refer
>> >> >> >> > to the details of the specification at the following link:
>> >> >> >> > https://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
>> >> >> >> >
>> >> >> >> > Chapter 3.2.6 describe the Receive Descriptor Tail register(RDT)
>> >> >> >> > as following:
>> >> >> >> > This register holds a value that is an offset from the base, and
>> >> >> >> > identifies the location beyond the last descriptor hardware can
>> >> >> >> > process. Note that tail should still point to an area in the
>> >> >> >> > descriptor ring (somewhere between RDBA and RDBA + RDLEN).
>> >> >> >> > This is because tail points to the location where software writes
>> >> >> >> > the first new descriptor.
>> >> >> >> >
>> >> >> >> > This means that if the provider—in this case, QEMU—has not yet
>> >> >> >> > loaded the packet,
>> >> >> >>
>> >> >> >> What do you mean by "load" here?
>> >> >> >
>> >> >> >
>> >> >> > Sorry for failing to describe the details.
>> >> >> >
>> >> >> > The guest driver retrieves the packet from the receive ring buffer
>> >> >> > after QEMU forwards it from the tun/tap interface in the e1000
>> >> >> > emulation.
>> >> >> >
>> >> >> > I used "load" to express "putting packets into the receive ring buffer."
>> >> >> >
>> >> >> >>
>> >> >> >>
>> >> >> >> > RDT should never point to that place.
>> >> >> >>
>> >> >> >> And "that place"?
>> >> >> >
>> >> >> > If a descriptor in the receive ring buffer has not been filled with a
>> >> >> > packet address by QEMU, the descriptor therefore doesn't have any
>> >> >> > available packets. The location of the descriptor should not be referred
>> >> >> > to by RDT because the location is in the range that "hardware" handles.
>> >> >> >
>> >> >> > "that place" means the location of the descriptor in the ring buffer
>> >> >> > that QEMU hasn't set any available packets related to.
>> >> >> >
>> >> >> >>
>> >> >> >>
>> >> >> >> > When
>> >> >> >> > implementing the emulation of the e1000 interface, QEMU evaluates
>> >> >> >> > if the receive ring buffer is full once the RDT equals the RDH,
>> >> >> >> > based on the assumption that guest drivers adhere to this
>> >> >> >> > criterion strictly.
>> >> >> >> >
>> >> >> >> > We applied the following log patch to assist in analyzing the
>> >> >> >> > issue and eventually obtained the unexpected information.
>> >> >> >> >
>> >> >> >> > Log patch:
>> >> >> >> > -----------------------------------------------------------------
>> >> >> >> > |--- a/hw/net/e1000.c
>> >> >> >> > |+++ b/hw/net/e1000.c
>> >> >> >> > |@@ -836,6 +836,9 @@ e1000_set_link_status(NetClientState *nc)
>> >> >> >> > | static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
>> >> >> >> > | {
>> >> >> >> > |     int bufs;
>> >> >> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
>> >> >> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH], s->mac_reg[RDT]);
>> >> >> >> > |+
>> >> >> >> > |     /* Fast-path short packets */
>> >> >> >> > |     if (total_size <= s->rxbuf_size) {
>> >> >> >> > |         if (s->mac_reg[RDH] == s->mac_reg[RDT] && s->last_overrun)
>> >> >> >> > |@@ -1022,6 +1025,9 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
>> >> >> >> > |         s->rxbuf_min_shift)
>> >> >> >> > |         n |= E1000_ICS_RXDMT0;
>> >> >> >> > |
>> >> >> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
>> >> >> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH], s->mac_reg[RDT]);
>> >> >> >> > |+
>> >> >> >> > -----------------------------------------------------------------
>> >> >> >> >
>> >> >> >> > The last few logs of information when the network is down:
>> >> >> >> >
>> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 897, s->mac_reg[RDT] = 885
>> >> >> >> > <- the receive ring buffer is checked for fullness in the
>> >> >> >> > e1000_has_rxbufs function, not full.
>> >> >> >> >
>> >> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
>> >> >> >> > <- RDT stays the same, RDH updates to 898, and 1 descriptor
>> >> >> >> > utilized after putting the packet to ring buffer.
>> >> >> >> >
>> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
>> >> >> >> > <- the receive ring buffer is checked for fullness in the
>> >> >> >> > e1000_has_rxbufs function, not full.
>> >> >> >> >
>> >> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
>> >> >> >> > <- RDT stays the same, RDH updates to 899, and 1 descriptor
>> >> >> >> > utilized after putting the packet to ring buffer.
>> >> >> >> >
>> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
>> >> >> >> > <- the receive ring buffer is checked for fullness in the
>> >> >> >> > e1000_has_rxbufs function, not full.
>> >> >> >> >
>> >> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 885
>> >> >> >> > <- RDT stays the same, RDH updates to 900 , and 1 descriptor
>> >> >> >> > utilized after putting the packet to ring buffer.
>> >> >> >> >
>> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 900
>> >> >> >> > <- The ring is full, according to e1000_has_rxbufs, because
>> >> >> >> > of the RDT update to 900 and equals RDH !
>> >> >> >>
>> >> >> >> Just to make sure I understand this, RDT==RDH means the ring is empty I think?
>> >> >> >>
>> >> >> >>
>> >> >> >> See commit:
>> >> >> >>
>> >> >> >> commit e5b8b0d4ba29fe1268ba049519a1b0cf8552a21a
>> >> >> >> Author: Dmitry Fleytman <dmitry@daynix.com>
>> >> >> >> Date:   Fri Oct 19 07:56:55 2012 +0200
>> >> >> >>
>> >> >> >>     e1000: drop check_rxov, always treat RX ring with RDH == RDT as empty
>> >> >> >>
>> >> >> >>     Real HW always treats RX ring with RDH == RDT as empty.
>> >> >> >>     Emulation is supposed to behave the same.
>> >> >> >
>> >> >> >
>> >> >> > Indeed, I'm confused :(,  the description in the comment claims that RX
>> >> >> > rings with RDH == RDT as empty, but in implementation, it treats that as
>> >> >> > overrun.
>> >> >> >
>> >> >> > See the following 2 contexts:
>> >> >> >
>> >> >> > 1. e1000_can_receive:
>> >> >> > static bool e1000_can_receive(NetClientState *nc)
>> >> >> > {
>> >> >> >     E1000State *s = qemu_get_nic_opaque(nc);
>> >> >> >     // e1000_has_rxbufs return true means ring buffer has
>> >> >> >     // available descriptors to use for QEMU.
>> >> >> >     // false means ring buffer overrun and QEMU should queue the packet
>> >> >> >     // and wait for the RDT update and available descriptors can be used.
>> >> >> >
>> >> >> >     return e1000x_rx_ready(&s->parent_obj, s->mac_reg) &&
>> >> >> >         e1000_has_rxbufs(s, 1) && !timer_pending(s->flush_queue_timer);
>> >> >> > }
>> >> >>
>> >> >> Well we had in e1000_has_rx_bufs
>> >> >>
>> >> >>     if (total_size <= s->rxbuf_size) {
>> >> >>         return s->mac_reg[RDH] != s->mac_reg[RDT];
>> >> >>     }
>> >> >>
>> >> >> RDT!=RDH means RX ring has available descriptors for hardware?
>> >> >
>> >> >
>> >> > IMHO, Yes.
>> >>
>> >> Just to make sure we are on the same page, so
>> >>
>> >> RDT!=RDH, descriptors available for hardware
>> >> RDT==RDH, descriptor ring is empty for hardware
>> >>
>> >>
>> >> That is currently what the code did. Seems nothing wrong, or anything
>> >> I missed here?
>> >
>> >
>> > There are two cases for RDT == RDH.
>> >
>> > 1. Hardware has filled all available descriptors and overrun.
>> >    In this case, hardware cannot add any new packets to the ring.
>> >
>> > 2. Software has consumed all descriptors, and all the descriptors
>> >     on the ring can be used by hardware. (Let's name this case "empty.")
>> >    In this case, hardware should keep putting new packets to the ring
>>
>> Well this seems not what spec said. See Figure 3-2, when RDT==RDH,
>> nothing is owned by hardware. And this is what Dmitry said in the
>> commit mentioned above.
>
>
> Yes, this is the main cause of network interruptions. Hardware should
> never touch the location of the descriptor that RDT points to. This is
> what the specification declares. IMHO, it also implies that the descriptor
> referred to by the RDT has available packets.
>
> In our case, hardware has not added any new packets to the descriptor,
> and the expected behavior is that software never updates the RDT to
> the location of that descriptor. But now, it does.
>
> If hardware and software both work as expected under the e1000
> specification, the issue does not exist. I have no objection that the root
> cause is the Windows driver bug in e1000, and I'm not insisting that
> QEMU should take the responsibility for fixing that.
>
>>
>>
>> Which version of the driver did you use in the guest? (Or have you
>> tried to download the one from Intel website) I'm asking since e1000
>> support has been deprecated by Microsoft for years.
>
>
> I use Windows Server 2019 and the driver version is 8.4.13.0.

Is this the one you download from the Intel website?

>
> Since Microsoft no longer supports e1000, as you already mentioned,
> this patch merely offers a workaround.

Is there a chance to switch to use e1000e, it has been actively
maintained and supported.

> Since some users still use
> e1000 in production environments, it doesn't seem to have any side
> effects.

We need to make sure it matches the hardware behaviour. Otherwise it
might break other operating systems. Looking at the history, we used
to break e1000 on various operating systems: windows, BSD, minix,
windriver ....

>
> It would be really appreciated if the patch was given some thought.

We would try to evaluate each patch carefully. Qemu is a function
emulator so we need to make sure the function matches hardware
behaviour before it can be merged.

One way is to test via real hardware or involve Intel Engineers.

Thanks

>
> Thanks,
> Yong
>
>>
>>
>> Thanks
>>
>> >
>> > But at the moment, the logic of e1000_has_rx_bufs acts exactly like it was
>> > the first case, unable to differentiate between the two scenarios.
>> >
>> >>
>> >>
>> >> Thanks
>> >>
>> >> >
>> >> >>
>> >> >> Adding more people.
>> >> >>
>> >> >> Thanks
>> >> >>
>> >> >
>> >> >
>> >> > --
>> >> > Best regards
>> >>
>> >
>> > Yong
>> >
>> > --
>> > Best regards
>>
>
>
> --
> Best regards
Yong Huang July 17, 2024, 1:24 a.m. UTC | #11
On Fri, Jul 12, 2024 at 10:01 AM Jason Wang <jasowang@redhat.com> wrote:

> On Wed, Jul 10, 2024 at 5:05 PM Yong Huang <yong.huang@smartx.com> wrote:
> >
> >
> >
> > On Wed, Jul 10, 2024 at 3:36 PM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On Wed, Jul 10, 2024 at 2:26 PM Yong Huang <yong.huang@smartx.com>
> wrote:
> >> >
> >> >
> >> >
> >> > On Wed, Jul 10, 2024 at 11:44 AM Jason Wang <jasowang@redhat.com>
> wrote:
> >> >>
> >> >> On Tue, Jul 9, 2024 at 10:56 AM Yong Huang <yong.huang@smartx.com>
> wrote:
> >> >> >
> >> >> >
> >> >> >
> >> >> > On Tue, Jul 9, 2024 at 10:41 AM Jason Wang <jasowang@redhat.com>
> wrote:
> >> >> >>
> >> >> >> On Mon, Jul 8, 2024 at 1:17 PM Yong Huang <yong.huang@smartx.com>
> wrote:
> >> >> >> >
> >> >> >> >
> >> >> >> >
> >> >> >> > On Mon, Jul 8, 2024 at 11:21 AM Jason Wang <jasowang@redhat.com>
> wrote:
> >> >> >> >>
> >> >> >> >> On Sat, Jul 6, 2024 at 4:30 AM Hyman Huang <
> yong.huang@smartx.com> wrote:
> >> >> >> >> >
> >> >> >> >> > Unexpected work by certain Windows guests equipped with the
> e1000
> >> >> >> >> > interface can cause the network to go down and never come
> back up
> >> >> >> >> > again unless the guest's interface is reset.
> >> >> >> >> >
> >> >> >> >> > To reproduce the failure:
> >> >> >> >> > 1. Set up two guests with a Windows 2016 or 2019 server
> operating
> >> >> >> >> >    system.
> >> >> >> >>
> >> >> >> >> I vaguely remember e1000 support for Windows has been
> deprecated for
> >> >> >> >> several years...
> >> >> >> >>
> >> >> >> >> That's why e1000e or igb is implemented in Qemu.
> >> >> >> >>
> >> >> >> >> > 2. Set up the e1000 interface for the guests.
> >> >> >> >> > 3. Pressurize the network slightly between two guests using
> the iPerf tool.
> >> >> >> >> >
> >> >> >> >> > The network goes down after a few days (2-5days), and the
> issue
> >> >> >> >> > is the result of not adhering to the e1000 specification.
> Refer
> >> >> >> >> > to the details of the specification at the following link:
> >> >> >> >> >
> https://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
> >> >> >> >> >
> >> >> >> >> > Chapter 3.2.6 describe the Receive Descriptor Tail
> register(RDT)
> >> >> >> >> > as following:
> >> >> >> >> > This register holds a value that is an offset from the base,
> and
> >> >> >> >> > identifies the location beyond the last descriptor hardware
> can
> >> >> >> >> > process. Note that tail should still point to an area in the
> >> >> >> >> > descriptor ring (somewhere between RDBA and RDBA + RDLEN).
> >> >> >> >> > This is because tail points to the location where software
> writes
> >> >> >> >> > the first new descriptor.
> >> >> >> >> >
> >> >> >> >> > This means that if the provider—in this case, QEMU—has not
> yet
> >> >> >> >> > loaded the packet,
> >> >> >> >>
> >> >> >> >> What do you mean by "load" here?
> >> >> >> >
> >> >> >> >
> >> >> >> > Sorry for failing to describe the details.
> >> >> >> >
> >> >> >> > The guest driver retrieves the packet from the receive ring
> buffer
> >> >> >> > after QEMU forwards it from the tun/tap interface in the e1000
> >> >> >> > emulation.
> >> >> >> >
> >> >> >> > I used "load" to express "putting packets into the receive ring
> buffer."
> >> >> >> >
> >> >> >> >>
> >> >> >> >>
> >> >> >> >> > RDT should never point to that place.
> >> >> >> >>
> >> >> >> >> And "that place"?
> >> >> >> >
> >> >> >> > If a descriptor in the receive ring buffer has not been filled
> with a
> >> >> >> > packet address by QEMU, the descriptor therefore doesn't have
> any
> >> >> >> > available packets. The location of the descriptor should not be
> referred
> >> >> >> > to by RDT because the location is in the range that "hardware"
> handles.
> >> >> >> >
> >> >> >> > "that place" means the location of the descriptor in the ring
> buffer
> >> >> >> > that QEMU hasn't set any available packets related to.
> >> >> >> >
> >> >> >> >>
> >> >> >> >>
> >> >> >> >> > When
> >> >> >> >> > implementing the emulation of the e1000 interface, QEMU
> evaluates
> >> >> >> >> > if the receive ring buffer is full once the RDT equals the
> RDH,
> >> >> >> >> > based on the assumption that guest drivers adhere to this
> >> >> >> >> > criterion strictly.
> >> >> >> >> >
> >> >> >> >> > We applied the following log patch to assist in analyzing the
> >> >> >> >> > issue and eventually obtained the unexpected information.
> >> >> >> >> >
> >> >> >> >> > Log patch:
> >> >> >> >> >
> -----------------------------------------------------------------
> >> >> >> >> > |--- a/hw/net/e1000.c
> >> >> >> >> > |+++ b/hw/net/e1000.c
> >> >> >> >> > |@@ -836,6 +836,9 @@ e1000_set_link_status(NetClientState
> *nc)
> >> >> >> >> > | static bool e1000_has_rxbufs(E1000State *s, size_t
> total_size)
> >> >> >> >> > | {
> >> >> >> >> > |     int bufs;
> >> >> >> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u,
> s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
> >> >> >> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN],
> s->mac_reg[RDH], s->mac_reg[RDT]);
> >> >> >> >> > |+
> >> >> >> >> > |     /* Fast-path short packets */
> >> >> >> >> > |     if (total_size <= s->rxbuf_size) {
> >> >> >> >> > |         if (s->mac_reg[RDH] == s->mac_reg[RDT] &&
> s->last_overrun)
> >> >> >> >> > |@@ -1022,6 +1025,9 @@ e1000_receive_iov(NetClientState *nc,
> const struct iovec *iov, int iovcnt)
> >> >> >> >> > |         s->rxbuf_min_shift)
> >> >> >> >> > |         n |= E1000_ICS_RXDMT0;
> >> >> >> >> > |
> >> >> >> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u,
> s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
> >> >> >> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN],
> s->mac_reg[RDH], s->mac_reg[RDT]);
> >> >> >> >> > |+
> >> >> >> >> >
> -----------------------------------------------------------------
> >> >> >> >> >
> >> >> >> >> > The last few logs of information when the network is down:
> >> >> >> >> >
> >> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN]
> = 16384, s->mac_reg[RDH] = 897, s->mac_reg[RDT] = 885
> >> >> >> >> > <- the receive ring buffer is checked for fullness in the
> >> >> >> >> > e1000_has_rxbufs function, not full.
> >> >> >> >> >
> >> >> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN]
> = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
> >> >> >> >> > <- RDT stays the same, RDH updates to 898, and 1 descriptor
> >> >> >> >> > utilized after putting the packet to ring buffer.
> >> >> >> >> >
> >> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN]
> = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
> >> >> >> >> > <- the receive ring buffer is checked for fullness in the
> >> >> >> >> > e1000_has_rxbufs function, not full.
> >> >> >> >> >
> >> >> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN]
> = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
> >> >> >> >> > <- RDT stays the same, RDH updates to 899, and 1 descriptor
> >> >> >> >> > utilized after putting the packet to ring buffer.
> >> >> >> >> >
> >> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN]
> = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
> >> >> >> >> > <- the receive ring buffer is checked for fullness in the
> >> >> >> >> > e1000_has_rxbufs function, not full.
> >> >> >> >> >
> >> >> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN]
> = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 885
> >> >> >> >> > <- RDT stays the same, RDH updates to 900 , and 1 descriptor
> >> >> >> >> > utilized after putting the packet to ring buffer.
> >> >> >> >> >
> >> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN]
> = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 900
> >> >> >> >> > <- The ring is full, according to e1000_has_rxbufs, because
> >> >> >> >> > of the RDT update to 900 and equals RDH !
> >> >> >> >>
> >> >> >> >> Just to make sure I understand this, RDT==RDH means the ring
> is empty I think?
> >> >> >> >>
> >> >> >> >>
> >> >> >> >> See commit:
> >> >> >> >>
> >> >> >> >> commit e5b8b0d4ba29fe1268ba049519a1b0cf8552a21a
> >> >> >> >> Author: Dmitry Fleytman <dmitry@daynix.com>
> >> >> >> >> Date:   Fri Oct 19 07:56:55 2012 +0200
> >> >> >> >>
> >> >> >> >>     e1000: drop check_rxov, always treat RX ring with RDH ==
> RDT as empty
> >> >> >> >>
> >> >> >> >>     Real HW always treats RX ring with RDH == RDT as empty.
> >> >> >> >>     Emulation is supposed to behave the same.
> >> >> >> >
> >> >> >> >
> >> >> >> > Indeed, I'm confused :(,  the description in the comment claims
> that RX
> >> >> >> > rings with RDH == RDT as empty, but in implementation, it
> treats that as
> >> >> >> > overrun.
> >> >> >> >
> >> >> >> > See the following 2 contexts:
> >> >> >> >
> >> >> >> > 1. e1000_can_receive:
> >> >> >> > static bool e1000_can_receive(NetClientState *nc)
> >> >> >> > {
> >> >> >> >     E1000State *s = qemu_get_nic_opaque(nc);
> >> >> >> >     // e1000_has_rxbufs return true means ring buffer has
> >> >> >> >     // available descriptors to use for QEMU.
> >> >> >> >     // false means ring buffer overrun and QEMU should queue
> the packet
> >> >> >> >     // and wait for the RDT update and available descriptors
> can be used.
> >> >> >> >
> >> >> >> >     return e1000x_rx_ready(&s->parent_obj, s->mac_reg) &&
> >> >> >> >         e1000_has_rxbufs(s, 1) &&
> !timer_pending(s->flush_queue_timer);
> >> >> >> > }
> >> >> >>
> >> >> >> Well we had in e1000_has_rx_bufs
> >> >> >>
> >> >> >>     if (total_size <= s->rxbuf_size) {
> >> >> >>         return s->mac_reg[RDH] != s->mac_reg[RDT];
> >> >> >>     }
> >> >> >>
> >> >> >> RDT!=RDH means RX ring has available descriptors for hardware?
> >> >> >
> >> >> >
> >> >> > IMHO, Yes.
> >> >>
> >> >> Just to make sure we are on the same page, so
> >> >>
> >> >> RDT!=RDH, descriptors available for hardware
> >> >> RDT==RDH, descriptor ring is empty for hardware
> >> >>
> >> >>
> >> >> That is currently what the code did. Seems nothing wrong, or anything
> >> >> I missed here?
> >> >
> >> >
> >> > There are two cases for RDT == RDH.
> >> >
> >> > 1. Hardware has filled all available descriptors and overrun.
> >> >    In this case, hardware cannot add any new packets to the ring.
> >> >
> >> > 2. Software has consumed all descriptors, and all the descriptors
> >> >     on the ring can be used by hardware. (Let's name this case
> "empty.")
> >> >    In this case, hardware should keep putting new packets to the ring
> >>
> >> Well this seems not what spec said. See Figure 3-2, when RDT==RDH,
> >> nothing is owned by hardware. And this is what Dmitry said in the
> >> commit mentioned above.
> >
> >
> > Yes, this is the main cause of network interruptions. Hardware should
> > never touch the location of the descriptor that RDT points to. This is
> > what the specification declares. IMHO, it also implies that the
> descriptor
> > referred to by the RDT has available packets.
> >
> > In our case, hardware has not added any new packets to the descriptor,
> > and the expected behavior is that software never updates the RDT to
> > the location of that descriptor. But now, it does.
> >
> > If hardware and software both work as expected under the e1000
> > specification, the issue does not exist. I have no objection that the
> root
> > cause is the Windows driver bug in e1000, and I'm not insisting that
> > QEMU should take the responsibility for fixing that.
> >
> >>
> >>
> >> Which version of the driver did you use in the guest? (Or have you
> >> tried to download the one from Intel website) I'm asking since e1000
> >> support has been deprecated by Microsoft for years.
> >
> >
> > I use Windows Server 2019 and the driver version is 8.4.13.0.
>
> Is this the one you download from the Intel website?
>

No, this one is the default driver of Windows Server 2019 if my
information is correct.


>
> >
> > Since Microsoft no longer supports e1000, as you already mentioned,
> > this patch merely offers a workaround.
>
> Is there a chance to switch to use e1000e, it has been actively
> maintained and supported.
>

Yes, this is another workaround, we'll try this in the end, thanks for
the advice.


>
> > Since some users still use
> > e1000 in production environments, it doesn't seem to have any side
> > effects.
>
> We need to make sure it matches the hardware behaviour. Otherwise it
> might break other operating systems. Looking at the history, we used
> to break e1000 on various operating systems: windows, BSD, minix,
> windriver ....
>

Yes, I absolutely agree with you. Determining if this patch will affect
other operating systems is therefore crucial.


>
> >
> > It would be really appreciated if the patch was given some thought.
>
> We would try to evaluate each patch carefully. Qemu is a function
> emulator so we need to make sure the function matches hardware
> behaviour before it can be merged.


Two solutions could be applied:

1. fix the windows driver to ensure that RDT does not point to the location
  of the descriptors hardware doesn't put any packets into. Let's name this
  state of descriptor as "invalid".

2. modify the QEMU to handle this case while not breaking other OSes.

Since this patch only does a sanity check, it appears to adhere to the
second solution, in my opinion.

The patch's sole unintended consequence is that it prevents guests from
purposefully updating the RDT to link to the "invalid" description.

What do you think?

Yong


> One way is to test via real hardware or involve Intel Engineers.


> Thanks
>
> >
> > Thanks,
> > Yong
> >
> >>
> >>
> >> Thanks
> >>
> >> >
> >> > But at the moment, the logic of e1000_has_rx_bufs acts exactly like
> it was
> >> > the first case, unable to differentiate between the two scenarios.
> >> >
> >> >>
> >> >>
> >> >> Thanks
> >> >>
> >> >> >
> >> >> >>
> >> >> >> Adding more people.
> >> >> >>
> >> >> >> Thanks
> >> >> >>
> >> >> >
> >> >> >
> >> >> > --
> >> >> > Best regards
> >> >>
> >> >
> >> > Yong
> >> >
> >> > --
> >> > Best regards
> >>
> >
> >
> > --
> > Best regards
>
>
Jason Wang July 17, 2024, 1:28 a.m. UTC | #12
On Wed, Jul 17, 2024 at 9:24 AM Yong Huang <yong.huang@smartx.com> wrote:
>
>
>
> On Fri, Jul 12, 2024 at 10:01 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On Wed, Jul 10, 2024 at 5:05 PM Yong Huang <yong.huang@smartx.com> wrote:
>> >
>> >
>> >
>> > On Wed, Jul 10, 2024 at 3:36 PM Jason Wang <jasowang@redhat.com> wrote:
>> >>
>> >> On Wed, Jul 10, 2024 at 2:26 PM Yong Huang <yong.huang@smartx.com> wrote:
>> >> >
>> >> >
>> >> >
>> >> > On Wed, Jul 10, 2024 at 11:44 AM Jason Wang <jasowang@redhat.com> wrote:
>> >> >>
>> >> >> On Tue, Jul 9, 2024 at 10:56 AM Yong Huang <yong.huang@smartx.com> wrote:
>> >> >> >
>> >> >> >
>> >> >> >
>> >> >> > On Tue, Jul 9, 2024 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:
>> >> >> >>
>> >> >> >> On Mon, Jul 8, 2024 at 1:17 PM Yong Huang <yong.huang@smartx.com> wrote:
>> >> >> >> >
>> >> >> >> >
>> >> >> >> >
>> >> >> >> > On Mon, Jul 8, 2024 at 11:21 AM Jason Wang <jasowang@redhat.com> wrote:
>> >> >> >> >>
>> >> >> >> >> On Sat, Jul 6, 2024 at 4:30 AM Hyman Huang <yong.huang@smartx.com> wrote:
>> >> >> >> >> >
>> >> >> >> >> > Unexpected work by certain Windows guests equipped with the e1000
>> >> >> >> >> > interface can cause the network to go down and never come back up
>> >> >> >> >> > again unless the guest's interface is reset.
>> >> >> >> >> >
>> >> >> >> >> > To reproduce the failure:
>> >> >> >> >> > 1. Set up two guests with a Windows 2016 or 2019 server operating
>> >> >> >> >> >    system.
>> >> >> >> >>
>> >> >> >> >> I vaguely remember e1000 support for Windows has been deprecated for
>> >> >> >> >> several years...
>> >> >> >> >>
>> >> >> >> >> That's why e1000e or igb is implemented in Qemu.
>> >> >> >> >>
>> >> >> >> >> > 2. Set up the e1000 interface for the guests.
>> >> >> >> >> > 3. Pressurize the network slightly between two guests using the iPerf tool.
>> >> >> >> >> >
>> >> >> >> >> > The network goes down after a few days (2-5days), and the issue
>> >> >> >> >> > is the result of not adhering to the e1000 specification. Refer
>> >> >> >> >> > to the details of the specification at the following link:
>> >> >> >> >> > https://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
>> >> >> >> >> >
>> >> >> >> >> > Chapter 3.2.6 describe the Receive Descriptor Tail register(RDT)
>> >> >> >> >> > as following:
>> >> >> >> >> > This register holds a value that is an offset from the base, and
>> >> >> >> >> > identifies the location beyond the last descriptor hardware can
>> >> >> >> >> > process. Note that tail should still point to an area in the
>> >> >> >> >> > descriptor ring (somewhere between RDBA and RDBA + RDLEN).
>> >> >> >> >> > This is because tail points to the location where software writes
>> >> >> >> >> > the first new descriptor.
>> >> >> >> >> >
>> >> >> >> >> > This means that if the provider—in this case, QEMU—has not yet
>> >> >> >> >> > loaded the packet,
>> >> >> >> >>
>> >> >> >> >> What do you mean by "load" here?
>> >> >> >> >
>> >> >> >> >
>> >> >> >> > Sorry for failing to describe the details.
>> >> >> >> >
>> >> >> >> > The guest driver retrieves the packet from the receive ring buffer
>> >> >> >> > after QEMU forwards it from the tun/tap interface in the e1000
>> >> >> >> > emulation.
>> >> >> >> >
>> >> >> >> > I used "load" to express "putting packets into the receive ring buffer."
>> >> >> >> >
>> >> >> >> >>
>> >> >> >> >>
>> >> >> >> >> > RDT should never point to that place.
>> >> >> >> >>
>> >> >> >> >> And "that place"?
>> >> >> >> >
>> >> >> >> > If a descriptor in the receive ring buffer has not been filled with a
>> >> >> >> > packet address by QEMU, the descriptor therefore doesn't have any
>> >> >> >> > available packets. The location of the descriptor should not be referred
>> >> >> >> > to by RDT because the location is in the range that "hardware" handles.
>> >> >> >> >
>> >> >> >> > "that place" means the location of the descriptor in the ring buffer
>> >> >> >> > that QEMU hasn't set any available packets related to.
>> >> >> >> >
>> >> >> >> >>
>> >> >> >> >>
>> >> >> >> >> > When
>> >> >> >> >> > implementing the emulation of the e1000 interface, QEMU evaluates
>> >> >> >> >> > if the receive ring buffer is full once the RDT equals the RDH,
>> >> >> >> >> > based on the assumption that guest drivers adhere to this
>> >> >> >> >> > criterion strictly.
>> >> >> >> >> >
>> >> >> >> >> > We applied the following log patch to assist in analyzing the
>> >> >> >> >> > issue and eventually obtained the unexpected information.
>> >> >> >> >> >
>> >> >> >> >> > Log patch:
>> >> >> >> >> > -----------------------------------------------------------------
>> >> >> >> >> > |--- a/hw/net/e1000.c
>> >> >> >> >> > |+++ b/hw/net/e1000.c
>> >> >> >> >> > |@@ -836,6 +836,9 @@ e1000_set_link_status(NetClientState *nc)
>> >> >> >> >> > | static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
>> >> >> >> >> > | {
>> >> >> >> >> > |     int bufs;
>> >> >> >> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
>> >> >> >> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH], s->mac_reg[RDT]);
>> >> >> >> >> > |+
>> >> >> >> >> > |     /* Fast-path short packets */
>> >> >> >> >> > |     if (total_size <= s->rxbuf_size) {
>> >> >> >> >> > |         if (s->mac_reg[RDH] == s->mac_reg[RDT] && s->last_overrun)
>> >> >> >> >> > |@@ -1022,6 +1025,9 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
>> >> >> >> >> > |         s->rxbuf_min_shift)
>> >> >> >> >> > |         n |= E1000_ICS_RXDMT0;
>> >> >> >> >> > |
>> >> >> >> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] = %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
>> >> >> >> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN], s->mac_reg[RDH], s->mac_reg[RDT]);
>> >> >> >> >> > |+
>> >> >> >> >> > -----------------------------------------------------------------
>> >> >> >> >> >
>> >> >> >> >> > The last few logs of information when the network is down:
>> >> >> >> >> >
>> >> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 897, s->mac_reg[RDT] = 885
>> >> >> >> >> > <- the receive ring buffer is checked for fullness in the
>> >> >> >> >> > e1000_has_rxbufs function, not full.
>> >> >> >> >> >
>> >> >> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
>> >> >> >> >> > <- RDT stays the same, RDH updates to 898, and 1 descriptor
>> >> >> >> >> > utilized after putting the packet to ring buffer.
>> >> >> >> >> >
>> >> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
>> >> >> >> >> > <- the receive ring buffer is checked for fullness in the
>> >> >> >> >> > e1000_has_rxbufs function, not full.
>> >> >> >> >> >
>> >> >> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
>> >> >> >> >> > <- RDT stays the same, RDH updates to 899, and 1 descriptor
>> >> >> >> >> > utilized after putting the packet to ring buffer.
>> >> >> >> >> >
>> >> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
>> >> >> >> >> > <- the receive ring buffer is checked for fullness in the
>> >> >> >> >> > e1000_has_rxbufs function, not full.
>> >> >> >> >> >
>> >> >> >> >> > e1000: total_size = 64, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 885
>> >> >> >> >> > <- RDT stays the same, RDH updates to 900 , and 1 descriptor
>> >> >> >> >> > utilized after putting the packet to ring buffer.
>> >> >> >> >> >
>> >> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048, s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 900
>> >> >> >> >> > <- The ring is full, according to e1000_has_rxbufs, because
>> >> >> >> >> > of the RDT update to 900 and equals RDH !
>> >> >> >> >>
>> >> >> >> >> Just to make sure I understand this, RDT==RDH means the ring is empty I think?
>> >> >> >> >>
>> >> >> >> >>
>> >> >> >> >> See commit:
>> >> >> >> >>
>> >> >> >> >> commit e5b8b0d4ba29fe1268ba049519a1b0cf8552a21a
>> >> >> >> >> Author: Dmitry Fleytman <dmitry@daynix.com>
>> >> >> >> >> Date:   Fri Oct 19 07:56:55 2012 +0200
>> >> >> >> >>
>> >> >> >> >>     e1000: drop check_rxov, always treat RX ring with RDH == RDT as empty
>> >> >> >> >>
>> >> >> >> >>     Real HW always treats RX ring with RDH == RDT as empty.
>> >> >> >> >>     Emulation is supposed to behave the same.
>> >> >> >> >
>> >> >> >> >
>> >> >> >> > Indeed, I'm confused :(,  the description in the comment claims that RX
>> >> >> >> > rings with RDH == RDT as empty, but in implementation, it treats that as
>> >> >> >> > overrun.
>> >> >> >> >
>> >> >> >> > See the following 2 contexts:
>> >> >> >> >
>> >> >> >> > 1. e1000_can_receive:
>> >> >> >> > static bool e1000_can_receive(NetClientState *nc)
>> >> >> >> > {
>> >> >> >> >     E1000State *s = qemu_get_nic_opaque(nc);
>> >> >> >> >     // e1000_has_rxbufs return true means ring buffer has
>> >> >> >> >     // available descriptors to use for QEMU.
>> >> >> >> >     // false means ring buffer overrun and QEMU should queue the packet
>> >> >> >> >     // and wait for the RDT update and available descriptors can be used.
>> >> >> >> >
>> >> >> >> >     return e1000x_rx_ready(&s->parent_obj, s->mac_reg) &&
>> >> >> >> >         e1000_has_rxbufs(s, 1) && !timer_pending(s->flush_queue_timer);
>> >> >> >> > }
>> >> >> >>
>> >> >> >> Well we had in e1000_has_rx_bufs
>> >> >> >>
>> >> >> >>     if (total_size <= s->rxbuf_size) {
>> >> >> >>         return s->mac_reg[RDH] != s->mac_reg[RDT];
>> >> >> >>     }
>> >> >> >>
>> >> >> >> RDT!=RDH means RX ring has available descriptors for hardware?
>> >> >> >
>> >> >> >
>> >> >> > IMHO, Yes.
>> >> >>
>> >> >> Just to make sure we are on the same page, so
>> >> >>
>> >> >> RDT!=RDH, descriptors available for hardware
>> >> >> RDT==RDH, descriptor ring is empty for hardware
>> >> >>
>> >> >>
>> >> >> That is currently what the code did. Seems nothing wrong, or anything
>> >> >> I missed here?
>> >> >
>> >> >
>> >> > There are two cases for RDT == RDH.
>> >> >
>> >> > 1. Hardware has filled all available descriptors and overrun.
>> >> >    In this case, hardware cannot add any new packets to the ring.
>> >> >
>> >> > 2. Software has consumed all descriptors, and all the descriptors
>> >> >     on the ring can be used by hardware. (Let's name this case "empty.")
>> >> >    In this case, hardware should keep putting new packets to the ring
>> >>
>> >> Well this seems not what spec said. See Figure 3-2, when RDT==RDH,
>> >> nothing is owned by hardware. And this is what Dmitry said in the
>> >> commit mentioned above.
>> >
>> >
>> > Yes, this is the main cause of network interruptions. Hardware should
>> > never touch the location of the descriptor that RDT points to. This is
>> > what the specification declares. IMHO, it also implies that the descriptor
>> > referred to by the RDT has available packets.
>> >
>> > In our case, hardware has not added any new packets to the descriptor,
>> > and the expected behavior is that software never updates the RDT to
>> > the location of that descriptor. But now, it does.
>> >
>> > If hardware and software both work as expected under the e1000
>> > specification, the issue does not exist. I have no objection that the root
>> > cause is the Windows driver bug in e1000, and I'm not insisting that
>> > QEMU should take the responsibility for fixing that.
>> >
>> >>
>> >>
>> >> Which version of the driver did you use in the guest? (Or have you
>> >> tried to download the one from Intel website) I'm asking since e1000
>> >> support has been deprecated by Microsoft for years.
>> >
>> >
>> > I use Windows Server 2019 and the driver version is 8.4.13.0.
>>
>> Is this the one you download from the Intel website?
>
>
> No, this one is the default driver of Windows Server 2019 if my
> information is correct.
>
>>
>>
>> >
>> > Since Microsoft no longer supports e1000, as you already mentioned,
>> > this patch merely offers a workaround.
>>
>> Is there a chance to switch to use e1000e, it has been actively
>> maintained and supported.
>
>
> Yes, this is another workaround, we'll try this in the end, thanks for
> the advice.
>
>>
>>
>> > Since some users still use
>> > e1000 in production environments, it doesn't seem to have any side
>> > effects.
>>
>> We need to make sure it matches the hardware behaviour. Otherwise it
>> might break other operating systems. Looking at the history, we used
>> to break e1000 on various operating systems: windows, BSD, minix,
>> windriver ....
>
>
> Yes, I absolutely agree with you. Determining if this patch will affect
> other operating systems is therefore crucial.
>
>>
>>
>> >
>> > It would be really appreciated if the patch was given some thought.
>>
>> We would try to evaluate each patch carefully. Qemu is a function
>> emulator so we need to make sure the function matches hardware
>> behaviour before it can be merged.
>
>
> Two solutions could be applied:
>
> 1. fix the windows driver to ensure that RDT does not point to the location
>   of the descriptors hardware doesn't put any packets into. Let's name this
>   state of descriptor as "invalid".
>
> 2. modify the QEMU to handle this case while not breaking other OSes.
>
> Since this patch only does a sanity check, it appears to adhere to the
> second solution, in my opinion.
>
> The patch's sole unintended consequence is that it prevents guests from
> purposefully updating the RDT to link to the "invalid" description.
>
> What do you think?

For 1), please try the e1000 driver from Intel (I remember it could be
downloaded from the Intel website).
For 2), it would be hard as it needs to be tested with a lot of
operating systems (or we can have Intel Engineers involved).

And I really suggest you shift to e1000e (or igb) if it is possible.

Thanks

>
> Yong
>
>>
>> One way is to test via real hardware or involve Intel Engineers.
>>
>>
>> Thanks
>>
>> >
>> > Thanks,
>> > Yong
>> >
>> >>
>> >>
>> >> Thanks
>> >>
>> >> >
>> >> > But at the moment, the logic of e1000_has_rx_bufs acts exactly like it was
>> >> > the first case, unable to differentiate between the two scenarios.
>> >> >
>> >> >>
>> >> >>
>> >> >> Thanks
>> >> >>
>> >> >> >
>> >> >> >>
>> >> >> >> Adding more people.
>> >> >> >>
>> >> >> >> Thanks
>> >> >> >>
>> >> >> >
>> >> >> >
>> >> >> > --
>> >> >> > Best regards
>> >> >>
>> >> >
>> >> > Yong
>> >> >
>> >> > --
>> >> > Best regards
>> >>
>> >
>> >
>> > --
>> > Best regards
>>
>
>
> --
> Best regards
Yong Huang July 17, 2024, 1:40 a.m. UTC | #13
On Wed, Jul 17, 2024 at 9:29 AM Jason Wang <jasowang@redhat.com> wrote:

> On Wed, Jul 17, 2024 at 9:24 AM Yong Huang <yong.huang@smartx.com> wrote:
> >
> >
> >
> > On Fri, Jul 12, 2024 at 10:01 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On Wed, Jul 10, 2024 at 5:05 PM Yong Huang <yong.huang@smartx.com>
> wrote:
> >> >
> >> >
> >> >
> >> > On Wed, Jul 10, 2024 at 3:36 PM Jason Wang <jasowang@redhat.com>
> wrote:
> >> >>
> >> >> On Wed, Jul 10, 2024 at 2:26 PM Yong Huang <yong.huang@smartx.com>
> wrote:
> >> >> >
> >> >> >
> >> >> >
> >> >> > On Wed, Jul 10, 2024 at 11:44 AM Jason Wang <jasowang@redhat.com>
> wrote:
> >> >> >>
> >> >> >> On Tue, Jul 9, 2024 at 10:56 AM Yong Huang <yong.huang@smartx.com>
> wrote:
> >> >> >> >
> >> >> >> >
> >> >> >> >
> >> >> >> > On Tue, Jul 9, 2024 at 10:41 AM Jason Wang <jasowang@redhat.com>
> wrote:
> >> >> >> >>
> >> >> >> >> On Mon, Jul 8, 2024 at 1:17 PM Yong Huang <
> yong.huang@smartx.com> wrote:
> >> >> >> >> >
> >> >> >> >> >
> >> >> >> >> >
> >> >> >> >> > On Mon, Jul 8, 2024 at 11:21 AM Jason Wang <
> jasowang@redhat.com> wrote:
> >> >> >> >> >>
> >> >> >> >> >> On Sat, Jul 6, 2024 at 4:30 AM Hyman Huang <
> yong.huang@smartx.com> wrote:
> >> >> >> >> >> >
> >> >> >> >> >> > Unexpected work by certain Windows guests equipped with
> the e1000
> >> >> >> >> >> > interface can cause the network to go down and never come
> back up
> >> >> >> >> >> > again unless the guest's interface is reset.
> >> >> >> >> >> >
> >> >> >> >> >> > To reproduce the failure:
> >> >> >> >> >> > 1. Set up two guests with a Windows 2016 or 2019 server
> operating
> >> >> >> >> >> >    system.
> >> >> >> >> >>
> >> >> >> >> >> I vaguely remember e1000 support for Windows has been
> deprecated for
> >> >> >> >> >> several years...
> >> >> >> >> >>
> >> >> >> >> >> That's why e1000e or igb is implemented in Qemu.
> >> >> >> >> >>
> >> >> >> >> >> > 2. Set up the e1000 interface for the guests.
> >> >> >> >> >> > 3. Pressurize the network slightly between two guests
> using the iPerf tool.
> >> >> >> >> >> >
> >> >> >> >> >> > The network goes down after a few days (2-5days), and the
> issue
> >> >> >> >> >> > is the result of not adhering to the e1000 specification.
> Refer
> >> >> >> >> >> > to the details of the specification at the following link:
> >> >> >> >> >> >
> https://www.intel.com/content/dam/doc/manual/pci-pci-x-family-gbe-controllers-software-dev-manual.pdf
> >> >> >> >> >> >
> >> >> >> >> >> > Chapter 3.2.6 describe the Receive Descriptor Tail
> register(RDT)
> >> >> >> >> >> > as following:
> >> >> >> >> >> > This register holds a value that is an offset from the
> base, and
> >> >> >> >> >> > identifies the location beyond the last descriptor
> hardware can
> >> >> >> >> >> > process. Note that tail should still point to an area in
> the
> >> >> >> >> >> > descriptor ring (somewhere between RDBA and RDBA + RDLEN).
> >> >> >> >> >> > This is because tail points to the location where
> software writes
> >> >> >> >> >> > the first new descriptor.
> >> >> >> >> >> >
> >> >> >> >> >> > This means that if the provider—in this case, QEMU—has
> not yet
> >> >> >> >> >> > loaded the packet,
> >> >> >> >> >>
> >> >> >> >> >> What do you mean by "load" here?
> >> >> >> >> >
> >> >> >> >> >
> >> >> >> >> > Sorry for failing to describe the details.
> >> >> >> >> >
> >> >> >> >> > The guest driver retrieves the packet from the receive ring
> buffer
> >> >> >> >> > after QEMU forwards it from the tun/tap interface in the
> e1000
> >> >> >> >> > emulation.
> >> >> >> >> >
> >> >> >> >> > I used "load" to express "putting packets into the receive
> ring buffer."
> >> >> >> >> >
> >> >> >> >> >>
> >> >> >> >> >>
> >> >> >> >> >> > RDT should never point to that place.
> >> >> >> >> >>
> >> >> >> >> >> And "that place"?
> >> >> >> >> >
> >> >> >> >> > If a descriptor in the receive ring buffer has not been
> filled with a
> >> >> >> >> > packet address by QEMU, the descriptor therefore doesn't
> have any
> >> >> >> >> > available packets. The location of the descriptor should not
> be referred
> >> >> >> >> > to by RDT because the location is in the range that
> "hardware" handles.
> >> >> >> >> >
> >> >> >> >> > "that place" means the location of the descriptor in the
> ring buffer
> >> >> >> >> > that QEMU hasn't set any available packets related to.
> >> >> >> >> >
> >> >> >> >> >>
> >> >> >> >> >>
> >> >> >> >> >> > When
> >> >> >> >> >> > implementing the emulation of the e1000 interface, QEMU
> evaluates
> >> >> >> >> >> > if the receive ring buffer is full once the RDT equals
> the RDH,
> >> >> >> >> >> > based on the assumption that guest drivers adhere to this
> >> >> >> >> >> > criterion strictly.
> >> >> >> >> >> >
> >> >> >> >> >> > We applied the following log patch to assist in analyzing
> the
> >> >> >> >> >> > issue and eventually obtained the unexpected information.
> >> >> >> >> >> >
> >> >> >> >> >> > Log patch:
> >> >> >> >> >> >
> -----------------------------------------------------------------
> >> >> >> >> >> > |--- a/hw/net/e1000.c
> >> >> >> >> >> > |+++ b/hw/net/e1000.c
> >> >> >> >> >> > |@@ -836,6 +836,9 @@ e1000_set_link_status(NetClientState
> *nc)
> >> >> >> >> >> > | static bool e1000_has_rxbufs(E1000State *s, size_t
> total_size)
> >> >> >> >> >> > | {
> >> >> >> >> >> > |     int bufs;
> >> >> >> >> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] =
> %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
> >> >> >> >> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN],
> s->mac_reg[RDH], s->mac_reg[RDT]);
> >> >> >> >> >> > |+
> >> >> >> >> >> > |     /* Fast-path short packets */
> >> >> >> >> >> > |     if (total_size <= s->rxbuf_size) {
> >> >> >> >> >> > |         if (s->mac_reg[RDH] == s->mac_reg[RDT] &&
> s->last_overrun)
> >> >> >> >> >> > |@@ -1022,6 +1025,9 @@ e1000_receive_iov(NetClientState
> *nc, const struct iovec *iov, int iovcnt)
> >> >> >> >> >> > |         s->rxbuf_min_shift)
> >> >> >> >> >> > |         n |= E1000_ICS_RXDMT0;
> >> >> >> >> >> > |
> >> >> >> >> >> > |+    DBGOUT(RX, "rxbuf_size = %u, s->mac_reg[RDLEN] =
> %u, s->mac_reg[RDH] = %u, s->mac_reg[RDT] = %u\n",
> >> >> >> >> >> > |+           s->rxbuf_size, s->mac_reg[RDLEN],
> s->mac_reg[RDH], s->mac_reg[RDT]);
> >> >> >> >> >> > |+
> >> >> >> >> >> >
> -----------------------------------------------------------------
> >> >> >> >> >> >
> >> >> >> >> >> > The last few logs of information when the network is down:
> >> >> >> >> >> >
> >> >> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048,
> s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 897, s->mac_reg[RDT] = 885
> >> >> >> >> >> > <- the receive ring buffer is checked for fullness in the
> >> >> >> >> >> > e1000_has_rxbufs function, not full.
> >> >> >> >> >> >
> >> >> >> >> >> > e1000: total_size = 64, rxbuf_size = 2048,
> s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
> >> >> >> >> >> > <- RDT stays the same, RDH updates to 898, and 1
> descriptor
> >> >> >> >> >> > utilized after putting the packet to ring buffer.
> >> >> >> >> >> >
> >> >> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048,
> s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 898, s->mac_reg[RDT] = 885
> >> >> >> >> >> > <- the receive ring buffer is checked for fullness in the
> >> >> >> >> >> > e1000_has_rxbufs function, not full.
> >> >> >> >> >> >
> >> >> >> >> >> > e1000: total_size = 64, rxbuf_size = 2048,
> s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
> >> >> >> >> >> > <- RDT stays the same, RDH updates to 899, and 1
> descriptor
> >> >> >> >> >> > utilized after putting the packet to ring buffer.
> >> >> >> >> >> >
> >> >> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048,
> s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 899, s->mac_reg[RDT] = 885
> >> >> >> >> >> > <- the receive ring buffer is checked for fullness in the
> >> >> >> >> >> > e1000_has_rxbufs function, not full.
> >> >> >> >> >> >
> >> >> >> >> >> > e1000: total_size = 64, rxbuf_size = 2048,
> s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 885
> >> >> >> >> >> > <- RDT stays the same, RDH updates to 900 , and 1
> descriptor
> >> >> >> >> >> > utilized after putting the packet to ring buffer.
> >> >> >> >> >> >
> >> >> >> >> >> > e1000: total_size = 1, rxbuf_size = 2048,
> s->mac_reg[RDLEN] = 16384, s->mac_reg[RDH] = 900, s->mac_reg[RDT] = 900
> >> >> >> >> >> > <- The ring is full, according to e1000_has_rxbufs,
> because
> >> >> >> >> >> > of the RDT update to 900 and equals RDH !
> >> >> >> >> >>
> >> >> >> >> >> Just to make sure I understand this, RDT==RDH means the
> ring is empty I think?
> >> >> >> >> >>
> >> >> >> >> >>
> >> >> >> >> >> See commit:
> >> >> >> >> >>
> >> >> >> >> >> commit e5b8b0d4ba29fe1268ba049519a1b0cf8552a21a
> >> >> >> >> >> Author: Dmitry Fleytman <dmitry@daynix.com>
> >> >> >> >> >> Date:   Fri Oct 19 07:56:55 2012 +0200
> >> >> >> >> >>
> >> >> >> >> >>     e1000: drop check_rxov, always treat RX ring with RDH
> == RDT as empty
> >> >> >> >> >>
> >> >> >> >> >>     Real HW always treats RX ring with RDH == RDT as empty.
> >> >> >> >> >>     Emulation is supposed to behave the same.
> >> >> >> >> >
> >> >> >> >> >
> >> >> >> >> > Indeed, I'm confused :(,  the description in the comment
> claims that RX
> >> >> >> >> > rings with RDH == RDT as empty, but in implementation, it
> treats that as
> >> >> >> >> > overrun.
> >> >> >> >> >
> >> >> >> >> > See the following 2 contexts:
> >> >> >> >> >
> >> >> >> >> > 1. e1000_can_receive:
> >> >> >> >> > static bool e1000_can_receive(NetClientState *nc)
> >> >> >> >> > {
> >> >> >> >> >     E1000State *s = qemu_get_nic_opaque(nc);
> >> >> >> >> >     // e1000_has_rxbufs return true means ring buffer has
> >> >> >> >> >     // available descriptors to use for QEMU.
> >> >> >> >> >     // false means ring buffer overrun and QEMU should queue
> the packet
> >> >> >> >> >     // and wait for the RDT update and available descriptors
> can be used.
> >> >> >> >> >
> >> >> >> >> >     return e1000x_rx_ready(&s->parent_obj, s->mac_reg) &&
> >> >> >> >> >         e1000_has_rxbufs(s, 1) &&
> !timer_pending(s->flush_queue_timer);
> >> >> >> >> > }
> >> >> >> >>
> >> >> >> >> Well we had in e1000_has_rx_bufs
> >> >> >> >>
> >> >> >> >>     if (total_size <= s->rxbuf_size) {
> >> >> >> >>         return s->mac_reg[RDH] != s->mac_reg[RDT];
> >> >> >> >>     }
> >> >> >> >>
> >> >> >> >> RDT!=RDH means RX ring has available descriptors for hardware?
> >> >> >> >
> >> >> >> >
> >> >> >> > IMHO, Yes.
> >> >> >>
> >> >> >> Just to make sure we are on the same page, so
> >> >> >>
> >> >> >> RDT!=RDH, descriptors available for hardware
> >> >> >> RDT==RDH, descriptor ring is empty for hardware
> >> >> >>
> >> >> >>
> >> >> >> That is currently what the code did. Seems nothing wrong, or
> anything
> >> >> >> I missed here?
> >> >> >
> >> >> >
> >> >> > There are two cases for RDT == RDH.
> >> >> >
> >> >> > 1. Hardware has filled all available descriptors and overrun.
> >> >> >    In this case, hardware cannot add any new packets to the ring.
> >> >> >
> >> >> > 2. Software has consumed all descriptors, and all the descriptors
> >> >> >     on the ring can be used by hardware. (Let's name this case
> "empty.")
> >> >> >    In this case, hardware should keep putting new packets to the
> ring
> >> >>
> >> >> Well this seems not what spec said. See Figure 3-2, when RDT==RDH,
> >> >> nothing is owned by hardware. And this is what Dmitry said in the
> >> >> commit mentioned above.
> >> >
> >> >
> >> > Yes, this is the main cause of network interruptions. Hardware should
> >> > never touch the location of the descriptor that RDT points to. This is
> >> > what the specification declares. IMHO, it also implies that the
> descriptor
> >> > referred to by the RDT has available packets.
> >> >
> >> > In our case, hardware has not added any new packets to the descriptor,
> >> > and the expected behavior is that software never updates the RDT to
> >> > the location of that descriptor. But now, it does.
> >> >
> >> > If hardware and software both work as expected under the e1000
> >> > specification, the issue does not exist. I have no objection that the
> root
> >> > cause is the Windows driver bug in e1000, and I'm not insisting that
> >> > QEMU should take the responsibility for fixing that.
> >> >
> >> >>
> >> >>
> >> >> Which version of the driver did you use in the guest? (Or have you
> >> >> tried to download the one from Intel website) I'm asking since e1000
> >> >> support has been deprecated by Microsoft for years.
> >> >
> >> >
> >> > I use Windows Server 2019 and the driver version is 8.4.13.0.
> >>
> >> Is this the one you download from the Intel website?
> >
> >
> > No, this one is the default driver of Windows Server 2019 if my
> > information is correct.
> >
> >>
> >>
> >> >
> >> > Since Microsoft no longer supports e1000, as you already mentioned,
> >> > this patch merely offers a workaround.
> >>
> >> Is there a chance to switch to use e1000e, it has been actively
> >> maintained and supported.
> >
> >
> > Yes, this is another workaround, we'll try this in the end, thanks for
> > the advice.
> >
> >>
> >>
> >> > Since some users still use
> >> > e1000 in production environments, it doesn't seem to have any side
> >> > effects.
> >>
> >> We need to make sure it matches the hardware behaviour. Otherwise it
> >> might break other operating systems. Looking at the history, we used
> >> to break e1000 on various operating systems: windows, BSD, minix,
> >> windriver ....
> >
> >
> > Yes, I absolutely agree with you. Determining if this patch will affect
> > other operating systems is therefore crucial.
> >
> >>
> >>
> >> >
> >> > It would be really appreciated if the patch was given some thought.
> >>
> >> We would try to evaluate each patch carefully. Qemu is a function
> >> emulator so we need to make sure the function matches hardware
> >> behaviour before it can be merged.
> >
> >
> > Two solutions could be applied:
> >
> > 1. fix the windows driver to ensure that RDT does not point to the
> location
> >   of the descriptors hardware doesn't put any packets into. Let's name
> this
> >   state of descriptor as "invalid".
> >
> > 2. modify the QEMU to handle this case while not breaking other OSes.
> >
> > Since this patch only does a sanity check, it appears to adhere to the
> > second solution, in my opinion.
> >
> > The patch's sole unintended consequence is that it prevents guests from
> > purposefully updating the RDT to link to the "invalid" description.
> >
> > What do you think?
>
> For 1), please try the e1000 driver from Intel (I remember it could be
> downloaded from the Intel website).
> For 2), it would be hard as it needs to be tested with a lot of
> operating systems (or we can have Intel Engineers involved).
>
> And I really suggest you shift to e1000e (or igb) if it is possible.
>

Okay, I get it, I appreciate the review and the recommendation. :)

Thanks

Yong


>
> Thanks
>
> >
> > Yong
> >
> >>
> >> One way is to test via real hardware or involve Intel Engineers.
> >>
> >>
> >> Thanks
> >>
> >> >
> >> > Thanks,
> >> > Yong
> >> >
> >> >>
> >> >>
> >> >> Thanks
> >> >>
> >> >> >
> >> >> > But at the moment, the logic of e1000_has_rx_bufs acts exactly
> like it was
> >> >> > the first case, unable to differentiate between the two scenarios.
> >> >> >
> >> >> >>
> >> >> >>
> >> >> >> Thanks
> >> >> >>
> >> >> >> >
> >> >> >> >>
> >> >> >> >> Adding more people.
> >> >> >> >>
> >> >> >> >> Thanks
> >> >> >> >>
> >> >> >> >
> >> >> >> >
> >> >> >> > --
> >> >> >> > Best regards
> >> >> >>
> >> >> >
> >> >> > Yong
> >> >> >
> >> >> > --
> >> >> > Best regards
> >> >>
> >> >
> >> >
> >> > --
> >> > Best regards
> >>
> >
> >
> > --
> > Best regards
>
>
diff mbox series

Patch

diff --git a/hw/net/e1000.c b/hw/net/e1000.c
index 5012b96464..f80cb70283 100644
--- a/hw/net/e1000.c
+++ b/hw/net/e1000.c
@@ -126,6 +126,12 @@  struct E1000State_st {
 
     QEMUTimer *flush_queue_timer;
 
+    /*
+     * Indicate that the receive circular buffer queue overrun
+     * the last time hardware produced packets.
+     */
+    bool last_overrun;
+
 /* Compatibility flags for migration to/from qemu 1.3.0 and older */
 #define E1000_FLAG_MAC_BIT 2
 #define E1000_FLAG_TSO_BIT 3
@@ -832,7 +838,12 @@  static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
     int bufs;
     /* Fast-path short packets */
     if (total_size <= s->rxbuf_size) {
-        return s->mac_reg[RDH] != s->mac_reg[RDT];
+        if (s->mac_reg[RDH] == s->mac_reg[RDT] && s->last_overrun) {
+            return false;
+        }
+
+        DBGOUT(RX, "Receive ring buffer is not full unexpectedly!\n");
+        return true;
     }
     if (s->mac_reg[RDH] < s->mac_reg[RDT]) {
         bufs = s->mac_reg[RDT] - s->mac_reg[RDH];
@@ -840,7 +851,12 @@  static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
         bufs = s->mac_reg[RDLEN] /  sizeof(struct e1000_rx_desc) +
             s->mac_reg[RDT] - s->mac_reg[RDH];
     } else {
-        return false;
+        if (s->last_overrun) {
+            return false;
+        }
+
+        DBGOUT(RX, "Receive ring buffer is not full unexpectedly!\n");
+        return true;
     }
     return total_size <= bufs * s->rxbuf_size;
 }
@@ -999,6 +1015,8 @@  e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
 
     e1000x_update_rx_total_stats(s->mac_reg, pkt_type, size, total_size);
 
+    s->last_overrun = (s->mac_reg[RDH] == s->mac_reg[RDT]) ? true : false;
+
     n = E1000_ICS_RXT0;
     if ((rdt = s->mac_reg[RDT]) < s->mac_reg[RDH])
         rdt += s->mac_reg[RDLEN] / sizeof(desc);