diff mbox

[RFC,v2,04/10] virtio-net rsc: Detailed IPv4 and General TCP data coalescing

Message ID 1454264009-24094-5-git-send-email-wexu@redhat.com
State New
Headers show

Commit Message

Wei Xu Jan. 31, 2016, 6:13 p.m. UTC
From: Wei Xu <wei@wei-thinkpad.nay.redhat.com>

Since this feature also needs to support IPv6, and there are
some protocol specific differences difference for IPv4/6 in the header,
so try to make the interface to be general.

IPv4/6 should set up both the new and old IP/TCP header before invoking
TCP coalescing, and should also tell the real payload.

The main handler of TCP includes TCP window update, duplicated ACK check
and the real data coalescing if the new segment passed invalid filter
and is identified as an expected one.

An expected segment means:
1. Segment is within current window and the sequence is the expected one.
2. ACK of the segment is in the valid window.
3. If the ACK in the segment is a duplicated one, then it must less than 2,
   this is to notify upper layer TCP starting retransmission due to the spec.

Signed-off-by: Wei Xu <wexu@redhat.com>
---
 hw/net/virtio-net.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 124 insertions(+), 3 deletions(-)

Comments

Jason Wang Feb. 1, 2016, 6:21 a.m. UTC | #1
On 02/01/2016 02:13 AM, wexu@redhat.com wrote:
> From: Wei Xu <wei@wei-thinkpad.nay.redhat.com>
>
> Since this feature also needs to support IPv6, and there are
> some protocol specific differences difference for IPv4/6 in the header,
> so try to make the interface to be general.
>
> IPv4/6 should set up both the new and old IP/TCP header before invoking
> TCP coalescing, and should also tell the real payload.
>
> The main handler of TCP includes TCP window update, duplicated ACK check
> and the real data coalescing if the new segment passed invalid filter
> and is identified as an expected one.
>
> An expected segment means:
> 1. Segment is within current window and the sequence is the expected one.
> 2. ACK of the segment is in the valid window.
> 3. If the ACK in the segment is a duplicated one, then it must less than 2,
>    this is to notify upper layer TCP starting retransmission due to the spec.
>
> Signed-off-by: Wei Xu <wexu@redhat.com>
> ---
>  hw/net/virtio-net.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 124 insertions(+), 3 deletions(-)
>
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index cfbac6d..4f77fbe 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -41,6 +41,10 @@
>  
>  #define VIRTIO_HEADER   12    /* Virtio net header size */
>  #define IP_OFFSET (VIRTIO_HEADER + sizeof(struct eth_header))
> +#define TCP_WINDOW      65535

The name is confusing, how about TCP_MAX_WINDOW_SIZE ?

> +
> +/* IPv4 max payload, 16 bits in the header */
> +#define MAX_IP4_PAYLOAD  (65535 - sizeof(struct ip_header))
>  
>  #define MAX_VIRTIO_IP_PAYLOAD  (65535 + IP_OFFSET)
>  
> @@ -1670,13 +1674,130 @@ out:
>      return 0;
>  }
>  
> +static int32_t virtio_net_rsc_handle_ack(NetRscChain *chain, NetRscSeg *seg,
> +                                 const uint8_t *buf, struct tcp_header *n_tcp,
> +                                 struct tcp_header *o_tcp)
> +{
> +    uint32_t nack, oack;
> +    uint16_t nwin, owin;
> +
> +    nack = htonl(n_tcp->th_ack);
> +    nwin = htons(n_tcp->th_win);
> +    oack = htonl(o_tcp->th_ack);
> +    owin = htons(o_tcp->th_win);
> +
> +    if ((nack - oack) >= TCP_WINDOW) {
> +        return RSC_FINAL;
> +    } else if (nack == oack) {
> +        /* duplicated ack or window probe */
> +        if (nwin == owin) {
> +            /* duplicated ack, add dup ack count due to whql test up to 1 */
> +
> +            if (seg->dup_ack_count == 0) {
> +                seg->dup_ack_count++;
> +                return RSC_COALESCE;
> +            } else {
> +                /* Spec says should send it directly */
> +                return RSC_FINAL;
> +            }
> +        } else {
> +            /* Coalesce window update */

Need we flush this immediately consider it was a window update?

> +            o_tcp->th_win = n_tcp->th_win;
> +            return RSC_COALESCE;
> +        }
> +    } else {

What if nack < oack here?

> +        /* pure ack, update ack */
> +        o_tcp->th_ack = n_tcp->th_ack;
> +        return RSC_COALESCE;
> +    }
> +}
> +
> +static int32_t virtio_net_rsc_coalesce_tcp(NetRscChain *chain, NetRscSeg *seg,
> +               const uint8_t *buf, struct tcp_header *n_tcp, uint16_t n_tcp_len,
> +               uint16_t n_data, struct tcp_header *o_tcp, uint16_t o_tcp_len,
> +               uint16_t o_data, uint16_t *p_ip_len, uint16_t max_data)
> +{
> +    void *data;
> +    uint16_t o_ip_len;
> +    uint32_t nseq, oseq;
> +
> +    o_ip_len = htons(*p_ip_len);
> +    nseq = htonl(n_tcp->th_seq);
> +    oseq = htonl(o_tcp->th_seq);
> +

Need to the tcp header check here. And looks like we need also check more:

- Flags
- Data offset
- URG pointer

> +    /* Ignore packet with more/larger tcp options */
> +    if (n_tcp_len > o_tcp_len) {

What if n_tcp_len < o_tcp_len ?

> +        return RSC_FINAL;
> +    }
> +
> +    /* out of order or retransmitted. */
> +    if ((nseq - oseq) > TCP_WINDOW) {
> +        return RSC_FINAL;
> +    }
> +
> +    data = ((uint8_t *)n_tcp) + n_tcp_len;
> +    if (nseq == oseq) {
> +        if ((0 == o_data) && n_data) {
> +            /* From no payload to payload, normal case, not a dup ack or etc */
> +            goto coalesce;
> +        } else {
> +            return virtio_net_rsc_handle_ack(chain, seg, buf, n_tcp, o_tcp);
> +        }
> +    } else if ((nseq - oseq) != o_data) {
> +        /* Not a consistent packet, out of order */
> +        return RSC_FINAL;
> +    } else {
> +coalesce:
> +        if ((o_ip_len + n_data) > max_data) {
> +            return RSC_FINAL;
> +        }
> +
> +        /* Here comes the right data, the payload lengh in v4/v6 is different,
> +           so use the field value to update */
> +        *p_ip_len = htons(o_ip_len + n_data); /* Update new data len */
> +        o_tcp->th_offset_flags = n_tcp->th_offset_flags; /* Bring 'PUSH' big */

Is it correct? How about URG pointer?

> +        o_tcp->th_ack = n_tcp->th_ack;
> +        o_tcp->th_win = n_tcp->th_win;
> +
> +        memmove(seg->buf + seg->size, data, n_data);
> +        seg->size += n_data;
> +        return RSC_COALESCE;
> +    }
> +}
>  
>  static int32_t virtio_net_rsc_try_coalesce4(NetRscChain *chain,
>                         NetRscSeg *seg, const uint8_t *buf, size_t size)
>  {
> -    /* This real part of this function will be introduced in next patch, just
> -    *  return a 'final' to feed the compilation. */
> -    return RSC_FINAL;
> +    uint16_t o_ip_len, n_ip_len;    /* len in ip header field */
> +    uint16_t n_ip_hdrlen, o_ip_hdrlen;  /* ipv4 header len */
> +    uint16_t n_tcp_len, o_tcp_len;  /* tcp header len */
> +    uint16_t o_data, n_data;          /* payload without virtio/eth/ip/tcp */
> +    struct ip_header *n_ip, *o_ip;
> +    struct tcp_header *n_tcp, *o_tcp;
> +
> +    n_ip = (struct ip_header *)(buf + IP_OFFSET);
> +    n_ip_hdrlen = ((0xF & n_ip->ip_ver_len) << 2);
> +    n_ip_len = htons(n_ip->ip_len);
> +    n_tcp = (struct tcp_header *)(((uint8_t *)n_ip) + n_ip_hdrlen);
> +    n_tcp_len = (htons(n_tcp->th_offset_flags) & 0xF000) >> 10;
> +    n_data = n_ip_len - n_ip_hdrlen - n_tcp_len;
> +
> +    o_ip = (struct ip_header *)(seg->buf + IP_OFFSET);
> +    o_ip_hdrlen = ((0xF & o_ip->ip_ver_len) << 2);
> +    o_ip_len = htons(o_ip->ip_len);
> +    o_tcp = (struct tcp_header *)(((uint8_t *)o_ip) + o_ip_hdrlen);
> +    o_tcp_len = (htons(o_tcp->th_offset_flags) & 0xF000) >> 10;
> +    o_data = o_ip_len - o_ip_hdrlen - o_tcp_len;

Any chance to eliminate the above codes duplication into a helper? To
simplify the codes, you may want just store two pointers in seg (one is
ip header another is tcp header).

> +
> +    if ((n_ip->ip_src ^ o_ip->ip_src) || (n_ip->ip_dst ^ o_ip->ip_dst)
> +        || (n_tcp->th_sport ^ o_tcp->th_sport)
> +        || (n_tcp->th_dport ^ o_tcp->th_dport)) {

Lots of other fields need to be checked here:

- header length
- TOS
- Flag
- identification

I think we could not try to merge the packet with if any of the above
fields do not match. Since you split the coalescing function into
different layers, tcp header check should be postponed.

> +        return RSC_NO_MATCH;
> +    }
> +
> +    return virtio_net_rsc_coalesce_tcp(chain, seg, buf,
> +                                    n_tcp, n_tcp_len, n_data, o_tcp, o_tcp_len,
> +                                    o_data, &o_ip->ip_len, MAX_IP4_PAYLOAD);

Since you've passed seg and buf, do we really need such a huge parameter
list? Looks like some of the header parsing has already been done in
virtio_net_rsc_coalesce_tcp().

>  }
>  
>  static size_t virtio_net_rsc_callback(NetRscChain *chain, NetClientState *nc,
Wei Xu Feb. 1, 2016, 8:29 a.m. UTC | #2
On 02/01/2016 02:21 PM, Jason Wang wrote:

>
> On 02/01/2016 02:13 AM, wexu@redhat.com wrote:
>> From: Wei Xu <wei@wei-thinkpad.nay.redhat.com>
>>
>> Since this feature also needs to support IPv6, and there are
>> some protocol specific differences difference for IPv4/6 in the header,
>> so try to make the interface to be general.
>>
>> IPv4/6 should set up both the new and old IP/TCP header before invoking
>> TCP coalescing, and should also tell the real payload.
>>
>> The main handler of TCP includes TCP window update, duplicated ACK check
>> and the real data coalescing if the new segment passed invalid filter
>> and is identified as an expected one.
>>
>> An expected segment means:
>> 1. Segment is within current window and the sequence is the expected one.
>> 2. ACK of the segment is in the valid window.
>> 3. If the ACK in the segment is a duplicated one, then it must less than 2,
>>     this is to notify upper layer TCP starting retransmission due to the spec.
>>
>> Signed-off-by: Wei Xu <wexu@redhat.com>
>> ---
>>   hw/net/virtio-net.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>>   1 file changed, 124 insertions(+), 3 deletions(-)
>>
>> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
>> index cfbac6d..4f77fbe 100644
>> --- a/hw/net/virtio-net.c
>> +++ b/hw/net/virtio-net.c
>> @@ -41,6 +41,10 @@
>>   
>>   #define VIRTIO_HEADER   12    /* Virtio net header size */
>>   #define IP_OFFSET (VIRTIO_HEADER + sizeof(struct eth_header))
>> +#define TCP_WINDOW      65535
> The name is confusing, how about TCP_MAX_WINDOW_SIZE ?

Sounds better, will take it in.

>
>> +
>> +/* IPv4 max payload, 16 bits in the header */
>> +#define MAX_IP4_PAYLOAD  (65535 - sizeof(struct ip_header))
>>   
>>   #define MAX_VIRTIO_IP_PAYLOAD  (65535 + IP_OFFSET)
>>   
>> @@ -1670,13 +1674,130 @@ out:
>>       return 0;
>>   }
>>   
>> +static int32_t virtio_net_rsc_handle_ack(NetRscChain *chain, NetRscSeg *seg,
>> +                                 const uint8_t *buf, struct tcp_header *n_tcp,
>> +                                 struct tcp_header *o_tcp)
>> +{
>> +    uint32_t nack, oack;
>> +    uint16_t nwin, owin;
>> +
>> +    nack = htonl(n_tcp->th_ack);
>> +    nwin = htons(n_tcp->th_win);
>> +    oack = htonl(o_tcp->th_ack);
>> +    owin = htons(o_tcp->th_win);
>> +
>> +    if ((nack - oack) >= TCP_WINDOW) {
>> +        return RSC_FINAL;
>> +    } else if (nack == oack) {
>> +        /* duplicated ack or window probe */
>> +        if (nwin == owin) {
>> +            /* duplicated ack, add dup ack count due to whql test up to 1 */
>> +
>> +            if (seg->dup_ack_count == 0) {
>> +                seg->dup_ack_count++;
>> +                return RSC_COALESCE;
>> +            } else {
>> +                /* Spec says should send it directly */
>> +                return RSC_FINAL;
>> +            }
>> +        } else {
>> +            /* Coalesce window update */
> Need we flush this immediately consider it was a window update?

The flowchart in the spec says this can be coalesced as normal.

https://msdn.microsoft.com/en-us/library/windows/hardware/jj853325%28v=vs.85%29.aspx

>
>> +            o_tcp->th_win = n_tcp->th_win;
>> +            return RSC_COALESCE;
>> +        }
>> +    } else {
> What if nack < oack here?

That should happen, the  modulo-232 arithmetic check at the begin of this function will keep the ack is in the current window.

>
>> +        /* pure ack, update ack */
>> +        o_tcp->th_ack = n_tcp->th_ack;
>> +        return RSC_COALESCE;
>> +    }
>> +}
>> +
>> +static int32_t virtio_net_rsc_coalesce_tcp(NetRscChain *chain, NetRscSeg *seg,
>> +               const uint8_t *buf, struct tcp_header *n_tcp, uint16_t n_tcp_len,
>> +               uint16_t n_data, struct tcp_header *o_tcp, uint16_t o_tcp_len,
>> +               uint16_t o_data, uint16_t *p_ip_len, uint16_t max_data)
>> +{
>> +    void *data;
>> +    uint16_t o_ip_len;
>> +    uint32_t nseq, oseq;
>> +
>> +    o_ip_len = htons(*p_ip_len);
>> +    nseq = htonl(n_tcp->th_seq);
>> +    oseq = htonl(o_tcp->th_seq);
>> +
> Need to the tcp header check here. And looks like we need also check more:
>
> - Flags
> - Data offset
> - URG pointer

ok.

>
>> +    /* Ignore packet with more/larger tcp options */
>> +    if (n_tcp_len > o_tcp_len) {
> What if n_tcp_len < o_tcp_len ?

This maybe a bug, it's better to bypass it.

>
>> +        return RSC_FINAL;
>> +    }
>> +
>> +    /* out of order or retransmitted. */
>> +    if ((nseq - oseq) > TCP_WINDOW) {
>> +        return RSC_FINAL;
>> +    }
>> +
>> +    data = ((uint8_t *)n_tcp) + n_tcp_len;
>> +    if (nseq == oseq) {
>> +        if ((0 == o_data) && n_data) {
>> +            /* From no payload to payload, normal case, not a dup ack or etc */
>> +            goto coalesce;
>> +        } else {
>> +            return virtio_net_rsc_handle_ack(chain, seg, buf, n_tcp, o_tcp);
>> +        }
>> +    } else if ((nseq - oseq) != o_data) {
>> +        /* Not a consistent packet, out of order */
>> +        return RSC_FINAL;
>> +    } else {
>> +coalesce:
>> +        if ((o_ip_len + n_data) > max_data) {
>> +            return RSC_FINAL;
>> +        }
>> +
>> +        /* Here comes the right data, the payload lengh in v4/v6 is different,
>> +           so use the field value to update */
>> +        *p_ip_len = htons(o_ip_len + n_data); /* Update new data len */
>> +        o_tcp->th_offset_flags = n_tcp->th_offset_flags; /* Bring 'PUSH' big */
> Is it correct? How about URG pointer?

It's better to bypass URG packets too.

>
>> +        o_tcp->th_ack = n_tcp->th_ack;
>> +        o_tcp->th_win = n_tcp->th_win;
>> +
>> +        memmove(seg->buf + seg->size, data, n_data);
>> +        seg->size += n_data;
>> +        return RSC_COALESCE;
>> +    }
>> +}
>>   
>>   static int32_t virtio_net_rsc_try_coalesce4(NetRscChain *chain,
>>                          NetRscSeg *seg, const uint8_t *buf, size_t size)
>>   {
>> -    /* This real part of this function will be introduced in next patch, just
>> -    *  return a 'final' to feed the compilation. */
>> -    return RSC_FINAL;
>> +    uint16_t o_ip_len, n_ip_len;    /* len in ip header field */
>> +    uint16_t n_ip_hdrlen, o_ip_hdrlen;  /* ipv4 header len */
>> +    uint16_t n_tcp_len, o_tcp_len;  /* tcp header len */
>> +    uint16_t o_data, n_data;          /* payload without virtio/eth/ip/tcp */
>> +    struct ip_header *n_ip, *o_ip;
>> +    struct tcp_header *n_tcp, *o_tcp;
>> +
>> +    n_ip = (struct ip_header *)(buf + IP_OFFSET);
>> +    n_ip_hdrlen = ((0xF & n_ip->ip_ver_len) << 2);
>> +    n_ip_len = htons(n_ip->ip_len);
>> +    n_tcp = (struct tcp_header *)(((uint8_t *)n_ip) + n_ip_hdrlen);
>> +    n_tcp_len = (htons(n_tcp->th_offset_flags) & 0xF000) >> 10;
>> +    n_data = n_ip_len - n_ip_hdrlen - n_tcp_len;
>> +
>> +    o_ip = (struct ip_header *)(seg->buf + IP_OFFSET);
>> +    o_ip_hdrlen = ((0xF & o_ip->ip_ver_len) << 2);
>> +    o_ip_len = htons(o_ip->ip_len);
>> +    o_tcp = (struct tcp_header *)(((uint8_t *)o_ip) + o_ip_hdrlen);
>> +    o_tcp_len = (htons(o_tcp->th_offset_flags) & 0xF000) >> 10;
>> +    o_data = o_ip_len - o_ip_hdrlen - o_tcp_len;
> Any chance to eliminate the above codes duplication into a helper? To
> simplify the codes, you may want just store two pointers in seg (one is
> ip header another is tcp header).

OK, will investigate it.

>
>> +
>> +    if ((n_ip->ip_src ^ o_ip->ip_src) || (n_ip->ip_dst ^ o_ip->ip_dst)
>> +        || (n_tcp->th_sport ^ o_tcp->th_sport)
>> +        || (n_tcp->th_dport ^ o_tcp->th_dport)) {
> Lots of other fields need to be checked here:
>
> - header length
> - TOS
> - Flag
> - identification
>
> I think we could not try to merge the packet with if any of the above
> fields do not match. Since you split the coalescing function into
> different layers, tcp header check should be postponed.

OK.

>
>> +        return RSC_NO_MATCH;
>> +    }
>> +
>> +    return virtio_net_rsc_coalesce_tcp(chain, seg, buf,
>> +                                    n_tcp, n_tcp_len, n_data, o_tcp, o_tcp_len,
>> +                                    o_data, &o_ip->ip_len, MAX_IP4_PAYLOAD);
> Since you've passed seg and buf, do we really need such a huge parameter
> list? Looks like some of the header parsing has already been done in
> virtio_net_rsc_coalesce_tcp().

ok, will investigate it.

>
>>   }
>>   
>>   static size_t virtio_net_rsc_callback(NetRscChain *chain, NetClientState *nc,
>
Jason Wang Feb. 1, 2016, 9:29 a.m. UTC | #3
On 02/01/2016 04:29 PM, Wei Xu wrote:
>
> On 02/01/2016 02:21 PM, Jason Wang wrote:
>
>>
>> On 02/01/2016 02:13 AM, wexu@redhat.com wrote:
>>> From: Wei Xu <wei@wei-thinkpad.nay.redhat.com>
>>>
>>> Since this feature also needs to support IPv6, and there are
>>> some protocol specific differences difference for IPv4/6 in the header,
>>> so try to make the interface to be general.
>>>
>>> IPv4/6 should set up both the new and old IP/TCP header before invoking
>>> TCP coalescing, and should also tell the real payload.
>>>
>>> The main handler of TCP includes TCP window update, duplicated ACK
>>> check
>>> and the real data coalescing if the new segment passed invalid filter
>>> and is identified as an expected one.
>>>
>>> An expected segment means:
>>> 1. Segment is within current window and the sequence is the expected
>>> one.
>>> 2. ACK of the segment is in the valid window.
>>> 3. If the ACK in the segment is a duplicated one, then it must less
>>> than 2,
>>>     this is to notify upper layer TCP starting retransmission due to
>>> the spec.
>>>
>>> Signed-off-by: Wei Xu <wexu@redhat.com>
>>> ---
>>>   hw/net/virtio-net.c | 127
>>> ++++++++++++++++++++++++++++++++++++++++++++++++++--
>>>   1 file changed, 124 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
>>> index cfbac6d..4f77fbe 100644
>>> --- a/hw/net/virtio-net.c
>>> +++ b/hw/net/virtio-net.c
>>> @@ -41,6 +41,10 @@
>>>     #define VIRTIO_HEADER   12    /* Virtio net header size */
>>>   #define IP_OFFSET (VIRTIO_HEADER + sizeof(struct eth_header))
>>> +#define TCP_WINDOW      65535
>> The name is confusing, how about TCP_MAX_WINDOW_SIZE ?
>
> Sounds better, will take it in.
>
>>
>>> +
>>> +/* IPv4 max payload, 16 bits in the header */
>>> +#define MAX_IP4_PAYLOAD  (65535 - sizeof(struct ip_header))
>>>     #define MAX_VIRTIO_IP_PAYLOAD  (65535 + IP_OFFSET)
>>>   @@ -1670,13 +1674,130 @@ out:
>>>       return 0;
>>>   }
>>>   +static int32_t virtio_net_rsc_handle_ack(NetRscChain *chain,
>>> NetRscSeg *seg,
>>> +                                 const uint8_t *buf, struct
>>> tcp_header *n_tcp,
>>> +                                 struct tcp_header *o_tcp)
>>> +{
>>> +    uint32_t nack, oack;
>>> +    uint16_t nwin, owin;
>>> +
>>> +    nack = htonl(n_tcp->th_ack);
>>> +    nwin = htons(n_tcp->th_win);
>>> +    oack = htonl(o_tcp->th_ack);
>>> +    owin = htons(o_tcp->th_win);
>>> +
>>> +    if ((nack - oack) >= TCP_WINDOW) {
>>> +        return RSC_FINAL;
>>> +    } else if (nack == oack) {
>>> +        /* duplicated ack or window probe */
>>> +        if (nwin == owin) {
>>> +            /* duplicated ack, add dup ack count due to whql test
>>> up to 1 */
>>> +
>>> +            if (seg->dup_ack_count == 0) {
>>> +                seg->dup_ack_count++;
>>> +                return RSC_COALESCE;
>>> +            } else {
>>> +                /* Spec says should send it directly */
>>> +                return RSC_FINAL;
>>> +            }
>>> +        } else {
>>> +            /* Coalesce window update */
>> Need we flush this immediately consider it was a window update?
>
> The flowchart in the spec says this can be coalesced as normal.
>
> https://msdn.microsoft.com/en-us/library/windows/hardware/jj853325%28v=vs.85%29.aspx

I see.

>
>
>>
>>> +            o_tcp->th_win = n_tcp->th_win;
>>> +            return RSC_COALESCE;
>>> +        }
>>> +    } else {
>> What if nack < oack here?
>
> That should happen, the  modulo-232 arithmetic check at the begin of
> this function will keep the ack is in the current window. 

Ok.

Thanks
diff mbox

Patch

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index cfbac6d..4f77fbe 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -41,6 +41,10 @@ 
 
 #define VIRTIO_HEADER   12    /* Virtio net header size */
 #define IP_OFFSET (VIRTIO_HEADER + sizeof(struct eth_header))
+#define TCP_WINDOW      65535
+
+/* IPv4 max payload, 16 bits in the header */
+#define MAX_IP4_PAYLOAD  (65535 - sizeof(struct ip_header))
 
 #define MAX_VIRTIO_IP_PAYLOAD  (65535 + IP_OFFSET)
 
@@ -1670,13 +1674,130 @@  out:
     return 0;
 }
 
+static int32_t virtio_net_rsc_handle_ack(NetRscChain *chain, NetRscSeg *seg,
+                                 const uint8_t *buf, struct tcp_header *n_tcp,
+                                 struct tcp_header *o_tcp)
+{
+    uint32_t nack, oack;
+    uint16_t nwin, owin;
+
+    nack = htonl(n_tcp->th_ack);
+    nwin = htons(n_tcp->th_win);
+    oack = htonl(o_tcp->th_ack);
+    owin = htons(o_tcp->th_win);
+
+    if ((nack - oack) >= TCP_WINDOW) {
+        return RSC_FINAL;
+    } else if (nack == oack) {
+        /* duplicated ack or window probe */
+        if (nwin == owin) {
+            /* duplicated ack, add dup ack count due to whql test up to 1 */
+
+            if (seg->dup_ack_count == 0) {
+                seg->dup_ack_count++;
+                return RSC_COALESCE;
+            } else {
+                /* Spec says should send it directly */
+                return RSC_FINAL;
+            }
+        } else {
+            /* Coalesce window update */
+            o_tcp->th_win = n_tcp->th_win;
+            return RSC_COALESCE;
+        }
+    } else {
+        /* pure ack, update ack */
+        o_tcp->th_ack = n_tcp->th_ack;
+        return RSC_COALESCE;
+    }
+}
+
+static int32_t virtio_net_rsc_coalesce_tcp(NetRscChain *chain, NetRscSeg *seg,
+               const uint8_t *buf, struct tcp_header *n_tcp, uint16_t n_tcp_len,
+               uint16_t n_data, struct tcp_header *o_tcp, uint16_t o_tcp_len,
+               uint16_t o_data, uint16_t *p_ip_len, uint16_t max_data)
+{
+    void *data;
+    uint16_t o_ip_len;
+    uint32_t nseq, oseq;
+
+    o_ip_len = htons(*p_ip_len);
+    nseq = htonl(n_tcp->th_seq);
+    oseq = htonl(o_tcp->th_seq);
+
+    /* Ignore packet with more/larger tcp options */
+    if (n_tcp_len > o_tcp_len) {
+        return RSC_FINAL;
+    }
+
+    /* out of order or retransmitted. */
+    if ((nseq - oseq) > TCP_WINDOW) {
+        return RSC_FINAL;
+    }
+
+    data = ((uint8_t *)n_tcp) + n_tcp_len;
+    if (nseq == oseq) {
+        if ((0 == o_data) && n_data) {
+            /* From no payload to payload, normal case, not a dup ack or etc */
+            goto coalesce;
+        } else {
+            return virtio_net_rsc_handle_ack(chain, seg, buf, n_tcp, o_tcp);
+        }
+    } else if ((nseq - oseq) != o_data) {
+        /* Not a consistent packet, out of order */
+        return RSC_FINAL;
+    } else {
+coalesce:
+        if ((o_ip_len + n_data) > max_data) {
+            return RSC_FINAL;
+        }
+
+        /* Here comes the right data, the payload lengh in v4/v6 is different,
+           so use the field value to update */
+        *p_ip_len = htons(o_ip_len + n_data); /* Update new data len */
+        o_tcp->th_offset_flags = n_tcp->th_offset_flags; /* Bring 'PUSH' big */
+        o_tcp->th_ack = n_tcp->th_ack;
+        o_tcp->th_win = n_tcp->th_win;
+
+        memmove(seg->buf + seg->size, data, n_data);
+        seg->size += n_data;
+        return RSC_COALESCE;
+    }
+}
 
 static int32_t virtio_net_rsc_try_coalesce4(NetRscChain *chain,
                        NetRscSeg *seg, const uint8_t *buf, size_t size)
 {
-    /* This real part of this function will be introduced in next patch, just
-    *  return a 'final' to feed the compilation. */
-    return RSC_FINAL;
+    uint16_t o_ip_len, n_ip_len;    /* len in ip header field */
+    uint16_t n_ip_hdrlen, o_ip_hdrlen;  /* ipv4 header len */
+    uint16_t n_tcp_len, o_tcp_len;  /* tcp header len */
+    uint16_t o_data, n_data;          /* payload without virtio/eth/ip/tcp */
+    struct ip_header *n_ip, *o_ip;
+    struct tcp_header *n_tcp, *o_tcp;
+
+    n_ip = (struct ip_header *)(buf + IP_OFFSET);
+    n_ip_hdrlen = ((0xF & n_ip->ip_ver_len) << 2);
+    n_ip_len = htons(n_ip->ip_len);
+    n_tcp = (struct tcp_header *)(((uint8_t *)n_ip) + n_ip_hdrlen);
+    n_tcp_len = (htons(n_tcp->th_offset_flags) & 0xF000) >> 10;
+    n_data = n_ip_len - n_ip_hdrlen - n_tcp_len;
+
+    o_ip = (struct ip_header *)(seg->buf + IP_OFFSET);
+    o_ip_hdrlen = ((0xF & o_ip->ip_ver_len) << 2);
+    o_ip_len = htons(o_ip->ip_len);
+    o_tcp = (struct tcp_header *)(((uint8_t *)o_ip) + o_ip_hdrlen);
+    o_tcp_len = (htons(o_tcp->th_offset_flags) & 0xF000) >> 10;
+    o_data = o_ip_len - o_ip_hdrlen - o_tcp_len;
+
+    if ((n_ip->ip_src ^ o_ip->ip_src) || (n_ip->ip_dst ^ o_ip->ip_dst)
+        || (n_tcp->th_sport ^ o_tcp->th_sport)
+        || (n_tcp->th_dport ^ o_tcp->th_dport)) {
+        return RSC_NO_MATCH;
+    }
+
+    return virtio_net_rsc_coalesce_tcp(chain, seg, buf,
+                                    n_tcp, n_tcp_len, n_data, o_tcp, o_tcp_len,
+                                    o_data, &o_ip->ip_len, MAX_IP4_PAYLOAD);
 }
 
 static size_t virtio_net_rsc_callback(NetRscChain *chain, NetClientState *nc,