diff mbox

[ovs-dev] datapath-windows: Compute checksums for VXLAN inner packets

Message ID 1441961323-11256-1-git-send-email-aserdean@cloudbasesolutions.com
State Changes Requested
Headers show

Commit Message

Alin Serdean Sept. 11, 2015, 8:48 a.m. UTC
Windows does not support VXLAN hardware offloading.

Currently we do not compute IP/TCP/UDP checksums for the inner packet. This
patch computes the checksums mentioned above in regards with the enabled
settings.

i.e. if IP checksum offloading is enabled for the inner packet we compute it.
The same applies for TCP and UDP packets.

This patch also revizes the computation of ones' complement over different
memory blocks, in the case the lengths are odd.

Also per documentation:
https://msdn.microsoft.com/en-us/library/windows/hardware/ff568840%28v=vs.85%29.aspx
set the TCP flags FIN and PSH only for the last segment in the case LSO is
enabled.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
---
This patch is also intended for branch 2.4
---
 datapath-windows/ovsext/BufferMgmt.c | 32 ++++++++++----
 datapath-windows/ovsext/Checksum.c   | 64 +++++++++++++++++----------
 datapath-windows/ovsext/Vxlan.c      | 83 +++++++++++++++++++++++++++++++-----
 3 files changed, 138 insertions(+), 41 deletions(-)

Comments

Sairam Venugopal Sept. 12, 2015, 12:18 a.m. UTC | #1
Hey Alin,

Am taking another stab at this code review. I will prefix my comments with
a ³Sai:² just to be safe.

I had few concerns over checksum calculations when LSO is enabled.

Thanks,
Sairam Venugopal

On 9/11/15, 1:48 AM, "Alin Serdean" <aserdean@cloudbasesolutions.com>
wrote:

>Windows does not support VXLAN hardware offloading.
>
>Currently we do not compute IP/TCP/UDP checksums for the inner packet.
>This
>patch computes the checksums mentioned above in regards with the enabled
>settings.
>
>i.e. if IP checksum offloading is enabled for the inner packet we compute
>it.
>The same applies for TCP and UDP packets.
>
>This patch also revizes the computation of ones' complement over different
>memory blocks, in the case the lengths are odd.
>
>Also per documentation:
>https://urldefense.proofpoint.com/v2/url?u=https-3A__msdn.microsoft.com_en
>-2Dus_library_windows_hardware_ff568840-2528v-3Dvs.85-2529.aspx&d=BQIGaQ&c
>=Sqcl0Ez6M0X8aeM67LKIiDJAXVeAw-YihVMNtXt-uEs&r=Dcruz40PROJ40ROzSpxyQSLw6fc
>rOWpJgEcEmNR3JEQ&m=bVjQF29IE0w5Wo086RFicGNW5uvzLeF03S9puquy74A&s=RPxjJER_P
>eb_paAWncvgmvJAOme0FkCJrAcfx4OLNws&e=
>set the TCP flags FIN and PSH only for the last segment in the case LSO is
>enabled.
>
>Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
>---
>This patch is also intended for branch 2.4
>---
> datapath-windows/ovsext/BufferMgmt.c | 32 ++++++++++----
> datapath-windows/ovsext/Checksum.c   | 64 +++++++++++++++++----------
> datapath-windows/ovsext/Vxlan.c      | 83
>+++++++++++++++++++++++++++++++-----
> 3 files changed, 138 insertions(+), 41 deletions(-)
>
>diff --git a/datapath-windows/ovsext/BufferMgmt.c
>b/datapath-windows/ovsext/BufferMgmt.c
>index 3550e20..bf53fc3 100644
>--- a/datapath-windows/ovsext/BufferMgmt.c
>+++ b/datapath-windows/ovsext/BufferMgmt.c
>@@ -1116,7 +1116,8 @@ GetSegmentHeaderInfo(PNET_BUFFER_LIST nbl,
>  * 
>--------------------------------------------------------------------------
>  */
> static NDIS_STATUS
>-FixSegmentHeader(PNET_BUFFER nb, UINT16 segmentSize, UINT32 seqNumber)
>+FixSegmentHeader(PNET_BUFFER nb, UINT16 segmentSize, UINT32 seqNumber,
>+                 BOOLEAN lastPacket, UINT16 packetCounter)
> {
>     EthHdr *dstEth;
>     IPHdr *dstIP;
>@@ -1141,18 +1142,29 @@ FixSegmentHeader(PNET_BUFFER nb, UINT16
>segmentSize, UINT32 seqNumber)
>     /* Fix IP length and checksum */
>     ASSERT(dstIP->protocol == IPPROTO_TCP);
>     dstIP->tot_len = htons(segmentSize + dstIP->ihl * 4 +
>TCP_HDR_LEN(dstTCP));
>+    dstIP->id += packetCounter;
>     dstIP->check = 0;
>     dstIP->check = IPChecksum((UINT8 *)dstIP, dstIP->ihl * 4, 0);
> 
>     /* Fix TCP checksum */
>     dstTCP->seq = htonl(seqNumber);
>-    dstTCP->check =
>-        IPPseudoChecksum((UINT32 *)&dstIP->saddr,
>-                         (UINT32 *)&dstIP->daddr,
>-                         IPPROTO_TCP, segmentSize + TCP_HDR_LEN(dstTCP));
>+
>+    if (dstTCP->fin) {
>+        dstTCP->fin = lastPacket;
>+    }
>+    if (dstTCP->psh) {
>+        dstTCP->psh = lastPacket;
>+    }
>+
>+    UINT16 csumLength = segmentSize + TCP_HDR_LEN(dstTCP);;
>+    dstTCP->check = IPPseudoChecksum(&dstIP->saddr,
>+        &dstIP->daddr,
>+        IPPROTO_TCP,
>+        csumLength);
>     dstTCP->check = CalculateChecksumNB(nb,
>-            (UINT16)(NET_BUFFER_DATA_LENGTH(nb) - sizeof *dstEth -
>dstIP->ihl * 4),
>-            sizeof *dstEth + dstIP->ihl * 4);
>+        csumLength,
>+        sizeof *dstEth + dstIP->ihl * 4);
>+
>     return STATUS_SUCCESS;
> }
> 
>@@ -1190,6 +1202,7 @@ OvsTcpSegmentNBL(PVOID ovsContext,
>     NDIS_STATUS status;
>     UINT16 segmentSize;
>     ULONG copiedSize;
>+    UINT16 packetCounter = 0;
> 
>     srcCtx = 
>(POVS_BUFFER_CONTEXT)NET_BUFFER_LIST_CONTEXT_DATA_START(nbl);
>     if (srcCtx == NULL || srcCtx->magic != OVS_CTX_MAGIC) {
>@@ -1232,7 +1245,9 @@ OvsTcpSegmentNBL(PVOID ovsContext,
>             goto nblcopy_error;
>         }
> 
>-        status = FixSegmentHeader(newNb, segmentSize, seqNumber);
>+        status = FixSegmentHeader(newNb, segmentSize, seqNumber,
>+                                  NET_BUFFER_NEXT_NB(newNb) == NULL,
>+                                  packetCounter);
>         if (status != NDIS_STATUS_SUCCESS) {
>             goto nblcopy_error;
>         }
>@@ -1241,6 +1256,7 @@ OvsTcpSegmentNBL(PVOID ovsContext,
>         /* Move on to the next segment */
>         size -= segmentSize;
>         seqNumber += segmentSize;
>+        packetCounter++;
>     }
> 
>     status = OvsAllocateNBLContext(context, newNbl);
>diff --git a/datapath-windows/ovsext/Checksum.c
>b/datapath-windows/ovsext/Checksum.c
>index 510a094..5d9b035 100644
>--- a/datapath-windows/ovsext/Checksum.c
>+++ b/datapath-windows/ovsext/Checksum.c

Sai: Are there any tests to validate these changes in Checksum.c? If not,
can someone else ack it.

>@@ -68,34 +68,48 @@ CalculateOnesComplement(UINT8 *start,
> {
>     UINT64  sum = 0, val;
>     UINT64  *src = (UINT64 *)start;
>-    union {
>-        UINT32 val;
>-        UINT8  b8[4];
>-    } tmp;
>-
>     while (totalLength > 7) {
>         val = *src;
>-        sum += (val >> 32) + (val & 0xffffffff);
>+        sum += val;
>+        if (sum < val) sum++;
>         src++;
>         totalLength -= 8;
>     }
>+
>+    start = (UINT8 *)src;
>+
>     if (totalLength > 3) {
>-        sum += *(UINT32 *)src;
>-        src = (UINT64 *)((UINT8 *)src + 4);
>+        UINT32 val = *(UINT32 *)start;
>+        sum += val;
>+        if (sum < val) sum++;
>+        start += 4;
>         totalLength -= 4;
>     }
>-    start = (UINT8 *)src;
>-    tmp.val = 0;
>-    switch (totalLength) {
>-    case 3:
>-        tmp.b8[2] = start[2];
>-    case 2:
>-        tmp.b8[1] = start[1];
>-    case 1:
>-        tmp.b8[0] = start[0];
>-        sum += tmp.val;
>+
>+    if (totalLength > 1) {
>+        UINT16 val = *(UINT16 *)start;
>+        sum += val;
>+        if (sum < val) sum++;
>+        start += 2;
>+        totalLength -= 2;
>     }
>-    sum = (isEvenStart ? sum : swap64(sum)) + initial;
>+
>+    if (totalLength > 0) {
>+        UINT8 val = *start;
>+        sum += val;
>+        if (sum < val) sum++;
>+        start += 1;
>+        totalLength -= 1;
>+    }
>+    ASSERT(totalLength == 0);
>+
>+    if (!isEvenStart) {
>+        sum = _byteswap_uint64(sum);
>+    }
>+
>+    sum += initial;
>+    if (sum < initial) sum++;
>+
>     return sum;
> }
> 
>@@ -428,6 +442,7 @@ CalculateChecksumNB(const PNET_BUFFER nb,
>     ULONG firstMdlLen;
>     /* Running count of bytes in remainder of the MDLs including
>current. */
>     ULONG packetLen;
>+    BOOLEAN swapEnd = 1 & csumDataLen;
> 
>     if ((nb == NULL) || (csumDataLen == 0)
>             || (offset >= NET_BUFFER_DATA_LENGTH(nb))
>@@ -482,10 +497,8 @@ CalculateChecksumNB(const PNET_BUFFER nb,
>     while (csumDataLen && (currentMdl != NULL)) {
>         ASSERT(mdlLen < 65536);
>         csLen = MIN((UINT16) mdlLen, csumDataLen);
>-        //XXX Not handling odd bytes yet.
>-        ASSERT(((csLen & 0x1) == 0) || csumDataLen <= mdlLen);
> 
>-        csum = CalculateOnesComplement(src, csLen, csum, TRUE);

Sai: You can reuse the swapEnd below - !swapEnd

>+        csum = CalculateOnesComplement(src, csLen, csum, !(1 &
>csumDataLen));
>         fold64(csum);
> 
>         csumDataLen -= csLen;
>@@ -504,9 +517,14 @@ CalculateChecksumNB(const PNET_BUFFER nb,
>         }
>     }
> 
>+    fold64(csum);
>     ASSERT(csumDataLen == 0);
>     ASSERT((csum & ~0xffff) == 0);
>-    return (UINT16) ~csum;
>+    csum = (UINT16)~csum;
>+    if (swapEnd) {
>+        return _byteswap_ushort((UINT16)csum);
>+    }
>+    return (UINT16)csum;
> }
> 
> /*
>diff --git a/datapath-windows/ovsext/Vxlan.c
>b/datapath-windows/ovsext/Vxlan.c
>index 2364f28..a179fbe 100644
>--- a/datapath-windows/ovsext/Vxlan.c
>+++ b/datapath-windows/ovsext/Vxlan.c
>@@ -152,9 +152,9 @@ OvsCleanupVxlanTunnel(PIRP irp,
> 
>     if (vxlanPort->filterID != 0) {
>         status = OvsTunnelFilterDelete(irp,
>-				       vxlanPort->filterID,
>-				       callback,
>-				       tunnelContext);
>+                                       vxlanPort->filterID,
>+                                       callback,
>+                                       tunnelContext);
>     } else {
>         OvsFreeMemoryWithTag(vport->priv, OVS_VXLAN_POOL_TAG);
>         vport->priv = NULL;
>@@ -190,6 +190,9 @@ OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
>     POVS_VXLAN_VPORT vportVxlan;
>     UINT32 headRoom = OvsGetVxlanTunHdrSize();
>     UINT32 packetLength;
>+    NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
>+    csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
>+                 
>TcpIpChecksumNetBufferListInfo);
> 
>     /*
>      * XXX: the assumption currently is that the NBL is owned by OVS, and
>@@ -198,20 +201,24 @@ OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
>      */
>     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
>     packetLength = NET_BUFFER_DATA_LENGTH(curNb);
>+
>     if (layers->isTcp) {
>         NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO tsoInfo;
> 
>         tsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
>                 TcpLargeSendNetBufferListInfo);
>-        OVS_LOG_TRACE("MSS %u packet len %u", tsoInfo.LsoV1Transmit.MSS,
>packetLength);
>+        OVS_LOG_TRACE("MSS %u packet len %u", tsoInfo.LsoV1Transmit.MSS,
>+                      packetLength);
>         if (tsoInfo.LsoV1Transmit.MSS) {
>             OVS_LOG_TRACE("l4Offset %d", layers->l4Offset);
>             *newNbl = OvsTcpSegmentNBL(switchContext, curNbl, layers,
>-                        tsoInfo.LsoV1Transmit.MSS, headRoom);
>+                                       tsoInfo.LsoV1Transmit.MSS,
>headRoom);
>             if (*newNbl == NULL) {
>                 OVS_LOG_ERROR("Unable to segment NBL");
>                 return NDIS_STATUS_FAILURE;
>             }
>+            /* Clear out LSO flags after this point */
>+            NET_BUFFER_LIST_INFO(curNbl, TcpLargeSendNetBufferListInfo)
>= 0;
>         }
>     }
> 
>@@ -226,6 +233,61 @@ OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
>             OVS_LOG_ERROR("Unable to copy NBL");
>             return NDIS_STATUS_FAILURE;
>         }
>+        /*
>+         * To this point we do not have VXLAN offloading.
>+         * Apply defined checksums
>+         */

Sai: Correct me if am wrong, this computes checksum for the inner packet
if there is no segmentation.
In case the inner packet is segmented via LSO, shouldn't there be
additional checksum calculations for each inner-segment?
If this is just for non-lso packet, then you can do the following prior to
creating *newNbl to avoid having to compute bufferStart.


>+        curNb = NET_BUFFER_LIST_FIRST_NB(*newNbl);
>+        curMdl = NET_BUFFER_CURRENT_MDL(curNb);
>+        bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl,
>LowPagePriority);
>+        bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
>+
>+        if (layers->isIPv4) {
>+            IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset);
>+
>+            if (csumInfo.Transmit.IpHeaderChecksum) {
>+                ip->check = 0;
>+                ip->check = IPChecksum((UINT8 *)ip, 4 * ip->ihl, 0);
>+            }
>+
>+            if (layers->isTcp && csumInfo.Transmit.TcpChecksum) {
>+                UINT16 csumLength = (UINT16)(packetLength -
>layers->l4Offset);
>+                TCPHdr *tcp = (TCPHdr *)(bufferStart + layers->l4Offset);
>+                tcp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
>+                                              IPPROTO_TCP, csumLength);
>+                tcp->check = CalculateChecksumNB(curNb, csumLength,
>+                 
>(UINT32)(layers->l4Offset));
>+            }
>+            else if (layers->isUdp && csumInfo.Transmit.UdpChecksum) {
>+                UINT16 csumLength = (UINT16)(packetLength -
>layers->l4Offset);
>+                UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip);
>+                udp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
>+                                              IPPROTO_UDP, csumLength);
>+                udp->check = CalculateChecksumNB(curNb, csumLength,
>+                 
>(UINT32)(layers->l4Offset));
>+            }
>+        } else if (layers->isIPv6) {
>+            IPv6Hdr *ip = (IPv6Hdr *)(bufferStart + layers->l3Offset);
>+
>+            if (layers->isTcp && csumInfo.Transmit.TcpChecksum) {
>+                UINT16 csumLength = (UINT16)(packetLength -
>layers->l4Offset);
>+                TCPHdr *tcp = (TCPHdr *)(bufferStart + layers->l4Offset);
>+                tcp->check = IPv6PseudoChecksum((UINT32 *) &ip->saddr,
>+                                                (UINT32 *) &ip->daddr,
>+                                                IPPROTO_TCP, csumLength);
>+                tcp->check = CalculateChecksumNB(curNb, csumLength,
>+                 
>(UINT32)(layers->l4Offset));
>+            }
>+            else if (layers->isUdp && csumInfo.Transmit.UdpChecksum) {
>+                UINT16 csumLength = (UINT16)(packetLength -
>layers->l4Offset);
>+                UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip);
>+                udp->check = IPPseudoChecksum((UINT32 *) &ip->saddr,
>+                                              (UINT32 *)&ip->daddr,
>+                                              IPPROTO_UDP, csumLength);
>+                udp->check = CalculateChecksumNB(curNb, csumLength,
>+                 
>(UINT32)(layers->l4Offset));
>+            }
>+        }
>     }
> 
>     curNbl = *newNbl;
>@@ -257,9 +319,6 @@ OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
>                        sizeof ethHdr->Destination + sizeof
>ethHdr->Source);
>         ethHdr->Type = htons(ETH_TYPE_IPV4);
> 
>-        // XXX: question: there are fields in the OvsIPv4TunnelKey for
>ttl and such,
>-        // should we use those values instead? or will they end up being
>-        // uninitialized;
>         /* IP header */
>         ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
> 
>@@ -277,8 +336,12 @@ OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
>         ASSERT(tunKey->src == fwdInfo->srcIpAddr || tunKey->src == 0);
>         ipHdr->saddr = fwdInfo->srcIpAddr;
>         ipHdr->daddr = fwdInfo->dstIpAddr;
>-        ipHdr->check = 0;
>-        ipHdr->check = IPChecksum((UINT8 *)ipHdr, sizeof *ipHdr, 0);
>+
>+        /* Compute IP checksum only if the NIC does not support
>offloading */

Sai: I don't think this check is appropriate.
csumInfo is still pointing at the inner packet. The ipHdr is for outer
packet. 
This just checks if the underlying VM had offloaded the IpHeaderChecksum
calculation.

>+        if (!csumInfo.Transmit.IpHeaderChecksum) {
>+            ipHdr->check = 0;
>+            ipHdr->check = IPChecksum((UINT8 *)ipHdr, sizeof *ipHdr, 0);
>+        }
> 
>         /* UDP header */
>         udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
>-- 
>1.9.5.msysgit.0
>_______________________________________________
>dev mailing list
>dev@openvswitch.org
>https://urldefense.proofpoint.com/v2/url?u=http-3A__openvswitch.org_mailma
>n_listinfo_dev&d=BQIGaQ&c=Sqcl0Ez6M0X8aeM67LKIiDJAXVeAw-YihVMNtXt-uEs&r=Dc
>ruz40PROJ40ROzSpxyQSLw6fcrOWpJgEcEmNR3JEQ&m=bVjQF29IE0w5Wo086RFicGNW5uvzLe
>F03S9puquy74A&s=4hfAdgUzkEE18RcBwWXcY5SyW5K7eP4LuzMPh_vHLT0&e=
Alin Serdean Sept. 14, 2015, 9:14 p.m. UTC | #2
Thanks for the review Sairam.

I trimmed the message a bit.

See my answers inlined.

>     status = OvsAllocateNBLContext(context, newNbl); diff --git 
>a/datapath-windows/ovsext/Checksum.c
>b/datapath-windows/ovsext/Checksum.c
>index 510a094..5d9b035 100644
>--- a/datapath-windows/ovsext/Checksum.c
>+++ b/datapath-windows/ovsext/Checksum.c

Sai: Are there any tests to validate these changes in Checksum.c? If not, can someone else ack it.

I forgot to add a comment in the code which details on what should happen in tcp segmentation.
 I will add it in v2. But basically it is the following: 
https://msdn.microsoft.com/en-us/library/windows/hardware/ff568840%28v=vs.85%29.aspx

I used a tool to test the changes which is based on winrm.

>-        csum = CalculateOnesComplement(src, csLen, csum, TRUE);

Sai: You can reuse the swapEnd below - !swapEnd

>+        csum = CalculateOnesComplement(src, csLen, csum, !(1 &
>csumDataLen));
>         fold64(csum);
> 
>         csumDataLen -= csLen;

csumDataLen  changes in the inner loop, I cannot reuse swapend.

>@@ -504,9 +517,14 @@ CalculateChecksumNB(const PNET_BUFFER nb,
>+        /*
>+         * To this point we do not have VXLAN offloading.
>+         * Apply defined checksums
>+         */

Sai: Correct me if am wrong, this computes checksum for the inner packet if there is no segmentation.
Alin: You are correct.
Sai: In case the inner packet is segmented via LSO, shouldn't there be additional checksum calculations for each inner-segment?
Alin: it is computed for each inner segment OvsTcpSegmentNBL-> FixSegmentHeader which computes tcp checksums for each NB created by NdisAllocateFragmentNetBufferList.
          Looking again over the code I think we could change it to compute IP checksum only once and not for each segment. I will change this in v2.
Sai: If this is just for non-lso packet, then you can do the following prior to creating *newNbl to avoid having to compute bufferStart.
Alin: I do not want to compute the checksum if the copy failed 

>+        /* Compute IP checksum only if the NIC does not support
>offloading */

Sai: I don't think this check is appropriate.
csumInfo is still pointing at the inner packet. The ipHdr is for outer packet. 
This just checks if the underlying VM had offloaded the IpHeaderChecksum calculation.

>+        if (!csumInfo.Transmit.IpHeaderChecksum) {
>+            ipHdr->check = 0;
>+            ipHdr->check = IPChecksum((UINT8 *)ipHdr, sizeof *ipHdr, 0);
>+        }
> 
>         /* UDP header */
>         udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
>--

Alin: I am going on the following assumption:
If you have LSO or IP/TCP/UDP checksums enabled in your VM it means that the NIC on which the switch is created has them enabled.
The check above means that the VM did not have IP checksum enabled, which means that the physical one did not had it enabled also, telling us to compute it.
If the VM had it enabled, means the physical one had it thus meaning we should not compute it.
It is a rather hard assumption but I will leave it for more discussion if we should leave it or not.

Alin
Sairam Venugopal Sept. 14, 2015, 10:36 p.m. UTC | #3
I have added my response underneath.

Thanks,
Sairam Venugopal

On 9/14/15, 2:14 PM, "Alin Serdean" <aserdean@cloudbasesolutions.com>
wrote:

>Thanks for the review Sairam.
>
>I trimmed the message a bit.
>
>See my answers inlined.
>
>>     status = OvsAllocateNBLContext(context, newNbl); diff --git
>>a/datapath-windows/ovsext/Checksum.c
>>b/datapath-windows/ovsext/Checksum.c
>>index 510a094..5d9b035 100644
>>--- a/datapath-windows/ovsext/Checksum.c
>>+++ b/datapath-windows/ovsext/Checksum.c
>
>Sai: Are there any tests to validate these changes in Checksum.c? If not,
>can someone else ack it.
>
>I forgot to add a comment in the code which details on what should happen
>in tcp segmentation.
> I will add it in v2. But basically it is the following:
>https://urldefense.proofpoint.com/v2/url?u=https-3A__msdn.microsoft.com_en
>-2Dus_library_windows_hardware_ff568840-2528v-3Dvs.85-2529.aspx&d=BQIFAg&c
>=Sqcl0Ez6M0X8aeM67LKIiDJAXVeAw-YihVMNtXt-uEs&r=Dcruz40PROJ40ROzSpxyQSLw6fc
>rOWpJgEcEmNR3JEQ&m=ExYraM9o-Pfuh9GGoulFsO4aDPXT9KTIqsfbz0SZeT8&s=n91dZBAap
>M63Q5s8alYDHlAyaACeBmYW_d-MeAQqeOY&e=
>
>I used a tool to test the changes which is based on winrm.
>
>>-        csum = CalculateOnesComplement(src, csLen, csum, TRUE);
>
>Sai: You can reuse the swapEnd below - !swapEnd
>
>>+        csum = CalculateOnesComplement(src, csLen, csum, !(1 &
>>csumDataLen));
>>         fold64(csum);
>> 
>>         csumDataLen -= csLen;
>
>csumDataLen  changes in the inner loop, I cannot reuse swapend.

Sai: You are right. You can ignore this.
 
>
>>@@ -504,9 +517,14 @@ CalculateChecksumNB(const PNET_BUFFER nb,
>>+        /*
>>+         * To this point we do not have VXLAN offloading.
>>+         * Apply defined checksums
>>+         */
>
>Sai: Correct me if am wrong, this computes checksum for the inner packet
>if there is no segmentation.
>Alin: You are correct.
>Sai: In case the inner packet is segmented via LSO, shouldn't there be
>additional checksum calculations for each inner-segment?
>Alin: it is computed for each inner segment OvsTcpSegmentNBL->
>FixSegmentHeader which computes tcp checksums for each NB created by
>NdisAllocateFragmentNetBufferList.
>          Looking again over the code I think we could change it to
>compute IP checksum only once and not for each segment. I will change
>this in v2.

Sai: OvsTcpSegmentNBL gets called prior to this code. Are you going to
reorder the code to achieve this?

>Sai: If this is just for non-lso packet, then you can do the following
>prior to creating *newNbl to avoid having to compute bufferStart.
>Alin: I do not want to compute the checksum if the copy failed
>
>>+        /* Compute IP checksum only if the NIC does not support
>>offloading */
>
>Sai: I don't think this check is appropriate.
>csumInfo is still pointing at the inner packet. The ipHdr is for outer
>packet. 
>This just checks if the underlying VM had offloaded the IpHeaderChecksum
>calculation.
>
>>+        if (!csumInfo.Transmit.IpHeaderChecksum) {
>>+            ipHdr->check = 0;
>>+            ipHdr->check = IPChecksum((UINT8 *)ipHdr, sizeof *ipHdr, 0);
>>+        }
>> 
>>         /* UDP header */
>>         udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
>>--
>
>Alin: I am going on the following assumption:
>If you have LSO or IP/TCP/UDP checksums enabled in your VM it means that
>the NIC on which the switch is created has them enabled.
>The check above means that the VM did not have IP checksum enabled, which
>means that the physical one did not had it enabled also, telling us to
>compute it.
>If the VM had it enabled, means the physical one had it thus meaning we
>should not compute it.
>It is a rather hard assumption but I will leave it for more discussion if
>we should leave it or not.

Sai: I think you can skip this check and instead offload the task to NDIS.
I did something similar in my patch for enabling checksum in STT.
I was able to get pings to work even when the offloads were disabled on
the Hyper-V Host, but enabled in the underlying VM.
Alin Serdean Sept. 15, 2015, 3:24 a.m. UTC | #4
>
>>@@ -504,9 +517,14 @@ CalculateChecksumNB(const PNET_BUFFER nb,
>>+        /*
>>+         * To this point we do not have VXLAN offloading.
>>+         * Apply defined checksums
>>+         */
>
>Sai: Correct me if am wrong, this computes checksum for the inner 
>packet if there is no segmentation.
>Alin: You are correct.
>Sai: In case the inner packet is segmented via LSO, shouldn't there be 
>additional checksum calculations for each inner-segment?
>Alin: it is computed for each inner segment OvsTcpSegmentNBL-> 
>FixSegmentHeader which computes tcp checksums for each NB created by 
>NdisAllocateFragmentNetBufferList.
>          Looking again over the code I think we could change it to 
>compute IP checksum only once and not for each segment. I will change 
>this in v2.

Sai: OvsTcpSegmentNBL gets called prior to this code. Are you going to reorder the code to achieve this?

Alin: I will change FixSegmentHeader to use a predefined value for ip checksums instead of computing it for every net buffer.
Maybe I misunderstood your commentary, could you please detail your thoughts.

>Sai: If this is just for non-lso packet, then you can do the following 
>prior to creating *newNbl to avoid having to compute bufferStart.
>Alin: I do not want to compute the checksum if the copy failed
>
>>+        /* Compute IP checksum only if the NIC does not support
>>offloading */
>
>Sai: I don't think this check is appropriate.
>csumInfo is still pointing at the inner packet. The ipHdr is for outer 
>packet.
>This just checks if the underlying VM had offloaded the 
>IpHeaderChecksum calculation.
>
>>+        if (!csumInfo.Transmit.IpHeaderChecksum) {
>>+            ipHdr->check = 0;
>>+            ipHdr->check = IPChecksum((UINT8 *)ipHdr, sizeof *ipHdr, 0);
>>+        }
>> 
>>         /* UDP header */
>>         udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
>>--
>
>Alin: I am going on the following assumption:
>If you have LSO or IP/TCP/UDP checksums enabled in your VM it means 
>that the NIC on which the switch is created has them enabled.
>The check above means that the VM did not have IP checksum enabled, 
>which means that the physical one did not had it enabled also, telling 
>us to compute it.
>If the VM had it enabled, means the physical one had it thus meaning we 
>should not compute it.
>It is a rather hard assumption but I will leave it for more discussion 
>if we should leave it or not.

Sai: I think you can skip this check and instead offload the task to NDIS.
I did something similar in my patch for enabling checksum in STT.
I was able to get pings to work even when the offloads were disabled on the Hyper-V Host, but enabled in the underlying VM.

Alin: I will look over your patch tonight. It isn't a question if it works or not. The idea is that I am not fond of lying to the user of the VM that we support capability which doesn't exist. It is a virtual http://www.hackerthreads.org/Topic-10367 :).
diff mbox

Patch

diff --git a/datapath-windows/ovsext/BufferMgmt.c b/datapath-windows/ovsext/BufferMgmt.c
index 3550e20..bf53fc3 100644
--- a/datapath-windows/ovsext/BufferMgmt.c
+++ b/datapath-windows/ovsext/BufferMgmt.c
@@ -1116,7 +1116,8 @@  GetSegmentHeaderInfo(PNET_BUFFER_LIST nbl,
  * --------------------------------------------------------------------------
  */
 static NDIS_STATUS
-FixSegmentHeader(PNET_BUFFER nb, UINT16 segmentSize, UINT32 seqNumber)
+FixSegmentHeader(PNET_BUFFER nb, UINT16 segmentSize, UINT32 seqNumber,
+                 BOOLEAN lastPacket, UINT16 packetCounter)
 {
     EthHdr *dstEth;
     IPHdr *dstIP;
@@ -1141,18 +1142,29 @@  FixSegmentHeader(PNET_BUFFER nb, UINT16 segmentSize, UINT32 seqNumber)
     /* Fix IP length and checksum */
     ASSERT(dstIP->protocol == IPPROTO_TCP);
     dstIP->tot_len = htons(segmentSize + dstIP->ihl * 4 + TCP_HDR_LEN(dstTCP));
+    dstIP->id += packetCounter;
     dstIP->check = 0;
     dstIP->check = IPChecksum((UINT8 *)dstIP, dstIP->ihl * 4, 0);
 
     /* Fix TCP checksum */
     dstTCP->seq = htonl(seqNumber);
-    dstTCP->check =
-        IPPseudoChecksum((UINT32 *)&dstIP->saddr,
-                         (UINT32 *)&dstIP->daddr,
-                         IPPROTO_TCP, segmentSize + TCP_HDR_LEN(dstTCP));
+
+    if (dstTCP->fin) {
+        dstTCP->fin = lastPacket;
+    }
+    if (dstTCP->psh) {
+        dstTCP->psh = lastPacket;
+    }
+
+    UINT16 csumLength = segmentSize + TCP_HDR_LEN(dstTCP);;
+    dstTCP->check = IPPseudoChecksum(&dstIP->saddr,
+        &dstIP->daddr,
+        IPPROTO_TCP,
+        csumLength);
     dstTCP->check = CalculateChecksumNB(nb,
-            (UINT16)(NET_BUFFER_DATA_LENGTH(nb) - sizeof *dstEth - dstIP->ihl * 4),
-            sizeof *dstEth + dstIP->ihl * 4);
+        csumLength,
+        sizeof *dstEth + dstIP->ihl * 4);
+
     return STATUS_SUCCESS;
 }
 
@@ -1190,6 +1202,7 @@  OvsTcpSegmentNBL(PVOID ovsContext,
     NDIS_STATUS status;
     UINT16 segmentSize;
     ULONG copiedSize;
+    UINT16 packetCounter = 0;
 
     srcCtx = (POVS_BUFFER_CONTEXT)NET_BUFFER_LIST_CONTEXT_DATA_START(nbl);
     if (srcCtx == NULL || srcCtx->magic != OVS_CTX_MAGIC) {
@@ -1232,7 +1245,9 @@  OvsTcpSegmentNBL(PVOID ovsContext,
             goto nblcopy_error;
         }
 
-        status = FixSegmentHeader(newNb, segmentSize, seqNumber);
+        status = FixSegmentHeader(newNb, segmentSize, seqNumber,
+                                  NET_BUFFER_NEXT_NB(newNb) == NULL,
+                                  packetCounter);
         if (status != NDIS_STATUS_SUCCESS) {
             goto nblcopy_error;
         }
@@ -1241,6 +1256,7 @@  OvsTcpSegmentNBL(PVOID ovsContext,
         /* Move on to the next segment */
         size -= segmentSize;
         seqNumber += segmentSize;
+        packetCounter++;
     }
 
     status = OvsAllocateNBLContext(context, newNbl);
diff --git a/datapath-windows/ovsext/Checksum.c b/datapath-windows/ovsext/Checksum.c
index 510a094..5d9b035 100644
--- a/datapath-windows/ovsext/Checksum.c
+++ b/datapath-windows/ovsext/Checksum.c
@@ -68,34 +68,48 @@  CalculateOnesComplement(UINT8 *start,
 {
     UINT64  sum = 0, val;
     UINT64  *src = (UINT64 *)start;
-    union {
-        UINT32 val;
-        UINT8  b8[4];
-    } tmp;
-
     while (totalLength > 7) {
         val = *src;
-        sum += (val >> 32) + (val & 0xffffffff);
+        sum += val;
+        if (sum < val) sum++;
         src++;
         totalLength -= 8;
     }
+
+    start = (UINT8 *)src;
+
     if (totalLength > 3) {
-        sum += *(UINT32 *)src;
-        src = (UINT64 *)((UINT8 *)src + 4);
+        UINT32 val = *(UINT32 *)start;
+        sum += val;
+        if (sum < val) sum++;
+        start += 4;
         totalLength -= 4;
     }
-    start = (UINT8 *)src;
-    tmp.val = 0;
-    switch (totalLength) {
-    case 3:
-        tmp.b8[2] = start[2];
-    case 2:
-        tmp.b8[1] = start[1];
-    case 1:
-        tmp.b8[0] = start[0];
-        sum += tmp.val;
+
+    if (totalLength > 1) {
+        UINT16 val = *(UINT16 *)start;
+        sum += val;
+        if (sum < val) sum++;
+        start += 2;
+        totalLength -= 2;
     }
-    sum = (isEvenStart ? sum : swap64(sum)) + initial;
+
+    if (totalLength > 0) {
+        UINT8 val = *start;
+        sum += val;
+        if (sum < val) sum++;
+        start += 1;
+        totalLength -= 1;
+    }
+    ASSERT(totalLength == 0);
+
+    if (!isEvenStart) {
+        sum = _byteswap_uint64(sum);
+    }
+
+    sum += initial;
+    if (sum < initial) sum++;
+
     return sum;
 }
 
@@ -428,6 +442,7 @@  CalculateChecksumNB(const PNET_BUFFER nb,
     ULONG firstMdlLen;
     /* Running count of bytes in remainder of the MDLs including current. */
     ULONG packetLen;
+    BOOLEAN swapEnd = 1 & csumDataLen;
 
     if ((nb == NULL) || (csumDataLen == 0)
             || (offset >= NET_BUFFER_DATA_LENGTH(nb))
@@ -482,10 +497,8 @@  CalculateChecksumNB(const PNET_BUFFER nb,
     while (csumDataLen && (currentMdl != NULL)) {
         ASSERT(mdlLen < 65536);
         csLen = MIN((UINT16) mdlLen, csumDataLen);
-        //XXX Not handling odd bytes yet.
-        ASSERT(((csLen & 0x1) == 0) || csumDataLen <= mdlLen);
 
-        csum = CalculateOnesComplement(src, csLen, csum, TRUE);
+        csum = CalculateOnesComplement(src, csLen, csum, !(1 & csumDataLen));
         fold64(csum);
 
         csumDataLen -= csLen;
@@ -504,9 +517,14 @@  CalculateChecksumNB(const PNET_BUFFER nb,
         }
     }
 
+    fold64(csum);
     ASSERT(csumDataLen == 0);
     ASSERT((csum & ~0xffff) == 0);
-    return (UINT16) ~csum;
+    csum = (UINT16)~csum;
+    if (swapEnd) {
+        return _byteswap_ushort((UINT16)csum);
+    }
+    return (UINT16)csum;
 }
 
 /*
diff --git a/datapath-windows/ovsext/Vxlan.c b/datapath-windows/ovsext/Vxlan.c
index 2364f28..a179fbe 100644
--- a/datapath-windows/ovsext/Vxlan.c
+++ b/datapath-windows/ovsext/Vxlan.c
@@ -152,9 +152,9 @@  OvsCleanupVxlanTunnel(PIRP irp,
 
     if (vxlanPort->filterID != 0) {
         status = OvsTunnelFilterDelete(irp,
-				       vxlanPort->filterID,
-				       callback,
-				       tunnelContext);
+                                       vxlanPort->filterID,
+                                       callback,
+                                       tunnelContext);
     } else {
         OvsFreeMemoryWithTag(vport->priv, OVS_VXLAN_POOL_TAG);
         vport->priv = NULL;
@@ -190,6 +190,9 @@  OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
     POVS_VXLAN_VPORT vportVxlan;
     UINT32 headRoom = OvsGetVxlanTunHdrSize();
     UINT32 packetLength;
+    NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
+    csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
+                                          TcpIpChecksumNetBufferListInfo);
 
     /*
      * XXX: the assumption currently is that the NBL is owned by OVS, and
@@ -198,20 +201,24 @@  OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
      */
     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
     packetLength = NET_BUFFER_DATA_LENGTH(curNb);
+
     if (layers->isTcp) {
         NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO tsoInfo;
 
         tsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
                 TcpLargeSendNetBufferListInfo);
-        OVS_LOG_TRACE("MSS %u packet len %u", tsoInfo.LsoV1Transmit.MSS, packetLength);
+        OVS_LOG_TRACE("MSS %u packet len %u", tsoInfo.LsoV1Transmit.MSS,
+                      packetLength);
         if (tsoInfo.LsoV1Transmit.MSS) {
             OVS_LOG_TRACE("l4Offset %d", layers->l4Offset);
             *newNbl = OvsTcpSegmentNBL(switchContext, curNbl, layers,
-                        tsoInfo.LsoV1Transmit.MSS, headRoom);
+                                       tsoInfo.LsoV1Transmit.MSS, headRoom);
             if (*newNbl == NULL) {
                 OVS_LOG_ERROR("Unable to segment NBL");
                 return NDIS_STATUS_FAILURE;
             }
+            /* Clear out LSO flags after this point */
+            NET_BUFFER_LIST_INFO(curNbl, TcpLargeSendNetBufferListInfo) = 0;
         }
     }
 
@@ -226,6 +233,61 @@  OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
             OVS_LOG_ERROR("Unable to copy NBL");
             return NDIS_STATUS_FAILURE;
         }
+        /*
+         * To this point we do not have VXLAN offloading.
+         * Apply defined checksums
+         */
+        curNb = NET_BUFFER_LIST_FIRST_NB(*newNbl);
+        curMdl = NET_BUFFER_CURRENT_MDL(curNb);
+        bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority);
+        bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
+
+        if (layers->isIPv4) {
+            IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset);
+
+            if (csumInfo.Transmit.IpHeaderChecksum) {
+                ip->check = 0;
+                ip->check = IPChecksum((UINT8 *)ip, 4 * ip->ihl, 0);
+            }
+
+            if (layers->isTcp && csumInfo.Transmit.TcpChecksum) {
+                UINT16 csumLength = (UINT16)(packetLength - layers->l4Offset);
+                TCPHdr *tcp = (TCPHdr *)(bufferStart + layers->l4Offset);
+                tcp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
+                                              IPPROTO_TCP, csumLength);
+                tcp->check = CalculateChecksumNB(curNb, csumLength,
+                                                 (UINT32)(layers->l4Offset));
+            }
+            else if (layers->isUdp && csumInfo.Transmit.UdpChecksum) {
+                UINT16 csumLength = (UINT16)(packetLength - layers->l4Offset);
+                UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip);
+                udp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
+                                              IPPROTO_UDP, csumLength);
+                udp->check = CalculateChecksumNB(curNb, csumLength,
+                                                 (UINT32)(layers->l4Offset));
+            }
+        } else if (layers->isIPv6) {
+            IPv6Hdr *ip = (IPv6Hdr *)(bufferStart + layers->l3Offset);
+
+            if (layers->isTcp && csumInfo.Transmit.TcpChecksum) {
+                UINT16 csumLength = (UINT16)(packetLength - layers->l4Offset);
+                TCPHdr *tcp = (TCPHdr *)(bufferStart + layers->l4Offset);
+                tcp->check = IPv6PseudoChecksum((UINT32 *) &ip->saddr,
+                                                (UINT32 *) &ip->daddr,
+                                                IPPROTO_TCP, csumLength);
+                tcp->check = CalculateChecksumNB(curNb, csumLength,
+                                                 (UINT32)(layers->l4Offset));
+            }
+            else if (layers->isUdp && csumInfo.Transmit.UdpChecksum) {
+                UINT16 csumLength = (UINT16)(packetLength - layers->l4Offset);
+                UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip);
+                udp->check = IPPseudoChecksum((UINT32 *) &ip->saddr,
+                                              (UINT32 *)&ip->daddr,
+                                              IPPROTO_UDP, csumLength);
+                udp->check = CalculateChecksumNB(curNb, csumLength,
+                                                 (UINT32)(layers->l4Offset));
+            }
+        }
     }
 
     curNbl = *newNbl;
@@ -257,9 +319,6 @@  OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
                        sizeof ethHdr->Destination + sizeof ethHdr->Source);
         ethHdr->Type = htons(ETH_TYPE_IPV4);
 
-        // XXX: question: there are fields in the OvsIPv4TunnelKey for ttl and such,
-        // should we use those values instead? or will they end up being
-        // uninitialized;
         /* IP header */
         ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
 
@@ -277,8 +336,12 @@  OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
         ASSERT(tunKey->src == fwdInfo->srcIpAddr || tunKey->src == 0);
         ipHdr->saddr = fwdInfo->srcIpAddr;
         ipHdr->daddr = fwdInfo->dstIpAddr;
-        ipHdr->check = 0;
-        ipHdr->check = IPChecksum((UINT8 *)ipHdr, sizeof *ipHdr, 0);
+
+        /* Compute IP checksum only if the NIC does not support offloading */
+        if (!csumInfo.Transmit.IpHeaderChecksum) {
+            ipHdr->check = 0;
+            ipHdr->check = IPChecksum((UINT8 *)ipHdr, sizeof *ipHdr, 0);
+        }
 
         /* UDP header */
         udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);