diff mbox series

[ovs-dev,v1] netdev-native-tnl: Fix inner L2 pad loss causing checksum errors.

Message ID 1719974958-20931-1-git-send-email-junwang01@cestc.cn
State Changes Requested
Headers show
Series [ovs-dev,v1] netdev-native-tnl: Fix inner L2 pad loss causing checksum errors. | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/intel-ovs-compilation success test: success

Commit Message

Jun Wang July 3, 2024, 2:49 a.m. UTC
We encountered a scenario where, if the received packet contains
padding bytes, and we then add Geneve tunnel encapsulation without
carrying the padding bytes, it results in checksum errors when sending
out. Therefore, adding an inner_l2_pad is necessary.

For example, this type of packet format:
0000   06 6c 6a 71 e1 d3 6c e2 d3 8b ea 24 81 00 03 e9
0010   08 00 45 00 00 28 9d e5 40 00 3b 06 6e 64 0a 6f
0020   05 14 0a fe 19 06 01 bb 22 ae 59 c0 8c 61 8e 26
0030   14 e3 50 10 00 7f ce 3a 00 00 00 00 9e 38 cf 64

Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.")

Signed-off-by: Jun Wang <junwang01@cestc.cn>
---
 lib/dp-packet.h         | 21 ++++++++++++++++++++-
 lib/netdev-native-tnl.c |  3 +++
 2 files changed, 23 insertions(+), 1 deletion(-)

Comments

Mike Pattrick July 5, 2024, 4:11 p.m. UTC | #1
On Tue, Jul 2, 2024 at 10:56 PM Jun Wang <junwang01@cestc.cn> wrote:
>
> We encountered a scenario where, if the received packet contains
> padding bytes, and we then add Geneve tunnel encapsulation without
> carrying the padding bytes, it results in checksum errors when sending
> out. Therefore, adding an inner_l2_pad is necessary.
>
> For example, this type of packet format:
> 0000   06 6c 6a 71 e1 d3 6c e2 d3 8b ea 24 81 00 03 e9
> 0010   08 00 45 00 00 28 9d e5 40 00 3b 06 6e 64 0a 6f
> 0020   05 14 0a fe 19 06 01 bb 22 ae 59 c0 8c 61 8e 26
> 0030   14 e3 50 10 00 7f ce 3a 00 00 00 00 9e 38 cf 64

Hello Jun,

Thank you for this submission. This is an interesting case and I don't
know that we have an appropriate test case for micrograms like this.
Was this the

One question I have is shouldn't we remove the padding while we
encapsulate, not keep it? The encapsulation should always push the
frame size past 64 bytes.


Thanks,
M

>
> Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.")
>
> Signed-off-by: Jun Wang <junwang01@cestc.cn>
> ---
>  lib/dp-packet.h         | 21 ++++++++++++++++++++-
>  lib/netdev-native-tnl.c |  3 +++
>  2 files changed, 23 insertions(+), 1 deletion(-)
>
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> index a75b1c5..d583b28 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -176,6 +176,8 @@ struct dp_packet {
>      ovs_be32 packet_type;          /* Packet type as defined in OpenFlow */
>      uint16_t csum_start;           /* Position to start checksumming from. */
>      uint16_t csum_offset;          /* Offset to place checksum. */
> +    uint16_t inner_l2_pad_size;    /* Detected inner l2 padding size.
> +                                    * Padding is non-pullable. */
>      union {
>          struct pkt_metadata md;
>          uint64_t data[DP_PACKET_CONTEXT_SIZE / 8];
> @@ -209,7 +211,10 @@ static inline void *dp_packet_eth(const struct dp_packet *);
>  static inline void dp_packet_reset_offsets(struct dp_packet *);
>  static inline void dp_packet_reset_offload(struct dp_packet *);
>  static inline uint16_t dp_packet_l2_pad_size(const struct dp_packet *);
> +static inline uint16_t dp_packet_inner_l2_pad_size(const struct dp_packet *);
>  static inline void dp_packet_set_l2_pad_size(struct dp_packet *, uint16_t);
> +static inline void dp_packet_set_inner_l2_pad_size(struct dp_packet *,
> +                                                   uint16_t);
>  static inline void *dp_packet_l2_5(const struct dp_packet *);
>  static inline void dp_packet_set_l2_5(struct dp_packet *, void *);
>  static inline void *dp_packet_l3(const struct dp_packet *);
> @@ -435,6 +440,7 @@ static inline void
>  dp_packet_reset_offsets(struct dp_packet *b)
>  {
>      b->l2_pad_size = 0;
> +    b->inner_l2_pad_size = 0;
>      b->l2_5_ofs = UINT16_MAX;
>      b->l3_ofs = UINT16_MAX;
>      b->l4_ofs = UINT16_MAX;
> @@ -448,6 +454,12 @@ dp_packet_l2_pad_size(const struct dp_packet *b)
>      return b->l2_pad_size;
>  }
>
> +static inline uint16_t
> +dp_packet_inner_l2_pad_size(const struct dp_packet *b)
> +{
> +    return b->inner_l2_pad_size;
> +}
> +
>  static inline void
>  dp_packet_set_l2_pad_size(struct dp_packet *b, uint16_t pad_size)
>  {
> @@ -455,6 +467,13 @@ dp_packet_set_l2_pad_size(struct dp_packet *b, uint16_t pad_size)
>      b->l2_pad_size = pad_size;
>  }
>
> +static inline void
> +dp_packet_set_inner_l2_pad_size(struct dp_packet *b, uint16_t pad_size)
> +{
> +    ovs_assert(pad_size <= dp_packet_size(b));
> +    b->inner_l2_pad_size = pad_size;
> +}
> +
>  static inline void *
>  dp_packet_l2_5(const struct dp_packet *b)
>  {
> @@ -543,7 +562,7 @@ dp_packet_inner_l4_size(const struct dp_packet *b)
>      return OVS_LIKELY(b->inner_l4_ofs != UINT16_MAX)
>             ? (const char *) dp_packet_tail(b)
>             - (const char *) dp_packet_inner_l4(b)
> -           - dp_packet_l2_pad_size(b)
> +           - dp_packet_inner_l2_pad_size(b)
>             : 0;
>  }
>
> diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
> index 0f9f07f..96ffdc1 100644
> --- a/lib/netdev-native-tnl.c
> +++ b/lib/netdev-native-tnl.c
> @@ -156,6 +156,7 @@ netdev_tnl_push_ip_header(struct dp_packet *packet, const void *header,
>      struct eth_header *eth;
>      struct ip_header *ip;
>      struct ovs_16aligned_ip6_hdr *ip6;
> +    uint16_t l2_pad_size;
>
>      eth = dp_packet_push_uninit(packet, size);
>      *ip_tot_size = dp_packet_size(packet) - sizeof (struct eth_header);
> @@ -163,7 +164,9 @@ netdev_tnl_push_ip_header(struct dp_packet *packet, const void *header,
>      memcpy(eth, header, size);
>      /* The encapsulated packet has type Ethernet. Adjust dp_packet. */
>      packet->packet_type = htonl(PT_ETH);
> +    l2_pad_size = dp_packet_l2_pad_size(packet);
>      dp_packet_reset_offsets(packet);
> +    dp_packet_set_inner_l2_pad_size(packet, l2_pad_size);
>      packet->l3_ofs = sizeof (struct eth_header);
>
>      if (netdev_tnl_is_header_ipv6(header)) {
> --
> 1.8.3.1
>
>
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
Jun Wang July 8, 2024, 8 a.m. UTC | #2
>On Tue, Jul 2, 2024 at 10:56 PM Jun Wang <junwang01@cestc.cn> wrote:
>>
>> We encountered a scenario where, if the received packet contains
>> padding bytes, and we then add Geneve tunnel encapsulation without
>> carrying the padding bytes, it results in checksum errors when sending
>> out. Therefore, adding an inner_l2_pad is necessary.
>>
>> For example, this type of packet format:
>> 0000   06 6c 6a 71 e1 d3 6c e2 d3 8b ea 24 81 00 03 e9
>> 0010   08 00 45 00 00 28 9d e5 40 00 3b 06 6e 64 0a 6f
>> 0020   05 14 0a fe 19 06 01 bb 22 ae 59 c0 8c 61 8e 26
>> 0030   14 e3 50 10 00 7f ce 3a 00 00 00 00 9e 38 cf 64
>
> Hello Jun,
>
> Thank you for this submission. This is an interesting case and I don't
> know that we have an appropriate test case for micrograms like this.
> Was this the

Yes, it does require the appropriate test case, but I am not very familiar with this area.

> One question I have is shouldn't we remove the padding while we
> encapsulate, not keep it? The encapsulation should always push the
> frame size past 64 bytes.

I have also considered this approach. Theoretically, padding bytes
should be invalid and need to be removed. Shouldn't this be handled
at the packet reception stage rather than during the subsequent
encapsulation phase?
Alternatively, from another perspective, shouldn't we avoid modifying
any lengths, including meaningless padding bytes?

>Thanks,
>M
>
>>
>> Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.")
>>
>> Signed-off-by: Jun Wang <junwang01@cestc.cn>
>> ---
>>  lib/dp-packet.h         | 21 ++++++++++++++++++++-
>>  lib/netdev-native-tnl.c |  3 +++
>>  2 files changed, 23 insertions(+), 1 deletion(-)
>>
>> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
>> index a75b1c5..d583b28 100644
>> --- a/lib/dp-packet.h
>> +++ b/lib/dp-packet.h
>> @@ -176,6 +176,8 @@ struct dp_packet {
>>      ovs_be32 packet_type;          /* Packet type as defined in OpenFlow */
>>      uint16_t csum_start;           /* Position to start checksumming from. */
>>      uint16_t csum_offset;          /* Offset to place checksum. */
>> +    uint16_t inner_l2_pad_size;    /* Detected inner l2 padding size.
>> +                                    * Padding is non-pullable. */
>>      union {
>>          struct pkt_metadata md;
>>          uint64_t data[DP_PACKET_CONTEXT_SIZE / 8];
>> @@ -209,7 +211,10 @@ static inline void *dp_packet_eth(const struct dp_packet *);
>>  static inline void dp_packet_reset_offsets(struct dp_packet *);
>>  static inline void dp_packet_reset_offload(struct dp_packet *);
>>  static inline uint16_t dp_packet_l2_pad_size(const struct dp_packet *);
>> +static inline uint16_t dp_packet_inner_l2_pad_size(const struct dp_packet *);
>>  static inline void dp_packet_set_l2_pad_size(struct dp_packet *, uint16_t);
>> +static inline void dp_packet_set_inner_l2_pad_size(struct dp_packet *,
>> +                                                   uint16_t);
>>  static inline void *dp_packet_l2_5(const struct dp_packet *);
>>  static inline void dp_packet_set_l2_5(struct dp_packet *, void *);
>>  static inline void *dp_packet_l3(const struct dp_packet *);
>> @@ -435,6 +440,7 @@ static inline void
>>  dp_packet_reset_offsets(struct dp_packet *b)
>>  {
>>      b->l2_pad_size = 0;
>> +    b->inner_l2_pad_size = 0;
>>      b->l2_5_ofs = UINT16_MAX;
>>      b->l3_ofs = UINT16_MAX;
>>      b->l4_ofs = UINT16_MAX;
>> @@ -448,6 +454,12 @@ dp_packet_l2_pad_size(const struct dp_packet *b)
>>      return b->l2_pad_size;
>>  }
>>
>> +static inline uint16_t
>> +dp_packet_inner_l2_pad_size(const struct dp_packet *b)
>> +{
>> +    return b->inner_l2_pad_size;
>> +}
>> +
>>  static inline void
>>  dp_packet_set_l2_pad_size(struct dp_packet *b, uint16_t pad_size)
>>  {
>> @@ -455,6 +467,13 @@ dp_packet_set_l2_pad_size(struct dp_packet *b, uint16_t pad_size)
>>      b->l2_pad_size = pad_size;
>>  }
>>
>> +static inline void
>> +dp_packet_set_inner_l2_pad_size(struct dp_packet *b, uint16_t pad_size)
>> +{
>> +    ovs_assert(pad_size <= dp_packet_size(b));
>> +    b->inner_l2_pad_size = pad_size;
>> +}
>> +
>>  static inline void *
>>  dp_packet_l2_5(const struct dp_packet *b)
>>  {
>> @@ -543,7 +562,7 @@ dp_packet_inner_l4_size(const struct dp_packet *b)
>>      return OVS_LIKELY(b->inner_l4_ofs != UINT16_MAX)
>>             ? (const char *) dp_packet_tail(b)
>>             - (const char *) dp_packet_inner_l4(b)
>> -           - dp_packet_l2_pad_size(b)
>> +           - dp_packet_inner_l2_pad_size(b)
>>             : 0;
>>  }
>>
>> diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
>> index 0f9f07f..96ffdc1 100644
>> --- a/lib/netdev-native-tnl.c
>> +++ b/lib/netdev-native-tnl.c
>> @@ -156,6 +156,7 @@ netdev_tnl_push_ip_header(struct dp_packet *packet, const void *header,
>>      struct eth_header *eth;
>>      struct ip_header *ip;
>>      struct ovs_16aligned_ip6_hdr *ip6;
>> +    uint16_t l2_pad_size;
>>
>>      eth = dp_packet_push_uninit(packet, size);
>>      *ip_tot_size = dp_packet_size(packet) - sizeof (struct eth_header);
>> @@ -163,7 +164,9 @@ netdev_tnl_push_ip_header(struct dp_packet *packet, const void *header,
>>      memcpy(eth, header, size);
>>      /* The encapsulated packet has type Ethernet. Adjust dp_packet. */
>>      packet->packet_type = htonl(PT_ETH);
>> +    l2_pad_size = dp_packet_l2_pad_size(packet);
>>      dp_packet_reset_offsets(packet);
>> +    dp_packet_set_inner_l2_pad_size(packet, l2_pad_size);
>>      packet->l3_ofs = sizeof (struct eth_header);
>>
>>      if (netdev_tnl_is_header_ipv6(header)) {
>> --
>> 1.8.3.1
>>
>>
>>
>> _______________________________________________
>> dev mailing list
>> dev@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>




Jun Wang
Ilya Maximets Aug. 2, 2024, 10:37 p.m. UTC | #3
>>On Tue, Jul 2, 2024 at 10:56 PM Jun Wang <junwang01 at cestc.cn> wrote:
>>>
>>> We encountered a scenario where, if the received packet contains
>>> padding bytes, and we then add Geneve tunnel encapsulation without
>>> carrying the padding bytes, it results in checksum errors when sending
>>> out. Therefore, adding an inner_l2_pad is necessary.
>>>
>>> For example, this type of packet format:
>>> 0000   06 6c 6a 71 e1 d3 6c e2 d3 8b ea 24 81 00 03 e9
>>> 0010   08 00 45 00 00 28 9d e5 40 00 3b 06 6e 64 0a 6f
>>> 0020   05 14 0a fe 19 06 01 bb 22 ae 59 c0 8c 61 8e 26
>>> 0030   14 e3 50 10 00 7f ce 3a 00 00 00 00 9e 38 cf 64
>>
>> Hello Jun,
>>
>> Thank you for this submission. This is an interesting case and I don't
>> know that we have an appropriate test case for micrograms like this.
>> Was this the
> 
> Yes, it does require the appropriate test case, but I am not very familiar with this area.
> 
>> One question I have is shouldn't we remove the padding while we
>> encapsulate, not keep it? The encapsulation should always push the
>> frame size past 64 bytes.
> 
> I have also considered this approach. Theoretically, padding bytes
> should be invalid and need to be removed. Shouldn't this be handled
> at the packet reception stage rather than during the subsequent
> encapsulation phase?

We do not really care about padding to be present in most other cases
and in a basic port-to-port forwarding it may be easier if we just keep
the original padding and not spend extra cycles on removing it.  So, it
might be better to strip it on encapsulation instead of on receive.

> Alternatively, from another perspective, shouldn't we avoid modifying
> any lengths, including meaningless padding bytes?

Padding is not part of any header field, so we don't actually change
much removing it.  It should be fine.


One general question about the problem here though - padding should
not affect checksums under normal circumstances, because it supposed to
be all-zero.  And zeroes do not change network checksums.  Is it not
all-zero in your case?

For the test, you may try something like this (some more changes in
the other tests may be needed due to new ol_tcp_csum option):

diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
index e8bbf8d51..a94837ac5 100644
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -154,6 +154,8 @@ struct netdev_dummy {
     bool ol_ip_csum OVS_GUARDED;
     /* Flag RX packet with good csum. */
     bool ol_ip_csum_set_good OVS_GUARDED;
+    /* Enable netdev TCP csum offload and flag RX packets for offloading. */
+    bool ol_tcp_csum OVS_GUARDED;
     /* Set the segment size for netdev TSO support. */
     int ol_tso_segsz OVS_GUARDED;
 };
@@ -810,6 +812,10 @@ netdev_dummy_get_config(const struct netdev *dev, struct smap *args)
         smap_add_format(args, "ol_ip_csum_set_good", "%s", "true");
     }
 
+    if (netdev->ol_tcp_csum) {
+        smap_add_format(args, "ol_tcp_csum", "%s", "true");
+    }
+
     if (netdev->ol_tso_segsz && userspace_tso_enabled()) {
         smap_add_format(args, "ol_tso_segsz", "%d", netdev->ol_tso_segsz);
     }
@@ -945,6 +951,11 @@ netdev_dummy_set_config(struct netdev *netdev_, const struct smap *args,
         netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
     }
 
+    netdev->ol_tcp_csum = smap_get_bool(args, "ol_tcp_csum", false);
+    if (netdev->ol_tcp_csum) {
+        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
+    }
+
     if (userspace_tso_enabled()) {
         netdev->ol_tso_segsz = smap_get_int(args, "ol_tso_segsz", 0);
         if (netdev->ol_tso_segsz) {
@@ -1136,6 +1147,10 @@ netdev_dummy_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
         dp_packet_ol_set_ip_csum_good(packet);
     }
 
+    if (netdev->ol_tcp_csum) {
+        dp_packet_hwol_set_csum_tcp(packet);
+    }
+
     if (userspace_tso_enabled() && netdev->ol_tso_segsz) {
         dp_packet_set_tso_segsz(packet, netdev->ol_tso_segsz);
         dp_packet_hwol_set_tcp_seg(packet);
diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at
index 7ec4c31ab..1779ede73 100644
--- a/tests/tunnel-push-pop.at
+++ b/tests/tunnel-push-pop.at
@@ -1259,3 +1259,92 @@ hash(l4(0)),recirc(0x2)
 
 OVS_VSWITCHD_STOP
 AT_CLEANUP
+
+AT_SETUP([tunnel_push_pop - encapsulation with L2 padding])
+
+OVS_VSWITCHD_START(
+    [add-port br0 p0 \
+     -- set Interface p0 type=dummy ofport_request=1 \
+                         other-config:hwaddr=aa:55:aa:55:00:00])
+AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg])
+AT_CHECK([ovs-vsctl add-br int-br -- set bridge int-br datapath_type=dummy])
+AT_CHECK([ovs-vsctl add-port int-br t2 \
+          -- set Interface t2 type=geneve options:remote_ip=1.1.2.92 \
+                              options:key=123 options:csum=true \
+                              ofport_request=2])
+
+dnl Setting up an IP address.
+AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK
+])
+dnl Checking that a local route for added IP was successfully installed.
+AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl
+Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local
+])
+AT_CHECK([ovs-ofctl add-flow br0 action=normal])
+AT_CHECK([ovs-ofctl add-flow int-br action=normal])
+
+dnl This ARP reply from p0 has two effects:
+dnl 1. The ARP cache will learn that 1.1.2.92 is at f8:bc:12:44:34:b6.
+dnl 2. The br0 mac learning will learn that f8:bc:12:44:34:b6 is on p0.
+AT_CHECK([ovs-appctl netdev-dummy/receive p0 dnl
+ 'recirc_id(0),in_port(1),dnl
+  eth(src=f8:bc:12:44:34:b6,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),dnl
+  arp(sip=1.1.2.92,tip=1.1.2.88,op=2,sha=f8:bc:12:44:34:b6,tha=00:00:00:00:00:00)'
+])
+
+AT_CHECK([ovs-vsctl -- set Interface p0 options:tx_pcap=p0.pcap])
+on_exit 'ovs-pcap p0.pcap'
+
+dnl Make sure ephemeral ports stay static across tests.
+AT_CHECK([ovs-appctl tnl/egress_port_range 50000 50000], [0], [OK
+])
+
+m4_define([TCP_PKT], [m4_join([,],
+  [eth_src=50:54:00:00:00:0a,eth_dst=50:54:00:00:00:09,eth_type=0x0800],
+  [nw_src=10.1.1.1,nw_dst=10.1.1.2],
+  [nw_proto=6,nw_ttl=64,nw_frag=no],
+  [tp_src=54392,tp_dst=5201,tcp_flags=ack])])
+
+packet=$(ovs-ofctl compose-packet --bare 'TCP_PKT' '')
+echo packet: ${packet}
+dnl Normally the padding should consist of zero bytes, but using non-zero
+dnl bytes here to make sure they are not used for checksum calculation.
+padding=0102030405060708090a0b0c0d0e0f
+
+dnl Encapsulation header.
+eth=f8bc124434b6aa55aa5500000800
+ip4=4500005a00004000401133de010102580101025c
+udp=c35017c10046aaa5
+geneve=0000655800007b00
+encap=${eth}${ip4}${udp}${geneve}
+
+dnl First, trying with checksum offload disabled and a good packet.
+AT_CHECK([ovs-vsctl set Interface int-br options:ol_ip_csum=false])
+AT_CHECK([ovs-vsctl set Interface int-br options:ol_ip_csum_set_good=false])
+AT_CHECK([ovs-vsctl set Interface int-br options:ol_tcp_csum=false])
+
+dnl Output to tunnel from a int-br internal port.
+dnl Checking that the packet arrived and it was correctly encapsulated,
+dnl i.e. has no padding.
+AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}${padding}"])
+OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}$") -eq 1])
+dnl Sending again to exercise the non-miss upcall path.
+AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}${padding}"])
+OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}$") -eq 2])
+
+dnl Enable checksum offload and check that datapath fixes bad IP and TCP
+dnl checksums on the inner packet.
+AT_CHECK([ovs-vsctl set Interface int-br options:ol_ip_csum=true])
+AT_CHECK([ovs-vsctl set Interface int-br options:ol_ip_csum_set_good=true])
+AT_CHECK([ovs-vsctl set Interface int-br options:ol_tcp_csum=true])
+
+dnl Break the IP checksum with --bad-csum and replace the correct TCP
+dnl checksum '0xb106' with incorrect '0xabcd'.
+bad_packet=$(ovs-ofctl compose-packet --bare --bad-csum 'TCP_PKT' '' \
+                | sed 's/b106/abcd/' )
+echo bad_packet: ${bad_packet}
+AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${bad_packet}${padding}"])
+OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}$") -eq 3])
+
+OVS_VSWITCHD_STOP
+AT_CLEANUP
---

Best regards, Ilya Maximets.
Jun Wang Aug. 6, 2024, 3:03 a.m. UTC | #4
Yes, in my environment, the padding bytes in the packets sent by the switch are not zero, 
but according to the RFC standard, they should be all zeros. So, should we enhance the 
code robustness to adapt to this scenario, as in many cases we cannot ensure the switch
operates correctly?

Best regards, Jun Wang.



Jun Wang
Ilya Maximets Aug. 6, 2024, 10:01 a.m. UTC | #5
On 8/6/24 05:03, Jun Wang wrote:
> 
> Yes, in my environment, the padding bytes in the packets sent by the switch are not zero,
> but according to the RFC standard, they should be all zeros. So, should we enhance the
> code robustness to adapt to this scenario, as in many cases we cannot ensure the switch
> operates correctly?

Yeah.  I think it's OK and even good to strip the padding before encapsulation.
So we should do that.  I just wanted to make sure I understand how the issue
happened in the first place.

Thanks for confirming that the padding indeed is not zero.  Though it is concerning
that we're receiving more reports about hardware switches adding non-zero padding
to packets.

Best regards, Ilya Maximets.
diff mbox series

Patch

diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index a75b1c5..d583b28 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -176,6 +176,8 @@  struct dp_packet {
     ovs_be32 packet_type;          /* Packet type as defined in OpenFlow */
     uint16_t csum_start;           /* Position to start checksumming from. */
     uint16_t csum_offset;          /* Offset to place checksum. */
+    uint16_t inner_l2_pad_size;    /* Detected inner l2 padding size.
+                                    * Padding is non-pullable. */
     union {
         struct pkt_metadata md;
         uint64_t data[DP_PACKET_CONTEXT_SIZE / 8];
@@ -209,7 +211,10 @@  static inline void *dp_packet_eth(const struct dp_packet *);
 static inline void dp_packet_reset_offsets(struct dp_packet *);
 static inline void dp_packet_reset_offload(struct dp_packet *);
 static inline uint16_t dp_packet_l2_pad_size(const struct dp_packet *);
+static inline uint16_t dp_packet_inner_l2_pad_size(const struct dp_packet *);
 static inline void dp_packet_set_l2_pad_size(struct dp_packet *, uint16_t);
+static inline void dp_packet_set_inner_l2_pad_size(struct dp_packet *,
+                                                   uint16_t);
 static inline void *dp_packet_l2_5(const struct dp_packet *);
 static inline void dp_packet_set_l2_5(struct dp_packet *, void *);
 static inline void *dp_packet_l3(const struct dp_packet *);
@@ -435,6 +440,7 @@  static inline void
 dp_packet_reset_offsets(struct dp_packet *b)
 {
     b->l2_pad_size = 0;
+    b->inner_l2_pad_size = 0;
     b->l2_5_ofs = UINT16_MAX;
     b->l3_ofs = UINT16_MAX;
     b->l4_ofs = UINT16_MAX;
@@ -448,6 +454,12 @@  dp_packet_l2_pad_size(const struct dp_packet *b)
     return b->l2_pad_size;
 }
 
+static inline uint16_t
+dp_packet_inner_l2_pad_size(const struct dp_packet *b)
+{
+    return b->inner_l2_pad_size;
+}
+
 static inline void
 dp_packet_set_l2_pad_size(struct dp_packet *b, uint16_t pad_size)
 {
@@ -455,6 +467,13 @@  dp_packet_set_l2_pad_size(struct dp_packet *b, uint16_t pad_size)
     b->l2_pad_size = pad_size;
 }
 
+static inline void
+dp_packet_set_inner_l2_pad_size(struct dp_packet *b, uint16_t pad_size)
+{
+    ovs_assert(pad_size <= dp_packet_size(b));
+    b->inner_l2_pad_size = pad_size;
+}
+
 static inline void *
 dp_packet_l2_5(const struct dp_packet *b)
 {
@@ -543,7 +562,7 @@  dp_packet_inner_l4_size(const struct dp_packet *b)
     return OVS_LIKELY(b->inner_l4_ofs != UINT16_MAX)
            ? (const char *) dp_packet_tail(b)
            - (const char *) dp_packet_inner_l4(b)
-           - dp_packet_l2_pad_size(b)
+           - dp_packet_inner_l2_pad_size(b)
            : 0;
 }
 
diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
index 0f9f07f..96ffdc1 100644
--- a/lib/netdev-native-tnl.c
+++ b/lib/netdev-native-tnl.c
@@ -156,6 +156,7 @@  netdev_tnl_push_ip_header(struct dp_packet *packet, const void *header,
     struct eth_header *eth;
     struct ip_header *ip;
     struct ovs_16aligned_ip6_hdr *ip6;
+    uint16_t l2_pad_size;
 
     eth = dp_packet_push_uninit(packet, size);
     *ip_tot_size = dp_packet_size(packet) - sizeof (struct eth_header);
@@ -163,7 +164,9 @@  netdev_tnl_push_ip_header(struct dp_packet *packet, const void *header,
     memcpy(eth, header, size);
     /* The encapsulated packet has type Ethernet. Adjust dp_packet. */
     packet->packet_type = htonl(PT_ETH);
+    l2_pad_size = dp_packet_l2_pad_size(packet);
     dp_packet_reset_offsets(packet);
+    dp_packet_set_inner_l2_pad_size(packet, l2_pad_size);
     packet->l3_ofs = sizeof (struct eth_header);
 
     if (netdev_tnl_is_header_ipv6(header)) {