diff mbox series

[net-next,1/3] udp_tunnel: allow to turn off path mtu discovery on encap sockets

Message ID 20200712200705.9796-2-fw@strlen.de
State Changes Requested
Delegated to: David Miller
Headers show
Series vxlan, geneve: allow to turn off PMTU updates on encap socket | expand

Commit Message

Florian Westphal July 12, 2020, 8:07 p.m. UTC
vxlan and geneve take the to-be-transmitted skb, prepend the
encapsulation header and send the result.

Neither vxlan nor geneve can do anything about a lowered path mtu
except notifying the peer/upper dst entry.
In routed setups, vxlan takes the updated pmtu from the encap sockets'
dst entry and will notify/update the dst entry of the current skb.

Some setups, however, will use vxlan as a bridge port (or openvs vport).

In both cases, no upper dst entry exists.

Without this patch:

1. Client sends x bytes, where x == MTU of vxlan/geneve interface.
2. the encap header is prepended and the encap packet is passed to
   ip_output.
3. If the sk received a pmtu error in the mean time, then ip_output
   will fetch the mtu from the encap socket instead of dev->mtu.
4. ip_output emits an ICMP error to encap socket

The step #4 prevents the route exception from timing out, and setup
remains in a state where the upper layer cannot send MTU-sized packets,
even though the encapsulated packet doesn't exceed the link MTU.

It appears best to configure the encap socket to never learn about path
MTU in these setups.

Next patch will add the VXLAN config plane to use this.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/ipv6.h         | 7 +++++++
 include/net/udp_tunnel.h   | 2 ++
 net/ipv4/udp_tunnel_core.c | 2 ++
 net/ipv6/ip6_udp_tunnel.c  | 7 +++++++
 4 files changed, 18 insertions(+)

Comments

Stefano Brivio July 12, 2020, 10:38 p.m. UTC | #1
Hi,

On Sun, 12 Jul 2020 22:07:03 +0200
Florian Westphal <fw@strlen.de> wrote:

> vxlan and geneve take the to-be-transmitted skb, prepend the
> encapsulation header and send the result.
> 
> Neither vxlan nor geneve can do anything about a lowered path mtu
> except notifying the peer/upper dst entry.

It could, and I think it should, update its MTU, though. I didn't
include this in the original implementation of PMTU discovery for UDP
tunnels as it worked just fine for locally generated and routed
traffic, but here we go.

As PMTU discovery happens, we have a route exception on the lower
layer for the given path, and we know that VXLAN will use that path,
so we also know there's no point in having a higher MTU on the VXLAN
device, it's really the maximum packet size we can use.

See the change to vxlan_xmit_one() in the sketch patch below.

> In routed setups, vxlan takes the updated pmtu from the encap sockets'
> dst entry and will notify/update the dst entry of the current skb.
> 
> Some setups, however, will use vxlan as a bridge port (or openvs vport).

And, on top of that, I think what we're missing on the bridge is to
update the MTU when a port lowers its MTU. The MTU is changed only as
interfaces are added, which feels like a bug. We could use the lower
layer notifier to fix this.

In the sketch below, I'm changing a few lines to adjust the MTU to the
lowest MTU value between all ports, for testing purposes.

I tried to represent the issue you're hitting with a new test case in
the pmtu.sh selftest, also included in the diff. Would that work for
Open vSwitch?

If OVS queries the MTU of VXLAN devices, I guess that should be enough.
I'm not sure it does that though. The changes to the bridge wouldn't
even be needed, but I think it's something we should also fix
eventually.

---
 drivers/net/vxlan.c                 |   13 +++++++++++++
 net/bridge/br_if.c                  |    8 +++++---
 net/bridge/br_input.c               |    6 ++++++
 tools/testing/selftests/net/pmtu.sh |   17 ++++++++++++++++-
 4 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 5bb448ae6c9c..2e051b7366bf 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2580,6 +2580,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
 	return 0;
 }
 
+static int vxlan_change_mtu(struct net_device *dev, int new_mtu);
 static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			   __be32 default_vni, struct vxlan_rdst *rdst,
 			   bool did_rsc)
@@ -2714,6 +2715,18 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		ndst = &rt->dst;
 		skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM);
 
+		/* TODO: doesn't conflict with RFC 7348, RFC 1191, but not ideal
+		 * as we can't track PMTU increases:
+		 * - use a notifier on route cache and add a configuration field
+		 *   to track user changes
+		 * - embed logic from skb_tunnel_check_pmtu() and get this fixed
+		 *   for free for all the tunnels
+		 */
+		if (skb->len > dst_mtu(ndst) - VXLAN_HEADROOM) {
+			vxlan_change_mtu(vxlan->dev,
+					 dst_mtu(ndst) - VXLAN_HEADROOM);
+		}
+
 		tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
 		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
 		err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index a0e9a7937412..6253b6d40d43 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -505,9 +505,11 @@ void br_mtu_auto_adjust(struct net_bridge *br)
 {
 	ASSERT_RTNL();
 
-	/* if the bridge MTU was manually configured don't mess with it */
-	if (br_opt_get(br, BROPT_MTU_SET_BY_USER))
-		return;
+	/* TODO: if (br_opt_get(br, BROPT_MTU_SET_BY_USER)), we should not
+	 * increase the MTU, but skipping decreases breaks functionality:
+	 * - add an 'opt' to track the set value and allow the user to decrease
+	 *   the MTU arbitrarily
+	 */
 
 	/* change to the minimum MTU and clear the flag which was set by
 	 * the bridge ndo_change_mtu callback
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 59a318b9f646..2429f70ce4ee 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -283,6 +283,12 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
 			goto drop;
 	}
 
+	/* TODO: use lower layer notifier instead. Some tunnels implement this
+	 * properly (see e.g. vti6 and pmtu_vti6_link_change_mtu selftest in
+	 * net/pmtu.sh)
+	 */
+	br_mtu_auto_adjust(p->br);
+
 	if (unlikely(is_link_local_ether_addr(dest))) {
 		u16 fwd_mask = p->br->group_fwd_mask_required;
 
diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index 77c09cd339c3..09731d9ea11a 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -169,7 +169,8 @@ tests="
 	cleanup_ipv4_exception		ipv4: cleanup of cached exceptions	1
 	cleanup_ipv6_exception		ipv6: cleanup of cached exceptions	1
 	list_flush_ipv4_exception	ipv4: list and flush cached exceptions	1
-	list_flush_ipv6_exception	ipv6: list and flush cached exceptions	1"
+	list_flush_ipv6_exception	ipv6: list and flush cached exceptions	1
+	pmtu_ipv4_vxlan4_exception_bridge	IPv4 over vxlan4 with bridge		1"
 
 NS_A="ns-A"
 NS_B="ns-B"
@@ -864,6 +865,20 @@ test_pmtu_ipv4_vxlan4_exception() {
 	test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan  4 4
 }
 
+test_pmtu_ipv4_vxlan4_exception_bridge() {
+	test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan  4 4
+
+	ip -n ns-A link add br0 type bridge
+	ip -n ns-A link set br0 up
+	ip -n ns-A link set dev br0 mtu 5000
+	ip -n ns-A link set vxlan_a master br0
+
+	ip -n ns-A addr del 192.168.2.1/24 dev vxlan_a
+	ip -n ns-A addr add 192.168.2.1/24 dev br0
+
+	ping -c 1 -w 2 -M want -s 5000 192.168.2.2
+}
+
 test_pmtu_ipv6_vxlan4_exception() {
 	test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan  6 4
 }
Florian Westphal July 13, 2020, 8:04 a.m. UTC | #2
Stefano Brivio <sbrivio@redhat.com> wrote:
> Hi,
> 
> On Sun, 12 Jul 2020 22:07:03 +0200
> Florian Westphal <fw@strlen.de> wrote:
> 
> > vxlan and geneve take the to-be-transmitted skb, prepend the
> > encapsulation header and send the result.
> > 
> > Neither vxlan nor geneve can do anything about a lowered path mtu
> > except notifying the peer/upper dst entry.
> 
> It could, and I think it should, update its MTU, though. I didn't
> include this in the original implementation of PMTU discovery for UDP
> tunnels as it worked just fine for locally generated and routed
> traffic, but here we go.

I don't think its a good idea to muck with network config in response
to untrusted entity.

> As PMTU discovery happens, we have a route exception on the lower
> layer for the given path, and we know that VXLAN will use that path,
> so we also know there's no point in having a higher MTU on the VXLAN
> device, it's really the maximum packet size we can use.

No, in the setup that prompted this series the route exception is wrong.
The current "fix" is a shell script that flushes the exception as soon
as its added to keep the tunnel working...

> > Some setups, however, will use vxlan as a bridge port (or openvs vport).
> 
> And, on top of that, I think what we're missing on the bridge is to
> update the MTU when a port lowers its MTU. The MTU is changed only as
> interfaces are added, which feels like a bug. We could use the lower
> layer notifier to fix this.

I will defer to someone who knows bridges better but I think that
in bridge case we 100% depend on a human to set everything.

bridge might be forwarding frames of non-ip protocol and I worry that
this is a self-induced DoS when we start to alter configuration behind
sysadmins back.

> I tried to represent the issue you're hitting with a new test case in
> the pmtu.sh selftest, also included in the diff. Would that work for
> Open vSwitch?

No idea, I don't understand how it can work at all, we can't 'chop
up'/mangle l2 frame in arbitrary fashion to somehow make them pass to
the output port.  We also can't influence MTU config of the links peer.

> If OVS queries the MTU of VXLAN devices, I guess that should be enough.

What should it be doing...?
Stefano Brivio July 13, 2020, 10:04 a.m. UTC | #3
On Mon, 13 Jul 2020 10:04:13 +0200
Florian Westphal <fw@strlen.de> wrote:

> Stefano Brivio <sbrivio@redhat.com> wrote:
> > Hi,
> > 
> > On Sun, 12 Jul 2020 22:07:03 +0200
> > Florian Westphal <fw@strlen.de> wrote:
> >   
> > > vxlan and geneve take the to-be-transmitted skb, prepend the
> > > encapsulation header and send the result.
> > > 
> > > Neither vxlan nor geneve can do anything about a lowered path mtu
> > > except notifying the peer/upper dst entry.  
> > 
> > It could, and I think it should, update its MTU, though. I didn't
> > include this in the original implementation of PMTU discovery for UDP
> > tunnels as it worked just fine for locally generated and routed
> > traffic, but here we go.  
> 
> I don't think its a good idea to muck with network config in response
> to untrusted entity.

I agree that this (changing MTU on VXLAN) looks like a further step,
but the practical effect is zero: we can't send those packets already
today.

PMTU discovery has security impacts, and they are mentioned in the
RFCs. Also here, we wouldn't increase the MTU as a result, and if the
entity is considered untrusted, considerations from RFC 8201 and RFC
4890 cover that.

In practice, we might have broken networks, but at a practical level, I
guess it's enough to not make the situation any worse.

> > As PMTU discovery happens, we have a route exception on the lower
> > layer for the given path, and we know that VXLAN will use that path,
> > so we also know there's no point in having a higher MTU on the VXLAN
> > device, it's really the maximum packet size we can use.  
> 
> No, in the setup that prompted this series the route exception is wrong.
> The current "fix" is a shell script that flushes the exception as soon
> as its added to keep the tunnel working...

Oh, oops.

Well, as I mentioned, if this is breaking setups and this series is the
only way to fix things, I have nothing against it, we can still work on
a more comprehensive solution (including the bridge) once we have it.

> > > Some setups, however, will use vxlan as a bridge port (or openvs vport).  
> > 
> > And, on top of that, I think what we're missing on the bridge is to
> > update the MTU when a port lowers its MTU. The MTU is changed only as
> > interfaces are added, which feels like a bug. We could use the lower
> > layer notifier to fix this.  
> 
> I will defer to someone who knows bridges better but I think that
> in bridge case we 100% depend on a human to set everything.

Not entirely, MTU is auto-adjusted when interfaces are added (unless
the user set it explicitly), however:

> bridge might be forwarding frames of non-ip protocol and I worry that
> this is a self-induced DoS when we start to alter configuration behind
> sysadmins back.

...yes, I agree that the matter with the bridge is different. And we
don't know if that fixes anything else than the selftest I showed, so
let's forget about the bridge for a moment.

> > I tried to represent the issue you're hitting with a new test case in
> > the pmtu.sh selftest, also included in the diff. Would that work for
> > Open vSwitch?  
> 
> No idea, I don't understand how it can work at all, we can't 'chop
> up'/mangle l2 frame in arbitrary fashion to somehow make them pass to
> the output port.  We also can't influence MTU config of the links peer.

Sorry I didn't expand right away.

In the test case I showed, it works because at that point sending
packets to the bridge will result in an error, and the (local) sender
fragments. Let's set this aside together with the bridge affair, though.

Back to VXLAN and OVS: OVS implements a "check_pkt_len" action, cf.
commit 4d5ec89fc8d1 ("net: openvswitch: Add a new action
check_pkt_len"), that should be used when packets exceed link MTUs:

  With the help of this action, OVN will check the packet length
  and if it is greater than the MTU size, it will generate an
  ICMP packet (type 3, code 4) and includes the next hop mtu in it
  so that the sender can fragment the packets.

and my understanding is that this can only work if we reflect the
effective MTU on the device itself (including VXLAN).

Side note: I'm not fond of the idea behind that OVS action because I
think it competes with the kernel (and with ICMP itself, or PLPMTUD if
ICMP is not an option) to do PMTU discovery.

However, if that already works for OVS (I really don't know. Aaron,
Numan?), perhaps you could simply consider going with that solution...
Numan Siddique July 13, 2020, 10:51 a.m. UTC | #4
On Mon, Jul 13, 2020 at 3:34 PM Stefano Brivio <sbrivio@redhat.com> wrote:
>
> On Mon, 13 Jul 2020 10:04:13 +0200
> Florian Westphal <fw@strlen.de> wrote:
>
> > Stefano Brivio <sbrivio@redhat.com> wrote:
> > > Hi,
> > >
> > > On Sun, 12 Jul 2020 22:07:03 +0200
> > > Florian Westphal <fw@strlen.de> wrote:
> > >
> > > > vxlan and geneve take the to-be-transmitted skb, prepend the
> > > > encapsulation header and send the result.
> > > >
> > > > Neither vxlan nor geneve can do anything about a lowered path mtu
> > > > except notifying the peer/upper dst entry.
> > >
> > > It could, and I think it should, update its MTU, though. I didn't
> > > include this in the original implementation of PMTU discovery for UDP
> > > tunnels as it worked just fine for locally generated and routed
> > > traffic, but here we go.
> >
> > I don't think its a good idea to muck with network config in response
> > to untrusted entity.
>
> I agree that this (changing MTU on VXLAN) looks like a further step,
> but the practical effect is zero: we can't send those packets already
> today.
>
> PMTU discovery has security impacts, and they are mentioned in the
> RFCs. Also here, we wouldn't increase the MTU as a result, and if the
> entity is considered untrusted, considerations from RFC 8201 and RFC
> 4890 cover that.
>
> In practice, we might have broken networks, but at a practical level, I
> guess it's enough to not make the situation any worse.
>
> > > As PMTU discovery happens, we have a route exception on the lower
> > > layer for the given path, and we know that VXLAN will use that path,
> > > so we also know there's no point in having a higher MTU on the VXLAN
> > > device, it's really the maximum packet size we can use.
> >
> > No, in the setup that prompted this series the route exception is wrong.
> > The current "fix" is a shell script that flushes the exception as soon
> > as its added to keep the tunnel working...
>
> Oh, oops.
>
> Well, as I mentioned, if this is breaking setups and this series is the
> only way to fix things, I have nothing against it, we can still work on
> a more comprehensive solution (including the bridge) once we have it.
>
> > > > Some setups, however, will use vxlan as a bridge port (or openvs vport).
> > >
> > > And, on top of that, I think what we're missing on the bridge is to
> > > update the MTU when a port lowers its MTU. The MTU is changed only as
> > > interfaces are added, which feels like a bug. We could use the lower
> > > layer notifier to fix this.
> >
> > I will defer to someone who knows bridges better but I think that
> > in bridge case we 100% depend on a human to set everything.
>
> Not entirely, MTU is auto-adjusted when interfaces are added (unless
> the user set it explicitly), however:
>
> > bridge might be forwarding frames of non-ip protocol and I worry that
> > this is a self-induced DoS when we start to alter configuration behind
> > sysadmins back.
>
> ...yes, I agree that the matter with the bridge is different. And we
> don't know if that fixes anything else than the selftest I showed, so
> let's forget about the bridge for a moment.
>
> > > I tried to represent the issue you're hitting with a new test case in
> > > the pmtu.sh selftest, also included in the diff. Would that work for
> > > Open vSwitch?
> >
> > No idea, I don't understand how it can work at all, we can't 'chop
> > up'/mangle l2 frame in arbitrary fashion to somehow make them pass to
> > the output port.  We also can't influence MTU config of the links peer.
>
> Sorry I didn't expand right away.
>
> In the test case I showed, it works because at that point sending
> packets to the bridge will result in an error, and the (local) sender
> fragments. Let's set this aside together with the bridge affair, though.
>
> Back to VXLAN and OVS: OVS implements a "check_pkt_len" action, cf.
> commit 4d5ec89fc8d1 ("net: openvswitch: Add a new action
> check_pkt_len"), that should be used when packets exceed link MTUs:
>
>   With the help of this action, OVN will check the packet length
>   and if it is greater than the MTU size, it will generate an
>   ICMP packet (type 3, code 4) and includes the next hop mtu in it
>   so that the sender can fragment the packets.
>
> and my understanding is that this can only work if we reflect the
> effective MTU on the device itself (including VXLAN).
>

check_pkt_len is OVS datapath action and the corresponding OVS action
is  check_pkt_larger.

Logically It is expected to use this way in the OVS flows- >
    reg0[0] = check_pkt_larger(1500);
    if reg0[0[ == 1; then take some action.

In the case of OVN, if the register reg0[0] bit is set, then we
generate ICMP error packet (type 3, code 4).

I don't know the requirements or the issue this patch is trying to
address. But I think for OVS, there has to be
a controller (like OVN) which makes use of the check_pkt_larger action
and takes necessary action by adding
appropriate OF flows based on the result of check_pkt_larger.

Thanks
Numan

>
> Side note: I'm not fond of the idea behind that OVS action because I
> think it competes with the kernel (and with ICMP itself, or PLPMTUD if
> ICMP is not an option) to do PMTU discovery.
>
> However, if that already works for OVS (I really don't know. Aaron,
> Numan?), perhaps you could simply consider going with that solution...
>
> --
> Stefano
>
David Ahern July 13, 2020, 1:25 p.m. UTC | #5
On 7/13/20 2:04 AM, Florian Westphal wrote:
>> As PMTU discovery happens, we have a route exception on the lower
>> layer for the given path, and we know that VXLAN will use that path,
>> so we also know there's no point in having a higher MTU on the VXLAN
>> device, it's really the maximum packet size we can use.
> No, in the setup that prompted this series the route exception is wrong.

Why is the exception wrong and why can't the exception code be fixed to
include tunnel headers?
Florian Westphal July 13, 2020, 2:02 p.m. UTC | #6
David Ahern <dsahern@gmail.com> wrote:
> On 7/13/20 2:04 AM, Florian Westphal wrote:
> >> As PMTU discovery happens, we have a route exception on the lower
> >> layer for the given path, and we know that VXLAN will use that path,
> >> so we also know there's no point in having a higher MTU on the VXLAN
> >> device, it's really the maximum packet size we can use.
> > No, in the setup that prompted this series the route exception is wrong.
> 
> Why is the exception wrong and why can't the exception code be fixed to
> include tunnel headers?

I don't know.  This occurs in a 3rd party (read: "cloud") environment.
After some days, tcp connections on the overlay network hang.

Flushing the route exception in the namespace of the vxlan interface makes
the traffic flow again, i.e. if the vxlan tunnel would just use the
physical devices MTU things would be fine.

I don't know what you mean by 'fix exception code to include tunnel
headers'.  Can you elaborate?

AFAICS everyhing functions as designed, except:
1. The route exception should not exist in first place in this case
2. The route exception never times out (gets refreshed every time
   tunnel tries to send a mtu-sized packet).
3. The original sender never learns about the pmtu event

Regarding 3) I had cooked up patches to inject a new ICMP error
into the bridge input path from vxlan_err_lookup() to let the sender
know the path MTU reduction.

Unfortunately it only works with Linux bridge (openvswitch tosses the
packet).  Also, too many (internal) reviews told me they consider this
an ugly hack, so I am not too keen on continuing down that route:

https://git.breakpoint.cc/cgit/fw/net-next.git/commit/?h=udp_tun_pmtud_12&id=ca5b0af203b6f8010f1e585850620db4561baae7
David Ahern July 13, 2020, 2:41 p.m. UTC | #7
On 7/13/20 8:02 AM, Florian Westphal wrote:
> David Ahern <dsahern@gmail.com> wrote:
>> On 7/13/20 2:04 AM, Florian Westphal wrote:
>>>> As PMTU discovery happens, we have a route exception on the lower
>>>> layer for the given path, and we know that VXLAN will use that path,
>>>> so we also know there's no point in having a higher MTU on the VXLAN
>>>> device, it's really the maximum packet size we can use.
>>> No, in the setup that prompted this series the route exception is wrong.
>>
>> Why is the exception wrong and why can't the exception code be fixed to
>> include tunnel headers?
> 
> I don't know.  This occurs in a 3rd party (read: "cloud") environment.
> After some days, tcp connections on the overlay network hang.
> 
> Flushing the route exception in the namespace of the vxlan interface makes
> the traffic flow again, i.e. if the vxlan tunnel would just use the
> physical devices MTU things would be fine.
> 
> I don't know what you mean by 'fix exception code to include tunnel
> headers'.  Can you elaborate?

lwtunnel has lwtunnel_headroom which allows ipv4_mtu to accommodate the
space needed for the encap header. Can something similar be adapted for
the device based tunnels?

> 
> AFAICS everyhing functions as designed, except:
> 1. The route exception should not exist in first place in this case
> 2. The route exception never times out (gets refreshed every time
>    tunnel tries to send a mtu-sized packet).
> 3. The original sender never learns about the pmtu event

meaning the VM / container? ie., this is a VPC using VxLAN in the host
to send packets to another hypervisor. If that is the case why isn't the
underlay MTU bumped to handle the encap header, or the VMs MTU lowered
to handle the encap header? seems like a config problem.
Florian Westphal July 13, 2020, 2:59 p.m. UTC | #8
David Ahern <dsahern@gmail.com> wrote:
> On 7/13/20 8:02 AM, Florian Westphal wrote:
> > David Ahern <dsahern@gmail.com> wrote:
> >> On 7/13/20 2:04 AM, Florian Westphal wrote:
> >>>> As PMTU discovery happens, we have a route exception on the lower
> >>>> layer for the given path, and we know that VXLAN will use that path,
> >>>> so we also know there's no point in having a higher MTU on the VXLAN
> >>>> device, it's really the maximum packet size we can use.
> >>> No, in the setup that prompted this series the route exception is wrong.
> >>
> >> Why is the exception wrong and why can't the exception code be fixed to
> >> include tunnel headers?
> > 
> > I don't know.  This occurs in a 3rd party (read: "cloud") environment.
> > After some days, tcp connections on the overlay network hang.
> > 
> > Flushing the route exception in the namespace of the vxlan interface makes
> > the traffic flow again, i.e. if the vxlan tunnel would just use the
> > physical devices MTU things would be fine.
> > 
> > I don't know what you mean by 'fix exception code to include tunnel
> > headers'.  Can you elaborate?
> 
> lwtunnel has lwtunnel_headroom which allows ipv4_mtu to accommodate the
> space needed for the encap header. Can something similar be adapted for
> the device based tunnels?

I don't see how it would help for this particular problem.

> > AFAICS everyhing functions as designed, except:
> > 1. The route exception should not exist in first place in this case
> > 2. The route exception never times out (gets refreshed every time
> >    tunnel tries to send a mtu-sized packet).
> > 3. The original sender never learns about the pmtu event
> 
> meaning the VM / container? ie., this is a VPC using VxLAN in the host
> to send packets to another hypervisor. If that is the case why isn't the
> underlay MTU bumped to handle the encap header, or the VMs MTU lowered
> to handle the encap header? seems like a config problem.

Its configured properly:

ovs bridge mtu: 1450
vxlan device mtu: 1450
physical link: 1500

so, packets coming in on the bridge (local tx or from remote bridge port)
can have the enap header (50 bytes) prepended without exceeding the
physical link mtu.

When the vxlan driver calls the ip output path, this line:

        mtu = ip_skb_dst_mtu(sk, skb);

in __ip_finish_output() will fetch the MTU based of the encap socket,
which will now be 1450 due to that route exception.

So this will behave as if someone had lowered the physical link mtu to 1450:
IP stack drops the packet and sends an icmp error (fragmentation needed,
MTU 1450).  The MTU of the VXLAN port is already at 1450.

I could make a patch that lowers the vxlan port MTU to 1450 - 50 (encap
overhead) automatically, but I don't think making such change
automatically is a good idea.

With this proposed patch, the MTU retrieved would always be the link
MTU.

I don't think this patch is enough to resolve PMTU in general of course,
after all the VXLAN peer might be unable to receive packets larger than
what the ICMP error announces.  But I do not know how to resolve this
in the general case as everyone has a differnt opinion on how (and where)
this needs to be handled.
Stefano Brivio July 13, 2020, 3:57 p.m. UTC | #9
On Mon, 13 Jul 2020 16:59:11 +0200
Florian Westphal <fw@strlen.de> wrote:

> Its configured properly:
> 
> ovs bridge mtu: 1450
> vxlan device mtu: 1450
> physical link: 1500

Okay, so my proposal to reflect the discovered PMTU on the MTU of the
VXLAN device won't help in your case.

In the test case I drafted, configuring bridge and VXLAN with those
MTUs (by means of PMTU discovery) is enough for the sender to adjust
packet size and MTU-sized packets go through. I guess the OVS case is
not equivalent to it, then.

> so, packets coming in on the bridge (local tx or from remote bridge port)
> can have the enap header (50 bytes) prepended without exceeding the
> physical link mtu.
> 
> When the vxlan driver calls the ip output path, this line:
> 
>         mtu = ip_skb_dst_mtu(sk, skb);
> 
> in __ip_finish_output() will fetch the MTU based of the encap socket,
> which will now be 1450 due to that route exception.
> 
> So this will behave as if someone had lowered the physical link mtu to 1450:
> IP stack drops the packet and sends an icmp error (fragmentation needed,
> MTU 1450).  The MTU of the VXLAN port is already at 1450.

It's not clear to me why the behaviour on this path is different from
routed traffic. I understand the impact of bridged traffic on error
reporting, but not here.

Does it have something to do with metadata-based tunnels? Should we omit
the call to skb_tunnel_check_pmtu() call in vxlan_xmit_one() in that
case (if (info)) because the dst is not the same dst?

> [...]
>
> I don't think this patch is enough to resolve PMTU in general of course,
> after all the VXLAN peer might be unable to receive packets larger than
> what the ICMP error announces.  But I do not know how to resolve this
> in the general case as everyone has a differnt opinion on how (and where)
> this needs to be handled.

The sender here is sending packets matching the MTU, interface MTUs are
correct, so we wouldn't benefit from "extending" PMTU discovery for
this specific problem and we can let that topic aside for now, correct?
Florian Westphal July 13, 2020, 4:22 p.m. UTC | #10
Stefano Brivio <sbrivio@redhat.com> wrote:
> > so, packets coming in on the bridge (local tx or from remote bridge port)
> > can have the enap header (50 bytes) prepended without exceeding the
> > physical link mtu.
> > 
> > When the vxlan driver calls the ip output path, this line:
> > 
> >         mtu = ip_skb_dst_mtu(sk, skb);
> > 
> > in __ip_finish_output() will fetch the MTU based of the encap socket,
> > which will now be 1450 due to that route exception.
> > 
> > So this will behave as if someone had lowered the physical link mtu to 1450:
> > IP stack drops the packet and sends an icmp error (fragmentation needed,
> > MTU 1450).  The MTU of the VXLAN port is already at 1450.
> 
> It's not clear to me why the behaviour on this path is different from
> routed traffic. I understand the impact of bridged traffic on error
> reporting, but not here.

In routing case:
1. pmtu notification is received
2. route exception is added
3. next MTU-sized packet in vxlan triggers the if () condition in
   skb_tunnel_check_pmtu()
4. skb_dst_update_pmtu() gets called, new nexthop exception is added
5. packet is dropped in ip_output (too large)
6. next MTU-sized packet to be forwarded triggers PMTU check in
   ip_forward()
7. ip_forward drops packet and sends an icmp error for new mtu (1400 in
    the example)
8. sender receives+updates path mtu
9. next packet will be small enough

In Bridge case, 4) is a noop and even if we had dst entries here,
we do not enter ip_forward path for bridged case.

> Does it have something to do with metadata-based tunnels?

No.

> Should we omit
> the call to skb_tunnel_check_pmtu() call in vxlan_xmit_one() in that
> case (if (info)) because the dst is not the same dst?

skb_dst_update_pmtu is already omitted in this scenario since dst is NULL.

> > I don't think this patch is enough to resolve PMTU in general of course,
> > after all the VXLAN peer might be unable to receive packets larger than
> > what the ICMP error announces.  But I do not know how to resolve this
> > in the general case as everyone has a differnt opinion on how (and where)
> > this needs to be handled.
> 
> The sender here is sending packets matching the MTU, interface MTUs are
> correct, so we wouldn't benefit from "extending" PMTU discovery for
> this specific problem and we can let that topic aside for now, correct?

Yes and no.  What the hack patches (not this series, the icmp error
injection series for bridge...) does is to inject a new icmp error from
the vxlan icmp error processing callback that will report an MTU of
'received mtu - vxlan_overhead' to the sender.

So, the sender receives a PMTU update for 1400 in the given scenario.

Its not nice of course, as sender emitted a MTU-sized packet (1450)
to an on-link destination, only to be told by that *alleged* on-link
destination (address spoofed by bridge) that it needs to use 1400.

I don't see any better solution, since netdev police failed to make
such setups illegal 8)
Stefano Brivio July 14, 2020, 12:33 p.m. UTC | #11
On Mon, 13 Jul 2020 18:22:55 +0200
Florian Westphal <fw@strlen.de> wrote:

> Stefano Brivio <sbrivio@redhat.com> wrote:
> > > so, packets coming in on the bridge (local tx or from remote bridge port)
> > > can have the enap header (50 bytes) prepended without exceeding the
> > > physical link mtu.
> > > 
> > > When the vxlan driver calls the ip output path, this line:
> > > 
> > >         mtu = ip_skb_dst_mtu(sk, skb);
> > > 
> > > in __ip_finish_output() will fetch the MTU based of the encap socket,
> > > which will now be 1450 due to that route exception.
> > > 
> > > So this will behave as if someone had lowered the physical link mtu to 1450:
> > > IP stack drops the packet and sends an icmp error (fragmentation needed,
> > > MTU 1450).  The MTU of the VXLAN port is already at 1450.  
> > 
> > It's not clear to me why the behaviour on this path is different from
> > routed traffic. I understand the impact of bridged traffic on error
> > reporting, but not here.  
> 
> In routing case:
> 1. pmtu notification is received
> 2. route exception is added
> 3. next MTU-sized packet in vxlan triggers the if () condition in
>    skb_tunnel_check_pmtu()
> 4. skb_dst_update_pmtu() gets called, new nexthop exception is added
> 5. packet is dropped in ip_output (too large)
> 6. next MTU-sized packet to be forwarded triggers PMTU check in
>    ip_forward()
> 7. ip_forward drops packet and sends an icmp error for new mtu (1400 in
>     the example)
> 8. sender receives+updates path mtu
> 9. next packet will be small enough

I'm not sure it changes the conclusion or if it affects your problem in
any way, but what I see in the routing case is a bit different.

Running the pmtu_ipv4_vxlan4_exception test from pmtu.sh with 1500 as
lowest MTU on the path shows effectively a MTU of 1450 on the link
(1424 bytes of inner IP payload in the first packet going through,
inner IPv4 total length being 1444).

That's because we already relay an ICMP Fragmentation Needed at step 5,
and in the next step the packet is small enough.

> In Bridge case, 4) is a noop and even if we had dst entries here,
> we do not enter ip_forward path for bridged case.

Also not in my test, because:

> [...]
>
> > Should we omit
> > the call to skb_tunnel_check_pmtu() call in vxlan_xmit_one() in that
> > case (if (info)) because the dst is not the same dst?  
> 
> skb_dst_update_pmtu is already omitted in this scenario since dst is NULL.

...skb_dst_update_pmtu(), there, is called with 'ndst' (dst for the
route returned by vxlan_get_route()), not skb->dst. But yes, as you
mentioned, we don't hit ip_forward(), so that doesn't matter.

The original problem remains, and unless we find another explanation
I'd go ahead and start reviewing this series, FWIW.

> > > I don't think this patch is enough to resolve PMTU in general of course,
> > > after all the VXLAN peer might be unable to receive packets larger than
> > > what the ICMP error announces.  But I do not know how to resolve this
> > > in the general case as everyone has a differnt opinion on how (and where)
> > > this needs to be handled.  
> > 
> > The sender here is sending packets matching the MTU, interface MTUs are
> > correct, so we wouldn't benefit from "extending" PMTU discovery for
> > this specific problem and we can let that topic aside for now, correct?  
> 
> Yes and no.  What the hack patches (not this series, the icmp error
> injection series for bridge...) does is to inject a new icmp error from
> the vxlan icmp error processing callback that will report an MTU of
> 'received mtu - vxlan_overhead' to the sender.

Okay, I see.

> So, the sender receives a PMTU update for 1400 in the given scenario.
> 
> Its not nice of course, as sender emitted a MTU-sized packet (1450)
> to an on-link destination, only to be told by that *alleged* on-link
> destination (address spoofed by bridge) that it needs to use 1400.

Still, I don't think that we should use 1400, assuming that 1450 is in
fact the right value.

> I don't see any better solution, since netdev police failed to make
> such setups illegal 8)

I'll file a complaint :)
Stefano Brivio July 14, 2020, 12:33 p.m. UTC | #12
On Mon, 13 Jul 2020 16:02:19 +0200
Florian Westphal <fw@strlen.de> wrote:

> AFAICS everyhing functions as designed, except:
> 1. The route exception should not exist in first place in this case
> 2. The route exception never times out (gets refreshed every time
>    tunnel tries to send a mtu-sized packet).
> 3. The original sender never learns about the pmtu event
> 
> Regarding 3) I had cooked up patches to inject a new ICMP error
> into the bridge input path from vxlan_err_lookup() to let the sender
> know the path MTU reduction.
> 
> Unfortunately it only works with Linux bridge (openvswitch tosses the
> packet).  Also, too many (internal) reviews told me they consider this
> an ugly hack, so I am not too keen on continuing down that route:
> 
> https://git.breakpoint.cc/cgit/fw/net-next.git/commit/?h=udp_tun_pmtud_12&id=ca5b0af203b6f8010f1e585850620db4561baae7

To be honest, after considering other solutions, yours suddenly appears
to be a lot less ugly. :) Well, I don't think that abusing the "lookup"
functions to do something completely different is a good idea, but that
would be a minor change to do it in another place).

I would still like the idea I proposed better (updating MTUs down the
chain), it's simpler and we don't have to duplicate existing
functionality (generating additional ICMP messages). We could also
decide to skip decreases of MTU on the bridge if the user ever set a
value manually (keeping that existing mechanism as it is).

Both should cover cases with a regular bridge. However, it's still not
clear to me what either solution covers in terms of Open vSwitch. I
think it would be interesting to know before proceeding further.
Aaron Conole July 14, 2020, 8:38 p.m. UTC | #13
Numan Siddique <nusiddiq@redhat.com> writes:

> On Mon, Jul 13, 2020 at 3:34 PM Stefano Brivio <sbrivio@redhat.com> wrote:
>>
>> On Mon, 13 Jul 2020 10:04:13 +0200
>> Florian Westphal <fw@strlen.de> wrote:
>>
>> > Stefano Brivio <sbrivio@redhat.com> wrote:
>> > > Hi,
>> > >
>> > > On Sun, 12 Jul 2020 22:07:03 +0200
>> > > Florian Westphal <fw@strlen.de> wrote:
>> > >
>> > > > vxlan and geneve take the to-be-transmitted skb, prepend the
>> > > > encapsulation header and send the result.
>> > > >
>> > > > Neither vxlan nor geneve can do anything about a lowered path mtu
>> > > > except notifying the peer/upper dst entry.
>> > >
>> > > It could, and I think it should, update its MTU, though. I didn't
>> > > include this in the original implementation of PMTU discovery for UDP
>> > > tunnels as it worked just fine for locally generated and routed
>> > > traffic, but here we go.
>> >
>> > I don't think its a good idea to muck with network config in response
>> > to untrusted entity.
>>
>> I agree that this (changing MTU on VXLAN) looks like a further step,
>> but the practical effect is zero: we can't send those packets already
>> today.
>>
>> PMTU discovery has security impacts, and they are mentioned in the
>> RFCs. Also here, we wouldn't increase the MTU as a result, and if the
>> entity is considered untrusted, considerations from RFC 8201 and RFC
>> 4890 cover that.
>>
>> In practice, we might have broken networks, but at a practical level, I
>> guess it's enough to not make the situation any worse.
>>
>> > > As PMTU discovery happens, we have a route exception on the lower
>> > > layer for the given path, and we know that VXLAN will use that path,
>> > > so we also know there's no point in having a higher MTU on the VXLAN
>> > > device, it's really the maximum packet size we can use.
>> >
>> > No, in the setup that prompted this series the route exception is wrong.
>> > The current "fix" is a shell script that flushes the exception as soon
>> > as its added to keep the tunnel working...
>>
>> Oh, oops.
>>
>> Well, as I mentioned, if this is breaking setups and this series is the
>> only way to fix things, I have nothing against it, we can still work on
>> a more comprehensive solution (including the bridge) once we have it.
>>
>> > > > Some setups, however, will use vxlan as a bridge port (or openvs vport).
>> > >
>> > > And, on top of that, I think what we're missing on the bridge is to
>> > > update the MTU when a port lowers its MTU. The MTU is changed only as
>> > > interfaces are added, which feels like a bug. We could use the lower
>> > > layer notifier to fix this.
>> >
>> > I will defer to someone who knows bridges better but I think that
>> > in bridge case we 100% depend on a human to set everything.
>>
>> Not entirely, MTU is auto-adjusted when interfaces are added (unless
>> the user set it explicitly), however:
>>
>> > bridge might be forwarding frames of non-ip protocol and I worry that
>> > this is a self-induced DoS when we start to alter configuration behind
>> > sysadmins back.
>>
>> ...yes, I agree that the matter with the bridge is different. And we
>> don't know if that fixes anything else than the selftest I showed, so
>> let's forget about the bridge for a moment.
>>
>> > > I tried to represent the issue you're hitting with a new test case in
>> > > the pmtu.sh selftest, also included in the diff. Would that work for
>> > > Open vSwitch?
>> >
>> > No idea, I don't understand how it can work at all, we can't 'chop
>> > up'/mangle l2 frame in arbitrary fashion to somehow make them pass to
>> > the output port.  We also can't influence MTU config of the links peer.
>>
>> Sorry I didn't expand right away.
>>
>> In the test case I showed, it works because at that point sending
>> packets to the bridge will result in an error, and the (local) sender
>> fragments. Let's set this aside together with the bridge affair, though.
>>
>> Back to VXLAN and OVS: OVS implements a "check_pkt_len" action, cf.
>> commit 4d5ec89fc8d1 ("net: openvswitch: Add a new action
>> check_pkt_len"), that should be used when packets exceed link MTUs:
>>
>>   With the help of this action, OVN will check the packet length
>>   and if it is greater than the MTU size, it will generate an
>>   ICMP packet (type 3, code 4) and includes the next hop mtu in it
>>   so that the sender can fragment the packets.
>>
>> and my understanding is that this can only work if we reflect the
>> effective MTU on the device itself (including VXLAN).
>>
>
> check_pkt_len is OVS datapath action and the corresponding OVS action
> is  check_pkt_larger.
>
> Logically It is expected to use this way in the OVS flows- >
>     reg0[0] = check_pkt_larger(1500);
>     if reg0[0[ == 1; then take some action.
>
> In the case of OVN, if the register reg0[0] bit is set, then we
> generate ICMP error packet (type 3, code 4).
>
> I don't know the requirements or the issue this patch is trying to
> address. But I think for OVS, there has to be
> a controller (like OVN) which makes use of the check_pkt_larger action
> and takes necessary action by adding
> appropriate OF flows based on the result of check_pkt_larger.

Hi Numan,

The issue is that a route exception might lower the MTU for the
destination, and the controller would need to be made aware of that.

In that case, it could update any check_packet_len rules.  But it's not
desirable for this type of rules to be explicitly required at all, imo.
And for setups where user wants to use action=normal, or no openflow
controller is used (a large number of OvS deployments), I think it will
still be broken.

I've cooked up a change to OvS to correspond to this series:

https://github.com/orgcandman/ovs/commit/0c063e4443dda1f62c9310bda7f54140b9dc9c31

(I'm going to additionally modify it to default the pmtudisc=interface
 by default, and the controller/operator can set it to
 do/dont/want/etc...)

Maybe I missed something?

> Thanks
> Numan
>
>>
>> Side note: I'm not fond of the idea behind that OVS action because I
>> think it competes with the kernel (and with ICMP itself, or PLPMTUD if
>> ICMP is not an option) to do PMTU discovery.
>>
>> However, if that already works for OVS (I really don't know. Aaron,
>> Numan?), perhaps you could simply consider going with that solution...
>>
>> --
>> Stefano
>>
Stefano Brivio July 15, 2020, 11:58 a.m. UTC | #14
Hi Aaron,

On Tue, 14 Jul 2020 16:38:29 -0400
Aaron Conole <aconole@redhat.com> wrote:

> Numan Siddique <nusiddiq@redhat.com> writes:
> 
> > On Mon, Jul 13, 2020 at 3:34 PM Stefano Brivio <sbrivio@redhat.com> wrote:  
> >>
> >> On Mon, 13 Jul 2020 10:04:13 +0200
> >> Florian Westphal <fw@strlen.de> wrote:
> >>  
> >> > Stefano Brivio <sbrivio@redhat.com> wrote:  
> >> > > Hi,
> >> > >
> >> > > On Sun, 12 Jul 2020 22:07:03 +0200
> >> > > Florian Westphal <fw@strlen.de> wrote:
> >> > >  
> >> > > > vxlan and geneve take the to-be-transmitted skb, prepend the
> >> > > > encapsulation header and send the result.
> >> > > >
> >> > > > Neither vxlan nor geneve can do anything about a lowered path mtu
> >> > > > except notifying the peer/upper dst entry.  
> >> > >
> >> > > It could, and I think it should, update its MTU, though. I didn't
> >> > > include this in the original implementation of PMTU discovery for UDP
> >> > > tunnels as it worked just fine for locally generated and routed
> >> > > traffic, but here we go.  
> >> >
> >> > I don't think its a good idea to muck with network config in response
> >> > to untrusted entity.  
> >>
> >> I agree that this (changing MTU on VXLAN) looks like a further step,
> >> but the practical effect is zero: we can't send those packets already
> >> today.
> >>
> >> PMTU discovery has security impacts, and they are mentioned in the
> >> RFCs. Also here, we wouldn't increase the MTU as a result, and if the
> >> entity is considered untrusted, considerations from RFC 8201 and RFC
> >> 4890 cover that.
> >>
> >> In practice, we might have broken networks, but at a practical level, I
> >> guess it's enough to not make the situation any worse.
> >>  
> >> > > As PMTU discovery happens, we have a route exception on the lower
> >> > > layer for the given path, and we know that VXLAN will use that path,
> >> > > so we also know there's no point in having a higher MTU on the VXLAN
> >> > > device, it's really the maximum packet size we can use.  
> >> >
> >> > No, in the setup that prompted this series the route exception is wrong.
> >> > The current "fix" is a shell script that flushes the exception as soon
> >> > as its added to keep the tunnel working...  
> >>
> >> Oh, oops.
> >>
> >> Well, as I mentioned, if this is breaking setups and this series is the
> >> only way to fix things, I have nothing against it, we can still work on
> >> a more comprehensive solution (including the bridge) once we have it.
> >>  
> >> > > > Some setups, however, will use vxlan as a bridge port (or openvs vport).  
> >> > >
> >> > > And, on top of that, I think what we're missing on the bridge is to
> >> > > update the MTU when a port lowers its MTU. The MTU is changed only as
> >> > > interfaces are added, which feels like a bug. We could use the lower
> >> > > layer notifier to fix this.  
> >> >
> >> > I will defer to someone who knows bridges better but I think that
> >> > in bridge case we 100% depend on a human to set everything.  
> >>
> >> Not entirely, MTU is auto-adjusted when interfaces are added (unless
> >> the user set it explicitly), however:
> >>  
> >> > bridge might be forwarding frames of non-ip protocol and I worry that
> >> > this is a self-induced DoS when we start to alter configuration behind
> >> > sysadmins back.  
> >>
> >> ...yes, I agree that the matter with the bridge is different. And we
> >> don't know if that fixes anything else than the selftest I showed, so
> >> let's forget about the bridge for a moment.
> >>  
> >> > > I tried to represent the issue you're hitting with a new test case in
> >> > > the pmtu.sh selftest, also included in the diff. Would that work for
> >> > > Open vSwitch?  
> >> >
> >> > No idea, I don't understand how it can work at all, we can't 'chop
> >> > up'/mangle l2 frame in arbitrary fashion to somehow make them pass to
> >> > the output port.  We also can't influence MTU config of the links peer.  
> >>
> >> Sorry I didn't expand right away.
> >>
> >> In the test case I showed, it works because at that point sending
> >> packets to the bridge will result in an error, and the (local) sender
> >> fragments. Let's set this aside together with the bridge affair, though.
> >>
> >> Back to VXLAN and OVS: OVS implements a "check_pkt_len" action, cf.
> >> commit 4d5ec89fc8d1 ("net: openvswitch: Add a new action
> >> check_pkt_len"), that should be used when packets exceed link MTUs:
> >>
> >>   With the help of this action, OVN will check the packet length
> >>   and if it is greater than the MTU size, it will generate an
> >>   ICMP packet (type 3, code 4) and includes the next hop mtu in it
> >>   so that the sender can fragment the packets.
> >>
> >> and my understanding is that this can only work if we reflect the
> >> effective MTU on the device itself (including VXLAN).
> >>  
> >
> > check_pkt_len is OVS datapath action and the corresponding OVS action
> > is  check_pkt_larger.
> >
> > Logically It is expected to use this way in the OVS flows- >
> >     reg0[0] = check_pkt_larger(1500);
> >     if reg0[0[ == 1; then take some action.
> >
> > In the case of OVN, if the register reg0[0] bit is set, then we
> > generate ICMP error packet (type 3, code 4).
> >
> > I don't know the requirements or the issue this patch is trying to
> > address. But I think for OVS, there has to be
> > a controller (like OVN) which makes use of the check_pkt_larger action
> > and takes necessary action by adding
> > appropriate OF flows based on the result of check_pkt_larger.  
> 
> Hi Numan,
> 
> The issue is that a route exception might lower the MTU for the
> destination, and the controller would need to be made aware of that.
> 
> In that case, it could update any check_packet_len rules.  But it's not
> desirable for this type of rules to be explicitly required at all, imo.
> And for setups where user wants to use action=normal, or no openflow
> controller is used (a large number of OvS deployments), I think it will
> still be broken.

I agree that the controller shouldn't deal with this, because it's not
a configuration topic, unless the user wants to set particular MTUs --
in which case sure, it's configuration.

Open vSwitch is responsible for forwarding packets, why can't it
relay/use the required ICMP messages?

> I've cooked up a change to OvS to correspond to this series:
> 
> https://github.com/orgcandman/ovs/commit/0c063e4443dda1f62c9310bda7f54140b9dc9c31

From the commit message:

> Some network topologies use vxlan/geneve tunnels in a non-routed setup,
> and the presence of a route exception would interfere with packet
> egress, causing unresolvable network conditions due to MTU mismatches.

Florian explained clearly why PMTU discovery is currently not working
*with a Linux bridge* connected to some type of tunnels (I haven't checked
IP tunnels yet), and proposed a solution fixing that problem. However,
he also mentioned that solution doesn't fix the issue with Open vSwitch,
because it's discarding the (additional) ICMP messages implemented there.

I proposed another solution that also works, only for a Linux bridge
though.

Open vSwitch, as far as I understand, doesn't use Linux bridges. What
is the actual problem with Open vSwitch? How does the route exception
"interfere with packet egress"? Why do you have "MTU mismatches"? The
route exception itself is correct. Can you elaborate?

> Some operating systems allow the user to choose the path MTU discovery
> strategy, whether it should ignore dst cache exception information, etc.

Linux does too, you can set /proc/sys/net/ipv4/ip_no_pmtu_disc to 1 and
PMTU discovery is disabled, for IPv4. I don't think that breaking PMTU
discovery only on some selected interfaces, that could be routed
between each other, is a sane option. I also wouldn't recommend
disabling it (for the reasons explained in RFC 1191), especially in a
complex networking environment.

For IPv6, it's "just" strongly recommended by RFC 8200 to implement it.
However, if a node doesn't implement it, it must restrict itself to the
minimum link MTU, 1280 bytes. By allowing to disable PMTU discovery on
IPv6 nodes and not enforcing that limit we introduce a significant
breakage.
Florian Westphal July 15, 2020, 12:42 p.m. UTC | #15
Stefano Brivio <sbrivio@redhat.com> wrote:
> I would still like the idea I proposed better (updating MTUs down the
> chain), it's simpler and we don't have to duplicate existing
> functionality (generating additional ICMP messages).

It doesn't make this work though.

With your skeleton patch, br0 updates MTU, but the sender still
won't know that unless input traffic to br0 is routed (or locally
generated).

Furthermore, such MTU reduction would require a mechanism to
auto-reconfig every device in the same linklevel broadcast domain,
and I am not aware of any such mechanism.
Stefano Brivio July 15, 2020, 1:35 p.m. UTC | #16
On Wed, 15 Jul 2020 14:42:58 +0200
Florian Westphal <fw@strlen.de> wrote:

> Stefano Brivio <sbrivio@redhat.com> wrote:
> > I would still like the idea I proposed better (updating MTUs down the
> > chain), it's simpler and we don't have to duplicate existing
> > functionality (generating additional ICMP messages).  
> 
> It doesn't make this work though.

Yeah, not knowing exactly what needs to work, that just fixes the two
cases you describe.

I thought that would be enough for Open vSwitch, but apparently it's
not (you mentioned the problem appeared with MTUs already set to
correct values). And also your (bulletproof, I thought) ICMP errors
don't work with it. :/

Anyway, about the Linux bridge:

> With your skeleton patch, br0 updates MTU, but the sender still
> won't know that unless input traffic to br0 is routed (or locally
> generated).

To let the sender know, I still think it's a bit simpler with this
approach, we don't have to do all the peeling. In br_handle_frame(), we
would need to add *something like*:

	if (skb->len > p->br->dev->mtu) {
		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(p->br->dev->mtu));
		goto drop;
	}

just like IP tunnels do, see tnl_update_pmtu().

Note that this doesn't work as it is because of a number of reasons
(skb doesn't have a dst, pkt_type is not PACKET_HOST), and perhaps we
shouldn't be using icmp_send(), but at a glance that looks simpler.

Another slight preference I have towards this idea is that the only
known way we can break PMTU discovery right now is by using a bridge,
so fixing the problem there looks more future-proof than addressing any
kind of tunnel with this problem. I think FoU and GUE would hit the
same problem, I don't know about IP tunnels, sticking that selftest
snippet to whatever other test in pmtu.sh should tell.

I might be wrong of course as I haven't tried to implement this bit,
and if this turns out to be just moving the problem without making it
simpler, then sure, I'd rather stick to your approach.

> Furthermore, such MTU reduction would require a mechanism to
> auto-reconfig every device in the same linklevel broadcast domain,
> and I am not aware of any such mechanism.

You mean for other ports connected to the same bridge? They would then
get ICMP errors as well, no?

If you refer to other drivers that need to adjust the MTU, instead,
that's why I would use skb_tunnel_check_pmtu() for that, to avoid
implementing the same logic in every driver.
Florian Westphal July 15, 2020, 2:33 p.m. UTC | #17
Stefano Brivio <sbrivio@redhat.com> wrote:
> On Wed, 15 Jul 2020 14:42:58 +0200
> Florian Westphal <fw@strlen.de> wrote:
> > With your skeleton patch, br0 updates MTU, but the sender still
> > won't know that unless input traffic to br0 is routed (or locally
> > generated).
> 
> To let the sender know, I still think it's a bit simpler with this
> approach, we don't have to do all the peeling. In br_handle_frame(), we
> would need to add *something like*:
> 
> 	if (skb->len > p->br->dev->mtu) {
> 		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
> 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
> 			  htonl(p->br->dev->mtu));
> 		goto drop;
> 	}
> 
> just like IP tunnels do, see tnl_update_pmtu().

Yes, but the caveat here is that a bridge might be transporting
non-IP protocol too.

So, MTU-reduction+ICMP won't help for them.
I would try to avoid mixing IP functionality into the bridge,
its a slippery slope (look at bridge netfilter for an example).

I agree that for a 'ip only' bridge that might work indeed.

> Note that this doesn't work as it is because of a number of reasons
> (skb doesn't have a dst, pkt_type is not PACKET_HOST), and perhaps we
> shouldn't be using icmp_send(), but at a glance that looks simpler.

Yes, it also requires that the bridge has IP connectivity
to reach the inner ip, which might not be the case.

> Another slight preference I have towards this idea is that the only
> known way we can break PMTU discovery right now is by using a bridge,
> so fixing the problem there looks more future-proof than addressing any
> kind of tunnel with this problem. I think FoU and GUE would hit the
> same problem, I don't know about IP tunnels, sticking that selftest
> snippet to whatever other test in pmtu.sh should tell.

Every type of bridge port that needs to add additional header on egress
has this problem in the bridge scenario once the peer of the IP tunnel
signals a PMTU event.

I agree that excess copy&paste should be avoided, but at this point
I don't see an easy solution.

> I might be wrong of course as I haven't tried to implement this bit,
> and if this turns out to be just moving the problem without making it
> simpler, then sure, I'd rather stick to your approach.
> 
> > Furthermore, such MTU reduction would require a mechanism to
> > auto-reconfig every device in the same linklevel broadcast domain,
> > and I am not aware of any such mechanism.
> 
> You mean for other ports connected to the same bridge? They would then
> get ICMP errors as well, no?

Yes, if you don't do that then we have devices with MTU X hooked to
a bridge with MTU Y, where X > Y.  I don't see how this could work.

> If you refer to other drivers that need to adjust the MTU, instead,
> that's why I would use skb_tunnel_check_pmtu() for that, to avoid
> implementing the same logic in every driver.

Yes, it might be possible to move the proposed icmp inject into
skb_tunnel_check_pmtu() -- it gets the needed headroom passed as arg,
it could detect when device driver is in a bridge and it already knows
when skb has no dst entry that it a pmtu change could be propagated to.

Given the affected setups all use ovs I think it makes sense to make
sure the intended solution would work for ovs too, bridge seems more
like a nice-to-have thing at the moment.
Stefano Brivio July 17, 2020, 12:27 p.m. UTC | #18
On Wed, 15 Jul 2020 16:33:56 +0200
Florian Westphal <fw@strlen.de> wrote:

> Stefano Brivio <sbrivio@redhat.com> wrote:
> > On Wed, 15 Jul 2020 14:42:58 +0200
> > Florian Westphal <fw@strlen.de> wrote:  
> > > With your skeleton patch, br0 updates MTU, but the sender still
> > > won't know that unless input traffic to br0 is routed (or locally
> > > generated).  
> > 
> > To let the sender know, I still think it's a bit simpler with this
> > approach, we don't have to do all the peeling. In br_handle_frame(), we
> > would need to add *something like*:
> > 
> > 	if (skb->len > p->br->dev->mtu) {
> > 		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
> > 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
> > 			  htonl(p->br->dev->mtu));
> > 		goto drop;
> > 	}
> > 
> > just like IP tunnels do, see tnl_update_pmtu().  
> 
> Yes, but the caveat here is that a bridge might be transporting
> non-IP protocol too.
> 
> So, MTU-reduction+ICMP won't help for them.

Well, it doesn't need to, PMTU discovery is only implemented for IP,
so, to handle this, we can just check if the frame contains an IP
packet. The kind of check (skb->protocol == htons(ETH_P_...)) is
already there on all sorts of paths in the bridge.

However,

> I would try to avoid mixing IP functionality into the bridge,

if we stick to the fact the bridge is a L2 device, sure, we should drop
packets silently. The problem is that bridging an UDP tunnel forces the
combination to become a router.

So, either we forbid that, or I guess it's acceptable to have (even
further) L3 functionality implemented in the bridge.

> its a slippery slope (look at bridge netfilter for an example).

Oops, I was taking it as a positive example :)

> I agree that for a 'ip only' bridge that might work indeed.
> 
> > Note that this doesn't work as it is because of a number of reasons
> > (skb doesn't have a dst, pkt_type is not PACKET_HOST), and perhaps we
> > shouldn't be using icmp_send(), but at a glance that looks simpler.  
> 
> Yes, it also requires that the bridge has IP connectivity
> to reach the inner ip, which might not be the case.

If the VXLAN endpoint is a port of the bridge, that needs to be the
case, right? Otherwise the VXLAN endpoint can't be reached.

> > Another slight preference I have towards this idea is that the only
> > known way we can break PMTU discovery right now is by using a bridge,
> > so fixing the problem there looks more future-proof than addressing any
> > kind of tunnel with this problem. I think FoU and GUE would hit the
> > same problem, I don't know about IP tunnels, sticking that selftest
> > snippet to whatever other test in pmtu.sh should tell.  
> 
> Every type of bridge port that needs to add additional header on egress
> has this problem in the bridge scenario once the peer of the IP tunnel
> signals a PMTU event.

Yes :(

> I agree that excess copy&paste should be avoided, but at this point
> I don't see an easy solution.

I think what you mention below is way more acceptable.

> > I might be wrong of course as I haven't tried to implement this bit,
> > and if this turns out to be just moving the problem without making it
> > simpler, then sure, I'd rather stick to your approach.
> >   
> > > Furthermore, such MTU reduction would require a mechanism to
> > > auto-reconfig every device in the same linklevel broadcast domain,
> > > and I am not aware of any such mechanism.  
> > 
> > You mean for other ports connected to the same bridge? They would then
> > get ICMP errors as well, no?  
> 
> Yes, if you don't do that then we have devices with MTU X hooked to
> a bridge with MTU Y, where X > Y.  I don't see how this could work.

Yes, I see, and my point is: they would get ICMP errors from the
bridge, and that would create route exceptions.

> > If you refer to other drivers that need to adjust the MTU, instead,
> > that's why I would use skb_tunnel_check_pmtu() for that, to avoid
> > implementing the same logic in every driver.  
> 
> Yes, it might be possible to move the proposed icmp inject into
> skb_tunnel_check_pmtu() -- it gets the needed headroom passed as arg,
> it could detect when device driver is in a bridge and it already knows
> when skb has no dst entry that it a pmtu change could be propagated to.

I didn't mean to move the ICMP injection there, I meant we could
override the MTU there.

Moving the ICMP injection there: I think that would be totally
reasonable, it comes with none of the issues of the solution I proposed
and with almost none of the issues I raised about your idea.

> Given the affected setups all use ovs I think it makes sense to make
> sure the intended solution would work for ovs too, bridge seems more
> like a nice-to-have thing at the moment.

Yes, agreed.

As I see it, we don't know what the problem is there, we might even
have to do absolutely nothing in kernel. I guess we should know before
trying to hack up something.
David Ahern July 17, 2020, 3:04 p.m. UTC | #19
On 7/17/20 6:27 AM, Stefano Brivio wrote:
>>
>>> Note that this doesn't work as it is because of a number of reasons
>>> (skb doesn't have a dst, pkt_type is not PACKET_HOST), and perhaps we
>>> shouldn't be using icmp_send(), but at a glance that looks simpler.  
>>
>> Yes, it also requires that the bridge has IP connectivity
>> to reach the inner ip, which might not be the case.
> 
> If the VXLAN endpoint is a port of the bridge, that needs to be the
> case, right? Otherwise the VXLAN endpoint can't be reached.
> 
>>> Another slight preference I have towards this idea is that the only
>>> known way we can break PMTU discovery right now is by using a bridge,
>>> so fixing the problem there looks more future-proof than addressing any
>>> kind of tunnel with this problem. I think FoU and GUE would hit the
>>> same problem, I don't know about IP tunnels, sticking that selftest
>>> snippet to whatever other test in pmtu.sh should tell.  
>>
>> Every type of bridge port that needs to add additional header on egress
>> has this problem in the bridge scenario once the peer of the IP tunnel
>> signals a PMTU event.
> 
> Yes :(
> 

The vxlan/tunnel device knows it is a bridge port, and it knows it is
going to push a udp and ip{v6} header. So why not use that information
in setting / updating the MTU? That's what I was getting at on Monday
with my comment about lwtunnel_headroom equivalent.
Florian Westphal July 17, 2020, 6:43 p.m. UTC | #20
David Ahern <dsahern@gmail.com> wrote:
> On 7/17/20 6:27 AM, Stefano Brivio wrote:
> >> Every type of bridge port that needs to add additional header on egress
> >> has this problem in the bridge scenario once the peer of the IP tunnel
> >> signals a PMTU event.
> > 
> > Yes :(
> > 
> 
> The vxlan/tunnel device knows it is a bridge port, and it knows it is
> going to push a udp and ip{v6} header. So why not use that information
> in setting / updating the MTU? That's what I was getting at on Monday
> with my comment about lwtunnel_headroom equivalent.

What action should be taken in the vxlan driver?  Say, here:

static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb,
	u32 mtu)
{
 struct dst_entry *dst = skb_dst(skb);

 if (dst && dst->ops->update_pmtu)
    dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
 else
    /* ??? HERE */
 }

We hit the (non-existent) else branch as skb has no dst entry.
Stefano Brivio July 18, 2020, 6:56 a.m. UTC | #21
On Fri, 17 Jul 2020 09:04:51 -0600
David Ahern <dsahern@gmail.com> wrote:

> On 7/17/20 6:27 AM, Stefano Brivio wrote:
> >>  
> >>> Note that this doesn't work as it is because of a number of reasons
> >>> (skb doesn't have a dst, pkt_type is not PACKET_HOST), and perhaps we
> >>> shouldn't be using icmp_send(), but at a glance that looks simpler.    
> >>
> >> Yes, it also requires that the bridge has IP connectivity
> >> to reach the inner ip, which might not be the case.  
> > 
> > If the VXLAN endpoint is a port of the bridge, that needs to be the
> > case, right? Otherwise the VXLAN endpoint can't be reached.
> >   
> >>> Another slight preference I have towards this idea is that the only
> >>> known way we can break PMTU discovery right now is by using a bridge,
> >>> so fixing the problem there looks more future-proof than addressing any
> >>> kind of tunnel with this problem. I think FoU and GUE would hit the
> >>> same problem, I don't know about IP tunnels, sticking that selftest
> >>> snippet to whatever other test in pmtu.sh should tell.    
> >>
> >> Every type of bridge port that needs to add additional header on egress
> >> has this problem in the bridge scenario once the peer of the IP tunnel
> >> signals a PMTU event.  
> > 
> > Yes :(
> 
> The vxlan/tunnel device knows it is a bridge port, and it knows it is
> going to push a udp and ip{v6} header. So why not use that information
> in setting / updating the MTU? That's what I was getting at on Monday
> with my comment about lwtunnel_headroom equivalent.

If I understand correctly, you're proposing something similar to my
earlier draft from:

	<20200713003813.01f2d5d3@elisabeth>
	https://lore.kernel.org/netdev/20200713003813.01f2d5d3@elisabeth/

the problem with it is that it wouldn't help: the MTU is already set to
the right value for both port and bridge in the case Florian originally
reported.

Also, given the implications on overriding configured MTUs, and
introducing (further) IP logic into the bridge, if Florian's idea of
injecting ICMP messages could be implemented in a generic function:

On Wed, 15 Jul 2020 16:33:56 +0200
Florian Westphal <fw@strlen.de> wrote:

> Yes, it might be possible to move the proposed icmp inject into
> skb_tunnel_check_pmtu() -- it gets the needed headroom passed as arg,
> it could detect when device driver is in a bridge and it already knows
> when skb has no dst entry that it a pmtu change could be propagated to.

I think that would be preferable: then it's fixed for all tunnels in a
generic, probably simpler way, without those two issues.

But then again, we're talking about Linux bridge. Unfortunately this
doesn't fix the problem with Open vSwitch either.
David Ahern July 18, 2020, 5:02 p.m. UTC | #22
On 7/18/20 12:56 AM, Stefano Brivio wrote:
> On Fri, 17 Jul 2020 09:04:51 -0600
> David Ahern <dsahern@gmail.com> wrote:
> 
>> On 7/17/20 6:27 AM, Stefano Brivio wrote:
>>>>  
>>>>> Note that this doesn't work as it is because of a number of reasons
>>>>> (skb doesn't have a dst, pkt_type is not PACKET_HOST), and perhaps we
>>>>> shouldn't be using icmp_send(), but at a glance that looks simpler.    
>>>>
>>>> Yes, it also requires that the bridge has IP connectivity
>>>> to reach the inner ip, which might not be the case.  
>>>
>>> If the VXLAN endpoint is a port of the bridge, that needs to be the
>>> case, right? Otherwise the VXLAN endpoint can't be reached.
>>>   
>>>>> Another slight preference I have towards this idea is that the only
>>>>> known way we can break PMTU discovery right now is by using a bridge,
>>>>> so fixing the problem there looks more future-proof than addressing any
>>>>> kind of tunnel with this problem. I think FoU and GUE would hit the
>>>>> same problem, I don't know about IP tunnels, sticking that selftest
>>>>> snippet to whatever other test in pmtu.sh should tell.    
>>>>
>>>> Every type of bridge port that needs to add additional header on egress
>>>> has this problem in the bridge scenario once the peer of the IP tunnel
>>>> signals a PMTU event.  
>>>
>>> Yes :(
>>
>> The vxlan/tunnel device knows it is a bridge port, and it knows it is
>> going to push a udp and ip{v6} header. So why not use that information
>> in setting / updating the MTU? That's what I was getting at on Monday
>> with my comment about lwtunnel_headroom equivalent.
> 
> If I understand correctly, you're proposing something similar to my
> earlier draft from:
> 
> 	<20200713003813.01f2d5d3@elisabeth>
> 	https://lore.kernel.org/netdev/20200713003813.01f2d5d3@elisabeth/
> 
> the problem with it is that it wouldn't help: the MTU is already set to
> the right value for both port and bridge in the case Florian originally
> reported.

I am definitely hand waving; I have not had time to create a setup
showing the problem. Is there a reproducer using only namespaces?

> 
> Also, given the implications on overriding configured MTUs, and
> introducing (further) IP logic into the bridge, if Florian's idea of
> injecting ICMP messages could be implemented in a generic function:
> 
> On Wed, 15 Jul 2020 16:33:56 +0200
> Florian Westphal <fw@strlen.de> wrote:
> 
>> Yes, it might be possible to move the proposed icmp inject into
>> skb_tunnel_check_pmtu() -- it gets the needed headroom passed as arg,
>> it could detect when device driver is in a bridge and it already knows
>> when skb has no dst entry that it a pmtu change could be propagated to.
> 
> I think that would be preferable: then it's fixed for all tunnels in a
> generic, probably simpler way, without those two issues.
> 
> But then again, we're talking about Linux bridge. Unfortunately this
> doesn't fix the problem with Open vSwitch either.
> 

Of course. (insert sarcastic jab at ovs here)
Stefano Brivio July 18, 2020, 5:58 p.m. UTC | #23
On Sat, 18 Jul 2020 11:02:46 -0600
David Ahern <dsahern@gmail.com> wrote:

> On 7/18/20 12:56 AM, Stefano Brivio wrote:
> > On Fri, 17 Jul 2020 09:04:51 -0600
> > David Ahern <dsahern@gmail.com> wrote:
> >   
> >> On 7/17/20 6:27 AM, Stefano Brivio wrote:  
> >>>>    
> >>>>> Note that this doesn't work as it is because of a number of reasons
> >>>>> (skb doesn't have a dst, pkt_type is not PACKET_HOST), and perhaps we
> >>>>> shouldn't be using icmp_send(), but at a glance that looks simpler.      
> >>>>
> >>>> Yes, it also requires that the bridge has IP connectivity
> >>>> to reach the inner ip, which might not be the case.    
> >>>
> >>> If the VXLAN endpoint is a port of the bridge, that needs to be the
> >>> case, right? Otherwise the VXLAN endpoint can't be reached.
> >>>     
> >>>>> Another slight preference I have towards this idea is that the only
> >>>>> known way we can break PMTU discovery right now is by using a bridge,
> >>>>> so fixing the problem there looks more future-proof than addressing any
> >>>>> kind of tunnel with this problem. I think FoU and GUE would hit the
> >>>>> same problem, I don't know about IP tunnels, sticking that selftest
> >>>>> snippet to whatever other test in pmtu.sh should tell.      
> >>>>
> >>>> Every type of bridge port that needs to add additional header on egress
> >>>> has this problem in the bridge scenario once the peer of the IP tunnel
> >>>> signals a PMTU event.    
> >>>
> >>> Yes :(  
> >>
> >> The vxlan/tunnel device knows it is a bridge port, and it knows it is
> >> going to push a udp and ip{v6} header. So why not use that information
> >> in setting / updating the MTU? That's what I was getting at on Monday
> >> with my comment about lwtunnel_headroom equivalent.  
> > 
> > If I understand correctly, you're proposing something similar to my
> > earlier draft from:
> > 
> > 	<20200713003813.01f2d5d3@elisabeth>
> > 	https://lore.kernel.org/netdev/20200713003813.01f2d5d3@elisabeth/
> > 
> > the problem with it is that it wouldn't help: the MTU is already set to
> > the right value for both port and bridge in the case Florian originally
> > reported.  
> 
> I am definitely hand waving; I have not had time to create a setup
> showing the problem. Is there a reproducer using only namespaces?

And I'm laser pointing: check the bottom of that email ;)
Stefano Brivio July 18, 2020, 6:04 p.m. UTC | #24
On Sat, 18 Jul 2020 19:58:50 +0200
Stefano Brivio <sbrivio@redhat.com> wrote:

> On Sat, 18 Jul 2020 11:02:46 -0600
> David Ahern <dsahern@gmail.com> wrote:
> 
> > On 7/18/20 12:56 AM, Stefano Brivio wrote:  
> > > On Fri, 17 Jul 2020 09:04:51 -0600
> > > David Ahern <dsahern@gmail.com> wrote:
> > >     
> > >> On 7/17/20 6:27 AM, Stefano Brivio wrote:    
> > >>>>      
> > >>>>> Note that this doesn't work as it is because of a number of reasons
> > >>>>> (skb doesn't have a dst, pkt_type is not PACKET_HOST), and perhaps we
> > >>>>> shouldn't be using icmp_send(), but at a glance that looks simpler.        
> > >>>>
> > >>>> Yes, it also requires that the bridge has IP connectivity
> > >>>> to reach the inner ip, which might not be the case.      
> > >>>
> > >>> If the VXLAN endpoint is a port of the bridge, that needs to be the
> > >>> case, right? Otherwise the VXLAN endpoint can't be reached.
> > >>>       
> > >>>>> Another slight preference I have towards this idea is that the only
> > >>>>> known way we can break PMTU discovery right now is by using a bridge,
> > >>>>> so fixing the problem there looks more future-proof than addressing any
> > >>>>> kind of tunnel with this problem. I think FoU and GUE would hit the
> > >>>>> same problem, I don't know about IP tunnels, sticking that selftest
> > >>>>> snippet to whatever other test in pmtu.sh should tell.        
> > >>>>
> > >>>> Every type of bridge port that needs to add additional header on egress
> > >>>> has this problem in the bridge scenario once the peer of the IP tunnel
> > >>>> signals a PMTU event.      
> > >>>
> > >>> Yes :(    
> > >>
> > >> The vxlan/tunnel device knows it is a bridge port, and it knows it is
> > >> going to push a udp and ip{v6} header. So why not use that information
> > >> in setting / updating the MTU? That's what I was getting at on Monday
> > >> with my comment about lwtunnel_headroom equivalent.    
> > > 
> > > If I understand correctly, you're proposing something similar to my
> > > earlier draft from:
> > > 
> > > 	<20200713003813.01f2d5d3@elisabeth>
> > > 	https://lore.kernel.org/netdev/20200713003813.01f2d5d3@elisabeth/
> > > 
> > > the problem with it is that it wouldn't help: the MTU is already set to
> > > the right value for both port and bridge in the case Florian originally
> > > reported.    
> > 
> > I am definitely hand waving; I have not had time to create a setup
> > showing the problem. Is there a reproducer using only namespaces?  
> 
> And I'm laser pointing: check the bottom of that email ;)

Oh, if you meant for Open vSwitch: then... I don't know exactly what I
should be doing. :)
David Ahern July 19, 2020, 6:43 p.m. UTC | #25
On 7/18/20 11:58 AM, Stefano Brivio wrote:
> On Sat, 18 Jul 2020 11:02:46 -0600
> David Ahern <dsahern@gmail.com> wrote:
> 
>> On 7/18/20 12:56 AM, Stefano Brivio wrote:
>>> On Fri, 17 Jul 2020 09:04:51 -0600
>>> David Ahern <dsahern@gmail.com> wrote:
>>>   
>>>> On 7/17/20 6:27 AM, Stefano Brivio wrote:  
>>>>>>    
>>>>>>> Note that this doesn't work as it is because of a number of reasons
>>>>>>> (skb doesn't have a dst, pkt_type is not PACKET_HOST), and perhaps we
>>>>>>> shouldn't be using icmp_send(), but at a glance that looks simpler.      
>>>>>>
>>>>>> Yes, it also requires that the bridge has IP connectivity
>>>>>> to reach the inner ip, which might not be the case.    
>>>>>
>>>>> If the VXLAN endpoint is a port of the bridge, that needs to be the
>>>>> case, right? Otherwise the VXLAN endpoint can't be reached.
>>>>>     
>>>>>>> Another slight preference I have towards this idea is that the only
>>>>>>> known way we can break PMTU discovery right now is by using a bridge,
>>>>>>> so fixing the problem there looks more future-proof than addressing any
>>>>>>> kind of tunnel with this problem. I think FoU and GUE would hit the
>>>>>>> same problem, I don't know about IP tunnels, sticking that selftest
>>>>>>> snippet to whatever other test in pmtu.sh should tell.      
>>>>>>
>>>>>> Every type of bridge port that needs to add additional header on egress
>>>>>> has this problem in the bridge scenario once the peer of the IP tunnel
>>>>>> signals a PMTU event.    
>>>>>
>>>>> Yes :(  
>>>>
>>>> The vxlan/tunnel device knows it is a bridge port, and it knows it is
>>>> going to push a udp and ip{v6} header. So why not use that information
>>>> in setting / updating the MTU? That's what I was getting at on Monday
>>>> with my comment about lwtunnel_headroom equivalent.  
>>>
>>> If I understand correctly, you're proposing something similar to my
>>> earlier draft from:
>>>
>>> 	<20200713003813.01f2d5d3@elisabeth>
>>> 	https://lore.kernel.org/netdev/20200713003813.01f2d5d3@elisabeth/
>>>
>>> the problem with it is that it wouldn't help: the MTU is already set to
>>> the right value for both port and bridge in the case Florian originally
>>> reported.  
>>
>> I am definitely hand waving; I have not had time to create a setup
>> showing the problem. Is there a reproducer using only namespaces?
> 
> And I'm laser pointing: check the bottom of that email ;)
> 

With this test case, the lookup fails:

[  144.689378] vxlan: vxlan_xmit_one: dev vxlan_a 10.0.1.1/57864 ->
10.0.0.0/4789 len 5010 gw 10.0.1.2
[  144.692755] vxlan: skb_tunnel_check_pmtu: dst dev br0 skb dev vxlan_a
skb len 5010 encap_mtu 4000 headroom 50
[  144.697682] vxlan: skb_dst_update_pmtu_no_confirm: calling
ip_rt_update_pmtu+0x0/0x160/ffffffff825ee850 for dev br0 mtu 3950
[  144.703601] IPv4: __ip_rt_update_pmtu: dev br0 mtu 3950 old_mtu 5000
192.168.2.1 -> 192.168.2.2
[  144.708177] IPv4: __ip_rt_update_pmtu: fib_lookup failed for
192.168.2.1 -> 192.168.2.2

Because the lookup fails, __ip_rt_update_pmtu skips creating the exception.

This hack gets the lookup to succeed:

fl4->flowi4_oif = dst->dev->ifindex;
or
fl4->flowi4_oif = 0;

and the test passes.
Stefano Brivio July 19, 2020, 9:49 p.m. UTC | #26
On Sun, 19 Jul 2020 12:43:55 -0600
David Ahern <dsahern@gmail.com> wrote:

> On 7/18/20 11:58 AM, Stefano Brivio wrote:
> > On Sat, 18 Jul 2020 11:02:46 -0600
> > David Ahern <dsahern@gmail.com> wrote:
> >   
> >> On 7/18/20 12:56 AM, Stefano Brivio wrote:  
> >>> On Fri, 17 Jul 2020 09:04:51 -0600
> >>> David Ahern <dsahern@gmail.com> wrote:
> >>>     
> >>>> On 7/17/20 6:27 AM, Stefano Brivio wrote:    
> >>>>>>      
> >>>>>>> Note that this doesn't work as it is because of a number of reasons
> >>>>>>> (skb doesn't have a dst, pkt_type is not PACKET_HOST), and perhaps we
> >>>>>>> shouldn't be using icmp_send(), but at a glance that looks simpler.        
> >>>>>>
> >>>>>> Yes, it also requires that the bridge has IP connectivity
> >>>>>> to reach the inner ip, which might not be the case.      
> >>>>>
> >>>>> If the VXLAN endpoint is a port of the bridge, that needs to be the
> >>>>> case, right? Otherwise the VXLAN endpoint can't be reached.
> >>>>>       
> >>>>>>> Another slight preference I have towards this idea is that the only
> >>>>>>> known way we can break PMTU discovery right now is by using a bridge,
> >>>>>>> so fixing the problem there looks more future-proof than addressing any
> >>>>>>> kind of tunnel with this problem. I think FoU and GUE would hit the
> >>>>>>> same problem, I don't know about IP tunnels, sticking that selftest
> >>>>>>> snippet to whatever other test in pmtu.sh should tell.        
> >>>>>>
> >>>>>> Every type of bridge port that needs to add additional header on egress
> >>>>>> has this problem in the bridge scenario once the peer of the IP tunnel
> >>>>>> signals a PMTU event.      
> >>>>>
> >>>>> Yes :(    
> >>>>
> >>>> The vxlan/tunnel device knows it is a bridge port, and it knows it is
> >>>> going to push a udp and ip{v6} header. So why not use that information
> >>>> in setting / updating the MTU? That's what I was getting at on Monday
> >>>> with my comment about lwtunnel_headroom equivalent.    
> >>>
> >>> If I understand correctly, you're proposing something similar to my
> >>> earlier draft from:
> >>>
> >>> 	<20200713003813.01f2d5d3@elisabeth>
> >>> 	https://lore.kernel.org/netdev/20200713003813.01f2d5d3@elisabeth/
> >>>
> >>> the problem with it is that it wouldn't help: the MTU is already set to
> >>> the right value for both port and bridge in the case Florian originally
> >>> reported.    
> >>
> >> I am definitely hand waving; I have not had time to create a setup
> >> showing the problem. Is there a reproducer using only namespaces?  
> > 
> > And I'm laser pointing: check the bottom of that email ;)
> >   
> 
> With this test case, the lookup fails:
> 
> [  144.689378] vxlan: vxlan_xmit_one: dev vxlan_a 10.0.1.1/57864 ->
> 10.0.0.0/4789 len 5010 gw 10.0.1.2
> [  144.692755] vxlan: skb_tunnel_check_pmtu: dst dev br0 skb dev vxlan_a
> skb len 5010 encap_mtu 4000 headroom 50
> [  144.697682] vxlan: skb_dst_update_pmtu_no_confirm: calling
> ip_rt_update_pmtu+0x0/0x160/ffffffff825ee850 for dev br0 mtu 3950
> [  144.703601] IPv4: __ip_rt_update_pmtu: dev br0 mtu 3950 old_mtu 5000
> 192.168.2.1 -> 192.168.2.2
> [  144.708177] IPv4: __ip_rt_update_pmtu: fib_lookup failed for
> 192.168.2.1 -> 192.168.2.2
> 
> Because the lookup fails, __ip_rt_update_pmtu skips creating the exception.
> 
> This hack gets the lookup to succeed:
> 
> fl4->flowi4_oif = dst->dev->ifindex;
> or
> fl4->flowi4_oif = 0;

Oh, I didn't consider that... route. :) Here comes an added twist, which
currently needs Florian's changes from:
	https://git.breakpoint.cc/cgit/fw/net-next.git/log/?h=udp_tun_pmtud_12

Test is as follows:

test_pmtu_ipv4_vxlan4_exception_bridge() {
	test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan  4 4

	ip netns add ns-C

	ip -n ns-C link add veth_c_a type veth peer name veth_a_c
	ip -n ns-C link set veth_a_c netns ns-A

	ip -n ns-C addr add 192.168.2.100/24 dev veth_c

	ip -n ns-C link set dev veth_c_a mtu 5000
	ip -n ns-C link set veth_c_a up
	ip -n ns-A link set dev veth_a_c mtu 5000
	ip -n ns-A link set veth_c_a up

	ip -n ns-A link add br0 type bridge
	ip -n ns-A link set br0 up
	ip -n ns-A link set dev br0 mtu 5000
	ip -n ns-A link set veth_a_c master br0
	ip -n ns-A link set vxlan_a master br0

	ip -n ns-A addr del 192.168.2.1/24 dev vxlan_a
	ip -n ns-A addr add 192.168.2.1/24 dev br0

	ip -n ns-C exec ping -c 1 -w 2 -M want -s 5000 192.168.2.2
}

I didn't check the test itself recently, I'm just copying from some
local changes I was trying last week, some commands might be wrong.

The idea is: what if we now have another host (here, it's ns-C) sending
traffic to that bridge? Then the exception on a local interface isn't
enough, we actually need to send Fragmentation Needed back to where the
packet came from, and the bridge won't do it for us (with routing, it
already works).

I haven't tried your hack, but I guess it would have the same problem.
David Ahern July 20, 2020, 3:19 a.m. UTC | #27
On 7/19/20 3:49 PM, Stefano Brivio wrote:
>>
>> With this test case, the lookup fails:
>>
>> [  144.689378] vxlan: vxlan_xmit_one: dev vxlan_a 10.0.1.1/57864 ->
>> 10.0.0.0/4789 len 5010 gw 10.0.1.2
>> [  144.692755] vxlan: skb_tunnel_check_pmtu: dst dev br0 skb dev vxlan_a
>> skb len 5010 encap_mtu 4000 headroom 50
>> [  144.697682] vxlan: skb_dst_update_pmtu_no_confirm: calling
>> ip_rt_update_pmtu+0x0/0x160/ffffffff825ee850 for dev br0 mtu 3950
>> [  144.703601] IPv4: __ip_rt_update_pmtu: dev br0 mtu 3950 old_mtu 5000
>> 192.168.2.1 -> 192.168.2.2
>> [  144.708177] IPv4: __ip_rt_update_pmtu: fib_lookup failed for
>> 192.168.2.1 -> 192.168.2.2
>>
>> Because the lookup fails, __ip_rt_update_pmtu skips creating the exception.
>>
>> This hack gets the lookup to succeed:
>>
>> fl4->flowi4_oif = dst->dev->ifindex;
>> or
>> fl4->flowi4_oif = 0;
> 
> Oh, I didn't consider that... route. :) Here comes an added twist, which
> currently needs Florian's changes from:
> 	https://git.breakpoint.cc/cgit/fw/net-next.git/log/?h=udp_tun_pmtud_12
> 
> Test is as follows:
> 
> test_pmtu_ipv4_vxlan4_exception_bridge() {
> 	test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan  4 4
> 
> 	ip netns add ns-C
> 
> 	ip -n ns-C link add veth_c_a type veth peer name veth_a_c
> 	ip -n ns-C link set veth_a_c netns ns-A
> 
> 	ip -n ns-C addr add 192.168.2.100/24 dev veth_c
> 
> 	ip -n ns-C link set dev veth_c_a mtu 5000
> 	ip -n ns-C link set veth_c_a up
> 	ip -n ns-A link set dev veth_a_c mtu 5000
> 	ip -n ns-A link set veth_c_a up
> 
> 	ip -n ns-A link add br0 type bridge
> 	ip -n ns-A link set br0 up
> 	ip -n ns-A link set dev br0 mtu 5000
> 	ip -n ns-A link set veth_a_c master br0
> 	ip -n ns-A link set vxlan_a master br0
> 
> 	ip -n ns-A addr del 192.168.2.1/24 dev vxlan_a
> 	ip -n ns-A addr add 192.168.2.1/24 dev br0
> 
> 	ip -n ns-C exec ping -c 1 -w 2 -M want -s 5000 192.168.2.2
> }
> 
> I didn't check the test itself recently, I'm just copying from some
> local changes I was trying last week, some commands might be wrong.

I fixed the exec typo, but yes even with my flowi4_oif hack it fails.

> 
> The idea is: what if we now have another host (here, it's ns-C) sending
> traffic to that bridge? Then the exception on a local interface isn't
> enough, we actually need to send Fragmentation Needed back to where the
> packet came from, and the bridge won't do it for us (with routing, it
> already works).
> 
> I haven't tried your hack, but I guess it would have the same problem.
> 

What I saw in my tests and debug statements is that vxlan xmit does
compensate for the tunnel overhead (e.g., skb_tunnel_check_pmtu in
vxlan_xmit_one). It still feels like there are some minor details that
are wrong - like the fib_lookup failing when called from the
vxlan_xmit_one path. Does finding and fixing those make it work vs
adding another config item? I can send my debug diff if it helps.
Stefano Brivio July 26, 2020, 5:01 p.m. UTC | #28
On Sun, 19 Jul 2020 21:19:44 -0600
David Ahern <dsahern@gmail.com> wrote:

> On 7/19/20 3:49 PM, Stefano Brivio wrote:
> >>
> >> With this test case, the lookup fails:
> >>
> >> [  144.689378] vxlan: vxlan_xmit_one: dev vxlan_a 10.0.1.1/57864 ->
> >> 10.0.0.0/4789 len 5010 gw 10.0.1.2
> >> [  144.692755] vxlan: skb_tunnel_check_pmtu: dst dev br0 skb dev vxlan_a
> >> skb len 5010 encap_mtu 4000 headroom 50
> >> [  144.697682] vxlan: skb_dst_update_pmtu_no_confirm: calling
> >> ip_rt_update_pmtu+0x0/0x160/ffffffff825ee850 for dev br0 mtu 3950
> >> [  144.703601] IPv4: __ip_rt_update_pmtu: dev br0 mtu 3950 old_mtu 5000
> >> 192.168.2.1 -> 192.168.2.2
> >> [  144.708177] IPv4: __ip_rt_update_pmtu: fib_lookup failed for
> >> 192.168.2.1 -> 192.168.2.2
> >>
> >> Because the lookup fails, __ip_rt_update_pmtu skips creating the exception.
> >>
> >> This hack gets the lookup to succeed:
> >>
> >> fl4->flowi4_oif = dst->dev->ifindex;
> >> or
> >> fl4->flowi4_oif = 0;  
> > 
> > Oh, I didn't consider that... route. :) Here comes an added twist, which
> > currently needs Florian's changes from:
> > 	https://git.breakpoint.cc/cgit/fw/net-next.git/log/?h=udp_tun_pmtud_12
> > 
> > Test is as follows:
> > 
> > test_pmtu_ipv4_vxlan4_exception_bridge() {
> > 	test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan  4 4
> > 
> > 	ip netns add ns-C
> > 
> > 	ip -n ns-C link add veth_c_a type veth peer name veth_a_c
> > 	ip -n ns-C link set veth_a_c netns ns-A
> > 
> > 	ip -n ns-C addr add 192.168.2.100/24 dev veth_c
> > 
> > 	ip -n ns-C link set dev veth_c_a mtu 5000
> > 	ip -n ns-C link set veth_c_a up
> > 	ip -n ns-A link set dev veth_a_c mtu 5000
> > 	ip -n ns-A link set veth_c_a up
> > 
> > 	ip -n ns-A link add br0 type bridge
> > 	ip -n ns-A link set br0 up
> > 	ip -n ns-A link set dev br0 mtu 5000
> > 	ip -n ns-A link set veth_a_c master br0
> > 	ip -n ns-A link set vxlan_a master br0
> > 
> > 	ip -n ns-A addr del 192.168.2.1/24 dev vxlan_a
> > 	ip -n ns-A addr add 192.168.2.1/24 dev br0
> > 
> > 	ip -n ns-C exec ping -c 1 -w 2 -M want -s 5000 192.168.2.2
> > }
> > 
> > I didn't check the test itself recently, I'm just copying from some
> > local changes I was trying last week, some commands might be wrong.  
> 
> I fixed the exec typo, but yes even with my flowi4_oif hack it fails.
> 
> > 
> > The idea is: what if we now have another host (here, it's ns-C) sending
> > traffic to that bridge? Then the exception on a local interface isn't
> > enough, we actually need to send Fragmentation Needed back to where the
> > packet came from, and the bridge won't do it for us (with routing, it
> > already works).
> > 
> > I haven't tried your hack, but I guess it would have the same problem.
> >   
> 
> What I saw in my tests and debug statements is that vxlan xmit does
> compensate for the tunnel overhead (e.g., skb_tunnel_check_pmtu in
> vxlan_xmit_one). It still feels like there are some minor details that
> are wrong - like the fib_lookup failing when called from the
> vxlan_xmit_one path. Does finding and fixing those make it work vs
> adding another config item? I can send my debug diff if it helps.

Sorry, I forgot to answer this: I don't think so.

With your hack you can create an exception on the bridge, which fixes
the local bridge case, but if you add another node, the exception on
the local bridge doesn't help (Florian explained why in better detail),
nothing will be sent to the sender. The ICMP message is sent to the
sender in the routed case because of IP forwarding, but it won't work
here.

On top of your hack, we could now tell the bridge to send an ICMP
message if the packet is too big for the destination. The destination
isn't there, though -- finding it means building quite some IP logic
into the bridge.

The most logical thing to do, to me, seems to stick with Florian's
approach (tunnel implementation sending ICMP to the IP sender, as it
logically represents a part of the router forwarding implementation,
and it implies adjusted MTU) and try to expose that functionality in a
generic function. I'll try in a bit.
diff mbox series

Patch

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 5e65bf2fd32d..fa8e546546e3 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1195,6 +1195,13 @@  static inline void ip6_sock_set_recverr(struct sock *sk)
 	release_sock(sk);
 }
 
+static inline void ip6_sock_set_mtu_discover(struct sock *sk, int val)
+{
+	lock_sock(sk);
+	inet6_sk(sk)->pmtudisc = val;
+	release_sock(sk);
+}
+
 static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val)
 {
 	unsigned int pref = 0;
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index dd20ce99740c..f02be73bdae1 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -34,6 +34,8 @@  struct udp_port_cfg {
 	unsigned int		use_udp_checksums:1,
 				use_udp6_tx_checksums:1,
 				use_udp6_rx_checksums:1,
+				ip_pmtudisc:1,
+				ip_pmtudiscv:3,
 				ipv6_v6only:1;
 };
 
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 3eecba0874aa..1d20bd5b72ac 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -26,6 +26,8 @@  int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
 		if (err < 0)
 			goto error;
 	}
+	if (cfg->ip_pmtudisc)
+		ip_sock_set_mtu_discover(sock->sk, cfg->ip_pmtudiscv);
 
 	udp_addr.sin_family = AF_INET;
 	udp_addr.sin_addr = cfg->local_ip;
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index cdc4d4ee2420..63c22252a76f 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -34,6 +34,13 @@  int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
 		if (err < 0)
 			goto error;
 	}
+	if (cfg->ip_pmtudisc) {
+		BUILD_BUG_ON(IP_PMTUDISC_DONT != IPV6_PMTUDISC_DONT);
+		BUILD_BUG_ON(IP_PMTUDISC_OMIT != IPV6_PMTUDISC_OMIT);
+
+		ip_sock_set_mtu_discover(sock->sk, cfg->ip_pmtudiscv);
+		ip6_sock_set_mtu_discover(sock->sk, cfg->ip_pmtudiscv);
+	}
 
 	udp6_addr.sin6_family = AF_INET6;
 	memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,