diff mbox series

[net] vrf: reset rt_iif for recirculated mcast out pkts

Message ID 20190625103359.31102-1-ssuryaextr@gmail.com
State Superseded
Delegated to: David Miller
Headers show
Series [net] vrf: reset rt_iif for recirculated mcast out pkts | expand

Commit Message

Stephen Suryaputra June 25, 2019, 10:33 a.m. UTC
Multicast egress packets has skb_rtable(skb)->rt_iif set to the oif.
Depending on the socket, these packets might be recirculated back as
input and raw sockets that are opened for them are bound to the VRF. But
since skb_rtable(skb) is set and its rt_iif is non-zero, inet_iif()
function returns rt_iif instead of skb_iif (the VRF netdev). Hence, the
socket lookup fails.

Signed-off-by: Stephen Suryaputra <ssuryaextr@gmail.com>
---
 include/net/route.h  |  1 +
 net/ipv4/ip_output.c | 25 ++++++++++++++++++++++++-
 net/ipv4/route.c     | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 1 deletion(-)

Comments

David Ahern June 25, 2019, 8:22 p.m. UTC | #1
On 6/25/19 4:33 AM, Stephen Suryaputra wrote:
> @@ -363,10 +376,20 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
>  #endif
>  		   ) {
>  			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
> -			if (newskb)
> +			if (newskb) {
> +				/* Reset rt_iif so that inet_iif() will return
> +				 * skb->dev->ifIndex which is the VRF device for
> +				 * socket lookup. Setting this to VRF ifindex
> +				 * causes ipi_ifindex in in_pktinfo to be
> +				 * overwritten, see ipv4_pktinfo_prepare().
> +				 */
> +				if (netif_is_l3_slave(dev))

seems like the rt_iif is a problem for recirculated mcast packets in
general, not just ones tied to a VRF.

> +					ip_mc_reset_rt_iif(net, rt, newskb);
> +
>  				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
>  					net, sk, newskb, NULL, newskb->dev,
>  					ip_mc_finish_output);
> +			}
>  		}
>  
>  		/* Multicasts with ttl 0 must not go beyond the host */
David Miller June 25, 2019, 8:31 p.m. UTC | #2
From: Stephen Suryaputra <ssuryaextr@gmail.com>
Date: Tue, 25 Jun 2019 06:33:59 -0400

> Multicast egress packets has skb_rtable(skb)->rt_iif set to the oif.
> Depending on the socket, these packets might be recirculated back as
> input and raw sockets that are opened for them are bound to the VRF. But
> since skb_rtable(skb) is set and its rt_iif is non-zero, inet_iif()
> function returns rt_iif instead of skb_iif (the VRF netdev). Hence, the
> socket lookup fails.
> 
> Signed-off-by: Stephen Suryaputra <ssuryaextr@gmail.com>

David A., please review.

Thank you.
David Ahern June 25, 2019, 8:36 p.m. UTC | #3
On 6/25/19 2:22 PM, David Ahern wrote:
> On 6/25/19 4:33 AM, Stephen Suryaputra wrote:
>> @@ -363,10 +376,20 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
>>  #endif
>>  		   ) {
>>  			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
>> -			if (newskb)
>> +			if (newskb) {
>> +				/* Reset rt_iif so that inet_iif() will return
>> +				 * skb->dev->ifIndex which is the VRF device for
>> +				 * socket lookup. Setting this to VRF ifindex
>> +				 * causes ipi_ifindex in in_pktinfo to be
>> +				 * overwritten, see ipv4_pktinfo_prepare().
>> +				 */
>> +				if (netif_is_l3_slave(dev))
> 
> seems like the rt_iif is a problem for recirculated mcast packets in
> general, not just ones tied to a VRF.
> 
>> +					ip_mc_reset_rt_iif(net, rt, newskb);
>> +
>>  				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
>>  					net, sk, newskb, NULL, newskb->dev,
>>  					ip_mc_finish_output);
>> +			}
>>  		}
>>  
>>  		/* Multicasts with ttl 0 must not go beyond the host */

Also, wouldn't this problem apply to broadcast packets as well if they
get recirculated back up the stack? Maybe then the reset_rt_iif needs to
be done in ip_mc_finish_output.
Stephen Suryaputra June 25, 2019, 8:42 p.m. UTC | #4
On Tue, Jun 25, 2019 at 4:22 PM David Ahern <dsahern@gmail.com> wrote:
>
> On 6/25/19 4:33 AM, Stephen Suryaputra wrote:
> > @@ -363,10 +376,20 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
> >  #endif
> >                  ) {
> >                       struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
> > -                     if (newskb)
> > +                     if (newskb) {
> > +                             /* Reset rt_iif so that inet_iif() will return
> > +                              * skb->dev->ifIndex which is the VRF device for
> > +                              * socket lookup. Setting this to VRF ifindex
> > +                              * causes ipi_ifindex in in_pktinfo to be
> > +                              * overwritten, see ipv4_pktinfo_prepare().
> > +                              */
> > +                             if (netif_is_l3_slave(dev))
>
> seems like the rt_iif is a problem for recirculated mcast packets in
> general, not just ones tied to a VRF.

It seems so to me too but I wonder why this hasn't been seen...
Can I get more feedbacks on this? If there is an agreement to fix this
generally, I will remove the if clause and respin the patch with more
accurate changelog.

>
> > +                                     ip_mc_reset_rt_iif(net, rt, newskb);
> > +
> >                               NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
> >                                       net, sk, newskb, NULL, newskb->dev,
> >                                       ip_mc_finish_output);
> > +                     }
> >               }
> >
> >               /* Multicasts with ttl 0 must not go beyond the host */

Thanks.
David Ahern June 25, 2019, 8:52 p.m. UTC | #5
On 6/25/19 2:42 PM, Stephen Suryaputra wrote:
> On Tue, Jun 25, 2019 at 4:22 PM David Ahern <dsahern@gmail.com> wrote:
>>
>> On 6/25/19 4:33 AM, Stephen Suryaputra wrote:
>>> @@ -363,10 +376,20 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
>>>  #endif
>>>                  ) {
>>>                       struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
>>> -                     if (newskb)
>>> +                     if (newskb) {
>>> +                             /* Reset rt_iif so that inet_iif() will return
>>> +                              * skb->dev->ifIndex which is the VRF device for
>>> +                              * socket lookup. Setting this to VRF ifindex
>>> +                              * causes ipi_ifindex in in_pktinfo to be
>>> +                              * overwritten, see ipv4_pktinfo_prepare().
>>> +                              */
>>> +                             if (netif_is_l3_slave(dev))
>>
>> seems like the rt_iif is a problem for recirculated mcast packets in
>> general, not just ones tied to a VRF.
> 
> It seems so to me too but I wonder why this hasn't been seen...
> Can I get more feedbacks on this? If there is an agreement to fix this
> generally, I will remove the if clause and respin the patch with more
> accurate changelog.

rt_iif is used to save the original oif during a route lookup so if
packets are delivered locally apps can know the real oif and not the
loopback/VRF device which is just a trick for local traffic.

The raw socket lookup was recently changed to handle local traffic with
raw sockets bound to a device. e.g., ping was recently changed to use
SO_BINDTODEVICE vs IP_PKTINFO and revealed that the socket lookup was
not considering rt_iif when doing the lookup for 'ping -I <dev> <dev addr>'

The mcast use case seems to get hung up on rt_iif being set when packets
are recirculated for local delivery which is slightly different use case
than local traffic to local addresses.
diff mbox series

Patch

diff --git a/include/net/route.h b/include/net/route.h
index 065b47754f05..55ff71ffb796 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -221,6 +221,7 @@  void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt);
 struct rtable *rt_dst_alloc(struct net_device *dev,
 			     unsigned int flags, u16 type,
 			     bool nopolicy, bool noxfrm, bool will_cache);
+struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt);
 
 struct in_ifaddr;
 void fib_add_ifaddr(struct in_ifaddr *);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 16f9159234a2..a5e240bad3ce 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -329,6 +329,19 @@  static int ip_mc_finish_output(struct net *net, struct sock *sk,
 	return dev_loopback_xmit(net, sk, skb);
 }
 
+static void ip_mc_reset_rt_iif(struct net *net, struct rtable *rt,
+			       struct sk_buff *newskb)
+{
+	struct rtable *new_rt;
+
+	new_rt = rt_dst_clone(net->loopback_dev, rt);
+	if (new_rt) {
+		new_rt->rt_iif = 0;
+		skb_dst_drop(newskb);
+		skb_dst_set(newskb, &new_rt->dst);
+	}
+}
+
 int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct rtable *rt = skb_rtable(skb);
@@ -363,10 +376,20 @@  int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 #endif
 		   ) {
 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
-			if (newskb)
+			if (newskb) {
+				/* Reset rt_iif so that inet_iif() will return
+				 * skb->dev->ifIndex which is the VRF device for
+				 * socket lookup. Setting this to VRF ifindex
+				 * causes ipi_ifindex in in_pktinfo to be
+				 * overwritten, see ipv4_pktinfo_prepare().
+				 */
+				if (netif_is_l3_slave(dev))
+					ip_mc_reset_rt_iif(net, rt, newskb);
+
 				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 					net, sk, newskb, NULL, newskb->dev,
 					ip_mc_finish_output);
+			}
 		}
 
 		/* Multicasts with ttl 0 must not go beyond the host */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6cb7cff22db9..8ea0735a6754 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1647,6 +1647,39 @@  struct rtable *rt_dst_alloc(struct net_device *dev,
 }
 EXPORT_SYMBOL(rt_dst_alloc);
 
+struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
+{
+	struct rtable *new_rt;
+
+	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+			   rt->dst.flags);
+
+	if (new_rt) {
+		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
+		new_rt->rt_flags = rt->rt_flags;
+		new_rt->rt_type = rt->rt_type;
+		new_rt->rt_is_input = rt->rt_is_input;
+		new_rt->rt_iif = rt->rt_iif;
+		new_rt->rt_pmtu = rt->rt_pmtu;
+		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
+		new_rt->rt_gw_family = rt->rt_gw_family;
+		if (rt->rt_gw_family == AF_INET)
+			new_rt->rt_gw4 = rt->rt_gw4;
+		else if (rt->rt_gw_family == AF_INET6)
+			new_rt->rt_gw6 = rt->rt_gw6;
+		INIT_LIST_HEAD(&new_rt->rt_uncached);
+
+		new_rt->dst.flags |= DST_HOST;
+		new_rt->dst.input = rt->dst.input;
+		new_rt->dst.output = rt->dst.output;
+		new_rt->dst.error = rt->dst.error;
+		new_rt->dst.lastuse = jiffies;
+		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
+	}
+	return new_rt;
+}
+EXPORT_SYMBOL(rt_dst_clone);
+
 /* called in rcu_read_lock() section */
 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			  u8 tos, struct net_device *dev,