diff mbox

[net-next,1/4] net: mpls: Convert number of nexthops to u8

Message ID 1490461408-9551-2-git-send-email-dsa@cumulusnetworks.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

David Ahern March 25, 2017, 5:03 p.m. UTC
Number of nexthops and number of alive nexthops are tracked using an
unsigned int. A route should never have more than 255 nexthops so
convert both to u8. Update all references and intermediate variables
to consistently use u8 as well.

Shrinks the size of mpls_route from 32 bytes to 24 bytes with a 2-byte
hole before the nexthops.

Also, the ACCESS_ONCE is changed to READ_ONCE per checkpatch message.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 net/mpls/af_mpls.c  | 29 ++++++++++++++++++-----------
 net/mpls/internal.h |  4 ++--
 2 files changed, 20 insertions(+), 13 deletions(-)

Comments

Eric W. Biederman March 27, 2017, 3:11 a.m. UTC | #1
David Ahern <dsa@cumulusnetworks.com> writes:

> Number of nexthops and number of alive nexthops are tracked using an
> unsigned int. A route should never have more than 255 nexthops so
> convert both to u8. Update all references and intermediate variables
> to consistently use u8 as well.
>
> Shrinks the size of mpls_route from 32 bytes to 24 bytes with a 2-byte
> hole before the nexthops.
>
> Also, the ACCESS_ONCE is changed to READ_ONCE per checkpatch message.

I don't like this.  Byte writes don't exist on all architectures.

So while I think always writing to rtn_nhn_alive under the
rtn_lock ensures that we don't have wrong values written
it is quite subtle.  And I don't know how this will interact with other
fields that you are introducing.

AKA this might be ok, but I expect this formulation of the code
will easily bit-rot and break.

Eric

> Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
> ---
>  net/mpls/af_mpls.c  | 29 ++++++++++++++++++-----------
>  net/mpls/internal.h |  4 ++--
>  2 files changed, 20 insertions(+), 13 deletions(-)
>
> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
> index cd8be8d5e4ad..d3dc6c43a1d1 100644
> --- a/net/mpls/af_mpls.c
> +++ b/net/mpls/af_mpls.c
> @@ -192,7 +192,7 @@ static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb)
>  static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
>  					     struct sk_buff *skb)
>  {
> -	int alive = ACCESS_ONCE(rt->rt_nhn_alive);
> +	u8 alive = READ_ONCE(rt->rt_nhn_alive);
>  	u32 hash = 0;
>  	int nh_index = 0;
>  	int n = 0;
> @@ -458,7 +458,7 @@ struct mpls_route_config {
>  	int			rc_mp_len;
>  };
>  
> -static struct mpls_route *mpls_rt_alloc(int num_nh, u8 max_alen)
> +static struct mpls_route *mpls_rt_alloc(u8 num_nh, u8 max_alen)
>  {
>  	u8 max_alen_aligned = ALIGN(max_alen, VIA_ALEN_ALIGN);
>  	struct mpls_route *rt;
> @@ -736,11 +736,11 @@ static int mpls_nh_build(struct net *net, struct mpls_route *rt,
>  	return err;
>  }
>  
> -static int mpls_count_nexthops(struct rtnexthop *rtnh, int len,
> -			       u8 cfg_via_alen, u8 *max_via_alen)
> +static u8 mpls_count_nexthops(struct rtnexthop *rtnh, int len,
> +			      u8 cfg_via_alen, u8 *max_via_alen)
>  {
> -	int nhs = 0;
>  	int remaining = len;
> +	u8 nhs = 0;
>  
>  	if (!rtnh) {
>  		*max_via_alen = cfg_via_alen;
> @@ -765,7 +765,14 @@ static int mpls_count_nexthops(struct rtnexthop *rtnh, int len,
>  						      via_alen);
>  		}
>  
> +		/* number of nexthops is tracked by a u8.
> +		 * Check for overflow.
> +		 */
> +		if (nhs == 255)
> +			return 0;
> +
>  		nhs++;
> +
>  		rtnh = rtnh_next(rtnh, &remaining);
>  	}
>  
> @@ -779,8 +786,8 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
>  	struct rtnexthop *rtnh = cfg->rc_mp;
>  	struct nlattr *nla_via, *nla_newdst;
>  	int remaining = cfg->rc_mp_len;
> -	int nhs = 0;
>  	int err = 0;
> +	u8 nhs = 0;
>  
>  	change_nexthops(rt) {
>  		int attrlen;
> @@ -834,7 +841,7 @@ static int mpls_route_add(struct mpls_route_config *cfg)
>  	int err = -EINVAL;
>  	u8 max_via_alen;
>  	unsigned index;
> -	int nhs;
> +	u8 nhs;
>  
>  	index = cfg->rc_label;
>  
> @@ -1299,8 +1306,8 @@ static void mpls_ifdown(struct net_device *dev, int event)
>  	struct mpls_route __rcu **platform_label;
>  	struct net *net = dev_net(dev);
>  	unsigned int nh_flags = RTNH_F_DEAD | RTNH_F_LINKDOWN;
> -	unsigned int alive;
>  	unsigned index;
> +	u8 alive;
>  
>  	platform_label = rtnl_dereference(net->mpls.platform_label);
>  	for (index = 0; index < net->mpls.platform_labels; index++) {
> @@ -1339,7 +1346,7 @@ static void mpls_ifup(struct net_device *dev, unsigned int nh_flags)
>  	struct mpls_route __rcu **platform_label;
>  	struct net *net = dev_net(dev);
>  	unsigned index;
> -	int alive;
> +	u8 alive;
>  
>  	platform_label = rtnl_dereference(net->mpls.platform_label);
>  	for (index = 0; index < net->mpls.platform_labels; index++) {
> @@ -1761,8 +1768,8 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
>  	} else {
>  		struct rtnexthop *rtnh;
>  		struct nlattr *mp;
> -		int dead = 0;
> -		int linkdown = 0;
> +		u8 linkdown = 0;
> +		u8 dead = 0;
>  
>  		mp = nla_nest_start(skb, RTA_MULTIPATH);
>  		if (!mp)
> diff --git a/net/mpls/internal.h b/net/mpls/internal.h
> index 62928d8fabd1..66f388ba2d49 100644
> --- a/net/mpls/internal.h
> +++ b/net/mpls/internal.h
> @@ -123,8 +123,8 @@ struct mpls_route { /* next hop label forwarding entry */
>  	u8			rt_payload_type;
>  	u8			rt_max_alen;
>  	u8			rt_ttl_propagate;
> -	unsigned int		rt_nhn;
> -	unsigned int		rt_nhn_alive;
> +	u8			rt_nhn;
> +	u8			rt_nhn_alive;
>  	struct mpls_nh		rt_nh[0];
>  };
David Ahern March 27, 2017, 2:43 p.m. UTC | #2
On 3/26/17 9:11 PM, Eric W. Biederman wrote:
> I don't like this.  Byte writes don't exist on all architectures.
> 
> So while I think always writing to rtn_nhn_alive under the
> rtn_lock ensures that we don't have wrong values written
> it is quite subtle.  And I don't know how this will interact with other
> fields that you are introducing.
> 
> AKA this might be ok, but I expect this formulation of the code
> will easily bit-rot and break.

net/ has other use cases -- e.g., ipv6 tunneling has proto as a u8.

It unrealistic for a route to have 255 or more nexthops so the point of
this patch is to not waste 8 bytes tracking it - especially when
removing it gets routes with ipv4 and ipv6 via's into a cache line.

I can make the alive counter a u16 without increasing the size of the
struct. I'd prefer to leave it as an u8 to have a u8 hole for flags
should something be needed later.
Eric W. Biederman March 27, 2017, 10:54 p.m. UTC | #3
David Ahern <dsa@cumulusnetworks.com> writes:

> On 3/26/17 9:11 PM, Eric W. Biederman wrote:
>> I don't like this.  Byte writes don't exist on all architectures.
>> 
>> So while I think always writing to rtn_nhn_alive under the
>> rtn_lock ensures that we don't have wrong values written
>> it is quite subtle.  And I don't know how this will interact with other
>> fields that you are introducing.
>> 
>> AKA this might be ok, but I expect this formulation of the code
>> will easily bit-rot and break.
>
> net/ has other use cases -- e.g., ipv6 tunneling has proto as a u8.
>
> It unrealistic for a route to have 255 or more nexthops so the point of
> this patch is to not waste 8 bytes tracking it - especially when
> removing it gets routes with ipv4 and ipv6 via's into a cache line.

The argument isn't that 255 nexthops is too few but that there is no
instruction to write to a single byte on some architectures.

My concern is that if we are writing a field using a non-byte write
without care we could easily have confusion with adjacent fields.

> I can make the alive counter a u16 without increasing the size of the
> struct. I'd prefer to leave it as an u8 to have a u8 hole for flags
> should something be needed later.

u16 is no better than u8.

The original architecture was that all changes to an mpls route would
be done in read, copy, allocate a new route, and replace the pointer
fashion.  Which allows rcu access.

There was argument made that it is silly to do that when a the network
device for a hop goes up or down.  Something about the memory allocation
not being reliable as I recall. And so we now have rt_nhn_alive and it
stored as an int so that it can be read and written atomically.

It is absolutely a no-brainer to change rt_nhn to a u8.  And I very much
appreciate all work to keep mpls_route into a single cache line.  As in
practices that is one of the most important parts to performance.

Which leads to the functions mpls_ifup, mpls_ifdown, and
mpls_select_multipath.

To make this all work correctly we need a couple of things.
- A big fat comment on struct mpls_route and mpls_nh about how
  and why these structures are modified and not replaced during
  nexthop processing.  Including the fact that it all modifications
  may only happen with rntl_lock held.

- The use of READ_ONCE and WRITE_ONCE on all rt->rt_nhn_alive accesses,
  that happen after the route is installed (and is thus rcu reachable).

- The use of READ_ONCE and WRITE_ONCE on all nh->nh_flags accesses,
  that happen after the route is installed (and is thus rcu reachable).

Someone needs to fix mpls_ifup AKA something like:

 			struct net_device *nh_dev =
 				rtnl_dereference(nh->nh_dev);

+			unhsigned int flags = READ_ONCE(nh->nh_flags);
+			if (nh_dev == dev) {
+				flags &= ~nh_flags;
+				WRITE_ONCE(nh->nh_flags, flags);
+			}
+			if (!(flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)))
+				alive++;
-			if (!(nh->nh_flags & nh_flags)) {
-				alive++;
-				continue;
-			}
-			if (nh_dev != dev)
-				continue;
-			alive++;
-			nh->nh_flags &= ~nh_flags;
 		} endfor_nexthops(rt);
 
-		ACCESS_ONCE(rt->rt_nhn_alive) = alive;
+		WRITE_ONCE(rt->rt_nhn_alive, alive);
 	}
 }

If we comment it all clearly and make very certain that the magic with
nh->nh_flags and rt->rt_nhn_alive works I don't object.  But we need to
let future people who touch the code know: here be dragons.

Especially as anything else in the same 32bits as rt->nhn_alive as our
update of that field will can rewrite those values too.  So we need
very careful to serialize any update like that.

Eric
David Ahern March 28, 2017, 3:25 p.m. UTC | #4
On 3/27/17 4:54 PM, Eric W. Biederman wrote:
> It is absolutely a no-brainer to change rt_nhn to a u8.  And I very much
> appreciate all work to keep mpls_route into a single cache line.  As in
> practices that is one of the most important parts to performance.
> 
> Which leads to the functions mpls_ifup, mpls_ifdown, and
> mpls_select_multipath.
> 
> To make this all work correctly we need a couple of things.
> - A big fat comment on struct mpls_route and mpls_nh about how
>   and why these structures are modified and not replaced during
>   nexthop processing.  Including the fact that it all modifications
>   may only happen with rntl_lock held.
> 
> - The use of READ_ONCE and WRITE_ONCE on all rt->rt_nhn_alive accesses,
>   that happen after the route is installed (and is thus rcu reachable).
> 
> - The use of READ_ONCE and WRITE_ONCE on all nh->nh_flags accesses,
>   that happen after the route is installed (and is thus rcu reachable).

For both of these, mpls_select_multipath does need to use READ_ONCE to
read the nh_flags and rt_nhn_alive. In this case it is reading a value
that could change behind its back.

The READ_ONCE is not necessary for mpls_ifdown or mpls_ifup as these are
the functions that change the values. These 2 functions only need a
WRITE_ONCE for both struct members.
Eric W. Biederman March 28, 2017, 6:39 p.m. UTC | #5
David Ahern <dsa@cumulusnetworks.com> writes:

> On 3/27/17 4:54 PM, Eric W. Biederman wrote:
>> It is absolutely a no-brainer to change rt_nhn to a u8.  And I very much
>> appreciate all work to keep mpls_route into a single cache line.  As in
>> practices that is one of the most important parts to performance.
>> 
>> Which leads to the functions mpls_ifup, mpls_ifdown, and
>> mpls_select_multipath.
>> 
>> To make this all work correctly we need a couple of things.
>> - A big fat comment on struct mpls_route and mpls_nh about how
>>   and why these structures are modified and not replaced during
>>   nexthop processing.  Including the fact that it all modifications
>>   may only happen with rntl_lock held.
>> 
>> - The use of READ_ONCE and WRITE_ONCE on all rt->rt_nhn_alive accesses,
>>   that happen after the route is installed (and is thus rcu reachable).
>> 
>> - The use of READ_ONCE and WRITE_ONCE on all nh->nh_flags accesses,
>>   that happen after the route is installed (and is thus rcu reachable).
>
> For both of these, mpls_select_multipath does need to use READ_ONCE to
> read the nh_flags and rt_nhn_alive. In this case it is reading a value
> that could change behind its back.
>
> The READ_ONCE is not necessary for mpls_ifdown or mpls_ifup as these are
> the functions that change the values. These 2 functions only need a
> WRITE_ONCE for both struct members.

True.  We don't need READ_ONCE under rtnl_lock which we use to protect writes.

Eric
diff mbox

Patch

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index cd8be8d5e4ad..d3dc6c43a1d1 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -192,7 +192,7 @@  static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb)
 static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
 					     struct sk_buff *skb)
 {
-	int alive = ACCESS_ONCE(rt->rt_nhn_alive);
+	u8 alive = READ_ONCE(rt->rt_nhn_alive);
 	u32 hash = 0;
 	int nh_index = 0;
 	int n = 0;
@@ -458,7 +458,7 @@  struct mpls_route_config {
 	int			rc_mp_len;
 };
 
-static struct mpls_route *mpls_rt_alloc(int num_nh, u8 max_alen)
+static struct mpls_route *mpls_rt_alloc(u8 num_nh, u8 max_alen)
 {
 	u8 max_alen_aligned = ALIGN(max_alen, VIA_ALEN_ALIGN);
 	struct mpls_route *rt;
@@ -736,11 +736,11 @@  static int mpls_nh_build(struct net *net, struct mpls_route *rt,
 	return err;
 }
 
-static int mpls_count_nexthops(struct rtnexthop *rtnh, int len,
-			       u8 cfg_via_alen, u8 *max_via_alen)
+static u8 mpls_count_nexthops(struct rtnexthop *rtnh, int len,
+			      u8 cfg_via_alen, u8 *max_via_alen)
 {
-	int nhs = 0;
 	int remaining = len;
+	u8 nhs = 0;
 
 	if (!rtnh) {
 		*max_via_alen = cfg_via_alen;
@@ -765,7 +765,14 @@  static int mpls_count_nexthops(struct rtnexthop *rtnh, int len,
 						      via_alen);
 		}
 
+		/* number of nexthops is tracked by a u8.
+		 * Check for overflow.
+		 */
+		if (nhs == 255)
+			return 0;
+
 		nhs++;
+
 		rtnh = rtnh_next(rtnh, &remaining);
 	}
 
@@ -779,8 +786,8 @@  static int mpls_nh_build_multi(struct mpls_route_config *cfg,
 	struct rtnexthop *rtnh = cfg->rc_mp;
 	struct nlattr *nla_via, *nla_newdst;
 	int remaining = cfg->rc_mp_len;
-	int nhs = 0;
 	int err = 0;
+	u8 nhs = 0;
 
 	change_nexthops(rt) {
 		int attrlen;
@@ -834,7 +841,7 @@  static int mpls_route_add(struct mpls_route_config *cfg)
 	int err = -EINVAL;
 	u8 max_via_alen;
 	unsigned index;
-	int nhs;
+	u8 nhs;
 
 	index = cfg->rc_label;
 
@@ -1299,8 +1306,8 @@  static void mpls_ifdown(struct net_device *dev, int event)
 	struct mpls_route __rcu **platform_label;
 	struct net *net = dev_net(dev);
 	unsigned int nh_flags = RTNH_F_DEAD | RTNH_F_LINKDOWN;
-	unsigned int alive;
 	unsigned index;
+	u8 alive;
 
 	platform_label = rtnl_dereference(net->mpls.platform_label);
 	for (index = 0; index < net->mpls.platform_labels; index++) {
@@ -1339,7 +1346,7 @@  static void mpls_ifup(struct net_device *dev, unsigned int nh_flags)
 	struct mpls_route __rcu **platform_label;
 	struct net *net = dev_net(dev);
 	unsigned index;
-	int alive;
+	u8 alive;
 
 	platform_label = rtnl_dereference(net->mpls.platform_label);
 	for (index = 0; index < net->mpls.platform_labels; index++) {
@@ -1761,8 +1768,8 @@  static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
 	} else {
 		struct rtnexthop *rtnh;
 		struct nlattr *mp;
-		int dead = 0;
-		int linkdown = 0;
+		u8 linkdown = 0;
+		u8 dead = 0;
 
 		mp = nla_nest_start(skb, RTA_MULTIPATH);
 		if (!mp)
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index 62928d8fabd1..66f388ba2d49 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -123,8 +123,8 @@  struct mpls_route { /* next hop label forwarding entry */
 	u8			rt_payload_type;
 	u8			rt_max_alen;
 	u8			rt_ttl_propagate;
-	unsigned int		rt_nhn;
-	unsigned int		rt_nhn_alive;
+	u8			rt_nhn;
+	u8			rt_nhn_alive;
 	struct mpls_nh		rt_nh[0];
 };