Message ID | 1490461408-9551-2-git-send-email-dsa@cumulusnetworks.com |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
David Ahern <dsa@cumulusnetworks.com> writes: > Number of nexthops and number of alive nexthops are tracked using an > unsigned int. A route should never have more than 255 nexthops so > convert both to u8. Update all references and intermediate variables > to consistently use u8 as well. > > Shrinks the size of mpls_route from 32 bytes to 24 bytes with a 2-byte > hole before the nexthops. > > Also, the ACCESS_ONCE is changed to READ_ONCE per checkpatch message. I don't like this. Byte writes don't exist on all architectures. So while I think always writing to rtn_nhn_alive under the rtn_lock ensures that we don't have wrong values written it is quite subtle. And I don't know how this will interact with other fields that you are introducing. AKA this might be ok, but I expect this formulation of the code will easily bit-rot and break. Eric > Signed-off-by: David Ahern <dsa@cumulusnetworks.com> > --- > net/mpls/af_mpls.c | 29 ++++++++++++++++++----------- > net/mpls/internal.h | 4 ++-- > 2 files changed, 20 insertions(+), 13 deletions(-) > > diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c > index cd8be8d5e4ad..d3dc6c43a1d1 100644 > --- a/net/mpls/af_mpls.c > +++ b/net/mpls/af_mpls.c > @@ -192,7 +192,7 @@ static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb) > static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, > struct sk_buff *skb) > { > - int alive = ACCESS_ONCE(rt->rt_nhn_alive); > + u8 alive = READ_ONCE(rt->rt_nhn_alive); > u32 hash = 0; > int nh_index = 0; > int n = 0; > @@ -458,7 +458,7 @@ struct mpls_route_config { > int rc_mp_len; > }; > > -static struct mpls_route *mpls_rt_alloc(int num_nh, u8 max_alen) > +static struct mpls_route *mpls_rt_alloc(u8 num_nh, u8 max_alen) > { > u8 max_alen_aligned = ALIGN(max_alen, VIA_ALEN_ALIGN); > struct mpls_route *rt; > @@ -736,11 +736,11 @@ static int mpls_nh_build(struct net *net, struct mpls_route *rt, > return err; > } > > -static int mpls_count_nexthops(struct rtnexthop *rtnh, int len, > - u8 cfg_via_alen, u8 *max_via_alen) > +static u8 mpls_count_nexthops(struct rtnexthop *rtnh, int len, > + u8 cfg_via_alen, u8 *max_via_alen) > { > - int nhs = 0; > int remaining = len; > + u8 nhs = 0; > > if (!rtnh) { > *max_via_alen = cfg_via_alen; > @@ -765,7 +765,14 @@ static int mpls_count_nexthops(struct rtnexthop *rtnh, int len, > via_alen); > } > > + /* number of nexthops is tracked by a u8. > + * Check for overflow. > + */ > + if (nhs == 255) > + return 0; > + > nhs++; > + > rtnh = rtnh_next(rtnh, &remaining); > } > > @@ -779,8 +786,8 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg, > struct rtnexthop *rtnh = cfg->rc_mp; > struct nlattr *nla_via, *nla_newdst; > int remaining = cfg->rc_mp_len; > - int nhs = 0; > int err = 0; > + u8 nhs = 0; > > change_nexthops(rt) { > int attrlen; > @@ -834,7 +841,7 @@ static int mpls_route_add(struct mpls_route_config *cfg) > int err = -EINVAL; > u8 max_via_alen; > unsigned index; > - int nhs; > + u8 nhs; > > index = cfg->rc_label; > > @@ -1299,8 +1306,8 @@ static void mpls_ifdown(struct net_device *dev, int event) > struct mpls_route __rcu **platform_label; > struct net *net = dev_net(dev); > unsigned int nh_flags = RTNH_F_DEAD | RTNH_F_LINKDOWN; > - unsigned int alive; > unsigned index; > + u8 alive; > > platform_label = rtnl_dereference(net->mpls.platform_label); > for (index = 0; index < net->mpls.platform_labels; index++) { > @@ -1339,7 +1346,7 @@ static void mpls_ifup(struct net_device *dev, unsigned int nh_flags) > struct mpls_route __rcu **platform_label; > struct net *net = dev_net(dev); > unsigned index; > - int alive; > + u8 alive; > > platform_label = rtnl_dereference(net->mpls.platform_label); > for (index = 0; index < net->mpls.platform_labels; index++) { > @@ -1761,8 +1768,8 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, > } else { > struct rtnexthop *rtnh; > struct nlattr *mp; > - int dead = 0; > - int linkdown = 0; > + u8 linkdown = 0; > + u8 dead = 0; > > mp = nla_nest_start(skb, RTA_MULTIPATH); > if (!mp) > diff --git a/net/mpls/internal.h b/net/mpls/internal.h > index 62928d8fabd1..66f388ba2d49 100644 > --- a/net/mpls/internal.h > +++ b/net/mpls/internal.h > @@ -123,8 +123,8 @@ struct mpls_route { /* next hop label forwarding entry */ > u8 rt_payload_type; > u8 rt_max_alen; > u8 rt_ttl_propagate; > - unsigned int rt_nhn; > - unsigned int rt_nhn_alive; > + u8 rt_nhn; > + u8 rt_nhn_alive; > struct mpls_nh rt_nh[0]; > };
On 3/26/17 9:11 PM, Eric W. Biederman wrote: > I don't like this. Byte writes don't exist on all architectures. > > So while I think always writing to rtn_nhn_alive under the > rtn_lock ensures that we don't have wrong values written > it is quite subtle. And I don't know how this will interact with other > fields that you are introducing. > > AKA this might be ok, but I expect this formulation of the code > will easily bit-rot and break. net/ has other use cases -- e.g., ipv6 tunneling has proto as a u8. It unrealistic for a route to have 255 or more nexthops so the point of this patch is to not waste 8 bytes tracking it - especially when removing it gets routes with ipv4 and ipv6 via's into a cache line. I can make the alive counter a u16 without increasing the size of the struct. I'd prefer to leave it as an u8 to have a u8 hole for flags should something be needed later.
David Ahern <dsa@cumulusnetworks.com> writes: > On 3/26/17 9:11 PM, Eric W. Biederman wrote: >> I don't like this. Byte writes don't exist on all architectures. >> >> So while I think always writing to rtn_nhn_alive under the >> rtn_lock ensures that we don't have wrong values written >> it is quite subtle. And I don't know how this will interact with other >> fields that you are introducing. >> >> AKA this might be ok, but I expect this formulation of the code >> will easily bit-rot and break. > > net/ has other use cases -- e.g., ipv6 tunneling has proto as a u8. > > It unrealistic for a route to have 255 or more nexthops so the point of > this patch is to not waste 8 bytes tracking it - especially when > removing it gets routes with ipv4 and ipv6 via's into a cache line. The argument isn't that 255 nexthops is too few but that there is no instruction to write to a single byte on some architectures. My concern is that if we are writing a field using a non-byte write without care we could easily have confusion with adjacent fields. > I can make the alive counter a u16 without increasing the size of the > struct. I'd prefer to leave it as an u8 to have a u8 hole for flags > should something be needed later. u16 is no better than u8. The original architecture was that all changes to an mpls route would be done in read, copy, allocate a new route, and replace the pointer fashion. Which allows rcu access. There was argument made that it is silly to do that when a the network device for a hop goes up or down. Something about the memory allocation not being reliable as I recall. And so we now have rt_nhn_alive and it stored as an int so that it can be read and written atomically. It is absolutely a no-brainer to change rt_nhn to a u8. And I very much appreciate all work to keep mpls_route into a single cache line. As in practices that is one of the most important parts to performance. Which leads to the functions mpls_ifup, mpls_ifdown, and mpls_select_multipath. To make this all work correctly we need a couple of things. - A big fat comment on struct mpls_route and mpls_nh about how and why these structures are modified and not replaced during nexthop processing. Including the fact that it all modifications may only happen with rntl_lock held. - The use of READ_ONCE and WRITE_ONCE on all rt->rt_nhn_alive accesses, that happen after the route is installed (and is thus rcu reachable). - The use of READ_ONCE and WRITE_ONCE on all nh->nh_flags accesses, that happen after the route is installed (and is thus rcu reachable). Someone needs to fix mpls_ifup AKA something like: struct net_device *nh_dev = rtnl_dereference(nh->nh_dev); + unhsigned int flags = READ_ONCE(nh->nh_flags); + if (nh_dev == dev) { + flags &= ~nh_flags; + WRITE_ONCE(nh->nh_flags, flags); + } + if (!(flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))) + alive++; - if (!(nh->nh_flags & nh_flags)) { - alive++; - continue; - } - if (nh_dev != dev) - continue; - alive++; - nh->nh_flags &= ~nh_flags; } endfor_nexthops(rt); - ACCESS_ONCE(rt->rt_nhn_alive) = alive; + WRITE_ONCE(rt->rt_nhn_alive, alive); } } If we comment it all clearly and make very certain that the magic with nh->nh_flags and rt->rt_nhn_alive works I don't object. But we need to let future people who touch the code know: here be dragons. Especially as anything else in the same 32bits as rt->nhn_alive as our update of that field will can rewrite those values too. So we need very careful to serialize any update like that. Eric
On 3/27/17 4:54 PM, Eric W. Biederman wrote: > It is absolutely a no-brainer to change rt_nhn to a u8. And I very much > appreciate all work to keep mpls_route into a single cache line. As in > practices that is one of the most important parts to performance. > > Which leads to the functions mpls_ifup, mpls_ifdown, and > mpls_select_multipath. > > To make this all work correctly we need a couple of things. > - A big fat comment on struct mpls_route and mpls_nh about how > and why these structures are modified and not replaced during > nexthop processing. Including the fact that it all modifications > may only happen with rntl_lock held. > > - The use of READ_ONCE and WRITE_ONCE on all rt->rt_nhn_alive accesses, > that happen after the route is installed (and is thus rcu reachable). > > - The use of READ_ONCE and WRITE_ONCE on all nh->nh_flags accesses, > that happen after the route is installed (and is thus rcu reachable). For both of these, mpls_select_multipath does need to use READ_ONCE to read the nh_flags and rt_nhn_alive. In this case it is reading a value that could change behind its back. The READ_ONCE is not necessary for mpls_ifdown or mpls_ifup as these are the functions that change the values. These 2 functions only need a WRITE_ONCE for both struct members.
David Ahern <dsa@cumulusnetworks.com> writes: > On 3/27/17 4:54 PM, Eric W. Biederman wrote: >> It is absolutely a no-brainer to change rt_nhn to a u8. And I very much >> appreciate all work to keep mpls_route into a single cache line. As in >> practices that is one of the most important parts to performance. >> >> Which leads to the functions mpls_ifup, mpls_ifdown, and >> mpls_select_multipath. >> >> To make this all work correctly we need a couple of things. >> - A big fat comment on struct mpls_route and mpls_nh about how >> and why these structures are modified and not replaced during >> nexthop processing. Including the fact that it all modifications >> may only happen with rntl_lock held. >> >> - The use of READ_ONCE and WRITE_ONCE on all rt->rt_nhn_alive accesses, >> that happen after the route is installed (and is thus rcu reachable). >> >> - The use of READ_ONCE and WRITE_ONCE on all nh->nh_flags accesses, >> that happen after the route is installed (and is thus rcu reachable). > > For both of these, mpls_select_multipath does need to use READ_ONCE to > read the nh_flags and rt_nhn_alive. In this case it is reading a value > that could change behind its back. > > The READ_ONCE is not necessary for mpls_ifdown or mpls_ifup as these are > the functions that change the values. These 2 functions only need a > WRITE_ONCE for both struct members. True. We don't need READ_ONCE under rtnl_lock which we use to protect writes. Eric
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index cd8be8d5e4ad..d3dc6c43a1d1 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -192,7 +192,7 @@ static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb) static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, struct sk_buff *skb) { - int alive = ACCESS_ONCE(rt->rt_nhn_alive); + u8 alive = READ_ONCE(rt->rt_nhn_alive); u32 hash = 0; int nh_index = 0; int n = 0; @@ -458,7 +458,7 @@ struct mpls_route_config { int rc_mp_len; }; -static struct mpls_route *mpls_rt_alloc(int num_nh, u8 max_alen) +static struct mpls_route *mpls_rt_alloc(u8 num_nh, u8 max_alen) { u8 max_alen_aligned = ALIGN(max_alen, VIA_ALEN_ALIGN); struct mpls_route *rt; @@ -736,11 +736,11 @@ static int mpls_nh_build(struct net *net, struct mpls_route *rt, return err; } -static int mpls_count_nexthops(struct rtnexthop *rtnh, int len, - u8 cfg_via_alen, u8 *max_via_alen) +static u8 mpls_count_nexthops(struct rtnexthop *rtnh, int len, + u8 cfg_via_alen, u8 *max_via_alen) { - int nhs = 0; int remaining = len; + u8 nhs = 0; if (!rtnh) { *max_via_alen = cfg_via_alen; @@ -765,7 +765,14 @@ static int mpls_count_nexthops(struct rtnexthop *rtnh, int len, via_alen); } + /* number of nexthops is tracked by a u8. + * Check for overflow. + */ + if (nhs == 255) + return 0; + nhs++; + rtnh = rtnh_next(rtnh, &remaining); } @@ -779,8 +786,8 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg, struct rtnexthop *rtnh = cfg->rc_mp; struct nlattr *nla_via, *nla_newdst; int remaining = cfg->rc_mp_len; - int nhs = 0; int err = 0; + u8 nhs = 0; change_nexthops(rt) { int attrlen; @@ -834,7 +841,7 @@ static int mpls_route_add(struct mpls_route_config *cfg) int err = -EINVAL; u8 max_via_alen; unsigned index; - int nhs; + u8 nhs; index = cfg->rc_label; @@ -1299,8 +1306,8 @@ static void mpls_ifdown(struct net_device *dev, int event) struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); unsigned int nh_flags = RTNH_F_DEAD | RTNH_F_LINKDOWN; - unsigned int alive; unsigned index; + u8 alive; platform_label = rtnl_dereference(net->mpls.platform_label); for (index = 0; index < net->mpls.platform_labels; index++) { @@ -1339,7 +1346,7 @@ static void mpls_ifup(struct net_device *dev, unsigned int nh_flags) struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); unsigned index; - int alive; + u8 alive; platform_label = rtnl_dereference(net->mpls.platform_label); for (index = 0; index < net->mpls.platform_labels; index++) { @@ -1761,8 +1768,8 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, } else { struct rtnexthop *rtnh; struct nlattr *mp; - int dead = 0; - int linkdown = 0; + u8 linkdown = 0; + u8 dead = 0; mp = nla_nest_start(skb, RTA_MULTIPATH); if (!mp) diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 62928d8fabd1..66f388ba2d49 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -123,8 +123,8 @@ struct mpls_route { /* next hop label forwarding entry */ u8 rt_payload_type; u8 rt_max_alen; u8 rt_ttl_propagate; - unsigned int rt_nhn; - unsigned int rt_nhn_alive; + u8 rt_nhn; + u8 rt_nhn_alive; struct mpls_nh rt_nh[0]; };
Number of nexthops and number of alive nexthops are tracked using an unsigned int. A route should never have more than 255 nexthops so convert both to u8. Update all references and intermediate variables to consistently use u8 as well. Shrinks the size of mpls_route from 32 bytes to 24 bytes with a 2-byte hole before the nexthops. Also, the ACCESS_ONCE is changed to READ_ONCE per checkpatch message. Signed-off-by: David Ahern <dsa@cumulusnetworks.com> --- net/mpls/af_mpls.c | 29 ++++++++++++++++++----------- net/mpls/internal.h | 4 ++-- 2 files changed, 20 insertions(+), 13 deletions(-)