Message ID | 20100804105515.22212.41598.sendpatchset@krkumar2.in.ibm.com |
---|---|
State | Superseded, archived |
Delegated to: | David Miller |
Headers | show |
On Wed, Aug 4, 2010 at 6:55 PM, Krishna Kumar <krkumar2@in.ibm.com> wrote: > From: Krishna Kumar <krkumar2@in.ibm.com> > > Factor out flow calculation code from get_rps_cpu, since other > functions can use the same code. > > Revisions: > > v2 - Ben: Separate flow calcuation out and use in select queue. > v3 - Arnd: Don't re-implement MIN. > v4 - Changli: skb->data points to ethernet header in macvtap, and > make a fast path. > > Tested macvtap with this patch. > > Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com> > --- > include/linux/skbuff.h | 11 ++++ > net/core/dev.c | 107 ++++++++++++++++++++++----------------- > 2 files changed, 74 insertions(+), 44 deletions(-) > > diff -ruNp org/include/linux/skbuff.h new/include/linux/skbuff.h > --- org/include/linux/skbuff.h 2010-08-04 09:07:05.000000000 +0530 > +++ new/include/linux/skbuff.h 2010-08-04 16:19:30.000000000 +0530 > @@ -558,6 +558,17 @@ extern unsigned int skb_find_text(stru > unsigned int to, struct ts_config *config, > struct ts_state *state); > > +extern __u32 __skb_get_rxhash(struct sk_buff *skb); > +static inline __u32 skb_get_rxhash(struct sk_buff *skb) > +{ > + __u32 rxhash = skb->rxhash; > + > + if (!rxhash) > + rxhash = __skb_get_rxhash(skb); > + > + return rxhash; > +} > + > #ifdef NET_SKBUFF_DATA_USES_OFFSET > static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) > { > diff -ruNp org/net/core/dev.c new/net/core/dev.c > --- org/net/core/dev.c 2010-08-04 09:07:05.000000000 +0530 > +++ new/net/core/dev.c 2010-08-04 16:19:30.000000000 +0530 > @@ -2259,69 +2259,41 @@ static inline void ____napi_schedule(str > __raise_softirq_irqoff(NET_RX_SOFTIRQ); > } > > -#ifdef CONFIG_RPS > - > -/* One global table that all flow-based protocols share. 
*/ > -struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; > -EXPORT_SYMBOL(rps_sock_flow_table); > - > /* > - * get_rps_cpu is called from netif_receive_skb and returns the target > - * CPU from the RPS map of the receiving queue for a given skb. > - * rcu_read_lock must be held on entry. > + * __skb_get_rxhash: calculate a flow hash based on src/dst addresses > + * and src/dst port numbers. On success, returns a non-zero hash number, > + * otherwise 0. > */ > -static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, > - struct rps_dev_flow **rflowp) > +__u32 __skb_get_rxhash(struct sk_buff *skb) > { > + int nhoff, hash = 0; > struct ipv6hdr *ip6; > struct iphdr *ip; > - struct netdev_rx_queue *rxqueue; > - struct rps_map *map; > - struct rps_dev_flow_table *flow_table; > - struct rps_sock_flow_table *sock_flow_table; > - int cpu = -1; > u8 ip_proto; > - u16 tcpu; > u32 addr1, addr2, ihl; > union { > u32 v32; > u16 v16[2]; > } ports; > > - if (skb_rx_queue_recorded(skb)) { > - u16 index = skb_get_rx_queue(skb); > - if (unlikely(index >= dev->num_rx_queues)) { > - WARN_ONCE(dev->num_rx_queues > 1, "%s received packet " > - "on queue %u, but number of RX queues is %u\n", > - dev->name, index, dev->num_rx_queues); > - goto done; > - } > - rxqueue = dev->_rx + index; > - } else > - rxqueue = dev->_rx; > - > - if (!rxqueue->rps_map && !rxqueue->rps_flow_table) > - goto done; > - > - if (skb->rxhash) > - goto got_hash; /* Skip hash computation on packet header */ > + nhoff = skb_network_offset(skb); > > switch (skb->protocol) { > case __constant_htons(ETH_P_IP): > - if (!pskb_may_pull(skb, sizeof(*ip))) > + if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) > goto done; > > - ip = (struct iphdr *) skb->data; > + ip = (struct iphdr *) skb->data + nhoff; > ip_proto = ip->protocol; > addr1 = (__force u32) ip->saddr; > addr2 = (__force u32) ip->daddr; > ihl = ip->ihl; > break; > case __constant_htons(ETH_P_IPV6): > - if (!pskb_may_pull(skb, sizeof(*ip6))) > 
+ if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) > goto done; > > - ip6 = (struct ipv6hdr *) skb->data; > + ip6 = (struct ipv6hdr *) skb->data + nhoff; > ip_proto = ip6->nexthdr; > addr1 = (__force u32) ip6->saddr.s6_addr32[3]; > addr2 = (__force u32) ip6->daddr.s6_addr32[3]; > @@ -2330,6 +2302,7 @@ static int get_rps_cpu(struct net_device > default: > goto done; > } > + > switch (ip_proto) { > case IPPROTO_TCP: > case IPPROTO_UDP: > @@ -2338,8 +2311,9 @@ static int get_rps_cpu(struct net_device > case IPPROTO_AH: > case IPPROTO_SCTP: > case IPPROTO_UDPLITE: > - if (pskb_may_pull(skb, (ihl * 4) + 4)) { > - ports.v32 = * (__force u32 *) (skb->data + (ihl * 4)); > + if (pskb_may_pull(skb, (ihl * 4) + 4 + nhoff)) { > + ports.v32 = * (__force u32 *) (skb->data + nhoff + > + (ihl * 4)); > if (ports.v16[1] < ports.v16[0]) > swap(ports.v16[0], ports.v16[1]); > break; > @@ -2352,11 +2326,56 @@ static int get_rps_cpu(struct net_device > /* get a consistent hash (same value on both flow directions) */ > if (addr2 < addr1) > swap(addr1, addr2); > - skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd); > - if (!skb->rxhash) > - skb->rxhash = 1; > > -got_hash: > + hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); > + if (!hash) > + hash = 1; > + > +done: > + return hash; > +} > +EXPORT_SYMBOL(__skb_get_rxhash); > + > +#ifdef CONFIG_RPS > + > +/* One global table that all flow-based protocols share. */ > +struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; > +EXPORT_SYMBOL(rps_sock_flow_table); > + > +/* > + * get_rps_cpu is called from netif_receive_skb and returns the target > + * CPU from the RPS map of the receiving queue for a given skb. > + * rcu_read_lock must be held on entry. 
> + */ > +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, > + struct rps_dev_flow **rflowp) > +{ > + struct netdev_rx_queue *rxqueue; > + struct rps_map *map; > + struct rps_dev_flow_table *flow_table; > + struct rps_sock_flow_table *sock_flow_table; > + int cpu = -1; > + u16 tcpu; > + > + if (skb_rx_queue_recorded(skb)) { > + u16 index = skb_get_rx_queue(skb); > + if (unlikely(index >= dev->num_rx_queues)) { > + WARN_ONCE(dev->num_rx_queues > 1, "%s received packet " > + "on queue %u, but number of RX queues is %u\n", > + dev->name, index, dev->num_rx_queues); > + goto done; > + } > + rxqueue = dev->_rx + index; > + } else > + rxqueue = dev->_rx; > + > + if (!rxqueue->rps_map && !rxqueue->rps_flow_table) > + goto done; > + > + skb->rxhash = skb_get_rxhash(skb); the return value of skb_get_rxhash() maybe skb->rxhash, so we don't need to assign it to skb->rxhash again. Use a local variable to save the value and __skb_get_rxhash() should cache the rxhash into skb->rxhash for future use. > + if (skb->rxhash < 0) > + goto done; > + > flow_table = rcu_dereference(rxqueue->rps_flow_table); > sock_flow_table = rcu_dereference(rps_sock_flow_table); > if (flow_table && sock_flow_table) { >
Changli Gao <xiaosuo@gmail.com> wrote on 08/04/2010 04:52:58 PM: > > +#ifdef CONFIG_RPS > > + > > +/* One global table that all flow-based protocols share. */ > > +struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; > > +EXPORT_SYMBOL(rps_sock_flow_table); > > + > > +/* > > + * get_rps_cpu is called from netif_receive_skb and returns the target > > + * CPU from the RPS map of the receiving queue for a given skb. > > + * rcu_read_lock must be held on entry. > > + */ > > +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, > > + struct rps_dev_flow **rflowp) > > +{ > > + struct netdev_rx_queue *rxqueue; > > + struct rps_map *map; > > + struct rps_dev_flow_table *flow_table; > > + struct rps_sock_flow_table *sock_flow_table; > > + int cpu = -1; > > + u16 tcpu; > > + > > + if (skb_rx_queue_recorded(skb)) { > > + u16 index = skb_get_rx_queue(skb); > > + if (unlikely(index >= dev->num_rx_queues)) { > > + WARN_ONCE(dev->num_rx_queues > 1, "%s > received packet " > > + "on queue %u, but number of RX > queues is %u\n", > > + dev->name, index, dev->num_rx_queues); > > + goto done; > > + } > > + rxqueue = dev->_rx + index; > > + } else > > + rxqueue = dev->_rx; > > + > > + if (!rxqueue->rps_map && !rxqueue->rps_flow_table) > > + goto done; > > + > > + skb->rxhash = skb_get_rxhash(skb); > > the return value of skb_get_rxhash() maybe skb->rxhash, so we don't > need to assign it to skb->rxhash again. Use a local variable to save > the value and __skb_get_rxhash() should cache the rxhash into > skb->rxhash for future use. I wanted the function to return a rxhash instead of save+return, since other users, eg macvtap, doesn't need the rxhash beyond calculating a rxq with that hash. So for those users, we can avoid updating both skb->rxhash and a local variable (since with your suggestion, rps will update either one or two variables depending on whether rxhash is cached or not). I am not sure which is better. 
Thanks, - KK -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Aug 4, 2010 at 8:02 PM, Krishna Kumar2 <krkumar2@in.ibm.com> wrote: > Changli Gao <xiaosuo@gmail.com> wrote on 08/04/2010 04:52:58 PM: >> >> the return value of skb_get_rxhash() maybe skb->rxhash, so we don't >> need to assign it to skb->rxhash again. Use a local variable to save >> the value and __skb_get_rxhash() should cache the rxhash into >> skb->rxhash for future use. > > I wanted the function to return a rxhash instead of save+return, > since other users, eg macvtap, doesn't need the rxhash beyond > calculating a rxq with that hash. So for those users, we can > avoid updating both skb->rxhash and a local variable (since with > your suggestion, rps will update either one or two variables > depending on whether rxhash is cached or not). I am not sure > which is better. > rxhash can also be generated by NIC (hardware) and used somewhere else. So for these users, generating the rxhash again is wasting time. Thanks.
Changli Gao <xiaosuo@gmail.com> wrote on 08/04/2010 07:08:39 PM: > Changli Gao <xiaosuo@gmail.com> > 08/04/2010 07:08 PM > > To > > Krishna Kumar2/India/IBM@IBMIN > > cc > > arnd@arndb.de, bhutchings@solarflare.com, davem@davemloft.net, > mst@redhat.com, netdev@vger.kernel.org, therbert@google.com > > Subject > > Re: [PATCH v4 1/2] core: Factor out flow calculation from get_rps_cpu > > On Wed, Aug 4, 2010 at 8:02 PM, Krishna Kumar2 <krkumar2@in.ibm.com> wrote: > > Changli Gao <xiaosuo@gmail.com> wrote on 08/04/2010 04:52:58 PM: > >> > >> the return value of skb_get_rxhash() maybe skb->rxhash, so we don't > >> need to assign it to skb->rxhash again. Use a local variable to save > >> the value and __skb_get_rxhash() should cache the rxhash into > >> skb->rxhash for future use. > > > > I wanted the function to return a rxhash instead of save+return, > > since other users, eg macvtap, doesn't need the rxhash beyond > > calculating a rxq with that hash. So for those users, we can > > avoid updating both skb->rxhash and a local variable (since with > > your suggestion, rps will update either one or two variables > > depending on whether rxhash is cached or not). I am not sure > > which is better. > > > > rxhash can also be generated by NIC(hardware) and used some where > else. So for these users, generating the rxhash again is wasting time. Yes, I expect that to happen once. By not saving it in skb, atleast for macvtap, I didn't see any further rx path. I guess I could do this: get_rps_cpu() { if (!skb_get_rxhash(skb)) goto done; /* use skb->rxhash from here on */ } Replying to your other mail here: > > macvtap *with* mq support would be used with mq devices - you > > open multiple queues on the macvtap device depending on the > > number of queues for the physical device. 
So since this is an > > unlikely case (as can be seen in the patch, and I guess I > > should add another "likely" to the "if (tap)" check since fd's > > should not be closed), I guess a simple % can be used. Does > > the following sound reasonable? > > > > 1. Use % to find the slot. > It is slower than the method used by get_rps_cpu(). Yes, but it should be an unlikely case. Is it still important, considering you add a lot of baggage for a rare case? So next patch has these two changes, unless contradicted :) 1. Change skb_get_rxhash() to set if not cached already. 2. Use % for macvtap unlikely case. Thanks, - KK -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff -ruNp org/include/linux/skbuff.h new/include/linux/skbuff.h --- org/include/linux/skbuff.h 2010-08-04 09:07:05.000000000 +0530 +++ new/include/linux/skbuff.h 2010-08-04 16:19:30.000000000 +0530 @@ -558,6 +558,17 @@ extern unsigned int skb_find_text(stru unsigned int to, struct ts_config *config, struct ts_state *state); +extern __u32 __skb_get_rxhash(struct sk_buff *skb); +static inline __u32 skb_get_rxhash(struct sk_buff *skb) +{ + __u32 rxhash = skb->rxhash; + + if (!rxhash) + rxhash = __skb_get_rxhash(skb); + + return rxhash; +} + #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { diff -ruNp org/net/core/dev.c new/net/core/dev.c --- org/net/core/dev.c 2010-08-04 09:07:05.000000000 +0530 +++ new/net/core/dev.c 2010-08-04 16:19:30.000000000 +0530 @@ -2259,69 +2259,41 @@ static inline void ____napi_schedule(str __raise_softirq_irqoff(NET_RX_SOFTIRQ); } -#ifdef CONFIG_RPS - -/* One global table that all flow-based protocols share. */ -struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; -EXPORT_SYMBOL(rps_sock_flow_table); - /* - * get_rps_cpu is called from netif_receive_skb and returns the target - * CPU from the RPS map of the receiving queue for a given skb. - * rcu_read_lock must be held on entry. + * __skb_get_rxhash: calculate a flow hash based on src/dst addresses + * and src/dst port numbers. On success, returns a non-zero hash number, + * otherwise 0. 
*/ -static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, - struct rps_dev_flow **rflowp) +__u32 __skb_get_rxhash(struct sk_buff *skb) { + int nhoff, hash = 0; struct ipv6hdr *ip6; struct iphdr *ip; - struct netdev_rx_queue *rxqueue; - struct rps_map *map; - struct rps_dev_flow_table *flow_table; - struct rps_sock_flow_table *sock_flow_table; - int cpu = -1; u8 ip_proto; - u16 tcpu; u32 addr1, addr2, ihl; union { u32 v32; u16 v16[2]; } ports; - if (skb_rx_queue_recorded(skb)) { - u16 index = skb_get_rx_queue(skb); - if (unlikely(index >= dev->num_rx_queues)) { - WARN_ONCE(dev->num_rx_queues > 1, "%s received packet " - "on queue %u, but number of RX queues is %u\n", - dev->name, index, dev->num_rx_queues); - goto done; - } - rxqueue = dev->_rx + index; - } else - rxqueue = dev->_rx; - - if (!rxqueue->rps_map && !rxqueue->rps_flow_table) - goto done; - - if (skb->rxhash) - goto got_hash; /* Skip hash computation on packet header */ + nhoff = skb_network_offset(skb); switch (skb->protocol) { case __constant_htons(ETH_P_IP): - if (!pskb_may_pull(skb, sizeof(*ip))) + if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) goto done; - ip = (struct iphdr *) skb->data; + ip = (struct iphdr *) skb->data + nhoff; ip_proto = ip->protocol; addr1 = (__force u32) ip->saddr; addr2 = (__force u32) ip->daddr; ihl = ip->ihl; break; case __constant_htons(ETH_P_IPV6): - if (!pskb_may_pull(skb, sizeof(*ip6))) + if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) goto done; - ip6 = (struct ipv6hdr *) skb->data; + ip6 = (struct ipv6hdr *) skb->data + nhoff; ip_proto = ip6->nexthdr; addr1 = (__force u32) ip6->saddr.s6_addr32[3]; addr2 = (__force u32) ip6->daddr.s6_addr32[3]; @@ -2330,6 +2302,7 @@ static int get_rps_cpu(struct net_device default: goto done; } + switch (ip_proto) { case IPPROTO_TCP: case IPPROTO_UDP: @@ -2338,8 +2311,9 @@ static int get_rps_cpu(struct net_device case IPPROTO_AH: case IPPROTO_SCTP: case IPPROTO_UDPLITE: - if (pskb_may_pull(skb, (ihl * 4) + 4)) { - ports.v32 
= * (__force u32 *) (skb->data + (ihl * 4)); + if (pskb_may_pull(skb, (ihl * 4) + 4 + nhoff)) { + ports.v32 = * (__force u32 *) (skb->data + nhoff + + (ihl * 4)); if (ports.v16[1] < ports.v16[0]) swap(ports.v16[0], ports.v16[1]); break; @@ -2352,11 +2326,56 @@ static int get_rps_cpu(struct net_device /* get a consistent hash (same value on both flow directions) */ if (addr2 < addr1) swap(addr1, addr2); - skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd); - if (!skb->rxhash) - skb->rxhash = 1; -got_hash: + hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); + if (!hash) + hash = 1; + +done: + return hash; +} +EXPORT_SYMBOL(__skb_get_rxhash); + +#ifdef CONFIG_RPS + +/* One global table that all flow-based protocols share. */ +struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; +EXPORT_SYMBOL(rps_sock_flow_table); + +/* + * get_rps_cpu is called from netif_receive_skb and returns the target + * CPU from the RPS map of the receiving queue for a given skb. + * rcu_read_lock must be held on entry. + */ +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow **rflowp) +{ + struct netdev_rx_queue *rxqueue; + struct rps_map *map; + struct rps_dev_flow_table *flow_table; + struct rps_sock_flow_table *sock_flow_table; + int cpu = -1; + u16 tcpu; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + if (unlikely(index >= dev->num_rx_queues)) { + WARN_ONCE(dev->num_rx_queues > 1, "%s received packet " + "on queue %u, but number of RX queues is %u\n", + dev->name, index, dev->num_rx_queues); + goto done; + } + rxqueue = dev->_rx + index; + } else + rxqueue = dev->_rx; + + if (!rxqueue->rps_map && !rxqueue->rps_flow_table) + goto done; + + skb->rxhash = skb_get_rxhash(skb); + if (skb->rxhash < 0) + goto done; + flow_table = rcu_dereference(rxqueue->rps_flow_table); sock_flow_table = rcu_dereference(rps_sock_flow_table); if (flow_table && sock_flow_table) {