Message ID | 1583245281-25999-4-git-send-email-paulb@mellanox.com |
---|---|
State | Changes Requested |
Delegated to: | David Miller |
Headers | show |
Series | act_ct: Software offload of conntrack_in | expand |
On 03/03/2020 16:21, Paul Blakey wrote: > Offload nf conntrack processing by looking up the 5-tuple in the > zone's flow table. > > The nf conntrack module will process the packets until a connection is > in established state. Once in established state, the ct state pointer > (nf_conn) will be restored on the skb from a successful ft lookup. > > Signed-off-by: Paul Blakey <paulb@mellanox.com> > Acked-by: Jiri Pirko <jiri@mellanox.com> > --- > Changelog: > v4->v5: > Re-read ip/ip6 header after pulling as skb ptrs may change > Use pskb_network_may_pull instaed of pskb_may_pull > v1->v2: > Add !skip_add curly braces > Removed extra setting thoff again > Check tcp proto outside of tcf_ct_flow_table_check_tcp > > net/sched/act_ct.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++++- > 1 file changed, 160 insertions(+), 2 deletions(-) > Hi Paul, Thanks for making the changes, I have two more questions below, missed these changes on my previous review, sorry about that. > diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c > index 2ab38431..5aff5e7 100644 > --- a/net/sched/act_ct.c > +++ b/net/sched/act_ct.c > @@ -186,6 +186,157 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft, > tcf_ct_flow_table_add(ct_ft, ct, tcp); > } > > +static bool > +tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb, > + struct flow_offload_tuple *tuple) > +{ > + struct flow_ports *ports; > + unsigned int thoff; > + struct iphdr *iph; > + > + if (!pskb_network_may_pull(skb, sizeof(*iph))) > + return false; > + > + iph = ip_hdr(skb); > + thoff = iph->ihl * 4; > + > + if (ip_is_fragment(iph) || > + unlikely(thoff != sizeof(struct iphdr))) > + return false; > + > + if (iph->protocol != IPPROTO_TCP && > + iph->protocol != IPPROTO_UDP) > + return false; > + > + if (iph->ttl <= 1) > + return false; > + > + if (!pskb_network_may_pull(skb, thoff + sizeof(*ports))) > + return false; > + > + iph = ip_hdr(skb); > + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); > + > + tuple->src_v4.s_addr = iph->saddr; > + tuple->dst_v4.s_addr = iph->daddr; > + tuple->src_port = ports->source; > + tuple->dst_port = ports->dest; > + tuple->l3proto = AF_INET; > + tuple->l4proto = iph->protocol; > + > + return true; > +} > + > +static bool > +tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb, > + struct flow_offload_tuple *tuple) > +{ > + struct flow_ports *ports; > + struct ipv6hdr *ip6h; > + unsigned int thoff; > + > + if (!pskb_network_may_pull(skb, sizeof(*ip6h))) > + return false; > + > + ip6h = ipv6_hdr(skb); > + > + if (ip6h->nexthdr != IPPROTO_TCP && > + ip6h->nexthdr != IPPROTO_UDP) > + return false; > + > + if (ip6h->hop_limit <= 1) > + return false; > + > + thoff = sizeof(*ip6h); > + if (!pskb_network_may_pull(skb, thoff + sizeof(*ports))) > + return false; > + > + ip6h = ipv6_hdr(skb); > + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); > + > + tuple->src_v6 = ip6h->saddr; > + tuple->dst_v6 = ip6h->daddr; > + tuple->src_port = ports->source; > + tuple->dst_port = ports->dest; > + tuple->l3proto = AF_INET6; > + tuple->l4proto = ip6h->nexthdr; > + > + return true; > +} > + > +static bool tcf_ct_flow_table_check_tcp(struct flow_offload *flow, > + struct sk_buff *skb, > + unsigned int thoff) > +{ > + struct tcphdr *tcph; > + > + if (!pskb_may_pull(skb, thoff + sizeof(*tcph))) Sorry, I missed this spot in my previous review, but shouldn't this follow the same logic for the pull ? > + return false; > + > + tcph = (void *)(skb_network_header(skb) + thoff); > + if (unlikely(tcph->fin || tcph->rst)) { > + flow_offload_teardown(flow); > + return false; > + } > + > + return true; > +} > + > +static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, > + struct sk_buff *skb, > + u8 family) > +{ > + struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft; > + struct flow_offload_tuple_rhash *tuplehash; > + struct flow_offload_tuple tuple = {}; > + enum ip_conntrack_info ctinfo; > + struct flow_offload *flow; > + struct nf_conn *ct; > + unsigned int thoff; > + int ip_proto; > + u8 dir; > + > + /* Previously seen or loopback */ > + ct = nf_ct_get(skb, &ctinfo); > + if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED) > + return false; > + > + switch (family) { > + case NFPROTO_IPV4: > + if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple)) > + return false; > + break; > + case NFPROTO_IPV6: > + if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple)) > + return false; > + break; > + default: > + return false; > + } > + > + tuplehash = flow_offload_lookup(nf_ft, &tuple); > + if (!tuplehash) > + return false; > + > + dir = tuplehash->tuple.dir; > + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); > + ct = flow->ct; > + > + ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED : > + IP_CT_ESTABLISHED_REPLY; > + > + thoff = ip_hdr(skb)->ihl * 4; > + ip_proto = ip_hdr(skb)->protocol; I'm a bit confused about this part, above you treat the skb based on the family but down here it's always IPv4 ? > + if (ip_proto == IPPROTO_TCP && > + !tcf_ct_flow_table_check_tcp(flow, skb, thoff)) > + return false; > + > + nf_conntrack_get(&ct->ct_general); > + nf_ct_set(skb, ct, ctinfo); > + > + return true; > +} > + > static int tcf_ct_flow_tables_init(void) > { > return rhashtable_init(&zones_ht, &zones_params); > @@ -554,6 +705,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, > struct nf_hook_state state; > int nh_ofs, err, retval; > struct tcf_ct_params *p; > + bool skip_add = false; > struct nf_conn *ct; > u8 family; > > @@ -603,6 +755,11 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, > */ > cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force); > if (!cached) { > + if (!commit && tcf_ct_flow_table_lookup(p, skb, family)) { > + skip_add = true; > + goto do_nat; > + } > + > /* Associate skb with specified zone. */ > if (tmpl) { > ct = nf_ct_get(skb, &ctinfo); > @@ -620,6 +777,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, > goto out_push; > } > > +do_nat: > ct = nf_ct_get(skb, &ctinfo); > if (!ct) > goto out_push; > @@ -637,10 +795,10 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, > * even if the connection is already confirmed. > */ > nf_conntrack_confirm(skb); > + } else if (!skip_add) { > + tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo); > } > > - tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo); > - > out_push: > skb_push_rcsum(skb, nh_ofs); > >
On 3/3/2020 4:30 PM, Nikolay Aleksandrov wrote: > On 03/03/2020 16:21, Paul Blakey wrote: >> Offload nf conntrack processing by looking up the 5-tuple in the >> zone's flow table. >> >> The nf conntrack module will process the packets until a connection is >> in established state. Once in established state, the ct state pointer >> (nf_conn) will be restored on the skb from a successful ft lookup. >> >> Signed-off-by: Paul Blakey <paulb@mellanox.com> >> Acked-by: Jiri Pirko <jiri@mellanox.com> >> --- >> Changelog: >> v4->v5: >> Re-read ip/ip6 header after pulling as skb ptrs may change >> Use pskb_network_may_pull instaed of pskb_may_pull >> v1->v2: >> Add !skip_add curly braces >> Removed extra setting thoff again >> Check tcp proto outside of tcf_ct_flow_table_check_tcp >> >> net/sched/act_ct.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++++- >> 1 file changed, 160 insertions(+), 2 deletions(-) >> > Hi Paul, > Thanks for making the changes, I have two more questions below, missed these changes > on my previous review, sorry about that. > >> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c >> index 2ab38431..5aff5e7 100644 >> --- a/net/sched/act_ct.c >> +++ b/net/sched/act_ct.c >> @@ -186,6 +186,157 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft, >> tcf_ct_flow_table_add(ct_ft, ct, tcp); >> } >> >> +static bool >> +tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb, >> + struct flow_offload_tuple *tuple) >> +{ >> + struct flow_ports *ports; >> + unsigned int thoff; >> + struct iphdr *iph; >> + >> + if (!pskb_network_may_pull(skb, sizeof(*iph))) >> + return false; >> + >> + iph = ip_hdr(skb); >> + thoff = iph->ihl * 4; >> + >> + if (ip_is_fragment(iph) || >> + unlikely(thoff != sizeof(struct iphdr))) >> + return false; >> + >> + if (iph->protocol != IPPROTO_TCP && >> + iph->protocol != IPPROTO_UDP) >> + return false; >> + >> + if (iph->ttl <= 1) >> + return false; >> + >> + if (!pskb_network_may_pull(skb, thoff + sizeof(*ports))) >> + return false; >> + >> + iph = ip_hdr(skb); >> + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); >> + >> + tuple->src_v4.s_addr = iph->saddr; >> + tuple->dst_v4.s_addr = iph->daddr; >> + tuple->src_port = ports->source; >> + tuple->dst_port = ports->dest; >> + tuple->l3proto = AF_INET; >> + tuple->l4proto = iph->protocol; >> + >> + return true; >> +} >> + >> +static bool >> +tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb, >> + struct flow_offload_tuple *tuple) >> +{ >> + struct flow_ports *ports; >> + struct ipv6hdr *ip6h; >> + unsigned int thoff; >> + >> + if (!pskb_network_may_pull(skb, sizeof(*ip6h))) >> + return false; >> + >> + ip6h = ipv6_hdr(skb); >> + >> + if (ip6h->nexthdr != IPPROTO_TCP && >> + ip6h->nexthdr != IPPROTO_UDP) >> + return false; >> + >> + if (ip6h->hop_limit <= 1) >> + return false; >> + >> + thoff = sizeof(*ip6h); >> + if (!pskb_network_may_pull(skb, thoff + sizeof(*ports))) >> + return false; >> + >> + ip6h = ipv6_hdr(skb); >> + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); >> + >> + tuple->src_v6 = ip6h->saddr; >> + tuple->dst_v6 = ip6h->daddr; >> + tuple->src_port = ports->source; >> + tuple->dst_port = ports->dest; >> + tuple->l3proto = AF_INET6; >> + tuple->l4proto = ip6h->nexthdr; >> + >> + return true; >> +} >> + >> +static bool tcf_ct_flow_table_check_tcp(struct flow_offload *flow, >> + struct sk_buff *skb, >> + unsigned int thoff) >> +{ >> + struct tcphdr *tcph; >> + >> + if (!pskb_may_pull(skb, thoff + sizeof(*tcph))) > Sorry, I missed this spot in my previous review, but shouldn't this follow the > same logic for the pull ? Yes, will fix. >> + return false; >> + >> + tcph = (void *)(skb_network_header(skb) + thoff); >> + if (unlikely(tcph->fin || tcph->rst)) { >> + flow_offload_teardown(flow); >> + return false; >> + } >> + >> + return true; >> +} >> + >> +static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, >> + struct sk_buff *skb, >> + u8 family) >> +{ >> + struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft; >> + struct flow_offload_tuple_rhash *tuplehash; >> + struct flow_offload_tuple tuple = {}; >> + enum ip_conntrack_info ctinfo; >> + struct flow_offload *flow; >> + struct nf_conn *ct; >> + unsigned int thoff; >> + int ip_proto; >> + u8 dir; >> + >> + /* Previously seen or loopback */ >> + ct = nf_ct_get(skb, &ctinfo); >> + if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED) >> + return false; >> + >> + switch (family) { >> + case NFPROTO_IPV4: >> + if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple)) >> + return false; >> + break; >> + case NFPROTO_IPV6: >> + if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple)) >> + return false; >> + break; >> + default: >> + return false; >> + } >> + >> + tuplehash = flow_offload_lookup(nf_ft, &tuple); >> + if (!tuplehash) >> + return false; >> + >> + dir = tuplehash->tuple.dir; >> + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); >> + ct = flow->ct; >> + >> + ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED : >> + IP_CT_ESTABLISHED_REPLY; >> + >> + thoff = ip_hdr(skb)->ihl * 4; >> + ip_proto = ip_hdr(skb)->protocol; > I'm a bit confused about this part, above you treat the skb based on the family > but down here it's always IPv4 ? Right,Ill move the tcp check to inside filler funcs. Thanks! > >> + if (ip_proto == IPPROTO_TCP && >> + !tcf_ct_flow_table_check_tcp(flow, skb, thoff)) >> + return false; >> + >> + nf_conntrack_get(&ct->ct_general); >> + nf_ct_set(skb, ct, ctinfo); >> + >> + return true; >> +} >> + >> static int tcf_ct_flow_tables_init(void) >> { >> return rhashtable_init(&zones_ht, &zones_params); >> @@ -554,6 +705,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, >> struct nf_hook_state state; >> int nh_ofs, err, retval; >> struct tcf_ct_params *p; >> + bool skip_add = false; >> struct nf_conn *ct; >> u8 family; >> >> @@ -603,6 +755,11 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, >> */ >> cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force); >> if (!cached) { >> + if (!commit && tcf_ct_flow_table_lookup(p, skb, family)) { >> + skip_add = true; >> + goto do_nat; >> + } >> + >> /* Associate skb with specified zone. */ >> if (tmpl) { >> ct = nf_ct_get(skb, &ctinfo); >> @@ -620,6 +777,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, >> goto out_push; >> } >> >> +do_nat: >> ct = nf_ct_get(skb, &ctinfo); >> if (!ct) >> goto out_push; >> @@ -637,10 +795,10 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, >> * even if the connection is already confirmed. >> */ >> nf_conntrack_confirm(skb); >> + } else if (!skip_add) { >> + tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo); >> } >> >> - tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo); >> - >> out_push: >> skb_push_rcsum(skb, nh_ofs); >> >>
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 2ab38431..5aff5e7 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -186,6 +186,157 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft, tcf_ct_flow_table_add(ct_ft, ct, tcp); } +static bool +tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb, + struct flow_offload_tuple *tuple) +{ + struct flow_ports *ports; + unsigned int thoff; + struct iphdr *iph; + + if (!pskb_network_may_pull(skb, sizeof(*iph))) + return false; + + iph = ip_hdr(skb); + thoff = iph->ihl * 4; + + if (ip_is_fragment(iph) || + unlikely(thoff != sizeof(struct iphdr))) + return false; + + if (iph->protocol != IPPROTO_TCP && + iph->protocol != IPPROTO_UDP) + return false; + + if (iph->ttl <= 1) + return false; + + if (!pskb_network_may_pull(skb, thoff + sizeof(*ports))) + return false; + + iph = ip_hdr(skb); + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + + tuple->src_v4.s_addr = iph->saddr; + tuple->dst_v4.s_addr = iph->daddr; + tuple->src_port = ports->source; + tuple->dst_port = ports->dest; + tuple->l3proto = AF_INET; + tuple->l4proto = iph->protocol; + + return true; +} + +static bool +tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb, + struct flow_offload_tuple *tuple) +{ + struct flow_ports *ports; + struct ipv6hdr *ip6h; + unsigned int thoff; + + if (!pskb_network_may_pull(skb, sizeof(*ip6h))) + return false; + + ip6h = ipv6_hdr(skb); + + if (ip6h->nexthdr != IPPROTO_TCP && + ip6h->nexthdr != IPPROTO_UDP) + return false; + + if (ip6h->hop_limit <= 1) + return false; + + thoff = sizeof(*ip6h); + if (!pskb_network_may_pull(skb, thoff + sizeof(*ports))) + return false; + + ip6h = ipv6_hdr(skb); + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + + tuple->src_v6 = ip6h->saddr; + tuple->dst_v6 = ip6h->daddr; + tuple->src_port = ports->source; + tuple->dst_port = ports->dest; + tuple->l3proto = AF_INET6; + tuple->l4proto = ip6h->nexthdr; + + return true; +} + +static bool tcf_ct_flow_table_check_tcp(struct flow_offload *flow, + struct sk_buff *skb, + unsigned int thoff) +{ + struct tcphdr *tcph; + + if (!pskb_may_pull(skb, thoff + sizeof(*tcph))) + return false; + + tcph = (void *)(skb_network_header(skb) + thoff); + if (unlikely(tcph->fin || tcph->rst)) { + flow_offload_teardown(flow); + return false; + } + + return true; +} + +static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, + struct sk_buff *skb, + u8 family) +{ + struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft; + struct flow_offload_tuple_rhash *tuplehash; + struct flow_offload_tuple tuple = {}; + enum ip_conntrack_info ctinfo; + struct flow_offload *flow; + struct nf_conn *ct; + unsigned int thoff; + int ip_proto; + u8 dir; + + /* Previously seen or loopback */ + ct = nf_ct_get(skb, &ctinfo); + if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED) + return false; + + switch (family) { + case NFPROTO_IPV4: + if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple)) + return false; + break; + case NFPROTO_IPV6: + if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple)) + return false; + break; + default: + return false; + } + + tuplehash = flow_offload_lookup(nf_ft, &tuple); + if (!tuplehash) + return false; + + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + ct = flow->ct; + + ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED : + IP_CT_ESTABLISHED_REPLY; + + thoff = ip_hdr(skb)->ihl * 4; + ip_proto = ip_hdr(skb)->protocol; + if (ip_proto == IPPROTO_TCP && + !tcf_ct_flow_table_check_tcp(flow, skb, thoff)) + return false; + + nf_conntrack_get(&ct->ct_general); + nf_ct_set(skb, ct, ctinfo); + + return true; +} + static int tcf_ct_flow_tables_init(void) { return rhashtable_init(&zones_ht, &zones_params); @@ -554,6 +705,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, struct nf_hook_state state; int nh_ofs, err, retval; struct tcf_ct_params *p; + bool skip_add = false; struct nf_conn *ct; u8 family; @@ -603,6 +755,11 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, */ cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force); if (!cached) { + if (!commit && tcf_ct_flow_table_lookup(p, skb, family)) { + skip_add = true; + goto do_nat; + } + /* Associate skb with specified zone. */ if (tmpl) { ct = nf_ct_get(skb, &ctinfo); @@ -620,6 +777,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, goto out_push; } +do_nat: ct = nf_ct_get(skb, &ctinfo); if (!ct) goto out_push; @@ -637,10 +795,10 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, * even if the connection is already confirmed. */ nf_conntrack_confirm(skb); + } else if (!skip_add) { + tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo); } - tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo); - out_push: skb_push_rcsum(skb, nh_ofs);