Message ID | 152699745846.21931.4558451708304709296.stgit@alrua-kau |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
Series | sched: Add Common Applications Kept Enhanced (cake) qdisc | expand |
Hi Toke, On Tue, May 22, 2018 at 03:57:38PM +0200, Toke Høiland-Jørgensen wrote: > When CAKE is deployed on a gateway that also performs NAT (which is a > common deployment mode), the host fairness mechanism cannot distinguish > internal hosts from each other, and so fails to work correctly. > > To fix this, we add an optional NAT awareness mode, which will query the > kernel conntrack mechanism to obtain the pre-NAT addresses for each packet > and use that in the flow and host hashing. > > When the shaper is enabled and the host is already performing NAT, the cost > of this lookup is negligible. However, in unlimited mode with no NAT being > performed, there is a significant CPU cost at higher bandwidths. For this > reason, the feature is turned off by default. > > Cc: netfilter-devel@vger.kernel.org > Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk> > --- > net/sched/sch_cake.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 79 insertions(+) > > diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c > index 68ac908470f1..6f7cae705c84 100644 > --- a/net/sched/sch_cake.c > +++ b/net/sched/sch_cake.c > @@ -71,6 +71,12 @@ > #include <net/tcp.h> > #include <net/flow_dissector.h> > > +#if IS_REACHABLE(CONFIG_NF_CONNTRACK) > +#include <net/netfilter/nf_conntrack_core.h> > +#include <net/netfilter/nf_conntrack_zones.h> > +#include <net/netfilter/nf_conntrack.h> > +#endif > + > #define CAKE_SET_WAYS (8) > #define CAKE_MAX_TINS (8) > #define CAKE_QUEUES (1024) > @@ -516,6 +522,60 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, > return drop; > } > > +#if IS_REACHABLE(CONFIG_NF_CONNTRACK) > + > +static void cake_update_flowkeys(struct flow_keys *keys, > + const struct sk_buff *skb) > +{ > + const struct nf_conntrack_tuple *tuple; > + enum ip_conntrack_info ctinfo; > + struct nf_conn *ct; > + bool rev = false; > + > + if (tc_skb_protocol(skb) != htons(ETH_P_IP)) > + return; > + > + ct = nf_ct_get(skb, &ctinfo); > + if (ct) { > + 
tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); > + } else { > + const struct nf_conntrack_tuple_hash *hash; > + struct nf_conntrack_tuple srctuple; > + > + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), > + NFPROTO_IPV4, dev_net(skb->dev), > + &srctuple)) > + return; > + > + hash = nf_conntrack_find_get(dev_net(skb->dev), > + &nf_ct_zone_dflt, > + &srctuple); > + if (!hash) > + return; > + > + rev = true; > + ct = nf_ct_tuplehash_to_ctrack(hash); > + tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); > + } > + > + keys->addrs.v4addrs.src = rev ? tuple->dst.u3.ip : tuple->src.u3.ip; > + keys->addrs.v4addrs.dst = rev ? tuple->src.u3.ip : tuple->dst.u3.ip; > + > + if (keys->ports.ports) { > + keys->ports.src = rev ? tuple->dst.u.all : tuple->src.u.all; > + keys->ports.dst = rev ? tuple->src.u.all : tuple->dst.u.all; > + } > + if (rev) > + nf_ct_put(ct); > +} This is going to pull in the nf_conntrack module, even if you may not want it, as soon as cake is in place.
Pablo Neira Ayuso <pablo@netfilter.org> writes: > Hi Toke, > > On Tue, May 22, 2018 at 03:57:38PM +0200, Toke Høiland-Jørgensen wrote: >> When CAKE is deployed on a gateway that also performs NAT (which is a >> common deployment mode), the host fairness mechanism cannot distinguish >> internal hosts from each other, and so fails to work correctly. >> >> To fix this, we add an optional NAT awareness mode, which will query the >> kernel conntrack mechanism to obtain the pre-NAT addresses for each packet >> and use that in the flow and host hashing. >> >> When the shaper is enabled and the host is already performing NAT, the cost >> of this lookup is negligible. However, in unlimited mode with no NAT being >> performed, there is a significant CPU cost at higher bandwidths. For this >> reason, the feature is turned off by default. >> >> Cc: netfilter-devel@vger.kernel.org >> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk> >> --- >> net/sched/sch_cake.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 79 insertions(+) >> >> diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c >> index 68ac908470f1..6f7cae705c84 100644 >> --- a/net/sched/sch_cake.c >> +++ b/net/sched/sch_cake.c >> @@ -71,6 +71,12 @@ >> #include <net/tcp.h> >> #include <net/flow_dissector.h> >> >> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK) >> +#include <net/netfilter/nf_conntrack_core.h> >> +#include <net/netfilter/nf_conntrack_zones.h> >> +#include <net/netfilter/nf_conntrack.h> >> +#endif >> + >> #define CAKE_SET_WAYS (8) >> #define CAKE_MAX_TINS (8) >> #define CAKE_QUEUES (1024) >> @@ -516,6 +522,60 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, >> return drop; >> } >> >> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK) >> + >> +static void cake_update_flowkeys(struct flow_keys *keys, >> + const struct sk_buff *skb) >> +{ >> + const struct nf_conntrack_tuple *tuple; >> + enum ip_conntrack_info ctinfo; >> + struct nf_conn *ct; >> + bool rev = false; >> + >> + if 
(tc_skb_protocol(skb) != htons(ETH_P_IP)) >> + return; >> + >> + ct = nf_ct_get(skb, &ctinfo); >> + if (ct) { >> + tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); >> + } else { >> + const struct nf_conntrack_tuple_hash *hash; >> + struct nf_conntrack_tuple srctuple; >> + >> + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), >> + NFPROTO_IPV4, dev_net(skb->dev), >> + &srctuple)) >> + return; >> + >> + hash = nf_conntrack_find_get(dev_net(skb->dev), >> + &nf_ct_zone_dflt, >> + &srctuple); >> + if (!hash) >> + return; >> + >> + rev = true; >> + ct = nf_ct_tuplehash_to_ctrack(hash); >> + tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); >> + } >> + >> + keys->addrs.v4addrs.src = rev ? tuple->dst.u3.ip : tuple->src.u3.ip; >> + keys->addrs.v4addrs.dst = rev ? tuple->src.u3.ip : tuple->dst.u3.ip; >> + >> + if (keys->ports.ports) { >> + keys->ports.src = rev ? tuple->dst.u.all : tuple->src.u.all; >> + keys->ports.dst = rev ? tuple->src.u.all : tuple->dst.u.all; >> + } >> + if (rev) >> + nf_ct_put(ct); >> +} > > This is going to pull in the nf_conntrack module, even if you may not > want it, as soon as cake is in place. Yeah, we are aware of that; we get a moddep on nf_conntrack. Our main deployment scenario has been home routers where conntrack is used anyway, so this has not been much of an issue. However, if there is a way to avoid this, and instead detect at runtime if conntrack is available, that would certainly be useful. Is there? :) -Toke
From: Toke Høiland-Jørgensen <toke@toke.dk> Date: Tue, 22 May 2018 15:57:38 +0200 > When CAKE is deployed on a gateway that also performs NAT (which is a > common deployment mode), the host fairness mechanism cannot distinguish > internal hosts from each other, and so fails to work correctly. > > To fix this, we add an optional NAT awareness mode, which will query the > kernel conntrack mechanism to obtain the pre-NAT addresses for each packet > and use that in the flow and host hashing. > > When the shaper is enabled and the host is already performing NAT, the cost > of this lookup is negligible. However, in unlimited mode with no NAT being > performed, there is a significant CPU cost at higher bandwidths. For this > reason, the feature is turned off by default. > > Cc: netfilter-devel@vger.kernel.org > Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk> This is really pushing the limits of what a packet scheduler can require for correct operation. And this creates an incredibly ugly dependency. I'd much rather you do something NAT method agnostic, like save or compute the necessary information on ingress and then later use it on egress. Because what you have here will completely break when someone does NAT using eBPF, act_nat, or similar. There is even skb->rxhash, be creative :-)
> On 23 May, 2018, at 9:44 pm, David Miller <davem@davemloft.net> wrote: > > I'd much rather you do something NAT method agnostic, like save > or compute the necessary information on ingress and then later > use it on egress. We were under the impression that conntrack was the cleanest and most correct way to convey this information between qdiscs. Frankly it's difficult to see how else we could do it without major complications. Remember that it takes two different qdiscs to implement ingress and egress on the same physical interface, and there's no obvious logical link between them - especially since the ingress one has to be attached to an ifb, not to the actual interface, because there's no native support for ingress qdiscs. What's more, there's no information (besides conntrack) at ingress about the "inside" address of NATted traffic. There might be some residual information for egress traffic, but communicating that to the ingress side feels very much like we need to reimplement something very like conntrack. If not supporting "alternative" NAT mechanisms that don't register their data in conntrack is the penalty, it's one I personally can live with. - Jonathan Morton
From: Jonathan Morton <chromatix99@gmail.com> Date: Wed, 23 May 2018 22:31:53 +0300 > Remember that it takes two different qdiscs to implement ingress and > egress on the same physical interface, and there's no obvious > logical link between them - especially since the ingress one has to > be attached to an ifb, not to the actual interface, because there's > no native support for ingress qdiscs. Who said anything about using an ingress qdisc to record/remember this information?
> On 23 May, 2018, at 11:04 pm, David Miller <davem@davemloft.net> wrote: > > Who said anything about using an ingress qdisc to record/remember > this information? Now I'm *really* confused. Are you saying that the user has to set up their own conntrack mechanism using extra userspace commands? Because complicating the setup process that way runs directly counter to Cake's design philosophy. - Jonathan Morton
David Miller <davem@davemloft.net> writes: > From: Toke Høiland-Jørgensen <toke@toke.dk> > Date: Tue, 22 May 2018 15:57:38 +0200 > >> When CAKE is deployed on a gateway that also performs NAT (which is a >> common deployment mode), the host fairness mechanism cannot distinguish >> internal hosts from each other, and so fails to work correctly. >> >> To fix this, we add an optional NAT awareness mode, which will query the >> kernel conntrack mechanism to obtain the pre-NAT addresses for each packet >> and use that in the flow and host hashing. >> >> When the shaper is enabled and the host is already performing NAT, the cost >> of this lookup is negligible. However, in unlimited mode with no NAT being >> performed, there is a significant CPU cost at higher bandwidths. For this >> reason, the feature is turned off by default. >> >> Cc: netfilter-devel@vger.kernel.org >> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk> > > This is really pushing the limits of what a packet scheduler can > require for correct operation. Well, Cake is all about pushing the limits of what a packet scheduler can do... ;) > And this creates an incredibly ugly dependency. Yeah, I do agree with that, and I'd love to get rid of it. I even tried prototyping what it would take to lookup the symbols at runtime using kallsyms. It wasn't exactly prettier; pushed it here in case anyone wants to recoil in horror (completely untested, just got it to the point where the module compiles with no nf_* symbols according to objdump): https://github.com/dtaht/sch_cake/commit/97270a10dcea236d137f5113aaeb4303098ab3f3 > I'd much rather you do something NAT method agnostic, like save or > compute the necessary information on ingress and then later use it on > egress. How would this work? 
We would have to add some kind of global state shared between all instances of the qdisc, and maintain state for all flows we see going through there, effectively duplicating conntrack, and also requiring people to run Cake on all interfaces? How is that better? > Because what you have here will completely break when someone does NAT > using eBPF, act_nat, or similar. > > There is even skb->rxhash, be creative :-) This is not actually about improving hashing; the post-NAT information is fine for that. It's about making sure the per-host fairness works when NATing, so we can distribute bandwidth between the hosts on the local LAN regardless of how many flows they open. This is one of the "killer features" of Cake - it was the top requested feature until we implemented it. So it would be a shame to drop it. Since act_nat is a 1-to-1 mapping I don't think we would have any loss of functionality with that. For eBPF, well, obviously all bets are off as far as reusing any state. But it's not unreasonable to expect people who do NAT in eBPF to also set skb->tc_classid if they want pre-nat host fairness, is it? Which means that the only remaining issue is the module dependency. Can we live with that (noting that it'll go away if conntrack is configured out of the kernel entirely)? Or is the kallsyms approach a viable way forward? I guess we could add a kconfig option that toggles between that and native calls, so that we'd at least get a compile error on suitably configured kernels if the API changes... -Toke
From: Jonathan Morton <chromatix99@gmail.com> Date: Wed, 23 May 2018 23:33:04 +0300 > Now I'm *really* confused. > > Are you saying that the user has to set up their own conntrack > mechanism using extra userspace commands? Because complicating the > setup process that way runs directly counter to Cake's design > philosophy. I don't mean anything filtering- or firewall-related. We have a full flow dissector in the networking core, which often runs on every RX packet anyway. Record what we need and use it on egress after NAT has occurred.
From: Toke Høiland-Jørgensen <toke@toke.dk> Date: Wed, 23 May 2018 22:38:30 +0200 > How would this work? On ingress the core networking flow dissector records what you need somewhere in the SKB or wherever. You later retrieve it at egress time after NAT has occurred. > It's about making sure the per-host fairness works when NATing, so > we can distribute bandwidth between the hosts on the local LAN > regardless of how many flows they open. Ok, understood. > But it's not unreasonable to expect people who do NAT in eBPF to > also set skb->tc_classid if they want pre-nat host fairness, is it? And core networking can do it as well. Please remove this conntrack dependency, I don't think it is necessary and it is very short-sighted.
David Miller <davem@davemloft.net> writes: > From: Toke Høiland-Jørgensen <toke@toke.dk> > Date: Wed, 23 May 2018 22:38:30 +0200 > >> How would this work? > > On egress the core networking flow dissector records what you need > somewhere in SKB or wherever. You later retrieve it at egress time > after NAT has occurred. Ah, right, that could work. Is there any particular field in sk_buff we should stomp on for this purpose, or would you prefer a new one? Looking through it, the only obvious one that comes to mind is, well, skb->_nfct :) If we wanted to avoid bloating sk_buff, we could add a union with that, fill it in the flow dissector, and just let conntrack overwrite it if active; then detect which is which in Cake, and read the data we need from _nfct if conntrack is active, and from what the flow dissector stored otherwise. Is that too many hoops to jump through to avoid adding an extra field? -Toke
From: Toke Høiland-Jørgensen <toke@toke.dk> Date: Wed, 23 May 2018 23:05:16 +0200 > Ah, right, that could work. Is there any particular field in sk_buff > we should stomp on for this purpose, or would you prefer a new one? > Looking through it, the only obvious one that comes to mind is, well, > skb->_nfct :) > > If we wanted to avoid bloating sk_buff, we could add a union with that, > fill it in the flow dissector, and just let conntrack overwrite it if > active; then detect which is which in Cake, and read the data we need > from _nfct if conntrack is active, and from what the flow dissector > stored otherwise. > > Is that too many hoops to jump through to avoid adding an extra field? Space is precious in sk_buff, so yes avoid adding new members at all costs. How much info do you need exactly?
David Miller <davem@davemloft.net> writes: > From: Toke Høiland-Jørgensen <toke@toke.dk> > Date: Wed, 23 May 2018 23:05:16 +0200 > >> Ah, right, that could work. Is there any particular field in sk_buff >> we should stomp on for this purpose, or would you prefer a new one? >> Looking through it, the only obvious one that comes to mind is, well, >> skb->_nfct :) >> >> If we wanted to avoid bloating sk_buff, we could add a union with that, >> fill it in the flow dissector, and just let conntrack overwrite it if >> active; then detect which is which in Cake, and read the data we need >> from _nfct if conntrack is active, and from what the flow dissector >> stored otherwise. >> >> Is that too many hoops to jump through to avoid adding an extra field? > > Space is precious in sk_buff, so yes avoid adding new members at all > costs. > > How much info do you need exactly? We use a u32 hash (from flow_hash_from_keys()) on the source address. Ideally we'd want that; but we could get away with less if we are willing to accept more hash collisions; we just need to map the source address into a hash bucket. We currently have 1024 of those, so 10 bits would suffice if we just drop the set-associative hashing for source hosts. Or maybe 16 bits to be on the safe side? It really is a pretty straight-forward tradeoff between space and collision probability. Hmm, and we still have an issue with ingress filtering (where cake is running on an ifb interface). That runs pre-NAT in the conntrack case, and we can't do the RX trick. Here we do the lookup manually in conntrack (and this part is actually what brings in most of the dependencies). Any neat tricks up your sleeve for this case? :) -Toke
On Tue, May 22, 2018 at 04:11:06PM +0200, Toke Høiland-Jørgensen wrote: > Pablo Neira Ayuso <pablo@netfilter.org> writes: > > > Hi Toke, > > > > On Tue, May 22, 2018 at 03:57:38PM +0200, Toke Høiland-Jørgensen wrote: > >> When CAKE is deployed on a gateway that also performs NAT (which is a > >> common deployment mode), the host fairness mechanism cannot distinguish > >> internal hosts from each other, and so fails to work correctly. > >> > >> To fix this, we add an optional NAT awareness mode, which will query the > >> kernel conntrack mechanism to obtain the pre-NAT addresses for each packet > >> and use that in the flow and host hashing. > >> > >> When the shaper is enabled and the host is already performing NAT, the cost > >> of this lookup is negligible. However, in unlimited mode with no NAT being > >> performed, there is a significant CPU cost at higher bandwidths. For this > >> reason, the feature is turned off by default. > >> > >> Cc: netfilter-devel@vger.kernel.org > >> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk> > >> --- > >> net/sched/sch_cake.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++ > >> 1 file changed, 79 insertions(+) > >> > >> diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c > >> index 68ac908470f1..6f7cae705c84 100644 > >> --- a/net/sched/sch_cake.c > >> +++ b/net/sched/sch_cake.c > >> @@ -71,6 +71,12 @@ > >> #include <net/tcp.h> > >> #include <net/flow_dissector.h> > >> > >> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK) > >> +#include <net/netfilter/nf_conntrack_core.h> > >> +#include <net/netfilter/nf_conntrack_zones.h> > >> +#include <net/netfilter/nf_conntrack.h> > >> +#endif > >> + > >> #define CAKE_SET_WAYS (8) > >> #define CAKE_MAX_TINS (8) > >> #define CAKE_QUEUES (1024) > >> @@ -516,6 +522,60 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, > >> return drop; > >> } > >> > >> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK) > >> + > >> +static void cake_update_flowkeys(struct flow_keys *keys, > >> + const struct 
sk_buff *skb) > >> +{ > >> + const struct nf_conntrack_tuple *tuple; > >> + enum ip_conntrack_info ctinfo; > >> + struct nf_conn *ct; > >> + bool rev = false; > >> + > >> + if (tc_skb_protocol(skb) != htons(ETH_P_IP)) > >> + return; > >> + > >> + ct = nf_ct_get(skb, &ctinfo); > >> + if (ct) { > >> + tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); > >> + } else { > >> + const struct nf_conntrack_tuple_hash *hash; > >> + struct nf_conntrack_tuple srctuple; > >> + > >> + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), > >> + NFPROTO_IPV4, dev_net(skb->dev), > >> + &srctuple)) > >> + return; > >> + > >> + hash = nf_conntrack_find_get(dev_net(skb->dev), > >> + &nf_ct_zone_dflt, > >> + &srctuple); > >> + if (!hash) > >> + return; > >> + > >> + rev = true; > >> + ct = nf_ct_tuplehash_to_ctrack(hash); > >> + tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); > >> + } > >> + > >> + keys->addrs.v4addrs.src = rev ? tuple->dst.u3.ip : tuple->src.u3.ip; > >> + keys->addrs.v4addrs.dst = rev ? tuple->src.u3.ip : tuple->dst.u3.ip; > >> + > >> + if (keys->ports.ports) { > >> + keys->ports.src = rev ? tuple->dst.u.all : tuple->src.u.all; > >> + keys->ports.dst = rev ? tuple->src.u.all : tuple->dst.u.all; > >> + } > >> + if (rev) > >> + nf_ct_put(ct); > >> +} > > > > This is going to pull in the nf_conntrack module, even if you may not > > want it, as soon as cake is in place. > > Yeah, we are aware of that; we get a moddep on nf_conntrack. Our main > deployment scenario has been home routers where conntrack is used > anyway, so this has not been much of an issue. However, if there is a > way to avoid this, and instead detect at runtime if conntrack is > available, that would certainly be useful. Is there? :) Yes, there is. You place this function in net/netfilter/nf_conntrack_core.c, call it nf_conntrack_get_tuple() which internally uses a rcu hook for this. See nf_ct_attach() and ip_ct_attach() in net/netfilter/core.c for instance. 
This allows you to avoid the dependency with nf_conntrack (which would be only called if the module has been explicitly loaded), which is what you're searching for.
Pablo Neira Ayuso <pablo@netfilter.org> writes: > On Tue, May 22, 2018 at 04:11:06PM +0200, Toke Høiland-Jørgensen wrote: >> Pablo Neira Ayuso <pablo@netfilter.org> writes: >> >> > Hi Toke, >> > >> > On Tue, May 22, 2018 at 03:57:38PM +0200, Toke Høiland-Jørgensen wrote: >> >> When CAKE is deployed on a gateway that also performs NAT (which is a >> >> common deployment mode), the host fairness mechanism cannot distinguish >> >> internal hosts from each other, and so fails to work correctly. >> >> >> >> To fix this, we add an optional NAT awareness mode, which will query the >> >> kernel conntrack mechanism to obtain the pre-NAT addresses for each packet >> >> and use that in the flow and host hashing. >> >> >> >> When the shaper is enabled and the host is already performing NAT, the cost >> >> of this lookup is negligible. However, in unlimited mode with no NAT being >> >> performed, there is a significant CPU cost at higher bandwidths. For this >> >> reason, the feature is turned off by default. 
>> >> >> >> Cc: netfilter-devel@vger.kernel.org >> >> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk> >> >> --- >> >> net/sched/sch_cake.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++ >> >> 1 file changed, 79 insertions(+) >> >> >> >> diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c >> >> index 68ac908470f1..6f7cae705c84 100644 >> >> --- a/net/sched/sch_cake.c >> >> +++ b/net/sched/sch_cake.c >> >> @@ -71,6 +71,12 @@ >> >> #include <net/tcp.h> >> >> #include <net/flow_dissector.h> >> >> >> >> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK) >> >> +#include <net/netfilter/nf_conntrack_core.h> >> >> +#include <net/netfilter/nf_conntrack_zones.h> >> >> +#include <net/netfilter/nf_conntrack.h> >> >> +#endif >> >> + >> >> #define CAKE_SET_WAYS (8) >> >> #define CAKE_MAX_TINS (8) >> >> #define CAKE_QUEUES (1024) >> >> @@ -516,6 +522,60 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, >> >> return drop; >> >> } >> >> >> >> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK) >> >> + >> >> +static void cake_update_flowkeys(struct flow_keys *keys, >> >> + const struct sk_buff *skb) >> >> +{ >> >> + const struct nf_conntrack_tuple *tuple; >> >> + enum ip_conntrack_info ctinfo; >> >> + struct nf_conn *ct; >> >> + bool rev = false; >> >> + >> >> + if (tc_skb_protocol(skb) != htons(ETH_P_IP)) >> >> + return; >> >> + >> >> + ct = nf_ct_get(skb, &ctinfo); >> >> + if (ct) { >> >> + tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); >> >> + } else { >> >> + const struct nf_conntrack_tuple_hash *hash; >> >> + struct nf_conntrack_tuple srctuple; >> >> + >> >> + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), >> >> + NFPROTO_IPV4, dev_net(skb->dev), >> >> + &srctuple)) >> >> + return; >> >> + >> >> + hash = nf_conntrack_find_get(dev_net(skb->dev), >> >> + &nf_ct_zone_dflt, >> >> + &srctuple); >> >> + if (!hash) >> >> + return; >> >> + >> >> + rev = true; >> >> + ct = nf_ct_tuplehash_to_ctrack(hash); >> >> + tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); >> >> + } >> >> + >> 
>> + keys->addrs.v4addrs.src = rev ? tuple->dst.u3.ip : tuple->src.u3.ip; >> >> + keys->addrs.v4addrs.dst = rev ? tuple->src.u3.ip : tuple->dst.u3.ip; >> >> + >> >> + if (keys->ports.ports) { >> >> + keys->ports.src = rev ? tuple->dst.u.all : tuple->src.u.all; >> >> + keys->ports.dst = rev ? tuple->src.u.all : tuple->dst.u.all; >> >> + } >> >> + if (rev) >> >> + nf_ct_put(ct); >> >> +} >> > >> > This is going to pull in the nf_conntrack module, even if you may not >> > want it, as soon as cake is in place. >> >> Yeah, we are aware of that; we get a moddep on nf_conntrack. Our main >> deployment scenario has been home routers where conntrack is used >> anyway, so this has not been much of an issue. However, if there is a >> way to avoid this, and instead detect at runtime if conntrack is >> available, that would certainly be useful. Is there? :) > > Yes, there is. > > You place this function in net/netfilter/nf_conntrack_core.c, call it > nf_conntrack_get_tuple() which internally uses a rcu hook for this. > See nf_ct_attach() and ip_ct_attach() in net/netfilter/core.c for > instance. > > This allows you to avoid the dependency with nf_conntrack (which would > be only called if the module has been explicitly loaded), which is > what you're searching for. Ah, awesome! I'll look into that; thanks :) -Toke
> On 23 May 2018, at 23:40, Toke Høiland-Jørgensen <toke@toke.dk> wrote: > <snip> > > Hmm, and we still have an issue with ingress filtering (where cake is > running on an ifb interface). That runs pre-NAT in the conntrack case, > and we can't do the RX trick. Here we do the lookup manually in > conntrack (and this part is actually what brings in most of the > dependencies). Any neat tricks up your sleeve for this case? :) I wonder here if our terminology with ‘ingress’ is causing confusion. For avoidance of doubt: Typical use case of cake on LAN/WAN router requires two instances. One instance (the egress) is on the WAN interface itself. It is post conntrack and hence uses skb->nfct to work out the real pre-nat source address of the LAN hosts. Since we cannot apply this qdisc to the ingress of our WAN interface we use an IFB to mirror the ingress packets, and then use a cake instance on the ifb interface on its egress path to in essence control the ingress traffic. Cake has two modes, the normal ‘egress’ mode which is designed to be used when controlling egress traffic output, and shapes post any dropped packets. ‘ingress’ mode is designed to be used on the egress of our ingress IFB, where the shaper counts all packets used (well they got here!) even if we decide to drop them a bit later. The ifb positioned cake has the additional fun factor that the conntrack field hasn’t yet been filled in, so the qdisc has to go looking in the conntrack tables itself to see if any NATting has taken place and balance LAN host fairness based on that. As far as I understand it, the flow dissector doesn’t obviously help with working out the pre-NAT addressing as the flow has already been mangled in the egress case, and is awaiting mangling on the ingress case. Kevin
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 68ac908470f1..6f7cae705c84 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -71,6 +71,12 @@ #include <net/tcp.h> #include <net/flow_dissector.h> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack.h> +#endif + #define CAKE_SET_WAYS (8) #define CAKE_MAX_TINS (8) #define CAKE_QUEUES (1024) @@ -516,6 +522,60 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, return drop; } +#if IS_REACHABLE(CONFIG_NF_CONNTRACK) + +static void cake_update_flowkeys(struct flow_keys *keys, + const struct sk_buff *skb) +{ + const struct nf_conntrack_tuple *tuple; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + bool rev = false; + + if (tc_skb_protocol(skb) != htons(ETH_P_IP)) + return; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); + } else { + const struct nf_conntrack_tuple_hash *hash; + struct nf_conntrack_tuple srctuple; + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), + NFPROTO_IPV4, dev_net(skb->dev), + &srctuple)) + return; + + hash = nf_conntrack_find_get(dev_net(skb->dev), + &nf_ct_zone_dflt, + &srctuple); + if (!hash) + return; + + rev = true; + ct = nf_ct_tuplehash_to_ctrack(hash); + tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); + } + + keys->addrs.v4addrs.src = rev ? tuple->dst.u3.ip : tuple->src.u3.ip; + keys->addrs.v4addrs.dst = rev ? tuple->src.u3.ip : tuple->dst.u3.ip; + + if (keys->ports.ports) { + keys->ports.src = rev ? tuple->dst.u.all : tuple->src.u.all; + keys->ports.dst = rev ? tuple->src.u.all : tuple->dst.u.all; + } + if (rev) + nf_ct_put(ct); +} +#else +static void cake_update_flowkeys(struct flow_keys *keys, + const struct sk_buff *skb) +{ + /* There is nothing we can do here without CONNTRACK */ +} +#endif + /* Cake has several subtle multiple bit settings. 
In these cases you * would be matching triple isolate mode as well. */ @@ -543,6 +603,9 @@ static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + if (flow_mode & CAKE_FLOW_NAT_FLAG) + cake_update_flowkeys(&keys, skb); + /* flow_hash_from_keys() sorts the addresses by value, so we have * to preserve their order in a separate data structure to treat * src and dst host addresses as independently selectable. @@ -1919,6 +1982,18 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) return err; + if (tb[TCA_CAKE_NAT]) { +#if IS_REACHABLE(CONFIG_NF_CONNTRACK) + q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; + q->flow_mode |= CAKE_FLOW_NAT_FLAG * + !!nla_get_u32(tb[TCA_CAKE_NAT]); +#else + NL_SET_ERR_MSG_ATTR(extack, "No conntrack support in kernel", + tb[TCA_CAKE_NAT]); + return -EOPNOTSUPP; +#endif + } + if (tb[TCA_CAKE_BASE_RATE64]) q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); @@ -2091,6 +2166,10 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CAKE_NAT, + !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure:
When CAKE is deployed on a gateway that also performs NAT (which is a common deployment mode), the host fairness mechanism cannot distinguish internal hosts from each other, and so fails to work correctly. To fix this, we add an optional NAT awareness mode, which will query the kernel conntrack mechanism to obtain the pre-NAT addresses for each packet and use that in the flow and host hashing. When the shaper is enabled and the host is already performing NAT, the cost of this lookup is negligible. However, in unlimited mode with no NAT being performed, there is a significant CPU cost at higher bandwidths. For this reason, the feature is turned off by default. Cc: netfilter-devel@vger.kernel.org Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk> --- net/sched/sch_cake.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+)