Message ID | 1583251072-10396-2-git-send-email-paulb@mellanox.com |
---|---|
State | Superseded |
Delegated to: | David Miller |
Headers | show |
Series | act_ct: Software offload of conntrack_in | expand |
On Tue, Mar 03, 2020 at 05:57:50PM +0200, Paul Blakey wrote: > Use the NF flow tables infrastructure for CT offload. > > Create a nf flow table per zone. > > Next patches will add FT entries to this table, and do > the software offload. > > Signed-off-by: Paul Blakey <paulb@mellanox.com> > Reviewed-by: Jiri Pirko <jiri@mellanox.com> Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> > --- > v4->v5: > Added reviewed by Jiri, thanks! > v3->v4: > Alloc GFP_ATOMIC > v2->v3: > Ditch re-locking to alloc, and use atomic allocation > v1->v2: > Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep) > Free ft on last tc act instance instead of last instance + last offloaded tuple, > this removes cleanup cb and netfilter patches, and is simpler > Removed accidental mlx5/core/en_tc.c change > Removed reviewed by Jiri - patch changed > > include/net/tc_act/tc_ct.h | 2 + > net/sched/Kconfig | 2 +- > net/sched/act_ct.c | 134 ++++++++++++++++++++++++++++++++++++++++++++- > 3 files changed, 136 insertions(+), 2 deletions(-) > > diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h > index a8b1564..cf3492e 100644 > --- a/include/net/tc_act/tc_ct.h > +++ b/include/net/tc_act/tc_ct.h > @@ -25,6 +25,8 @@ struct tcf_ct_params { > u16 ct_action; > > struct rcu_head rcu; > + > + struct tcf_ct_flow_table *ct_ft; > }; > > struct tcf_ct { > diff --git a/net/sched/Kconfig b/net/sched/Kconfig > index edde0e5..bfbefb7 100644 > --- a/net/sched/Kconfig > +++ b/net/sched/Kconfig > @@ -972,7 +972,7 @@ config NET_ACT_TUNNEL_KEY > > config NET_ACT_CT > tristate "connection tracking tc action" > - depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT > + depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE > help > Say Y here to allow sending the packets to conntrack module. 
> > diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c > index f685c0d..3321087 100644 > --- a/net/sched/act_ct.c > +++ b/net/sched/act_ct.c > @@ -15,6 +15,7 @@ > #include <linux/pkt_cls.h> > #include <linux/ip.h> > #include <linux/ipv6.h> > +#include <linux/rhashtable.h> > #include <net/netlink.h> > #include <net/pkt_sched.h> > #include <net/pkt_cls.h> > @@ -24,6 +25,7 @@ > #include <uapi/linux/tc_act/tc_ct.h> > #include <net/tc_act/tc_ct.h> > > +#include <net/netfilter/nf_flow_table.h> > #include <net/netfilter/nf_conntrack.h> > #include <net/netfilter/nf_conntrack_core.h> > #include <net/netfilter/nf_conntrack_zones.h> > @@ -31,6 +33,108 @@ > #include <net/netfilter/ipv6/nf_defrag_ipv6.h> > #include <uapi/linux/netfilter/nf_nat.h> > > +static struct workqueue_struct *act_ct_wq; > +static struct rhashtable zones_ht; > +static DEFINE_SPINLOCK(zones_lock); > + > +struct tcf_ct_flow_table { > + struct rhash_head node; /* In zones tables */ > + > + struct rcu_work rwork; > + struct nf_flowtable nf_ft; > + u16 zone; > + u32 ref; > + > + bool dying; > +}; > + > +static const struct rhashtable_params zones_params = { > + .head_offset = offsetof(struct tcf_ct_flow_table, node), > + .key_offset = offsetof(struct tcf_ct_flow_table, zone), > + .key_len = sizeof_field(struct tcf_ct_flow_table, zone), > + .automatic_shrinking = true, > +}; > + > +static struct nf_flowtable_type flowtable_ct = { > + .owner = THIS_MODULE, > +}; > + > +static int tcf_ct_flow_table_get(struct tcf_ct_params *params) > +{ > + struct tcf_ct_flow_table *ct_ft; > + int err = -ENOMEM; > + > + spin_lock_bh(&zones_lock); > + ct_ft = rhashtable_lookup_fast(&zones_ht, ¶ms->zone, zones_params); > + if (ct_ft) > + goto take_ref; > + > + ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC); > + if (!ct_ft) > + goto err_alloc; > + > + ct_ft->zone = params->zone; > + err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params); > + if (err) > + goto err_insert; > + > + ct_ft->nf_ft.type = &flowtable_ct; > + err 
= nf_flow_table_init(&ct_ft->nf_ft); > + if (err) > + goto err_init; > + > + __module_get(THIS_MODULE); > +take_ref: > + params->ct_ft = ct_ft; > + ct_ft->ref++; > + spin_unlock_bh(&zones_lock); > + > + return 0; > + > +err_init: > + rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); > +err_insert: > + kfree(ct_ft); > +err_alloc: > + spin_unlock_bh(&zones_lock); > + return err; > +} > + > +static void tcf_ct_flow_table_cleanup_work(struct work_struct *work) > +{ > + struct tcf_ct_flow_table *ct_ft; > + > + ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table, > + rwork); > + nf_flow_table_free(&ct_ft->nf_ft); > + kfree(ct_ft); > + > + module_put(THIS_MODULE); > +} > + > +static void tcf_ct_flow_table_put(struct tcf_ct_params *params) > +{ > + struct tcf_ct_flow_table *ct_ft = params->ct_ft; > + > + spin_lock_bh(&zones_lock); > + if (--params->ct_ft->ref == 0) { > + rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); > + INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); > + queue_rcu_work(act_ct_wq, &ct_ft->rwork); > + } > + spin_unlock_bh(&zones_lock); > +} > + > +static int tcf_ct_flow_tables_init(void) > +{ > + return rhashtable_init(&zones_ht, &zones_params); > +} > + > +static void tcf_ct_flow_tables_uninit(void) > +{ > + rhashtable_destroy(&zones_ht); > +} > + > static struct tc_action_ops act_ct_ops; > static unsigned int ct_net_id; > > @@ -207,6 +311,8 @@ static void tcf_ct_params_free(struct rcu_head *head) > struct tcf_ct_params *params = container_of(head, > struct tcf_ct_params, rcu); > > + tcf_ct_flow_table_put(params); > + > if (params->tmpl) > nf_conntrack_put(¶ms->tmpl->ct_general); > kfree(params); > @@ -730,6 +836,10 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, > if (err) > goto cleanup; > > + err = tcf_ct_flow_table_get(params); > + if (err) > + goto cleanup; > + > spin_lock_bh(&c->tcf_lock); > goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); > params = 
rcu_replace_pointer(c->params, params, > @@ -974,12 +1084,34 @@ static void __net_exit ct_exit_net(struct list_head *net_list) > > static int __init ct_init_module(void) > { > - return tcf_register_action(&act_ct_ops, &ct_net_ops); > + int err; > + > + act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0); > + if (!act_ct_wq) > + return -ENOMEM; > + > + err = tcf_ct_flow_tables_init(); > + if (err) > + goto err_tbl_init; > + > + err = tcf_register_action(&act_ct_ops, &ct_net_ops); > + if (err) > + goto err_register; > + > + return 0; > + > +err_tbl_init: > + destroy_workqueue(act_ct_wq); > +err_register: > + tcf_ct_flow_tables_uninit(); > + return err; > } > > static void __exit ct_cleanup_module(void) > { > tcf_unregister_action(&act_ct_ops, &ct_net_ops); > + tcf_ct_flow_tables_uninit(); > + destroy_workqueue(act_ct_wq); > } > > module_init(ct_init_module); > -- > 1.8.3.1 >
On 3/3/20 7:57 AM, Paul Blakey wrote: > Use the NF flow tables infrastructure for CT offload. > > Create a nf flow table per zone. > > Next patches will add FT entries to this table, and do > the software offload. > > Signed-off-by: Paul Blakey <paulb@mellanox.com> > Reviewed-by: Jiri Pirko <jiri@mellanox.com> > --- > v4->v5: > Added reviewed by Jiri, thanks! > v3->v4: > Alloc GFP_ATOMIC > v2->v3: > Ditch re-locking to alloc, and use atomic allocation > v1->v2: > Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep) > Free ft on last tc act instance instead of last instance + last offloaded tuple, > this removes cleanup cb and netfilter patches, and is simpler > Removed accidental mlx5/core/en_tc.c change > Removed reviewed by Jiri - patch changed > > include/net/tc_act/tc_ct.h | 2 + > net/sched/Kconfig | 2 +- > net/sched/act_ct.c | 134 ++++++++++++++++++++++++++++++++++++++++++++- > 3 files changed, 136 insertions(+), 2 deletions(-) > > diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h > index a8b1564..cf3492e 100644 > --- a/include/net/tc_act/tc_ct.h > +++ b/include/net/tc_act/tc_ct.h > @@ -25,6 +25,8 @@ struct tcf_ct_params { > u16 ct_action; > > struct rcu_head rcu; > + > + struct tcf_ct_flow_table *ct_ft; > }; > > struct tcf_ct { > diff --git a/net/sched/Kconfig b/net/sched/Kconfig > index edde0e5..bfbefb7 100644 > --- a/net/sched/Kconfig > +++ b/net/sched/Kconfig > @@ -972,7 +972,7 @@ config NET_ACT_TUNNEL_KEY > > config NET_ACT_CT > tristate "connection tracking tc action" > - depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT > + depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE > help > Say Y here to allow sending the packets to conntrack module. 
> > diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c > index f685c0d..3321087 100644 > --- a/net/sched/act_ct.c > +++ b/net/sched/act_ct.c > @@ -15,6 +15,7 @@ > #include <linux/pkt_cls.h> > #include <linux/ip.h> > #include <linux/ipv6.h> > +#include <linux/rhashtable.h> > #include <net/netlink.h> > #include <net/pkt_sched.h> > #include <net/pkt_cls.h> > @@ -24,6 +25,7 @@ > #include <uapi/linux/tc_act/tc_ct.h> > #include <net/tc_act/tc_ct.h> > > +#include <net/netfilter/nf_flow_table.h> > #include <net/netfilter/nf_conntrack.h> > #include <net/netfilter/nf_conntrack_core.h> > #include <net/netfilter/nf_conntrack_zones.h> > @@ -31,6 +33,108 @@ > #include <net/netfilter/ipv6/nf_defrag_ipv6.h> > #include <uapi/linux/netfilter/nf_nat.h> > > +static struct workqueue_struct *act_ct_wq; > +static struct rhashtable zones_ht; > +static DEFINE_SPINLOCK(zones_lock); > + > +struct tcf_ct_flow_table { > + struct rhash_head node; /* In zones tables */ > + > + struct rcu_work rwork; > + struct nf_flowtable nf_ft; > + u16 zone; > + u32 ref; > + > + bool dying; > +}; > + > +static const struct rhashtable_params zones_params = { > + .head_offset = offsetof(struct tcf_ct_flow_table, node), > + .key_offset = offsetof(struct tcf_ct_flow_table, zone), > + .key_len = sizeof_field(struct tcf_ct_flow_table, zone), > + .automatic_shrinking = true, > +}; > + > +static struct nf_flowtable_type flowtable_ct = { > + .owner = THIS_MODULE, > +}; > + > +static int tcf_ct_flow_table_get(struct tcf_ct_params *params) > +{ > + struct tcf_ct_flow_table *ct_ft; > + int err = -ENOMEM; > + > + spin_lock_bh(&zones_lock); > + ct_ft = rhashtable_lookup_fast(&zones_ht, ¶ms->zone, zones_params); > + if (ct_ft) > + goto take_ref; > + > + ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC); > + if (!ct_ft) > + goto err_alloc; > + > + ct_ft->zone = params->zone; > + err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params); > + if (err) > + goto err_insert; > + > + ct_ft->nf_ft.type = &flowtable_ct; > + err 
= nf_flow_table_init(&ct_ft->nf_ft); This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep) Since you still hold zones_lock spinlock, a splat should occur. "BUG: sleeping function called from invalid context in ..." DEBUG_ATOMIC_SLEEP=y is your friend. And it is always a good thing to make sure a patch does not trigger a lockdep splat CONFIG_PROVE_LOCKING=y > + if (err) > + goto err_init; > + > + __module_get(THIS_MODULE); > +take_ref: > + params->ct_ft = ct_ft; > + ct_ft->ref++; > + spin_unlock_bh(&zones_lock); > + > + return 0; > + > +err_init: > + rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); > +err_insert: > + kfree(ct_ft); > +err_alloc: > + spin_unlock_bh(&zones_lock); > + return err; > +} > + > +static void tcf_ct_flow_table_cleanup_work(struct work_struct *work) > +{ > + struct tcf_ct_flow_table *ct_ft; > + > + ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table, > + rwork); > + nf_flow_table_free(&ct_ft->nf_ft); > + kfree(ct_ft); > + > + module_put(THIS_MODULE); > +} > + > +static void tcf_ct_flow_table_put(struct tcf_ct_params *params) > +{ > + struct tcf_ct_flow_table *ct_ft = params->ct_ft; > + > + spin_lock_bh(&zones_lock); > + if (--params->ct_ft->ref == 0) { > + rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); > + INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); > + queue_rcu_work(act_ct_wq, &ct_ft->rwork); > + } > + spin_unlock_bh(&zones_lock); > +} > + > +static int tcf_ct_flow_tables_init(void) > +{ > + return rhashtable_init(&zones_ht, &zones_params); > +} > + > +static void tcf_ct_flow_tables_uninit(void) > +{ > + rhashtable_destroy(&zones_ht); > +} > + > static struct tc_action_ops act_ct_ops; > static unsigned int ct_net_id; > > @@ -207,6 +311,8 @@ static void tcf_ct_params_free(struct rcu_head *head) > struct tcf_ct_params *params = container_of(head, > struct tcf_ct_params, rcu); > > + tcf_ct_flow_table_put(params); > + > if (params->tmpl) > 
nf_conntrack_put(¶ms->tmpl->ct_general); > kfree(params); > @@ -730,6 +836,10 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, > if (err) > goto cleanup; > > + err = tcf_ct_flow_table_get(params); > + if (err) > + goto cleanup; > + > spin_lock_bh(&c->tcf_lock); > goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); > params = rcu_replace_pointer(c->params, params, > @@ -974,12 +1084,34 @@ static void __net_exit ct_exit_net(struct list_head *net_list) > > static int __init ct_init_module(void) > { > - return tcf_register_action(&act_ct_ops, &ct_net_ops); > + int err; > + > + act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0); > + if (!act_ct_wq) > + return -ENOMEM; > + > + err = tcf_ct_flow_tables_init(); > + if (err) > + goto err_tbl_init; > + > + err = tcf_register_action(&act_ct_ops, &ct_net_ops); > + if (err) > + goto err_register; > + > + return 0; > + > +err_tbl_init: > + destroy_workqueue(act_ct_wq); > +err_register: > + tcf_ct_flow_tables_uninit(); > + return err; > } > > static void __exit ct_cleanup_module(void) > { > tcf_unregister_action(&act_ct_ops, &ct_net_ops); > + tcf_ct_flow_tables_uninit(); > + destroy_workqueue(act_ct_wq); > } > > module_init(ct_init_module); >
On 3/7/20 12:12 PM, Eric Dumazet wrote: > > > On 3/3/20 7:57 AM, Paul Blakey wrote: >> Use the NF flow tables infrastructure for CT offload. >> >> Create a nf flow table per zone. >> >> Next patches will add FT entries to this table, and do >> the software offload. >> >> Signed-off-by: Paul Blakey <paulb@mellanox.com> >> Reviewed-by: Jiri Pirko <jiri@mellanox.com> >> --- >> v4->v5: >> Added reviewed by Jiri, thanks! >> v3->v4: >> Alloc GFP_ATOMIC >> v2->v3: >> Ditch re-locking to alloc, and use atomic allocation >> v1->v2: >> Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep) >> Free ft on last tc act instance instead of last instance + last offloaded tuple, >> this removes cleanup cb and netfilter patches, and is simpler >> Removed accidental mlx5/core/en_tc.c change >> Removed reviewed by Jiri - patch changed >> >> + err = nf_flow_table_init(&ct_ft->nf_ft); > > This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep) > > Since you still hold zones_lock spinlock, a splat should occur. > > "BUG: sleeping function called from invalid context in ..." > > DEBUG_ATOMIC_SLEEP=y is your friend. > > And it is always a good thing to make sure a patch does not trigger a lockdep splat > > CONFIG_PROVE_LOCKING=y Also abusing a spinlock and GFP_ATOMIC allocations in control path is highly discouraged. I can not test the following fix, any objections before I submit this officially ? 
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 23eba61f0f819212a3522c3c63b938d0b8d997e2..3d9e678d7d5336f1746035745b091bea0dcb5fdd 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -35,15 +35,15 @@ static struct workqueue_struct *act_ct_wq; static struct rhashtable zones_ht; -static DEFINE_SPINLOCK(zones_lock); +static DEFINE_MUTEX(zones_mutex); struct tcf_ct_flow_table { struct rhash_head node; /* In zones tables */ struct rcu_work rwork; struct nf_flowtable nf_ft; + refcount_t ref; u16 zone; - u32 ref; bool dying; }; @@ -64,14 +64,15 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params) struct tcf_ct_flow_table *ct_ft; int err = -ENOMEM; - spin_lock_bh(&zones_lock); + mutex_lock(&zones_mutex); ct_ft = rhashtable_lookup_fast(&zones_ht, ¶ms->zone, zones_params); - if (ct_ft) - goto take_ref; + if (ct_ft && refcount_inc_not_zero(&ct_ft->ref)) + goto out_unlock; - ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC); + ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL); if (!ct_ft) goto err_alloc; + refcount_set(&ct_ft->ref, 1); ct_ft->zone = params->zone; err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params); @@ -84,10 +85,9 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params) goto err_init; __module_get(THIS_MODULE); -take_ref: +out_unlock: params->ct_ft = ct_ft; - ct_ft->ref++; - spin_unlock_bh(&zones_lock); + mutex_unlock(&zones_mutex); return 0; @@ -96,7 +96,7 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params) err_insert: kfree(ct_ft); err_alloc: - spin_unlock_bh(&zones_lock); + mutex_unlock(&zones_mutex); return err; } @@ -116,13 +116,11 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params) { struct tcf_ct_flow_table *ct_ft = params->ct_ft; - spin_lock_bh(&zones_lock); - if (--params->ct_ft->ref == 0) { + if (refcount_dec_and_test(¶ms->ct_ft->ref)) { rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); 
queue_rcu_work(act_ct_wq, &ct_ft->rwork); } - spin_unlock_bh(&zones_lock); } static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
iirc I did the spin lock bh because we can be called from queue work rcu handler , so I wanted to disable soft irq. I got a possible deadlock splat for that. On 3/7/2020 10:53 PM, Eric Dumazet wrote: > > On 3/7/20 12:12 PM, Eric Dumazet wrote: >> >> On 3/3/20 7:57 AM, Paul Blakey wrote: >>> Use the NF flow tables infrastructure for CT offload. >>> >>> Create a nf flow table per zone. >>> >>> Next patches will add FT entries to this table, and do >>> the software offload. >>> >>> Signed-off-by: Paul Blakey <paulb@mellanox.com> >>> Reviewed-by: Jiri Pirko <jiri@mellanox.com> >>> --- >>> v4->v5: >>> Added reviewed by Jiri, thanks! >>> v3->v4: >>> Alloc GFP_ATOMIC >>> v2->v3: >>> Ditch re-locking to alloc, and use atomic allocation >>> v1->v2: >>> Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep) >>> Free ft on last tc act instance instead of last instance + last offloaded tuple, >>> this removes cleanup cb and netfilter patches, and is simpler >>> Removed accidental mlx5/core/en_tc.c change >>> Removed reviewed by Jiri - patch changed >>> >>> + err = nf_flow_table_init(&ct_ft->nf_ft); >> This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep) >> >> Since you still hold zones_lock spinlock, a splat should occur. >> >> "BUG: sleeping function called from invalid context in ..." >> >> DEBUG_ATOMIC_SLEEP=y is your friend. >> >> And it is always a good thing to make sure a patch does not trigger a lockdep splat >> >> CONFIG_PROVE_LOCKING=y > Also abusing a spinlock and GFP_ATOMIC allocations in control path is highly discouraged. > > I can not test the following fix, any objections before I submit this officially ? 
> > diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c > index 23eba61f0f819212a3522c3c63b938d0b8d997e2..3d9e678d7d5336f1746035745b091bea0dcb5fdd 100644 > --- a/net/sched/act_ct.c > +++ b/net/sched/act_ct.c > @@ -35,15 +35,15 @@ > > static struct workqueue_struct *act_ct_wq; > static struct rhashtable zones_ht; > -static DEFINE_SPINLOCK(zones_lock); > +static DEFINE_MUTEX(zones_mutex); > > struct tcf_ct_flow_table { > struct rhash_head node; /* In zones tables */ > > struct rcu_work rwork; > struct nf_flowtable nf_ft; > + refcount_t ref; > u16 zone; > - u32 ref; > > bool dying; > }; > @@ -64,14 +64,15 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params) > struct tcf_ct_flow_table *ct_ft; > int err = -ENOMEM; > > - spin_lock_bh(&zones_lock); > + mutex_lock(&zones_mutex); > ct_ft = rhashtable_lookup_fast(&zones_ht, ¶ms->zone, zones_params); > - if (ct_ft) > - goto take_ref; > + if (ct_ft && refcount_inc_not_zero(&ct_ft->ref)) > + goto out_unlock; > > - ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC); > + ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL); > if (!ct_ft) > goto err_alloc; > + refcount_set(&ct_ft->ref, 1); > > ct_ft->zone = params->zone; > err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params); > @@ -84,10 +85,9 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params) > goto err_init; > > __module_get(THIS_MODULE); > -take_ref: > +out_unlock: > params->ct_ft = ct_ft; > - ct_ft->ref++; > - spin_unlock_bh(&zones_lock); > + mutex_unlock(&zones_mutex); > > return 0; > > @@ -96,7 +96,7 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params) > err_insert: > kfree(ct_ft); > err_alloc: > - spin_unlock_bh(&zones_lock); > + mutex_unlock(&zones_mutex); > return err; > } > > @@ -116,13 +116,11 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params) > { > struct tcf_ct_flow_table *ct_ft = params->ct_ft; > > - spin_lock_bh(&zones_lock); > - if (--params->ct_ft->ref == 0) { > + if (refcount_dec_and_test(¶ms->ct_ft->ref)) 
{ > rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); > INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); > queue_rcu_work(act_ct_wq, &ct_ft->rwork); > } > - spin_unlock_bh(&zones_lock); > } > > static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, >
On 3/8/2020 10:11 AM, Paul Blakey wrote: > iirc I did the spin lock bh because we can be called from queue work rcu handler , so I wanted to disable soft irq. > > I got a possible deadlock splat for that. Here I meant this call rcu: static void tcf_ct_cleanup(struct tc_action *a) { >-------struct tcf_ct_params *params; >-------struct tcf_ct *c = to_ct(a); >-------params = rcu_dereference_protected(c->params, 1); >-------if (params) >------->-------call_rcu(¶ms->rcu, tcf_ct_params_free); } static void tcf_ct_params_free(struct rcu_head *head) { >-------struct tcf_ct_params *params = container_of(head, >------->------->------->------->------->------- struct tcf_ct_params, rcu); >-------tcf_ct_flow_table_put(params); ... > > > On 3/7/2020 10:53 PM, Eric Dumazet wrote: > >> On 3/7/20 12:12 PM, Eric Dumazet wrote: >>> On 3/3/20 7:57 AM, Paul Blakey wrote: >>>> Use the NF flow tables infrastructure for CT offload. >>>> >>>> Create a nf flow table per zone. >>>> >>>> Next patches will add FT entries to this table, and do >>>> the software offload. >>>> >>>> Signed-off-by: Paul Blakey <paulb@mellanox.com> >>>> Reviewed-by: Jiri Pirko <jiri@mellanox.com> >>>> --- >>>> v4->v5: >>>> Added reviewed by Jiri, thanks! >>>> v3->v4: >>>> Alloc GFP_ATOMIC >>>> v2->v3: >>>> Ditch re-locking to alloc, and use atomic allocation >>>> v1->v2: >>>> Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep) >>>> Free ft on last tc act instance instead of last instance + last offloaded tuple, >>>> this removes cleanup cb and netfilter patches, and is simpler >>>> Removed accidental mlx5/core/en_tc.c change >>>> Removed reviewed by Jiri - patch changed >>>> >>>> + err = nf_flow_table_init(&ct_ft->nf_ft); >>> This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep) >>> >>> Since you still hold zones_lock spinlock, a splat should occur. >>> >>> "BUG: sleeping function called from invalid context in ..." 
>>> >>> DEBUG_ATOMIC_SLEEP=y is your friend. >>> >>> And it is always a good thing to make sure a patch does not trigger a lockdep splat >>> >>> CONFIG_PROVE_LOCKING=y >> Also abusing a spinlock and GFP_ATOMIC allocations in control path is highly discouraged. >> >> I can not test the following fix, any objections before I submit this officially ? >> >> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c >> index 23eba61f0f819212a3522c3c63b938d0b8d997e2..3d9e678d7d5336f1746035745b091bea0dcb5fdd 100644 >> --- a/net/sched/act_ct.c >> +++ b/net/sched/act_ct.c >> @@ -35,15 +35,15 @@ >> >> static struct workqueue_struct *act_ct_wq; >> static struct rhashtable zones_ht; >> -static DEFINE_SPINLOCK(zones_lock); >> +static DEFINE_MUTEX(zones_mutex); >> >> struct tcf_ct_flow_table { >> struct rhash_head node; /* In zones tables */ >> >> struct rcu_work rwork; >> struct nf_flowtable nf_ft; >> + refcount_t ref; >> u16 zone; >> - u32 ref; >> >> bool dying; >> }; >> @@ -64,14 +64,15 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params) >> struct tcf_ct_flow_table *ct_ft; >> int err = -ENOMEM; >> >> - spin_lock_bh(&zones_lock); >> + mutex_lock(&zones_mutex); >> ct_ft = rhashtable_lookup_fast(&zones_ht, ¶ms->zone, zones_params); >> - if (ct_ft) >> - goto take_ref; >> + if (ct_ft && refcount_inc_not_zero(&ct_ft->ref)) >> + goto out_unlock; >> >> - ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC); >> + ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL); >> if (!ct_ft) >> goto err_alloc; >> + refcount_set(&ct_ft->ref, 1); >> >> ct_ft->zone = params->zone; >> err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params); >> @@ -84,10 +85,9 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params) >> goto err_init; >> >> __module_get(THIS_MODULE); >> -take_ref: >> +out_unlock: >> params->ct_ft = ct_ft; >> - ct_ft->ref++; >> - spin_unlock_bh(&zones_lock); >> + mutex_unlock(&zones_mutex); >> >> return 0; >> >> @@ -96,7 +96,7 @@ static int tcf_ct_flow_table_get(struct 
tcf_ct_params *params) >> err_insert: >> kfree(ct_ft); >> err_alloc: >> - spin_unlock_bh(&zones_lock); >> + mutex_unlock(&zones_mutex); >> return err; >> } >> >> @@ -116,13 +116,11 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params) >> { >> struct tcf_ct_flow_table *ct_ft = params->ct_ft; >> >> - spin_lock_bh(&zones_lock); >> - if (--params->ct_ft->ref == 0) { >> + if (refcount_dec_and_test(¶ms->ct_ft->ref)) { >> rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); >> INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); >> queue_rcu_work(act_ct_wq, &ct_ft->rwork); >> } >> - spin_unlock_bh(&zones_lock); >> } >> >> static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, >>
On 3/8/20 12:15 AM, Paul Blakey wrote: > On 3/8/2020 10:11 AM, Paul Blakey wrote: > >> iirc I did the spin lock bh because we can be called from queue work rcu handler , so I wanted to disable soft irq. >> >> I got a possible deadlock splat for that. > > Here I meant this call rcu: > > static void tcf_ct_cleanup(struct tc_action *a) > { >> -------struct tcf_ct_params *params; >> -------struct tcf_ct *c = to_ct(a); > >> -------params = rcu_dereference_protected(c->params, 1); >> -------if (params) >> ------->-------call_rcu(¶ms->rcu, tcf_ct_params_free); > } > Yes, understood, but to solve this problem we had many other choices, and still keeping GFP_KERNEL allocations and a mutex for control path. Have you read my patch ? By not even trying to get a spinlock in tcf_ct_flow_table_put(), and instead use a refcount for ->ref, we avoid having this issue in the first place. static void tcf_ct_flow_table_put(struct tcf_ct_params *params) { struct tcf_ct_flow_table *ct_ft = params->ct_ft; if (refcount_dec_and_test(¶ms->ct_ft->ref)) { rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); queue_rcu_work(act_ct_wq, &ct_ft->rwork); } } > static void tcf_ct_params_free(struct rcu_head *head) > { >> -------struct tcf_ct_params *params = container_of(head, >> ------->------->------->------->------->------- struct tcf_ct_params, rcu); > >> -------tcf_ct_flow_table_put(params); > > ... > > >> >> >> On 3/7/2020 10:53 PM, Eric Dumazet wrote: >> >>> On 3/7/20 12:12 PM, Eric Dumazet wrote: >>>> On 3/3/20 7:57 AM, Paul Blakey wrote: >>>>> Use the NF flow tables infrastructure for CT offload. >>>>> >>>>> Create a nf flow table per zone. >>>>> >>>>> Next patches will add FT entries to this table, and do >>>>> the software offload. >>>>> >>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com> >>>>> Reviewed-by: Jiri Pirko <jiri@mellanox.com> >>>>> --- >>>>> v4->v5: >>>>> Added reviewed by Jiri, thanks! 
>>>>> v3->v4: >>>>> Alloc GFP_ATOMIC >>>>> v2->v3: >>>>> Ditch re-locking to alloc, and use atomic allocation >>>>> v1->v2: >>>>> Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep) >>>>> Free ft on last tc act instance instead of last instance + last offloaded tuple, >>>>> this removes cleanup cb and netfilter patches, and is simpler >>>>> Removed accidental mlx5/core/en_tc.c change >>>>> Removed reviewed by Jiri - patch changed >>>>> >>>>> + err = nf_flow_table_init(&ct_ft->nf_ft); >>>> This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep) >>>> >>>> Since you still hold zones_lock spinlock, a splat should occur. >>>> >>>> "BUG: sleeping function called from invalid context in ..." >>>> >>>> DEBUG_ATOMIC_SLEEP=y is your friend. >>>> >>>> And it is always a good thing to make sure a patch does not trigger a lockdep splat >>>> >>>> CONFIG_PROVE_LOCKING=y >>> Also abusing a spinlock and GFP_ATOMIC allocations in control path is highly discouraged. >>> >>> I can not test the following fix, any objections before I submit this officially ? 
>>> >>> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c >>> index 23eba61f0f819212a3522c3c63b938d0b8d997e2..3d9e678d7d5336f1746035745b091bea0dcb5fdd 100644 >>> --- a/net/sched/act_ct.c >>> +++ b/net/sched/act_ct.c >>> @@ -35,15 +35,15 @@ >>> >>> static struct workqueue_struct *act_ct_wq; >>> static struct rhashtable zones_ht; >>> -static DEFINE_SPINLOCK(zones_lock); >>> +static DEFINE_MUTEX(zones_mutex); >>> >>> struct tcf_ct_flow_table { >>> struct rhash_head node; /* In zones tables */ >>> >>> struct rcu_work rwork; >>> struct nf_flowtable nf_ft; >>> + refcount_t ref; >>> u16 zone; >>> - u32 ref; >>> >>> bool dying; >>> }; >>> @@ -64,14 +64,15 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params) >>> struct tcf_ct_flow_table *ct_ft; >>> int err = -ENOMEM; >>> >>> - spin_lock_bh(&zones_lock); >>> + mutex_lock(&zones_mutex); >>> ct_ft = rhashtable_lookup_fast(&zones_ht, ¶ms->zone, zones_params); >>> - if (ct_ft) >>> - goto take_ref; >>> + if (ct_ft && refcount_inc_not_zero(&ct_ft->ref)) >>> + goto out_unlock; >>> >>> - ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC); >>> + ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL); >>> if (!ct_ft) >>> goto err_alloc; >>> + refcount_set(&ct_ft->ref, 1); >>> >>> ct_ft->zone = params->zone; >>> err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params); >>> @@ -84,10 +85,9 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params) >>> goto err_init; >>> >>> __module_get(THIS_MODULE); >>> -take_ref: >>> +out_unlock: >>> params->ct_ft = ct_ft; >>> - ct_ft->ref++; >>> - spin_unlock_bh(&zones_lock); >>> + mutex_unlock(&zones_mutex); >>> >>> return 0; >>> >>> @@ -96,7 +96,7 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params) >>> err_insert: >>> kfree(ct_ft); >>> err_alloc: >>> - spin_unlock_bh(&zones_lock); >>> + mutex_unlock(&zones_mutex); >>> return err; >>> } >>> >>> @@ -116,13 +116,11 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params) >>> { >>> struct tcf_ct_flow_table *ct_ft 
= params->ct_ft; >>> >>> - spin_lock_bh(&zones_lock); >>> - if (--params->ct_ft->ref == 0) { >>> + if (refcount_dec_and_test(¶ms->ct_ft->ref)) { >>> rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); >>> INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); >>> queue_rcu_work(act_ct_wq, &ct_ft->rwork); >>> } >>> - spin_unlock_bh(&zones_lock); >>> } >>> >>> static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, >>>
On 3/8/2020 10:42 PM, Eric Dumazet wrote: > > On 3/8/20 12:15 AM, Paul Blakey wrote: >> On 3/8/2020 10:11 AM, Paul Blakey wrote: >> >>> iirc I did the spin lock bh because we can be called from queue work rcu handler , so I wanted to disable soft irq. >>> >>> I got a possible deadlock splat for that. >> Here I meant this call rcu: >> >> static void tcf_ct_cleanup(struct tc_action *a) >> { >>> -------struct tcf_ct_params *params; >>> -------struct tcf_ct *c = to_ct(a); >>> -------params = rcu_dereference_protected(c->params, 1); >>> -------if (params) >>> ------->-------call_rcu(¶ms->rcu, tcf_ct_params_free); >> } >> > Yes, understood, but to solve this problem we had many other choices, > and still keeping GFP_KERNEL allocations and a mutex for control path. > > Have you read my patch ? > > By not even trying to get a spinlock in tcf_ct_flow_table_put(), > and instead use a refcount for ->ref, we avoid having this issue in the first place. > > static void tcf_ct_flow_table_put(struct tcf_ct_params *params) > { > struct tcf_ct_flow_table *ct_ft = params->ct_ft; > > if (refcount_dec_and_test(¶ms->ct_ft->ref)) { > rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); > INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); > queue_rcu_work(act_ct_wq, &ct_ft->rwork); > } > } Sorry missed that, thanks for the fix. >> static void tcf_ct_params_free(struct rcu_head *head) >> { >>> -------struct tcf_ct_params *params = container_of(head, >>> ------->------->------->------->------->------- struct tcf_ct_params, rcu); >>> -------tcf_ct_flow_table_put(params); >> ... >> >> >>> >>> On 3/7/2020 10:53 PM, Eric Dumazet wrote: >>> >>>> On 3/7/20 12:12 PM, Eric Dumazet wrote: >>>>> On 3/3/20 7:57 AM, Paul Blakey wrote: >>>>>> Use the NF flow tables infrastructure for CT offload. >>>>>> >>>>>> Create a nf flow table per zone. >>>>>> >>>>>> Next patches will add FT entries to this table, and do >>>>>> the software offload. 
>>>>>> >>>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com> >>>>>> Reviewed-by: Jiri Pirko <jiri@mellanox.com> >>>>>> --- >>>>>> v4->v5: >>>>>> Added reviewed by Jiri, thanks! >>>>>> v3->v4: >>>>>> Alloc GFP_ATOMIC >>>>>> v2->v3: >>>>>> Ditch re-locking to alloc, and use atomic allocation >>>>>> v1->v2: >>>>>> Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep) >>>>>> Free ft on last tc act instance instead of last instance + last offloaded tuple, >>>>>> this removes cleanup cb and netfilter patches, and is simpler >>>>>> Removed accidental mlx5/core/en_tc.c change >>>>>> Removed reviewed by Jiri - patch changed >>>>>> >>>>>> + err = nf_flow_table_init(&ct_ft->nf_ft); >>>>> This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep) >>>>> >>>>> Since you still hold zones_lock spinlock, a splat should occur. >>>>> >>>>> "BUG: sleeping function called from invalid context in ..." >>>>> >>>>> DEBUG_ATOMIC_SLEEP=y is your friend. >>>>> >>>>> And it is always a good thing to make sure a patch does not trigger a lockdep splat >>>>> >>>>> CONFIG_PROVE_LOCKING=y >>>> Also abusing a spinlock and GFP_ATOMIC allocations in control path is highly discouraged. >>>> >>>> I can not test the following fix, any objections before I submit this officially ? 
>>>> >>>> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c >>>> index 23eba61f0f819212a3522c3c63b938d0b8d997e2..3d9e678d7d5336f1746035745b091bea0dcb5fdd 100644 >>>> --- a/net/sched/act_ct.c >>>> +++ b/net/sched/act_ct.c >>>> @@ -35,15 +35,15 @@ >>>> >>>> static struct workqueue_struct *act_ct_wq; >>>> static struct rhashtable zones_ht; >>>> -static DEFINE_SPINLOCK(zones_lock); >>>> +static DEFINE_MUTEX(zones_mutex); >>>> >>>> struct tcf_ct_flow_table { >>>> struct rhash_head node; /* In zones tables */ >>>> >>>> struct rcu_work rwork; >>>> struct nf_flowtable nf_ft; >>>> + refcount_t ref; >>>> u16 zone; >>>> - u32 ref; >>>> >>>> bool dying; >>>> }; >>>> @@ -64,14 +64,15 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params) >>>> struct tcf_ct_flow_table *ct_ft; >>>> int err = -ENOMEM; >>>> >>>> - spin_lock_bh(&zones_lock); >>>> + mutex_lock(&zones_mutex); >>>> ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params); >>>> - if (ct_ft) >>>> - goto take_ref; >>>> + if (ct_ft && refcount_inc_not_zero(&ct_ft->ref)) >>>> + goto out_unlock; >>>> >>>> - ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC); >>>> + ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL); >>>> if (!ct_ft) >>>> goto err_alloc; >>>> + refcount_set(&ct_ft->ref, 1); >>>> >>>> ct_ft->zone = params->zone; >>>> err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params); >>>> @@ -84,10 +85,9 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params) >>>> goto err_init; >>>> >>>> __module_get(THIS_MODULE); >>>> -take_ref: >>>> +out_unlock: >>>> params->ct_ft = ct_ft; >>>> - ct_ft->ref++; >>>> - spin_unlock_bh(&zones_lock); >>>> + mutex_unlock(&zones_mutex); >>>> >>>> return 0; >>>> >>>> @@ -96,7 +96,7 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params) >>>> err_insert: >>>> kfree(ct_ft); >>>> err_alloc: >>>> - spin_unlock_bh(&zones_lock); >>>> + mutex_unlock(&zones_mutex); >>>> return err; >>>> } >>>> >>>> @@ -116,13 +116,11 @@ static void
tcf_ct_flow_table_put(struct tcf_ct_params *params) >>>> { >>>> struct tcf_ct_flow_table *ct_ft = params->ct_ft; >>>> >>>> - spin_lock_bh(&zones_lock); >>>> - if (--params->ct_ft->ref == 0) { >>>> + if (refcount_dec_and_test(&params->ct_ft->ref)) { >>>> rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); >>>> INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); >>>> queue_rcu_work(act_ct_wq, &ct_ft->rwork); >>>> } >>>> - spin_unlock_bh(&zones_lock); >>>> } >>>> >>>> static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, >>>>
diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h index a8b1564..cf3492e 100644 --- a/include/net/tc_act/tc_ct.h +++ b/include/net/tc_act/tc_ct.h @@ -25,6 +25,8 @@ struct tcf_ct_params { u16 ct_action; struct rcu_head rcu; + + struct tcf_ct_flow_table *ct_ft; }; struct tcf_ct { diff --git a/net/sched/Kconfig b/net/sched/Kconfig index edde0e5..bfbefb7 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -972,7 +972,7 @@ config NET_ACT_TUNNEL_KEY config NET_ACT_CT tristate "connection tracking tc action" - depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT + depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE help Say Y here to allow sending the packets to conntrack module. diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index f685c0d..3321087 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -15,6 +15,7 @@ #include <linux/pkt_cls.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/rhashtable.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> @@ -24,6 +25,7 @@ #include <uapi/linux/tc_act/tc_ct.h> #include <net/tc_act/tc_ct.h> +#include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_zones.h> @@ -31,6 +33,108 @@ #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <uapi/linux/netfilter/nf_nat.h> +static struct workqueue_struct *act_ct_wq; +static struct rhashtable zones_ht; +static DEFINE_SPINLOCK(zones_lock); + +struct tcf_ct_flow_table { + struct rhash_head node; /* In zones tables */ + + struct rcu_work rwork; + struct nf_flowtable nf_ft; + u16 zone; + u32 ref; + + bool dying; +}; + +static const struct rhashtable_params zones_params = { + .head_offset = offsetof(struct tcf_ct_flow_table, node), + .key_offset = offsetof(struct tcf_ct_flow_table, zone), + .key_len = sizeof_field(struct tcf_ct_flow_table, zone), + .automatic_shrinking = true, +}; + +static struct 
nf_flowtable_type flowtable_ct = { + .owner = THIS_MODULE, +}; + +static int tcf_ct_flow_table_get(struct tcf_ct_params *params) +{ + struct tcf_ct_flow_table *ct_ft; + int err = -ENOMEM; + + spin_lock_bh(&zones_lock); + ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params); + if (ct_ft) + goto take_ref; + + ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC); + if (!ct_ft) + goto err_alloc; + + ct_ft->zone = params->zone; + err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params); + if (err) + goto err_insert; + + ct_ft->nf_ft.type = &flowtable_ct; + err = nf_flow_table_init(&ct_ft->nf_ft); + if (err) + goto err_init; + + __module_get(THIS_MODULE); +take_ref: + params->ct_ft = ct_ft; + ct_ft->ref++; + spin_unlock_bh(&zones_lock); + + return 0; + +err_init: + rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); +err_insert: + kfree(ct_ft); +err_alloc: + spin_unlock_bh(&zones_lock); + return err; +} + +static void tcf_ct_flow_table_cleanup_work(struct work_struct *work) +{ + struct tcf_ct_flow_table *ct_ft; + + ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table, + rwork); + nf_flow_table_free(&ct_ft->nf_ft); + kfree(ct_ft); + + module_put(THIS_MODULE); +} + +static void tcf_ct_flow_table_put(struct tcf_ct_params *params) +{ + struct tcf_ct_flow_table *ct_ft = params->ct_ft; + + spin_lock_bh(&zones_lock); + if (--params->ct_ft->ref == 0) { + rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); + INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); + queue_rcu_work(act_ct_wq, &ct_ft->rwork); + } + spin_unlock_bh(&zones_lock); +} + +static int tcf_ct_flow_tables_init(void) +{ + return rhashtable_init(&zones_ht, &zones_params); +} + +static void tcf_ct_flow_tables_uninit(void) +{ + rhashtable_destroy(&zones_ht); +} + static struct tc_action_ops act_ct_ops; static unsigned int ct_net_id; @@ -207,6 +311,8 @@ static void tcf_ct_params_free(struct rcu_head *head) struct tcf_ct_params *params = container_of(head,
struct tcf_ct_params, rcu); + tcf_ct_flow_table_put(params); + if (params->tmpl) nf_conntrack_put(&params->tmpl->ct_general); kfree(params); @@ -730,6 +836,10 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, if (err) goto cleanup; + err = tcf_ct_flow_table_get(params); + if (err) + goto cleanup; + spin_lock_bh(&c->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params = rcu_replace_pointer(c->params, params, @@ -974,12 +1084,34 @@ static void __net_exit ct_exit_net(struct list_head *net_list) static int __init ct_init_module(void) { - return tcf_register_action(&act_ct_ops, &ct_net_ops); + int err; + + act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0); + if (!act_ct_wq) + return -ENOMEM; + + err = tcf_ct_flow_tables_init(); + if (err) + goto err_tbl_init; + + err = tcf_register_action(&act_ct_ops, &ct_net_ops); + if (err) + goto err_register; + + return 0; + +err_register: + tcf_ct_flow_tables_uninit(); +err_tbl_init: + destroy_workqueue(act_ct_wq); + return err; } static void __exit ct_cleanup_module(void) { tcf_unregister_action(&act_ct_ops, &ct_net_ops); + tcf_ct_flow_tables_uninit(); + destroy_workqueue(act_ct_wq); } module_init(ct_init_module);