diff mbox series

[net-next,v6,1/3] net/sched: act_ct: Create nf flow table per zone

Message ID 1583251072-10396-2-git-send-email-paulb@mellanox.com
State Superseded
Delegated to: David Miller
Headers show
Series act_ct: Software offload of conntrack_in | expand

Commit Message

Paul Blakey March 3, 2020, 3:57 p.m. UTC
Use the NF flow tables infrastructure for CT offload.

Create a nf flow table per zone.

Next patches will add FT entries to this table, and do
the software offload.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
---
  v4->v5:
    Added reviewed by Jiri, thanks!
  v3->v4:
    Alloc GFP_ATOMIC
  v2->v3:
    Ditch re-locking to alloc, and use atomic allocation
  v1->v2:
    Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
    Free ft on last tc act instance instead of last instance + last offloaded tuple,
    this removes cleanup cb and netfilter patches, and is simpler
    Removed accidental mlx5/core/en_tc.c change
    Removed reviewed by Jiri - patch changed

 include/net/tc_act/tc_ct.h |   2 +
 net/sched/Kconfig          |   2 +-
 net/sched/act_ct.c         | 134 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 136 insertions(+), 2 deletions(-)

Comments

Marcelo Ricardo Leitner March 3, 2020, 4:51 p.m. UTC | #1
On Tue, Mar 03, 2020 at 05:57:50PM +0200, Paul Blakey wrote:
> Use the NF flow tables infrastructure for CT offload.
> 
> Create a nf flow table per zone.
> 
> Next patches will add FT entries to this table, and do
> the software offload.
> 
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> Reviewed-by: Jiri Pirko <jiri@mellanox.com>

Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

> ---
>   v4->v5:
>     Added reviewed by Jiri, thanks!
>   v3->v4:
>     Alloc GFP_ATOMIC
>   v2->v3:
>     Ditch re-locking to alloc, and use atomic allocation
>   v1->v2:
>     Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
>     Free ft on last tc act instance instead of last instance + last offloaded tuple,
>     this removes cleanup cb and netfilter patches, and is simpler
>     Removed accidental mlx5/core/en_tc.c change
>     Removed reviewed by Jiri - patch changed
> 
>  include/net/tc_act/tc_ct.h |   2 +
>  net/sched/Kconfig          |   2 +-
>  net/sched/act_ct.c         | 134 ++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 136 insertions(+), 2 deletions(-)
> 
> diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
> index a8b1564..cf3492e 100644
> --- a/include/net/tc_act/tc_ct.h
> +++ b/include/net/tc_act/tc_ct.h
> @@ -25,6 +25,8 @@ struct tcf_ct_params {
>  	u16 ct_action;
>  
>  	struct rcu_head rcu;
> +
> +	struct tcf_ct_flow_table *ct_ft;
>  };
>  
>  struct tcf_ct {
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index edde0e5..bfbefb7 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -972,7 +972,7 @@ config NET_ACT_TUNNEL_KEY
>  
>  config NET_ACT_CT
>  	tristate "connection tracking tc action"
> -	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
> +	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE
>  	help
>  	  Say Y here to allow sending the packets to conntrack module.
>  
> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
> index f685c0d..3321087 100644
> --- a/net/sched/act_ct.c
> +++ b/net/sched/act_ct.c
> @@ -15,6 +15,7 @@
>  #include <linux/pkt_cls.h>
>  #include <linux/ip.h>
>  #include <linux/ipv6.h>
> +#include <linux/rhashtable.h>
>  #include <net/netlink.h>
>  #include <net/pkt_sched.h>
>  #include <net/pkt_cls.h>
> @@ -24,6 +25,7 @@
>  #include <uapi/linux/tc_act/tc_ct.h>
>  #include <net/tc_act/tc_ct.h>
>  
> +#include <net/netfilter/nf_flow_table.h>
>  #include <net/netfilter/nf_conntrack.h>
>  #include <net/netfilter/nf_conntrack_core.h>
>  #include <net/netfilter/nf_conntrack_zones.h>
> @@ -31,6 +33,108 @@
>  #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
>  #include <uapi/linux/netfilter/nf_nat.h>
>  
> +static struct workqueue_struct *act_ct_wq;
> +static struct rhashtable zones_ht;
> +static DEFINE_SPINLOCK(zones_lock);
> +
> +struct tcf_ct_flow_table {
> +	struct rhash_head node; /* In zones tables */
> +
> +	struct rcu_work rwork;
> +	struct nf_flowtable nf_ft;
> +	u16 zone;
> +	u32 ref;
> +
> +	bool dying;
> +};
> +
> +static const struct rhashtable_params zones_params = {
> +	.head_offset = offsetof(struct tcf_ct_flow_table, node),
> +	.key_offset = offsetof(struct tcf_ct_flow_table, zone),
> +	.key_len = sizeof_field(struct tcf_ct_flow_table, zone),
> +	.automatic_shrinking = true,
> +};
> +
> +static struct nf_flowtable_type flowtable_ct = {
> +	.owner		= THIS_MODULE,
> +};
> +
> +static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
> +{
> +	struct tcf_ct_flow_table *ct_ft;
> +	int err = -ENOMEM;
> +
> +	spin_lock_bh(&zones_lock);
> +	ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
> +	if (ct_ft)
> +		goto take_ref;
> +
> +	ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC);
> +	if (!ct_ft)
> +		goto err_alloc;
> +
> +	ct_ft->zone = params->zone;
> +	err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
> +	if (err)
> +		goto err_insert;
> +
> +	ct_ft->nf_ft.type = &flowtable_ct;
> +	err = nf_flow_table_init(&ct_ft->nf_ft);
> +	if (err)
> +		goto err_init;
> +
> +	__module_get(THIS_MODULE);
> +take_ref:
> +	params->ct_ft = ct_ft;
> +	ct_ft->ref++;
> +	spin_unlock_bh(&zones_lock);
> +
> +	return 0;
> +
> +err_init:
> +	rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
> +err_insert:
> +	kfree(ct_ft);
> +err_alloc:
> +	spin_unlock_bh(&zones_lock);
> +	return err;
> +}
> +
> +static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
> +{
> +	struct tcf_ct_flow_table *ct_ft;
> +
> +	ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
> +			     rwork);
> +	nf_flow_table_free(&ct_ft->nf_ft);
> +	kfree(ct_ft);
> +
> +	module_put(THIS_MODULE);
> +}
> +
> +static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
> +{
> +	struct tcf_ct_flow_table *ct_ft = params->ct_ft;
> +
> +	spin_lock_bh(&zones_lock);
> +	if (--params->ct_ft->ref == 0) {
> +		rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
> +		INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
> +		queue_rcu_work(act_ct_wq, &ct_ft->rwork);
> +	}
> +	spin_unlock_bh(&zones_lock);
> +}
> +
> +static int tcf_ct_flow_tables_init(void)
> +{
> +	return rhashtable_init(&zones_ht, &zones_params);
> +}
> +
> +static void tcf_ct_flow_tables_uninit(void)
> +{
> +	rhashtable_destroy(&zones_ht);
> +}
> +
>  static struct tc_action_ops act_ct_ops;
>  static unsigned int ct_net_id;
>  
> @@ -207,6 +311,8 @@ static void tcf_ct_params_free(struct rcu_head *head)
>  	struct tcf_ct_params *params = container_of(head,
>  						    struct tcf_ct_params, rcu);
>  
> +	tcf_ct_flow_table_put(params);
> +
>  	if (params->tmpl)
>  		nf_conntrack_put(&params->tmpl->ct_general);
>  	kfree(params);
> @@ -730,6 +836,10 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
>  	if (err)
>  		goto cleanup;
>  
> +	err = tcf_ct_flow_table_get(params);
> +	if (err)
> +		goto cleanup;
> +
>  	spin_lock_bh(&c->tcf_lock);
>  	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
>  	params = rcu_replace_pointer(c->params, params,
> @@ -974,12 +1084,34 @@ static void __net_exit ct_exit_net(struct list_head *net_list)
>  
>  static int __init ct_init_module(void)
>  {
> -	return tcf_register_action(&act_ct_ops, &ct_net_ops);
> +	int err;
> +
> +	act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
> +	if (!act_ct_wq)
> +		return -ENOMEM;
> +
> +	err = tcf_ct_flow_tables_init();
> +	if (err)
> +		goto err_tbl_init;
> +
> +	err = tcf_register_action(&act_ct_ops, &ct_net_ops);
> +	if (err)
> +		goto err_register;
> +
> +	return 0;
> +
> +err_tbl_init:
> +	destroy_workqueue(act_ct_wq);
> +err_register:
> +	tcf_ct_flow_tables_uninit();
> +	return err;
>  }
>  
>  static void __exit ct_cleanup_module(void)
>  {
>  	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
> +	tcf_ct_flow_tables_uninit();
> +	destroy_workqueue(act_ct_wq);
>  }
>  
>  module_init(ct_init_module);
> -- 
> 1.8.3.1
>
Eric Dumazet March 7, 2020, 8:12 p.m. UTC | #2
On 3/3/20 7:57 AM, Paul Blakey wrote:
> Use the NF flow tables infrastructure for CT offload.
> 
> Create a nf flow table per zone.
> 
> Next patches will add FT entries to this table, and do
> the software offload.
> 
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> Reviewed-by: Jiri Pirko <jiri@mellanox.com>
> ---
>   v4->v5:
>     Added reviewed by Jiri, thanks!
>   v3->v4:
>     Alloc GFP_ATOMIC
>   v2->v3:
>     Ditch re-locking to alloc, and use atomic allocation
>   v1->v2:
>     Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
>     Free ft on last tc act instance instead of last instance + last offloaded tuple,
>     this removes cleanup cb and netfilter patches, and is simpler
>     Removed accidental mlx5/core/en_tc.c change
>     Removed reviewed by Jiri - patch changed
> 
>  include/net/tc_act/tc_ct.h |   2 +
>  net/sched/Kconfig          |   2 +-
>  net/sched/act_ct.c         | 134 ++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 136 insertions(+), 2 deletions(-)
> 
> diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
> index a8b1564..cf3492e 100644
> --- a/include/net/tc_act/tc_ct.h
> +++ b/include/net/tc_act/tc_ct.h
> @@ -25,6 +25,8 @@ struct tcf_ct_params {
>  	u16 ct_action;
>  
>  	struct rcu_head rcu;
> +
> +	struct tcf_ct_flow_table *ct_ft;
>  };
>  
>  struct tcf_ct {
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index edde0e5..bfbefb7 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -972,7 +972,7 @@ config NET_ACT_TUNNEL_KEY
>  
>  config NET_ACT_CT
>  	tristate "connection tracking tc action"
> -	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
> +	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE
>  	help
>  	  Say Y here to allow sending the packets to conntrack module.
>  
> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
> index f685c0d..3321087 100644
> --- a/net/sched/act_ct.c
> +++ b/net/sched/act_ct.c
> @@ -15,6 +15,7 @@
>  #include <linux/pkt_cls.h>
>  #include <linux/ip.h>
>  #include <linux/ipv6.h>
> +#include <linux/rhashtable.h>
>  #include <net/netlink.h>
>  #include <net/pkt_sched.h>
>  #include <net/pkt_cls.h>
> @@ -24,6 +25,7 @@
>  #include <uapi/linux/tc_act/tc_ct.h>
>  #include <net/tc_act/tc_ct.h>
>  
> +#include <net/netfilter/nf_flow_table.h>
>  #include <net/netfilter/nf_conntrack.h>
>  #include <net/netfilter/nf_conntrack_core.h>
>  #include <net/netfilter/nf_conntrack_zones.h>
> @@ -31,6 +33,108 @@
>  #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
>  #include <uapi/linux/netfilter/nf_nat.h>
>  
> +static struct workqueue_struct *act_ct_wq;
> +static struct rhashtable zones_ht;
> +static DEFINE_SPINLOCK(zones_lock);
> +
> +struct tcf_ct_flow_table {
> +	struct rhash_head node; /* In zones tables */
> +
> +	struct rcu_work rwork;
> +	struct nf_flowtable nf_ft;
> +	u16 zone;
> +	u32 ref;
> +
> +	bool dying;
> +};
> +
> +static const struct rhashtable_params zones_params = {
> +	.head_offset = offsetof(struct tcf_ct_flow_table, node),
> +	.key_offset = offsetof(struct tcf_ct_flow_table, zone),
> +	.key_len = sizeof_field(struct tcf_ct_flow_table, zone),
> +	.automatic_shrinking = true,
> +};
> +
> +static struct nf_flowtable_type flowtable_ct = {
> +	.owner		= THIS_MODULE,
> +};
> +
> +static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
> +{
> +	struct tcf_ct_flow_table *ct_ft;
> +	int err = -ENOMEM;
> +
> +	spin_lock_bh(&zones_lock);
> +	ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
> +	if (ct_ft)
> +		goto take_ref;
> +
> +	ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC);
> +	if (!ct_ft)
> +		goto err_alloc;
> +
> +	ct_ft->zone = params->zone;
> +	err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
> +	if (err)
> +		goto err_insert;
> +
> +	ct_ft->nf_ft.type = &flowtable_ct;
> +	err = nf_flow_table_init(&ct_ft->nf_ft);

This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep)

Since you still hold zones_lock spinlock, a splat should occur.

"BUG: sleeping function called from invalid context in  ..."

DEBUG_ATOMIC_SLEEP=y is your friend.

And it is always a good thing to make sure a patch does not trigger a lockdep splat

CONFIG_PROVE_LOCKING=y


> +	if (err)
> +		goto err_init;
> +
> +	__module_get(THIS_MODULE);
> +take_ref:
> +	params->ct_ft = ct_ft;
> +	ct_ft->ref++;
> +	spin_unlock_bh(&zones_lock);
> +
> +	return 0;
> +
> +err_init:
> +	rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
> +err_insert:
> +	kfree(ct_ft);
> +err_alloc:
> +	spin_unlock_bh(&zones_lock);
> +	return err;
> +}
> +
> +static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
> +{
> +	struct tcf_ct_flow_table *ct_ft;
> +
> +	ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
> +			     rwork);
> +	nf_flow_table_free(&ct_ft->nf_ft);
> +	kfree(ct_ft);
> +
> +	module_put(THIS_MODULE);
> +}
> +
> +static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
> +{
> +	struct tcf_ct_flow_table *ct_ft = params->ct_ft;
> +
> +	spin_lock_bh(&zones_lock);
> +	if (--params->ct_ft->ref == 0) {
> +		rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
> +		INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
> +		queue_rcu_work(act_ct_wq, &ct_ft->rwork);
> +	}
> +	spin_unlock_bh(&zones_lock);
> +}
> +
> +static int tcf_ct_flow_tables_init(void)
> +{
> +	return rhashtable_init(&zones_ht, &zones_params);
> +}
> +
> +static void tcf_ct_flow_tables_uninit(void)
> +{
> +	rhashtable_destroy(&zones_ht);
> +}
> +
>  static struct tc_action_ops act_ct_ops;
>  static unsigned int ct_net_id;
>  
> @@ -207,6 +311,8 @@ static void tcf_ct_params_free(struct rcu_head *head)
>  	struct tcf_ct_params *params = container_of(head,
>  						    struct tcf_ct_params, rcu);
>  
> +	tcf_ct_flow_table_put(params);
> +
>  	if (params->tmpl)
>  		nf_conntrack_put(&params->tmpl->ct_general);
>  	kfree(params);
> @@ -730,6 +836,10 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
>  	if (err)
>  		goto cleanup;
>  
> +	err = tcf_ct_flow_table_get(params);
> +	if (err)
> +		goto cleanup;
> +
>  	spin_lock_bh(&c->tcf_lock);
>  	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
>  	params = rcu_replace_pointer(c->params, params,
> @@ -974,12 +1084,34 @@ static void __net_exit ct_exit_net(struct list_head *net_list)
>  
>  static int __init ct_init_module(void)
>  {
> -	return tcf_register_action(&act_ct_ops, &ct_net_ops);
> +	int err;
> +
> +	act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
> +	if (!act_ct_wq)
> +		return -ENOMEM;
> +
> +	err = tcf_ct_flow_tables_init();
> +	if (err)
> +		goto err_tbl_init;
> +
> +	err = tcf_register_action(&act_ct_ops, &ct_net_ops);
> +	if (err)
> +		goto err_register;
> +
> +	return 0;
> +
> +err_tbl_init:
> +	destroy_workqueue(act_ct_wq);
> +err_register:
> +	tcf_ct_flow_tables_uninit();
> +	return err;
>  }
>  
>  static void __exit ct_cleanup_module(void)
>  {
>  	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
> +	tcf_ct_flow_tables_uninit();
> +	destroy_workqueue(act_ct_wq);
>  }
>  
>  module_init(ct_init_module);
>
Eric Dumazet March 7, 2020, 8:53 p.m. UTC | #3
On 3/7/20 12:12 PM, Eric Dumazet wrote:
> 
> 
> On 3/3/20 7:57 AM, Paul Blakey wrote:
>> Use the NF flow tables infrastructure for CT offload.
>>
>> Create a nf flow table per zone.
>>
>> Next patches will add FT entries to this table, and do
>> the software offload.
>>
>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>> Reviewed-by: Jiri Pirko <jiri@mellanox.com>
>> ---
>>   v4->v5:
>>     Added reviewed by Jiri, thanks!
>>   v3->v4:
>>     Alloc GFP_ATOMIC
>>   v2->v3:
>>     Ditch re-locking to alloc, and use atomic allocation
>>   v1->v2:
>>     Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
>>     Free ft on last tc act instance instead of last instance + last offloaded tuple,
>>     this removes cleanup cb and netfilter patches, and is simpler
>>     Removed accidental mlx5/core/en_tc.c change
>>     Removed reviewed by Jiri - patch changed
>>
>> +	err = nf_flow_table_init(&ct_ft->nf_ft);
> 
> This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep)
> 
> Since you still hold zones_lock spinlock, a splat should occur.
> 
> "BUG: sleeping function called from invalid context in  ..."
> 
> DEBUG_ATOMIC_SLEEP=y is your friend.
> 
> And it is always a good thing to make sure a patch does not trigger a lockdep splat
> 
> CONFIG_PROVE_LOCKING=y

Also abusing a spinlock and GFP_ATOMIC allocations in control path is highly discouraged.

I can not test the following fix, any objections before I submit this officially ?

diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 23eba61f0f819212a3522c3c63b938d0b8d997e2..3d9e678d7d5336f1746035745b091bea0dcb5fdd 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -35,15 +35,15 @@
 
 static struct workqueue_struct *act_ct_wq;
 static struct rhashtable zones_ht;
-static DEFINE_SPINLOCK(zones_lock);
+static DEFINE_MUTEX(zones_mutex);
 
 struct tcf_ct_flow_table {
        struct rhash_head node; /* In zones tables */
 
        struct rcu_work rwork;
        struct nf_flowtable nf_ft;
+       refcount_t ref;
        u16 zone;
-       u32 ref;
 
        bool dying;
 };
@@ -64,14 +64,15 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
        struct tcf_ct_flow_table *ct_ft;
        int err = -ENOMEM;
 
-       spin_lock_bh(&zones_lock);
+       mutex_lock(&zones_mutex);
        ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
-       if (ct_ft)
-               goto take_ref;
+       if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
+               goto out_unlock;
 
-       ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC);
+       ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
        if (!ct_ft)
                goto err_alloc;
+       refcount_set(&ct_ft->ref, 1);
 
        ct_ft->zone = params->zone;
        err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
@@ -84,10 +85,9 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
                goto err_init;
 
        __module_get(THIS_MODULE);
-take_ref:
+out_unlock:
        params->ct_ft = ct_ft;
-       ct_ft->ref++;
-       spin_unlock_bh(&zones_lock);
+       mutex_unlock(&zones_mutex);
 
        return 0;
 
@@ -96,7 +96,7 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
 err_insert:
        kfree(ct_ft);
 err_alloc:
-       spin_unlock_bh(&zones_lock);
+       mutex_unlock(&zones_mutex);
        return err;
 }
 
@@ -116,13 +116,11 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
 {
        struct tcf_ct_flow_table *ct_ft = params->ct_ft;
 
-       spin_lock_bh(&zones_lock);
-       if (--params->ct_ft->ref == 0) {
+       if (refcount_dec_and_test(&params->ct_ft->ref)) {
                rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
                INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
                queue_rcu_work(act_ct_wq, &ct_ft->rwork);
        }
-       spin_unlock_bh(&zones_lock);
 }
 
 static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
Paul Blakey March 8, 2020, 8:11 a.m. UTC | #4
iirc I did the spin lock bh because we can be called from queue work rcu handler , so I wanted to disable soft irq.

I got a possible deadlock splat for that.



On 3/7/2020 10:53 PM, Eric Dumazet wrote:

>
> On 3/7/20 12:12 PM, Eric Dumazet wrote:
>>
>> On 3/3/20 7:57 AM, Paul Blakey wrote:
>>> Use the NF flow tables infrastructure for CT offload.
>>>
>>> Create a nf flow table per zone.
>>>
>>> Next patches will add FT entries to this table, and do
>>> the software offload.
>>>
>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>> Reviewed-by: Jiri Pirko <jiri@mellanox.com>
>>> ---
>>>   v4->v5:
>>>     Added reviewed by Jiri, thanks!
>>>   v3->v4:
>>>     Alloc GFP_ATOMIC
>>>   v2->v3:
>>>     Ditch re-locking to alloc, and use atomic allocation
>>>   v1->v2:
>>>     Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
>>>     Free ft on last tc act instance instead of last instance + last offloaded tuple,
>>>     this removes cleanup cb and netfilter patches, and is simpler
>>>     Removed accidental mlx5/core/en_tc.c change
>>>     Removed reviewed by Jiri - patch changed
>>>
>>> +	err = nf_flow_table_init(&ct_ft->nf_ft);
>> This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep)
>>
>> Since you still hold zones_lock spinlock, a splat should occur.
>>
>> "BUG: sleeping function called from invalid context in  ..."
>>
>> DEBUG_ATOMIC_SLEEP=y is your friend.
>>
>> And it is always a good thing to make sure a patch does not trigger a lockdep splat
>>
>> CONFIG_PROVE_LOCKING=y
> Also abusing a spinlock and GFP_ATOMIC allocations in control path is highly discouraged.
>
> I can not test the following fix, any objections before I submit this officially ?
>
> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
> index 23eba61f0f819212a3522c3c63b938d0b8d997e2..3d9e678d7d5336f1746035745b091bea0dcb5fdd 100644
> --- a/net/sched/act_ct.c
> +++ b/net/sched/act_ct.c
> @@ -35,15 +35,15 @@
>  
>  static struct workqueue_struct *act_ct_wq;
>  static struct rhashtable zones_ht;
> -static DEFINE_SPINLOCK(zones_lock);
> +static DEFINE_MUTEX(zones_mutex);
>  
>  struct tcf_ct_flow_table {
>         struct rhash_head node; /* In zones tables */
>  
>         struct rcu_work rwork;
>         struct nf_flowtable nf_ft;
> +       refcount_t ref;
>         u16 zone;
> -       u32 ref;
>  
>         bool dying;
>  };
> @@ -64,14 +64,15 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>         struct tcf_ct_flow_table *ct_ft;
>         int err = -ENOMEM;
>  
> -       spin_lock_bh(&zones_lock);
> +       mutex_lock(&zones_mutex);
>         ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
> -       if (ct_ft)
> -               goto take_ref;
> +       if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
> +               goto out_unlock;
>  
> -       ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC);
> +       ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
>         if (!ct_ft)
>                 goto err_alloc;
> +       refcount_set(&ct_ft->ref, 1);
>  
>         ct_ft->zone = params->zone;
>         err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
> @@ -84,10 +85,9 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>                 goto err_init;
>  
>         __module_get(THIS_MODULE);
> -take_ref:
> +out_unlock:
>         params->ct_ft = ct_ft;
> -       ct_ft->ref++;
> -       spin_unlock_bh(&zones_lock);
> +       mutex_unlock(&zones_mutex);
>  
>         return 0;
>  
> @@ -96,7 +96,7 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>  err_insert:
>         kfree(ct_ft);
>  err_alloc:
> -       spin_unlock_bh(&zones_lock);
> +       mutex_unlock(&zones_mutex);
>         return err;
>  }
>  
> @@ -116,13 +116,11 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
>  {
>         struct tcf_ct_flow_table *ct_ft = params->ct_ft;
>  
> -       spin_lock_bh(&zones_lock);
> -       if (--params->ct_ft->ref == 0) {
> +       if (refcount_dec_and_test(&params->ct_ft->ref)) {
>                 rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
>                 INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
>                 queue_rcu_work(act_ct_wq, &ct_ft->rwork);
>         }
> -       spin_unlock_bh(&zones_lock);
>  }
>  
>  static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
>
Paul Blakey March 8, 2020, 8:15 a.m. UTC | #5
On 3/8/2020 10:11 AM, Paul Blakey wrote:

> iirc I did the spin lock bh because we can be called from queue work rcu handler , so I wanted to disable soft irq.
>
> I got a possible deadlock splat for that.

Here I meant this call rcu:

static void tcf_ct_cleanup(struct tc_action *a)
{
>-------struct tcf_ct_params *params;
>-------struct tcf_ct *c = to_ct(a);

>-------params = rcu_dereference_protected(c->params, 1);
>-------if (params)
>------->-------call_rcu(&params->rcu, tcf_ct_params_free);
}

static void tcf_ct_params_free(struct rcu_head *head)
{
>-------struct tcf_ct_params *params = container_of(head,
>------->------->------->------->------->-------    struct tcf_ct_params, rcu);

>-------tcf_ct_flow_table_put(params);

...


>
>
> On 3/7/2020 10:53 PM, Eric Dumazet wrote:
>
>> On 3/7/20 12:12 PM, Eric Dumazet wrote:
>>> On 3/3/20 7:57 AM, Paul Blakey wrote:
>>>> Use the NF flow tables infrastructure for CT offload.
>>>>
>>>> Create a nf flow table per zone.
>>>>
>>>> Next patches will add FT entries to this table, and do
>>>> the software offload.
>>>>
>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>>> Reviewed-by: Jiri Pirko <jiri@mellanox.com>
>>>> ---
>>>>   v4->v5:
>>>>     Added reviewed by Jiri, thanks!
>>>>   v3->v4:
>>>>     Alloc GFP_ATOMIC
>>>>   v2->v3:
>>>>     Ditch re-locking to alloc, and use atomic allocation
>>>>   v1->v2:
>>>>     Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
>>>>     Free ft on last tc act instance instead of last instance + last offloaded tuple,
>>>>     this removes cleanup cb and netfilter patches, and is simpler
>>>>     Removed accidental mlx5/core/en_tc.c change
>>>>     Removed reviewed by Jiri - patch changed
>>>>
>>>> +	err = nf_flow_table_init(&ct_ft->nf_ft);
>>> This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep)
>>>
>>> Since you still hold zones_lock spinlock, a splat should occur.
>>>
>>> "BUG: sleeping function called from invalid context in  ..."
>>>
>>> DEBUG_ATOMIC_SLEEP=y is your friend.
>>>
>>> And it is always a good thing to make sure a patch does not trigger a lockdep splat
>>>
>>> CONFIG_PROVE_LOCKING=y
>> Also abusing a spinlock and GFP_ATOMIC allocations in control path is highly discouraged.
>>
>> I can not test the following fix, any objections before I submit this officially ?
>>
>> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
>> index 23eba61f0f819212a3522c3c63b938d0b8d997e2..3d9e678d7d5336f1746035745b091bea0dcb5fdd 100644
>> --- a/net/sched/act_ct.c
>> +++ b/net/sched/act_ct.c
>> @@ -35,15 +35,15 @@
>>  
>>  static struct workqueue_struct *act_ct_wq;
>>  static struct rhashtable zones_ht;
>> -static DEFINE_SPINLOCK(zones_lock);
>> +static DEFINE_MUTEX(zones_mutex);
>>  
>>  struct tcf_ct_flow_table {
>>         struct rhash_head node; /* In zones tables */
>>  
>>         struct rcu_work rwork;
>>         struct nf_flowtable nf_ft;
>> +       refcount_t ref;
>>         u16 zone;
>> -       u32 ref;
>>  
>>         bool dying;
>>  };
>> @@ -64,14 +64,15 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>         struct tcf_ct_flow_table *ct_ft;
>>         int err = -ENOMEM;
>>  
>> -       spin_lock_bh(&zones_lock);
>> +       mutex_lock(&zones_mutex);
>>         ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
>> -       if (ct_ft)
>> -               goto take_ref;
>> +       if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
>> +               goto out_unlock;
>>  
>> -       ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC);
>> +       ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
>>         if (!ct_ft)
>>                 goto err_alloc;
>> +       refcount_set(&ct_ft->ref, 1);
>>  
>>         ct_ft->zone = params->zone;
>>         err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
>> @@ -84,10 +85,9 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>                 goto err_init;
>>  
>>         __module_get(THIS_MODULE);
>> -take_ref:
>> +out_unlock:
>>         params->ct_ft = ct_ft;
>> -       ct_ft->ref++;
>> -       spin_unlock_bh(&zones_lock);
>> +       mutex_unlock(&zones_mutex);
>>  
>>         return 0;
>>  
>> @@ -96,7 +96,7 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>  err_insert:
>>         kfree(ct_ft);
>>  err_alloc:
>> -       spin_unlock_bh(&zones_lock);
>> +       mutex_unlock(&zones_mutex);
>>         return err;
>>  }
>>  
>> @@ -116,13 +116,11 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
>>  {
>>         struct tcf_ct_flow_table *ct_ft = params->ct_ft;
>>  
>> -       spin_lock_bh(&zones_lock);
>> -       if (--params->ct_ft->ref == 0) {
>> +       if (refcount_dec_and_test(&params->ct_ft->ref)) {
>>                 rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
>>                 INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
>>                 queue_rcu_work(act_ct_wq, &ct_ft->rwork);
>>         }
>> -       spin_unlock_bh(&zones_lock);
>>  }
>>  
>>  static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
>>
Eric Dumazet March 8, 2020, 8:42 p.m. UTC | #6
On 3/8/20 12:15 AM, Paul Blakey wrote:
> On 3/8/2020 10:11 AM, Paul Blakey wrote:
> 
>> iirc I did the spin lock bh because we can be called from queue work rcu handler , so I wanted to disable soft irq.
>>
>> I got a possible deadlock splat for that.
> 
> Here I meant this call rcu:
> 
> static void tcf_ct_cleanup(struct tc_action *a)
> {
>> -------struct tcf_ct_params *params;
>> -------struct tcf_ct *c = to_ct(a);
> 
>> -------params = rcu_dereference_protected(c->params, 1);
>> -------if (params)
>> ------->-------call_rcu(&params->rcu, tcf_ct_params_free);
> }
> 

Yes, understood, but to solve this problem we had many other choices,
and still keeping GFP_KERNEL allocations and a mutex for control path.

Have you read my patch ?

By not even trying to get a spinlock in tcf_ct_flow_table_put(),
and instead use a refcount for ->ref, we avoid having this issue in the first place.

static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
{
        struct tcf_ct_flow_table *ct_ft = params->ct_ft;

        if (refcount_dec_and_test(&params->ct_ft->ref)) {
                rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
                INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
                queue_rcu_work(act_ct_wq, &ct_ft->rwork);
        }
}

> static void tcf_ct_params_free(struct rcu_head *head)
> {
>> -------struct tcf_ct_params *params = container_of(head,
>> ------->------->------->------->------->-------    struct tcf_ct_params, rcu);
> 
>> -------tcf_ct_flow_table_put(params);
> 
> ...
> 
> 
>>
>>
>> On 3/7/2020 10:53 PM, Eric Dumazet wrote:
>>
>>> On 3/7/20 12:12 PM, Eric Dumazet wrote:
>>>> On 3/3/20 7:57 AM, Paul Blakey wrote:
>>>>> Use the NF flow tables infrastructure for CT offload.
>>>>>
>>>>> Create a nf flow table per zone.
>>>>>
>>>>> Next patches will add FT entries to this table, and do
>>>>> the software offload.
>>>>>
>>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>>>> Reviewed-by: Jiri Pirko <jiri@mellanox.com>
>>>>> ---
>>>>>   v4->v5:
>>>>>     Added reviewed by Jiri, thanks!
>>>>>   v3->v4:
>>>>>     Alloc GFP_ATOMIC
>>>>>   v2->v3:
>>>>>     Ditch re-locking to alloc, and use atomic allocation
>>>>>   v1->v2:
>>>>>     Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
>>>>>     Free ft on last tc act instance instead of last instance + last offloaded tuple,
>>>>>     this removes cleanup cb and netfilter patches, and is simpler
>>>>>     Removed accidental mlx5/core/en_tc.c change
>>>>>     Removed reviewed by Jiri - patch changed
>>>>>
>>>>> +	err = nf_flow_table_init(&ct_ft->nf_ft);
>>>> This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep)
>>>>
>>>> Since you still hold zones_lock spinlock, a splat should occur.
>>>>
>>>> "BUG: sleeping function called from invalid context in  ..."
>>>>
>>>> DEBUG_ATOMIC_SLEEP=y is your friend.
>>>>
>>>> And it is always a good thing to make sure a patch does not trigger a lockdep splat
>>>>
>>>> CONFIG_PROVE_LOCKING=y
>>> Also abusing a spinlock and GFP_ATOMIC allocations in control path is highly discouraged.
>>>
>>> I can not test the following fix, any objections before I submit this officially ?
>>>
>>> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
>>> index 23eba61f0f819212a3522c3c63b938d0b8d997e2..3d9e678d7d5336f1746035745b091bea0dcb5fdd 100644
>>> --- a/net/sched/act_ct.c
>>> +++ b/net/sched/act_ct.c
>>> @@ -35,15 +35,15 @@
>>>  
>>>  static struct workqueue_struct *act_ct_wq;
>>>  static struct rhashtable zones_ht;
>>> -static DEFINE_SPINLOCK(zones_lock);
>>> +static DEFINE_MUTEX(zones_mutex);
>>>  
>>>  struct tcf_ct_flow_table {
>>>         struct rhash_head node; /* In zones tables */
>>>  
>>>         struct rcu_work rwork;
>>>         struct nf_flowtable nf_ft;
>>> +       refcount_t ref;
>>>         u16 zone;
>>> -       u32 ref;
>>>  
>>>         bool dying;
>>>  };
>>> @@ -64,14 +64,15 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>>         struct tcf_ct_flow_table *ct_ft;
>>>         int err = -ENOMEM;
>>>  
>>> -       spin_lock_bh(&zones_lock);
>>> +       mutex_lock(&zones_mutex);
>>>         ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
>>> -       if (ct_ft)
>>> -               goto take_ref;
>>> +       if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
>>> +               goto out_unlock;
>>>  
>>> -       ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC);
>>> +       ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
>>>         if (!ct_ft)
>>>                 goto err_alloc;
>>> +       refcount_set(&ct_ft->ref, 1);
>>>  
>>>         ct_ft->zone = params->zone;
>>>         err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
>>> @@ -84,10 +85,9 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>>                 goto err_init;
>>>  
>>>         __module_get(THIS_MODULE);
>>> -take_ref:
>>> +out_unlock:
>>>         params->ct_ft = ct_ft;
>>> -       ct_ft->ref++;
>>> -       spin_unlock_bh(&zones_lock);
>>> +       mutex_unlock(&zones_mutex);
>>>  
>>>         return 0;
>>>  
>>> @@ -96,7 +96,7 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>>  err_insert:
>>>         kfree(ct_ft);
>>>  err_alloc:
>>> -       spin_unlock_bh(&zones_lock);
>>> +       mutex_unlock(&zones_mutex);
>>>         return err;
>>>  }
>>>  
>>> @@ -116,13 +116,11 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
>>>  {
>>>         struct tcf_ct_flow_table *ct_ft = params->ct_ft;
>>>  
>>> -       spin_lock_bh(&zones_lock);
>>> -       if (--params->ct_ft->ref == 0) {
>>> +       if (refcount_dec_and_test(&params->ct_ft->ref)) {
>>>                 rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
>>>                 INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
>>>                 queue_rcu_work(act_ct_wq, &ct_ft->rwork);
>>>         }
>>> -       spin_unlock_bh(&zones_lock);
>>>  }
>>>  
>>>  static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
>>>
Paul Blakey March 9, 2020, 8:02 a.m. UTC | #7
On 3/8/2020 10:42 PM, Eric Dumazet wrote:
>
> On 3/8/20 12:15 AM, Paul Blakey wrote:
>> On 3/8/2020 10:11 AM, Paul Blakey wrote:
>>
>>> iirc I did the spin lock bh because we can be called from queue work rcu handler , so I wanted to disable soft irq.
>>>
>>> I got a possible deadlock splat for that.
>> Here I meant this call rcu:
>>
>> static void tcf_ct_cleanup(struct tc_action *a)
>> {
>>> -------struct tcf_ct_params *params;
>>> -------struct tcf_ct *c = to_ct(a);
>>> -------params = rcu_dereference_protected(c->params, 1);
>>> -------if (params)
>>> ------->-------call_rcu(&params->rcu, tcf_ct_params_free);
>> }
>>
> Yes, understood, but to solve this problem we had many other choices,
> and still keeping GFP_KERNEL allocations and a mutex for control path.
>
> Have you read my patch ?
>
> By not even trying to get a spinlock in tcf_ct_flow_table_put(),
> and instead use a refcount for ->ref, we avoid having this issue in the first place.
>
> static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
> {
>         struct tcf_ct_flow_table *ct_ft = params->ct_ft;
>
>         if (refcount_dec_and_test(&params->ct_ft->ref)) {
>                 rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
>                 INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
>                 queue_rcu_work(act_ct_wq, &ct_ft->rwork);
>         }
> }
Sorry missed that, thanks for the fix.
>> static void tcf_ct_params_free(struct rcu_head *head)
>> {
>>> -------struct tcf_ct_params *params = container_of(head,
>>> ------->------->------->------->------->-------    struct tcf_ct_params, rcu);
>>> -------tcf_ct_flow_table_put(params);
>> ...
>>
>>
>>>
>>> On 3/7/2020 10:53 PM, Eric Dumazet wrote:
>>>
>>>> On 3/7/20 12:12 PM, Eric Dumazet wrote:
>>>>> On 3/3/20 7:57 AM, Paul Blakey wrote:
>>>>>> Use the NF flow tables infrastructure for CT offload.
>>>>>>
>>>>>> Create a nf flow table per zone.
>>>>>>
>>>>>> Next patches will add FT entries to this table, and do
>>>>>> the software offload.
>>>>>>
>>>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>>>>> Reviewed-by: Jiri Pirko <jiri@mellanox.com>
>>>>>> ---
>>>>>>   v4->v5:
>>>>>>     Added reviewed by Jiri, thanks!
>>>>>>   v3->v4:
>>>>>>     Alloc GFP_ATOMIC
>>>>>>   v2->v3:
>>>>>>     Ditch re-locking to alloc, and use atomic allocation
>>>>>>   v1->v2:
>>>>>>     Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
>>>>>>     Free ft on last tc act instance instead of last instance + last offloaded tuple,
>>>>>>     this removes cleanup cb and netfilter patches, and is simpler
>>>>>>     Removed accidental mlx5/core/en_tc.c change
>>>>>>     Removed reviewed by Jiri - patch changed
>>>>>>
>>>>>> +	err = nf_flow_table_init(&ct_ft->nf_ft);
>>>>> This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep)
>>>>>
>>>>> Since you still hold zones_lock spinlock, a splat should occur.
>>>>>
>>>>> "BUG: sleeping function called from invalid context in  ..."
>>>>>
>>>>> DEBUG_ATOMIC_SLEEP=y is your friend.
>>>>>
>>>>> And it is always a good thing to make sure a patch does not trigger a lockdep splat
>>>>>
>>>>> CONFIG_PROVE_LOCKING=y
>>>> Also abusing a spinlock and GFP_ATOMIC allocations in control path is highly discouraged.
>>>>
>>>> I can not test the following fix, any objections before I submit this officially ?
>>>>
>>>> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
>>>> index 23eba61f0f819212a3522c3c63b938d0b8d997e2..3d9e678d7d5336f1746035745b091bea0dcb5fdd 100644
>>>> --- a/net/sched/act_ct.c
>>>> +++ b/net/sched/act_ct.c
>>>> @@ -35,15 +35,15 @@
>>>>  
>>>>  static struct workqueue_struct *act_ct_wq;
>>>>  static struct rhashtable zones_ht;
>>>> -static DEFINE_SPINLOCK(zones_lock);
>>>> +static DEFINE_MUTEX(zones_mutex);
>>>>  
>>>>  struct tcf_ct_flow_table {
>>>>         struct rhash_head node; /* In zones tables */
>>>>  
>>>>         struct rcu_work rwork;
>>>>         struct nf_flowtable nf_ft;
>>>> +       refcount_t ref;
>>>>         u16 zone;
>>>> -       u32 ref;
>>>>  
>>>>         bool dying;
>>>>  };
>>>> @@ -64,14 +64,15 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>>>         struct tcf_ct_flow_table *ct_ft;
>>>>         int err = -ENOMEM;
>>>>  
>>>> -       spin_lock_bh(&zones_lock);
>>>> +       mutex_lock(&zones_mutex);
>>>>         ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
>>>> -       if (ct_ft)
>>>> -               goto take_ref;
>>>> +       if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
>>>> +               goto out_unlock;
>>>>  
>>>> -       ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC);
>>>> +       ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
>>>>         if (!ct_ft)
>>>>                 goto err_alloc;
>>>> +       refcount_set(&ct_ft->ref, 1);
>>>>  
>>>>         ct_ft->zone = params->zone;
>>>>         err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
>>>> @@ -84,10 +85,9 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>>>                 goto err_init;
>>>>  
>>>>         __module_get(THIS_MODULE);
>>>> -take_ref:
>>>> +out_unlock:
>>>>         params->ct_ft = ct_ft;
>>>> -       ct_ft->ref++;
>>>> -       spin_unlock_bh(&zones_lock);
>>>> +       mutex_unlock(&zones_mutex);
>>>>  
>>>>         return 0;
>>>>  
>>>> @@ -96,7 +96,7 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>>>  err_insert:
>>>>         kfree(ct_ft);
>>>>  err_alloc:
>>>> -       spin_unlock_bh(&zones_lock);
>>>> +       mutex_unlock(&zones_mutex);
>>>>         return err;
>>>>  }
>>>>  
>>>> @@ -116,13 +116,11 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
>>>>  {
>>>>         struct tcf_ct_flow_table *ct_ft = params->ct_ft;
>>>>  
>>>> -       spin_lock_bh(&zones_lock);
>>>> -       if (--params->ct_ft->ref == 0) {
>>>> +       if (refcount_dec_and_test(&params->ct_ft->ref)) {
>>>>                 rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
>>>>                 INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
>>>>                 queue_rcu_work(act_ct_wq, &ct_ft->rwork);
>>>>         }
>>>> -       spin_unlock_bh(&zones_lock);
>>>>  }
>>>>  
>>>>  static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
>>>>
diff mbox series

Patch

diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
index a8b1564..cf3492e 100644
--- a/include/net/tc_act/tc_ct.h
+++ b/include/net/tc_act/tc_ct.h
@@ -25,6 +25,8 @@  struct tcf_ct_params {
 	u16 ct_action;
 
 	struct rcu_head rcu;
+
+	struct tcf_ct_flow_table *ct_ft;
 };
 
 struct tcf_ct {
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index edde0e5..bfbefb7 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -972,7 +972,7 @@  config NET_ACT_TUNNEL_KEY
 
 config NET_ACT_CT
 	tristate "connection tracking tc action"
-	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
+	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE
 	help
 	  Say Y here to allow sending the packets to conntrack module.
 
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index f685c0d..3321087 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -15,6 +15,7 @@ 
 #include <linux/pkt_cls.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
+#include <linux/rhashtable.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
@@ -24,6 +25,7 @@ 
 #include <uapi/linux/tc_act/tc_ct.h>
 #include <net/tc_act/tc_ct.h>
 
+#include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_zones.h>
@@ -31,6 +33,108 @@ 
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 #include <uapi/linux/netfilter/nf_nat.h>
 
+static struct workqueue_struct *act_ct_wq;
+static struct rhashtable zones_ht;
+static DEFINE_SPINLOCK(zones_lock);
+
+struct tcf_ct_flow_table {
+	struct rhash_head node; /* In zones tables */
+
+	struct rcu_work rwork;
+	struct nf_flowtable nf_ft;
+	u16 zone;
+	u32 ref;
+
+	bool dying;
+};
+
+static const struct rhashtable_params zones_params = {
+	.head_offset = offsetof(struct tcf_ct_flow_table, node),
+	.key_offset = offsetof(struct tcf_ct_flow_table, zone),
+	.key_len = sizeof_field(struct tcf_ct_flow_table, zone),
+	.automatic_shrinking = true,
+};
+
+static struct nf_flowtable_type flowtable_ct = {
+	.owner		= THIS_MODULE,
+};
+
+static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
+{
+	struct tcf_ct_flow_table *ct_ft;
+	int err = -ENOMEM;
+
+	spin_lock_bh(&zones_lock);
+	ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
+	if (ct_ft)
+		goto take_ref;
+
+	ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC);
+	if (!ct_ft)
+		goto err_alloc;
+
+	ct_ft->zone = params->zone;
+	err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
+	if (err)
+		goto err_insert;
+
+	ct_ft->nf_ft.type = &flowtable_ct;
+	err = nf_flow_table_init(&ct_ft->nf_ft);
+	if (err)
+		goto err_init;
+
+	__module_get(THIS_MODULE);
+take_ref:
+	params->ct_ft = ct_ft;
+	ct_ft->ref++;
+	spin_unlock_bh(&zones_lock);
+
+	return 0;
+
+err_init:
+	rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
+err_insert:
+	kfree(ct_ft);
+err_alloc:
+	spin_unlock_bh(&zones_lock);
+	return err;
+}
+
+static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
+{
+	struct tcf_ct_flow_table *ct_ft;
+
+	ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
+			     rwork);
+	nf_flow_table_free(&ct_ft->nf_ft);
+	kfree(ct_ft);
+
+	module_put(THIS_MODULE);
+}
+
+static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
+{
+	struct tcf_ct_flow_table *ct_ft = params->ct_ft;
+
+	spin_lock_bh(&zones_lock);
+	if (--params->ct_ft->ref == 0) {
+		rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
+		INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
+		queue_rcu_work(act_ct_wq, &ct_ft->rwork);
+	}
+	spin_unlock_bh(&zones_lock);
+}
+
+static int tcf_ct_flow_tables_init(void)
+{
+	return rhashtable_init(&zones_ht, &zones_params);
+}
+
+static void tcf_ct_flow_tables_uninit(void)
+{
+	rhashtable_destroy(&zones_ht);
+}
+
 static struct tc_action_ops act_ct_ops;
 static unsigned int ct_net_id;
 
@@ -207,6 +311,8 @@  static void tcf_ct_params_free(struct rcu_head *head)
 	struct tcf_ct_params *params = container_of(head,
 						    struct tcf_ct_params, rcu);
 
+	tcf_ct_flow_table_put(params);
+
 	if (params->tmpl)
 		nf_conntrack_put(&params->tmpl->ct_general);
 	kfree(params);
@@ -730,6 +836,10 @@  static int tcf_ct_init(struct net *net, struct nlattr *nla,
 	if (err)
 		goto cleanup;
 
+	err = tcf_ct_flow_table_get(params);
+	if (err)
+		goto cleanup;
+
 	spin_lock_bh(&c->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	params = rcu_replace_pointer(c->params, params,
@@ -974,12 +1084,34 @@  static void __net_exit ct_exit_net(struct list_head *net_list)
 
 static int __init ct_init_module(void)
 {
-	return tcf_register_action(&act_ct_ops, &ct_net_ops);
+	int err;
+
+	act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
+	if (!act_ct_wq)
+		return -ENOMEM;
+
+	err = tcf_ct_flow_tables_init();
+	if (err)
+		goto err_tbl_init;
+
+	err = tcf_register_action(&act_ct_ops, &ct_net_ops);
+	if (err)
+		goto err_register;
+
+	return 0;
+
+err_tbl_init:
+	destroy_workqueue(act_ct_wq);
+err_register:
+	tcf_ct_flow_tables_uninit();
+	return err;
 }
 
 static void __exit ct_cleanup_module(void)
 {
 	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
+	tcf_ct_flow_tables_uninit();
+	destroy_workqueue(act_ct_wq);
 }
 
 module_init(ct_init_module);