diff mbox series

[nf] netfilter: nf_tables: do not defer rule destruction via call_rcu

Message ID 20241207111459.7191-1-fw@strlen.de
State Accepted
Headers show
Series [nf] netfilter: nf_tables: do not defer rule destruction via call_rcu | expand

Commit Message

Florian Westphal Dec. 7, 2024, 11:14 a.m. UTC
nf_tables_chain_destroy can sleep, it can't be used from call_rcu
callbacks.

Moreover, nf_tables_rule_release() is only safe for error unwinding,
while transaction mutex is held and the to-be-desroyed rule was not
exposed to either dataplane or dumps, as it deactives+frees without
the required synchronize_rcu() in-between.

nft_rule_expr_deactivate() callbacks will change ->use counters
of other chains/sets, see e.g. nft_lookup .deactivate callback, these
must be serialized via transaction mutex.

Also add a few lockdep asserts to make this more explicit.

Calling synchronize_rcu() isn't ideal, but fixing this without is hard
and way more intrusive.  As-is, we can get:

WARNING: .. net/netfilter/nf_tables_api.c:5515 nft_set_destroy+0x..
Workqueue: events nf_tables_trans_destroy_work
RIP: 0010:nft_set_destroy+0x3fe/0x5c0
Call Trace:
 <TASK>
 nf_tables_trans_destroy_work+0x6b7/0xad0
 process_one_work+0x64a/0xce0
 worker_thread+0x613/0x10d0

In case the synchronize_rcu becomes an issue, we can explore alternatives.

One way would be to allocate nft_trans_rule objects + one nft_trans_chain
object, deactivate the rules + the chain and then defer the freeing to the
nft destroy workqueue.  We'd still need to keep the synchronize_rcu path as
a fallback to handle -ENOMEM corner cases though.

Reported-by: syzbot+b26935466701e56cfdc2@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/67478d92.050a0220.253251.0062.GAE@google.com/T/
Fixes: c03d278fdf35 ("netfilter: nf_tables: wait for rcu grace period on net_device removal")
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nf_tables_api.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

Comments

Pablo Neira Ayuso Dec. 9, 2024, 9:27 p.m. UTC | #1
Hi Florian,

Thanks a lot for your quick fix.

On Sat, Dec 07, 2024 at 12:14:48PM +0100, Florian Westphal wrote:
> nf_tables_chain_destroy can sleep, it can't be used from call_rcu
> callbacks.
> 
> Moreover, nf_tables_rule_release() is only safe for error unwinding,
> while transaction mutex is held and the to-be-desroyed rule was not
> exposed to either dataplane or dumps, as it deactives+frees without
> the required synchronize_rcu() in-between.
> 
> nft_rule_expr_deactivate() callbacks will change ->use counters
> of other chains/sets, see e.g. nft_lookup .deactivate callback, these
> must be serialized via transaction mutex.
> 
> Also add a few lockdep asserts to make this more explicit.
> 
> Calling synchronize_rcu() isn't ideal, but fixing this without is hard
> and way more intrusive.  As-is, we can get:
> 
> WARNING: .. net/netfilter/nf_tables_api.c:5515 nft_set_destroy+0x..

Right, rhash needs this, that is why there is a workqueue to release
objects from nftables commit path.

> Workqueue: events nf_tables_trans_destroy_work
> RIP: 0010:nft_set_destroy+0x3fe/0x5c0
> Call Trace:
>  <TASK>
>  nf_tables_trans_destroy_work+0x6b7/0xad0
>  process_one_work+0x64a/0xce0
>  worker_thread+0x613/0x10d0
> 
> In case the synchronize_rcu becomes an issue, we can explore alternatives.
> 
> One way would be to allocate nft_trans_rule objects + one nft_trans_chain
> object, deactivate the rules + the chain and then defer the freeing to the
> nft destroy workqueue.  We'd still need to keep the synchronize_rcu path as
> a fallback to handle -ENOMEM corner cases though.

I think it can be done _without_ nft_trans objects.

Since the commit mutex is held in this netdev event path: Remove this
basechain, deactivate rules and add basechain to global list protected
with spinlock, it invokes worker. Then, worker zaps this list
basechains, it calls synchronize_rcu() and it destroys rules and then
basechain. No memory allocations needed in this case?

Thanks.

> Reported-by: syzbot+b26935466701e56cfdc2@syzkaller.appspotmail.com
> Closes: https://lore.kernel.org/all/67478d92.050a0220.253251.0062.GAE@google.com/T/
> Fixes: c03d278fdf35 ("netfilter: nf_tables: wait for rcu grace period on net_device removal")
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
>  net/netfilter/nf_tables_api.c | 31 +++++++++++++++----------------
>  1 file changed, 15 insertions(+), 16 deletions(-)
> 
> diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
> index 21b6f7410a1f..a3b6b6b32f72 100644
> --- a/net/netfilter/nf_tables_api.c
> +++ b/net/netfilter/nf_tables_api.c
> @@ -3987,8 +3987,11 @@ void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule)
>  	kfree(rule);
>  }
>  
> +/* can only be used if rule is no longer visible to dumps */
>  static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule)
>  {
> +	lockdep_commit_lock_is_held(ctx->net);
> +
>  	nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE);
>  	nf_tables_rule_destroy(ctx, rule);
>  }
> @@ -5757,6 +5760,8 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
>  			      struct nft_set_binding *binding,
>  			      enum nft_trans_phase phase)
>  {
> +	lockdep_commit_lock_is_held(ctx->net);
> +
>  	switch (phase) {
>  	case NFT_TRANS_PREPARE_ERROR:
>  		nft_set_trans_unbind(ctx, set);
> @@ -11695,19 +11700,6 @@ static void __nft_release_basechain_now(struct nft_ctx *ctx)
>  	nf_tables_chain_destroy(ctx->chain);
>  }
>  
> -static void nft_release_basechain_rcu(struct rcu_head *head)
> -{
> -	struct nft_chain *chain = container_of(head, struct nft_chain, rcu_head);
> -	struct nft_ctx ctx = {
> -		.family	= chain->table->family,
> -		.chain	= chain,
> -		.net	= read_pnet(&chain->table->net),
> -	};
> -
> -	__nft_release_basechain_now(&ctx);
> -	put_net(ctx.net);
> -}
> -
>  int __nft_release_basechain(struct nft_ctx *ctx)
>  {
>  	struct nft_rule *rule;
> @@ -11722,11 +11714,18 @@ int __nft_release_basechain(struct nft_ctx *ctx)
>  	nft_chain_del(ctx->chain);
>  	nft_use_dec(&ctx->table->use);
>  
> -	if (maybe_get_net(ctx->net))
> -		call_rcu(&ctx->chain->rcu_head, nft_release_basechain_rcu);
> -	else
> +	if (!maybe_get_net(ctx->net)) {
>  		__nft_release_basechain_now(ctx);
> +		return 0;
> +	}
> +
> +	/* wait for ruleset dumps to complete.  Owning chain is no longer in
> +	 * lists, so new dumps can't find any of these rules anymore.
> +	 */
> +	synchronize_rcu();
>  
> +	__nft_release_basechain_now(ctx);
> +	put_net(ctx->net);
>  	return 0;
>  }
>  EXPORT_SYMBOL_GPL(__nft_release_basechain);
> -- 
> 2.47.1
> 
>
Florian Westphal Dec. 9, 2024, 10:04 p.m. UTC | #2
Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > One way would be to allocate nft_trans_rule objects + one nft_trans_chain
> > object, deactivate the rules + the chain and then defer the freeing to the
> > nft destroy workqueue.  We'd still need to keep the synchronize_rcu path as
> > a fallback to handle -ENOMEM corner cases though.
> 
> I think it can be done _without_ nft_trans objects.
> 
> Since the commit mutex is held in this netdev event path: Remove this
> basechain, deactivate rules and add basechain to global list protected
> with spinlock, it invokes worker. Then, worker zaps this list
> basechains, it calls synchronize_rcu() and it destroys rules and then
> basechain. No memory allocations needed in this case?

I don't think its possible due to netlink dumps.

Its safe for normal commit path because list_del_rcu() gets called on
these objects (making them unreachable for new dumps), then,
synchronize_rcu is called (so all concurrent dumpers are done) and then
we free.

It would work if you add a new list_head to the basechain, then you
could just link the basechain object to some other list and then
call nf_tables_rule_release() AFTER a synchronize_rcu because rules
can't be picked up if the owning chain is no longer reachable.

However, I do wonder how complex it would be.

This is tricky to get right and I'm not sure this patch adds a
noticeable issue, see nft_rcv_nl_event() which has similar pattern,
i.e. synchronize_rcu() is called.

I'm not saying we should not consider async route, but maybe better
to followup in -next?
Pablo Neira Ayuso Dec. 11, 2024, 4:16 p.m. UTC | #3
Hi Florian

On Mon, Dec 09, 2024 at 11:04:01PM +0100, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > One way would be to allocate nft_trans_rule objects + one nft_trans_chain
> > > object, deactivate the rules + the chain and then defer the freeing to the
> > > nft destroy workqueue.  We'd still need to keep the synchronize_rcu path as
> > > a fallback to handle -ENOMEM corner cases though.
> > 
> > I think it can be done _without_ nft_trans objects.
> > 
> > Since the commit mutex is held in this netdev event path: Remove this
> > basechain, deactivate rules and add basechain to global list protected
> > with spinlock, it invokes worker. Then, worker zaps this list
> > basechains, it calls synchronize_rcu() and it destroys rules and then
> > basechain. No memory allocations needed in this case?
> 
> I don't think its possible due to netlink dumps.
> 
> Its safe for normal commit path because list_del_rcu() gets called on
> these objects (making them unreachable for new dumps), then,
> synchronize_rcu is called (so all concurrent dumpers are done) and then
> we free.
> 
> It would work if you add a new list_head to the basechain, then you
> could just link the basechain object to some other list and then
> call nf_tables_rule_release() AFTER a synchronize_rcu because rules
> can't be picked up if the owning chain is no longer reachable.
> 
> However, I do wonder how complex it would be.

I started a patch, spent several hours in a row, but I need more time
to get this right.

There is also Phil making the line to remove this in nf-next.

> This is tricky to get right and I'm not sure this patch adds a
> noticeable issue, see nft_rcv_nl_event() which has similar pattern,
> i.e. synchronize_rcu() is called.

I can see veth hits synchronize_rcu(), this can be called from
default_device_exit_batch() which calls unregister_netdevice_many().

I need time, but it is difficult. We have to deliver a pull request
very late on wednesday so it can be taken on thursday.

Now the week has three days to fix and test stuff, so upstream
maintainers have a day to tidy up and submit upstream.

> I'm not saying we should not consider async route, but maybe better
> to followup in -next?

Yes, I can follow up in -next, but then Phil just remove this again in
-next.

I can just take this fix to address the existing situation which is
worse than before. Before this patch UAF was possible, although I
never manage to trigger it. Now crash is easy to trigger, I spent too
much time looking at the interaction with layers and I forgot basic
assumption regarding nf_tables in this patch.
diff mbox series

Patch

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 21b6f7410a1f..a3b6b6b32f72 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3987,8 +3987,11 @@  void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule)
 	kfree(rule);
 }
 
+/* can only be used if rule is no longer visible to dumps */
 static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule)
 {
+	lockdep_commit_lock_is_held(ctx->net);
+
 	nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE);
 	nf_tables_rule_destroy(ctx, rule);
 }
@@ -5757,6 +5760,8 @@  void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
 			      struct nft_set_binding *binding,
 			      enum nft_trans_phase phase)
 {
+	lockdep_commit_lock_is_held(ctx->net);
+
 	switch (phase) {
 	case NFT_TRANS_PREPARE_ERROR:
 		nft_set_trans_unbind(ctx, set);
@@ -11695,19 +11700,6 @@  static void __nft_release_basechain_now(struct nft_ctx *ctx)
 	nf_tables_chain_destroy(ctx->chain);
 }
 
-static void nft_release_basechain_rcu(struct rcu_head *head)
-{
-	struct nft_chain *chain = container_of(head, struct nft_chain, rcu_head);
-	struct nft_ctx ctx = {
-		.family	= chain->table->family,
-		.chain	= chain,
-		.net	= read_pnet(&chain->table->net),
-	};
-
-	__nft_release_basechain_now(&ctx);
-	put_net(ctx.net);
-}
-
 int __nft_release_basechain(struct nft_ctx *ctx)
 {
 	struct nft_rule *rule;
@@ -11722,11 +11714,18 @@  int __nft_release_basechain(struct nft_ctx *ctx)
 	nft_chain_del(ctx->chain);
 	nft_use_dec(&ctx->table->use);
 
-	if (maybe_get_net(ctx->net))
-		call_rcu(&ctx->chain->rcu_head, nft_release_basechain_rcu);
-	else
+	if (!maybe_get_net(ctx->net)) {
 		__nft_release_basechain_now(ctx);
+		return 0;
+	}
+
+	/* wait for ruleset dumps to complete.  Owning chain is no longer in
+	 * lists, so new dumps can't find any of these rules anymore.
+	 */
+	synchronize_rcu();
 
+	__nft_release_basechain_now(ctx);
+	put_net(ctx->net);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__nft_release_basechain);