[nf-next,v4,4/5] netfilter: nf_tables: switch trans_elem to real flex array

Message ID: 20241107174415.4690-5-fw@strlen.de
State: Changes Requested
Series: netfilter: nf_tables: reduce set element transaction size

Commit Message

Florian Westphal Nov. 7, 2024, 5:44 p.m. UTC
When queueing a set element add or removal operation to the transaction
log, check if the previous operation already asks for the identical
operation on the same set.

If so, store the element reference in the preceding operation.
This significantly reduces memory consumption when many set add/delete
operations appear in a single transaction.

Example: 10k elements require 937 kbytes of memory (10k allocations
from the kmalloc-96 slab).

Assuming we can compact 4 elements in the same set, 468 kbytes
are needed (64 bytes for the base struct, nft_trans_elem, plus 32 bytes
per nft_trans_one_elem structure, so 2500 allocations from the
kmalloc-192 slab).

For large batch updates we can compact up to 62 elements
into a single nft_trans_elem structure (~65% memory reduction):
64 bytes for the base struct, nft_trans_elem, plus 32 bytes per
nft_trans_one_elem struct.
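
For reference, a minimal sketch of the layout this arithmetic assumes;
field names and sizes follow the commit message, not necessarily the
exact kernel definitions:

struct nft_trans_one_elem {		/* ~32 bytes per element */
	struct nft_elem_priv	*priv;
	u64			timeout;
	u64			expiration;
	u8			update_flags;
};

struct nft_trans_elem {			/* ~64 bytes of base struct */
	struct nft_trans	nft_trans;
	struct nft_set		*set;
	bool			bound;
	unsigned int		nelems;
	struct nft_trans_one_elem elems[];	/* the new flex array */
};

/* One collapsed transaction costs 64 + nelems * 32 bytes:
 * 4 elements -> 192 bytes (kmalloc-192), 62 elements -> 2048 bytes.
 */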

We can halve the size of the nft_trans_one_elem struct by moving
timeout/expire/update_flags into a dynamically allocated structure;
this allows storing 124 elements in a 2k-slab nft_trans_elem struct.
This is done in a followup patch.
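
A sketch of what that follow-up split could look like; the struct name
nft_trans_elem_update and its exact fields are illustrative here,
derived from the commit message rather than from the followup patch
itself:

/* Updates are rare: shrink each entry to two pointers (16 bytes on
 * 64-bit) and allocate the rarely-needed fields on demand.
 */
struct nft_trans_elem_update {
	u64	timeout;
	u64	expiration;
	u8	flags;
};

struct nft_trans_one_elem {
	struct nft_elem_priv		*priv;
	struct nft_trans_elem_update	*update;
};

/* (2048 - 64) / 16 = 124 entries per kmalloc-2048 transaction. */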

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nf_tables_api.c | 71 +++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

Comments

Pablo Neira Ayuso Nov. 13, 2024, 10:15 a.m. UTC | #1
Hi Florian,

I'm making another pass on this series, a few things I would like to
ask, see below.

On Thu, Nov 07, 2024 at 06:44:08PM +0100, Florian Westphal wrote:
> diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
> index bdf5ba21c76d..e96e538fe2eb 100644
> --- a/net/netfilter/nf_tables_api.c
> +++ b/net/netfilter/nf_tables_api.c
> @@ -25,6 +25,7 @@
>  
>  #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-"))
>  #define NFT_SET_MAX_ANONLEN 16
> +#define NFT_MAX_SET_NELEMS ((2048 - sizeof(struct nft_trans_elem)) / sizeof(struct nft_trans_one_elem))

Is this NFT_MAX_SET_NELEMS meant to stay within a specific kmalloc-X slab?

What is the logic behind this NFT_MAX_SET_NELEMS?

>  unsigned int nf_tables_net_id __read_mostly;
>  
> @@ -391,6 +392,69 @@ static void nf_tables_unregister_hook(struct net *net,
>  	return __nf_tables_unregister_hook(net, table, chain, false);
>  }
>  
> +static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, const struct nft_trans_elem *b)
> +{
> +	return a->set == b->set && a->bound == b->bound && a->nelems < NFT_MAX_SET_NELEMS;

I think this a->bound == b->bound check is defensive.

This code is collapsing only two consecutive transactions, the one at
the tail (where nelems > 1) and the new transaction (where nelems ==
1).

bound state should only change in case there is a NEWRULE transaction
in between.

I am trying to find a error scenario where a->bound == b->bound
evaluates false. I considered the following:

   newelem -> newrule -> newelem

where newrule has these expressions:

   lookup -> error

in this case, the newrule error path is exercised:

   nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR);

this calls nf_tables_deactivate_set(), which calls
nft_set_trans_unbind(); then a->bound is restored to false. The rule is
released and no transaction is added.

Because if this succeeds:

   newelem -> newrule -> newelem

then no element collapsing can happen, because we only collapse what
is at the tail.

TL;DR: the check does no harm, but such a scenario looks unlikely to me.

one more comment below.

> +}
> +
> +static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
> +					struct nft_trans_elem *tail,
> +					struct nft_trans_elem *trans,
> +					gfp_t gfp)
> +{
> +	unsigned int nelems, old_nelems = tail->nelems;
> +	struct nft_trans_elem *new_trans;
> +
> +	if (!nft_trans_collapse_set_elem_allowed(tail, trans))
> +		return false;
> +
> +	if (WARN_ON_ONCE(trans->nelems != 1))
> +		return false;
> +
> +	if (check_add_overflow(old_nelems, trans->nelems, &nelems))
> +		return false;
> +
> +	/* krealloc might free tail which invalidates list pointers */
> +	list_del_init(&tail->nft_trans.list);
> +
> +	new_trans = krealloc(tail, struct_size(tail, elems, nelems), gfp);
> +	if (!new_trans) {
> +		list_add_tail(&tail->nft_trans.list, &nft_net->commit_list);
> +		return false;
> +	}
> +
> +	INIT_LIST_HEAD(&new_trans->nft_trans.list);

This initialization is also defensive; this element is added via
list_add_tail().

> +	new_trans->nelems = nelems;
> +	new_trans->elems[old_nelems] = trans->elems[0];
> +	list_add_tail(&new_trans->nft_trans.list, &nft_net->commit_list);
> +
> +	return true;
> +}
Florian Westphal Nov. 13, 2024, 11:04 a.m. UTC | #2
Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> I'm making another pass on this series, a few things I would like to
> ask, see below.
> 
> On Thu, Nov 07, 2024 at 06:44:08PM +0100, Florian Westphal wrote:
> > diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
> > index bdf5ba21c76d..e96e538fe2eb 100644
> > --- a/net/netfilter/nf_tables_api.c
> > +++ b/net/netfilter/nf_tables_api.c
> > @@ -25,6 +25,7 @@
> >  
> >  #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-"))
> >  #define NFT_SET_MAX_ANONLEN 16
> > +#define NFT_MAX_SET_NELEMS ((2048 - sizeof(struct nft_trans_elem)) / sizeof(struct nft_trans_one_elem))
> 
> Is this NFT_MAX_SET_NELEMS meant to stay within a specific kmalloc-X slab?
> 
> What is the logic behind this NFT_MAX_SET_NELEMS?

I want to avoid making huge kmalloc requests, plus avoid huge krealloc
overhead.

I think that the kmalloc-2048 slab is a good fit.
I can add a comment, or increase to kmalloc-4096, but I'd prefer not
to go over that, since kmalloc allocations > 1 page are more prone
to allocation failure.
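
As a cross-check, assuming the 64-byte base struct and 32-byte
per-element struct quoted in the commit message, the macro lands exactly
on the kmalloc-2048 boundary:

/* NFT_MAX_SET_NELEMS = (2048 - sizeof(struct nft_trans_elem))
 *                       / sizeof(struct nft_trans_one_elem)
 *                    = (2048 - 64) / 32 = 62
 *
 * matching the "up to 62 elements" figure in the commit message: a
 * fully collapsed transaction fills the kmalloc-2048 slab exactly.
 */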

> >  unsigned int nf_tables_net_id __read_mostly;
> >  
> > @@ -391,6 +392,69 @@ static void nf_tables_unregister_hook(struct net *net,
> >  	return __nf_tables_unregister_hook(net, table, chain, false);
> >  }
> >  
> > +static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, const struct nft_trans_elem *b)
> > +{
> > +	return a->set == b->set && a->bound == b->bound && a->nelems < NFT_MAX_SET_NELEMS;
> 
> I think this a->bound == b->bound check is defensive.
> 
> This code is collapsing only two consecutive transactions, the one at
> the tail (where nelems > 1) and the new transaction (where nelems ==
> 1).

Yes.

> bound state should only change in case there is a NEWRULE transaction
> in between.

Yes.

> I am trying to find an error scenario where a->bound == b->bound
> evaluates false. I considered the following:
> 
>    newelem -> newrule -> newelem
> 
> where newrule has these expressions:
> 
>    lookup -> error
> 
> in this case, the newrule error path is exercised:
> 
>    nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR);
> 
> this calls nf_tables_deactivate_set(), which calls
> nft_set_trans_unbind(); then a->bound is restored to false. The rule is
> released and no transaction is added.
> 
> Because if this succeeds:
> 
>    newelem -> newrule -> newelem
> 
> then no element collapsing can happen, because we only collapse what
> is at the tail.
> 
> TL;DR: the check does no harm, but such a scenario looks unlikely to me.

Yes, it's a defensive check.  I could add a comment.
The WARN_ON_ONCE for trans->nelems != 1 exists for the same reason.

> > +}
> > +
> > +static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
> > +					struct nft_trans_elem *tail,
> > +					struct nft_trans_elem *trans,
> > +					gfp_t gfp)
> > +{
> > +	unsigned int nelems, old_nelems = tail->nelems;
> > +	struct nft_trans_elem *new_trans;
> > +
> > +	if (!nft_trans_collapse_set_elem_allowed(tail, trans))
> > +		return false;
> > +
> > +	if (WARN_ON_ONCE(trans->nelems != 1))
> > +		return false;
> > +
> > +	if (check_add_overflow(old_nelems, trans->nelems, &nelems))
> > +		return false;
> > +
> > +	/* krealloc might free tail which invalidates list pointers */
> > +	list_del_init(&tail->nft_trans.list);
> > +
> > +	new_trans = krealloc(tail, struct_size(tail, elems, nelems), gfp);
> > +	if (!new_trans) {
> > +		list_add_tail(&tail->nft_trans.list, &nft_net->commit_list);
> > +		return false;
> > +	}
> > +
> > +	INIT_LIST_HEAD(&new_trans->nft_trans.list);
> 
> This initialization is also defensive; this element is added via
> list_add_tail().

Yes, the first arg to list_add(_tail) can live without initialisation.
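
For readers wondering why: a simplified sketch of __list_add() from
include/linux/list.h (the CONFIG_DEBUG_LIST validity check is omitted)
shows that the entry being inserted is only written to, never read:

static inline void __list_add(struct list_head *new,
			      struct list_head *prev,
			      struct list_head *next)
{
	next->prev = new;	/* new's fields are overwritten... */
	new->next = next;
	new->prev = prev;	/* ...so a prior INIT_LIST_HEAD() on
				 * new is redundant. */
	WRITE_ONCE(prev->next, new);
}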
Pablo Neira Ayuso Nov. 13, 2024, 11:11 a.m. UTC | #3
On Wed, Nov 13, 2024 at 12:04:05PM +0100, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > I'm making another pass on this series, a few things I would like to
> > ask, see below.
> > 
> > On Thu, Nov 07, 2024 at 06:44:08PM +0100, Florian Westphal wrote:
> > > diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
> > > index bdf5ba21c76d..e96e538fe2eb 100644
> > > --- a/net/netfilter/nf_tables_api.c
> > > +++ b/net/netfilter/nf_tables_api.c
> > > @@ -25,6 +25,7 @@
> > >  
> > >  #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-"))
> > >  #define NFT_SET_MAX_ANONLEN 16
> > > +#define NFT_MAX_SET_NELEMS ((2048 - sizeof(struct nft_trans_elem)) / sizeof(struct nft_trans_one_elem))
> > 
> > Is this NFT_MAX_SET_NELEMS meant to stay within a specific kmalloc-X slab?
> > 
> > What is the logic behind this NFT_MAX_SET_NELEMS?
> 
> I want to avoid making huge kmalloc requests, plus avoid huge krealloc
> overhead.
> 
> I think that the kmalloc-2048 slab is a good fit.
> I can add a comment, or increase to kmalloc-4096, but I'd prefer not
> to go over that, since kmalloc allocations > 1 page are more prone
> to allocation failure.

Makes sense as it is now, thanks for explaining.

> > >  unsigned int nf_tables_net_id __read_mostly;
> > >  
> > > @@ -391,6 +392,69 @@ static void nf_tables_unregister_hook(struct net *net,
> > >  	return __nf_tables_unregister_hook(net, table, chain, false);
> > >  }
> > >  
> > > +static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, const struct nft_trans_elem *b)
> > > +{
> > > +	return a->set == b->set && a->bound == b->bound && a->nelems < NFT_MAX_SET_NELEMS;
> > 
> > I think this a->bound == b->bound check is defensive.
> > 
> > This code is collapsing only two consecutive transactions, the one at
> > the tail (where nelems > 1) and the new transaction (where nelems ==
> > 1).
> 
> Yes.
> 
> > bound state should only change in case there is a NEWRULE transaction
> > in between.
> 
> Yes.
> 
> > I am trying to find an error scenario where a->bound == b->bound
> > evaluates false. I considered the following:
> > 
> >    newelem -> newrule -> newelem
> > 
> > where newrule has these expressions:
> > 
> >    lookup -> error
> > 
> > in this case, the newrule error path is exercised:
> > 
> >    nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR);
> > 
> > this calls nf_tables_deactivate_set(), which calls
> > nft_set_trans_unbind(); then a->bound is restored to false. The rule is
> > released and no transaction is added.
> > 
> > Because if this succeeds:
> > 
> >    newelem -> newrule -> newelem
> > 
> > then no element collapsing can happen, because we only collapse what
> > is at the tail.
> > 
> > TL;DR: the check does no harm, but such a scenario looks unlikely to me.
> 
> Yes, it's a defensive check.  I could add a comment.

Please, do it so we don't forget about this subtle detail.

> The WARN_ON_ONCE for trans->nelems != 1 exists for the same reason.

Right.

> > > +}
> > > +
> > > +static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
> > > +					struct nft_trans_elem *tail,
> > > +					struct nft_trans_elem *trans,
> > > +					gfp_t gfp)
> > > +{
> > > +	unsigned int nelems, old_nelems = tail->nelems;
> > > +	struct nft_trans_elem *new_trans;
> > > +
> > > +	if (!nft_trans_collapse_set_elem_allowed(tail, trans))
> > > +		return false;
> > > +
> > > +	if (WARN_ON_ONCE(trans->nelems != 1))
> > > +		return false;
> > > +
> > > +	if (check_add_overflow(old_nelems, trans->nelems, &nelems))
> > > +		return false;
> > > +
> > > +	/* krealloc might free tail which invalidates list pointers */
> > > +	list_del_init(&tail->nft_trans.list);
> > > +
> > > +	new_trans = krealloc(tail, struct_size(tail, elems, nelems), gfp);
> > > +	if (!new_trans) {
> > > +		list_add_tail(&tail->nft_trans.list, &nft_net->commit_list);
> > > +		return false;
> > > +	}
> > > +
> > > +	INIT_LIST_HEAD(&new_trans->nft_trans.list);
> > 
> > This initialization is also defensive; this element is added via
> > list_add_tail().
> 
> Yes, the first arg to list_add(_tail) can live without initialisation.

Let's remove it then.

Would you submit a new revision addressing all these little nitpicks?

Then you also have a chance to edit your explanation on the audit
aspect of this series.

If you are busy with other stuff I can take a look here, just let me know.

Thanks.

Patch

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index bdf5ba21c76d..e96e538fe2eb 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -25,6 +25,7 @@ 
 
 #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-"))
 #define NFT_SET_MAX_ANONLEN 16
+#define NFT_MAX_SET_NELEMS ((2048 - sizeof(struct nft_trans_elem)) / sizeof(struct nft_trans_one_elem))
 
 unsigned int nf_tables_net_id __read_mostly;
 
@@ -391,6 +392,69 @@ static void nf_tables_unregister_hook(struct net *net,
 	return __nf_tables_unregister_hook(net, table, chain, false);
 }
 
+static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, const struct nft_trans_elem *b)
+{
+	return a->set == b->set && a->bound == b->bound && a->nelems < NFT_MAX_SET_NELEMS;
+}
+
+static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
+					struct nft_trans_elem *tail,
+					struct nft_trans_elem *trans,
+					gfp_t gfp)
+{
+	unsigned int nelems, old_nelems = tail->nelems;
+	struct nft_trans_elem *new_trans;
+
+	if (!nft_trans_collapse_set_elem_allowed(tail, trans))
+		return false;
+
+	if (WARN_ON_ONCE(trans->nelems != 1))
+		return false;
+
+	if (check_add_overflow(old_nelems, trans->nelems, &nelems))
+		return false;
+
+	/* krealloc might free tail which invalidates list pointers */
+	list_del_init(&tail->nft_trans.list);
+
+	new_trans = krealloc(tail, struct_size(tail, elems, nelems), gfp);
+	if (!new_trans) {
+		list_add_tail(&tail->nft_trans.list, &nft_net->commit_list);
+		return false;
+	}
+
+	INIT_LIST_HEAD(&new_trans->nft_trans.list);
+	new_trans->nelems = nelems;
+	new_trans->elems[old_nelems] = trans->elems[0];
+	list_add_tail(&new_trans->nft_trans.list, &nft_net->commit_list);
+
+	return true;
+}
+
+static bool nft_trans_try_collapse(struct nftables_pernet *nft_net,
+				   struct nft_trans *trans, gfp_t gfp)
+{
+	struct nft_trans *tail;
+
+	if (list_empty(&nft_net->commit_list))
+		return false;
+
+	tail = list_last_entry(&nft_net->commit_list, struct nft_trans, list);
+
+	if (tail->msg_type != trans->msg_type)
+		return false;
+
+	switch (trans->msg_type) {
+	case NFT_MSG_NEWSETELEM:
+	case NFT_MSG_DELSETELEM:
+		return nft_trans_collapse_set_elem(nft_net,
+						   nft_trans_container_elem(tail),
+						   nft_trans_container_elem(trans), gfp);
+	}
+
+	return false;
+}
+
 static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans)
 {
 	struct nftables_pernet *nft_net = nft_pernet(net);
@@ -424,11 +488,18 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans)
 static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans,
 					   gfp_t gfp)
 {
+	struct nftables_pernet *nft_net = nft_pernet(net);
+
 	WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM &&
 		     trans->msg_type != NFT_MSG_DELSETELEM);
 
 	might_alloc(gfp);
 
+	if (nft_trans_try_collapse(nft_net, trans, gfp)) {
+		kfree(trans);
+		return;
+	}
+
 	nft_trans_commit_list_add_tail(net, trans);
 }