diff mbox series

[net,4/4] netfilter: nft_inner: incorrect percpu area handling under softirq

Message ID 20241128123840.49034-5-pablo@netfilter.org
State Changes Requested
Headers show
Series [net,1/4] ipvs: fix UB due to uninitialized stack access in ip_vs_protocol_init() | expand

Commit Message

Pablo Neira Ayuso Nov. 28, 2024, 12:38 p.m. UTC
Softirq can interrupt packet from process context which walks over the
percpu area.

Add routines to disable bh while restoring and saving the tunnel parser
context from percpu area to stack. Add a skbuff owner for this percpu
area to catch softirq interference to exercise the packet tunnel parser
again in such case.

Reported-by: syzbot+84d0441b9860f0d63285@syzkaller.appspotmail.com
Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_core.h |  1 +
 net/netfilter/nft_inner.c              | 56 ++++++++++++++++++++------
 2 files changed, 45 insertions(+), 12 deletions(-)

Comments

Eric Dumazet Nov. 29, 2024, 9:14 a.m. UTC | #1
On Thu, Nov 28, 2024 at 1:38 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>
> Softirq can interrupt packet from process context which walks over the
> percpu area.
>
> Add routines to disable bh while restoring and saving the tunnel parser
> context from percpu area to stack. Add a skbuff owner for this percpu
> area to catch softirq interference to exercise the packet tunnel parser
> again in such case.
>
> Reported-by: syzbot+84d0441b9860f0d63285@syzkaller.appspotmail.com
> Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching")
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> ---
>  include/net/netfilter/nf_tables_core.h |  1 +
>  net/netfilter/nft_inner.c              | 56 ++++++++++++++++++++------
>  2 files changed, 45 insertions(+), 12 deletions(-)
>
> diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
> index ff27cb2e1662..dae0e7592934 100644
> --- a/include/net/netfilter/nf_tables_core.h
> +++ b/include/net/netfilter/nf_tables_core.h
> @@ -161,6 +161,7 @@ enum {
>  };
>
>  struct nft_inner_tun_ctx {
> +       struct sk_buff *skb;    /* percpu area owner */
>         u16     type;
>         u16     inner_tunoff;
>         u16     inner_lloff;
> diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
> index 928312d01eb1..fcaa126ac8da 100644
> --- a/net/netfilter/nft_inner.c
> +++ b/net/netfilter/nft_inner.c
> @@ -210,35 +210,65 @@ static int nft_inner_parse(const struct nft_inner *priv,
>                            struct nft_pktinfo *pkt,
>                            struct nft_inner_tun_ctx *tun_ctx)
>  {
> -       struct nft_inner_tun_ctx ctx = {};
>         u32 off = pkt->inneroff;
>
>         if (priv->flags & NFT_INNER_HDRSIZE &&
> -           nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0)
> +           nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0)
>                 return -1;
>
>         if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
> -               if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0)
> +               if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0)
>                         return -1;
>         } else if (priv->flags & NFT_INNER_TH) {
> -               ctx.inner_thoff = off;
> -               ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH;
> +               tun_ctx->inner_thoff = off;
> +               tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
>         }
>
> -       *tun_ctx = ctx;
>         tun_ctx->type = priv->type;
> +       tun_ctx->skb = pkt->skb;
>         pkt->flags |= NFT_PKTINFO_INNER_FULL;
>
>         return 0;
>  }
>
> +static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
> +                                     struct nft_inner_tun_ctx *tun_ctx)
> +{
> +       struct nft_inner_tun_ctx *this_cpu_tun_ctx;
> +
> +       local_bh_disable();
> +       this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
> +       if (this_cpu_tun_ctx->skb != pkt->skb) {

I must say I do not understand this patch.

If a context is used by a save/restore more than one time per packet
traversal, then this means we can not use per-cpu storage,
or risk flakes.

Also, skb could be freed and re-allocated ?

Perhaps describe a bit more what is going on in the changelog.
Pablo Neira Ayuso Dec. 2, 2024, 1:24 a.m. UTC | #2
Hi Eric,

On Fri, Nov 29, 2024 at 10:14:34AM +0100, Eric Dumazet wrote:
> On Thu, Nov 28, 2024 at 1:38 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> >
> > Softirq can interrupt packet from process context which walks over the
> > percpu area.
> >
> > Add routines to disable bh while restoring and saving the tunnel parser
> > context from percpu area to stack. Add a skbuff owner for this percpu
> > area to catch softirq interference to exercise the packet tunnel parser
> > again in such case.
> >
> > Reported-by: syzbot+84d0441b9860f0d63285@syzkaller.appspotmail.com
> > Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching")
> > Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> > ---
> >  include/net/netfilter/nf_tables_core.h |  1 +
> >  net/netfilter/nft_inner.c              | 56 ++++++++++++++++++++------
> >  2 files changed, 45 insertions(+), 12 deletions(-)
> >
> > diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
> > index ff27cb2e1662..dae0e7592934 100644
> > --- a/include/net/netfilter/nf_tables_core.h
> > +++ b/include/net/netfilter/nf_tables_core.h
> > @@ -161,6 +161,7 @@ enum {
> >  };
> >
> >  struct nft_inner_tun_ctx {
> > +       struct sk_buff *skb;    /* percpu area owner */
> >         u16     type;
> >         u16     inner_tunoff;
> >         u16     inner_lloff;
> > diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
> > index 928312d01eb1..fcaa126ac8da 100644
> > --- a/net/netfilter/nft_inner.c
> > +++ b/net/netfilter/nft_inner.c
> > @@ -210,35 +210,65 @@ static int nft_inner_parse(const struct nft_inner *priv,
> >                            struct nft_pktinfo *pkt,
> >                            struct nft_inner_tun_ctx *tun_ctx)
> >  {
> > -       struct nft_inner_tun_ctx ctx = {};
> >         u32 off = pkt->inneroff;
> >
> >         if (priv->flags & NFT_INNER_HDRSIZE &&
> > -           nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0)
> > +           nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0)
> >                 return -1;
> >
> >         if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
> > -               if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0)
> > +               if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0)
> >                         return -1;
> >         } else if (priv->flags & NFT_INNER_TH) {
> > -               ctx.inner_thoff = off;
> > -               ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH;
> > +               tun_ctx->inner_thoff = off;
> > +               tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
> >         }
> >
> > -       *tun_ctx = ctx;
> >         tun_ctx->type = priv->type;
> > +       tun_ctx->skb = pkt->skb;
> >         pkt->flags |= NFT_PKTINFO_INNER_FULL;
> >
> >         return 0;
> >  }
> >
> > +static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
> > +                                     struct nft_inner_tun_ctx *tun_ctx)
> > +{
> > +       struct nft_inner_tun_ctx *this_cpu_tun_ctx;
> > +
> > +       local_bh_disable();
> > +       this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
> > +       if (this_cpu_tun_ctx->skb != pkt->skb) {
> 
> I must say I do not understand this patch.
> 
> If a context is used by a save/restore more than one time per packet
> traversal, then this means we can not use per-cpu storage,
> or risk flakes.
> 
> Also, skb could be freed and re-allocated ?
> 
> Perhaps describe a bit more what is going on in the changelog.

There is an on-stack nft_pktinfo structure with a flags field. This
nft_pktinfo is a wrapper for the sk_buff, containing header offsets
and metainformation. This is initialized when entering this chain.

If the NFT_PKTINFO_INNER_FULL flag is set on, then the percpu area
could contain information on the inner header offsets (ie. packet was
already parsed in this chain).

There is a check to validate that the percpu area refers to this
skbuff. If there is a mismatch, then this needs to parse the inner
headers because the data in the percpu area is stale. Otherwise, if
there is a match, the percpu area is copied on-stack.

After processing (payload/meta fetching), the on-stack copy is stored
back to the percpu area. I can make an improvement on this patch to
check if this skbuff still owns the percpu area in the store/exit
section of this inner evaluation routine, to avoid an unnecessary update.

So, it is basically the NFT_PKTINFO_INNER_FULL flag that handles the
skbuff reallocation scenario. This is not blindly trusting this percpu
area per-se.

One cosmetic change I can apply to this: I can also turn the struct
sk_buff pointer into an unsigned long so it is clear to the reader that
this skbuff pointer is not meant to be dereferenced.

If the explanation above is sufficient, I can extend the changelog as
you suggest and post a new version of this patch.

Thanks.
Eric Dumazet Dec. 2, 2024, 9:17 a.m. UTC | #3
On Mon, Dec 2, 2024 at 2:24 AM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>
> Hi Eric,
>
> On Fri, Nov 29, 2024 at 10:14:34AM +0100, Eric Dumazet wrote:
> > On Thu, Nov 28, 2024 at 1:38 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > >
> > > Softirq can interrupt packet from process context which walks over the
> > > percpu area.
> > >
> > > Add routines to disable bh while restoring and saving the tunnel parser
> > > context from percpu area to stack. Add a skbuff owner for this percpu
> > > area to catch softirq interference to exercise the packet tunnel parser
> > > again in such case.
> > >
> > > Reported-by: syzbot+84d0441b9860f0d63285@syzkaller.appspotmail.com
> > > Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching")
> > > Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> > > ---
> > >  include/net/netfilter/nf_tables_core.h |  1 +
> > >  net/netfilter/nft_inner.c              | 56 ++++++++++++++++++++------
> > >  2 files changed, 45 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
> > > index ff27cb2e1662..dae0e7592934 100644
> > > --- a/include/net/netfilter/nf_tables_core.h
> > > +++ b/include/net/netfilter/nf_tables_core.h
> > > @@ -161,6 +161,7 @@ enum {
> > >  };
> > >
> > >  struct nft_inner_tun_ctx {
> > > +       struct sk_buff *skb;    /* percpu area owner */
> > >         u16     type;
> > >         u16     inner_tunoff;
> > >         u16     inner_lloff;
> > > diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
> > > index 928312d01eb1..fcaa126ac8da 100644
> > > --- a/net/netfilter/nft_inner.c
> > > +++ b/net/netfilter/nft_inner.c
> > > @@ -210,35 +210,65 @@ static int nft_inner_parse(const struct nft_inner *priv,
> > >                            struct nft_pktinfo *pkt,
> > >                            struct nft_inner_tun_ctx *tun_ctx)
> > >  {
> > > -       struct nft_inner_tun_ctx ctx = {};
> > >         u32 off = pkt->inneroff;
> > >
> > >         if (priv->flags & NFT_INNER_HDRSIZE &&
> > > -           nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0)
> > > +           nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0)
> > >                 return -1;
> > >
> > >         if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
> > > -               if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0)
> > > +               if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0)
> > >                         return -1;
> > >         } else if (priv->flags & NFT_INNER_TH) {
> > > -               ctx.inner_thoff = off;
> > > -               ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH;
> > > +               tun_ctx->inner_thoff = off;
> > > +               tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
> > >         }
> > >
> > > -       *tun_ctx = ctx;
> > >         tun_ctx->type = priv->type;
> > > +       tun_ctx->skb = pkt->skb;
> > >         pkt->flags |= NFT_PKTINFO_INNER_FULL;
> > >
> > >         return 0;
> > >  }
> > >
> > > +static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
> > > +                                     struct nft_inner_tun_ctx *tun_ctx)
> > > +{
> > > +       struct nft_inner_tun_ctx *this_cpu_tun_ctx;
> > > +
> > > +       local_bh_disable();
> > > +       this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
> > > +       if (this_cpu_tun_ctx->skb != pkt->skb) {
> >
> > I must say I do not understand this patch.
> >
> > If a context is used by a save/restore more than one time per packet
> > traversal, then this means we can not use per-cpu storage,
> > or risk flakes.
> >
> > Also, skb could be freed and re-allocated ?
> >
> > Perhaps describe a bit more what is going on in the changelog.
>
> There is an on-stack nft_pktinfo structure with a flags field. This
> nft_pktinfo is a wrapper for the sk_buff, containing header offsets
> and metainformation. This is initialize when entering this chain.
>
> If the NFT_PKTINFO_INNER_FULL flag is set on, then the percpu area
> could contain information on the inner header offsets (ie. packet was
> already parsed in this chain).
>
> There is a check to validate that the percpu area refers to this
> skbuff. If there is a mismatch, then this needs to parse the inner
> headers because the data in the percpu area is stale. Otherwise, if
> there is a match, the percpu area is copied on-stack.
>
> After processing (payload/meta fetching), the on-stack copy is stored
> back to the percpu area. I can make an improvement on this patch to
> check if this skbuff still owns the percpu area in the store/exit
> section of this inner evaluation routine, to avoid a unnecessary update.
>
> So, it is basically the NFT_PKTINFO_INNER_FULL flag that handles the
> skbuff reallocation scenario. This is not blindly trusting this percpu
> area per-se.
>
> One comestic change I can apply to this: I can also turn the struct
> sk_buff into unsigned long so it clear to the reader this pointer to
> skbuff is not meant to be used for being dereferenced.
>
> If the explaination above is sufficient, I can revamp to extend the
> changelog as you suggest and post a new version of this patch.
>
> Thanks.

The part I do not understand is that tun_ctx->skb is not cleared at
the end of processing (or at some point)

That would make clear that a future (tun_ctx->skb == skb) test is not
confused by skb reuse (free/alloc).

If you use it as a cookie, then we need something else than a pointer.
Pablo Neira Ayuso Dec. 2, 2024, 9:28 a.m. UTC | #4
On Mon, Dec 02, 2024 at 10:17:10AM +0100, Eric Dumazet wrote:
> On Mon, Dec 2, 2024 at 2:24 AM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> >
> > Hi Eric,
> >
> > On Fri, Nov 29, 2024 at 10:14:34AM +0100, Eric Dumazet wrote:
> > > On Thu, Nov 28, 2024 at 1:38 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > >
> > > > Softirq can interrupt packet from process context which walks over the
> > > > percpu area.
> > > >
> > > > Add routines to disable bh while restoring and saving the tunnel parser
> > > > context from percpu area to stack. Add a skbuff owner for this percpu
> > > > area to catch softirq interference to exercise the packet tunnel parser
> > > > again in such case.
> > > >
> > > > Reported-by: syzbot+84d0441b9860f0d63285@syzkaller.appspotmail.com
> > > > Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching")
> > > > Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > ---
> > > >  include/net/netfilter/nf_tables_core.h |  1 +
> > > >  net/netfilter/nft_inner.c              | 56 ++++++++++++++++++++------
> > > >  2 files changed, 45 insertions(+), 12 deletions(-)
> > > >
> > > > diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
> > > > index ff27cb2e1662..dae0e7592934 100644
> > > > --- a/include/net/netfilter/nf_tables_core.h
> > > > +++ b/include/net/netfilter/nf_tables_core.h
> > > > @@ -161,6 +161,7 @@ enum {
> > > >  };
> > > >
> > > >  struct nft_inner_tun_ctx {
> > > > +       struct sk_buff *skb;    /* percpu area owner */
> > > >         u16     type;
> > > >         u16     inner_tunoff;
> > > >         u16     inner_lloff;
> > > > diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
> > > > index 928312d01eb1..fcaa126ac8da 100644
> > > > --- a/net/netfilter/nft_inner.c
> > > > +++ b/net/netfilter/nft_inner.c
> > > > @@ -210,35 +210,65 @@ static int nft_inner_parse(const struct nft_inner *priv,
> > > >                            struct nft_pktinfo *pkt,
> > > >                            struct nft_inner_tun_ctx *tun_ctx)
> > > >  {
> > > > -       struct nft_inner_tun_ctx ctx = {};
> > > >         u32 off = pkt->inneroff;
> > > >
> > > >         if (priv->flags & NFT_INNER_HDRSIZE &&
> > > > -           nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0)
> > > > +           nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0)
> > > >                 return -1;
> > > >
> > > >         if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
> > > > -               if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0)
> > > > +               if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0)
> > > >                         return -1;
> > > >         } else if (priv->flags & NFT_INNER_TH) {
> > > > -               ctx.inner_thoff = off;
> > > > -               ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH;
> > > > +               tun_ctx->inner_thoff = off;
> > > > +               tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
> > > >         }
> > > >
> > > > -       *tun_ctx = ctx;
> > > >         tun_ctx->type = priv->type;
> > > > +       tun_ctx->skb = pkt->skb;
> > > >         pkt->flags |= NFT_PKTINFO_INNER_FULL;
> > > >
> > > >         return 0;
> > > >  }
> > > >
> > > > +static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
> > > > +                                     struct nft_inner_tun_ctx *tun_ctx)
> > > > +{
> > > > +       struct nft_inner_tun_ctx *this_cpu_tun_ctx;
> > > > +
> > > > +       local_bh_disable();
> > > > +       this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
> > > > +       if (this_cpu_tun_ctx->skb != pkt->skb) {
> > >
> > > I must say I do not understand this patch.
> > >
> > > If a context is used by a save/restore more than one time per packet
> > > traversal, then this means we can not use per-cpu storage,
> > > or risk flakes.
> > >
> > > Also, skb could be freed and re-allocated ?
> > >
> > > Perhaps describe a bit more what is going on in the changelog.
> >
> > There is an on-stack nft_pktinfo structure with a flags field. This
> > nft_pktinfo is a wrapper for the sk_buff, containing header offsets
> > and metainformation. This is initialize when entering this chain.
> >
> > If the NFT_PKTINFO_INNER_FULL flag is set on, then the percpu area
> > could contain information on the inner header offsets (ie. packet was
> > already parsed in this chain).
> >
> > There is a check to validate that the percpu area refers to this
> > skbuff. If there is a mismatch, then this needs to parse the inner
> > headers because the data in the percpu area is stale. Otherwise, if
> > there is a match, the percpu area is copied on-stack.
> >
> > After processing (payload/meta fetching), the on-stack copy is stored
> > back to the percpu area. I can make an improvement on this patch to
> > check if this skbuff still owns the percpu area in the store/exit
> > section of this inner evaluation routine, to avoid a unnecessary update.
> >
> > So, it is basically the NFT_PKTINFO_INNER_FULL flag that handles the
> > skbuff reallocation scenario. This is not blindly trusting this percpu
> > area per-se.
> >
> > One comestic change I can apply to this: I can also turn the struct
> > sk_buff into unsigned long so it clear to the reader this pointer to
> > skbuff is not meant to be used for being dereferenced.
> >
> > If the explaination above is sufficient, I can revamp to extend the
> > changelog as you suggest and post a new version of this patch.
> >
> > Thanks.
> 
> The part I do not understand is that tun_ctx->skb is not cleared at
> the end of processing (or at some point)

I believe the on-stack NFT_PKTINFO_INNER_FULL flag is sufficient, but
I will clear it as you suggest to make this more robust.

> That would make clear that a future (tun_ctx->skb == skb) test is not
> confused by skb reuse (free/alloc).
> 
> If you use it as a cookie, then we need something else than a pointer.

Yes, this is a cookie; I can turn this field into an unsigned long
instead.

Thanks.
Pablo Neira Ayuso Dec. 3, 2024, 8:22 p.m. UTC | #5
Hi Eric,

On Mon, Dec 02, 2024 at 10:17:10AM +0100, Eric Dumazet wrote:
> On Mon, Dec 2, 2024 at 2:24 AM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> >
> > Hi Eric,
> >
> > On Fri, Nov 29, 2024 at 10:14:34AM +0100, Eric Dumazet wrote:
> > > On Thu, Nov 28, 2024 at 1:38 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > >
> > > > Softirq can interrupt packet from process context which walks over the
> > > > percpu area.
> > > >
> > > > Add routines to disable bh while restoring and saving the tunnel parser
> > > > context from percpu area to stack. Add a skbuff owner for this percpu
> > > > area to catch softirq interference to exercise the packet tunnel parser
> > > > again in such case.
> > > >
> > > > Reported-by: syzbot+84d0441b9860f0d63285@syzkaller.appspotmail.com
> > > > Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching")
> > > > Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > ---
> > > >  include/net/netfilter/nf_tables_core.h |  1 +
> > > >  net/netfilter/nft_inner.c              | 56 ++++++++++++++++++++------
> > > >  2 files changed, 45 insertions(+), 12 deletions(-)
> > > >
> > > > diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
> > > > index ff27cb2e1662..dae0e7592934 100644
> > > > --- a/include/net/netfilter/nf_tables_core.h
> > > > +++ b/include/net/netfilter/nf_tables_core.h
> > > > @@ -161,6 +161,7 @@ enum {
> > > >  };
> > > >
> > > >  struct nft_inner_tun_ctx {
> > > > +       struct sk_buff *skb;    /* percpu area owner */
> > > >         u16     type;
> > > >         u16     inner_tunoff;
> > > >         u16     inner_lloff;
> > > > diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
> > > > index 928312d01eb1..fcaa126ac8da 100644
> > > > --- a/net/netfilter/nft_inner.c
> > > > +++ b/net/netfilter/nft_inner.c
> > > > @@ -210,35 +210,65 @@ static int nft_inner_parse(const struct nft_inner *priv,
> > > >                            struct nft_pktinfo *pkt,
> > > >                            struct nft_inner_tun_ctx *tun_ctx)
> > > >  {
> > > > -       struct nft_inner_tun_ctx ctx = {};
> > > >         u32 off = pkt->inneroff;
> > > >
> > > >         if (priv->flags & NFT_INNER_HDRSIZE &&
> > > > -           nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0)
> > > > +           nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0)
> > > >                 return -1;
> > > >
> > > >         if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
> > > > -               if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0)
> > > > +               if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0)
> > > >                         return -1;
> > > >         } else if (priv->flags & NFT_INNER_TH) {
> > > > -               ctx.inner_thoff = off;
> > > > -               ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH;
> > > > +               tun_ctx->inner_thoff = off;
> > > > +               tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
> > > >         }
> > > >
> > > > -       *tun_ctx = ctx;
> > > >         tun_ctx->type = priv->type;
> > > > +       tun_ctx->skb = pkt->skb;
> > > >         pkt->flags |= NFT_PKTINFO_INNER_FULL;
> > > >
> > > >         return 0;
> > > >  }
> > > >
> > > > +static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
> > > > +                                     struct nft_inner_tun_ctx *tun_ctx)
> > > > +{
> > > > +       struct nft_inner_tun_ctx *this_cpu_tun_ctx;
> > > > +
> > > > +       local_bh_disable();
> > > > +       this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
> > > > +       if (this_cpu_tun_ctx->skb != pkt->skb) {
> > >
> > > I must say I do not understand this patch.
> > >
> > > If a context is used by a save/restore more than one time per packet
> > > traversal, then this means we can not use per-cpu storage,
> > > or risk flakes.
> > >
> > > Also, skb could be freed and re-allocated ?
> > >
> > > Perhaps describe a bit more what is going on in the changelog.
> >
> > There is an on-stack nft_pktinfo structure with a flags field. This
> > nft_pktinfo is a wrapper for the sk_buff, containing header offsets
> > and metainformation. This is initialize when entering this chain.
> >
> > If the NFT_PKTINFO_INNER_FULL flag is set on, then the percpu area
> > could contain information on the inner header offsets (ie. packet was
> > already parsed in this chain).
> >
> > There is a check to validate that the percpu area refers to this
> > skbuff. If there is a mismatch, then this needs to parse the inner
> > headers because the data in the percpu area is stale. Otherwise, if
> > there is a match, the percpu area is copied on-stack.
> >
> > After processing (payload/meta fetching), the on-stack copy is stored
> > back to the percpu area. I can make an improvement on this patch to
> > check if this skbuff still owns the percpu area in the store/exit
> > section of this inner evaluation routine, to avoid a unnecessary update.
> >
> > So, it is basically the NFT_PKTINFO_INNER_FULL flag that handles the
> > skbuff reallocation scenario. This is not blindly trusting this percpu
> > area per-se.
> >
> > One comestic change I can apply to this: I can also turn the struct
> > sk_buff into unsigned long so it clear to the reader this pointer to
> > skbuff is not meant to be used for being dereferenced.
> >
> > If the explaination above is sufficient, I can revamp to extend the
> > changelog as you suggest and post a new version of this patch.
> >
> > Thanks.
> 
> The part I do not understand is that tun_ctx->skb is not cleared at
> the end of processing (or at some point)
> 
> That would make clear that a future (tun_ctx->skb == skb) test is not
> confused by skb reuse (free/alloc).
>
> If you use it as a cookie, then we need something else than a pointer.

Revisiting this...

This is performing _three checks_ to validate that the percpu area is
valid for this skbuff:

static bool nft_inner_parse_needed(const struct nft_inner *priv,
                                   const struct nft_pktinfo *pkt,
                                   struct nft_inner_tun_ctx *tun_ctx)
{
        if (!(pkt->flags & NFT_PKTINFO_INNER_FULL))
                return true;

1) pkt->flags & NFT_PKTINFO_INNER_FULL tells us this percpu area could
   contain information for this skbuff already _in this chain_.

        if (!nft_inner_restore_tun_ctx(pkt, tun_ctx))
                return true;

2) this above checks if (tun_ctx->skb == skb)

        if (priv->type != tun_ctx->type)
                return true;

3) this also checks if inner header type in percpu area is the same
   as the type of this match.

But NFT_PKTINFO_INNER_FULL is only set for this packet in the context
of the chain.

I don't have a way to clear the percpu area, because I don't know what
would be the last rule that will try to match on inner headers.

As far as I understand, the problematic scenario is this:

   (process context)
        skb A Rule 1, no NFT_PKTINFO_INNER_FULL flag, initialize percpu area
        skb A Rule 2, NFT_PKTINFO_INNER_FULL is set, use percpu area ... but softirq comes while evaluating
        (softirq)
             skb B Rule 1, no NFT_PKTINFO_INNER_FULL flag, initialize percpu area
             (this is overriding the percpu and updating the cookie).
             skb B Rule 2, NFT_PKTINFO_INNER_FULL flag is set, use percpu area
             skb B Rule 3, NFT_PKTINFO_INNER_FULL flag is set, use percpu area
             skb B end of processing
   (resume to process context)
        skb A Rule 2, override percpu area
        skb A Rule 3, NFT_PKTINFO_INNER_FULL is set, use percpu area

I think this approach in this patch is sufficient to deal with this.

I can expand the commit description and rename the variable to _cookie_ to
make it more obvious to the reader.

Thanks.
diff mbox series

Patch

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index ff27cb2e1662..dae0e7592934 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -161,6 +161,7 @@  enum {
 };
 
 struct nft_inner_tun_ctx {
+	struct sk_buff *skb;	/* percpu area owner */
 	u16	type;
 	u16	inner_tunoff;
 	u16	inner_lloff;
diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
index 928312d01eb1..fcaa126ac8da 100644
--- a/net/netfilter/nft_inner.c
+++ b/net/netfilter/nft_inner.c
@@ -210,35 +210,65 @@  static int nft_inner_parse(const struct nft_inner *priv,
 			   struct nft_pktinfo *pkt,
 			   struct nft_inner_tun_ctx *tun_ctx)
 {
-	struct nft_inner_tun_ctx ctx = {};
 	u32 off = pkt->inneroff;
 
 	if (priv->flags & NFT_INNER_HDRSIZE &&
-	    nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0)
+	    nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0)
 		return -1;
 
 	if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
-		if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0)
+		if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0)
 			return -1;
 	} else if (priv->flags & NFT_INNER_TH) {
-		ctx.inner_thoff = off;
-		ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH;
+		tun_ctx->inner_thoff = off;
+		tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
 	}
 
-	*tun_ctx = ctx;
 	tun_ctx->type = priv->type;
+	tun_ctx->skb = pkt->skb;
 	pkt->flags |= NFT_PKTINFO_INNER_FULL;
 
 	return 0;
 }
 
+static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
+				      struct nft_inner_tun_ctx *tun_ctx)
+{
+	struct nft_inner_tun_ctx *this_cpu_tun_ctx;
+
+	local_bh_disable();
+	this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
+	if (this_cpu_tun_ctx->skb != pkt->skb) {
+		local_bh_enable();
+		return false;
+	}
+	*tun_ctx = *this_cpu_tun_ctx;
+	local_bh_enable();
+
+	return true;
+}
+
+static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt,
+				   const struct nft_inner_tun_ctx *tun_ctx)
+{
+	struct nft_inner_tun_ctx *this_cpu_tun_ctx;
+
+	local_bh_disable();
+	this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
+	*this_cpu_tun_ctx = *tun_ctx;
+	local_bh_enable();
+}
+
 static bool nft_inner_parse_needed(const struct nft_inner *priv,
 				   const struct nft_pktinfo *pkt,
-				   const struct nft_inner_tun_ctx *tun_ctx)
+				   struct nft_inner_tun_ctx *tun_ctx)
 {
 	if (!(pkt->flags & NFT_PKTINFO_INNER_FULL))
 		return true;
 
+	if (!nft_inner_restore_tun_ctx(pkt, tun_ctx))
+		return true;
+
 	if (priv->type != tun_ctx->type)
 		return true;
 
@@ -248,27 +278,29 @@  static bool nft_inner_parse_needed(const struct nft_inner *priv,
 static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs,
 			   const struct nft_pktinfo *pkt)
 {
-	struct nft_inner_tun_ctx *tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
 	const struct nft_inner *priv = nft_expr_priv(expr);
+	struct nft_inner_tun_ctx tun_ctx = {};
 
 	if (nft_payload_inner_offset(pkt) < 0)
 		goto err;
 
-	if (nft_inner_parse_needed(priv, pkt, tun_ctx) &&
-	    nft_inner_parse(priv, (struct nft_pktinfo *)pkt, tun_ctx) < 0)
+	if (nft_inner_parse_needed(priv, pkt, &tun_ctx) &&
+	    nft_inner_parse(priv, (struct nft_pktinfo *)pkt, &tun_ctx) < 0)
 		goto err;
 
 	switch (priv->expr_type) {
 	case NFT_INNER_EXPR_PAYLOAD:
-		nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx);
+		nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx);
 		break;
 	case NFT_INNER_EXPR_META:
-		nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx);
+		nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx);
 		break;
 	default:
 		WARN_ON_ONCE(1);
 		goto err;
 	}
+	nft_inner_save_tun_ctx(pkt, &tun_ctx);
+
 	return;
 err:
 	regs->verdict.code = NFT_BREAK;