Message ID | 1459560118-5582-2-git-send-email-bblanco@plumgrid.com |
---|---|
State | RFC, archived |
Delegated to: | David Miller |
Headers | show |
On Fri, Apr 1, 2016 at 9:21 PM, Brenden Blanco <bblanco@plumgrid.com> wrote: > Add a new bpf prog type that is intended to run in early stages of the > packet rx path. Only minimal packet metadata will be available, hence a new > context type, struct xdp_metadata, is exposed to userspace. So far only > expose the readable packet length, and only in read mode. > This would eventually be a generic abstraction of receive descriptors? > The PHYS_DEV name is chosen to represent that the program is meant only > for physical adapters, rather than all netdevs. > Is there a hard restriction that this could only work with physical devices? > While the user visible struct is new, the underlying context must be > implemented as a minimal skb in order for the packet load_* instructions > to work. The skb filled in by the driver must have skb->len, skb->head, > and skb->data set, and skb->data_len == 0. > > Signed-off-by: Brenden Blanco <bblanco@plumgrid.com> > --- > include/uapi/linux/bpf.h | 5 ++++ > kernel/bpf/verifier.c | 1 + > net/core/filter.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 74 insertions(+) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 924f537..b8a4ef2 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -92,6 +92,7 @@ enum bpf_prog_type { > BPF_PROG_TYPE_KPROBE, > BPF_PROG_TYPE_SCHED_CLS, > BPF_PROG_TYPE_SCHED_ACT, > + BPF_PROG_TYPE_PHYS_DEV, > }; > > #define BPF_PSEUDO_MAP_FD 1 > @@ -367,6 +368,10 @@ struct __sk_buff { > __u32 tc_classid; > }; > > +struct xdp_metadata { > + __u32 len; > +}; > + > struct bpf_tunnel_key { > __u32 tunnel_id; > union { > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c > index 2e08f8e..804ca70 100644 > --- a/kernel/bpf/verifier.c > +++ b/kernel/bpf/verifier.c > @@ -1340,6 +1340,7 @@ static bool may_access_skb(enum bpf_prog_type type) > case BPF_PROG_TYPE_SOCKET_FILTER: > case BPF_PROG_TYPE_SCHED_CLS: > case BPF_PROG_TYPE_SCHED_ACT: > + 
case BPF_PROG_TYPE_PHYS_DEV: > return true; > default: > return false; > diff --git a/net/core/filter.c b/net/core/filter.c > index b7177d0..c417db6 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -2018,6 +2018,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) > } > } > > +static const struct bpf_func_proto * > +phys_dev_func_proto(enum bpf_func_id func_id) > +{ > + return sk_filter_func_proto(func_id); > +} > + > static bool __is_valid_access(int off, int size, enum bpf_access_type type) > { > /* check bounds */ > @@ -2073,6 +2079,36 @@ static bool tc_cls_act_is_valid_access(int off, int size, > return __is_valid_access(off, size, type); > } > > +static bool __is_valid_xdp_access(int off, int size, > + enum bpf_access_type type) > +{ > + if (off < 0 || off >= sizeof(struct xdp_metadata)) > + return false; > + > + if (off % size != 0) > + return false; > + > + if (size != 4) > + return false; > + > + return true; > +} > + > +static bool phys_dev_is_valid_access(int off, int size, > + enum bpf_access_type type) > +{ > + if (type == BPF_WRITE) > + return false; > + > + switch (off) { > + case offsetof(struct xdp_metadata, len): > + break; > + default: > + return false; > + } > + return __is_valid_xdp_access(off, size, type); > +} > + > static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, > int src_reg, int ctx_off, > struct bpf_insn *insn_buf, > @@ -2210,6 +2246,26 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, > return insn - insn_buf; > } > > +static u32 bpf_phys_dev_convert_ctx_access(enum bpf_access_type type, > + int dst_reg, int src_reg, > + int ctx_off, > + struct bpf_insn *insn_buf, > + struct bpf_prog *prog) > +{ > + struct bpf_insn *insn = insn_buf; > + > + switch (ctx_off) { > + case offsetof(struct xdp_metadata, len): > + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); > + > + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, > + offsetof(struct sk_buff, len)); > + break; > 
+ } > + > + return insn - insn_buf; > +} > + > static const struct bpf_verifier_ops sk_filter_ops = { > .get_func_proto = sk_filter_func_proto, > .is_valid_access = sk_filter_is_valid_access, > @@ -2222,6 +2278,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = { > .convert_ctx_access = bpf_net_convert_ctx_access, > }; > > +static const struct bpf_verifier_ops phys_dev_ops = { > + .get_func_proto = phys_dev_func_proto, > + .is_valid_access = phys_dev_is_valid_access, > + .convert_ctx_access = bpf_phys_dev_convert_ctx_access, > +}; > + > static struct bpf_prog_type_list sk_filter_type __read_mostly = { > .ops = &sk_filter_ops, > .type = BPF_PROG_TYPE_SOCKET_FILTER, > @@ -2237,11 +2299,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = { > .type = BPF_PROG_TYPE_SCHED_ACT, > }; > > +static struct bpf_prog_type_list phys_dev_type __read_mostly = { > + .ops = &phys_dev_ops, > + .type = BPF_PROG_TYPE_PHYS_DEV, > +}; > + > static int __init register_sk_filter_ops(void) > { > bpf_register_prog_type(&sk_filter_type); > bpf_register_prog_type(&sched_cls_type); > bpf_register_prog_type(&sched_act_type); > + bpf_register_prog_type(&phys_dev_type); > > return 0; > } > -- > 2.8.0 >
On Sat, Apr 02, 2016 at 12:39:45PM -0400, Tom Herbert wrote: > On Fri, Apr 1, 2016 at 9:21 PM, Brenden Blanco <bblanco@plumgrid.com> wrote: > > Add a new bpf prog type that is intended to run in early stages of the > > packet rx path. Only minimal packet metadata will be available, hence a new > > context type, struct xdp_metadata, is exposed to userspace. So far only > > expose the readable packet length, and only in read mode. > > > This would eventually be a generic abstraction of receive descriptors? Exactly. For instance, maybe let the hw's hash be available. > > > The PHYS_DEV name is chosen to represent that the program is meant only > > for physical adapters, rather than all netdevs. > > > Is there a hard restriction that this could only work with physical devices? I suppose not, but there wouldn't be much use case as compared to tc ingress, no? Since I was imagining that this new hook would be more restricted in functionality due to operating on descriptors rather than with a full skb, I tried to think of an appropriate name. If you think that this hook would spread, then a better name is needed. > [...]
On 04/02/2016 03:21 AM, Brenden Blanco wrote: > Add a new bpf prog type that is intended to run in early stages of the > packet rx path. Only minimal packet metadata will be available, hence a new > context type, struct xdp_metadata, is exposed to userspace. So far only > expose the readable packet length, and only in read mode. > > The PHYS_DEV name is chosen to represent that the program is meant only > for physical adapters, rather than all netdevs. > > While the user visible struct is new, the underlying context must be > implemented as a minimal skb in order for the packet load_* instructions > to work. The skb filled in by the driver must have skb->len, skb->head, > and skb->data set, and skb->data_len == 0. > > Signed-off-by: Brenden Blanco <bblanco@plumgrid.com> > --- > include/uapi/linux/bpf.h | 5 ++++ > kernel/bpf/verifier.c | 1 + > net/core/filter.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 74 insertions(+) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 924f537..b8a4ef2 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -92,6 +92,7 @@ enum bpf_prog_type { > BPF_PROG_TYPE_KPROBE, > BPF_PROG_TYPE_SCHED_CLS, > BPF_PROG_TYPE_SCHED_ACT, > + BPF_PROG_TYPE_PHYS_DEV, > }; > > #define BPF_PSEUDO_MAP_FD 1 > @@ -367,6 +368,10 @@ struct __sk_buff { > __u32 tc_classid; > }; > > +struct xdp_metadata { > + __u32 len; > +}; Should this consistently be called 'xdp' or rather 'phys dev', because currently it's a mixture of both everywhere? 
> struct bpf_tunnel_key { > __u32 tunnel_id; > union { > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c > index 2e08f8e..804ca70 100644 > --- a/kernel/bpf/verifier.c > +++ b/kernel/bpf/verifier.c > @@ -1340,6 +1340,7 @@ static bool may_access_skb(enum bpf_prog_type type) > case BPF_PROG_TYPE_SOCKET_FILTER: > case BPF_PROG_TYPE_SCHED_CLS: > case BPF_PROG_TYPE_SCHED_ACT: > + case BPF_PROG_TYPE_PHYS_DEV: > return true; > default: > return false; > diff --git a/net/core/filter.c b/net/core/filter.c > index b7177d0..c417db6 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -2018,6 +2018,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) > } > } > > +static const struct bpf_func_proto * > +phys_dev_func_proto(enum bpf_func_id func_id) > +{ > + return sk_filter_func_proto(func_id); Do you plan to support bpf_skb_load_bytes() as well? I like using this API especially when dealing with larger chunks (>4 bytes) to load into stack memory, plus content is kept in network byte order. What about other helpers such as bpf_skb_store_bytes() et al that work on skbs. Do you intent to reuse them as is and thus populate the per cpu skb with needed fields (faking linear data), or do you see larger obstacles that prevent for this? Thanks, Daniel
On Mon, 04 Apr 2016 10:49:09 +0200 Daniel Borkmann <daniel@iogearbox.net> wrote: > On 04/02/2016 03:21 AM, Brenden Blanco wrote: > > Add a new bpf prog type that is intended to run in early stages of the > > packet rx path. Only minimal packet metadata will be available, hence a new > > context type, struct xdp_metadata, is exposed to userspace. So far only > > expose the readable packet length, and only in read mode. > > > > The PHYS_DEV name is chosen to represent that the program is meant only > > for physical adapters, rather than all netdevs. > > > > While the user visible struct is new, the underlying context must be > > implemented as a minimal skb in order for the packet load_* instructions > > to work. The skb filled in by the driver must have skb->len, skb->head, > > and skb->data set, and skb->data_len == 0. > > [...] > > Do you plan to support bpf_skb_load_bytes() as well? I like using > this API especially when dealing with larger chunks (>4 bytes) to > load into stack memory, plus content is kept in network byte order. > > What about other helpers such as bpf_skb_store_bytes() et al that > work on skbs. Do you intent to reuse them as is and thus populate > the per cpu skb with needed fields (faking linear data), or do you > see larger obstacles that prevent for this? Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send to users of this API. The hole idea is that an SKB is NOT allocated yet, and not needed at this level. If we start supporting calling underlying SKB functions, then we will end-up in the same place (performance wise).
On 04/04/2016 03:07 PM, Jesper Dangaard Brouer wrote: > On Mon, 04 Apr 2016 10:49:09 +0200 Daniel Borkmann <daniel@iogearbox.net> wrote: >> On 04/02/2016 03:21 AM, Brenden Blanco wrote: >>> Add a new bpf prog type that is intended to run in early stages of the >>> packet rx path. Only minimal packet metadata will be available, hence a new >>> context type, struct xdp_metadata, is exposed to userspace. So far only >>> expose the readable packet length, and only in read mode. >>> >>> The PHYS_DEV name is chosen to represent that the program is meant only >>> for physical adapters, rather than all netdevs. >>> >>> While the user visible struct is new, the underlying context must be >>> implemented as a minimal skb in order for the packet load_* instructions >>> to work. The skb filled in by the driver must have skb->len, skb->head, >>> and skb->data set, and skb->data_len == 0. >>> > [...] >> >> Do you plan to support bpf_skb_load_bytes() as well? I like using >> this API especially when dealing with larger chunks (>4 bytes) to >> load into stack memory, plus content is kept in network byte order. >> >> What about other helpers such as bpf_skb_store_bytes() et al that >> work on skbs. Do you intent to reuse them as is and thus populate >> the per cpu skb with needed fields (faking linear data), or do you >> see larger obstacles that prevent for this? > > Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send > to users of this API. > > The hole idea is that an SKB is NOT allocated yet, and not needed at > this level. If we start supporting calling underlying SKB functions, > then we will end-up in the same place (performance wise). I'm talking about the current skb-related BPF helper functions we have, so the question is how much from that code we have we can reuse under these constraints (obviously things like the tunnel helpers are a different story) and if that trade-off is acceptable for us. 
I'm also thinking that, for example, if you need to parse the packet data anyway for a drop verdict, you might as well pass some meta data (that is set in the real skb later on) for those packets that go up the stack.
On Mon, Apr 4, 2016 at 10:36 AM, Daniel Borkmann <daniel@iogearbox.net> wrote: > On 04/04/2016 03:07 PM, Jesper Dangaard Brouer wrote: >> >> On Mon, 04 Apr 2016 10:49:09 +0200 Daniel Borkmann <daniel@iogearbox.net> >> wrote: >>> >>> On 04/02/2016 03:21 AM, Brenden Blanco wrote: >>>> >>>> Add a new bpf prog type that is intended to run in early stages of the >>>> packet rx path. Only minimal packet metadata will be available, hence a >>>> new >>>> context type, struct xdp_metadata, is exposed to userspace. So far only >>>> expose the readable packet length, and only in read mode. >>>> >>>> The PHYS_DEV name is chosen to represent that the program is meant only >>>> for physical adapters, rather than all netdevs. >>>> >>>> While the user visible struct is new, the underlying context must be >>>> implemented as a minimal skb in order for the packet load_* instructions >>>> to work. The skb filled in by the driver must have skb->len, skb->head, >>>> and skb->data set, and skb->data_len == 0. >>>> >> [...] >>> >>> >>> Do you plan to support bpf_skb_load_bytes() as well? I like using >>> this API especially when dealing with larger chunks (>4 bytes) to >>> load into stack memory, plus content is kept in network byte order. >>> >>> What about other helpers such as bpf_skb_store_bytes() et al that >>> work on skbs. Do you intent to reuse them as is and thus populate >>> the per cpu skb with needed fields (faking linear data), or do you >>> see larger obstacles that prevent for this? >> >> >> Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send >> to users of this API. >> >> The hole idea is that an SKB is NOT allocated yet, and not needed at >> this level. If we start supporting calling underlying SKB functions, >> then we will end-up in the same place (performance wise). 
> > > I'm talking about the current skb-related BPF helper functions we have, > so the question is how much from that code we have we can reuse under > these constraints (obviously things like the tunnel helpers are a different > story) and if that trade-off is acceptable for us. I'm also thinking > that, for example, if you need to parse the packet data anyway for a drop > verdict, you might as well pass some meta data (that is set in the real > skb later on) for those packets that go up the stack. Right, the meta data in this case is an abstracted receive descriptor. This would include items that we get in a device receive descriptor (computed checksum, hash, VLAN tag). This is purposely a small restricted data structure. I'm hoping we can minimize the size of this to not much more than 32 bytes (including pointers to data and linkage). How this translates to an skb to maintain compatibility with BPF is an interesting question. One other consideration is that skb's are kernel specific, we should be able to use the same BPF filter program in userspace over DPDK for instance-- so an skb interface as the packet abstraction might not be the right model... Tom
On Mon, 2016-04-04 at 15:07 +0200, Jesper Dangaard Brouer wrote: > Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send > to users of this API. > > The hole idea is that an SKB is NOT allocated yet, and not needed at > this level. If we start supporting calling underlying SKB functions, > then we will end-up in the same place (performance wise). A BPF program can access many skb fields. If you plan to support BPF, your fake skb needs to be populated like a real one. Looks like some code will be replicated in all drivers that want this facility... Or accept (document ?) that some BPF instructions are just not there. (hash, queue_mapping ...)
On Mon, 4 Apr 2016 11:09:57 -0300 Tom Herbert <tom@herbertland.com> wrote: > On Mon, Apr 4, 2016 at 10:36 AM, Daniel Borkmann <daniel@iogearbox.net> wrote: > > On 04/04/2016 03:07 PM, Jesper Dangaard Brouer wrote: > >> > >> On Mon, 04 Apr 2016 10:49:09 +0200 Daniel Borkmann <daniel@iogearbox.net> > >> wrote: > >>> > >>> On 04/02/2016 03:21 AM, Brenden Blanco wrote: > >>>> > >>>> Add a new bpf prog type that is intended to run in early stages of the > >>>> packet rx path. Only minimal packet metadata will be available, hence a > >>>> new > >>>> context type, struct xdp_metadata, is exposed to userspace. So far only > >>>> expose the readable packet length, and only in read mode. > >>>> > >>>> The PHYS_DEV name is chosen to represent that the program is meant only > >>>> for physical adapters, rather than all netdevs. > >>>> > >>>> While the user visible struct is new, the underlying context must be > >>>> implemented as a minimal skb in order for the packet load_* instructions > >>>> to work. The skb filled in by the driver must have skb->len, skb->head, > >>>> and skb->data set, and skb->data_len == 0. > >>>> > >> [...] > >>> > >>> > >>> Do you plan to support bpf_skb_load_bytes() as well? I like using > >>> this API especially when dealing with larger chunks (>4 bytes) to > >>> load into stack memory, plus content is kept in network byte order. > >>> > >>> What about other helpers such as bpf_skb_store_bytes() et al that > >>> work on skbs. Do you intent to reuse them as is and thus populate > >>> the per cpu skb with needed fields (faking linear data), or do you > >>> see larger obstacles that prevent for this? > >> > >> > >> Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send > >> to users of this API. > >> > >> The hole idea is that an SKB is NOT allocated yet, and not needed at > >> this level. If we start supporting calling underlying SKB functions, > >> then we will end-up in the same place (performance wise). 
> > > > > > I'm talking about the current skb-related BPF helper functions we have, > > so the question is how much from that code we have we can reuse under > > these constraints (obviously things like the tunnel helpers are a different > > story) and if that trade-off is acceptable for us. I'm also thinking > > that, for example, if you need to parse the packet data anyway for a drop > > verdict, you might as well pass some meta data (that is set in the real > > skb later on) for those packets that go up the stack. > > Right, the meta data in this case is an abstracted receive descriptor. > This would include items that we get in a device receive descriptor > (computed checksum, hash, VLAN tag). This is purposely a small > restricted data structure. I'm hoping we can minimize the size of this > to not much more than 32 bytes (including pointers to data and > linkage). I agree. > How this translates to skb to maintain compatibility is with BPF > interesting question. One other consideration is that skb's are kernel > specific, we should be able to use the same BPF filter program in > userspace over DPDK for instance-- so an skb interface as the packet > abstraction might not be the right model... I agree. I don't think reusing the SKB data structure is the right model. We should drop the SKB pointer from the API. As Tom also points out, making the BPF interface independent of the SKB meta-data structure, would also make the eBPF program more generally applicable.
On 04/04/16 15:33, Eric Dumazet wrote: > On Mon, 2016-04-04 at 15:07 +0200, Jesper Dangaard Brouer wrote: > >> Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send >> to users of this API. >> >> The hole idea is that an SKB is NOT allocated yet, and not needed at >> this level. If we start supporting calling underlying SKB functions, >> then we will end-up in the same place (performance wise). > > A BPF program can access many skb fields. > > If you plan to support BPF, your fake skb needs to be populated like a > real one. Looks like some code will be replicated in all drivers that > want this facility... > > Or accept (document ?) that some BPF instructions are just not there. > (hash, queue_mapping ...) If these progs are eventually going to get pushed down into supporting hardware, many skb things won't make sense at all at that level. I would suggest that anything hardware wouldn't reasonably have available should be "just not there"; I suspect that'll lead you to the right API for early driver filter as well. And it probably won't look much like an skb. -Ed
On Mon, Apr 04, 2016 at 05:12:27PM +0200, Jesper Dangaard Brouer wrote: > On Mon, 4 Apr 2016 11:09:57 -0300 > Tom Herbert <tom@herbertland.com> wrote: > > > On Mon, Apr 4, 2016 at 10:36 AM, Daniel Borkmann <daniel@iogearbox.net> wrote: > > > On 04/04/2016 03:07 PM, Jesper Dangaard Brouer wrote: > > >> > > >> On Mon, 04 Apr 2016 10:49:09 +0200 Daniel Borkmann <daniel@iogearbox.net> > > >> wrote: > > >>> > > >>> On 04/02/2016 03:21 AM, Brenden Blanco wrote: > > >>>> > > >>>> Add a new bpf prog type that is intended to run in early stages of the > > >>>> packet rx path. Only minimal packet metadata will be available, hence a > > >>>> new > > >>>> context type, struct xdp_metadata, is exposed to userspace. So far only > > >>>> expose the readable packet length, and only in read mode. > > >>>> > > >>>> The PHYS_DEV name is chosen to represent that the program is meant only > > >>>> for physical adapters, rather than all netdevs. > > >>>> > > >>>> While the user visible struct is new, the underlying context must be > > >>>> implemented as a minimal skb in order for the packet load_* instructions > > >>>> to work. The skb filled in by the driver must have skb->len, skb->head, > > >>>> and skb->data set, and skb->data_len == 0. > > >>>> > > >> [...] > > >>> > > >>> > > >>> Do you plan to support bpf_skb_load_bytes() as well? I like using > > >>> this API especially when dealing with larger chunks (>4 bytes) to > > >>> load into stack memory, plus content is kept in network byte order. > > >>> > > >>> What about other helpers such as bpf_skb_store_bytes() et al that > > >>> work on skbs. Do you intent to reuse them as is and thus populate > > >>> the per cpu skb with needed fields (faking linear data), or do you > > >>> see larger obstacles that prevent for this? > > >> > > >> > > >> Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send > > >> to users of this API. 
> > >> > > >> The hole idea is that an SKB is NOT allocated yet, and not needed at > > >> this level. If we start supporting calling underlying SKB functions, > > >> then we will end-up in the same place (performance wise). > > > > > > > > > I'm talking about the current skb-related BPF helper functions we have, > > > so the question is how much from that code we have we can reuse under > > > these constraints (obviously things like the tunnel helpers are a different > > > story) and if that trade-off is acceptable for us. I'm also thinking > > > that, for example, if you need to parse the packet data anyway for a drop > > > verdict, you might as well pass some meta data (that is set in the real > > > skb later on) for those packets that go up the stack. > > > > Right, the meta data in this case is an abstracted receive descriptor. > > This would include items that we get in a device receive descriptor > > (computed checksum, hash, VLAN tag). This is purposely a small > > restricted data structure. I'm hoping we can minimize the size of this > > to not much more than 32 bytes (including pointers to data and > > linkage). > > I agree. > > > How this translates to skb to maintain compatibility is with BPF > > interesting question. One other consideration is that skb's are kernel > > specific, we should be able to use the same BPF filter program in > > userspace over DPDK for instance-- so an skb interface as the packet > > abstraction might not be the right model... > > I agree. I don't think reusing the SKB data structure is the right > model. We should drop the SKB pointer from the API. > > As Tom also points out, making the BPF interface independent of the SKB > meta-data structure, would also make the eBPF program more generally > applicable. The initial approach that I tried went down this path. Alexei advised that I use the pseudo skb, and in the future the API between drivers and bpf can change to adopt non-skb context. 
The only user-facing ABIs in this patchset are the IFLA, the xdp_metadata struct, and the name of the new enum. The reason to use a pseudo skb for now is that there will be a fair amount of churn to get bpf jit and interpreter to understand non-skb context in the bpf_load_pointer() code. I don't see the need for requiring that for this patchset, as it will be an internal-only change if/when we use something else. > > -- > Best regards, > Jesper Dangaard Brouer > MSc.CS, Principal Kernel Engineer at Red Hat > Author of http://www.iptv-analyzer.org > LinkedIn: http://www.linkedin.com/in/brouer
On 16-04-04 08:29 AM, Brenden Blanco wrote: > On Mon, Apr 04, 2016 at 05:12:27PM +0200, Jesper Dangaard Brouer wrote: >> On Mon, 4 Apr 2016 11:09:57 -0300 >> Tom Herbert <tom@herbertland.com> wrote: >> >>> On Mon, Apr 4, 2016 at 10:36 AM, Daniel Borkmann <daniel@iogearbox.net> wrote: >>>> On 04/04/2016 03:07 PM, Jesper Dangaard Brouer wrote: >>>>> >>>>> On Mon, 04 Apr 2016 10:49:09 +0200 Daniel Borkmann <daniel@iogearbox.net> >>>>> wrote: >>>>>> >>>>>> On 04/02/2016 03:21 AM, Brenden Blanco wrote: >>>>>>> >>>>>>> Add a new bpf prog type that is intended to run in early stages of the >>>>>>> packet rx path. Only minimal packet metadata will be available, hence a >>>>>>> new >>>>>>> context type, struct xdp_metadata, is exposed to userspace. So far only >>>>>>> expose the readable packet length, and only in read mode. >>>>>>> >>>>>>> The PHYS_DEV name is chosen to represent that the program is meant only >>>>>>> for physical adapters, rather than all netdevs. >>>>>>> >>>>>>> While the user visible struct is new, the underlying context must be >>>>>>> implemented as a minimal skb in order for the packet load_* instructions >>>>>>> to work. The skb filled in by the driver must have skb->len, skb->head, >>>>>>> and skb->data set, and skb->data_len == 0. >>>>>>> >>>>> [...] >>>>>> >>>>>> >>>>>> Do you plan to support bpf_skb_load_bytes() as well? I like using >>>>>> this API especially when dealing with larger chunks (>4 bytes) to >>>>>> load into stack memory, plus content is kept in network byte order. >>>>>> >>>>>> What about other helpers such as bpf_skb_store_bytes() et al that >>>>>> work on skbs. Do you intent to reuse them as is and thus populate >>>>>> the per cpu skb with needed fields (faking linear data), or do you >>>>>> see larger obstacles that prevent for this? >>>>> >>>>> >>>>> Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send >>>>> to users of this API. 
>>>>> >>>>> The hole idea is that an SKB is NOT allocated yet, and not needed at >>>>> this level. If we start supporting calling underlying SKB functions, >>>>> then we will end-up in the same place (performance wise). >>>> >>>> >>>> I'm talking about the current skb-related BPF helper functions we have, >>>> so the question is how much from that code we have we can reuse under >>>> these constraints (obviously things like the tunnel helpers are a different >>>> story) and if that trade-off is acceptable for us. I'm also thinking >>>> that, for example, if you need to parse the packet data anyway for a drop >>>> verdict, you might as well pass some meta data (that is set in the real >>>> skb later on) for those packets that go up the stack. >>> >>> Right, the meta data in this case is an abstracted receive descriptor. >>> This would include items that we get in a device receive descriptor >>> (computed checksum, hash, VLAN tag). This is purposely a small >>> restricted data structure. I'm hoping we can minimize the size of this >>> to not much more than 32 bytes (including pointers to data and >>> linkage). >> >> I agree. >> >>> How this translates to skb to maintain compatibility is with BPF >>> interesting question. One other consideration is that skb's are kernel >>> specific, we should be able to use the same BPF filter program in >>> userspace over DPDK for instance-- so an skb interface as the packet >>> abstraction might not be the right model... >> >> I agree. I don't think reusing the SKB data structure is the right >> model. We should drop the SKB pointer from the API. >> >> As Tom also points out, making the BPF interface independent of the SKB >> meta-data structure, would also make the eBPF program more generally >> applicable. > The initial approach that I tried went down this path. Alexei advised > that I use the pseudo skb, and in the future the API between drivers and > bpf can change to adopt non-skb context. 
The only user facing ABIs in > this patchset are the IFLA, the xdp_metadata struct, and the name of the > new enum. > > The reason to use a pseudo skb for now is that there will be a fair > amount of churn to get bpf jit and interpreter to understand non-skb > context in the bpf_load_pointer() code. I don't see the need for > requiring that for this patchset, as it will be internal-only change > if/when we use something else. Another option would be to have per driver JIT code to patch up the skb read/loads with descriptor reads and metadata. From a strictly performance stand point it should be better than pseudo skbs. .John
On Mon, Apr 04, 2016 at 09:07:03AM -0700, John Fastabend wrote: > On 16-04-04 08:29 AM, Brenden Blanco wrote: > > On Mon, Apr 04, 2016 at 05:12:27PM +0200, Jesper Dangaard Brouer wrote: > >> On Mon, 4 Apr 2016 11:09:57 -0300 > >> Tom Herbert <tom@herbertland.com> wrote: > >> > >>> On Mon, Apr 4, 2016 at 10:36 AM, Daniel Borkmann <daniel@iogearbox.net> wrote: > >>>> On 04/04/2016 03:07 PM, Jesper Dangaard Brouer wrote: > >>>>> > >>>>> On Mon, 04 Apr 2016 10:49:09 +0200 Daniel Borkmann <daniel@iogearbox.net> > >>>>> wrote: > >>>>>> > >>>>>> On 04/02/2016 03:21 AM, Brenden Blanco wrote: > >>>>>>> > >>>>>>> Add a new bpf prog type that is intended to run in early stages of the > >>>>>>> packet rx path. Only minimal packet metadata will be available, hence a > >>>>>>> new > >>>>>>> context type, struct xdp_metadata, is exposed to userspace. So far only > >>>>>>> expose the readable packet length, and only in read mode. > >>>>>>> > >>>>>>> The PHYS_DEV name is chosen to represent that the program is meant only > >>>>>>> for physical adapters, rather than all netdevs. > >>>>>>> > >>>>>>> While the user visible struct is new, the underlying context must be > >>>>>>> implemented as a minimal skb in order for the packet load_* instructions > >>>>>>> to work. The skb filled in by the driver must have skb->len, skb->head, > >>>>>>> and skb->data set, and skb->data_len == 0. > >>>>>>> > >>>>> [...] > >>>>>> > >>>>>> > >>>>>> Do you plan to support bpf_skb_load_bytes() as well? I like using > >>>>>> this API especially when dealing with larger chunks (>4 bytes) to > >>>>>> load into stack memory, plus content is kept in network byte order. > >>>>>> > >>>>>> What about other helpers such as bpf_skb_store_bytes() et al that > >>>>>> work on skbs. Do you intent to reuse them as is and thus populate > >>>>>> the per cpu skb with needed fields (faking linear data), or do you > >>>>>> see larger obstacles that prevent for this? > >>>>> > >>>>> > >>>>> Argh... 
maybe the minimal pseudo/fake SKB is the wrong "signal" to send > >>>>> to users of this API. > >>>>> > >>>>> The hole idea is that an SKB is NOT allocated yet, and not needed at > >>>>> this level. If we start supporting calling underlying SKB functions, > >>>>> then we will end-up in the same place (performance wise). > >>>> > >>>> > >>>> I'm talking about the current skb-related BPF helper functions we have, > >>>> so the question is how much from that code we have we can reuse under > >>>> these constraints (obviously things like the tunnel helpers are a different > >>>> story) and if that trade-off is acceptable for us. I'm also thinking > >>>> that, for example, if you need to parse the packet data anyway for a drop > >>>> verdict, you might as well pass some meta data (that is set in the real > >>>> skb later on) for those packets that go up the stack. > >>> > >>> Right, the meta data in this case is an abstracted receive descriptor. > >>> This would include items that we get in a device receive descriptor > >>> (computed checksum, hash, VLAN tag). This is purposely a small > >>> restricted data structure. I'm hoping we can minimize the size of this > >>> to not much more than 32 bytes (including pointers to data and > >>> linkage). > >> > >> I agree. > >> > >>> How this translates to skb to maintain compatibility is with BPF > >>> interesting question. One other consideration is that skb's are kernel > >>> specific, we should be able to use the same BPF filter program in > >>> userspace over DPDK for instance-- so an skb interface as the packet > >>> abstraction might not be the right model... > >> > >> I agree. I don't think reusing the SKB data structure is the right > >> model. We should drop the SKB pointer from the API. > >> > >> As Tom also points out, making the BPF interface independent of the SKB > >> meta-data structure, would also make the eBPF program more generally > >> applicable. > > The initial approach that I tried went down this path. 
Alexei advised > > that I use the pseudo skb, and in the future the API between drivers and > > bpf can change to adopt non-skb context. The only user facing ABIs in > > this patchset are the IFLA, the xdp_metadata struct, and the name of the > > new enum. > > > > The reason to use a pseudo skb for now is that there will be a fair > > amount of churn to get bpf jit and interpreter to understand non-skb > > context in the bpf_load_pointer() code. I don't see the need for > > requiring that for this patchset, as it will be internal-only change > > if/when we use something else. > > Another option would be to have per driver JIT code to patch up the > skb read/loads with descriptor reads and metadata. From a strictly > performance stand point it should be better than pseudo skbs. I considered (and implemented) this as well, but there my problem was that I needed to inform the bpf() syscall at BPF_PROG_LOAD time which ifindex to look at for fixups, so I had to add a new ifindex field to bpf_attr. Then during verification I had to use a new ndo to get the driver-specific offsets for its particular descriptor format. It seemed kludgy. > > .John
On Mon, Apr 04, 2016 at 09:17:22AM -0700, Brenden Blanco wrote: > > >> > > >> As Tom also points out, making the BPF interface independent of the SKB > > >> meta-data structure, would also make the eBPF program more generally > > >> applicable. > > > The initial approach that I tried went down this path. Alexei advised > > > that I use the pseudo skb, and in the future the API between drivers and > > > bpf can change to adopt non-skb context. The only user facing ABIs in > > > this patchset are the IFLA, the xdp_metadata struct, and the name of the > > > new enum. Exactly. That's the most important part of this rfc. Right now redirect to different queue, batching, prefetch and tons of other code are missing. We have to plan the whole project, so we can incrementally add features without breaking abi. So new IFLA, xdp_metadata struct and enum for bpf return codes are the main things to agree on. > > > The reason to use a pseudo skb for now is that there will be a fair > > > amount of churn to get bpf jit and interpreter to understand non-skb > > > context in the bpf_load_pointer() code. I don't see the need for > > > requiring that for this patchset, as it will be internal-only change > > > if/when we use something else. > > > > Another option would be to have per driver JIT code to patch up the > > skb read/loads with descriptor reads and metadata. From a strictly > > performance stand point it should be better than pseudo skbs. Per-driver pre/post JIT-like phase is actually on the table. It may still be needed. If we can avoid it while keeping high performance, great. > I considered (and implemented) this as well, but there my problem was > that I needed to inform the bpf() syscall at BPF_PROG_LOAD time which > ifindex to look at for fixups, so I had to add a new ifindex field to > bpf_attr. Then during verification I had to use a new ndo to get the > driver-specific offsets for its particular descriptor format. It seemed > kludgy.
Another reason for going with 'pseudo skb' structure was to reuse load_byte/half/word instructions in bpf interpreter as-is. Right now these instructions have to see in-kernel 'struct sk_buff' as context (note we have mirror __sk_buff for user space), so to use load_byte for bpf_prog_type_phys_dev we have to give real 'struct sk_buff' to interpreter with data, head, len, data_len fields initialized, so that interpreter 'just works'. The potential fix would be to add phys_dev specific load_byte/word instructions. Then we can drop all the legacy negative offset stuff that <1% uses, but it slows down everyone. We can also drop byteswap that load_word does (which turned out to be confusing and often programs do 2nd byteswap to go back to cpu endianness). And if we do it smart, we can drop length check as well, then new_load_byte will actually be single load byte cpu instruction. We can drop length check when offset is constant in the verifier and that constant is less than 64, since all packets are larger. As seen in 'perf report' from patch 5: 3.32% ksoftirqd/1 [kernel.vmlinux] [k] sk_load_byte_positive_offset this is 14Mpps and 4 assembler instructions in the above function are consuming 3% of the cpu. Making new_load_byte to be single x86 insn would be really cool. Of course, there are other pieces to accelerate: 12.71% ksoftirqd/1 [mlx4_en] [k] mlx4_en_alloc_frags 6.87% ksoftirqd/1 [mlx4_en] [k] mlx4_en_free_frag 4.20% ksoftirqd/1 [kernel.vmlinux] [k] get_page_from_freelist 4.09% swapper [mlx4_en] [k] mlx4_en_process_rx_cq and I think Jesper's work on batch allocation is going to help that a lot.
On 04/04/16 at 01:00pm, Alexei Starovoitov wrote: > Exactly. That the most important part of this rfc. > Right now redirect to different queue, batching, prefetch and tons of > other code are mising. We have to plan the whole project, so we can > incrementally add features without breaking abi. > So new IFLA, xdp_metadata struct and enum for bpf return codes are > the main things to agree on. +1 This is the most important statement in this thread so far. A plan that gets us from this RFC series to a functional forwarding engine with redirect and load/write is essential. [...] > Another reason for going with 'pseudo skb' structure was to reuse > load_byte/half/word instructions in bpf interpreter as-is. > Right now these instructions have to see in-kernel > 'struct sk_buff' as context (note we have mirror __sk_buff > for user space), so to use load_byte for bpf_prog_type_phys_dev > we have to give real 'struct sk_buff' to interpter with > data, head, len, data_len fields initialized, so that > interpreter 'just works'. > The potential fix would be to add phys_dev specific load_byte/word > instructions. Then we can drop all the legacy negative offset > stuff that <1% uses, but it slows down everyone. > We can also drop byteswap that load_word does (which turned out > to be confusing and often programs do 2nd byteswap to go > back to cpu endiannes). [...] I would really like to see a common set of helpers which applies to both cls_bpf and phys_dev. Given the existing skb based helpers cannot be overloaded, at least the phys_dev helpers should be made to work in cls_bpf context as well to allow for some portability. Otherwise we'll end up with half a dozen flavours of BPF which are all incompatible. > And if we do it smart, we can drop length check as well, > then new_load_byte will actually be single load byte cpu instruction. > We can drop length check when offset is constant in the verfier > and that constant is less than 64, since all packets are larger. 
> As seen in 'perf report' from patch 5: > 3.32% ksoftirqd/1 [kernel.vmlinux] [k] sk_load_byte_positive_offset > this is 14Mpps and 4 assembler instructions in the above function > are consuming 3% of the cpu. Making new_load_byte to be single > x86 insn would be really cool. Neat.
On 04/03/16 at 12:02am, Brenden Blanco wrote: > On Sat, Apr 02, 2016 at 12:39:45PM -0400, Tom Herbert wrote: > > Is there a hard restriction that this could only work with physical devices? > I suppose not, but there wouldn't be much use case as compared to tc > ingress, no? Since I was imagining that this new hook would be more > restricted in functionality due to operating on descriptors rather than > with a full skb, I tried to think of an appropriate name. > > If you think that this hook would spread, then a better name is needed. The thing that comes to mind is that this prog type makes it easier to implement batched processing of packets in BPF which would require major surgery across the tc layer to make it available in cls_bpf. Batched processing will be beneficial for software devices as well.
On Tue, Apr 05, 2016 at 12:04:39AM +0200, Thomas Graf wrote: > On 04/04/16 at 01:00pm, Alexei Starovoitov wrote: > > Exactly. That the most important part of this rfc. > > Right now redirect to different queue, batching, prefetch and tons of > > other code are mising. We have to plan the whole project, so we can > > incrementally add features without breaking abi. > > So new IFLA, xdp_metadata struct and enum for bpf return codes are > > the main things to agree on. > > +1 > This is the most important statement in this thread so far. A plan > that gets us from this RFC series to a functional forwarding engine > with redirect and load/write is essential. [...] exactly. I think the next step 2 is to figure out the redirect return code and 'rewiring' of the rx dma buffer into tx ring and auto-batching. As this rfc showed even when using standard page alloc/free the performance is hitting 10Gbps hw limit and not being cpu bounded, so recycling of the pages and avoiding map/unmap will come at step 3. Batching is necessary even for basic redirect, since ringing doorbell for every tx buffer is not an option. > [...] I would really like to see a common set of helpers which > applies to both cls_bpf and phys_dev. Given the existing skb based > helpers cannot be overloaded, at least the phys_dev helpers should > be made to work in cls_bpf context as well to allow for some > portability. Otherwise we'll end up with half a dozen flavours of > BPF which are all incompatible. The helpers can be 'overloaded'. In my upcoming patches for bpf+tracepoints the bpf_perf_event_output() helper is different depending on program type (kprobe vs tracepoint), but logically it looks exactly the same from program point of view and BPF_FUNC_id is reused. So for cls_bpf vs bpf_phys_dev we can have the same bpf_csum_diff() helper which will have different internal implementation depending on program type.
On Mon, 4 Apr 2016 19:25:08 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > On Tue, Apr 05, 2016 at 12:04:39AM +0200, Thomas Graf wrote: > > On 04/04/16 at 01:00pm, Alexei Starovoitov wrote: > > > Exactly. That the most important part of this rfc. > > > Right now redirect to different queue, batching, prefetch and tons of > > > other code are mising. We have to plan the whole project, so we can > > > incrementally add features without breaking abi. > > > So new IFLA, xdp_metadata struct and enum for bpf return codes are > > > the main things to agree on. > > > > +1 > > This is the most important statement in this thread so far. A plan > > that gets us from this RFC series to a functional forwarding engine > > with redirect and load/write is essential. [...] +1 agreed, I love to see the energy in this thread! :-) > exactly. I think the next step 2 is to figure out the redirect return code > and 'rewiring' of the rx dma buffer into tx ring and auto-batching. > > As this rfc showed even when using standard page alloc/free the peformance > is hitting 10Gbps hw limit and not being cpu bounded, so recycling of > the pages and avoiding map/unmap will come at step 3. Yes, I've also noticed the standard page alloc/free performance is slowing us down. I will be working on identifying (and measuring) page allocator bottlenecks. For the early drop case, we should be able to hack the driver to, recycle the page directly (and even avoid the DMA unmap). But for the TX (forward) case, we would need some kind of page-pool cache API to recycle the page through (also useful for normal netstack usage). I'm interested in implementing a generic page-pool cache mechanism, and I plan to bring up this topic at the MM-summit in 2 weeks. Input are welcome... but as Alexei says this is likely a step 3 project. > Batching is necessary even for basic redirect, since ringing doorbell > for every tx buffer is not an option. Yes, we know TX batching is essential for performance. 
If we create a RX bundle/batch step (in the driver) then eBPF forward step can work on a RX-bundle and build up TX-bundle(s) (per TX device), that can TX bulk send these. Notice that I propose building TX-bundles, not sending each individual packet and flushing tail/doorbell, because I want to maximize icache efficiency of the eBPF program.
On Tue, 5 Apr 2016 00:07:14 +0200 Thomas Graf <tgraf@suug.ch> wrote: > On 04/03/16 at 12:02am, Brenden Blanco wrote: > > On Sat, Apr 02, 2016 at 12:39:45PM -0400, Tom Herbert wrote: > > > Is there a hard restriction that this could only work with physical devices? > > I suppose not, but there wouldn't be much use case as compared to tc > > ingress, no? Since I was imagining that this new hook would be more > > restricted in functionality due to operating on descriptors rather than > > with a full skb, I tried to think of an appropriate name. > > > > If you think that this hook would spread, then a better name is needed. > > The thing that comes to mind is that this prog type makes it easier to > implement batched processing of packets in BPF which would require major > surgery across the tc layer to make it available in cls_bpf. Batched > processing will be beneficial for software devices as well. +1 agreed, I would really like to see batched processing of packets in BPF, for this packet type, designed in from the start.
On Mon, 4 Apr 2016 13:00:34 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > As seen in 'perf report' from patch 5: > 3.32% ksoftirqd/1 [kernel.vmlinux] [k] sk_load_byte_positive_offset > this is 14Mpps and 4 assembler instructions in the above function > are consuming 3% of the cpu. At this level we also need to take into account the cost/overhead of a function call. Which I've measured to between 5-7 cycles, part of my time_bench_sample[1] test. > Making new_load_byte to be single x86 insn would be really cool. > > Of course, there are other pieces to accelerate: > 12.71% ksoftirqd/1 [mlx4_en] [k] mlx4_en_alloc_frags > 6.87% ksoftirqd/1 [mlx4_en] [k] mlx4_en_free_frag > 4.20% ksoftirqd/1 [kernel.vmlinux] [k] get_page_from_freelist > 4.09% swapper [mlx4_en] [k] mlx4_en_process_rx_cq > and I think Jesper's work on batch allocation is going help that a lot. Actually, it looks like all of this "overhead" comes from the page alloc/free (+ dma unmap/map). We would need a page-pool recycle mechanism to solve/remove this overhead. For the early drop case we might be able to hack recycle the page directly in the driver (and also avoid dma_unmap/map cycle). [1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/time_bench_sample.c
On Tue, Apr 05, 2016 at 11:29:05AM +0200, Jesper Dangaard Brouer wrote: > > > > Of course, there are other pieces to accelerate: > > 12.71% ksoftirqd/1 [mlx4_en] [k] mlx4_en_alloc_frags > > 6.87% ksoftirqd/1 [mlx4_en] [k] mlx4_en_free_frag > > 4.20% ksoftirqd/1 [kernel.vmlinux] [k] get_page_from_freelist > > 4.09% swapper [mlx4_en] [k] mlx4_en_process_rx_cq > > and I think Jesper's work on batch allocation is going help that a lot. > > Actually, it looks like all of this "overhead" comes from the page > alloc/free (+ dma unmap/map). We would need a page-pool recycle > mechanism to solve/remove this overhead. For the early drop case we > might be able to hack recycle the page directly in the driver (and also > avoid dma_unmap/map cycle). Exactly. A cache of allocated and mapped pages will help a lot both drop and redirect use cases. After tx completion we can recycle still mmaped page into the cache (need to make sure to map them PCI_DMA_BIDIRECTIONAL) and rx can refill the ring with it. For load balancer steady state we won't have any calls to page allocator and dma. Being able to do cheap percpu pool like this is a huge advantage that any kernel bypass cannot have. I'm pretty sure it will be possible to avoid local_cmpxchg as well.
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 924f537..b8a4ef2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -92,6 +92,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_KPROBE, BPF_PROG_TYPE_SCHED_CLS, BPF_PROG_TYPE_SCHED_ACT, + BPF_PROG_TYPE_PHYS_DEV, }; #define BPF_PSEUDO_MAP_FD 1 @@ -367,6 +368,10 @@ struct __sk_buff { __u32 tc_classid; }; +struct xdp_metadata { + __u32 len; +}; + struct bpf_tunnel_key { __u32 tunnel_id; union { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2e08f8e..804ca70 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1340,6 +1340,7 @@ static bool may_access_skb(enum bpf_prog_type type) case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_PHYS_DEV: return true; default: return false; diff --git a/net/core/filter.c b/net/core/filter.c index b7177d0..c417db6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2018,6 +2018,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * +phys_dev_func_proto(enum bpf_func_id func_id) +{ + return sk_filter_func_proto(func_id); +} + static bool __is_valid_access(int off, int size, enum bpf_access_type type) { /* check bounds */ @@ -2073,6 +2079,36 @@ static bool tc_cls_act_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static bool __is_valid_xdp_access(int off, int size, + enum bpf_access_type type) +{ + if (off < 0 || off >= sizeof(struct xdp_metadata)) + return false; + + if (off % size != 0) + return false; + + if (size != 4) + return false; + + return true; +} + +static bool phys_dev_is_valid_access(int off, int size, + enum bpf_access_type type) +{ + if (type == BPF_WRITE) + return false; + + switch (off) { + case offsetof(struct xdp_metadata, len): + break; + default: + return false; + } + return __is_valid_xdp_access(off, size, type); +} + static u32 bpf_net_convert_ctx_access(enum 
bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf, @@ -2210,6 +2246,26 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } +static u32 bpf_phys_dev_convert_ctx_access(enum bpf_access_type type, + int dst_reg, int src_reg, + int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct xdp_metadata, len): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, len)); + break; + } + + return insn - insn_buf; +} + static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, @@ -2222,6 +2278,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = { .convert_ctx_access = bpf_net_convert_ctx_access, }; +static const struct bpf_verifier_ops phys_dev_ops = { + .get_func_proto = phys_dev_func_proto, + .is_valid_access = phys_dev_is_valid_access, + .convert_ctx_access = bpf_phys_dev_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -2237,11 +2299,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = { .type = BPF_PROG_TYPE_SCHED_ACT, }; +static struct bpf_prog_type_list phys_dev_type __read_mostly = { + .ops = &phys_dev_ops, + .type = BPF_PROG_TYPE_PHYS_DEV, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); bpf_register_prog_type(&sched_cls_type); bpf_register_prog_type(&sched_act_type); + bpf_register_prog_type(&phys_dev_type); return 0; }
Add a new bpf prog type that is intended to run in early stages of the packet rx path. Only minimal packet metadata will be available, hence a new context type, struct xdp_metadata, is exposed to userspace. So far only expose the readable packet length, and only in read mode. The PHYS_DEV name is chosen to represent that the program is meant only for physical adapters, rather than all netdevs. While the user visible struct is new, the underlying context must be implemented as a minimal skb in order for the packet load_* instructions to work. The skb filled in by the driver must have skb->len, skb->head, and skb->data set, and skb->data_len == 0. Signed-off-by: Brenden Blanco <bblanco@plumgrid.com> --- include/uapi/linux/bpf.h | 5 ++++ kernel/bpf/verifier.c | 1 + net/core/filter.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+)