Message ID | 1570515415-45593-3-git-send-email-sridhar.samudrala@intel.com |
---|---|
State | Changes Requested |
Delegated to: | BPF Maintainers |
Headers | show |
Series | Enable direct receive on AF_XDP sockets | expand |
Sridhar Samudrala <sridhar.samudrala@intel.com> writes: > int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, > struct bpf_prog *xdp_prog) > { > struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); > struct bpf_map *map = READ_ONCE(ri->map); > + struct xdp_sock *xsk; > + > + xsk = xdp_get_direct_xsk(ri); > + if (xsk) > + return xsk_rcv(xsk, xdp); This is a new branch and a read barrier in the XDP_REDIRECT fast path. What's the performance impact of that for non-XSK redirect? -Toke
On 2019-10-08 08:16, Sridhar Samudrala wrote: > Introduce a flag that can be specified during the bind() call > of an AF_XDP socket to receive packets directly from a queue when there is > no XDP program attached to the device. > > This is enabled by introducing a special BPF prog pointer called > BPF_PROG_DIRECT_XSK and a new bind flag XDP_DIRECT that can be specified > when binding an AF_XDP socket to a queue. At the time of bind(), an AF_XDP > socket in XDP_DIRECT mode, will attach BPF_PROG_DIRECT_XSK as a bpf program > if there is no other XDP program attached to the device. The device receive > queue is also associated with the AF_XDP socket. > > In the XDP receive path, if the bpf program is a DIRECT_XSK program, the > XDP buffer is passed to the AF_XDP socket that is associated with the > receive queue on which the packet is received. > > To avoid any overhead for normal XDP programs, a static key is used to keep > a count of AF_XDP direct xsk sockets and static_branch_unlikely() is used > to handle receives for direct XDP sockets. > > Any attach of a normal XDP program will take precedence and the direct xsk > program will be removed. The direct XSK program will be attached > automatically when the normal XDP program is removed when there are any > AF_XDP direct sockets associated with that device. 
> > Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com> > --- > include/linux/filter.h | 18 ++++++++++++ > include/linux/netdevice.h | 10 +++++++ > include/net/xdp_sock.h | 5 ++++ > include/uapi/linux/if_xdp.h | 5 ++++ > kernel/bpf/syscall.c | 7 +++-- > net/core/dev.c | 50 +++++++++++++++++++++++++++++++++ > net/core/filter.c | 58 +++++++++++++++++++++++++++++++++++++++ > net/xdp/xsk.c | 51 ++++++++++++++++++++++++++++++++-- > tools/include/uapi/linux/if_xdp.h | 5 ++++ > 9 files changed, 204 insertions(+), 5 deletions(-) > > diff --git a/include/linux/filter.h b/include/linux/filter.h > index 2ce57645f3cd..db4ad85d8321 100644 > --- a/include/linux/filter.h > +++ b/include/linux/filter.h > @@ -585,6 +585,9 @@ struct bpf_redirect_info { > struct bpf_map *map; > struct bpf_map *map_to_flush; > u32 kern_flags; > +#ifdef CONFIG_XDP_SOCKETS > + struct xdp_sock *xsk; > +#endif > }; > > DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); > @@ -693,6 +696,16 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, > return res; > } > > +#ifdef CONFIG_XDP_SOCKETS > +#define BPF_PROG_DIRECT_XSK 0x1 > +#define bpf_is_direct_xsk_prog(prog) \ > + ((unsigned long)prog == BPF_PROG_DIRECT_XSK) > +DECLARE_STATIC_KEY_FALSE(xdp_direct_xsk_needed); > +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp); > +#else > +#define bpf_is_direct_xsk_prog(prog) (false) > +#endif > + > static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, > struct xdp_buff *xdp) > { > @@ -702,6 +715,11 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, > * already takes rcu_read_lock() when fetching the program, so > * it's not necessary here anymore. 
> */ > +#ifdef CONFIG_XDP_SOCKETS > + if (static_branch_unlikely(&xdp_direct_xsk_needed) && > + bpf_is_direct_xsk_prog(prog)) > + return bpf_direct_xsk(prog, xdp); > +#endif > return BPF_PROG_RUN(prog, xdp); > } > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > index 48cc71aae466..f4d0f70aa718 100644 > --- a/include/linux/netdevice.h > +++ b/include/linux/netdevice.h > @@ -743,6 +743,7 @@ struct netdev_rx_queue { > struct xdp_rxq_info xdp_rxq; > #ifdef CONFIG_XDP_SOCKETS > struct xdp_umem *umem; > + struct xdp_sock *xsk; > #endif > } ____cacheline_aligned_in_smp; > > @@ -1836,6 +1837,10 @@ struct net_device { > atomic_t carrier_up_count; > atomic_t carrier_down_count; > > +#ifdef CONFIG_XDP_SOCKETS > + u16 direct_xsk_count; Why u16? num_rx/tx_queues are unsigned ints. > +#endif > + > #ifdef CONFIG_WIRELESS_EXT > const struct iw_handler_def *wireless_handlers; > struct iw_public_data *wireless_data; > @@ -3712,6 +3717,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); > bool is_skb_forwardable(const struct net_device *dev, > const struct sk_buff *skb); > > +#ifdef CONFIG_XDP_SOCKETS > +int dev_set_direct_xsk_prog(struct net_device *dev); > +int dev_clear_direct_xsk_prog(struct net_device *dev); > +#endif > + > static __always_inline int ____dev_forward_skb(struct net_device *dev, > struct sk_buff *skb) > { > diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h > index c9398ce7960f..9158233d34e1 100644 > --- a/include/net/xdp_sock.h > +++ b/include/net/xdp_sock.h > @@ -76,6 +76,9 @@ struct xsk_map_node { > struct xdp_sock **map_entry; > }; > > +/* Flags for the xdp_sock flags field. */ > +#define XDP_SOCK_DIRECT (1 << 0) > + > struct xdp_sock { > /* struct sock must be the first member of struct xdp_sock */ > struct sock sk; > @@ -104,6 +107,7 @@ struct xdp_sock { > struct list_head map_list; > /* Protects map_list */ > spinlock_t map_list_lock; > + u16 flags; Right now only the XDP_DIRECT is tracked here. 
Maybe track all flags, and show them in xsk_diag. > }; > > struct xdp_buff; > @@ -129,6 +133,7 @@ void xsk_set_tx_need_wakeup(struct xdp_umem *umem); > void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); > void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); > bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); > +struct xdp_sock *xdp_get_xsk_from_qid(struct net_device *dev, u16 queue_id); > > void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, > struct xdp_sock **map_entry); > diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h > index be328c59389d..d202b5d40859 100644 > --- a/include/uapi/linux/if_xdp.h > +++ b/include/uapi/linux/if_xdp.h > @@ -25,6 +25,11 @@ > * application. > */ > #define XDP_USE_NEED_WAKEUP (1 << 3) > +/* This option allows an AF_XDP socket bound to a queue to receive all > + * the packets directly from that queue when there is no XDP program > + * attached to the device. > + */ > +#define XDP_DIRECT (1 << 4) > > /* Flags for xsk_umem_config flags */ > #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 205f95af67d2..871d738a78d2 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -1349,13 +1349,14 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) > > void bpf_prog_put(struct bpf_prog *prog) > { > - __bpf_prog_put(prog, true); > + if (!bpf_is_direct_xsk_prog(prog)) > + __bpf_prog_put(prog, true); > } > EXPORT_SYMBOL_GPL(bpf_prog_put); > > u32 bpf_get_prog_id(const struct bpf_prog *prog) > { > - if (prog) > + if (prog && !bpf_is_direct_xsk_prog(prog)) > return prog->aux->id; > > return 0; > @@ -1364,7 +1365,7 @@ EXPORT_SYMBOL(bpf_get_prog_id); > > void bpf_set_prog_id(struct bpf_prog *prog, u32 id) > { > - if (prog) > + if (prog && !bpf_is_direct_xsk_prog(prog)) > prog->aux->id = id; > } > EXPORT_SYMBOL(bpf_set_prog_id); > diff --git a/net/core/dev.c b/net/core/dev.c > index 866d0ad936a5..eb3cd718e580 
100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -8269,6 +8269,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, > } else { > if (!__dev_xdp_query(dev, bpf_op, query)) > return 0; > +#ifdef CONFIG_XDP_SOCKETS > + if (dev->direct_xsk_count) > + prog = (void *)BPF_PROG_DIRECT_XSK; > +#endif Nit, but maybe hide this weirdness in a function? > } > > err = dev_xdp_install(dev, bpf_op, extack, flags, prog); > @@ -8278,6 +8282,52 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, > return err; > } > > +#ifdef CONFIG_XDP_SOCKETS > +int dev_set_direct_xsk_prog(struct net_device *dev) > +{ > + const struct net_device_ops *ops = dev->netdev_ops; > + struct bpf_prog *prog; > + bpf_op_t bpf_op; > + > + ASSERT_RTNL(); > + > + dev->direct_xsk_count++; > + > + bpf_op = ops->ndo_bpf; > + if (!bpf_op) > + return -EOPNOTSUPP; > + > + if (__dev_xdp_query(dev, bpf_op, XDP_QUERY_PROG)) > + return 0; > + > + prog = (void *)BPF_PROG_DIRECT_XSK; > + > + return dev_xdp_install(dev, bpf_op, NULL, XDP_FLAGS_DRV_MODE, prog); > +} > + > +int dev_clear_direct_xsk_prog(struct net_device *dev) > +{ > + const struct net_device_ops *ops = dev->netdev_ops; > + bpf_op_t bpf_op; > + > + ASSERT_RTNL(); > + > + dev->direct_xsk_count--; > + > + if (dev->direct_xsk_count) > + return 0; > + > + bpf_op = ops->ndo_bpf; > + if (!bpf_op) > + return -EOPNOTSUPP; > + > + if (__dev_xdp_query(dev, bpf_op, XDP_QUERY_PROG)) > + return 0; > + > + return dev_xdp_install(dev, bpf_op, NULL, XDP_FLAGS_DRV_MODE, NULL); > +} > +#endif > + > /** > * dev_new_index - allocate an ifindex > * @net: the applicable net namespace > diff --git a/net/core/filter.c b/net/core/filter.c > index ed6563622ce3..391d7d600284 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -73,6 +73,7 @@ > #include <net/lwtunnel.h> > #include <net/ipv6_stubs.h> > #include <net/bpf_sk_storage.h> > +#include <linux/static_key.h> > > /** > * sk_filter_trim_cap - run a 
packet through a socket filter > @@ -3546,6 +3547,22 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, > return 0; > } > > +#ifdef CONFIG_XDP_SOCKETS > +static void xdp_do_flush_xsk(struct bpf_redirect_info *ri) > +{ > + struct xdp_sock *xsk = READ_ONCE(ri->xsk); Why READ_ONCE here? > + > + if (xsk) { > + ri->xsk = NULL; > + xsk_flush(xsk); > + } > +} > +#else > +static inline void xdp_do_flush_xsk(struct bpf_redirect_info *ri) > +{ > +} > +#endif > + Move CONFIG_XDP_SOCKETS into the function, and remove the empty/bottom one. > void xdp_do_flush_map(void) > { > struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); > @@ -3568,6 +3585,8 @@ void xdp_do_flush_map(void) > break; > } > } > + > + xdp_do_flush_xsk(ri); > } > EXPORT_SYMBOL_GPL(xdp_do_flush_map); > > @@ -3631,11 +3650,28 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, > return err; > } > > +#ifdef CONFIG_XDP_SOCKETS > +static inline struct xdp_sock *xdp_get_direct_xsk(struct bpf_redirect_info *ri) > +{ > + return READ_ONCE(ri->xsk); Again, why READ_ONCE? Please leave the inlining to the compiler in .c files. > +} > +#else > +static inline struct xdp_sock *xdp_get_direct_xsk(struct bpf_redirect_info *ri) > +{ > + return NULL; > +} > +#endif > + > int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, > struct bpf_prog *xdp_prog) > { > struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); > struct bpf_map *map = READ_ONCE(ri->map); > + struct xdp_sock *xsk; > + > + xsk = xdp_get_direct_xsk(ri); > + if (xsk) > + return xsk_rcv(xsk, xdp); Hmm, maybe you need a xsk_to_flush as well. Say that a user swaps in a regular XDP program, then xsk_rcv() will be called until the flush occurs, right? IOW, all packets processed (napi budget) in the napi_poll will end up in the socket. 
> > if (likely(map)) > return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri); > @@ -8934,4 +8970,26 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { > > const struct bpf_prog_ops sk_reuseport_prog_ops = { > }; > + > +#ifdef CONFIG_XDP_SOCKETS > +DEFINE_STATIC_KEY_FALSE(xdp_direct_xsk_needed); > +EXPORT_SYMBOL_GPL(xdp_direct_xsk_needed); > + > +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp) > +{ > + struct xdp_sock *xsk; > + > + xsk = xdp_get_xsk_from_qid(xdp->rxq->dev, xdp->rxq->queue_index); > + if (xsk) { > + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); > + > + ri->xsk = xsk; > + return XDP_REDIRECT; From the comment above. I *think* you need to ri->xsk_to_flush. Can the direct socket (swap socket) change before flush? > + } > + > + return XDP_PASS; > +} > +EXPORT_SYMBOL(bpf_direct_xsk); > +#endif > + > #endif /* CONFIG_INET */ > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c > index fa8fbb8fa3c8..8a29939bac7e 100644 > --- a/net/xdp/xsk.c > +++ b/net/xdp/xsk.c > @@ -24,6 +24,7 @@ > #include <linux/rculist.h> > #include <net/xdp_sock.h> > #include <net/xdp.h> > +#include <linux/if_link.h> > > #include "xsk_queue.h" > #include "xdp_umem.h" > @@ -264,6 +265,45 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) > return err; > } > > +static void xdp_clear_direct_xsk(struct xdp_sock *xsk) Use xs, and not xsk for consistency. > +{ > + struct net_device *dev = xsk->dev; > + u32 qid = xsk->queue_id; > + > + if (!dev->_rx[qid].xsk) > + return; > + > + dev_clear_direct_xsk_prog(dev); > + dev->_rx[qid].xsk = NULL; > + static_branch_dec(&xdp_direct_xsk_needed); > + xsk->flags &= ~XDP_SOCK_DIRECT; > +} > + > +static int xdp_set_direct_xsk(struct xdp_sock *xsk) Same here. 
> +{ > + struct net_device *dev = xsk->dev; > + u32 qid = xsk->queue_id; > + int err = 0; > + > + if (dev->_rx[qid].xsk) > + return -EEXIST; > + > + xsk->flags |= XDP_SOCK_DIRECT; > + static_branch_inc(&xdp_direct_xsk_needed); > + dev->_rx[qid].xsk = xsk; > + err = dev_set_direct_xsk_prog(dev); > + if (err) > + xdp_clear_direct_xsk(xsk); > + > + return err; > +} > + > +struct xdp_sock *xdp_get_xsk_from_qid(struct net_device *dev, u16 queue_id) > +{ > + return dev->_rx[queue_id].xsk; > +} > +EXPORT_SYMBOL(xdp_get_xsk_from_qid); > + > void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) > { > xskq_produce_flush_addr_n(umem->cq, nb_entries); > @@ -464,6 +504,11 @@ static void xsk_unbind_dev(struct xdp_sock *xs) > return; > WRITE_ONCE(xs->state, XSK_UNBOUND); > > + if (xs->flags & XDP_SOCK_DIRECT) { > + rtnl_lock(); > + xdp_clear_direct_xsk(xs); > + rtnl_unlock(); > + } > /* Wait for driver to stop using the xdp socket. */ > xdp_del_sk_umem(xs->umem, xs); > xs->dev = NULL; > @@ -604,7 +649,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) > > flags = sxdp->sxdp_flags; > if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | > - XDP_USE_NEED_WAKEUP)) > + XDP_USE_NEED_WAKEUP | XDP_DIRECT)) > return -EINVAL; > > rtnl_lock(); > @@ -632,7 +677,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) > struct socket *sock; > > if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || > - (flags & XDP_USE_NEED_WAKEUP)) { > + (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_DIRECT)) { > /* Cannot specify flags for shared sockets. 
*/ > err = -EINVAL; > goto out_unlock; > @@ -688,6 +733,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) > xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask); > xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask); > xdp_add_sk_umem(xs->umem, xs); > + if (flags & XDP_DIRECT) > + err = xdp_set_direct_xsk(xs); > > out_unlock: > if (err) { > diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h > index be328c59389d..d202b5d40859 100644 > --- a/tools/include/uapi/linux/if_xdp.h > +++ b/tools/include/uapi/linux/if_xdp.h > @@ -25,6 +25,11 @@ > * application. > */ > #define XDP_USE_NEED_WAKEUP (1 << 3) > +/* This option allows an AF_XDP socket bound to a queue to receive all > + * the packets directly from that queue when there is no XDP program > + * attached to the device. > + */ > +#define XDP_DIRECT (1 << 4) > > /* Flags for xsk_umem_config flags */ > #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) >
On Tue, 8 Oct 2019 at 08:59, Toke Høiland-Jørgensen <toke@redhat.com> wrote: > > Sridhar Samudrala <sridhar.samudrala@intel.com> writes: > > > int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, > > struct bpf_prog *xdp_prog) > > { > > struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); > > struct bpf_map *map = READ_ONCE(ri->map); > > + struct xdp_sock *xsk; > > + > > + xsk = xdp_get_direct_xsk(ri); > > + if (xsk) > > + return xsk_rcv(xsk, xdp); > > This is a new branch and a read barrier in the XDP_REDIRECT fast path. > What's the performance impact of that for non-XSK redirect? > The dependent-read-barrier in READ_ONCE? Another branch -- leave that to the branch-predictor already! ;-) No, you're right, performance impact here is interesting. I guess the same static_branch could be used here as well... > -Toke >
On Tue, 8 Oct 2019 at 10:47, Björn Töpel <bjorn.topel@gmail.com> wrote: > [...] > > The dependent-read-barrier in READ_ONCE? Another branch -- leave that > to the branch-predictor already! ;-) No, you're right, performance > impact here is interesting. I guess the same static_branch could be > used here as well... > ...and I think the READ_ONCE can be omitted.
Björn Töpel <bjorn.topel@gmail.com> writes: > On Tue, 8 Oct 2019 at 08:59, Toke Høiland-Jørgensen <toke@redhat.com> wrote: >> >> Sridhar Samudrala <sridhar.samudrala@intel.com> writes: >> >> > int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, >> > struct bpf_prog *xdp_prog) >> > { >> > struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); >> > struct bpf_map *map = READ_ONCE(ri->map); >> > + struct xdp_sock *xsk; >> > + >> > + xsk = xdp_get_direct_xsk(ri); >> > + if (xsk) >> > + return xsk_rcv(xsk, xdp); >> >> This is a new branch and a read barrier in the XDP_REDIRECT fast path. >> What's the performance impact of that for non-XSK redirect? >> > > The dependent-read-barrier in READ_ONCE? Another branch -- leave that > to the branch-predictor already! ;-) No, you're right, performance > impact here is interesting. I guess the same static_branch could be > used here as well... In any case, some measurements to see the impact (or lack thereof) would be useful :) -Toke
On Mon, Oct 7, 2019 at 11:18 PM Sridhar Samudrala <sridhar.samudrala@intel.com> wrote: > + > +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp) > +{ > + struct xdp_sock *xsk; > + > + xsk = xdp_get_xsk_from_qid(xdp->rxq->dev, xdp->rxq->queue_index); > + if (xsk) { > + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); > + > + ri->xsk = xsk; > + return XDP_REDIRECT; > + } > + > + return XDP_PASS; > +} > +EXPORT_SYMBOL(bpf_direct_xsk); So you're saying there is a: """ xdpsock rxdrop 1 core (both app and queue's irq pinned to the same core) default : taskset -c 1 ./xdpsock -i enp66s0f0 -r -q 1 direct-xsk :taskset -c 1 ./xdpsock -i enp66s0f0 -r -q 1 6.1x improvement in drop rate """ 6.1x gain running above C code vs exactly equivalent BPF code? How is that possible?
On 10/8/2019 1:05 AM, Björn Töpel wrote: > On 2019-10-08 08:16, Sridhar Samudrala wrote: >> Introduce a flag that can be specified during the bind() call >> of an AF_XDP socket to receive packets directly from a queue when >> there is >> no XDP program attached to the device. >> >> This is enabled by introducing a special BPF prog pointer called >> BPF_PROG_DIRECT_XSK and a new bind flag XDP_DIRECT that can be specified >> when binding an AF_XDP socket to a queue. At the time of bind(), an >> AF_XDP >> socket in XDP_DIRECT mode, will attach BPF_PROG_DIRECT_XSK as a bpf >> program >> if there is no other XDP program attached to the device. The device >> receive >> queue is also associated with the AF_XDP socket. >> >> In the XDP receive path, if the bpf program is a DIRECT_XSK program, the >> XDP buffer is passed to the AF_XDP socket that is associated with the >> receive queue on which the packet is received. >> >> To avoid any overhead for normal XDP programs, a static key is used to >> keep >> a count of AF_XDP direct xsk sockets and static_branch_unlikely() is used >> to handle receives for direct XDP sockets. >> >> Any attach of a normal XDP program will take precedence and the direct >> xsk >> program will be removed. The direct XSK program will be attached >> automatically when the normal XDP program is removed when there are any >> AF_XDP direct sockets associated with that device. 
>> >> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com> >> --- >> include/linux/filter.h | 18 ++++++++++++ >> include/linux/netdevice.h | 10 +++++++ >> include/net/xdp_sock.h | 5 ++++ >> include/uapi/linux/if_xdp.h | 5 ++++ >> kernel/bpf/syscall.c | 7 +++-- >> net/core/dev.c | 50 >> +++++++++++++++++++++++++++++++++ >> net/core/filter.c | 58 >> +++++++++++++++++++++++++++++++++++++++ >> net/xdp/xsk.c | 51 >> ++++++++++++++++++++++++++++++++-- >> tools/include/uapi/linux/if_xdp.h | 5 ++++ >> 9 files changed, 204 insertions(+), 5 deletions(-) >> >> diff --git a/include/linux/filter.h b/include/linux/filter.h >> index 2ce57645f3cd..db4ad85d8321 100644 >> --- a/include/linux/filter.h >> +++ b/include/linux/filter.h >> @@ -585,6 +585,9 @@ struct bpf_redirect_info { >> struct bpf_map *map; >> struct bpf_map *map_to_flush; >> u32 kern_flags; >> +#ifdef CONFIG_XDP_SOCKETS >> + struct xdp_sock *xsk; >> +#endif >> }; >> DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); >> @@ -693,6 +696,16 @@ static inline u32 bpf_prog_run_clear_cb(const >> struct bpf_prog *prog, >> return res; >> } >> +#ifdef CONFIG_XDP_SOCKETS >> +#define BPF_PROG_DIRECT_XSK 0x1 >> +#define bpf_is_direct_xsk_prog(prog) \ >> + ((unsigned long)prog == BPF_PROG_DIRECT_XSK) >> +DECLARE_STATIC_KEY_FALSE(xdp_direct_xsk_needed); >> +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp); >> +#else >> +#define bpf_is_direct_xsk_prog(prog) (false) >> +#endif >> + >> static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog >> *prog, >> struct xdp_buff *xdp) >> { >> @@ -702,6 +715,11 @@ static __always_inline u32 bpf_prog_run_xdp(const >> struct bpf_prog *prog, >> * already takes rcu_read_lock() when fetching the program, so >> * it's not necessary here anymore. 
>> */ >> +#ifdef CONFIG_XDP_SOCKETS >> + if (static_branch_unlikely(&xdp_direct_xsk_needed) && >> + bpf_is_direct_xsk_prog(prog)) >> + return bpf_direct_xsk(prog, xdp); >> +#endif >> return BPF_PROG_RUN(prog, xdp); >> } >> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h >> index 48cc71aae466..f4d0f70aa718 100644 >> --- a/include/linux/netdevice.h >> +++ b/include/linux/netdevice.h >> @@ -743,6 +743,7 @@ struct netdev_rx_queue { >> struct xdp_rxq_info xdp_rxq; >> #ifdef CONFIG_XDP_SOCKETS >> struct xdp_umem *umem; >> + struct xdp_sock *xsk; >> #endif >> } ____cacheline_aligned_in_smp; >> @@ -1836,6 +1837,10 @@ struct net_device { >> atomic_t carrier_up_count; >> atomic_t carrier_down_count; >> +#ifdef CONFIG_XDP_SOCKETS >> + u16 direct_xsk_count; > > Why u16? num_rx/tx_queues are unsigned ints. OK. Will change to unsigned int > >> +#endif >> + >> #ifdef CONFIG_WIRELESS_EXT >> const struct iw_handler_def *wireless_handlers; >> struct iw_public_data *wireless_data; >> @@ -3712,6 +3717,11 @@ int dev_forward_skb(struct net_device *dev, >> struct sk_buff *skb); >> bool is_skb_forwardable(const struct net_device *dev, >> const struct sk_buff *skb); >> +#ifdef CONFIG_XDP_SOCKETS >> +int dev_set_direct_xsk_prog(struct net_device *dev); >> +int dev_clear_direct_xsk_prog(struct net_device *dev); >> +#endif >> + >> static __always_inline int ____dev_forward_skb(struct net_device *dev, >> struct sk_buff *skb) >> { >> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h >> index c9398ce7960f..9158233d34e1 100644 >> --- a/include/net/xdp_sock.h >> +++ b/include/net/xdp_sock.h >> @@ -76,6 +76,9 @@ struct xsk_map_node { >> struct xdp_sock **map_entry; >> }; >> +/* Flags for the xdp_sock flags field. 
*/ >> +#define XDP_SOCK_DIRECT (1 << 0) >> + >> struct xdp_sock { >> /* struct sock must be the first member of struct xdp_sock */ >> struct sock sk; >> @@ -104,6 +107,7 @@ struct xdp_sock { >> struct list_head map_list; >> /* Protects map_list */ >> spinlock_t map_list_lock; >> + u16 flags; > > Right now only the XDP_DIRECT is tracked here. Maybe track all flags, > and show them in xsk_diag. I see zc as the other field that can be converted into a flag. Do you want to include that as part of this series or can it be done later? > >> }; >> struct xdp_buff; >> @@ -129,6 +133,7 @@ void xsk_set_tx_need_wakeup(struct xdp_umem *umem); >> void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); >> void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); >> bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); >> +struct xdp_sock *xdp_get_xsk_from_qid(struct net_device *dev, u16 >> queue_id); >> void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, >> struct xdp_sock **map_entry); >> diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h >> index be328c59389d..d202b5d40859 100644 >> --- a/include/uapi/linux/if_xdp.h >> +++ b/include/uapi/linux/if_xdp.h >> @@ -25,6 +25,11 @@ >> * application. >> */ >> #define XDP_USE_NEED_WAKEUP (1 << 3) >> +/* This option allows an AF_XDP socket bound to a queue to receive all >> + * the packets directly from that queue when there is no XDP program >> + * attached to the device. 
>> + */ >> +#define XDP_DIRECT (1 << 4) >> /* Flags for xsk_umem_config flags */ >> #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) >> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c >> index 205f95af67d2..871d738a78d2 100644 >> --- a/kernel/bpf/syscall.c >> +++ b/kernel/bpf/syscall.c >> @@ -1349,13 +1349,14 @@ static void __bpf_prog_put(struct bpf_prog >> *prog, bool do_idr_lock) >> void bpf_prog_put(struct bpf_prog *prog) >> { >> - __bpf_prog_put(prog, true); >> + if (!bpf_is_direct_xsk_prog(prog)) >> + __bpf_prog_put(prog, true); >> } >> EXPORT_SYMBOL_GPL(bpf_prog_put); >> u32 bpf_get_prog_id(const struct bpf_prog *prog) >> { >> - if (prog) >> + if (prog && !bpf_is_direct_xsk_prog(prog)) >> return prog->aux->id; >> return 0; >> @@ -1364,7 +1365,7 @@ EXPORT_SYMBOL(bpf_get_prog_id); >> void bpf_set_prog_id(struct bpf_prog *prog, u32 id) >> { >> - if (prog) >> + if (prog && !bpf_is_direct_xsk_prog(prog)) >> prog->aux->id = id; >> } >> EXPORT_SYMBOL(bpf_set_prog_id); >> diff --git a/net/core/dev.c b/net/core/dev.c >> index 866d0ad936a5..eb3cd718e580 100644 >> --- a/net/core/dev.c >> +++ b/net/core/dev.c >> @@ -8269,6 +8269,10 @@ int dev_change_xdp_fd(struct net_device *dev, >> struct netlink_ext_ack *extack, >> } else { >> if (!__dev_xdp_query(dev, bpf_op, query)) >> return 0; >> +#ifdef CONFIG_XDP_SOCKETS >> + if (dev->direct_xsk_count) >> + prog = (void *)BPF_PROG_DIRECT_XSK; >> +#endif > > Nit, but maybe hide this weirdness in a function? OK. 
> >> } >> err = dev_xdp_install(dev, bpf_op, extack, flags, prog); >> @@ -8278,6 +8282,52 @@ int dev_change_xdp_fd(struct net_device *dev, >> struct netlink_ext_ack *extack, >> return err; >> } >> +#ifdef CONFIG_XDP_SOCKETS >> +int dev_set_direct_xsk_prog(struct net_device *dev) >> +{ >> + const struct net_device_ops *ops = dev->netdev_ops; >> + struct bpf_prog *prog; >> + bpf_op_t bpf_op; >> + >> + ASSERT_RTNL(); >> + >> + dev->direct_xsk_count++; >> + >> + bpf_op = ops->ndo_bpf; >> + if (!bpf_op) >> + return -EOPNOTSUPP; >> + >> + if (__dev_xdp_query(dev, bpf_op, XDP_QUERY_PROG)) >> + return 0; >> + >> + prog = (void *)BPF_PROG_DIRECT_XSK; >> + >> + return dev_xdp_install(dev, bpf_op, NULL, XDP_FLAGS_DRV_MODE, prog); >> +} >> + >> +int dev_clear_direct_xsk_prog(struct net_device *dev) >> +{ >> + const struct net_device_ops *ops = dev->netdev_ops; >> + bpf_op_t bpf_op; >> + >> + ASSERT_RTNL(); >> + >> + dev->direct_xsk_count--; >> + >> + if (dev->direct_xsk_count) >> + return 0; >> + >> + bpf_op = ops->ndo_bpf; >> + if (!bpf_op) >> + return -EOPNOTSUPP; >> + >> + if (__dev_xdp_query(dev, bpf_op, XDP_QUERY_PROG)) >> + return 0; >> + >> + return dev_xdp_install(dev, bpf_op, NULL, XDP_FLAGS_DRV_MODE, NULL); >> +} >> +#endif >> + >> /** >> * dev_new_index - allocate an ifindex >> * @net: the applicable net namespace >> diff --git a/net/core/filter.c b/net/core/filter.c >> index ed6563622ce3..391d7d600284 100644 >> --- a/net/core/filter.c >> +++ b/net/core/filter.c >> @@ -73,6 +73,7 @@ >> #include <net/lwtunnel.h> >> #include <net/ipv6_stubs.h> >> #include <net/bpf_sk_storage.h> >> +#include <linux/static_key.h> >> /** >> * sk_filter_trim_cap - run a packet through a socket filter >> @@ -3546,6 +3547,22 @@ static int __bpf_tx_xdp_map(struct net_device >> *dev_rx, void *fwd, >> return 0; >> } >> +#ifdef CONFIG_XDP_SOCKETS >> +static void xdp_do_flush_xsk(struct bpf_redirect_info *ri) >> +{ >> + struct xdp_sock *xsk = READ_ONCE(ri->xsk); > > Why READ_ONCE here? 
> >> + >> + if (xsk) { >> + ri->xsk = NULL; >> + xsk_flush(xsk); >> + } >> +} >> +#else >> +static inline void xdp_do_flush_xsk(struct bpf_redirect_info *ri) >> +{ >> +} >> +#endif >> + > > Move CONFIG_XDP_SOCKETS into the function, and remove the empty/bottom one. > >> void xdp_do_flush_map(void) >> { >> struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); >> @@ -3568,6 +3585,8 @@ void xdp_do_flush_map(void) >> break; >> } >> } >> + >> + xdp_do_flush_xsk(ri); >> } >> EXPORT_SYMBOL_GPL(xdp_do_flush_map); >> @@ -3631,11 +3650,28 @@ static int xdp_do_redirect_map(struct >> net_device *dev, struct xdp_buff *xdp, >> return err; >> } >> +#ifdef CONFIG_XDP_SOCKETS >> +static inline struct xdp_sock *xdp_get_direct_xsk(struct >> bpf_redirect_info *ri) >> +{ >> + return READ_ONCE(ri->xsk); > > Again, why READ_ONCE? Please leave the inlining to the compiler in .c > files. > >> +} >> +#else >> +static inline struct xdp_sock *xdp_get_direct_xsk(struct >> bpf_redirect_info *ri) >> +{ >> + return NULL; >> +} >> +#endif >> + >> int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, >> struct bpf_prog *xdp_prog) >> { >> struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); >> struct bpf_map *map = READ_ONCE(ri->map); >> + struct xdp_sock *xsk; >> + >> + xsk = xdp_get_direct_xsk(ri); >> + if (xsk) >> + return xsk_rcv(xsk, xdp); > > Hmm, maybe you need a xsk_to_flush as well. Say that a user swaps in a > regular XDP program, then xsk_rcv() will be called until the flush > occurs, right? IOW, all packets processed (napi budget) in the napi_poll > will end up in the socket. I originally had xsk_to_flush considering this possibility of the XDP program changing before call to flush. Will re-introduce it in the next revision. Should i create a separate structure bpf_direct_xsk_info rather than adding these fields to bpf_redirect_info? 
> >> if (likely(map)) >> return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri); >> @@ -8934,4 +8970,26 @@ const struct bpf_verifier_ops >> sk_reuseport_verifier_ops = { >> const struct bpf_prog_ops sk_reuseport_prog_ops = { >> }; >> + >> +#ifdef CONFIG_XDP_SOCKETS >> +DEFINE_STATIC_KEY_FALSE(xdp_direct_xsk_needed); >> +EXPORT_SYMBOL_GPL(xdp_direct_xsk_needed); >> + >> +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp) >> +{ >> + struct xdp_sock *xsk; >> + >> + xsk = xdp_get_xsk_from_qid(xdp->rxq->dev, xdp->rxq->queue_index); >> + if (xsk) { >> + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); >> + >> + ri->xsk = xsk; >> + return XDP_REDIRECT; > > From the comment above. I *think* you need to ri->xsk_to_flush. Can the > direct socket (swap socket) change before flush? Yes. > >> + } >> + >> + return XDP_PASS; >> +} >> +EXPORT_SYMBOL(bpf_direct_xsk); >> +#endif >> + >> #endif /* CONFIG_INET */ >> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c >> index fa8fbb8fa3c8..8a29939bac7e 100644 >> --- a/net/xdp/xsk.c >> +++ b/net/xdp/xsk.c >> @@ -24,6 +24,7 @@ >> #include <linux/rculist.h> >> #include <net/xdp_sock.h> >> #include <net/xdp.h> >> +#include <linux/if_link.h> >> #include "xsk_queue.h" >> #include "xdp_umem.h" >> @@ -264,6 +265,45 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct >> xdp_buff *xdp) >> return err; >> } >> +static void xdp_clear_direct_xsk(struct xdp_sock *xsk) > > Use xs, and not xsk for consistency. OK. > >> +{ >> + struct net_device *dev = xsk->dev; >> + u32 qid = xsk->queue_id; >> + >> + if (!dev->_rx[qid].xsk) >> + return; >> + >> + dev_clear_direct_xsk_prog(dev); >> + dev->_rx[qid].xsk = NULL; >> + static_branch_dec(&xdp_direct_xsk_needed); >> + xsk->flags &= ~XDP_SOCK_DIRECT; >> +} >> + >> +static int xdp_set_direct_xsk(struct xdp_sock *xsk) > > Same here. 
> >> +{ >> + struct net_device *dev = xsk->dev; >> + u32 qid = xsk->queue_id; >> + int err = 0; >> + >> + if (dev->_rx[qid].xsk) >> + return -EEXIST; >> + >> + xsk->flags |= XDP_SOCK_DIRECT; >> + static_branch_inc(&xdp_direct_xsk_needed); >> + dev->_rx[qid].xsk = xsk; >> + err = dev_set_direct_xsk_prog(dev); >> + if (err) >> + xdp_clear_direct_xsk(xsk); >> + >> + return err; >> +} >> + >> +struct xdp_sock *xdp_get_xsk_from_qid(struct net_device *dev, u16 >> queue_id) >> +{ >> + return dev->_rx[queue_id].xsk; >> +} >> +EXPORT_SYMBOL(xdp_get_xsk_from_qid); >> + >> void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) >> { >> xskq_produce_flush_addr_n(umem->cq, nb_entries); >> @@ -464,6 +504,11 @@ static void xsk_unbind_dev(struct xdp_sock *xs) >> return; >> WRITE_ONCE(xs->state, XSK_UNBOUND); >> + if (xs->flags & XDP_SOCK_DIRECT) { >> + rtnl_lock(); >> + xdp_clear_direct_xsk(xs); >> + rtnl_unlock(); >> + } >> /* Wait for driver to stop using the xdp socket. */ >> xdp_del_sk_umem(xs->umem, xs); >> xs->dev = NULL; >> @@ -604,7 +649,7 @@ static int xsk_bind(struct socket *sock, struct >> sockaddr *addr, int addr_len) >> flags = sxdp->sxdp_flags; >> if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | >> - XDP_USE_NEED_WAKEUP)) >> + XDP_USE_NEED_WAKEUP | XDP_DIRECT)) >> return -EINVAL; >> rtnl_lock(); >> @@ -632,7 +677,7 @@ static int xsk_bind(struct socket *sock, struct >> sockaddr *addr, int addr_len) >> struct socket *sock; >> if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || >> - (flags & XDP_USE_NEED_WAKEUP)) { >> + (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_DIRECT)) { >> /* Cannot specify flags for shared sockets. 
*/ >> err = -EINVAL; >> goto out_unlock; >> @@ -688,6 +733,8 @@ static int xsk_bind(struct socket *sock, struct >> sockaddr *addr, int addr_len) >> xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask); >> xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask); >> xdp_add_sk_umem(xs->umem, xs); >> + if (flags & XDP_DIRECT) >> + err = xdp_set_direct_xsk(xs); >> out_unlock: >> if (err) { >> diff --git a/tools/include/uapi/linux/if_xdp.h >> b/tools/include/uapi/linux/if_xdp.h >> index be328c59389d..d202b5d40859 100644 >> --- a/tools/include/uapi/linux/if_xdp.h >> +++ b/tools/include/uapi/linux/if_xdp.h >> @@ -25,6 +25,11 @@ >> * application. >> */ >> #define XDP_USE_NEED_WAKEUP (1 << 3) >> +/* This option allows an AF_XDP socket bound to a queue to receive all >> + * the packets directly from that queue when there is no XDP program >> + * attached to the device. >> + */ >> +#define XDP_DIRECT (1 << 4) >> /* Flags for xsk_umem_config flags */ >> #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) >>
>> + >> +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp) >> +{ >> + struct xdp_sock *xsk; >> + >> + xsk = xdp_get_xsk_from_qid(xdp->rxq->dev, xdp->rxq->queue_index); >> + if (xsk) { >> + struct bpf_redirect_info *ri = >> + this_cpu_ptr(&bpf_redirect_info); >> + >> + ri->xsk = xsk; >> + return XDP_REDIRECT; >> + } >> + >> + return XDP_PASS; >> +} >> +EXPORT_SYMBOL(bpf_direct_xsk); > > So you're saying there is a: > """ > xdpsock rxdrop 1 core (both app and queue's irq pinned to the same core) > default : taskset -c 1 ./xdpsock -i enp66s0f0 -r -q 1 > direct-xsk :taskset -c 1 ./xdpsock -i enp66s0f0 -r -q 1 6.1x improvement in drop rate """ > > 6.1x gain running above C code vs exactly equivalent BPF code? > How is that possible? It seems to be due to the overhead of __bpf_prog_run on older processors (Ivybridge). The overhead is smaller on newer processors, but even on skylake i see around 1.5x improvement. perf report with default xdpsock ================================ Samples: 2K of event 'cycles:ppp', Event count (approx.): 8437658090 Overhead Command Shared Object Symbol 34.57% xdpsock xdpsock [.] 
main 17.19% ksoftirqd/1 [kernel.vmlinux] [k] ___bpf_prog_run 13.12% xdpsock [kernel.vmlinux] [k] ___bpf_prog_run 4.09% ksoftirqd/1 [kernel.vmlinux] [k] __x86_indirect_thunk_rax 3.08% xdpsock [kernel.vmlinux] [k] nmi 2.76% ksoftirqd/1 [kernel.vmlinux] [k] xsk_map_lookup_elem 2.33% xdpsock [kernel.vmlinux] [k] __x86_indirect_thunk_rax 2.33% ksoftirqd/1 [i40e] [k] i40e_clean_rx_irq_zc 2.16% xdpsock [kernel.vmlinux] [k] bpf_map_lookup_elem 1.82% ksoftirqd/1 [kernel.vmlinux] [k] xdp_do_redirect 1.41% ksoftirqd/1 [kernel.vmlinux] [k] xsk_rcv 1.39% ksoftirqd/1 [kernel.vmlinux] [k] update_curr 1.09% ksoftirqd/1 [kernel.vmlinux] [k] bpf_xdp_redirect_map 1.09% xdpsock [i40e] [k] i40e_clean_rx_irq_zc 1.08% ksoftirqd/1 [kernel.vmlinux] [k] __xsk_map_redirect 1.07% swapper [kernel.vmlinux] [k] xsk_umem_peek_addr 1.05% ksoftirqd/1 [kernel.vmlinux] [k] xsk_umem_peek_addr 0.89% swapper [kernel.vmlinux] [k] __xsk_map_redirect 0.87% ksoftirqd/1 [kernel.vmlinux] [k] __bpf_prog_run32 0.87% swapper [kernel.vmlinux] [k] intel_idle 0.67% xdpsock [kernel.vmlinux] [k] bpf_xdp_redirect_map 0.57% xdpsock [kernel.vmlinux] [k] xdp_do_redirect perf report with direct xdpsock =============================== Samples: 2K of event 'cycles:ppp', Event count (approx.): 17996091975 Overhead Command Shared Object Symbol 18.44% xdpsock [i40e] [k] i40e_clean_rx_irq_zc 15.14% ksoftirqd/1 [i40e] [k] i40e_clean_rx_irq_zc 6.87% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr 5.03% ksoftirqd/1 [kernel.vmlinux] [k] xdp_do_redirect 4.21% xdpsock xdpsock [.] 
main 4.13% ksoftirqd/1 [i40e] [k] i40e_clean_programming_status 3.71% xdpsock [kernel.vmlinux] [k] xsk_rcv 3.44% ksoftirqd/1 [kernel.vmlinux] [k] nmi 3.41% xdpsock [kernel.vmlinux] [k] nmi 3.20% ksoftirqd/1 [kernel.vmlinux] [k] xsk_rcv 2.45% xdpsock [kernel.vmlinux] [k] xdp_get_xsk_from_qid 2.35% ksoftirqd/1 [kernel.vmlinux] [k] xsk_umem_peek_addr 2.33% ksoftirqd/1 [kernel.vmlinux] [k] net_rx_action 2.16% ksoftirqd/1 [kernel.vmlinux] [k] xsk_umem_consume_tx 2.10% swapper [kernel.vmlinux] [k] __softirqentry_text_start 2.06% xdpsock [kernel.vmlinux] [k] native_irq_return_iret 1.43% xdpsock [kernel.vmlinux] [k] check_preempt_wakeup 1.42% xdpsock [kernel.vmlinux] [k] xsk_umem_consume_tx 1.22% xdpsock [kernel.vmlinux] [k] xdp_do_redirect 1.21% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device 1.16% ksoftirqd/1 [kernel.vmlinux] [k] irqtime_account_irq 1.09% xdpsock [kernel.vmlinux] [k] sock_def_readable 0.99% swapper [kernel.vmlinux] [k] intel_idle 0.88% xdpsock [i40e] [k] i40e_clean_programming_status 0.74% ksoftirqd/1 [kernel.vmlinux] [k] xsk_umem_discard_addr 0.71% ksoftirqd/1 [kernel.vmlinux] [k] __switch_to 0.50% ksoftirqd/1 [kernel.vmlinux] [k] dma_direct_sync_single_for_device
On Wed, Oct 9, 2019 at 9:53 AM Samudrala, Sridhar <sridhar.samudrala@intel.com> wrote: > > > >> + > >> +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp) > >> +{ > >> + struct xdp_sock *xsk; > >> + > >> + xsk = xdp_get_xsk_from_qid(xdp->rxq->dev, xdp->rxq->queue_index); > >> + if (xsk) { > >> + struct bpf_redirect_info *ri = > >> + this_cpu_ptr(&bpf_redirect_info); > >> + > >> + ri->xsk = xsk; > >> + return XDP_REDIRECT; > >> + } > >> + > >> + return XDP_PASS; > >> +} > >> +EXPORT_SYMBOL(bpf_direct_xsk); > > > > So you're saying there is a: > > """ > > xdpsock rxdrop 1 core (both app and queue's irq pinned to the same core) > > default : taskset -c 1 ./xdpsock -i enp66s0f0 -r -q 1 > > direct-xsk :taskset -c 1 ./xdpsock -i enp66s0f0 -r -q 1 6.1x improvement in drop rate """ > > > > 6.1x gain running above C code vs exactly equivalent BPF code? > > How is that possible? > > It seems to be due to the overhead of __bpf_prog_run on older processors > (Ivybridge). The overhead is smaller on newer processors, but even on > skylake i see around 1.5x improvement. > > perf report with default xdpsock > ================================ > Samples: 2K of event 'cycles:ppp', Event count (approx.): 8437658090 > Overhead Command Shared Object Symbol > 34.57% xdpsock xdpsock [.] main > 17.19% ksoftirqd/1 [kernel.vmlinux] [k] ___bpf_prog_run > 13.12% xdpsock [kernel.vmlinux] [k] ___bpf_prog_run That must be a bad joke. The whole patch set is based on comparing native code to interpreter?! It's pretty awesome that interpreter is only 1.5x slower than native x86. Just turn the JIT on. Obvious Nack to the patch set.
On 10/9/2019 10:17 AM, Alexei Starovoitov wrote: > On Wed, Oct 9, 2019 at 9:53 AM Samudrala, Sridhar > <sridhar.samudrala@intel.com> wrote: >> >> >>>> + >>>> +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp) >>>> +{ >>>> + struct xdp_sock *xsk; >>>> + >>>> + xsk = xdp_get_xsk_from_qid(xdp->rxq->dev, xdp->rxq->queue_index); >>>> + if (xsk) { >>>> + struct bpf_redirect_info *ri = >>>> + this_cpu_ptr(&bpf_redirect_info); >>>> + >>>> + ri->xsk = xsk; >>>> + return XDP_REDIRECT; >>>> + } >>>> + >>>> + return XDP_PASS; >>>> +} >>>> +EXPORT_SYMBOL(bpf_direct_xsk); >>> >>> So you're saying there is a: >>> """ >>> xdpsock rxdrop 1 core (both app and queue's irq pinned to the same core) >>> default : taskset -c 1 ./xdpsock -i enp66s0f0 -r -q 1 >>> direct-xsk :taskset -c 1 ./xdpsock -i enp66s0f0 -r -q 1 6.1x improvement in drop rate """ >>> >>> 6.1x gain running above C code vs exactly equivalent BPF code? >>> How is that possible? >> >> It seems to be due to the overhead of __bpf_prog_run on older processors >> (Ivybridge). The overhead is smaller on newer processors, but even on >> skylake i see around 1.5x improvement. >> >> perf report with default xdpsock >> ================================ >> Samples: 2K of event 'cycles:ppp', Event count (approx.): 8437658090 >> Overhead Command Shared Object Symbol >> 34.57% xdpsock xdpsock [.] main >> 17.19% ksoftirqd/1 [kernel.vmlinux] [k] ___bpf_prog_run >> 13.12% xdpsock [kernel.vmlinux] [k] ___bpf_prog_run > > That must be a bad joke. > The whole patch set is based on comparing native code to interpreter?! > It's pretty awesome that interpreter is only 1.5x slower than native x86. > Just turn the JIT on. Thanks Alexei for pointing out that i didn't have JIT on. When i turn it on, the performance improvement is a more modest 1.5x with rxdrop and 1.2x with l2fwd. > > Obvious Nack to the patch set. > Will update the patchset with the right performance data and address feedback from Bjorn. 
Hope you are not totally against the direct XDP approach, as it does provide value when an AF_XDP socket is bound to a queue and a HW filter can direct packets targeted for that queue.
On Wed, Oct 9, 2019 at 12:12 PM Samudrala, Sridhar <sridhar.samudrala@intel.com> wrote: > >> 34.57% xdpsock xdpsock [.] main > >> 17.19% ksoftirqd/1 [kernel.vmlinux] [k] ___bpf_prog_run > >> 13.12% xdpsock [kernel.vmlinux] [k] ___bpf_prog_run > > > > That must be a bad joke. > > The whole patch set is based on comparing native code to interpreter?! > > It's pretty awesome that interpreter is only 1.5x slower than native x86. > > Just turn the JIT on. > > Thanks Alexei for pointing out that i didn't have JIT on. > When i turn it on, the performance improvement is a more modest 1.5x > with rxdrop and 1.2x with l2fwd. I want to see perf reports back to back with proper performance analysis. > > > > Obvious Nack to the patch set. > > > > Will update the patchset with the right performance data and address > feedback from Bjorn. > Hope you are not totally against direct XDP approach as it does provide > value when an AF_XDP socket is bound to a queue and a HW filter can > direct packets targeted for that queue. I used to be in favor of providing "prog bypass" for af_xdp, because of anecdotal evidence that it will make af_xdp faster. Now seeing the numbers and the way they were collected I'm against such bypass. I want to see hard proof that trivial bpf prog is actually slowing things down before reviewing any new patch sets.
On 10/9/2019 6:06 PM, Alexei Starovoitov wrote: >> >> Will update the patchset with the right performance data and address >> feedback from Bjorn. >> Hope you are not totally against direct XDP approach as it does provide >> value when an AF_XDP socket is bound to a queue and a HW filter can >> direct packets targeted for that queue. > > I used to be in favor of providing "prog bypass" for af_xdp, > because of anecdotal evidence that it will make af_xdp faster. > Now seeing the numbers and the way they were collected > I'm against such bypass. > I want to see hard proof that trivial bpf prog is actually slowing things down > before reviewing any new patch sets. > Here is a more detailed performance report that compares the current AF_XDP rx_drop with the patches that enable direct receive without any XDP program. I also collected and included kernel rxdrop data too as Jakub requested and also perf reports. Hope it addresses the concerns you raised with the earlier data I posted. Test Setup ========== 2 Skylake servers with Intel 40Gbe NICs connected via 100Gb Switch Server Configuration ==================== Intel(R) Xeon(R) Platinum 8180 CPU @ 2.50GHz (skylake) CPU(s): 112 On-line CPU(s) list: 0-111 Thread(s) per core: 2 Core(s) per socket: 28 Socket(s): 2 NUMA node(s): 2 NUMA node0 CPU(s): 0-27,56-83 NUMA node1 CPU(s): 28-55,84-111 Memory: 96GB NIC: Intel Corporation Ethernet Controller XL710 for 40GbE QSFP+ (rev 02) Distro ====== Fedora 29 (Server Edition) Kernel Configuration ==================== AF_XDP direct socket patches applied on top of bpf-next git repo HEAD: 05949f63055fcf53947886ddb8e23c8a5d41bd80 # cat /proc/cmdline BOOT_IMAGE=/vmlinuz-5.3.0-bpf-next-dxk+ root=/dev/mapper/fedora-root ro resume=/dev/mapper/fedora-swap rd.lvm.lv=fedora/root rd.lvm.lv=fedora/swap LANG=en_IN.UTF-8 For ‘mitigations OFF’ scenarios, the kernel command line parameter is changed to add ‘mitigations=off’ Packet Generator on the link partner ==================================== 
pktgen - sending 64 byte UDP packets at 43mpps HW filter to redirect packet to a queue ======================================= ethtool -N ens260f1 flow-type udp4 dst-ip 192.168.128.41 dst-port 9 action 28 Test Cases ========== kernel rxdrop taskset -c 28 samples/bpf/xdp_rxq_info -d ens260f1 -a XDP_DROP AF_XDP default rxdrop taskset -c 28 samples/bpf/xdpsock -i ens260f1 -r -q 28 AF_XDP direct rxdrop taskset -c 28 samples/bpf/xdpsock -i ens260f1 -r -d -q 28 Performance Results =================== Only 1 core is used in all these testcases as the app and the queue irq are pinned to the same core. ---------------------------------------------------------------------------------- mitigations ON mitigations OFF Testcase ---------------------------------------------------------- no patches with patches no patches with patches ---------------------------------------------------------------------------------- AF_XDP default rxdrop X X Y Y AF_XDP direct rxdrop N/A X+46% N/A Y+25% Kernel rxdrop X+61% X+61% Y+53% Y+53% ---------------------------------------------------------------------------------- Here Y is pps with CPU security mitigations turned OFF and it is 26% better than X. So there is 46% improvement in AF_XDP rxdrop performance with direct receive when mitigations are ON (default configuration) and 25% improvement when mitigations are turned OFF. As expected, the in-kernel rxdrop performance is higher even with direct receive in both scenarios. Perf report for "AF_XDP default rxdrop" with patched kernel - mitigations ON ========================================================================== Samples: 44K of event 'cycles', Event count (approx.): 38532389541 Overhead Command Shared Object Symbol 15.31% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc 10.50% ksoftirqd/28 bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 9.48% xdpsock [i40e] [k] i40e_clean_rx_irq_zc 8.62% xdpsock xdpsock [.] 
main 7.11% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv 5.81% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect 4.46% xdpsock bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 3.83% xdpsock [kernel.vmlinux] [k] xsk_rcv 2.81% ksoftirqd/28 [kernel.vmlinux] [k] bpf_xdp_redirect_map 2.78% ksoftirqd/28 [kernel.vmlinux] [k] xsk_map_lookup_elem 2.44% xdpsock [kernel.vmlinux] [k] xdp_do_redirect 2.19% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_redirect 1.62% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr 1.57% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr 1.32% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu 1.28% xdpsock [kernel.vmlinux] [k] bpf_xdp_redirect_map 1.15% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device 1.12% xdpsock [kernel.vmlinux] [k] xsk_map_lookup_elem 1.06% xdpsock [kernel.vmlinux] [k] __xsk_map_redirect 0.94% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device 0.75% ksoftirqd/28 [kernel.vmlinux] [k] __x86_indirect_thunk_rax 0.66% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status 0.64% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action 0.64% swapper [kernel.vmlinux] [k] intel_idle 0.62% ksoftirqd/28 [i40e] [k] i40e_napi_poll 0.57% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu Perf report for "AF_XDP direct rxdrop" with patched kernel - mitigations ON ========================================================================== Samples: 46K of event 'cycles', Event count (approx.): 38387018585 Overhead Command Shared Object Symbol 21.94% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc 14.36% xdpsock xdpsock [.] 
main 11.53% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv 11.32% xdpsock [i40e] [k] i40e_clean_rx_irq_zc 4.02% xdpsock [kernel.vmlinux] [k] xsk_rcv 2.91% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect 2.45% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr 2.19% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr 2.08% ksoftirqd/28 [kernel.vmlinux] [k] bpf_direct_xsk 2.07% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu 1.53% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device 1.39% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device 1.22% ksoftirqd/28 [kernel.vmlinux] [k] xdp_get_xsk_from_qid 1.12% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status 0.96% ksoftirqd/28 [i40e] [k] i40e_napi_poll 0.95% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action 0.89% xdpsock [kernel.vmlinux] [k] xdp_do_redirect 0.83% swapper [i40e] [k] i40e_clean_rx_irq_zc 0.70% swapper [kernel.vmlinux] [k] intel_idle 0.66% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu 0.60% xdpsock [kernel.vmlinux] [k] bpf_direct_xsk 0.50% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_discard_addr Based on the perf reports comparing AF_XDP default and direct rxdrop, we can say that AF_XDP direct rxdrop codepath is avoiding the overhead of going through these functions bpf_prog_xxx bpf_xdp_redirect_map xsk_map_lookup_elem __xsk_map_redirect With AF_XDP direct, xsk_rcv() is directly called via bpf_direct_xsk() in xdp_do_redirect() The above test results document performance of components on a particular test, in specific systems. Differences in hardware, software, or configuration will affect actual performance. Thanks Sridhar
"Samudrala, Sridhar" <sridhar.samudrala@intel.com> writes: > Performance Results > =================== > Only 1 core is used in all these testcases as the app and the queue irq are pinned to the same core. > > ---------------------------------------------------------------------------------- > mitigations ON mitigations OFF > Testcase ---------------------------------------------------------- > no patches with patches no patches with patches > ---------------------------------------------------------------------------------- > AF_XDP default rxdrop X X Y Y Is this really exactly the same with and without patches? You're adding an extra check to xdp_do_redirect(); are you really saying that the impact of that is zero? > AF_XDP direct rxdrop N/A X+46% N/A Y+25% > Kernel rxdrop X+61% X+61% Y+53% Y+53% > ---------------------------------------------------------------------------------- > > Here Y is pps with CPU security mitigations turned OFF and it is 26% > better than X. Any particular reason you're not sharing the values of X and Y? -Toke
On Fri, Oct 18, 2019 at 11:40:07AM -0700, Samudrala, Sridhar wrote: > > Perf report for "AF_XDP default rxdrop" with patched kernel - mitigations ON > ========================================================================== > Samples: 44K of event 'cycles', Event count (approx.): 38532389541 > Overhead Command Shared Object Symbol > 15.31% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc > 10.50% ksoftirqd/28 bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 > 9.48% xdpsock [i40e] [k] i40e_clean_rx_irq_zc > 8.62% xdpsock xdpsock [.] main > 7.11% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv > 5.81% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect > 4.46% xdpsock bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 > 3.83% xdpsock [kernel.vmlinux] [k] xsk_rcv why everything is duplicated? Same code runs in different tasks ? > 2.81% ksoftirqd/28 [kernel.vmlinux] [k] bpf_xdp_redirect_map > 2.78% ksoftirqd/28 [kernel.vmlinux] [k] xsk_map_lookup_elem > 2.44% xdpsock [kernel.vmlinux] [k] xdp_do_redirect > 2.19% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_redirect > 1.62% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr > 1.57% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr > 1.32% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > 1.28% xdpsock [kernel.vmlinux] [k] bpf_xdp_redirect_map > 1.15% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device > 1.12% xdpsock [kernel.vmlinux] [k] xsk_map_lookup_elem > 1.06% xdpsock [kernel.vmlinux] [k] __xsk_map_redirect > 0.94% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device > 0.75% ksoftirqd/28 [kernel.vmlinux] [k] __x86_indirect_thunk_rax > 0.66% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status > 0.64% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action > 0.64% swapper [kernel.vmlinux] [k] intel_idle > 0.62% ksoftirqd/28 [i40e] [k] i40e_napi_poll > 0.57% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > > Perf report for "AF_XDP direct rxdrop" with patched kernel - 
mitigations ON > ========================================================================== > Samples: 46K of event 'cycles', Event count (approx.): 38387018585 > Overhead Command Shared Object Symbol > 21.94% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc > 14.36% xdpsock xdpsock [.] main > 11.53% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv > 11.32% xdpsock [i40e] [k] i40e_clean_rx_irq_zc > 4.02% xdpsock [kernel.vmlinux] [k] xsk_rcv > 2.91% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect > 2.45% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr > 2.19% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr > 2.08% ksoftirqd/28 [kernel.vmlinux] [k] bpf_direct_xsk > 2.07% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > 1.53% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device > 1.39% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device > 1.22% ksoftirqd/28 [kernel.vmlinux] [k] xdp_get_xsk_from_qid > 1.12% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status > 0.96% ksoftirqd/28 [i40e] [k] i40e_napi_poll > 0.95% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action > 0.89% xdpsock [kernel.vmlinux] [k] xdp_do_redirect > 0.83% swapper [i40e] [k] i40e_clean_rx_irq_zc > 0.70% swapper [kernel.vmlinux] [k] intel_idle > 0.66% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > 0.60% xdpsock [kernel.vmlinux] [k] bpf_direct_xsk > 0.50% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_discard_addr > > Based on the perf reports comparing AF_XDP default and direct rxdrop, we can say that > AF_XDP direct rxdrop codepath is avoiding the overhead of going through these functions > bpf_prog_xxx > bpf_xdp_redirect_map > xsk_map_lookup_elem > __xsk_map_redirect > With AF_XDP direct, xsk_rcv() is directly called via bpf_direct_xsk() in xdp_do_redirect() I don't think you're identifying the overhead correctly. 
xsk_map_lookup_elem is 1% but bpf_xdp_redirect_map() is supposed to call __xsk_map_lookup_elem() which is a different function: ffffffff81493fe0 T __xsk_map_lookup_elem ffffffff81492e80 t xsk_map_lookup_elem 10% for bpf_prog_80b55d8a76303785 is huge. It's the actual code of the program _without_ any helpers. How does the program actually look?
On 10/18/2019 5:14 PM, Alexei Starovoitov wrote: > On Fri, Oct 18, 2019 at 11:40:07AM -0700, Samudrala, Sridhar wrote: >> >> Perf report for "AF_XDP default rxdrop" with patched kernel - mitigations ON >> ========================================================================== >> Samples: 44K of event 'cycles', Event count (approx.): 38532389541 >> Overhead Command Shared Object Symbol >> 15.31% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc >> 10.50% ksoftirqd/28 bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 >> 9.48% xdpsock [i40e] [k] i40e_clean_rx_irq_zc >> 8.62% xdpsock xdpsock [.] main >> 7.11% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv >> 5.81% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect >> 4.46% xdpsock bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 >> 3.83% xdpsock [kernel.vmlinux] [k] xsk_rcv > > why everything is duplicated? > Same code runs in different tasks ? Yes. looks like these functions run from both the app(xdpsock) context and ksoftirqd context. 
> >> 2.81% ksoftirqd/28 [kernel.vmlinux] [k] bpf_xdp_redirect_map >> 2.78% ksoftirqd/28 [kernel.vmlinux] [k] xsk_map_lookup_elem >> 2.44% xdpsock [kernel.vmlinux] [k] xdp_do_redirect >> 2.19% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_redirect >> 1.62% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr >> 1.57% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr >> 1.32% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >> 1.28% xdpsock [kernel.vmlinux] [k] bpf_xdp_redirect_map >> 1.15% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device >> 1.12% xdpsock [kernel.vmlinux] [k] xsk_map_lookup_elem >> 1.06% xdpsock [kernel.vmlinux] [k] __xsk_map_redirect >> 0.94% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device >> 0.75% ksoftirqd/28 [kernel.vmlinux] [k] __x86_indirect_thunk_rax >> 0.66% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status >> 0.64% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action >> 0.64% swapper [kernel.vmlinux] [k] intel_idle >> 0.62% ksoftirqd/28 [i40e] [k] i40e_napi_poll >> 0.57% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >> >> Perf report for "AF_XDP direct rxdrop" with patched kernel - mitigations ON >> ========================================================================== >> Samples: 46K of event 'cycles', Event count (approx.): 38387018585 >> Overhead Command Shared Object Symbol >> 21.94% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc >> 14.36% xdpsock xdpsock [.] 
main >> 11.53% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv >> 11.32% xdpsock [i40e] [k] i40e_clean_rx_irq_zc >> 4.02% xdpsock [kernel.vmlinux] [k] xsk_rcv >> 2.91% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect >> 2.45% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr >> 2.19% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr >> 2.08% ksoftirqd/28 [kernel.vmlinux] [k] bpf_direct_xsk >> 2.07% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >> 1.53% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device >> 1.39% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device >> 1.22% ksoftirqd/28 [kernel.vmlinux] [k] xdp_get_xsk_from_qid >> 1.12% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status >> 0.96% ksoftirqd/28 [i40e] [k] i40e_napi_poll >> 0.95% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action >> 0.89% xdpsock [kernel.vmlinux] [k] xdp_do_redirect >> 0.83% swapper [i40e] [k] i40e_clean_rx_irq_zc >> 0.70% swapper [kernel.vmlinux] [k] intel_idle >> 0.66% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >> 0.60% xdpsock [kernel.vmlinux] [k] bpf_direct_xsk >> 0.50% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_discard_addr >> >> Based on the perf reports comparing AF_XDP default and direct rxdrop, we can say that >> AF_XDP direct rxdrop codepath is avoiding the overhead of going through these functions >> bpf_prog_xxx >> bpf_xdp_redirect_map >> xsk_map_lookup_elem >> __xsk_map_redirect >> With AF_XDP direct, xsk_rcv() is directly called via bpf_direct_xsk() in xdp_do_redirect() > > I don't think you're identifying the overhead correctly. > xsk_map_lookup_elem is 1% > but bpf_xdp_redirect_map() suppose to call __xsk_map_lookup_elem() > which is a different function: > ffffffff81493fe0 T __xsk_map_lookup_elem > ffffffff81492e80 t xsk_map_lookup_elem > > 10% for bpf_prog_80b55d8a76303785 is huge. > It's the actual code of the program _without_ any helpers. > How does the program actually look? 
It is the xdp program that is loaded via xsk_load_xdp_prog() in tools/lib/bpf/xsk.c https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/xsk.c#n268 In the archives, i see that Toke had some comments, but somehow i didn't get his email in my inbox. > Performance Results > =================== > Only 1 core is used in all these testcases as the app and the queue irq are pinned to the same core. > > ---------------------------------------------------------------------------------- > mitigations ON mitigations OFF > Testcase ---------------------------------------------------------- > no patches with patches no patches with patches > ---------------------------------------------------------------------------------- > AF_XDP default rxdrop X X Y Y > Is this really exactly the same with and without patches? You're adding > an extra check to xdp_do_redirect(); are you really saying that the > impact of that is zero? Yes. I didn't see any impact. The variation is within +/- < 1% I could use static_key even for that check in xdp_do_redirect() if required. -Sridhar
On Fri, Oct 18, 2019 at 05:45:26PM -0700, Samudrala, Sridhar wrote: > On 10/18/2019 5:14 PM, Alexei Starovoitov wrote: > > On Fri, Oct 18, 2019 at 11:40:07AM -0700, Samudrala, Sridhar wrote: > > > > > > Perf report for "AF_XDP default rxdrop" with patched kernel - mitigations ON > > > ========================================================================== > > > Samples: 44K of event 'cycles', Event count (approx.): 38532389541 > > > Overhead Command Shared Object Symbol > > > 15.31% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc > > > 10.50% ksoftirqd/28 bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 > > > 9.48% xdpsock [i40e] [k] i40e_clean_rx_irq_zc > > > 8.62% xdpsock xdpsock [.] main > > > 7.11% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv > > > 5.81% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect > > > 4.46% xdpsock bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 > > > 3.83% xdpsock [kernel.vmlinux] [k] xsk_rcv > > > > why everything is duplicated? > > Same code runs in different tasks ? > > Yes. looks like these functions run from both the app(xdpsock) context and ksoftirqd context. 
> > > > > > 2.81% ksoftirqd/28 [kernel.vmlinux] [k] bpf_xdp_redirect_map > > > 2.78% ksoftirqd/28 [kernel.vmlinux] [k] xsk_map_lookup_elem > > > 2.44% xdpsock [kernel.vmlinux] [k] xdp_do_redirect > > > 2.19% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_redirect > > > 1.62% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr > > > 1.57% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr > > > 1.32% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > > > 1.28% xdpsock [kernel.vmlinux] [k] bpf_xdp_redirect_map > > > 1.15% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device > > > 1.12% xdpsock [kernel.vmlinux] [k] xsk_map_lookup_elem > > > 1.06% xdpsock [kernel.vmlinux] [k] __xsk_map_redirect > > > 0.94% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device > > > 0.75% ksoftirqd/28 [kernel.vmlinux] [k] __x86_indirect_thunk_rax > > > 0.66% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status > > > 0.64% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action > > > 0.64% swapper [kernel.vmlinux] [k] intel_idle > > > 0.62% ksoftirqd/28 [i40e] [k] i40e_napi_poll > > > 0.57% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > > > > > > Perf report for "AF_XDP direct rxdrop" with patched kernel - mitigations ON > > > ========================================================================== > > > Samples: 46K of event 'cycles', Event count (approx.): 38387018585 > > > Overhead Command Shared Object Symbol > > > 21.94% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc > > > 14.36% xdpsock xdpsock [.] 
main > > > 11.53% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv > > > 11.32% xdpsock [i40e] [k] i40e_clean_rx_irq_zc > > > 4.02% xdpsock [kernel.vmlinux] [k] xsk_rcv > > > 2.91% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect > > > 2.45% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr > > > 2.19% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr > > > 2.08% ksoftirqd/28 [kernel.vmlinux] [k] bpf_direct_xsk > > > 2.07% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > > > 1.53% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device > > > 1.39% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device > > > 1.22% ksoftirqd/28 [kernel.vmlinux] [k] xdp_get_xsk_from_qid > > > 1.12% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status > > > 0.96% ksoftirqd/28 [i40e] [k] i40e_napi_poll > > > 0.95% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action > > > 0.89% xdpsock [kernel.vmlinux] [k] xdp_do_redirect > > > 0.83% swapper [i40e] [k] i40e_clean_rx_irq_zc > > > 0.70% swapper [kernel.vmlinux] [k] intel_idle > > > 0.66% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > > > 0.60% xdpsock [kernel.vmlinux] [k] bpf_direct_xsk > > > 0.50% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_discard_addr > > > > > > Based on the perf reports comparing AF_XDP default and direct rxdrop, we can say that > > > AF_XDP direct rxdrop codepath is avoiding the overhead of going through these functions > > > bpf_prog_xxx > > > bpf_xdp_redirect_map > > > xsk_map_lookup_elem > > > __xsk_map_redirect > > > With AF_XDP direct, xsk_rcv() is directly called via bpf_direct_xsk() in xdp_do_redirect() > > > > I don't think you're identifying the overhead correctly. > > xsk_map_lookup_elem is 1% > > but bpf_xdp_redirect_map() suppose to call __xsk_map_lookup_elem() > > which is a different function: > > ffffffff81493fe0 T __xsk_map_lookup_elem > > ffffffff81492e80 t xsk_map_lookup_elem > > > > 10% for bpf_prog_80b55d8a76303785 is huge. 
> > It's the actual code of the program _without_ any helpers. > > How does the program actually look? > > It is the xdp program that is loaded via xsk_load_xdp_prog() in tools/lib/bpf/xsk.c > https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/xsk.c#n268 I see. Looks like map_gen_lookup was never implemented for xskmap. How about adding it first the way array_map_gen_lookup() is implemented? This will easily give 2x perf gain.
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes: > On Fri, Oct 18, 2019 at 05:45:26PM -0700, Samudrala, Sridhar wrote: >> On 10/18/2019 5:14 PM, Alexei Starovoitov wrote: >> > On Fri, Oct 18, 2019 at 11:40:07AM -0700, Samudrala, Sridhar wrote: >> > > >> > > Perf report for "AF_XDP default rxdrop" with patched kernel - mitigations ON >> > > ========================================================================== >> > > Samples: 44K of event 'cycles', Event count (approx.): 38532389541 >> > > Overhead Command Shared Object Symbol >> > > 15.31% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc >> > > 10.50% ksoftirqd/28 bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 >> > > 9.48% xdpsock [i40e] [k] i40e_clean_rx_irq_zc >> > > 8.62% xdpsock xdpsock [.] main >> > > 7.11% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv >> > > 5.81% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect >> > > 4.46% xdpsock bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 >> > > 3.83% xdpsock [kernel.vmlinux] [k] xsk_rcv >> > >> > why everything is duplicated? >> > Same code runs in different tasks ? >> >> Yes. looks like these functions run from both the app(xdpsock) context and ksoftirqd context. 
>> >> > >> > > 2.81% ksoftirqd/28 [kernel.vmlinux] [k] bpf_xdp_redirect_map >> > > 2.78% ksoftirqd/28 [kernel.vmlinux] [k] xsk_map_lookup_elem >> > > 2.44% xdpsock [kernel.vmlinux] [k] xdp_do_redirect >> > > 2.19% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_redirect >> > > 1.62% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr >> > > 1.57% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr >> > > 1.32% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >> > > 1.28% xdpsock [kernel.vmlinux] [k] bpf_xdp_redirect_map >> > > 1.15% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device >> > > 1.12% xdpsock [kernel.vmlinux] [k] xsk_map_lookup_elem >> > > 1.06% xdpsock [kernel.vmlinux] [k] __xsk_map_redirect >> > > 0.94% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device >> > > 0.75% ksoftirqd/28 [kernel.vmlinux] [k] __x86_indirect_thunk_rax >> > > 0.66% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status >> > > 0.64% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action >> > > 0.64% swapper [kernel.vmlinux] [k] intel_idle >> > > 0.62% ksoftirqd/28 [i40e] [k] i40e_napi_poll >> > > 0.57% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >> > > >> > > Perf report for "AF_XDP direct rxdrop" with patched kernel - mitigations ON >> > > ========================================================================== >> > > Samples: 46K of event 'cycles', Event count (approx.): 38387018585 >> > > Overhead Command Shared Object Symbol >> > > 21.94% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc >> > > 14.36% xdpsock xdpsock [.] 
main >> > > 11.53% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv >> > > 11.32% xdpsock [i40e] [k] i40e_clean_rx_irq_zc >> > > 4.02% xdpsock [kernel.vmlinux] [k] xsk_rcv >> > > 2.91% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect >> > > 2.45% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr >> > > 2.19% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr >> > > 2.08% ksoftirqd/28 [kernel.vmlinux] [k] bpf_direct_xsk >> > > 2.07% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >> > > 1.53% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device >> > > 1.39% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device >> > > 1.22% ksoftirqd/28 [kernel.vmlinux] [k] xdp_get_xsk_from_qid >> > > 1.12% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status >> > > 0.96% ksoftirqd/28 [i40e] [k] i40e_napi_poll >> > > 0.95% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action >> > > 0.89% xdpsock [kernel.vmlinux] [k] xdp_do_redirect >> > > 0.83% swapper [i40e] [k] i40e_clean_rx_irq_zc >> > > 0.70% swapper [kernel.vmlinux] [k] intel_idle >> > > 0.66% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >> > > 0.60% xdpsock [kernel.vmlinux] [k] bpf_direct_xsk >> > > 0.50% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_discard_addr >> > > >> > > Based on the perf reports comparing AF_XDP default and direct rxdrop, we can say that >> > > AF_XDP direct rxdrop codepath is avoiding the overhead of going through these functions >> > > bpf_prog_xxx >> > > bpf_xdp_redirect_map >> > > xsk_map_lookup_elem >> > > __xsk_map_redirect >> > > With AF_XDP direct, xsk_rcv() is directly called via bpf_direct_xsk() in xdp_do_redirect() >> > >> > I don't think you're identifying the overhead correctly. 
>> > xsk_map_lookup_elem is 1% >> > but bpf_xdp_redirect_map() suppose to call __xsk_map_lookup_elem() >> > which is a different function: >> > ffffffff81493fe0 T __xsk_map_lookup_elem >> > ffffffff81492e80 t xsk_map_lookup_elem >> > >> > 10% for bpf_prog_80b55d8a76303785 is huge. >> > It's the actual code of the program _without_ any helpers. >> > How does the program actually look? >> >> It is the xdp program that is loaded via xsk_load_xdp_prog() in tools/lib/bpf/xsk.c >> https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/xsk.c#n268 > > I see. Looks like map_gen_lookup was never implemented for xskmap. > How about adding it first the way array_map_gen_lookup() is implemented? > This will easily give 2x perf gain. I guess we should implement this for devmaps as well now that we allow lookups into those. However, in this particular example, the lookup from BPF is not actually needed, since bpf_redirect_map() will return a configurable error value when the map lookup fails (for exactly this use case). So replacing: if (bpf_map_lookup_elem(&xsks_map, &index)) return bpf_redirect_map(&xsks_map, index, 0); with simply return bpf_redirect_map(&xsks_map, index, XDP_PASS); would save the call to xsk_map_lookup_elem(). -Toke
On Sun, 20 Oct 2019 at 12:15, Toke Høiland-Jørgensen <toke@redhat.com> wrote: > > Alexei Starovoitov <alexei.starovoitov@gmail.com> writes: > > > On Fri, Oct 18, 2019 at 05:45:26PM -0700, Samudrala, Sridhar wrote: > >> On 10/18/2019 5:14 PM, Alexei Starovoitov wrote: > >> > On Fri, Oct 18, 2019 at 11:40:07AM -0700, Samudrala, Sridhar wrote: > >> > > > >> > > Perf report for "AF_XDP default rxdrop" with patched kernel - mitigations ON > >> > > ========================================================================== > >> > > Samples: 44K of event 'cycles', Event count (approx.): 38532389541 > >> > > Overhead Command Shared Object Symbol > >> > > 15.31% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc > >> > > 10.50% ksoftirqd/28 bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 > >> > > 9.48% xdpsock [i40e] [k] i40e_clean_rx_irq_zc > >> > > 8.62% xdpsock xdpsock [.] main > >> > > 7.11% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv > >> > > 5.81% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect > >> > > 4.46% xdpsock bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 > >> > > 3.83% xdpsock [kernel.vmlinux] [k] xsk_rcv > >> > > >> > why everything is duplicated? > >> > Same code runs in different tasks ? > >> > >> Yes. looks like these functions run from both the app(xdpsock) context and ksoftirqd context. 
> >> > >> > > >> > > 2.81% ksoftirqd/28 [kernel.vmlinux] [k] bpf_xdp_redirect_map > >> > > 2.78% ksoftirqd/28 [kernel.vmlinux] [k] xsk_map_lookup_elem > >> > > 2.44% xdpsock [kernel.vmlinux] [k] xdp_do_redirect > >> > > 2.19% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_redirect > >> > > 1.62% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr > >> > > 1.57% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr > >> > > 1.32% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > >> > > 1.28% xdpsock [kernel.vmlinux] [k] bpf_xdp_redirect_map > >> > > 1.15% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device > >> > > 1.12% xdpsock [kernel.vmlinux] [k] xsk_map_lookup_elem > >> > > 1.06% xdpsock [kernel.vmlinux] [k] __xsk_map_redirect > >> > > 0.94% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device > >> > > 0.75% ksoftirqd/28 [kernel.vmlinux] [k] __x86_indirect_thunk_rax > >> > > 0.66% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status > >> > > 0.64% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action > >> > > 0.64% swapper [kernel.vmlinux] [k] intel_idle > >> > > 0.62% ksoftirqd/28 [i40e] [k] i40e_napi_poll > >> > > 0.57% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > >> > > > >> > > Perf report for "AF_XDP direct rxdrop" with patched kernel - mitigations ON > >> > > ========================================================================== > >> > > Samples: 46K of event 'cycles', Event count (approx.): 38387018585 > >> > > Overhead Command Shared Object Symbol > >> > > 21.94% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc > >> > > 14.36% xdpsock xdpsock [.] 
main > >> > > 11.53% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv > >> > > 11.32% xdpsock [i40e] [k] i40e_clean_rx_irq_zc > >> > > 4.02% xdpsock [kernel.vmlinux] [k] xsk_rcv > >> > > 2.91% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect > >> > > 2.45% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr > >> > > 2.19% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr > >> > > 2.08% ksoftirqd/28 [kernel.vmlinux] [k] bpf_direct_xsk > >> > > 2.07% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > >> > > 1.53% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device > >> > > 1.39% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device > >> > > 1.22% ksoftirqd/28 [kernel.vmlinux] [k] xdp_get_xsk_from_qid > >> > > 1.12% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status > >> > > 0.96% ksoftirqd/28 [i40e] [k] i40e_napi_poll > >> > > 0.95% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action > >> > > 0.89% xdpsock [kernel.vmlinux] [k] xdp_do_redirect > >> > > 0.83% swapper [i40e] [k] i40e_clean_rx_irq_zc > >> > > 0.70% swapper [kernel.vmlinux] [k] intel_idle > >> > > 0.66% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > >> > > 0.60% xdpsock [kernel.vmlinux] [k] bpf_direct_xsk > >> > > 0.50% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_discard_addr > >> > > > >> > > Based on the perf reports comparing AF_XDP default and direct rxdrop, we can say that > >> > > AF_XDP direct rxdrop codepath is avoiding the overhead of going through these functions > >> > > bpf_prog_xxx > >> > > bpf_xdp_redirect_map > >> > > xsk_map_lookup_elem > >> > > __xsk_map_redirect > >> > > With AF_XDP direct, xsk_rcv() is directly called via bpf_direct_xsk() in xdp_do_redirect() > >> > > >> > I don't think you're identifying the overhead correctly. 
> >> > xsk_map_lookup_elem is 1% > >> > but bpf_xdp_redirect_map() suppose to call __xsk_map_lookup_elem() > >> > which is a different function: > >> > ffffffff81493fe0 T __xsk_map_lookup_elem > >> > ffffffff81492e80 t xsk_map_lookup_elem > >> > > >> > 10% for bpf_prog_80b55d8a76303785 is huge. > >> > It's the actual code of the program _without_ any helpers. > >> > How does the program actually look? > >> > >> It is the xdp program that is loaded via xsk_load_xdp_prog() in tools/lib/bpf/xsk.c > >> https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/xsk.c#n268 > > > > I see. Looks like map_gen_lookup was never implemented for xskmap. > > How about adding it first the way array_map_gen_lookup() is implemented? > > This will easily give 2x perf gain. > > I guess we should implement this for devmaps as well now that we allow > lookups into those. > > However, in this particular example, the lookup from BPF is not actually > needed, since bpf_redirect_map() will return a configurable error value > when the map lookup fails (for exactly this use case). > > So replacing: > > if (bpf_map_lookup_elem(&xsks_map, &index)) > return bpf_redirect_map(&xsks_map, index, 0); > > with simply > > return bpf_redirect_map(&xsks_map, index, XDP_PASS); > > would save the call to xsk_map_lookup_elem(). > Thanks for the reminder! I just submitted a patch. Still, doing the map_gen_lookup() for xsk/devmaps still makes sense! Björn > -Toke > > _______________________________________________ > Intel-wired-lan mailing list > Intel-wired-lan@osuosl.org > https://lists.osuosl.org/mailman/listinfo/intel-wired-lan
On 10/20/2019 10:12 AM, Björn Töpel wrote: > On Sun, 20 Oct 2019 at 12:15, Toke Høiland-Jørgensen <toke@redhat.com> wrote: >> >> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes: >> >>> On Fri, Oct 18, 2019 at 05:45:26PM -0700, Samudrala, Sridhar wrote: >>>> On 10/18/2019 5:14 PM, Alexei Starovoitov wrote: >>>>> On Fri, Oct 18, 2019 at 11:40:07AM -0700, Samudrala, Sridhar wrote: >>>>>> >>>>>> Perf report for "AF_XDP default rxdrop" with patched kernel - mitigations ON >>>>>> ========================================================================== >>>>>> Samples: 44K of event 'cycles', Event count (approx.): 38532389541 >>>>>> Overhead Command Shared Object Symbol >>>>>> 15.31% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc >>>>>> 10.50% ksoftirqd/28 bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 >>>>>> 9.48% xdpsock [i40e] [k] i40e_clean_rx_irq_zc >>>>>> 8.62% xdpsock xdpsock [.] main >>>>>> 7.11% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv >>>>>> 5.81% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect >>>>>> 4.46% xdpsock bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 >>>>>> 3.83% xdpsock [kernel.vmlinux] [k] xsk_rcv >>>>> >>>>> why everything is duplicated? >>>>> Same code runs in different tasks ? >>>> >>>> Yes. looks like these functions run from both the app(xdpsock) context and ksoftirqd context. 
>>>> >>>>> >>>>>> 2.81% ksoftirqd/28 [kernel.vmlinux] [k] bpf_xdp_redirect_map >>>>>> 2.78% ksoftirqd/28 [kernel.vmlinux] [k] xsk_map_lookup_elem >>>>>> 2.44% xdpsock [kernel.vmlinux] [k] xdp_do_redirect >>>>>> 2.19% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_redirect >>>>>> 1.62% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr >>>>>> 1.57% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr >>>>>> 1.32% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >>>>>> 1.28% xdpsock [kernel.vmlinux] [k] bpf_xdp_redirect_map >>>>>> 1.15% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device >>>>>> 1.12% xdpsock [kernel.vmlinux] [k] xsk_map_lookup_elem >>>>>> 1.06% xdpsock [kernel.vmlinux] [k] __xsk_map_redirect >>>>>> 0.94% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device >>>>>> 0.75% ksoftirqd/28 [kernel.vmlinux] [k] __x86_indirect_thunk_rax >>>>>> 0.66% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status >>>>>> 0.64% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action >>>>>> 0.64% swapper [kernel.vmlinux] [k] intel_idle >>>>>> 0.62% ksoftirqd/28 [i40e] [k] i40e_napi_poll >>>>>> 0.57% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >>>>>> >>>>>> Perf report for "AF_XDP direct rxdrop" with patched kernel - mitigations ON >>>>>> ========================================================================== >>>>>> Samples: 46K of event 'cycles', Event count (approx.): 38387018585 >>>>>> Overhead Command Shared Object Symbol >>>>>> 21.94% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc >>>>>> 14.36% xdpsock xdpsock [.] 
main >>>>>> 11.53% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv >>>>>> 11.32% xdpsock [i40e] [k] i40e_clean_rx_irq_zc >>>>>> 4.02% xdpsock [kernel.vmlinux] [k] xsk_rcv >>>>>> 2.91% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect >>>>>> 2.45% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr >>>>>> 2.19% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr >>>>>> 2.08% ksoftirqd/28 [kernel.vmlinux] [k] bpf_direct_xsk >>>>>> 2.07% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >>>>>> 1.53% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device >>>>>> 1.39% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device >>>>>> 1.22% ksoftirqd/28 [kernel.vmlinux] [k] xdp_get_xsk_from_qid >>>>>> 1.12% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status >>>>>> 0.96% ksoftirqd/28 [i40e] [k] i40e_napi_poll >>>>>> 0.95% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action >>>>>> 0.89% xdpsock [kernel.vmlinux] [k] xdp_do_redirect >>>>>> 0.83% swapper [i40e] [k] i40e_clean_rx_irq_zc >>>>>> 0.70% swapper [kernel.vmlinux] [k] intel_idle >>>>>> 0.66% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >>>>>> 0.60% xdpsock [kernel.vmlinux] [k] bpf_direct_xsk >>>>>> 0.50% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_discard_addr >>>>>> >>>>>> Based on the perf reports comparing AF_XDP default and direct rxdrop, we can say that >>>>>> AF_XDP direct rxdrop codepath is avoiding the overhead of going through these functions >>>>>> bpf_prog_xxx >>>>>> bpf_xdp_redirect_map >>>>>> xsk_map_lookup_elem >>>>>> __xsk_map_redirect >>>>>> With AF_XDP direct, xsk_rcv() is directly called via bpf_direct_xsk() in xdp_do_redirect() >>>>> >>>>> I don't think you're identifying the overhead correctly. 
>>>>> xsk_map_lookup_elem is 1% >>>>> but bpf_xdp_redirect_map() suppose to call __xsk_map_lookup_elem() >>>>> which is a different function: >>>>> ffffffff81493fe0 T __xsk_map_lookup_elem >>>>> ffffffff81492e80 t xsk_map_lookup_elem >>>>> >>>>> 10% for bpf_prog_80b55d8a76303785 is huge. >>>>> It's the actual code of the program _without_ any helpers. >>>>> How does the program actually look? >>>> >>>> It is the xdp program that is loaded via xsk_load_xdp_prog() in tools/lib/bpf/xsk.c >>>> https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/xsk.c#n268 >>> >>> I see. Looks like map_gen_lookup was never implemented for xskmap. >>> How about adding it first the way array_map_gen_lookup() is implemented? >>> This will easily give 2x perf gain. >> >> I guess we should implement this for devmaps as well now that we allow >> lookups into those. >> >> However, in this particular example, the lookup from BPF is not actually >> needed, since bpf_redirect_map() will return a configurable error value >> when the map lookup fails (for exactly this use case). >> >> So replacing: >> >> if (bpf_map_lookup_elem(&xsks_map, &index)) >> return bpf_redirect_map(&xsks_map, index, 0); >> >> with simply >> >> return bpf_redirect_map(&xsks_map, index, XDP_PASS); >> >> would save the call to xsk_map_lookup_elem(). >> > > Thanks for the reminder! I just submitted a patch. Still, doing the > map_gen_lookup() for xsk/devmaps still makes sense! > I tried Bjorn's patch that avoids the lookups in the BPF prog. https://lore.kernel.org/netdev/20191021105938.11820-1-bjorn.topel@gmail.com/ With this patch I am also seeing around 3-4% increase in xdpsock rxdrop performance and the perf report looks like this. 
Samples: 44K of event 'cycles', Event count (approx.): 38749965204 Overhead Command Shared Object Symbol 16.06% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc 10.18% ksoftirqd/28 bpf_prog_3c8251c7e0fef8db [k] bpf_prog_3c8251c7e0fef8db 10.15% xdpsock [i40e] [k] i40e_clean_rx_irq_zc 10.06% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv 7.45% xdpsock xdpsock [.] main 5.76% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect 4.51% xdpsock bpf_prog_3c8251c7e0fef8db [k] bpf_prog_3c8251c7e0fef8db 3.67% xdpsock [kernel.vmlinux] [k] xsk_rcv 3.06% ksoftirqd/28 [kernel.vmlinux] [k] bpf_xdp_redirect_map 2.34% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_redirect 2.33% xdpsock [kernel.vmlinux] [k] xdp_do_redirect 1.69% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr 1.69% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr 1.42% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu 1.19% xdpsock [kernel.vmlinux] [k] bpf_xdp_redirect_map 1.13% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device 0.95% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device 0.92% swapper [kernel.vmlinux] [k] intel_idle 0.92% xdpsock [kernel.vmlinux] [k] __xsk_map_redirect 0.80% ksoftirqd/28 [kernel.vmlinux] [k] __x86_indirect_thunk_rax 0.73% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status 0.71% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_lookup_elem 0.63% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action 0.62% ksoftirqd/28 [i40e] [k] i40e_napi_poll 0.58% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu So with this patch applied, direct receive performance improvement comes down from 46% to 42%. I think it is still substantial enough to provide an option to allow direct receive for certain use cases. If it is OK, I can re-spin and submit the patches on top of the latest bpf-next. Thanks Sridhar
On Mon, Oct 21, 2019 at 1:10 PM Samudrala, Sridhar <sridhar.samudrala@intel.com> wrote: > > On 10/20/2019 10:12 AM, Björn Töpel wrote: > > On Sun, 20 Oct 2019 at 12:15, Toke Høiland-Jørgensen <toke@redhat.com> wrote: > >> > >> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes: > >> > >>> On Fri, Oct 18, 2019 at 05:45:26PM -0700, Samudrala, Sridhar wrote: > >>>> On 10/18/2019 5:14 PM, Alexei Starovoitov wrote: > >>>>> On Fri, Oct 18, 2019 at 11:40:07AM -0700, Samudrala, Sridhar wrote: > >>>>>> > >>>>>> Perf report for "AF_XDP default rxdrop" with patched kernel - mitigations ON > >>>>>> ========================================================================== > >>>>>> Samples: 44K of event 'cycles', Event count (approx.): 38532389541 > >>>>>> Overhead Command Shared Object Symbol > >>>>>> 15.31% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc > >>>>>> 10.50% ksoftirqd/28 bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 > >>>>>> 9.48% xdpsock [i40e] [k] i40e_clean_rx_irq_zc > >>>>>> 8.62% xdpsock xdpsock [.] main > >>>>>> 7.11% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv > >>>>>> 5.81% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect > >>>>>> 4.46% xdpsock bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 > >>>>>> 3.83% xdpsock [kernel.vmlinux] [k] xsk_rcv > >>>>> > >>>>> why everything is duplicated? > >>>>> Same code runs in different tasks ? > >>>> > >>>> Yes. looks like these functions run from both the app(xdpsock) context and ksoftirqd context. 
> >>>> > >>>>> > >>>>>> 2.81% ksoftirqd/28 [kernel.vmlinux] [k] bpf_xdp_redirect_map > >>>>>> 2.78% ksoftirqd/28 [kernel.vmlinux] [k] xsk_map_lookup_elem > >>>>>> 2.44% xdpsock [kernel.vmlinux] [k] xdp_do_redirect > >>>>>> 2.19% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_redirect > >>>>>> 1.62% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr > >>>>>> 1.57% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr > >>>>>> 1.32% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > >>>>>> 1.28% xdpsock [kernel.vmlinux] [k] bpf_xdp_redirect_map > >>>>>> 1.15% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device > >>>>>> 1.12% xdpsock [kernel.vmlinux] [k] xsk_map_lookup_elem > >>>>>> 1.06% xdpsock [kernel.vmlinux] [k] __xsk_map_redirect > >>>>>> 0.94% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device > >>>>>> 0.75% ksoftirqd/28 [kernel.vmlinux] [k] __x86_indirect_thunk_rax > >>>>>> 0.66% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status > >>>>>> 0.64% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action > >>>>>> 0.64% swapper [kernel.vmlinux] [k] intel_idle > >>>>>> 0.62% ksoftirqd/28 [i40e] [k] i40e_napi_poll > >>>>>> 0.57% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > >>>>>> > >>>>>> Perf report for "AF_XDP direct rxdrop" with patched kernel - mitigations ON > >>>>>> ========================================================================== > >>>>>> Samples: 46K of event 'cycles', Event count (approx.): 38387018585 > >>>>>> Overhead Command Shared Object Symbol > >>>>>> 21.94% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc > >>>>>> 14.36% xdpsock xdpsock [.] 
main > >>>>>> 11.53% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv > >>>>>> 11.32% xdpsock [i40e] [k] i40e_clean_rx_irq_zc > >>>>>> 4.02% xdpsock [kernel.vmlinux] [k] xsk_rcv > >>>>>> 2.91% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect > >>>>>> 2.45% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr > >>>>>> 2.19% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr > >>>>>> 2.08% ksoftirqd/28 [kernel.vmlinux] [k] bpf_direct_xsk > >>>>>> 2.07% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > >>>>>> 1.53% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device > >>>>>> 1.39% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device > >>>>>> 1.22% ksoftirqd/28 [kernel.vmlinux] [k] xdp_get_xsk_from_qid > >>>>>> 1.12% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status > >>>>>> 0.96% ksoftirqd/28 [i40e] [k] i40e_napi_poll > >>>>>> 0.95% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action > >>>>>> 0.89% xdpsock [kernel.vmlinux] [k] xdp_do_redirect > >>>>>> 0.83% swapper [i40e] [k] i40e_clean_rx_irq_zc > >>>>>> 0.70% swapper [kernel.vmlinux] [k] intel_idle > >>>>>> 0.66% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > >>>>>> 0.60% xdpsock [kernel.vmlinux] [k] bpf_direct_xsk > >>>>>> 0.50% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_discard_addr > >>>>>> > >>>>>> Based on the perf reports comparing AF_XDP default and direct rxdrop, we can say that > >>>>>> AF_XDP direct rxdrop codepath is avoiding the overhead of going through these functions > >>>>>> bpf_prog_xxx > >>>>>> bpf_xdp_redirect_map > >>>>>> xsk_map_lookup_elem > >>>>>> __xsk_map_redirect > >>>>>> With AF_XDP direct, xsk_rcv() is directly called via bpf_direct_xsk() in xdp_do_redirect() > >>>>> > >>>>> I don't think you're identifying the overhead correctly. 
> >>>>> xsk_map_lookup_elem is 1% > >>>>> but bpf_xdp_redirect_map() suppose to call __xsk_map_lookup_elem() > >>>>> which is a different function: > >>>>> ffffffff81493fe0 T __xsk_map_lookup_elem > >>>>> ffffffff81492e80 t xsk_map_lookup_elem > >>>>> > >>>>> 10% for bpf_prog_80b55d8a76303785 is huge. > >>>>> It's the actual code of the program _without_ any helpers. > >>>>> How does the program actually look? > >>>> > >>>> It is the xdp program that is loaded via xsk_load_xdp_prog() in tools/lib/bpf/xsk.c > >>>> https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/xsk.c#n268 > >>> > >>> I see. Looks like map_gen_lookup was never implemented for xskmap. > >>> How about adding it first the way array_map_gen_lookup() is implemented? > >>> This will easily give 2x perf gain. > >> > >> I guess we should implement this for devmaps as well now that we allow > >> lookups into those. > >> > >> However, in this particular example, the lookup from BPF is not actually > >> needed, since bpf_redirect_map() will return a configurable error value > >> when the map lookup fails (for exactly this use case). > >> > >> So replacing: > >> > >> if (bpf_map_lookup_elem(&xsks_map, &index)) > >> return bpf_redirect_map(&xsks_map, index, 0); > >> > >> with simply > >> > >> return bpf_redirect_map(&xsks_map, index, XDP_PASS); > >> > >> would save the call to xsk_map_lookup_elem(). > >> > > > > Thanks for the reminder! I just submitted a patch. Still, doing the > > map_gen_lookup() for xsk/devmaps still makes sense! > > > > I tried Bjorn's patch that avoids the lookups in the BPF prog. > https://lore.kernel.org/netdev/20191021105938.11820-1-bjorn.topel@gmail.com/ > > With this patch I am also seeing around 3-4% increase in xdpsock rxdrop performance and > the perf report looks like this. 
> > Samples: 44K of event 'cycles', Event count (approx.): 38749965204 > Overhead Command Shared Object Symbol > 16.06% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc > 10.18% ksoftirqd/28 bpf_prog_3c8251c7e0fef8db [k] bpf_prog_3c8251c7e0fef8db > 10.15% xdpsock [i40e] [k] i40e_clean_rx_irq_zc > 10.06% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv > 7.45% xdpsock xdpsock [.] main > 5.76% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect > 4.51% xdpsock bpf_prog_3c8251c7e0fef8db [k] bpf_prog_3c8251c7e0fef8db > 3.67% xdpsock [kernel.vmlinux] [k] xsk_rcv > 3.06% ksoftirqd/28 [kernel.vmlinux] [k] bpf_xdp_redirect_map > 2.34% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_redirect > 2.33% xdpsock [kernel.vmlinux] [k] xdp_do_redirect > 1.69% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr > 1.69% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr > 1.42% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > 1.19% xdpsock [kernel.vmlinux] [k] bpf_xdp_redirect_map > 1.13% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device > 0.95% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device > 0.92% swapper [kernel.vmlinux] [k] intel_idle > 0.92% xdpsock [kernel.vmlinux] [k] __xsk_map_redirect > 0.80% ksoftirqd/28 [kernel.vmlinux] [k] __x86_indirect_thunk_rax > 0.73% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status > 0.71% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_lookup_elem > 0.63% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action > 0.62% ksoftirqd/28 [i40e] [k] i40e_napi_poll > 0.58% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu > > So with this patch applied, direct receive performance improvement comes down from 46% to 42%. > I think it is still substantial enough to provide an option to allow direct receive for > certain use cases. If it is OK, i can re-spin and submit the patches on top of the latest bpf-next I think it's too early to consider such drastic approach. 
The run-time performance of an XDP program should be the same as C code. Something is fishy in these numbers, since spending 10% CPU in a few loads and a single call to bpf_xdp_redirect_map() is just not right.
On 10/21/2019 3:34 PM, Alexei Starovoitov wrote: > On Mon, Oct 21, 2019 at 1:10 PM Samudrala, Sridhar > <sridhar.samudrala@intel.com> wrote: >> >> On 10/20/2019 10:12 AM, Björn Töpel wrote: >>> On Sun, 20 Oct 2019 at 12:15, Toke Høiland-Jørgensen <toke@redhat.com> wrote: >>>> >>>> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes: >>>> >>>>> On Fri, Oct 18, 2019 at 05:45:26PM -0700, Samudrala, Sridhar wrote: >>>>>> On 10/18/2019 5:14 PM, Alexei Starovoitov wrote: >>>>>>> On Fri, Oct 18, 2019 at 11:40:07AM -0700, Samudrala, Sridhar wrote: >>>>>>>> >>>>>>>> Perf report for "AF_XDP default rxdrop" with patched kernel - mitigations ON >>>>>>>> ========================================================================== >>>>>>>> Samples: 44K of event 'cycles', Event count (approx.): 38532389541 >>>>>>>> Overhead Command Shared Object Symbol >>>>>>>> 15.31% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc >>>>>>>> 10.50% ksoftirqd/28 bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 >>>>>>>> 9.48% xdpsock [i40e] [k] i40e_clean_rx_irq_zc >>>>>>>> 8.62% xdpsock xdpsock [.] main >>>>>>>> 7.11% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv >>>>>>>> 5.81% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect >>>>>>>> 4.46% xdpsock bpf_prog_80b55d8a76303785 [k] bpf_prog_80b55d8a76303785 >>>>>>>> 3.83% xdpsock [kernel.vmlinux] [k] xsk_rcv >>>>>>> >>>>>>> why everything is duplicated? >>>>>>> Same code runs in different tasks ? >>>>>> >>>>>> Yes. looks like these functions run from both the app(xdpsock) context and ksoftirqd context. 
>>>>>> >>>>>>> >>>>>>>> 2.81% ksoftirqd/28 [kernel.vmlinux] [k] bpf_xdp_redirect_map >>>>>>>> 2.78% ksoftirqd/28 [kernel.vmlinux] [k] xsk_map_lookup_elem >>>>>>>> 2.44% xdpsock [kernel.vmlinux] [k] xdp_do_redirect >>>>>>>> 2.19% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_redirect >>>>>>>> 1.62% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr >>>>>>>> 1.57% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr >>>>>>>> 1.32% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >>>>>>>> 1.28% xdpsock [kernel.vmlinux] [k] bpf_xdp_redirect_map >>>>>>>> 1.15% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device >>>>>>>> 1.12% xdpsock [kernel.vmlinux] [k] xsk_map_lookup_elem >>>>>>>> 1.06% xdpsock [kernel.vmlinux] [k] __xsk_map_redirect >>>>>>>> 0.94% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device >>>>>>>> 0.75% ksoftirqd/28 [kernel.vmlinux] [k] __x86_indirect_thunk_rax >>>>>>>> 0.66% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status >>>>>>>> 0.64% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action >>>>>>>> 0.64% swapper [kernel.vmlinux] [k] intel_idle >>>>>>>> 0.62% ksoftirqd/28 [i40e] [k] i40e_napi_poll >>>>>>>> 0.57% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >>>>>>>> >>>>>>>> Perf report for "AF_XDP direct rxdrop" with patched kernel - mitigations ON >>>>>>>> ========================================================================== >>>>>>>> Samples: 46K of event 'cycles', Event count (approx.): 38387018585 >>>>>>>> Overhead Command Shared Object Symbol >>>>>>>> 21.94% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc >>>>>>>> 14.36% xdpsock xdpsock [.] 
main >>>>>>>> 11.53% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv >>>>>>>> 11.32% xdpsock [i40e] [k] i40e_clean_rx_irq_zc >>>>>>>> 4.02% xdpsock [kernel.vmlinux] [k] xsk_rcv >>>>>>>> 2.91% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect >>>>>>>> 2.45% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr >>>>>>>> 2.19% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr >>>>>>>> 2.08% ksoftirqd/28 [kernel.vmlinux] [k] bpf_direct_xsk >>>>>>>> 2.07% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >>>>>>>> 1.53% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device >>>>>>>> 1.39% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device >>>>>>>> 1.22% ksoftirqd/28 [kernel.vmlinux] [k] xdp_get_xsk_from_qid >>>>>>>> 1.12% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status >>>>>>>> 0.96% ksoftirqd/28 [i40e] [k] i40e_napi_poll >>>>>>>> 0.95% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action >>>>>>>> 0.89% xdpsock [kernel.vmlinux] [k] xdp_do_redirect >>>>>>>> 0.83% swapper [i40e] [k] i40e_clean_rx_irq_zc >>>>>>>> 0.70% swapper [kernel.vmlinux] [k] intel_idle >>>>>>>> 0.66% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >>>>>>>> 0.60% xdpsock [kernel.vmlinux] [k] bpf_direct_xsk >>>>>>>> 0.50% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_discard_addr >>>>>>>> >>>>>>>> Based on the perf reports comparing AF_XDP default and direct rxdrop, we can say that >>>>>>>> AF_XDP direct rxdrop codepath is avoiding the overhead of going through these functions >>>>>>>> bpf_prog_xxx >>>>>>>> bpf_xdp_redirect_map >>>>>>>> xsk_map_lookup_elem >>>>>>>> __xsk_map_redirect >>>>>>>> With AF_XDP direct, xsk_rcv() is directly called via bpf_direct_xsk() in xdp_do_redirect() >>>>>>> >>>>>>> I don't think you're identifying the overhead correctly. 
>>>>>>> xsk_map_lookup_elem is 1% >>>>>>> but bpf_xdp_redirect_map() suppose to call __xsk_map_lookup_elem() >>>>>>> which is a different function: >>>>>>> ffffffff81493fe0 T __xsk_map_lookup_elem >>>>>>> ffffffff81492e80 t xsk_map_lookup_elem >>>>>>> >>>>>>> 10% for bpf_prog_80b55d8a76303785 is huge. >>>>>>> It's the actual code of the program _without_ any helpers. >>>>>>> How does the program actually look? >>>>>> >>>>>> It is the xdp program that is loaded via xsk_load_xdp_prog() in tools/lib/bpf/xsk.c >>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/xsk.c#n268 >>>>> >>>>> I see. Looks like map_gen_lookup was never implemented for xskmap. >>>>> How about adding it first the way array_map_gen_lookup() is implemented? >>>>> This will easily give 2x perf gain. >>>> >>>> I guess we should implement this for devmaps as well now that we allow >>>> lookups into those. >>>> >>>> However, in this particular example, the lookup from BPF is not actually >>>> needed, since bpf_redirect_map() will return a configurable error value >>>> when the map lookup fails (for exactly this use case). >>>> >>>> So replacing: >>>> >>>> if (bpf_map_lookup_elem(&xsks_map, &index)) >>>> return bpf_redirect_map(&xsks_map, index, 0); >>>> >>>> with simply >>>> >>>> return bpf_redirect_map(&xsks_map, index, XDP_PASS); >>>> >>>> would save the call to xsk_map_lookup_elem(). >>>> >>> >>> Thanks for the reminder! I just submitted a patch. Still, doing the >>> map_gen_lookup() for xsk/devmaps still makes sense! >>> >> >> I tried Bjorn's patch that avoids the lookups in the BPF prog. >> https://lore.kernel.org/netdev/20191021105938.11820-1-bjorn.topel@gmail.com/ >> >> With this patch I am also seeing around 3-4% increase in xdpsock rxdrop performance and >> the perf report looks like this. 
>> >> Samples: 44K of event 'cycles', Event count (approx.): 38749965204 >> Overhead Command Shared Object Symbol >> 16.06% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc >> 10.18% ksoftirqd/28 bpf_prog_3c8251c7e0fef8db [k] bpf_prog_3c8251c7e0fef8db >> 10.15% xdpsock [i40e] [k] i40e_clean_rx_irq_zc >> 10.06% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv >> 7.45% xdpsock xdpsock [.] main >> 5.76% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect >> 4.51% xdpsock bpf_prog_3c8251c7e0fef8db [k] bpf_prog_3c8251c7e0fef8db >> 3.67% xdpsock [kernel.vmlinux] [k] xsk_rcv >> 3.06% ksoftirqd/28 [kernel.vmlinux] [k] bpf_xdp_redirect_map >> 2.34% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_redirect >> 2.33% xdpsock [kernel.vmlinux] [k] xdp_do_redirect >> 1.69% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr >> 1.69% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr >> 1.42% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >> 1.19% xdpsock [kernel.vmlinux] [k] bpf_xdp_redirect_map >> 1.13% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device >> 0.95% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device >> 0.92% swapper [kernel.vmlinux] [k] intel_idle >> 0.92% xdpsock [kernel.vmlinux] [k] __xsk_map_redirect >> 0.80% ksoftirqd/28 [kernel.vmlinux] [k] __x86_indirect_thunk_rax >> 0.73% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status >> 0.71% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_lookup_elem >> 0.63% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action >> 0.62% ksoftirqd/28 [i40e] [k] i40e_napi_poll >> 0.58% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu >> >> So with this patch applied, direct receive performance improvement comes down from 46% to 42%. >> I think it is still substantial enough to provide an option to allow direct receive for >> certain use cases. If it is OK, i can re-spin and submit the patches on top of the latest bpf-next > > I think it's too early to consider such drastic approach. 
> The run-time performance of XDP program should be the same as C code. > Something fishy in these numbers, since spending 10% cpu in few loads > and single call to bpf_xdp_redirect_map() just not right. OK. Here is another data point that shows the perf report with the same test but CPU mitigations turned OFF. Here bpf_prog overhead goes down from almost (10.18 + 4.51)% to (3.23 + 1.44%). 21.40% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc 14.13% xdpsock [i40e] [k] i40e_clean_rx_irq_zc 8.33% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv 6.09% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect 5.19% xdpsock xdpsock [.] main 3.48% ksoftirqd/28 [kernel.vmlinux] [k] bpf_xdp_redirect_map 3.23% ksoftirqd/28 bpf_prog_3c8251c7e0fef8db [k] bpf_prog_3c8251c7e0fef8db 3.06% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_redirect 2.72% xdpsock [kernel.vmlinux] [k] xdp_do_redirect 2.27% xdpsock [kernel.vmlinux] [k] xsk_rcv 2.10% xdpsock [kernel.vmlinux] [k] xsk_umem_peek_addr 2.09% ksoftirqd/28 [kernel.vmlinux] [k] xsk_umem_peek_addr 1.89% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu 1.44% xdpsock bpf_prog_3c8251c7e0fef8db [k] bpf_prog_3c8251c7e0fef8db 1.36% xdpsock [kernel.vmlinux] [k] bpf_xdp_redirect_map 1.31% ksoftirqd/28 [kernel.vmlinux] [k] dma_direct_sync_single_for_device 1.30% xdpsock [kernel.vmlinux] [k] __xsk_map_redirect 1.23% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_device 0.97% ksoftirqd/28 [kernel.vmlinux] [k] __xsk_map_lookup_elem 0.90% ksoftirqd/28 [i40e] [k] i40e_clean_programming_status 0.81% xdpsock [kernel.vmlinux] [k] dma_direct_sync_single_for_cpu 0.76% swapper [i40e] [k] i40e_clean_rx_irq_zc 0.75% swapper [kernel.vmlinux] [k] intel_idle 0.59% ksoftirqd/28 [kernel.vmlinux] [k] net_rx_action So a major component of the bpf_prog overhead seems to be due to the CPU vulnerability mitigations. The other component is the bpf_xdp_redirect_map() codepath. 
Let me know if it helps to collect any other data that should further help with the perf analysis.
On Tue, Oct 22, 2019 at 12:06 PM Samudrala, Sridhar <sridhar.samudrala@intel.com> wrote: > > OK. Here is another data point that shows the perf report with the same test but CPU mitigations > turned OFF. Here bpf_prog overhead goes down from almost (10.18 + 4.51)% to (3.23 + 1.44%). > > 21.40% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc > 14.13% xdpsock [i40e] [k] i40e_clean_rx_irq_zc > 8.33% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv > 6.09% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect > 5.19% xdpsock xdpsock [.] main > 3.48% ksoftirqd/28 [kernel.vmlinux] [k] bpf_xdp_redirect_map > 3.23% ksoftirqd/28 bpf_prog_3c8251c7e0fef8db [k] bpf_prog_3c8251c7e0fef8db > > So a major component of the bpf_prog overhead seems to be due to the CPU vulnerability mitigations. I feel that it's an incorrect conclusion because JIT is not doing any retpolines (because there are no indirect calls in bpf). There should be no difference in bpf_prog runtime with or without mitigations. Also you're running root, so no spectre mitigations either. This 3% seems like a lot for a function that does few loads that should hit d-cache and one direct call. Please investigate why you're seeing this 10% cpu cost when mitigations are on. perf report/annotate is the best. Also please double check that you're using the latest perf. Since bpf performance analysis was greatly improved several versions ago. I don't think old perf will be showing bogus numbers, but better to run the latest. > The other component is the bpf_xdp_redirect_map() codepath. > > Let me know if it helps to collect any other data that should further help with the perf analysis. >
> > OK. Here is another data point that shows the perf report with the same test but CPU mitigations > > turned OFF. Here bpf_prog overhead goes down from almost (10.18 + 4.51)% to (3.23 + 1.44%). > > > > 21.40% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc > > 14.13% xdpsock [i40e] [k] i40e_clean_rx_irq_zc > > 8.33% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv > > 6.09% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect > > 5.19% xdpsock xdpsock [.] main > > 3.48% ksoftirqd/28 [kernel.vmlinux] [k] bpf_xdp_redirect_map > > 3.23% ksoftirqd/28 bpf_prog_3c8251c7e0fef8db [k] bpf_prog_3c8251c7e0fef8db > > > > So a major component of the bpf_prog overhead seems to be due to the CPU vulnerability mitigations. > I feel that it's an incorrect conclusion because JIT is not doing > any retpolines (because there are no indirect calls in bpf). > There should be no difference in bpf_prog runtime with or without mitigations. > Also you're running root, so no spectre mitigations either. > This 3% seems like a lot for a function that does few loads that should > hit d-cache and one direct call. > Please investigate why you're seeing this 10% cpu cost when mitigations are on. > perf report/annotate is the best. > Also please double check that you're using the latest perf. > Since bpf performance analysis was greatly improved several versions ago. > I don't think old perf will be showing bogus numbers, but better to > run the latest. Here is perf annotate output for bpf_prog_ with and without mitigations turned ON Using the perf built from the bpf-next tree. 
perf version 5.3.g4071324a76c1 With mitigations ON ------------------- Samples: 6K of event 'cycles', 4000 Hz, Event count (approx.): 5646512726 bpf_prog_3c8251c7e0fef8db bpf_prog_3c8251c7e0fef8db [Percent: local period] 45.05 push %rbp 0.02 mov %rsp,%rbp 0.03 sub $0x8,%rsp 22.09 push %rbx 7.66 push %r13 1.08 push %r14 1.85 push %r15 0.63 pushq $0x0 1.13 mov 0x28(%rdi),%rsi 0.47 mov 0x8(%rsi),%esi 3.47 mov %esi,-0x4(%rbp) 0.02 movabs $0xffff8ab414a83e00,%rdi 0.90 mov $0x2,%edx 2.85 callq *ffffffffd149fc5f 1.55 and $0x6,%rax test %rax,%rax 1.48 jne 72 mov %rbp,%rsi add $0xfffffffffffffffc,%rsi movabs $0xffff8ab414a83e00,%rdi callq *ffffffffd0e5fd5f mov %rax,%rdi mov $0x2,%eax test %rdi,%rdi je 72 mov -0x4(%rbp),%esi movabs $0xffff8ab414a83e00,%rdi xor %edx,%edx callq *ffffffffd149fc5f 72: pop %rbx pop %r15 1.90 pop %r14 1.93 pop %r13 pop %rbx 3.63 leaveq 2.27 retq With mitigations OFF -------------------- Samples: 2K of event 'cycles', 4000 Hz, Event count (approx.): 1872116166 bpf_prog_3c8251c7e0fef8db bpf_prog_3c8251c7e0fef8db [Percent: local period] 0.15 push %rbp mov %rsp,%rbp 13.79 sub $0x8,%rsp 0.30 push %rbx 0.15 push %r13 0.20 push %r14 14.50 push %r15 0.20 pushq $0x0 mov 0x28(%rdi),%rsi 0.25 mov 0x8(%rsi),%esi 14.37 mov %esi,-0x4(%rbp) 0.25 movabs $0xffff8ea2c673b800,%rdi mov $0x2,%edx 13.60 callq *ffffffffe50c2f38 14.33 and $0x6,%rax test %rax,%rax jne 72 mov %rbp,%rsi add $0xfffffffffffffffc,%rsi movabs $0xffff8ea2c673b800,%rdi callq *ffffffffe4a83038 mov %rax,%rdi mov $0x2,%eax test %rdi,%rdi je 72 mov -0x4(%rbp),%esi movabs $0xffff8ea2c673b800,%rdi xor %edx,%edx callq *ffffffffe50c2f38 72: pop %rbx pop %r15 13.97 pop %r14 0.10 pop %r13 pop %rbx 13.71 leaveq 0.15 retq Do you see any issues with this data? With mitigations ON push %rbp and push %rbx overhead seems to be pretty high. Thanks Sridhar
On Thu, 24 Oct 2019 at 20:12, Samudrala, Sridhar <sridhar.samudrala@intel.com> wrote: > [...] > > With mitigations ON > ------------------- > Samples: 6K of event 'cycles', 4000 Hz, Event count (approx.): 5646512726 > bpf_prog_3c8251c7e0fef8db bpf_prog_3c8251c7e0fef8db [Percent: local period] > 45.05 push %rbp > 0.02 mov %rsp,%rbp > 0.03 sub $0x8,%rsp > 22.09 push %rbx [...] > > Do you see any issues with this data? With mitigations ON push %rbp and push %rbx overhead seems to > be pretty high. That's sample skid from the retpoline thunk when entering the XDP program. Pretty expensive push otherwise! :-) Another thought; Disclaimer: I'm no spectrev2 expert, and probably not getting the mitigations well enough. So this is me trying to swim at the deep end! Would it be possible to avoid the retpoline when entering the XDP program? At least for some XDP program that can be proved "safe"? If so, PeterZ's upcoming static_call could be used from the driver side. Björn
On Wed, 23 Oct 2019 at 19:42, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > On Tue, Oct 22, 2019 at 12:06 PM Samudrala, Sridhar > <sridhar.samudrala@intel.com> wrote: > > > > OK. Here is another data point that shows the perf report with the same test but CPU mitigations > > turned OFF. Here bpf_prog overhead goes down from almost (10.18 + 4.51)% to (3.23 + 1.44%). > > > > 21.40% ksoftirqd/28 [i40e] [k] i40e_clean_rx_irq_zc > > 14.13% xdpsock [i40e] [k] i40e_clean_rx_irq_zc > > 8.33% ksoftirqd/28 [kernel.vmlinux] [k] xsk_rcv > > 6.09% ksoftirqd/28 [kernel.vmlinux] [k] xdp_do_redirect > > 5.19% xdpsock xdpsock [.] main > > 3.48% ksoftirqd/28 [kernel.vmlinux] [k] bpf_xdp_redirect_map > > 3.23% ksoftirqd/28 bpf_prog_3c8251c7e0fef8db [k] bpf_prog_3c8251c7e0fef8db > > > > So a major component of the bpf_prog overhead seems to be due to the CPU vulnerability mitigations. > > I feel that it's an incorrect conclusion because JIT is not doing > any retpolines (because there are no indirect calls in bpf). > There should be no difference in bpf_prog runtime with or without mitigations. > Also you're running root, so no spectre mitigations either. > > This 3% seems like a lot for a function that does few loads that should > hit d-cache and one direct call. > Please investigate why you're seeing this 10% cpu cost when mitigations are on. > perf report/annotate is the best. > Also please double check that you're using the latest perf. > Since bpf performance analysis was greatly improved several versions ago. > I don't think old perf will be showing bogus numbers, but better to > run the latest. > For comparison, on my Skylake 3GHz with mitigations off. (I have one internal patch that inlines xsk_rcv() into __xsk_map_redirect, so that's why it's not showing xsk_rcv(). I'll upstream that...) 
41.79% [kernel] [k] i40e_clean_rx_irq_zc 15.55% [kernel] [k] __xsk_map_redirect 9.87% [kernel] [k] xdp_do_redirect 6.89% [kernel] [k] xsk_umem_peek_addr 6.37% [kernel] [k] bpf_xdp_redirect_map 5.02% bpf_prog_992d9ddc835e5629 [k] bpf_prog_992d9ddc835e5629 Again, it might look weird that simple functions like bpf_xdp_redirect_map and the XDP program is 6% and 5%. Let's dig into that. I let the xdpsock program (rxdrop) run on one core 22, and the ksoftirqd on core 20. Core 20 is only processing packets, plus the regular kernel householding. I did a processor trace for core 20 for 207 936 packets. In total it's 84,407,427 instructions, and bpf_xdp_redirect_map() is 8,109,504 instructions, which is 9.6%. bpf_xdp_redirect_map() executes 39 instructions for AF_XDP. As perf is reporting less than 9.6% means that the IPC count of that function is more than the average which perf-stat reports as IPC of 2.88. The BPF program executes fewer instructions than bpf_xdp_redirect_map(), so given that perf shows 5%, means that the IPC count is better than average here as well. So, it's roughly 405 instructions per packet, and with an IPC of 2.88 that'll give ~140 cycles per packet, which on this machine (3,000,000,000/140) is ~21.4 Mpps. The xdpsock application reports 21. This is sane. The TL;DR version is: 6% and 5% for bpf_xdp_redirect_map and bpf_prog_992d9ddc835e5629 might seem high, but it's just that the total number of instruction executing is fairly low. So, even though the functions are small in size, it will show up as non-negligible percentage in perf. At these speeds, really small things have an impact on performance. DPDK has ~50 cycles per packet. Björn > > The other component is the bpf_xdp_redirect_map() codepath. > > > > Let me know if it helps to collect any other data that should further help with the perf analysis. > >
[...] > > > > With mitigations ON > > ------------------- > > Samples: 6K of event 'cycles', 4000 Hz, Event count (approx.): 5646512726 > > bpf_prog_3c8251c7e0fef8db bpf_prog_3c8251c7e0fef8db [Percent: local period] > > 45.05 push %rbp > > 0.02 mov %rsp,%rbp > > 0.03 sub $0x8,%rsp > > 22.09 push %rbx > [...] > > > > Do you see any issues with this data? With mitigations ON push %rbp and push %rbx overhead seems to > > be pretty high. > That's sample skid from the retpoline thunk when entring the XDP > program. Pretty expensive push otherwise! :-) > Another thought; Disclaimer: I'm no spectrev2 expert, and probably not > getting the mitigations well enough. So this is me trying to swim at > the deep end! Would it be possible to avoid the retpoline when > entering the XDP program. At least for some XDP program that can be > proved "safe"? If so, PeterZ's upcoming static_call could be used from > the driver side. Alexei, Jakub Do you think it will be possible to avoid this overhead when mitigations are turned ON? The other part of the overhead is going through the redirect path. Can I take your silence as an indication that you are now okay with an optional bypass flag as long as it doesn't affect the normal XDP datapath? If so, I will respin and submit the patches against the latest bpf-next -Sridhar
On Thu, Oct 31, 2019 at 3:38 PM Samudrala, Sridhar <sridhar.samudrala@intel.com> wrote: > > Alexei, Jakub > > Do you think it will be possible to avoid this overhead when mitigations are turned ON? yes > The other part of the overhead is going through the redirect path. > > Can i assume that your silence as an indication that you are now okay with optional bypass > flag as long as it doesn't effect the normal XDP datapath. If so, i will respin and submit > the patches against the latest bpf-next I'm still against it. Looks like the only motivation for it is extra overhead due to retpolines. imo it's not a good reason to introduce a bunch of extra code helping single kernel feature. We will have proper solution for indirect calls.
On Thu, 31 Oct 2019 15:38:42 -0700, Samudrala, Sridhar wrote: > Do you think it will be possible to avoid this overhead when mitigations are turned ON? > The other part of the overhead is going through the redirect path. Yes, you should help Maciej with the XDP bulking. > Can i assume that your silence as an indication that you are now okay with optional bypass > flag as long as it doesn't effect the normal XDP datapath. If so, i will respin and submit > the patches against the latest bpf-next This logic baffles me. I absolutely hate when people repost patches after I nack them without even as much as mentioning my objections in the cover letter. My concern was that we want the applications to encode fast path logic in BPF and load that into the kernel. So your patch works fundamentally against that goal: I worry that with the volume of patches that get posted each day objections of a measly contributor like myself will get forgotten if I don't reply to each posting, yet replying each time makes me look bullish or whatnot (apart from being an utter waste of time). Ugh.
On 10/31/2019 5:21 PM, Jakub Kicinski wrote: > On Thu, 31 Oct 2019 15:38:42 -0700, Samudrala, Sridhar wrote: >> Do you think it will be possible to avoid this overhead when mitigations are turned ON? >> The other part of the overhead is going through the redirect path. > > Yes, you should help Maciej with the XDP bulking. > >> Can i assume that your silence as an indication that you are now okay with optional bypass >> flag as long as it doesn't effect the normal XDP datapath. If so, i will respin and submit >> the patches against the latest bpf-next > > This logic baffles me. I absolutely hate when people repost patches > after I nack them without even as much as mentioning my objections in > the cover letter. Sorry if you got the impression that i didn't take your feedback. I CCed you and also included the kernel rxdrop data that you requested in the original series. > > My concern was that we want the applications to encode fast path logic > in BPF and load that into the kernel. So your patch works fundamentally > against that goal: So looks like you are saying that the fundamental requirement is that all AF_XDP packets need to go via a BPF program. The reason i proposed direct receive is because of the overhead we are seeing with going via BPF program for apps that want to receive all the packets on a specific queue. I agree that there is work going on to reduce this overhead with bulking and avoiding the retpoline. We can revisit after these optimizations get in and then see if it is still useful to provide a direct receive option. -Sridhar
On Thu, 2019-10-31 at 17:21 -0700, Jakub Kicinski wrote: > > My concern was that we want the applications to encode fast path > logic > in BPF and load that into the kernel. So your patch works > fundamentally > against that goal: I'm only one AF_XDP user but my main goal is to get packets to userspace as fast as possible. I don't (currently) need a BPF program in the path at all. I suspect that many other people that look at AF_XDP as a DPDK replacement have a similar view.
diff --git a/include/linux/filter.h b/include/linux/filter.h index 2ce57645f3cd..db4ad85d8321 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -585,6 +585,9 @@ struct bpf_redirect_info { struct bpf_map *map; struct bpf_map *map_to_flush; u32 kern_flags; +#ifdef CONFIG_XDP_SOCKETS + struct xdp_sock *xsk; +#endif }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); @@ -693,6 +696,16 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, return res; } +#ifdef CONFIG_XDP_SOCKETS +#define BPF_PROG_DIRECT_XSK 0x1 +#define bpf_is_direct_xsk_prog(prog) \ + ((unsigned long)prog == BPF_PROG_DIRECT_XSK) +DECLARE_STATIC_KEY_FALSE(xdp_direct_xsk_needed); +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp); +#else +#define bpf_is_direct_xsk_prog(prog) (false) +#endif + static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) { @@ -702,6 +715,11 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, * already takes rcu_read_lock() when fetching the program, so * it's not necessary here anymore. 
*/ +#ifdef CONFIG_XDP_SOCKETS + if (static_branch_unlikely(&xdp_direct_xsk_needed) && + bpf_is_direct_xsk_prog(prog)) + return bpf_direct_xsk(prog, xdp); +#endif return BPF_PROG_RUN(prog, xdp); } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 48cc71aae466..f4d0f70aa718 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -743,6 +743,7 @@ struct netdev_rx_queue { struct xdp_rxq_info xdp_rxq; #ifdef CONFIG_XDP_SOCKETS struct xdp_umem *umem; + struct xdp_sock *xsk; #endif } ____cacheline_aligned_in_smp; @@ -1836,6 +1837,10 @@ struct net_device { atomic_t carrier_up_count; atomic_t carrier_down_count; +#ifdef CONFIG_XDP_SOCKETS + u16 direct_xsk_count; +#endif + #ifdef CONFIG_WIRELESS_EXT const struct iw_handler_def *wireless_handlers; struct iw_public_data *wireless_data; @@ -3712,6 +3717,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb); +#ifdef CONFIG_XDP_SOCKETS +int dev_set_direct_xsk_prog(struct net_device *dev); +int dev_clear_direct_xsk_prog(struct net_device *dev); +#endif + static __always_inline int ____dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index c9398ce7960f..9158233d34e1 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -76,6 +76,9 @@ struct xsk_map_node { struct xdp_sock **map_entry; }; +/* Flags for the xdp_sock flags field. 
*/ +#define XDP_SOCK_DIRECT (1 << 0) + struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; @@ -104,6 +107,7 @@ struct xdp_sock { struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; + u16 flags; }; struct xdp_buff; @@ -129,6 +133,7 @@ void xsk_set_tx_need_wakeup(struct xdp_umem *umem); void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); +struct xdp_sock *xdp_get_xsk_from_qid(struct net_device *dev, u16 queue_id); void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry); diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index be328c59389d..d202b5d40859 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -25,6 +25,11 @@ * application. */ #define XDP_USE_NEED_WAKEUP (1 << 3) +/* This option allows an AF_XDP socket bound to a queue to receive all + * the packets directly from that queue when there is no XDP program + * attached to the device. 
+ */ +#define XDP_DIRECT (1 << 4) /* Flags for xsk_umem_config flags */ #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 205f95af67d2..871d738a78d2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1349,13 +1349,14 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) void bpf_prog_put(struct bpf_prog *prog) { - __bpf_prog_put(prog, true); + if (!bpf_is_direct_xsk_prog(prog)) + __bpf_prog_put(prog, true); } EXPORT_SYMBOL_GPL(bpf_prog_put); u32 bpf_get_prog_id(const struct bpf_prog *prog) { - if (prog) + if (prog && !bpf_is_direct_xsk_prog(prog)) return prog->aux->id; return 0; @@ -1364,7 +1365,7 @@ EXPORT_SYMBOL(bpf_get_prog_id); void bpf_set_prog_id(struct bpf_prog *prog, u32 id) { - if (prog) + if (prog && !bpf_is_direct_xsk_prog(prog)) prog->aux->id = id; } EXPORT_SYMBOL(bpf_set_prog_id); diff --git a/net/core/dev.c b/net/core/dev.c index 866d0ad936a5..eb3cd718e580 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8269,6 +8269,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, } else { if (!__dev_xdp_query(dev, bpf_op, query)) return 0; +#ifdef CONFIG_XDP_SOCKETS + if (dev->direct_xsk_count) + prog = (void *)BPF_PROG_DIRECT_XSK; +#endif } err = dev_xdp_install(dev, bpf_op, extack, flags, prog); @@ -8278,6 +8282,52 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return err; } +#ifdef CONFIG_XDP_SOCKETS +int dev_set_direct_xsk_prog(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct bpf_prog *prog; + bpf_op_t bpf_op; + + ASSERT_RTNL(); + + dev->direct_xsk_count++; + + bpf_op = ops->ndo_bpf; + if (!bpf_op) + return -EOPNOTSUPP; + + if (__dev_xdp_query(dev, bpf_op, XDP_QUERY_PROG)) + return 0; + + prog = (void *)BPF_PROG_DIRECT_XSK; + + return dev_xdp_install(dev, bpf_op, NULL, XDP_FLAGS_DRV_MODE, prog); +} + +int dev_clear_direct_xsk_prog(struct net_device *dev) +{ + 
const struct net_device_ops *ops = dev->netdev_ops; + bpf_op_t bpf_op; + + ASSERT_RTNL(); + + dev->direct_xsk_count--; + + if (dev->direct_xsk_count) + return 0; + + bpf_op = ops->ndo_bpf; + if (!bpf_op) + return -EOPNOTSUPP; + + if (__dev_xdp_query(dev, bpf_op, XDP_QUERY_PROG)) + return 0; + + return dev_xdp_install(dev, bpf_op, NULL, XDP_FLAGS_DRV_MODE, NULL); +} +#endif + /** * dev_new_index - allocate an ifindex * @net: the applicable net namespace diff --git a/net/core/filter.c b/net/core/filter.c index ed6563622ce3..391d7d600284 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -73,6 +73,7 @@ #include <net/lwtunnel.h> #include <net/ipv6_stubs.h> #include <net/bpf_sk_storage.h> +#include <linux/static_key.h> /** * sk_filter_trim_cap - run a packet through a socket filter @@ -3546,6 +3547,22 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, return 0; } +#ifdef CONFIG_XDP_SOCKETS +static void xdp_do_flush_xsk(struct bpf_redirect_info *ri) +{ + struct xdp_sock *xsk = READ_ONCE(ri->xsk); + + if (xsk) { + ri->xsk = NULL; + xsk_flush(xsk); + } +} +#else +static inline void xdp_do_flush_xsk(struct bpf_redirect_info *ri) +{ +} +#endif + void xdp_do_flush_map(void) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); @@ -3568,6 +3585,8 @@ void xdp_do_flush_map(void) break; } } + + xdp_do_flush_xsk(ri); } EXPORT_SYMBOL_GPL(xdp_do_flush_map); @@ -3631,11 +3650,28 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, return err; } +#ifdef CONFIG_XDP_SOCKETS +static inline struct xdp_sock *xdp_get_direct_xsk(struct bpf_redirect_info *ri) +{ + return READ_ONCE(ri->xsk); +} +#else +static inline struct xdp_sock *xdp_get_direct_xsk(struct bpf_redirect_info *ri) +{ + return NULL; +} +#endif + int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = READ_ONCE(ri->map); + struct xdp_sock 
*xsk; + + xsk = xdp_get_direct_xsk(ri); + if (xsk) + return xsk_rcv(xsk, xdp); if (likely(map)) return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri); @@ -8934,4 +8970,26 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { const struct bpf_prog_ops sk_reuseport_prog_ops = { }; + +#ifdef CONFIG_XDP_SOCKETS +DEFINE_STATIC_KEY_FALSE(xdp_direct_xsk_needed); +EXPORT_SYMBOL_GPL(xdp_direct_xsk_needed); + +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp) +{ + struct xdp_sock *xsk; + + xsk = xdp_get_xsk_from_qid(xdp->rxq->dev, xdp->rxq->queue_index); + if (xsk) { + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->xsk = xsk; + return XDP_REDIRECT; + } + + return XDP_PASS; +} +EXPORT_SYMBOL(bpf_direct_xsk); +#endif + #endif /* CONFIG_INET */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index fa8fbb8fa3c8..8a29939bac7e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -24,6 +24,7 @@ #include <linux/rculist.h> #include <net/xdp_sock.h> #include <net/xdp.h> +#include <linux/if_link.h> #include "xsk_queue.h" #include "xdp_umem.h" @@ -264,6 +265,45 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return err; } +static void xdp_clear_direct_xsk(struct xdp_sock *xsk) +{ + struct net_device *dev = xsk->dev; + u32 qid = xsk->queue_id; + + if (!dev->_rx[qid].xsk) + return; + + dev_clear_direct_xsk_prog(dev); + dev->_rx[qid].xsk = NULL; + static_branch_dec(&xdp_direct_xsk_needed); + xsk->flags &= ~XDP_SOCK_DIRECT; +} + +static int xdp_set_direct_xsk(struct xdp_sock *xsk) +{ + struct net_device *dev = xsk->dev; + u32 qid = xsk->queue_id; + int err = 0; + + if (dev->_rx[qid].xsk) + return -EEXIST; + + xsk->flags |= XDP_SOCK_DIRECT; + static_branch_inc(&xdp_direct_xsk_needed); + dev->_rx[qid].xsk = xsk; + err = dev_set_direct_xsk_prog(dev); + if (err) + xdp_clear_direct_xsk(xsk); + + return err; +} + +struct xdp_sock *xdp_get_xsk_from_qid(struct net_device *dev, u16 queue_id) +{ + return dev->_rx[queue_id].xsk; +} 
+EXPORT_SYMBOL(xdp_get_xsk_from_qid); + void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) { xskq_produce_flush_addr_n(umem->cq, nb_entries); @@ -464,6 +504,11 @@ static void xsk_unbind_dev(struct xdp_sock *xs) return; WRITE_ONCE(xs->state, XSK_UNBOUND); + if (xs->flags & XDP_SOCK_DIRECT) { + rtnl_lock(); + xdp_clear_direct_xsk(xs); + rtnl_unlock(); + } /* Wait for driver to stop using the xdp socket. */ xdp_del_sk_umem(xs->umem, xs); xs->dev = NULL; @@ -604,7 +649,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) flags = sxdp->sxdp_flags; if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | - XDP_USE_NEED_WAKEUP)) + XDP_USE_NEED_WAKEUP | XDP_DIRECT)) return -EINVAL; rtnl_lock(); @@ -632,7 +677,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) struct socket *sock; if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || - (flags & XDP_USE_NEED_WAKEUP)) { + (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_DIRECT)) { /* Cannot specify flags for shared sockets. */ err = -EINVAL; goto out_unlock; @@ -688,6 +733,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask); xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask); xdp_add_sk_umem(xs->umem, xs); + if (flags & XDP_DIRECT) + err = xdp_set_direct_xsk(xs); out_unlock: if (err) { diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h index be328c59389d..d202b5d40859 100644 --- a/tools/include/uapi/linux/if_xdp.h +++ b/tools/include/uapi/linux/if_xdp.h @@ -25,6 +25,11 @@ * application. */ #define XDP_USE_NEED_WAKEUP (1 << 3) +/* This option allows an AF_XDP socket bound to a queue to receive all + * the packets directly from that queue when there is no XDP program + * attached to the device. + */ +#define XDP_DIRECT (1 << 4) /* Flags for xsk_umem_config flags */ #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
Introduce a flag that can be specified during the bind() call of an AF_XDP socket to receive packets directly from a queue when there is no XDP program attached to the device. This is enabled by introducing a special BPF prog pointer called BPF_PROG_DIRECT_XSK and a new bind flag XDP_DIRECT that can be specified when binding an AF_XDP socket to a queue. At the time of bind(), an AF_XDP socket in XDP_DIRECT mode will attach BPF_PROG_DIRECT_XSK as a bpf program if there is no other XDP program attached to the device. The device receive queue is also associated with the AF_XDP socket. In the XDP receive path, if the bpf program is a DIRECT_XSK program, the XDP buffer is passed to the AF_XDP socket that is associated with the receive queue on which the packet is received. To avoid any overhead for normal XDP programs, a static key is used to keep a count of AF_XDP direct xsk sockets and static_branch_unlikely() is used to handle receives for direct XDP sockets. Any attach of a normal XDP program will take precedence and the direct xsk program will be removed. The direct XSK program will be attached automatically when the normal XDP program is removed, provided there are any AF_XDP direct sockets associated with that device. Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com> --- include/linux/filter.h | 18 ++++++++++++ include/linux/netdevice.h | 10 +++++++ include/net/xdp_sock.h | 5 ++++ include/uapi/linux/if_xdp.h | 5 ++++ kernel/bpf/syscall.c | 7 +++-- net/core/dev.c | 50 +++++++++++++++++++++++++++++++++ net/core/filter.c | 58 +++++++++++++++++++++++++++++++++++++++ net/xdp/xsk.c | 51 ++++++++++++++++++++++++++++++++-- tools/include/uapi/linux/if_xdp.h | 5 ++++ 9 files changed, 204 insertions(+), 5 deletions(-)