[net-next,v2,2/3] xdp: Always use a devmap for XDP_REDIRECT to a device

Message ID	155136035969.3381.6907311502528761435.stgit@alrua-x1
State	Changes Requested
Delegated to:	BPF Maintainers
Headers	show Return-Path: <netdev-owner@vger.kernel.org> Subject: [PATCH net-next v2 2/3] xdp: Always use a devmap for XDP_REDIRECT to a device From: Toke Høiland-Jørgensen <toke@redhat.com> To: David Miller <davem@davemloft.net> Cc: netdev@vger.kernel.org, Jesper Dangaard Brouer <brouer@redhat.com>, Daniel Borkmann <daniel@iogearbox.net>, Alexei Starovoitov <ast@kernel.org>, Jakub Kicinski <jakub.kicinski@netronome.com> Date: Thu, 28 Feb 2019 14:25:59 +0100 Message-ID: <155136035969.3381.6907311502528761435.stgit@alrua-x1> In-Reply-To: <155136028377.3381.2072266362746015640.stgit@alrua-x1> References: <155136028377.3381.2072266362746015640.stgit@alrua-x1> User-Agent: StGit/unknown-version MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 8bit Sender: netdev-owner@vger.kernel.org Precedence: bulk
Series	xdp: Use a default map for xdp_redirect helper | expand [net-next,v2,0/3] xdp: Use a default map for xdp_redirect helper [net-next,v2,1/3] xdp: Refactor devmap code in preparation for subsequent additions [net-next,v2,3/3] xdp: Add devmap_idx map type for looking up devices by ifindex [net-next,v2,2/3] xdp: Always use a devmap for XDP_REDIRECT to a device

diff --git a/include/linux/bpf.h b/include/linux/bpf.h index de18227b3d95..b0f28989ccd7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -25,6 +25,7 @@ struct sock; struct seq_file; struct btf; struct btf_type; +struct net; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { @@ -533,6 +534,7 @@ extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); +struct bpf_prog *bpf_prog_get_by_id(u32 id); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); @@ -612,6 +614,11 @@ struct xdp_buff; struct sk_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); +struct bpf_map *__dev_map_get_default_map(struct net_device *dev); +int dev_map_ensure_default_map(struct net *net); +void dev_map_put_default_map(struct net *net); +int dev_map_inc_redirect_count(void); +void dev_map_dec_redirect_count(void); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, @@ -641,6 +648,11 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd) return ERR_PTR(-EOPNOTSUPP); } +static inline struct bpf_prog *bpf_prog_get_by_id(u32 id) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) @@ -693,6 +705,28 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, return NULL; } +static inline struct bpf_map *__dev_map_get_default_map(struct net_device *dev) +{ + return NULL; +} + +static inline int dev_map_ensure_default_map(struct net *net) +{ + return 0; +} + +static inline void dev_map_put_default_map(struct net *net) +{ +} + +static inline int dev_map_inc_redirect_count(void) 
+{ +} + +static inline void dev_map_dec_redirect_count(void) +{ +} + static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index) { } diff --git a/include/linux/filter.h b/include/linux/filter.h index 95e2d7ebdf21..dd6bbbab32f7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -507,6 +507,8 @@ struct bpf_prog { gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1, /* Do we need dst entry? */ + redirect_needed:1, /* Does program need access to xdp_redirect? */ + redirect_used:1, /* Does program use xdp_redirect? */ blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ kprobe_override:1, /* Do we override a kprobe? */ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index a68ced28d8f4..6706ecc25d8f 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -162,7 +162,7 @@ struct net { #if IS_ENABLED(CONFIG_CAN) struct netns_can can; #endif -#ifdef CONFIG_XDP_SOCKETS +#ifdef CONFIG_BPF_SYSCALL struct netns_xdp xdp; #endif struct sock *diag_nlsk; diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h index e5734261ba0a..4935dfe1cf43 100644 --- a/include/net/netns/xdp.h +++ b/include/net/netns/xdp.h @@ -4,10 +4,21 @@ #include <linux/rculist.h> #include <linux/mutex.h> +#include <linux/atomic.h> + +struct bpf_dtab; + +struct bpf_dtab_container { + struct bpf_dtab __rcu *dtab; + atomic_t refcnt; +}; struct netns_xdp { +#ifdef CONFIG_XDP_SOCKETS struct mutex lock; struct hlist_head list; +#endif + struct bpf_dtab_container default_map; }; #endif /* __NETNS_XDP_H__ */ diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1037fc08c504..e55707e62b60 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -56,6 +56,9 @@ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) #define DEV_MAP_BULK_SIZE 16 +#define DEV_MAP_DEFAULT_SIZE 8 +#define BPF_MAX_REFCNT 32768 + struct xdp_bulk_queue { struct xdp_frame 
*q[DEV_MAP_BULK_SIZE]; struct net_device *dev_rx; @@ -80,6 +83,7 @@ struct bpf_dtab { static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); +static atomic_t global_redirect_use = {}; static u64 dev_map_bitmap_size(const union bpf_attr *attr) { @@ -332,6 +336,18 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) return obj; } +/* This is only being called from xdp_do_redirect() if the xdp_redirect helper + * is used; the default map is allocated on XDP program load if the helper is + * used, so will always be available at this point. + */ +struct bpf_map *__dev_map_get_default_map(struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct bpf_dtab *dtab = rcu_dereference(net->xdp.default_map.dtab); + + return dtab ? &dtab->map : NULL; +} + /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ @@ -533,14 +549,210 @@ const struct bpf_map_ops dev_map_ops = { .map_check_btf = map_check_no_btf, }; +static inline struct net *bpf_default_map_to_net(struct bpf_dtab_container *cont) +{ + struct netns_xdp *xdp = container_of(cont, struct netns_xdp, default_map); + + return container_of(xdp, struct net, xdp); +} + +static void __dev_map_release_default_map(struct bpf_dtab_container *cont) +{ + struct bpf_dtab *dtab = NULL; + + lockdep_assert_held(&dev_map_lock); + + dtab = rcu_dereference(cont->dtab); + if (dtab) { + list_del_rcu(&dtab->list); + rcu_assign_pointer(cont->dtab, NULL); + bpf_clear_redirect_map(&dtab->map); + call_rcu(&dtab->rcu, __dev_map_free); + } +} + +void dev_map_put_default_map(struct net *net) +{ + if (atomic_dec_and_test(&net->xdp.default_map.refcnt)) { + spin_lock(&dev_map_lock); + __dev_map_release_default_map(&net->xdp.default_map); + spin_unlock(&dev_map_lock); + } +} + +static int __init_default_map(struct bpf_dtab_container *cont) +{ + struct net *net = bpf_default_map_to_net(cont); + struct bpf_dtab *dtab, *old_dtab; + int size = 
DEV_MAP_DEFAULT_SIZE; + struct net_device *netdev; + union bpf_attr attr = {}; + u32 idx; + int err; + + lockdep_assert_held(&dev_map_lock); + + if (!atomic_read(&global_redirect_use)) + return 0; + + for_each_netdev(net, netdev) + if (netdev->ifindex >= size) + size <<= 1; + + old_dtab = rcu_dereference(cont->dtab); + if (old_dtab && old_dtab->map.max_entries == size) + return 0; + + dtab = kzalloc(sizeof(*dtab), GFP_USER); + if (!dtab) + return -ENOMEM; + + attr.map_type = BPF_MAP_TYPE_DEVMAP; + attr.max_entries = size; + attr.value_size = 4; + attr.key_size = 4; + + err = dev_map_init_map(dtab, &attr, false); + if (err) { + kfree(dtab); + return err; + } + + for_each_netdev(net, netdev) { + idx = netdev->ifindex; + err = __dev_map_update_elem(net, &dtab->map, &idx, &idx, 0); + if (err) { + __dev_map_free(&dtab->rcu); + return err; + } + } + + rcu_assign_pointer(cont->dtab, dtab); + list_add_tail_rcu(&dtab->list, &dev_map_list); + + if (old_dtab) { + list_del_rcu(&old_dtab->list); + bpf_clear_redirect_map(&old_dtab->map); + call_rcu(&old_dtab->rcu, __dev_map_free); + } + + return 0; +} + +static int maybe_inc_refcnt(atomic_t *v) +{ + int refcnt; + + refcnt = atomic_inc_return(v); + if (refcnt > BPF_MAX_REFCNT) { + atomic_dec(v); + return -EBUSY; + } + + return refcnt; +} + +int dev_map_ensure_default_map(struct net *net) +{ + int refcnt, err = 0; + + refcnt = maybe_inc_refcnt(&net->xdp.default_map.refcnt); + if (refcnt < 0) + return refcnt; + + if (refcnt == 1) { + spin_lock(&dev_map_lock); + err = __init_default_map(&net->xdp.default_map); + spin_unlock(&dev_map_lock); + } + + return err; +} + +static void __dev_map_dec_redirect_count(void) +{ + struct net *net; + + lockdep_assert_held(&dev_map_lock); + + if (atomic_dec_and_test(&global_redirect_use)) + for_each_net_rcu(net) + __dev_map_release_default_map(&net->xdp.default_map); +} + +void dev_map_dec_redirect_count(void) +{ + spin_lock(&dev_map_lock); + __dev_map_dec_redirect_count(); + 
spin_unlock(&dev_map_lock); +} + +static int __dev_map_init_redirect_use(void) +{ + struct net *net; + int err; + + lockdep_assert_held(&dev_map_lock); + + for_each_net_rcu(net) { + if (atomic_read(&net->xdp.default_map.refcnt)) { + err = __init_default_map(&net->xdp.default_map); + if (err) + return err; + } + } + + return 0; +} + +int dev_map_inc_redirect_count(void) +{ + int refcnt, err = 0; + + spin_lock(&dev_map_lock); + refcnt = maybe_inc_refcnt(&global_redirect_use); + if (refcnt < 0) { + err = refcnt; + goto out; + } + + if (refcnt == 1) + err = __dev_map_init_redirect_use(); + + if (err) + __dev_map_dec_redirect_count(); + + out: + spin_unlock(&dev_map_lock); + return err; +} + static int dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(netdev); + u32 idx = netdev->ifindex; struct bpf_dtab *dtab; - int i; + int i, err; switch (event) { + case NETDEV_REGISTER: + rcu_read_lock(); + dtab = rcu_dereference(net->xdp.default_map.dtab); + if (dtab) { + err = __dev_map_update_elem(net, &dtab->map, + &idx, &idx, 0); + if (err == -E2BIG) { + spin_lock(&dev_map_lock); + err = __init_default_map(&net->xdp.default_map); + if (err) + net_warn_ratelimited("Unable to re-allocate default map, xdp_redirect() may fail on some ifindexes\n"); + spin_unlock(&dev_map_lock); + } + } + rcu_read_unlock(); + break; case NETDEV_UNREGISTER: /* This rcu_read_lock/unlock pair is needed because * dev_map_list is an RCU list AND to ensure a delete diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ec7c552af76b..fd1b76f5da2d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1265,6 +1265,9 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) kvfree(prog->aux->func_info); bpf_prog_free_linfo(prog); + if (prog->redirect_used) + dev_map_dec_redirect_count(); + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } } @@ -1962,6 +1965,21 
@@ static int bpf_obj_get_next_id(const union bpf_attr *attr, #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id +struct bpf_prog *bpf_prog_get_by_id(u32 id) +{ + struct bpf_prog *prog; + + spin_lock_bh(&prog_idr_lock); + prog = idr_find(&prog_idr, id); + if (prog) + prog = bpf_prog_inc_not_zero(prog); + else + prog = ERR_PTR(-ENOENT); + spin_unlock_bh(&prog_idr_lock); + + return prog; +} + static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) { struct bpf_prog *prog; @@ -1974,14 +1992,7 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - spin_lock_bh(&prog_idr_lock); - prog = idr_find(&prog_idr, id); - if (prog) - prog = bpf_prog_inc_not_zero(prog); - else - prog = ERR_PTR(-ENOENT); - spin_unlock_bh(&prog_idr_lock); - + prog = bpf_prog_get_by_id(id); if (IS_ERR(prog)) return PTR_ERR(prog); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1b9496c41383..f1b2f01e7ca1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7609,6 +7609,17 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_redirect) { + prog->redirect_needed = true; + if (!prog->redirect_used) { + int err = dev_map_inc_redirect_count(); + + if (err) + return err; + prog->redirect_used = true; + } + } + if (insn->imm == BPF_FUNC_override_return) prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { @@ -7618,6 +7629,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) * the program array. 
*/ prog->cb_access = 1; + prog->redirect_needed = true; env->prog->aux->stack_depth = MAX_BPF_STACK; env->prog->aux->max_pkt_offset = MAX_PACKET_OFF; diff --git a/net/core/dev.c b/net/core/dev.c index 2b67f2aa59dd..1df20d529026 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7990,6 +7990,21 @@ u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, return xdp.prog_id; } +static struct bpf_prog *dev_xdp_get_prog(struct net_device *dev) +{ + struct bpf_prog *prog; + u32 prog_id; + + prog_id = __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG); + if (prog_id) { + prog = bpf_prog_get_by_id(prog_id); + if (!IS_ERR(prog)) + return prog; + } + + return NULL; +} + static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) @@ -8024,9 +8039,18 @@ static void dev_xdp_uninstall(struct net_device *dev) memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_QUERY_PROG; WARN_ON(ndo_bpf(dev, &xdp)); - if (xdp.prog_id) + if (xdp.prog_id) { + struct bpf_prog *prog = bpf_prog_get_by_id(xdp.prog_id); + + if (!IS_ERR(prog)) { + if (prog->redirect_needed) + dev_map_put_default_map(dev_net(dev)); + bpf_prog_put(prog); + } + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL)); + } /* Remove HW offload */ memset(&xdp, 0, sizeof(xdp)); @@ -8091,6 +8115,23 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, bpf_prog_put(prog); return -EINVAL; } + + if (!offload && bpf_op == ops->ndo_bpf && + prog->redirect_needed) { + err = dev_map_ensure_default_map(dev_net(dev)); + if (err) { + NL_SET_ERR_MSG(extack, "unable to allocate default map for xdp_redirect()"); + return err; + } + } + } else { + struct bpf_prog *old_prog = dev_xdp_get_prog(dev); + + if (old_prog) { + if (old_prog->redirect_needed) + dev_map_put_default_map(dev_net(dev)); + bpf_prog_put(old_prog); + } } err = dev_xdp_install(dev, bpf_op, extack, flags, prog); @@ -9333,6 +9374,7 @@ 
EXPORT_SYMBOL(unregister_netdev); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { int err, new_nsid, new_ifindex; + struct bpf_prog *prog = NULL; ASSERT_RTNL(); @@ -9350,6 +9392,15 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char if (net_eq(dev_net(dev), net)) goto out; + prog = dev_xdp_get_prog(dev); + if (prog) { + if (prog->redirect_needed) + err = dev_map_ensure_default_map(net); + + if (err) + goto out; + } + /* Pick the destination device name, and ensure * we can use it in the destination network namespace. */ @@ -9388,6 +9439,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); + if (prog && prog->redirect_needed) + dev_map_put_default_map(dev_net(dev)); + new_nsid = peernet2id_alloc(dev_net(dev), net); /* If there is an ifindex conflict assign a new one */ if (__dev_get_by_index(net, dev->ifindex)) @@ -9435,6 +9489,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char synchronize_net(); err = 0; out: + if (prog) + bpf_prog_put(prog); + return err; } EXPORT_SYMBOL_GPL(dev_change_net_namespace); diff --git a/net/core/filter.c b/net/core/filter.c index 5132c054c981..be02ea103d05 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3337,58 +3337,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .arg2_type = ARG_ANYTHING, }; -static int __bpf_tx_xdp(struct net_device *dev, - struct bpf_map *map, - struct xdp_buff *xdp, - u32 index) -{ - struct xdp_frame *xdpf; - int err, sent; - - if (!dev->netdev_ops->ndo_xdp_xmit) { - return -EOPNOTSUPP; - } - - err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); - if (unlikely(err)) - return err; - - xdpf = convert_to_xdp_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - - sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH); - if (sent <= 0) - return sent; - return 0; -} 
- -static noinline int -xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri) -{ - struct net_device *fwd; - u32 index = ri->ifindex; - int err; - - fwd = dev_get_by_index_rcu(dev_net(dev), index); - ri->ifindex = 0; - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } - - err = __bpf_tx_xdp(fwd, NULL, xdp, 0); - if (unlikely(err)) - goto err; - - _trace_xdp_redirect(dev, xdp_prog, index); - return 0; -err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); - return err; -} - static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, struct bpf_map *map, struct xdp_buff *xdp, @@ -3519,10 +3467,10 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = READ_ONCE(ri->map); - if (likely(map)) - return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri); + if (unlikely(!map)) + map = __dev_map_get_default_map(dev); - return xdp_do_redirect_slow(dev, xdp, xdp_prog, ri); + return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri); } EXPORT_SYMBOL_GPL(xdp_do_redirect);

[net-next,v2,2/3] xdp: Always use a devmap for XDP_REDIRECT to a device

Commit Message

Patch