[net-next,v4,3/6] xdp: Refactor devmap code in preparation for subsequent additions

Message ID 155474315675.24432.16407129756640477880.stgit@alrua-x1
State: Changes Requested
Delegated to: David Miller
Series: xdp: Use a default map for xdp_redirect helper

Commit Message

Toke Høiland-Jørgensen April 8, 2019, 5:05 p.m. UTC
The subsequent commits introducing default maps and a hash-based ifindex
devmap require a bit of refactoring of the devmap code. Perform this first
so the subsequent commits become easier to read.
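
Concretely, the patch splits the existing allocation and update paths into
reusable helpers with the following signatures (taken from the diff below):

    static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
                                bool check_memlock);
    static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
                                                        struct bpf_dtab *dtab,
                                                        u32 ifindex,
                                                        unsigned int bit);
    static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
                                     void *key, void *value, u64 map_flags);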

Also split out the final freeing and flushing of devmaps to a workqueue,
to make it easier to defer the freeing of maps in the subsequent patches.
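
For review, the resulting teardown flow condenses to the two-stage shape
below; this is a sketch of the code added in this patch, not a complete
listing:

    /* Stage 1: synchronous part, called when the map refcount hits zero. */
    static void dev_map_free(struct bpf_map *map)
    {
            struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);

            mutex_lock(&dev_map_mtx);
            list_del_rcu(&dtab->list);
            mutex_unlock(&dev_map_mtx);

            bpf_clear_redirect_map(map);
            queue_work(dev_map_wq, &dtab->free_work); /* defer the slow part */
    }

    /* Stage 2: runs from the dev_map_wq workqueue and may block. */
    static void __dev_map_free(struct work_struct *work)
    {
            struct bpf_dtab *dtab = container_of(work, struct bpf_dtab,
                                                 free_work);

            synchronize_rcu(); /* wait out readers of netdev_map */
            /* ... wait for pending flushes, free entries, free the dtab ... */
    }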

Finally, change the spinlock protecting the devmap list into a mutex, as
subsequent patches add code that needs to be able to sleep while holding
the lock.
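
The mutex matters because sleeping is forbidden inside a spinlock'ed
critical section. With the change, later patches can safely do something
like the following while walking dev_map_list (an illustrative fragment;
do_sleeping_setup() is a hypothetical placeholder, not a function added by
this series):

    struct bpf_dtab *dtab;
    int err = 0;

    mutex_lock(&dev_map_mtx);
    list_for_each_entry(dtab, &dev_map_list, list) {
            /* A GFP_KERNEL allocation or other sleeping call here would
             * have been a bug under the old dev_map_lock spinlock.
             */
            err = do_sleeping_setup(dtab);
            if (err)
                    break;
    }
    mutex_unlock(&dev_map_mtx);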

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 kernel/bpf/devmap.c |  186 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 120 insertions(+), 66 deletions(-)

Patch

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 191b79948424..92393b283b87 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,6 +48,7 @@ 
  * calls will fail at this point.
  */
 #include <linux/bpf.h>
+#include <linux/workqueue.h>
 #include <net/xdp.h>
 #include <linux/filter.h>
 #include <trace/events/xdp.h>
@@ -75,33 +76,30 @@  struct bpf_dtab {
 	struct bpf_dtab_netdev **netdev_map;
 	unsigned long __percpu *flush_needed;
 	struct list_head list;
+	struct work_struct free_work;
 };
 
-static DEFINE_SPINLOCK(dev_map_lock);
+static DEFINE_MUTEX(dev_map_mtx);
 static LIST_HEAD(dev_map_list);
 
+static struct workqueue_struct *dev_map_wq;
+static void __dev_map_free(struct work_struct *work);
+
 static u64 dev_map_bitmap_size(const union bpf_attr *attr)
 {
 	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
 }
 
-static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
+			    bool check_memlock)
 {
-	struct bpf_dtab *dtab;
-	int err = -EINVAL;
 	u64 cost;
-
-	if (!capable(CAP_NET_ADMIN))
-		return ERR_PTR(-EPERM);
+	int err;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
-		return ERR_PTR(-EINVAL);
-
-	dtab = kzalloc(sizeof(*dtab), GFP_USER);
-	if (!dtab)
-		return ERR_PTR(-ENOMEM);
+		return -EINVAL;
 
 	bpf_map_init_from_attr(&dtab->map, attr);
 
@@ -109,59 +107,70 @@  static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
 	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
 	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_dtab;
+		return -EINVAL;
 
 	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	/* if map size is larger than memlock limit, reject it early */
-	err = bpf_map_precharge_memlock(dtab->map.pages);
-	if (err)
-		goto free_dtab;
-
-	err = -ENOMEM;
+	if (check_memlock) {
+		/* if map size is larger than memlock limit, reject it early */
+		err = bpf_map_precharge_memlock(dtab->map.pages);
+		if (err)
+			return err;
+	}
 
 	/* A per cpu bitfield with a bit per possible net device */
 	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
 						__alignof__(unsigned long),
 						GFP_KERNEL | __GFP_NOWARN);
 	if (!dtab->flush_needed)
-		goto free_dtab;
+		return -ENOMEM;
 
 	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
 					      sizeof(struct bpf_dtab_netdev *),
 					      dtab->map.numa_node);
 	if (!dtab->netdev_map)
-		goto free_dtab;
+		goto free_map;
 
-	spin_lock(&dev_map_lock);
-	list_add_tail_rcu(&dtab->list, &dev_map_list);
-	spin_unlock(&dev_map_lock);
+	INIT_WORK(&dtab->free_work, __dev_map_free);
 
-	return &dtab->map;
-free_dtab:
+	return 0;
+
+free_map:
 	free_percpu(dtab->flush_needed);
-	kfree(dtab);
-	return ERR_PTR(err);
+	return -ENOMEM;
 }
 
-static void dev_map_free(struct bpf_map *map)
+static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 {
-	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	int i, cpu;
+	struct bpf_dtab *dtab;
+	int err;
 
-	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
-	 * so the programs (can be more than one that used this map) were
-	 * disconnected from events. Wait for outstanding critical sections in
-	 * these programs to complete. The rcu critical section only guarantees
-	 * no further reads against netdev_map. It does __not__ ensure pending
-	 * flush operations (if any) are complete.
-	 */
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
 
-	spin_lock(&dev_map_lock);
-	list_del_rcu(&dtab->list);
-	spin_unlock(&dev_map_lock);
+	dtab = kzalloc(sizeof(*dtab), GFP_USER);
+	if (!dtab)
+		return ERR_PTR(-ENOMEM);
 
-	bpf_clear_redirect_map(map);
+	err = dev_map_init_map(dtab, attr, true);
+	if (err) {
+		kfree(dtab);
+		return ERR_PTR(err);
+	}
+
+	mutex_lock(&dev_map_mtx);
+	list_add_tail_rcu(&dtab->list, &dev_map_list);
+	mutex_unlock(&dev_map_mtx);
+
+	return &dtab->map;
+}
+
+static void __dev_map_free(struct work_struct *work)
+{
+	struct bpf_dtab *dtab = container_of(work, struct bpf_dtab, free_work);
+	int i, cpu;
+
+	/* Make sure all references to this dtab are cleared out. */
 	synchronize_rcu();
 
 	/* To ensure all pending flush operations have completed wait for flush
@@ -192,6 +201,26 @@  static void dev_map_free(struct bpf_map *map)
 	kfree(dtab);
 }
 
+static void dev_map_free(struct bpf_map *map)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+
+	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding critical sections in
+	 * these programs to complete. The rcu critical section only guarantees
+	 * no further reads against netdev_map. It does __not__ ensure pending
+	 * flush operations (if any) are complete.
+	 */
+
+	mutex_lock(&dev_map_mtx);
+	list_del_rcu(&dtab->list);
+	mutex_unlock(&dev_map_mtx);
+
+	bpf_clear_redirect_map(map);
+	queue_work(dev_map_wq, &dtab->free_work);
+}
+
 static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
@@ -429,12 +458,42 @@  static int dev_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
-				u64 map_flags)
+static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
+						    struct bpf_dtab *dtab,
+						    u32 ifindex,
+						    unsigned int bit)
 {
-	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	struct net *net = current->nsproxy->net_ns;
 	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
+	struct bpf_dtab_netdev *dev;
+
+	dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+					sizeof(void *), gfp);
+	if (!dev->bulkq) {
+		kfree(dev);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	dev->dev = dev_get_by_index(net, ifindex);
+	if (!dev->dev) {
+		free_percpu(dev->bulkq);
+		kfree(dev);
+		return ERR_PTR(-EINVAL);
+	}
+
+	dev->bit = bit;
+	dev->dtab = dtab;
+
+	return dev;
+}
+
+static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
+				 void *key, void *value, u64 map_flags)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *dev, *old_dev;
 	u32 i = *(u32 *)key;
 	u32 ifindex = *(u32 *)value;
@@ -449,26 +508,9 @@  static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (!ifindex) {
 		dev = NULL;
 	} else {
-		dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
-		if (!dev)
-			return -ENOMEM;
-
-		dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
-						sizeof(void *), gfp);
-		if (!dev->bulkq) {
-			kfree(dev);
-			return -ENOMEM;
-		}
-
-		dev->dev = dev_get_by_index(net, ifindex);
-		if (!dev->dev) {
-			free_percpu(dev->bulkq);
-			kfree(dev);
-			return -EINVAL;
-		}
-
-		dev->bit = i;
-		dev->dtab = dtab;
+		dev = __dev_map_alloc_node(net, dtab, ifindex, i);
+		if (IS_ERR(dev))
+			return PTR_ERR(dev);
 	}
 
 	/* Use call_rcu() here to ensure rcu critical sections have completed
@@ -482,6 +524,13 @@  static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	return 0;
 }
 
+static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+			       u64 map_flags)
+{
+	return __dev_map_update_elem(current->nsproxy->net_ns,
+				     map, key, value, map_flags);
+}
+
 const struct bpf_map_ops dev_map_ops = {
 	.map_alloc = dev_map_alloc,
 	.map_free = dev_map_free,
@@ -537,6 +586,11 @@  static int __init dev_map_init(void)
 	/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
 	BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
 		     offsetof(struct _bpf_dtab_netdev, dev));
+
+	dev_map_wq = alloc_workqueue("dev_map_wq", 0, 0);
+	if (!dev_map_wq)
+		return -ENOMEM;
+
 	register_netdevice_notifier(&dev_map_notifier);
 	return 0;
 }