@@ -75,6 +75,7 @@ struct bpf_dtab {
struct bpf_dtab_netdev **netdev_map;
unsigned long __percpu *flush_needed;
struct list_head list;
+ struct rcu_head rcu;
};
static DEFINE_SPINLOCK(dev_map_lock);
@@ -85,23 +86,11 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr)
return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
}
-static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
+ bool check_memlock)
{
- struct bpf_dtab *dtab;
- int err = -EINVAL;
u64 cost;
-
- if (!capable(CAP_NET_ADMIN))
- return ERR_PTR(-EPERM);
-
- /* check sanity of attributes */
- if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
- return ERR_PTR(-EINVAL);
-
- dtab = kzalloc(sizeof(*dtab), GFP_USER);
- if (!dtab)
- return ERR_PTR(-ENOMEM);
+ int err;
bpf_map_init_from_attr(&dtab->map, attr);
@@ -109,60 +98,72 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
cost += dev_map_bitmap_size(attr) * num_possible_cpus();
if (cost >= U32_MAX - PAGE_SIZE)
- goto free_dtab;
+ return -EINVAL;
dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- /* if map size is larger than memlock limit, reject it early */
- err = bpf_map_precharge_memlock(dtab->map.pages);
- if (err)
- goto free_dtab;
-
- err = -ENOMEM;
+ if (check_memlock) {
+ /* if map size is larger than memlock limit, reject it early */
+ err = bpf_map_precharge_memlock(dtab->map.pages);
+ if (err)
+ return -EINVAL;
+ }
/* A per cpu bitfield with a bit per possible net device */
dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
__alignof__(unsigned long),
GFP_KERNEL | __GFP_NOWARN);
if (!dtab->flush_needed)
- goto free_dtab;
+ goto err_alloc;
dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
sizeof(struct bpf_dtab_netdev *),
dtab->map.numa_node);
if (!dtab->netdev_map)
- goto free_dtab;
+ goto err_map;
- spin_lock(&dev_map_lock);
- list_add_tail_rcu(&dtab->list, &dev_map_list);
- spin_unlock(&dev_map_lock);
+ return 0;
- return &dtab->map;
-free_dtab:
+ err_map:
free_percpu(dtab->flush_needed);
- kfree(dtab);
- return ERR_PTR(err);
+ err_alloc:
+ return -ENOMEM;
}
-static void dev_map_free(struct bpf_map *map)
+static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
- struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- int i, cpu;
+ struct bpf_dtab *dtab;
+ int err;
- /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
- * so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding critical sections in
- * these programs to complete. The rcu critical section only guarantees
- * no further reads against netdev_map. It does __not__ ensure pending
- * flush operations (if any) are complete.
- */
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+
+ dtab = kzalloc(sizeof(*dtab), GFP_USER);
+ if (!dtab)
+ return ERR_PTR(-ENOMEM);
+
+ err = dev_map_init_map(dtab, attr, true);
+ if (err) {
+ kfree(dtab);
+ return ERR_PTR(err);
+ }
spin_lock(&dev_map_lock);
- list_del_rcu(&dtab->list);
+ list_add_tail_rcu(&dtab->list, &dev_map_list);
spin_unlock(&dev_map_lock);
- bpf_clear_redirect_map(map);
- synchronize_rcu();
+ return &dtab->map;
+}
+
+static void __dev_map_free(struct rcu_head *rcu)
+{
+ struct bpf_dtab *dtab = container_of(rcu, struct bpf_dtab, rcu);
+ int i, cpu;
/* To ensure all pending flush operations have completed wait for flush
* bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
@@ -192,6 +193,26 @@ static void dev_map_free(struct bpf_map *map)
kfree(dtab);
}
+static void dev_map_free(struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+
+ /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete. The rcu critical section only guarantees
+ * no further reads against netdev_map. It does __not__ ensure pending
+ * flush operations (if any) are complete.
+ */
+
+ spin_lock(&dev_map_lock);
+ list_del_rcu(&dtab->list);
+ spin_unlock(&dev_map_lock);
+
+ bpf_clear_redirect_map(map);
+ call_rcu(&dtab->rcu, __dev_map_free);
+}
+
static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
@@ -429,12 +450,42 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
return 0;
}
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
+ struct bpf_dtab *dtab,
+ u32 ifindex,
+ unsigned int bit)
{
- struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- struct net *net = current->nsproxy->net_ns;
gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
+ struct bpf_dtab_netdev *dev;
+
+ dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+ sizeof(void *), gfp);
+ if (!dev->bulkq) {
+ kfree(dev);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ dev->dev = dev_get_by_index(net, ifindex);
+ if (!dev->dev) {
+ free_percpu(dev->bulkq);
+ kfree(dev);
+ return ERR_PTR(-EINVAL);
+ }
+
+ dev->bit = bit;
+ dev->dtab = dtab;
+
+ return dev;
+}
+
+static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
u32 i = *(u32 *)key;
u32 ifindex = *(u32 *)value;
@@ -449,26 +500,9 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
if (!ifindex) {
dev = NULL;
} else {
- dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
- if (!dev)
- return -ENOMEM;
-
- dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
- sizeof(void *), gfp);
- if (!dev->bulkq) {
- kfree(dev);
- return -ENOMEM;
- }
-
- dev->dev = dev_get_by_index(net, ifindex);
- if (!dev->dev) {
- free_percpu(dev->bulkq);
- kfree(dev);
- return -EINVAL;
- }
-
- dev->bit = i;
- dev->dtab = dtab;
+ dev = __dev_map_alloc_node(net, dtab, ifindex, i);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
}
/* Use call_rcu() here to ensure rcu critical sections have completed
@@ -482,6 +516,13 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
return 0;
}
+static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ return __dev_map_update_elem(current->nsproxy->net_ns,
+ map, key, value, map_flags);
+}
+
const struct bpf_map_ops dev_map_ops = {
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
The subsequent commits introducing default maps and a hash-based ifindex devmap require a bit of refactoring of the devmap code. Perform this first so the subsequent commits become easier to read. Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com> --- kernel/bpf/devmap.c | 177 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 109 insertions(+), 68 deletions(-)