Message ID | 20191211223344.165549-7-brianvv@google.com |
---|---|
State | Changes Requested |
Delegated to: | BPF Maintainers |
Series | add bpf batch ops to process more than 1 elem |
On 12/11/19 2:33 PM, Brian Vazquez wrote: > From: Yonghong Song <yhs@fb.com> > > htab can't use generic batch support due some problematic behaviours > inherent to the data structre, i.e. while iterating the bpf map a > concurrent program might delete the next entry that batch was about to > use, in that case there's no easy solution to retrieve the next entry, > the issue has been discussed multiple times (see [1] and [2]). > > The only way hmap can be traversed without the problem previously > exposed is by making sure that the map is traversing entire buckets. > This commit implements those strict requirements for hmap, the > implementation follows the same interaction that generic support with > some exceptions: > > - If keys/values buffer are not big enough to traverse a bucket, > ENOSPC will be returned. > - out_batch contains the value of the next bucket in the iteration, not > the next key, but this is transparent for the user since the user > should never use out_batch for other than bpf batch syscalls. > > Note that only lookup and lookup_and_delete batch ops require the hmap > specific implementation, update/delete batch ops can be the generic > ones. > > [1] https://lore.kernel.org/bpf/20190724165803.87470-1-brianvv@google.com/ > [2] https://lore.kernel.org/bpf/20190906225434.3635421-1-yhs@fb.com/ > > Signed-off-by: Yonghong Song <yhs@fb.com> > Signed-off-by: Brian Vazquez <brianvv@google.com> > --- > kernel/bpf/hashtab.c | 242 +++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 242 insertions(+) > > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c > index 22066a62c8c97..fac107bdaf9ec 100644 > --- a/kernel/bpf/hashtab.c > +++ b/kernel/bpf/hashtab.c > @@ -17,6 +17,17 @@ > (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ > BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) > > +#define BATCH_OPS(_name) \ > + .map_lookup_batch = \ > + _name##_map_lookup_batch, \ > + .map_lookup_and_delete_batch = \ > + _name##_map_lookup_and_delete_batch, \ > + .map_update_batch = \ > + generic_map_update_batch, \ > + .map_delete_batch = \ > + generic_map_delete_batch > + > + > struct bucket { > struct hlist_nulls_head head; > raw_spinlock_t lock; > @@ -1232,6 +1243,233 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, > rcu_read_unlock(); > } > > +static int > +__htab_map_lookup_and_delete_batch(struct bpf_map *map, > + const union bpf_attr *attr, > + union bpf_attr __user *uattr, > + bool do_delete, bool is_lru_map, > + bool is_percpu) > +{ > + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); > + u32 bucket_cnt, total, key_size, value_size, roundup_key_size; > + void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; > + void __user *uvalues = u64_to_user_ptr(attr->batch.values); > + void __user *ukeys = u64_to_user_ptr(attr->batch.keys); > + void *ubatch = u64_to_user_ptr(attr->batch.in_batch); > + u64 elem_map_flags, map_flags; > + struct hlist_nulls_head *head; > + u32 batch, max_count, size; > + struct hlist_nulls_node *n; > + unsigned long flags; > + struct htab_elem *l; > + struct bucket *b; > + int ret = 0; > + > + max_count = attr->batch.count; > + if (!max_count) > + return 0; In all previous implementation, we did sanity checks for flags and then checked max_count. To be consistent, we should move this after map_flags check. 
> + > + elem_map_flags = attr->batch.elem_flags; > + if ((elem_map_flags & ~BPF_F_LOCK) || > + ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) > + return -EINVAL; > + > + map_flags = attr->batch.flags; > + if (map_flags) > + return -EINVAL; > + > + batch = 0; > + if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch))) > + return -EFAULT; > + > + if (batch >= htab->n_buckets) > + return -ENOENT; > + > + /* We cannot do copy_from_user or copy_to_user inside > + * the rcu_read_lock. Allocate enough space here. > + */ > + key_size = htab->map.key_size; > + roundup_key_size = round_up(htab->map.key_size, 8); > + value_size = htab->map.value_size; > + size = round_up(value_size, 8); > + if (is_percpu) > + value_size = size * num_possible_cpus(); > + keys = kvmalloc(key_size, GFP_USER | __GFP_NOWARN); > + values = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); > + if (!keys || !values) { > + ret = -ENOMEM; > + goto out; > + } > + > + dst_key = keys; > + dst_val = values; > + total = 0; > + > + preempt_disable(); > + this_cpu_inc(bpf_prog_active); > + rcu_read_lock(); > + > +again: > + b = &htab->buckets[batch]; > + head = &b->head; > + raw_spin_lock_irqsave(&b->lock, flags); > + > + bucket_cnt = 0; > + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) > + bucket_cnt++; > + > + if (bucket_cnt > (max_count - total)) { > + if (total == 0) > + ret = -ENOSPC; > + goto after_loop; > + } > + > + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { > + memcpy(dst_key, l->key, key_size); > + > + if (is_percpu) { > + int off = 0, cpu; > + void __percpu *pptr; > + > + pptr = htab_elem_get_ptr(l, map->key_size); > + for_each_possible_cpu(cpu) { > + bpf_long_memcpy(dst_val + off, > + per_cpu_ptr(pptr, cpu), size); > + off += size; > + } > + } else { > + value = l->key + roundup_key_size; > + if (elem_map_flags & BPF_F_LOCK) > + copy_map_value_locked(map, dst_val, value, > + true); > + else > + copy_map_value(map, dst_val, value); > + check_and_init_map_lock(map, dst_val); > + } > + if (do_delete) { > + hlist_nulls_del_rcu(&l->hash_node); > + if (is_lru_map) > + bpf_lru_push_free(&htab->lru, &l->lru_node); > + else > + free_htab_elem(htab, l); > + } > + if (copy_to_user(ukeys + total * key_size, keys, key_size) || > + copy_to_user(uvalues + total * value_size, values, > + value_size)) { > + ret = -EFAULT; > + goto after_loop; > + } > + total++; > + } > + > + batch++; > + if (batch >= htab->n_buckets) { > + ret = -ENOENT; > + goto after_loop; > + } > + > + raw_spin_unlock_irqrestore(&b->lock, flags); > + goto again; > + > +after_loop: > + raw_spin_unlock_irqrestore(&b->lock, flags); > + > + rcu_read_unlock(); > + this_cpu_dec(bpf_prog_active); > + preempt_enable(); > + When reaching here, only the following values are possible for 'ret': 0 <=== everything is okay, still have some buckets left -ENOENT <=== everything is okay, not all user buffer filled as we reach the end of hash table -EFAULT <=== fault during copy data to user space -ENOSPC <=== no enough buffer space to copy the 'batch'. > + if (ret && ret != -ENOENT) > + goto out; Maybe we should do: 0 <=== continue to copy data back to user -ENOSPC <=== continue to copy data back to user user needs to check -ENOSPC error code -EFAULT <=== we can do the copy below or go out since -EFAULT will be returned any way -ENOENT <=== we reached the end, so we actually have no error. we should still copy data back to user. The user can check -ENOENT to signal end of the traversal, similar to get_next_key(). Do this make sense? 
> + > + /* copy data back to user */ > + ubatch = u64_to_user_ptr(attr->batch.out_batch); > + if (copy_to_user(ubatch, &batch, sizeof(batch)) || > + put_user(total, &uattr->batch.count)) > + ret = -EFAULT; > + > +out: > + kvfree(keys); > + kvfree(values); > + return ret; > +} > + [...]
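As a concrete illustration of the error-code semantics discussed above, here is a minimal user-space sketch of a traversal loop. This is not part of the series; it assumes the BPF_MAP_LOOKUP_BATCH command and the attr->batch layout introduced earlier in the series, follows the proposed convention that -ENOENT simply marks the end of the traversal (as with bpf_map_get_next_key()) while -ENOSPC asks the caller for bigger buffers, and the helper name dump_hash_map() is invented for the example:

#include <errno.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Illustrative only: one full traversal of a hash map with the batch
 * lookup command proposed in this series.  keys/values must have room
 * for max_count elements of key_size/value_size bytes each.
 */
static int dump_hash_map(int map_fd, void *keys, void *values,
			 __u32 key_size, __u32 value_size, __u32 max_count)
{
	__u32 in_batch = 0, out_batch = 0;
	bool first = true;
	union bpf_attr attr;
	int err;

	for (;;) {
		memset(&attr, 0, sizeof(attr));
		attr.batch.map_fd = map_fd;
		/* a NULL in_batch means "start from the first bucket" */
		attr.batch.in_batch = first ? 0 : (__u64)(unsigned long)&in_batch;
		attr.batch.out_batch = (__u64)(unsigned long)&out_batch;
		attr.batch.keys = (__u64)(unsigned long)keys;
		attr.batch.values = (__u64)(unsigned long)values;
		attr.batch.count = max_count;

		err = syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH, &attr,
			      sizeof(attr));
		if (err && errno != ENOENT && errno != ENOSPC)
			return -errno;

		/* Under the proposed semantics attr.batch.count elements were
		 * copied into keys/values even when -ENOENT is returned;
		 * process them here before looking at the error code.
		 */

		if (err && errno == ENOENT)	/* clean end of the traversal */
			return 0;
		if (err)	/* -ENOSPC: buffers too small for one bucket */
			return -ENOSPC;

		in_batch = out_batch;	/* continue from the next bucket */
		first = false;
	}
}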
On 12/11/19 2:33 PM, Brian Vazquez wrote: > From: Yonghong Song <yhs@fb.com> > > htab can't use generic batch support due some problematic behaviours > inherent to the data structre, i.e. while iterating the bpf map a > concurrent program might delete the next entry that batch was about to > use, in that case there's no easy solution to retrieve the next entry, > the issue has been discussed multiple times (see [1] and [2]). > > The only way hmap can be traversed without the problem previously > exposed is by making sure that the map is traversing entire buckets. > This commit implements those strict requirements for hmap, the > implementation follows the same interaction that generic support with > some exceptions: > > - If keys/values buffer are not big enough to traverse a bucket, > ENOSPC will be returned. > - out_batch contains the value of the next bucket in the iteration, not > the next key, but this is transparent for the user since the user > should never use out_batch for other than bpf batch syscalls. > > Note that only lookup and lookup_and_delete batch ops require the hmap > specific implementation, update/delete batch ops can be the generic > ones. > > [1] https://lore.kernel.org/bpf/20190724165803.87470-1-brianvv@google.com/ > [2] https://lore.kernel.org/bpf/20190906225434.3635421-1-yhs@fb.com/ > > Signed-off-by: Yonghong Song <yhs@fb.com> > Signed-off-by: Brian Vazquez <brianvv@google.com> > --- > kernel/bpf/hashtab.c | 242 +++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 242 insertions(+) > > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c > index 22066a62c8c97..fac107bdaf9ec 100644 > --- a/kernel/bpf/hashtab.c > +++ b/kernel/bpf/hashtab.c > @@ -17,6 +17,17 @@ > (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ > BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) > > +#define BATCH_OPS(_name) \ > + .map_lookup_batch = \ > + _name##_map_lookup_batch, \ > + .map_lookup_and_delete_batch = \ > + _name##_map_lookup_and_delete_batch, \ > + .map_update_batch = \ > + generic_map_update_batch, \ > + .map_delete_batch = \ > + generic_map_delete_batch > + > + > struct bucket { > struct hlist_nulls_head head; > raw_spinlock_t lock; > @@ -1232,6 +1243,233 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, > rcu_read_unlock(); > } > > +static int > +__htab_map_lookup_and_delete_batch(struct bpf_map *map, > + const union bpf_attr *attr, > + union bpf_attr __user *uattr, > + bool do_delete, bool is_lru_map, > + bool is_percpu) > +{ > + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); > + u32 bucket_cnt, total, key_size, value_size, roundup_key_size; > + void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; > + void __user *uvalues = u64_to_user_ptr(attr->batch.values); > + void __user *ukeys = u64_to_user_ptr(attr->batch.keys); > + void *ubatch = u64_to_user_ptr(attr->batch.in_batch); > + u64 elem_map_flags, map_flags; > + struct hlist_nulls_head *head; > + u32 batch, max_count, size; > + struct hlist_nulls_node *n; > + unsigned long flags; > + struct htab_elem *l; > + struct bucket *b; > + int ret = 0; > + > + max_count = attr->batch.count; > + if (!max_count) > + return 0; > + > + elem_map_flags = attr->batch.elem_flags; > + if ((elem_map_flags & ~BPF_F_LOCK) || > + ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) > + return -EINVAL; > + > + map_flags = attr->batch.flags; > + if (map_flags) > + return -EINVAL; > + > + batch = 0; > + if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch))) > + return 
-EFAULT; > + > + if (batch >= htab->n_buckets) > + return -ENOENT; > + > + /* We cannot do copy_from_user or copy_to_user inside > + * the rcu_read_lock. Allocate enough space here. > + */ > + key_size = htab->map.key_size; > + roundup_key_size = round_up(htab->map.key_size, 8); > + value_size = htab->map.value_size; > + size = round_up(value_size, 8); > + if (is_percpu) > + value_size = size * num_possible_cpus(); > + keys = kvmalloc(key_size, GFP_USER | __GFP_NOWARN); > + values = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); > + if (!keys || !values) { > + ret = -ENOMEM; > + goto out; > + } > + > + dst_key = keys; > + dst_val = values; > + total = 0; > + > + preempt_disable(); > + this_cpu_inc(bpf_prog_active); > + rcu_read_lock(); > + > +again: > + b = &htab->buckets[batch]; > + head = &b->head; > + raw_spin_lock_irqsave(&b->lock, flags); > + > + bucket_cnt = 0; > + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) > + bucket_cnt++; > + > + if (bucket_cnt > (max_count - total)) { > + if (total == 0) > + ret = -ENOSPC; > + goto after_loop; > + } > + > + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { > + memcpy(dst_key, l->key, key_size); > + > + if (is_percpu) { > + int off = 0, cpu; > + void __percpu *pptr; > + > + pptr = htab_elem_get_ptr(l, map->key_size); > + for_each_possible_cpu(cpu) { > + bpf_long_memcpy(dst_val + off, > + per_cpu_ptr(pptr, cpu), size); > + off += size; > + } > + } else { > + value = l->key + roundup_key_size; > + if (elem_map_flags & BPF_F_LOCK) > + copy_map_value_locked(map, dst_val, value, > + true); > + else > + copy_map_value(map, dst_val, value); > + check_and_init_map_lock(map, dst_val); > + } > + if (do_delete) { > + hlist_nulls_del_rcu(&l->hash_node); > + if (is_lru_map) > + bpf_lru_push_free(&htab->lru, &l->lru_node); > + else > + free_htab_elem(htab, l); > + } > + if (copy_to_user(ukeys + total * key_size, keys, key_size) || > + copy_to_user(uvalues + total * value_size, values, > + value_size)) { We cannot do copy_to_user inside atomic region where irq is disabled with raw_spin_lock_irqsave(). We could do the following: . we kalloc memory before preempt_disable() with the current count of bucket size. . inside the raw_spin_lock_irqsave() region, we can do copy to kernel memory. . inside the raw_spin_lock_irqsave() region, if the bucket size changes, we can have a few retries to increase allocation size before giving up. Do you think this may work? > + ret = -EFAULT; > + goto after_loop; > + } > + total++; > + } > + > + batch++; > + if (batch >= htab->n_buckets) { > + ret = -ENOENT; > + goto after_loop; > + } > + > + raw_spin_unlock_irqrestore(&b->lock, flags); > + goto again; > + > +after_loop: > + raw_spin_unlock_irqrestore(&b->lock, flags); > + > + rcu_read_unlock(); > + this_cpu_dec(bpf_prog_active); > + preempt_enable(); > + > + if (ret && ret != -ENOENT) > + goto out; > + > + /* copy data back to user */ > + ubatch = u64_to_user_ptr(attr->batch.out_batch); > + if (copy_to_user(ubatch, &batch, sizeof(batch)) || > + put_user(total, &uattr->batch.count)) > + ret = -EFAULT; > + > +out: > + kvfree(keys); > + kvfree(values); > + return ret; > +} > + [...]
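To make the copy_to_user-in-atomic-context concern concrete, below is a rough sketch of the per-bucket restructuring suggested above: size kernel staging buffers for the current bucket outside the locks, copy the bucket into them while the bucket lock (and the preempt/RCU protection) is held, and only then call copy_to_user(). This is not the actual patch; the names bucket_keys, bucket_vals, alloc_cnt, cur, retries and MAX_BUCKET_RETRIES as well as the -EBUSY give-up code are invented for illustration, and the percpu, BPF_F_LOCK, do_delete and ENOSPC handling of the real function is left out.

/* Sketch only -- not the actual patch.  Invented locals:
 *   void *bucket_keys = NULL, *bucket_vals = NULL;
 *   u32 bucket_cnt = 0, alloc_cnt = 0, cur, retries = 0;
 */
realloc:
	/* (re)allocate staging buffers for one bucket, outside any lock;
	 * bucket_cnt starts at 0, so the first locked count below triggers
	 * the initial allocation via the retry path.
	 */
	if (bucket_cnt > alloc_cnt) {
		kvfree(bucket_keys);
		kvfree(bucket_vals);
		bucket_keys = kvmalloc(bucket_cnt * key_size,
				       GFP_USER | __GFP_NOWARN);
		bucket_vals = kvmalloc(bucket_cnt * value_size,
				       GFP_USER | __GFP_NOWARN);
		if (!bucket_keys || !bucket_vals) {
			ret = -ENOMEM;
			goto out;
		}
		alloc_cnt = bucket_cnt;
	}

	/* the non-faulting window now covers a single bucket */
	preempt_disable();
	this_cpu_inc(bpf_prog_active);
	rcu_read_lock();
	raw_spin_lock_irqsave(&b->lock, flags);

	bucket_cnt = 0;
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		bucket_cnt++;

	if (bucket_cnt > alloc_cnt) {
		/* bucket grew since the buffers were sized: drop the locks,
		 * enlarge the buffers and retry a bounded number of times.
		 */
		raw_spin_unlock_irqrestore(&b->lock, flags);
		rcu_read_unlock();
		this_cpu_dec(bpf_prog_active);
		preempt_enable();
		if (retries++ < MAX_BUCKET_RETRIES)
			goto realloc;
		ret = -EBUSY;
		goto out;
	}

	cur = 0;
	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
		memcpy(bucket_keys + cur * key_size, l->key, key_size);
		memcpy(bucket_vals + cur * value_size,
		       l->key + roundup_key_size, value_size);
		cur++;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	rcu_read_unlock();
	this_cpu_dec(bpf_prog_active);
	preempt_enable();

	/* faulting is allowed again: push the staged bucket to user space */
	if (copy_to_user(ukeys + total * key_size, bucket_keys,
			 cur * key_size) ||
	    copy_to_user(uvalues + total * value_size, bucket_vals,
			 cur * value_size)) {
		ret = -EFAULT;
		goto out;
	}
	total += cur;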
On Fri, Dec 13, 2019 at 12:58 PM Yonghong Song <yhs@fb.com> wrote: > > > > On 12/11/19 2:33 PM, Brian Vazquez wrote: > > From: Yonghong Song <yhs@fb.com> > > > > htab can't use generic batch support due some problematic behaviours > > inherent to the data structre, i.e. while iterating the bpf map a > > concurrent program might delete the next entry that batch was about to > > use, in that case there's no easy solution to retrieve the next entry, > > the issue has been discussed multiple times (see [1] and [2]). > > > > The only way hmap can be traversed without the problem previously > > exposed is by making sure that the map is traversing entire buckets. > > This commit implements those strict requirements for hmap, the > > implementation follows the same interaction that generic support with > > some exceptions: > > > > - If keys/values buffer are not big enough to traverse a bucket, > > ENOSPC will be returned. > > - out_batch contains the value of the next bucket in the iteration, not > > the next key, but this is transparent for the user since the user > > should never use out_batch for other than bpf batch syscalls. > > > > Note that only lookup and lookup_and_delete batch ops require the hmap > > specific implementation, update/delete batch ops can be the generic > > ones. > > > > [1] https://lore.kernel.org/bpf/20190724165803.87470-1-brianvv@google.com/ > > [2] https://lore.kernel.org/bpf/20190906225434.3635421-1-yhs@fb.com/ > > > > Signed-off-by: Yonghong Song <yhs@fb.com> > > Signed-off-by: Brian Vazquez <brianvv@google.com> > > --- > > kernel/bpf/hashtab.c | 242 +++++++++++++++++++++++++++++++++++++++++++ > > 1 file changed, 242 insertions(+) > > > > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c > > index 22066a62c8c97..fac107bdaf9ec 100644 > > --- a/kernel/bpf/hashtab.c > > +++ b/kernel/bpf/hashtab.c > > @@ -17,6 +17,17 @@ > > (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ > > BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) > > > > +#define BATCH_OPS(_name) \ > > + .map_lookup_batch = \ > > + _name##_map_lookup_batch, \ > > + .map_lookup_and_delete_batch = \ > > + _name##_map_lookup_and_delete_batch, \ > > + .map_update_batch = \ > > + generic_map_update_batch, \ > > + .map_delete_batch = \ > > + generic_map_delete_batch > > + > > + > > struct bucket { > > struct hlist_nulls_head head; > > raw_spinlock_t lock; > > @@ -1232,6 +1243,233 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, > > rcu_read_unlock(); > > } > > > > +static int > > +__htab_map_lookup_and_delete_batch(struct bpf_map *map, > > + const union bpf_attr *attr, > > + union bpf_attr __user *uattr, > > + bool do_delete, bool is_lru_map, > > + bool is_percpu) > > +{ > > + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); > > + u32 bucket_cnt, total, key_size, value_size, roundup_key_size; > > + void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; > > + void __user *uvalues = u64_to_user_ptr(attr->batch.values); > > + void __user *ukeys = u64_to_user_ptr(attr->batch.keys); > > + void *ubatch = u64_to_user_ptr(attr->batch.in_batch); > > + u64 elem_map_flags, map_flags; > > + struct hlist_nulls_head *head; > > + u32 batch, max_count, size; > > + struct hlist_nulls_node *n; > > + unsigned long flags; > > + struct htab_elem *l; > > + struct bucket *b; > > + int ret = 0; > > + > > + max_count = attr->batch.count; > > + if (!max_count) > > + return 0; > > + > > + elem_map_flags = attr->batch.elem_flags; > > + if ((elem_map_flags & ~BPF_F_LOCK) || > > + 
((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) > > + return -EINVAL; > > + > > + map_flags = attr->batch.flags; > > + if (map_flags) > > + return -EINVAL; > > + > > + batch = 0; > > + if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch))) > > + return -EFAULT; > > + > > + if (batch >= htab->n_buckets) > > + return -ENOENT; > > + > > + /* We cannot do copy_from_user or copy_to_user inside > > + * the rcu_read_lock. Allocate enough space here. > > + */ > > + key_size = htab->map.key_size; > > + roundup_key_size = round_up(htab->map.key_size, 8); > > + value_size = htab->map.value_size; > > + size = round_up(value_size, 8); > > + if (is_percpu) > > + value_size = size * num_possible_cpus(); > > + keys = kvmalloc(key_size, GFP_USER | __GFP_NOWARN); > > + values = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); > > + if (!keys || !values) { > > + ret = -ENOMEM; > > + goto out; > > + } > > + > > + dst_key = keys; > > + dst_val = values; > > + total = 0; > > + > > + preempt_disable(); > > + this_cpu_inc(bpf_prog_active); > > + rcu_read_lock(); > > + > > +again: > > + b = &htab->buckets[batch]; > > + head = &b->head; > > + raw_spin_lock_irqsave(&b->lock, flags); > > + > > + bucket_cnt = 0; > > + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) > > + bucket_cnt++; > > + > > + if (bucket_cnt > (max_count - total)) { > > + if (total == 0) > > + ret = -ENOSPC; > > + goto after_loop; > > + } > > + > > + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { > > + memcpy(dst_key, l->key, key_size); > > + > > + if (is_percpu) { > > + int off = 0, cpu; > > + void __percpu *pptr; > > + > > + pptr = htab_elem_get_ptr(l, map->key_size); > > + for_each_possible_cpu(cpu) { > > + bpf_long_memcpy(dst_val + off, > > + per_cpu_ptr(pptr, cpu), size); > > + off += size; > > + } > > + } else { > > + value = l->key + roundup_key_size; > > + if (elem_map_flags & BPF_F_LOCK) > > + copy_map_value_locked(map, dst_val, value, > > + true); > > + else > > + copy_map_value(map, dst_val, value); > > + check_and_init_map_lock(map, dst_val); > > + } > > + if (do_delete) { > > + hlist_nulls_del_rcu(&l->hash_node); > > + if (is_lru_map) > > + bpf_lru_push_free(&htab->lru, &l->lru_node); > > + else > > + free_htab_elem(htab, l); > > + } > > + if (copy_to_user(ukeys + total * key_size, keys, key_size) || > > + copy_to_user(uvalues + total * value_size, values, > > + value_size)) { > > We cannot do copy_to_user inside atomic region where irq is disabled > with raw_spin_lock_irqsave(). We could do the following: > . we kalloc memory before preempt_disable() with the current count > of bucket size. > . inside the raw_spin_lock_irqsave() region, we can do copy to kernel > memory. > . inside the raw_spin_lock_irqsave() region, if the bucket size > changes, we can have a few retries to increase allocation size > before giving up. > Do you think this may work? Yes, it does. What should be the initial value for the allocated memory max_entries/2? Do you see any issue if we just kalloc the entire buffer? 
> > > + ret = -EFAULT; > > + goto after_loop; > > + } > > + total++; > > + } > > + > > + batch++; > > + if (batch >= htab->n_buckets) { > > + ret = -ENOENT; > > + goto after_loop; > > + } > > + > > + raw_spin_unlock_irqrestore(&b->lock, flags); > > + goto again; > > + > > +after_loop: > > + raw_spin_unlock_irqrestore(&b->lock, flags); > > + > > + rcu_read_unlock(); > > + this_cpu_dec(bpf_prog_active); > > + preempt_enable(); > > + > > + if (ret && ret != -ENOENT) > > + goto out; > > + > > + /* copy data back to user */ > > + ubatch = u64_to_user_ptr(attr->batch.out_batch); > > + if (copy_to_user(ubatch, &batch, sizeof(batch)) || > > + put_user(total, &uattr->batch.count)) > > + ret = -EFAULT; > > + > > +out: > > + kvfree(keys); > > + kvfree(values); > > + return ret; > > +} > > + > [...]
On 1/6/20 11:02 PM, Brian Vazquez wrote: > On Fri, Dec 13, 2019 at 12:58 PM Yonghong Song <yhs@fb.com> wrote: >> >> >> >> On 12/11/19 2:33 PM, Brian Vazquez wrote: >>> From: Yonghong Song <yhs@fb.com> >>> >>> htab can't use generic batch support due some problematic behaviours >>> inherent to the data structre, i.e. while iterating the bpf map a >>> concurrent program might delete the next entry that batch was about to >>> use, in that case there's no easy solution to retrieve the next entry, >>> the issue has been discussed multiple times (see [1] and [2]). >>> >>> The only way hmap can be traversed without the problem previously >>> exposed is by making sure that the map is traversing entire buckets. >>> This commit implements those strict requirements for hmap, the >>> implementation follows the same interaction that generic support with >>> some exceptions: >>> >>> - If keys/values buffer are not big enough to traverse a bucket, >>> ENOSPC will be returned. >>> - out_batch contains the value of the next bucket in the iteration, not >>> the next key, but this is transparent for the user since the user >>> should never use out_batch for other than bpf batch syscalls. >>> >>> Note that only lookup and lookup_and_delete batch ops require the hmap >>> specific implementation, update/delete batch ops can be the generic >>> ones. >>> >>> [1] https://lore.kernel.org/bpf/20190724165803.87470-1-brianvv@google.com/ >>> [2] https://lore.kernel.org/bpf/20190906225434.3635421-1-yhs@fb.com/ >>> >>> Signed-off-by: Yonghong Song <yhs@fb.com> >>> Signed-off-by: Brian Vazquez <brianvv@google.com> >>> --- >>> kernel/bpf/hashtab.c | 242 +++++++++++++++++++++++++++++++++++++++++++ >>> 1 file changed, 242 insertions(+) >>> >>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c >>> index 22066a62c8c97..fac107bdaf9ec 100644 >>> --- a/kernel/bpf/hashtab.c >>> +++ b/kernel/bpf/hashtab.c >>> @@ -17,6 +17,17 @@ >>> (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ >>> BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) >>> >>> +#define BATCH_OPS(_name) \ >>> + .map_lookup_batch = \ >>> + _name##_map_lookup_batch, \ >>> + .map_lookup_and_delete_batch = \ >>> + _name##_map_lookup_and_delete_batch, \ >>> + .map_update_batch = \ >>> + generic_map_update_batch, \ >>> + .map_delete_batch = \ >>> + generic_map_delete_batch >>> + >>> + >>> struct bucket { >>> struct hlist_nulls_head head; >>> raw_spinlock_t lock; >>> @@ -1232,6 +1243,233 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, >>> rcu_read_unlock(); >>> } >>> >>> +static int >>> +__htab_map_lookup_and_delete_batch(struct bpf_map *map, >>> + const union bpf_attr *attr, >>> + union bpf_attr __user *uattr, >>> + bool do_delete, bool is_lru_map, >>> + bool is_percpu) >>> +{ >>> + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); >>> + u32 bucket_cnt, total, key_size, value_size, roundup_key_size; >>> + void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; >>> + void __user *uvalues = u64_to_user_ptr(attr->batch.values); >>> + void __user *ukeys = u64_to_user_ptr(attr->batch.keys); >>> + void *ubatch = u64_to_user_ptr(attr->batch.in_batch); >>> + u64 elem_map_flags, map_flags; >>> + struct hlist_nulls_head *head; >>> + u32 batch, max_count, size; >>> + struct hlist_nulls_node *n; >>> + unsigned long flags; >>> + struct htab_elem *l; >>> + struct bucket *b; >>> + int ret = 0; >>> + >>> + max_count = attr->batch.count; >>> + if (!max_count) >>> + return 0; >>> + >>> + elem_map_flags = attr->batch.elem_flags; >>> + if 
((elem_map_flags & ~BPF_F_LOCK) || >>> + ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) >>> + return -EINVAL; >>> + >>> + map_flags = attr->batch.flags; >>> + if (map_flags) >>> + return -EINVAL; >>> + >>> + batch = 0; >>> + if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch))) >>> + return -EFAULT; >>> + >>> + if (batch >= htab->n_buckets) >>> + return -ENOENT; >>> + >>> + /* We cannot do copy_from_user or copy_to_user inside >>> + * the rcu_read_lock. Allocate enough space here. >>> + */ >>> + key_size = htab->map.key_size; >>> + roundup_key_size = round_up(htab->map.key_size, 8); >>> + value_size = htab->map.value_size; >>> + size = round_up(value_size, 8); >>> + if (is_percpu) >>> + value_size = size * num_possible_cpus(); >>> + keys = kvmalloc(key_size, GFP_USER | __GFP_NOWARN); >>> + values = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); >>> + if (!keys || !values) { >>> + ret = -ENOMEM; >>> + goto out; >>> + } >>> + >>> + dst_key = keys; >>> + dst_val = values; >>> + total = 0; >>> + >>> + preempt_disable(); >>> + this_cpu_inc(bpf_prog_active); >>> + rcu_read_lock(); >>> + >>> +again: >>> + b = &htab->buckets[batch]; >>> + head = &b->head; >>> + raw_spin_lock_irqsave(&b->lock, flags); >>> + >>> + bucket_cnt = 0; >>> + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) >>> + bucket_cnt++; >>> + >>> + if (bucket_cnt > (max_count - total)) { >>> + if (total == 0) >>> + ret = -ENOSPC; >>> + goto after_loop; >>> + } >>> + >>> + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { >>> + memcpy(dst_key, l->key, key_size); >>> + >>> + if (is_percpu) { >>> + int off = 0, cpu; >>> + void __percpu *pptr; >>> + >>> + pptr = htab_elem_get_ptr(l, map->key_size); >>> + for_each_possible_cpu(cpu) { >>> + bpf_long_memcpy(dst_val + off, >>> + per_cpu_ptr(pptr, cpu), size); >>> + off += size; >>> + } >>> + } else { >>> + value = l->key + roundup_key_size; >>> + if (elem_map_flags & BPF_F_LOCK) >>> + copy_map_value_locked(map, dst_val, value, >>> + true); >>> + else >>> + copy_map_value(map, dst_val, value); >>> + check_and_init_map_lock(map, dst_val); >>> + } >>> + if (do_delete) { >>> + hlist_nulls_del_rcu(&l->hash_node); >>> + if (is_lru_map) >>> + bpf_lru_push_free(&htab->lru, &l->lru_node); >>> + else >>> + free_htab_elem(htab, l); >>> + } >>> + if (copy_to_user(ukeys + total * key_size, keys, key_size) || >>> + copy_to_user(uvalues + total * value_size, values, >>> + value_size)) { >> >> We cannot do copy_to_user inside atomic region where irq is disabled >> with raw_spin_lock_irqsave(). We could do the following: >> . we kalloc memory before preempt_disable() with the current count >> of bucket size. >> . inside the raw_spin_lock_irqsave() region, we can do copy to kernel >> memory. >> . inside the raw_spin_lock_irqsave() region, if the bucket size >> changes, we can have a few retries to increase allocation size >> before giving up. >> Do you think this may work? > > Yes, it does. > > What should be the initial value for the allocated memory > max_entries/2? Do you see any issue if we just kalloc the entire > buffer? Allocating max_entries/2 or entire buffer risks allocating too much memory from the system, which may not be a good thing in a production system. That is why I proposed to allocate memory at bucket level. For a reasonable balanced hash table, this should not cause large memory pressure on the host. What do you think? 
> >> >>> + ret = -EFAULT; >>> + goto after_loop; >>> + } >>> + total++; >>> + } >>> + >>> + batch++; >>> + if (batch >= htab->n_buckets) { >>> + ret = -ENOENT; >>> + goto after_loop; >>> + } >>> + >>> + raw_spin_unlock_irqrestore(&b->lock, flags); >>> + goto again; >>> + >>> +after_loop: >>> + raw_spin_unlock_irqrestore(&b->lock, flags); >>> + >>> + rcu_read_unlock(); >>> + this_cpu_dec(bpf_prog_active); >>> + preempt_enable(); >>> + >>> + if (ret && ret != -ENOENT) >>> + goto out; >>> + >>> + /* copy data back to user */ >>> + ubatch = u64_to_user_ptr(attr->batch.out_batch); >>> + if (copy_to_user(ubatch, &batch, sizeof(batch)) || >>> + put_user(total, &uattr->batch.count)) >>> + ret = -EFAULT; >>> + >>> +out: >>> + kvfree(keys); >>> + kvfree(values); >>> + return ret; >>> +} >>> + >> [...]
On Tue, Jan 7, 2020 at 12:18 PM Yonghong Song <yhs@fb.com> wrote: > > > > On 1/6/20 11:02 PM, Brian Vazquez wrote: > > On Fri, Dec 13, 2019 at 12:58 PM Yonghong Song <yhs@fb.com> wrote: > >> > >> > >> > >> On 12/11/19 2:33 PM, Brian Vazquez wrote: > >>> From: Yonghong Song <yhs@fb.com> > >>> > >>> htab can't use generic batch support due some problematic behaviours > >>> inherent to the data structre, i.e. while iterating the bpf map a > >>> concurrent program might delete the next entry that batch was about to > >>> use, in that case there's no easy solution to retrieve the next entry, > >>> the issue has been discussed multiple times (see [1] and [2]). > >>> > >>> The only way hmap can be traversed without the problem previously > >>> exposed is by making sure that the map is traversing entire buckets. > >>> This commit implements those strict requirements for hmap, the > >>> implementation follows the same interaction that generic support with > >>> some exceptions: > >>> > >>> - If keys/values buffer are not big enough to traverse a bucket, > >>> ENOSPC will be returned. > >>> - out_batch contains the value of the next bucket in the iteration, not > >>> the next key, but this is transparent for the user since the user > >>> should never use out_batch for other than bpf batch syscalls. > >>> > >>> Note that only lookup and lookup_and_delete batch ops require the hmap > >>> specific implementation, update/delete batch ops can be the generic > >>> ones. > >>> > >>> [1] https://lore.kernel.org/bpf/20190724165803.87470-1-brianvv@google.com/ > >>> [2] https://lore.kernel.org/bpf/20190906225434.3635421-1-yhs@fb.com/ > >>> > >>> Signed-off-by: Yonghong Song <yhs@fb.com> > >>> Signed-off-by: Brian Vazquez <brianvv@google.com> > >>> --- > >>> kernel/bpf/hashtab.c | 242 +++++++++++++++++++++++++++++++++++++++++++ > >>> 1 file changed, 242 insertions(+) > >>> > >>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c > >>> index 22066a62c8c97..fac107bdaf9ec 100644 > >>> --- a/kernel/bpf/hashtab.c > >>> +++ b/kernel/bpf/hashtab.c > >>> @@ -17,6 +17,17 @@ > >>> (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ > >>> BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) > >>> > >>> +#define BATCH_OPS(_name) \ > >>> + .map_lookup_batch = \ > >>> + _name##_map_lookup_batch, \ > >>> + .map_lookup_and_delete_batch = \ > >>> + _name##_map_lookup_and_delete_batch, \ > >>> + .map_update_batch = \ > >>> + generic_map_update_batch, \ > >>> + .map_delete_batch = \ > >>> + generic_map_delete_batch > >>> + > >>> + > >>> struct bucket { > >>> struct hlist_nulls_head head; > >>> raw_spinlock_t lock; > >>> @@ -1232,6 +1243,233 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, > >>> rcu_read_unlock(); > >>> } > >>> > >>> +static int > >>> +__htab_map_lookup_and_delete_batch(struct bpf_map *map, > >>> + const union bpf_attr *attr, > >>> + union bpf_attr __user *uattr, > >>> + bool do_delete, bool is_lru_map, > >>> + bool is_percpu) > >>> +{ > >>> + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); > >>> + u32 bucket_cnt, total, key_size, value_size, roundup_key_size; > >>> + void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; > >>> + void __user *uvalues = u64_to_user_ptr(attr->batch.values); > >>> + void __user *ukeys = u64_to_user_ptr(attr->batch.keys); > >>> + void *ubatch = u64_to_user_ptr(attr->batch.in_batch); > >>> + u64 elem_map_flags, map_flags; > >>> + struct hlist_nulls_head *head; > >>> + u32 batch, max_count, size; > >>> + struct hlist_nulls_node *n; > >>> + 
unsigned long flags; > >>> + struct htab_elem *l; > >>> + struct bucket *b; > >>> + int ret = 0; > >>> + > >>> + max_count = attr->batch.count; > >>> + if (!max_count) > >>> + return 0; > >>> + > >>> + elem_map_flags = attr->batch.elem_flags; > >>> + if ((elem_map_flags & ~BPF_F_LOCK) || > >>> + ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) > >>> + return -EINVAL; > >>> + > >>> + map_flags = attr->batch.flags; > >>> + if (map_flags) > >>> + return -EINVAL; > >>> + > >>> + batch = 0; > >>> + if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch))) > >>> + return -EFAULT; > >>> + > >>> + if (batch >= htab->n_buckets) > >>> + return -ENOENT; > >>> + > >>> + /* We cannot do copy_from_user or copy_to_user inside > >>> + * the rcu_read_lock. Allocate enough space here. > >>> + */ > >>> + key_size = htab->map.key_size; > >>> + roundup_key_size = round_up(htab->map.key_size, 8); > >>> + value_size = htab->map.value_size; > >>> + size = round_up(value_size, 8); > >>> + if (is_percpu) > >>> + value_size = size * num_possible_cpus(); > >>> + keys = kvmalloc(key_size, GFP_USER | __GFP_NOWARN); > >>> + values = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); > >>> + if (!keys || !values) { > >>> + ret = -ENOMEM; > >>> + goto out; > >>> + } > >>> + > >>> + dst_key = keys; > >>> + dst_val = values; > >>> + total = 0; > >>> + > >>> + preempt_disable(); > >>> + this_cpu_inc(bpf_prog_active); > >>> + rcu_read_lock(); > >>> + > >>> +again: > >>> + b = &htab->buckets[batch]; > >>> + head = &b->head; > >>> + raw_spin_lock_irqsave(&b->lock, flags); > >>> + > >>> + bucket_cnt = 0; > >>> + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) > >>> + bucket_cnt++; > >>> + > >>> + if (bucket_cnt > (max_count - total)) { > >>> + if (total == 0) > >>> + ret = -ENOSPC; > >>> + goto after_loop; > >>> + } > >>> + > >>> + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { > >>> + memcpy(dst_key, l->key, key_size); > >>> + > >>> + if (is_percpu) { > >>> + int off = 0, cpu; > >>> + void __percpu *pptr; > >>> + > >>> + pptr = htab_elem_get_ptr(l, map->key_size); > >>> + for_each_possible_cpu(cpu) { > >>> + bpf_long_memcpy(dst_val + off, > >>> + per_cpu_ptr(pptr, cpu), size); > >>> + off += size; > >>> + } > >>> + } else { > >>> + value = l->key + roundup_key_size; > >>> + if (elem_map_flags & BPF_F_LOCK) > >>> + copy_map_value_locked(map, dst_val, value, > >>> + true); > >>> + else > >>> + copy_map_value(map, dst_val, value); > >>> + check_and_init_map_lock(map, dst_val); > >>> + } > >>> + if (do_delete) { > >>> + hlist_nulls_del_rcu(&l->hash_node); > >>> + if (is_lru_map) > >>> + bpf_lru_push_free(&htab->lru, &l->lru_node); > >>> + else > >>> + free_htab_elem(htab, l); > >>> + } > >>> + if (copy_to_user(ukeys + total * key_size, keys, key_size) || > >>> + copy_to_user(uvalues + total * value_size, values, > >>> + value_size)) { > >> > >> We cannot do copy_to_user inside atomic region where irq is disabled > >> with raw_spin_lock_irqsave(). We could do the following: > >> . we kalloc memory before preempt_disable() with the current count > >> of bucket size. > >> . inside the raw_spin_lock_irqsave() region, we can do copy to kernel > >> memory. > >> . inside the raw_spin_lock_irqsave() region, if the bucket size > >> changes, we can have a few retries to increase allocation size > >> before giving up. > >> Do you think this may work? > > > > Yes, it does. > > > > What should be the initial value for the allocated memory > > max_entries/2? 
Do you see any issue if we just kalloc the entire > > buffer? > > Allocating max_entries/2 or entire buffer risks allocating too much > memory from the system, which may not be a good thing in a production > system. That is why I proposed to allocate memory at bucket level. > For a reasonable balanced hash table, this should not cause large > memory pressure on the host. What do you think? Sounds reasonable, I'll do that! Thanks for the feedback! > > > > >> > >>> + ret = -EFAULT; > >>> + goto after_loop; > >>> + } > >>> + total++; > >>> + } > >>> + > >>> + batch++; > >>> + if (batch >= htab->n_buckets) { > >>> + ret = -ENOENT; > >>> + goto after_loop; > >>> + } > >>> + > >>> + raw_spin_unlock_irqrestore(&b->lock, flags); > >>> + goto again; > >>> + > >>> +after_loop: > >>> + raw_spin_unlock_irqrestore(&b->lock, flags); > >>> + > >>> + rcu_read_unlock(); > >>> + this_cpu_dec(bpf_prog_active); > >>> + preempt_enable(); > >>> + > >>> + if (ret && ret != -ENOENT) > >>> + goto out; > >>> + > >>> + /* copy data back to user */ > >>> + ubatch = u64_to_user_ptr(attr->batch.out_batch); > >>> + if (copy_to_user(ubatch, &batch, sizeof(batch)) || > >>> + put_user(total, &uattr->batch.count)) > >>> + ret = -EFAULT; > >>> + > >>> +out: > >>> + kvfree(keys); > >>> + kvfree(values); > >>> + return ret; > >>> +} > >>> + > >> [...]
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 22066a62c8c97..fac107bdaf9ec 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -17,6 +17,17 @@ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) +#define BATCH_OPS(_name) \ + .map_lookup_batch = \ + _name##_map_lookup_batch, \ + .map_lookup_and_delete_batch = \ + _name##_map_lookup_and_delete_batch, \ + .map_update_batch = \ + generic_map_update_batch, \ + .map_delete_batch = \ + generic_map_delete_batch + + struct bucket { struct hlist_nulls_head head; raw_spinlock_t lock; @@ -1232,6 +1243,233 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static int +__htab_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr, + bool do_delete, bool is_lru_map, + bool is_percpu) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 bucket_cnt, total, key_size, value_size, roundup_key_size; + void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; + void __user *uvalues = u64_to_user_ptr(attr->batch.values); + void __user *ukeys = u64_to_user_ptr(attr->batch.keys); + void *ubatch = u64_to_user_ptr(attr->batch.in_batch); + u64 elem_map_flags, map_flags; + struct hlist_nulls_head *head; + u32 batch, max_count, size; + struct hlist_nulls_node *n; + unsigned long flags; + struct htab_elem *l; + struct bucket *b; + int ret = 0; + + max_count = attr->batch.count; + if (!max_count) + return 0; + + elem_map_flags = attr->batch.elem_flags; + if ((elem_map_flags & ~BPF_F_LOCK) || + ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) + return -EINVAL; + + map_flags = attr->batch.flags; + if (map_flags) + return -EINVAL; + + batch = 0; + if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch))) + return -EFAULT; + + if (batch >= htab->n_buckets) + return -ENOENT; + + /* We cannot do copy_from_user or copy_to_user inside + * the rcu_read_lock. Allocate enough space here. 
+ */ + key_size = htab->map.key_size; + roundup_key_size = round_up(htab->map.key_size, 8); + value_size = htab->map.value_size; + size = round_up(value_size, 8); + if (is_percpu) + value_size = size * num_possible_cpus(); + keys = kvmalloc(key_size, GFP_USER | __GFP_NOWARN); + values = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); + if (!keys || !values) { + ret = -ENOMEM; + goto out; + } + + dst_key = keys; + dst_val = values; + total = 0; + + preempt_disable(); + this_cpu_inc(bpf_prog_active); + rcu_read_lock(); + +again: + b = &htab->buckets[batch]; + head = &b->head; + raw_spin_lock_irqsave(&b->lock, flags); + + bucket_cnt = 0; + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) + bucket_cnt++; + + if (bucket_cnt > (max_count - total)) { + if (total == 0) + ret = -ENOSPC; + goto after_loop; + } + + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + memcpy(dst_key, l->key, key_size); + + if (is_percpu) { + int off = 0, cpu; + void __percpu *pptr; + + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(dst_val + off, + per_cpu_ptr(pptr, cpu), size); + off += size; + } + } else { + value = l->key + roundup_key_size; + if (elem_map_flags & BPF_F_LOCK) + copy_map_value_locked(map, dst_val, value, + true); + else + copy_map_value(map, dst_val, value); + check_and_init_map_lock(map, dst_val); + } + if (do_delete) { + hlist_nulls_del_rcu(&l->hash_node); + if (is_lru_map) + bpf_lru_push_free(&htab->lru, &l->lru_node); + else + free_htab_elem(htab, l); + } + if (copy_to_user(ukeys + total * key_size, keys, key_size) || + copy_to_user(uvalues + total * value_size, values, + value_size)) { + ret = -EFAULT; + goto after_loop; + } + total++; + } + + batch++; + if (batch >= htab->n_buckets) { + ret = -ENOENT; + goto after_loop; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + goto again; + +after_loop: + raw_spin_unlock_irqrestore(&b->lock, flags); + + rcu_read_unlock(); + this_cpu_dec(bpf_prog_active); + preempt_enable(); + + if (ret && ret != -ENOENT) + goto out; + + /* copy data back to user */ + ubatch = u64_to_user_ptr(attr->batch.out_batch); + if (copy_to_user(ubatch, &batch, sizeof(batch)) || + put_user(total, &uattr->batch.count)) + ret = -EFAULT; + +out: + kvfree(keys); + kvfree(values); + return ret; +} + +static int +htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + false, true); +} + +static int +htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + false, true); +} + +static int +htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + false, false); +} + +static int +htab_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + false, false); +} + +static int +htab_map_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return generic_map_delete_batch(map, attr, uattr); +} + +static int +htab_lru_percpu_map_lookup_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, 
false, + true, true); +} + +static int +htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + true, true); +} + +static int +htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + true, false); +} + +static int +htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + true, false); +} + const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1242,6 +1480,7 @@ const struct bpf_map_ops htab_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + BATCH_OPS(htab), }; const struct bpf_map_ops htab_lru_map_ops = { @@ -1255,6 +1494,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + BATCH_OPS(htab_lru), }; /* Called from eBPF program */ @@ -1368,6 +1608,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + BATCH_OPS(htab_percpu), }; const struct bpf_map_ops htab_lru_percpu_map_ops = { @@ -1379,6 +1620,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + BATCH_OPS(htab_lru_percpu), }; static int fd_htab_map_alloc_check(union bpf_attr *attr)
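For reference, BATCH_OPS(htab) used in htab_map_ops above token-pastes the map-type prefix onto the two lookup callbacks and wires the generic helpers for update and delete, i.e. it expands to:

	.map_lookup_batch		= htab_map_lookup_batch,
	.map_lookup_and_delete_batch	= htab_map_lookup_and_delete_batch,
	.map_update_batch		= generic_map_update_batch,
	.map_delete_batch		= generic_map_delete_batch

The htab_lru, htab_percpu and htab_lru_percpu instantiations differ only in the prefix of the two lookup callbacks.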