Message ID:   20191211223344.165549-3-brianvv@google.com
State:        Changes Requested
Delegated to: BPF Maintainers
Series:       add bpf batch ops to process more than 1 elem
On 12/11/19 2:33 PM, Brian Vazquez wrote:
> This commit introduces generic support for the bpf_map_lookup_batch and
> bpf_map_lookup_and_delete_batch ops. This implementation can be used by
> almost all the bpf maps since its core implementation relies on the
> existing map_get_next_key, map_lookup_elem and map_delete_elem
> functions. The bpf syscall subcommands introduced are:
>
>    BPF_MAP_LOOKUP_BATCH
>    BPF_MAP_LOOKUP_AND_DELETE_BATCH
>
> The UAPI attribute is:
>
>    struct { /* struct used by BPF_MAP_*_BATCH commands */
>            __aligned_u64   in_batch;       /* start batch,
>                                             * NULL to start from beginning
>                                             */
>            __aligned_u64   out_batch;      /* output: next start batch */
>            __aligned_u64   keys;
>            __aligned_u64   values;
>            __u32           count;          /* input/output:
>                                             * input: # of key/value
>                                             * elements
>                                             * output: # of filled elements
>                                             */
>            __u32           map_fd;
>            __u64           elem_flags;
>            __u64           flags;
>    } batch;
>
> in_batch/out_batch are opaque values used to communicate between
> user/kernel space; in_batch/out_batch must be of key_size length.
>
> To start iterating from the beginning, in_batch must be NULL;
> count is the # of key/value elements to retrieve. Note that the 'keys'
> buffer must be a buffer of key_size * count size and the 'values' buffer
> must be value_size * count, where value_size must be aligned to 8 bytes
> by userspace if it's dealing with percpu maps. 'count' will contain the
> number of keys/values successfully retrieved. Note that 'count' is an
> input/output variable and it can contain a lower value after a call.
>
> If there are no more entries to retrieve, ENOENT will be returned. If the
> error is ENOENT, count might be > 0 in case it copied some values but
> there were no more entries to retrieve.
>
> Note that if the return code is an error and not -EFAULT,
> count indicates the number of elements successfully processed.
>
> Suggested-by: Stanislav Fomichev <sdf@google.com>
> Signed-off-by: Brian Vazquez <brianvv@google.com>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>   include/linux/bpf.h      |  11 +++
>   include/uapi/linux/bpf.h |  19 +++++
>   kernel/bpf/syscall.c     | 172 +++++++++++++++++++++++++++++++++++++++
>   3 files changed, 202 insertions(+)
[...]
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 2530266fa6477..708aa89fe2308 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1206,6 +1206,120 @@ static int map_get_next_key(union bpf_attr *attr)
>   	return err;
>   }
>
> +#define MAP_LOOKUP_RETRIES 3
> +
> +static int __generic_map_lookup_batch(struct bpf_map *map,
> +				      const union bpf_attr *attr,
> +				      union bpf_attr __user *uattr,
> +				      bool do_delete)
> +{
> +	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
> +	void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
> +	void __user *values = u64_to_user_ptr(attr->batch.values);
> +	void __user *keys = u64_to_user_ptr(attr->batch.keys);
> +	void *buf, *prev_key, *key, *value;
> +	u32 value_size, cp, max_count;
> +	bool first_key = false;
> +	int err, retry = MAP_LOOKUP_RETRIES;

Could you try to use reverse Christmas tree style declaration here?

> +
> +	if (attr->batch.elem_flags & ~BPF_F_LOCK)
> +		return -EINVAL;
> +
> +	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
> +	    !map_value_has_spin_lock(map))
> +		return -EINVAL;
> +
> +	value_size = bpf_map_value_size(map);
> +
> +	max_count = attr->batch.count;
> +	if (!max_count)
> +		return 0;
> +
> +	buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	err = -EFAULT;
> +	first_key = false;
> +	if (ubatch && copy_from_user(buf, ubatch, map->key_size))
> +		goto free_buf;
> +	key = buf;
> +	value = key + map->key_size;
> +	if (!ubatch) {
> +		prev_key = NULL;
> +		first_key = true;
> +	}
> +
> +	for (cp = 0; cp < max_count;) {
> +		if (cp || first_key) {
> +			rcu_read_lock();
> +			err = map->ops->map_get_next_key(map, prev_key, key);
> +			rcu_read_unlock();
> +			if (err)
> +				break;
> +		}
> +		err = bpf_map_copy_value(map, key, value,
> +					 attr->batch.elem_flags, do_delete);
> +
> +		if (err == -ENOENT) {
> +			if (retry) {
> +				retry--;
> +				continue;
> +			}
> +			err = -EINTR;
> +			break;
> +		}
> +
> +		if (err)
> +			goto free_buf;
> +
> +		if (copy_to_user(keys + cp * map->key_size, key,
> +				 map->key_size)) {
> +			err = -EFAULT;
> +			goto free_buf;
> +		}
> +		if (copy_to_user(values + cp * value_size, value, value_size)) {
> +			err = -EFAULT;
> +			goto free_buf;
> +		}
> +
> +		prev_key = key;
> +		retry = MAP_LOOKUP_RETRIES;
> +		cp++;
> +	}
> +
> +	if (!err) {
> +		rcu_read_lock();
> +		err = map->ops->map_get_next_key(map, prev_key, key);
> +		rcu_read_unlock();
> +	}
> +
> +	if (err)
> +		memset(key, 0, map->key_size);

So if any error happens due to above map_get_next_key() or earlier
error, the next "batch" returned to user could be "0". How should
user space handle this? Ultimately, the user space needs to start
from the beginning again?

What I mean here is how we could design an interface so user
space, if there is no -EFAULT error, can successfully get all elements
without duplication.

One way to do this is to just return -EFAULT if we cannot get a
proper next key. But maybe we could find a better mechanism
when we try to implement what the user space code will look like.

> +
> +	if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
> +	    (copy_to_user(uobatch, key, map->key_size))))
> +		err = -EFAULT;
> +
> +free_buf:
> +	kfree(buf);
> +	return err;
> +}
> +
[...]
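To make the iteration contract above concrete, here is a minimal userspace
sketch. It is not part of the patch: the bpf_map_batch()/dump_map() helpers
and the fixed-size key buffers are hypothetical, and it assumes a kernel and
uapi headers with this series applied (so BPF_MAP_LOOKUP_BATCH is defined).

	#include <errno.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/bpf.h>

	static int bpf_map_batch(int cmd, int map_fd, void *in_batch,
				 void *out_batch, void *keys, void *values,
				 __u32 *count)
	{
		union bpf_attr attr;
		int err;

		memset(&attr, 0, sizeof(attr));
		attr.batch.map_fd = map_fd;
		attr.batch.in_batch = (__u64)(unsigned long)in_batch;
		attr.batch.out_batch = (__u64)(unsigned long)out_batch;
		attr.batch.keys = (__u64)(unsigned long)keys;
		attr.batch.values = (__u64)(unsigned long)values;
		attr.batch.count = *count;	/* in: capacity in elements */

		err = syscall(__NR_bpf, cmd, &attr, sizeof(attr));
		*count = attr.batch.count;	/* out: # of elements filled */
		return err ? -errno : 0;
	}

	/* Walk the whole map; keys/values must hold batch_sz elements. */
	static int dump_map(int map_fd, __u32 key_size, __u32 batch_sz,
			    void *keys, void *values)
	{
		char in[64], out[64];	/* assumed >= key_size for this sketch */
		void *in_batch = NULL;	/* NULL: start from the beginning */
		__u32 count;
		int err;

		do {
			count = batch_sz;
			err = bpf_map_batch(BPF_MAP_LOOKUP_BATCH, map_fd,
					    in_batch, out, keys, values,
					    &count);
			if (err && err != -ENOENT)
				return err;
			/* consume 'count' keys/values here, even on -ENOENT */
			memcpy(in, out, key_size);	/* resume token */
			in_batch = in;
		} while (!err);

		return 0;	/* -ENOENT: the whole map has been read */
	}

The do/while shape mirrors the contract in the commit message: -ENOENT
terminates the walk, and count may still be non-zero on that final call.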
On Fri, Dec 13, 2019 at 11:26 AM Yonghong Song <yhs@fb.com> wrote:
>
> On 12/11/19 2:33 PM, Brian Vazquez wrote:
> > This commit introduces generic support for the bpf_map_lookup_batch and
> > bpf_map_lookup_and_delete_batch ops.
[...]
> > +	void *buf, *prev_key, *key, *value;
> > +	u32 value_size, cp, max_count;
> > +	bool first_key = false;
> > +	int err, retry = MAP_LOOKUP_RETRIES;
>
> Could you try to use reverse Christmas tree style declaration here?

ACK

[...]
> > +	if (!err) {
> > +		rcu_read_lock();
> > +		err = map->ops->map_get_next_key(map, prev_key, key);
> > +		rcu_read_unlock();
> > +	}
> > +
> > +	if (err)
> > +		memset(key, 0, map->key_size);
>
> So if any error happens due to above map_get_next_key() or earlier
> error, the next "batch" returned to user could be "0". How should
> user space handle this? Ultimately, the user space needs to start
> from the beginning again?
>
> What I mean here is how we could design an interface so user
> space, if there is no -EFAULT error, can successfully get all elements
> without duplication.
>
> One way to do this is to just return -EFAULT if we cannot get a
> proper next key. But maybe we could find a better mechanism
> when we try to implement what the user space code will look like.

I was thinking that instead of using the "next key" as a token we
could use the last element successfully copied as the token; that way
user space code would always be able to start/retry from the last
processed entry. Do you think this would work?

[...]
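With the last successfully copied element as the token, recovery in
userspace would stay local to the failed call. A fragment of what that
would allow, reusing the hypothetical bpf_map_batch() helper sketched
earlier:

	count = batch_sz;
	err = bpf_map_batch(BPF_MAP_LOOKUP_BATCH, map_fd, in_batch, out,
			    keys, values, &count);
	if (err == -EINTR) {
		/* The first 'count' keys/values are valid and 'out' names
		 * the last entry processed: consume what arrived, then
		 * retry from that position instead of rewinding the map.
		 */
		memcpy(in, out, key_size);
		in_batch = in;
		/* ... issue the call again ... */
	}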
On 1/6/20 10:50 PM, Brian Vazquez wrote:
> On Fri, Dec 13, 2019 at 11:26 AM Yonghong Song <yhs@fb.com> wrote:
[...]
>> So if any error happens due to above map_get_next_key() or earlier
>> error, the next "batch" returned to user could be "0". How should
>> user space handle this? Ultimately, the user space needs to start
>> from the beginning again?
>>
>> What I mean here is how we could design an interface so user
>> space, if there is no -EFAULT error, can successfully get all elements
>> without duplication.
>>
>> One way to do this is to just return -EFAULT if we cannot get a
>> proper next key. But maybe we could find a better mechanism
>> when we try to implement what the user space code will look like.
>
> I was thinking that instead of using the "next key" as a token we
> could use the last element successfully copied as the token; that way
> user space code would always be able to start/retry from the last
> processed entry. Do you think this would work?

Yes, this should work.

[...]
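The delete variant shares the calling convention, so the agreed-upon
token semantics would carry over unchanged. A sketch of draining a map
with the same hypothetical helper:

	/* Read and delete the map contents batch by batch. */
	do {
		count = batch_sz;
		err = bpf_map_batch(BPF_MAP_LOOKUP_AND_DELETE_BATCH, map_fd,
				    in_batch, out, keys, values, &count);
		/* the 'count' entries returned are now gone from the map */
		memcpy(in, out, key_size);
		in_batch = in;
	} while (!err);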
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 35903f148be59..a16f209255a59 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -43,6 +43,11 @@ struct bpf_map_ops {
 	int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
 	void (*map_release_uref)(struct bpf_map *map);
 	void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
+	int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
+				union bpf_attr __user *uattr);
+	int (*map_lookup_and_delete_batch)(struct bpf_map *map,
+					   const union bpf_attr *attr,
+					   union bpf_attr __user *uattr);
 
 	/* funcs callable from userspace and from eBPF programs */
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
@@ -838,6 +843,12 @@ void *bpf_map_area_alloc(u64 size, int numa_node);
 void *bpf_map_area_mmapable_alloc(u64 size, int numa_node);
 void bpf_map_area_free(void *base);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
+int generic_map_lookup_batch(struct bpf_map *map,
+			     const union bpf_attr *attr,
+			     union bpf_attr __user *uattr);
+int generic_map_lookup_and_delete_batch(struct bpf_map *map,
+					const union bpf_attr *attr,
+					union bpf_attr __user *uattr);
 
 extern int sysctl_unprivileged_bpf_disabled;
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index dbbcf0b02970b..36d3b885ddedd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -107,6 +107,8 @@ enum bpf_cmd {
 	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
 	BPF_MAP_FREEZE,
 	BPF_BTF_GET_NEXT_ID,
+	BPF_MAP_LOOKUP_BATCH,
+	BPF_MAP_LOOKUP_AND_DELETE_BATCH,
 };
 
 enum bpf_map_type {
@@ -403,6 +405,23 @@ union bpf_attr {
 		__u64		flags;
 	};
 
+	struct { /* struct used by BPF_MAP_*_BATCH commands */
+		__aligned_u64	in_batch;	/* start batch,
+						 * NULL to start from beginning
+						 */
+		__aligned_u64	out_batch;	/* output: next start batch */
+		__aligned_u64	keys;
+		__aligned_u64	values;
+		__u32		count;		/* input/output:
+						 * input: # of key/value
+						 * elements
+						 * output: # of filled elements
+						 */
+		__u32		map_fd;
+		__u64		elem_flags;
+		__u64		flags;
+	} batch;
+
 	struct { /* anonymous struct used by BPF_PROG_LOAD command */
 		__u32		prog_type;	/* one of enum bpf_prog_type */
 		__u32		insn_cnt;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2530266fa6477..708aa89fe2308 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1206,6 +1206,120 @@ static int map_get_next_key(union bpf_attr *attr)
 	return err;
 }
 
+#define MAP_LOOKUP_RETRIES 3
+
+static int __generic_map_lookup_batch(struct bpf_map *map,
+				      const union bpf_attr *attr,
+				      union bpf_attr __user *uattr,
+				      bool do_delete)
+{
+	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+	void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
+	void __user *values = u64_to_user_ptr(attr->batch.values);
+	void __user *keys = u64_to_user_ptr(attr->batch.keys);
+	void *buf, *prev_key, *key, *value;
+	u32 value_size, cp, max_count;
+	bool first_key = false;
+	int err, retry = MAP_LOOKUP_RETRIES;
+
+	if (attr->batch.elem_flags & ~BPF_F_LOCK)
+		return -EINVAL;
+
+	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
+	    !map_value_has_spin_lock(map))
+		return -EINVAL;
+
+	value_size = bpf_map_value_size(map);
+
+	max_count = attr->batch.count;
+	if (!max_count)
+		return 0;
+
+	buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
+	if (!buf)
+		return -ENOMEM;
+
+	err = -EFAULT;
+	first_key = false;
+	if (ubatch && copy_from_user(buf, ubatch, map->key_size))
+		goto free_buf;
+	key = buf;
+	value = key + map->key_size;
+	if (!ubatch) {
+		prev_key = NULL;
+		first_key = true;
+	}
+
+	for (cp = 0; cp < max_count;) {
+		if (cp || first_key) {
+			rcu_read_lock();
+			err = map->ops->map_get_next_key(map, prev_key, key);
+			rcu_read_unlock();
+			if (err)
+				break;
+		}
+		err = bpf_map_copy_value(map, key, value,
+					 attr->batch.elem_flags, do_delete);
+
+		if (err == -ENOENT) {
+			if (retry) {
+				retry--;
+				continue;
+			}
+			err = -EINTR;
+			break;
+		}
+
+		if (err)
+			goto free_buf;
+
+		if (copy_to_user(keys + cp * map->key_size, key,
+				 map->key_size)) {
+			err = -EFAULT;
+			goto free_buf;
+		}
+		if (copy_to_user(values + cp * value_size, value, value_size)) {
+			err = -EFAULT;
+			goto free_buf;
+		}
+
+		prev_key = key;
+		retry = MAP_LOOKUP_RETRIES;
+		cp++;
+	}
+
+	if (!err) {
+		rcu_read_lock();
+		err = map->ops->map_get_next_key(map, prev_key, key);
+		rcu_read_unlock();
+	}
+
+	if (err)
+		memset(key, 0, map->key_size);
+
+	if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
+	    (copy_to_user(uobatch, key, map->key_size))))
+		err = -EFAULT;
+
+free_buf:
+	kfree(buf);
+	return err;
+}
+
+int generic_map_lookup_batch(struct bpf_map *map,
+			     const union bpf_attr *attr,
+			     union bpf_attr __user *uattr)
+{
+	return __generic_map_lookup_batch(map, attr, uattr, false);
+}
+
+int generic_map_lookup_and_delete_batch(struct bpf_map *map,
+					const union bpf_attr *attr,
+					union bpf_attr __user *uattr)
+{
+	return __generic_map_lookup_batch(map, attr, uattr, true);
+}
+
 #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
 
 static int map_lookup_and_delete_elem(union bpf_attr *attr)
@@ -3046,6 +3160,57 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
 	return err;
 }
 
+#define BPF_MAP_BATCH_LAST_FIELD batch.flags
+
+#define BPF_DO_BATCH(fn)			\
+	do {					\
+		if (!fn) {			\
+			err = -ENOTSUPP;	\
+			goto err_put;		\
+		}				\
+		err = fn(map, attr, uattr);	\
+	} while (0)
+
+static int bpf_map_do_batch(const union bpf_attr *attr,
+			    union bpf_attr __user *uattr,
+			    int cmd)
+{
+	struct bpf_map *map;
+	int err, ufd;
+	struct fd f;
+
+	if (CHECK_ATTR(BPF_MAP_BATCH))
+		return -EINVAL;
+
+	ufd = attr->batch.map_fd;
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	if ((cmd == BPF_MAP_LOOKUP_BATCH ||
+	     cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
+	    !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
+	if (cmd != BPF_MAP_LOOKUP_BATCH &&
+	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
+	if (cmd == BPF_MAP_LOOKUP_BATCH)
+		BPF_DO_BATCH(map->ops->map_lookup_batch);
+	else
+		BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
+
+err_put:
+	fdput(f);
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -3143,6 +3308,13 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
 		err = map_lookup_and_delete_elem(&attr);
 		break;
+	case BPF_MAP_LOOKUP_BATCH:
+		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
+		break;
+	case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
+		err = bpf_map_do_batch(&attr, uattr,
+				       BPF_MAP_LOOKUP_AND_DELETE_BATCH);
+		break;
 	default:
 		err = -EINVAL;
 		break;
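The generic implementations only take effect once a map type fills in the
new bpf_map_ops callbacks; that per-map wiring happens in later patches of
the series. A sketch of what opting in looks like (illustrative only, using
the array map as an example):

	/* Illustrative, not part of this patch: a map type opts in to the
	 * generic batch ops via the new bpf_map_ops callbacks.
	 */
	const struct bpf_map_ops array_map_ops = {
		/* ... existing callbacks (map_alloc, map_lookup_elem, ...) ... */
		.map_lookup_batch = generic_map_lookup_batch,
		.map_lookup_and_delete_batch = generic_map_lookup_and_delete_batch,
	};

Map types whose layout lets them do better (e.g. hash tables walking their
buckets directly) can instead supply their own batch callbacks with the
same signatures.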