Message ID | 20180729205835.34850-1-dancol@google.com |
---|---|
State | Changes Requested, archived |
Delegated to: | BPF Maintainers |
Headers | show |
Series | [v3] Add BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES bpf(2) command | expand |
On 07/29/2018 10:58 PM, Daniel Colascione wrote: > BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES waits for the release of all > references to maps active at the instant the > BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES is > issued. BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES waits only for the > expiration of map references obtained by BPF programs from other maps. > > The purpose of this command is to provide a means for userspace to > replace a BPF map with another, newer version, then ensure that no > component is still using the "old" map before manipulating the "old" > map in some way. > > Signed-off-by: Daniel Colascione <dancol@google.com> > --- > include/uapi/linux/bpf.h | 14 ++++++++++++++ > kernel/bpf/syscall.c | 13 +++++++++++++ > 2 files changed, 27 insertions(+) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index b7db3261c62d..ca3cfca76edc 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -75,6 +75,19 @@ struct bpf_lpm_trie_key { > __u8 data[0]; /* Arbitrary size */ > }; > > +/* BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES waits for the release of all > + * references to maps active at the instant the > + * BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES is > + * issued. BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES waits only for the > + * expiration of map references obtained by BPF programs from > + * other maps. > + * > + * The purpose of this command is to provide a means for userspace to > + * replace a BPF map with another, newer version, then ensure that no > + * component is still using the "old" map before manipulating the > + * "old" map in some way. > + */ > + > /* BPF syscall commands, see bpf(2) man-page for details. */ > enum bpf_cmd { > BPF_MAP_CREATE, > @@ -98,6 +111,7 @@ enum bpf_cmd { > BPF_BTF_LOAD, > BPF_BTF_GET_FD_BY_ID, > BPF_TASK_FD_QUERY, > + BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES, > }; > > enum bpf_map_type { > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index a31a1ba0f8ea..bc9a0713f47d 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -2274,6 +2274,19 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz > if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) > return -EPERM; > > + if (cmd == BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES) { > + if (uattr != NULL || size != 0) > + return -EINVAL; > + err = security_bpf(cmd, NULL, 0); > + if (err < 0) > + return err; > + /* BPF programs always enter a critical section while > + * they have a map reference outstanding. > + */ > + synchronize_rcu(); > + return 0; > + } > + > err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); > if (err) > return err; > Hmm, I don't think such UAPI as above is future-proof. In case we would want a similar mechanism in future for other maps, we would need a whole new bpf command or reuse BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES as a workaround though the underlying map may not even be a map-to-map. Additionally, we don't have any map object at hand in the above, so we couldn't make any finer grained decisions either. Something like below would be more suitable and leaves room for extending this further in future. Thanks, Daniel From 8dfea71b73fa0d402633b76f78c106e82a7a5007 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann <daniel@iogearbox.net> Date: Mon, 30 Jul 2018 11:47:37 +0200 Subject: [PATCH] sync map refs Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/arraymap.c | 1 + kernel/bpf/hashtab.c | 1 + kernel/bpf/map_in_map.c | 6 ++++++ kernel/bpf/map_in_map.h | 1 + kernel/bpf/syscall.c | 24 ++++++++++++++++++++++++ 7 files changed, 35 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5b5ad95..7b51f86 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -34,6 +34,7 @@ struct bpf_map_ops { void (*map_free)(struct bpf_map *map); int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); void (*map_release_uref)(struct bpf_map *map); + int (*map_sync_refs)(struct bpf_map *map); /* funcs callable from userspace and from eBPF programs */ void *(*map_lookup_elem)(struct bpf_map *map, void *key); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8701139..e6ec1de 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -98,6 +98,7 @@ enum bpf_cmd { BPF_BTF_LOAD, BPF_BTF_GET_FD_BY_ID, BPF_TASK_FD_QUERY, + BPF_MAP_SYNC_REFS, }; enum bpf_map_type { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 544e58f..ddaf42a 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -748,5 +748,6 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, + .map_sync_refs = bpf_map_sync_refs, .map_gen_lookup = array_of_map_gen_lookup, }; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 513d9df..05380ea 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1407,5 +1407,6 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, + .map_sync_refs = bpf_map_sync_refs, .map_gen_lookup = htab_of_map_gen_lookup, }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 1da5746..698a50f 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -96,6 +96,12 @@ void bpf_map_fd_put_ptr(void *ptr) bpf_map_put(ptr); } +int bpf_map_sync_refs(struct bpf_map *map) +{ + synchronize_rcu(); + return 0; +} + u32 bpf_map_fd_sys_lookup_elem(void *ptr) { return ((struct bpf_map *)ptr)->id; diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index 6183db9..ac02456 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -19,6 +19,7 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); void bpf_map_fd_put_ptr(void *ptr); +int bpf_map_sync_refs(struct bpf_map *map); u32 bpf_map_fd_sys_lookup_elem(void *ptr); #endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a31a1ba..b1286cc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -896,6 +896,27 @@ static int map_get_next_key(union bpf_attr *attr) return err; } +#define BPF_MAP_SYNC_REFS_LAST_FIELD map_fd + +static int map_sync_refs(union bpf_attr *attr) +{ + int err = -ENOTSUPP, ufd = attr->map_fd; + struct bpf_map *map; + struct fd f; + + if (CHECK_ATTR(BPF_MAP_SYNC_REFS)) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + if (map->ops->map_sync_refs) + err = map->ops->map_sync_refs(map); + fdput(f); + return err; +} + static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name) \ [_id] = & _name ## _prog_ops, @@ -2303,6 +2324,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; + case BPF_MAP_SYNC_REFS: + err = map_sync_refs(&attr); + break; case BPF_PROG_LOAD: err = bpf_prog_load(&attr); break;
On Mon, Jul 30, 2018 at 3:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote: > Hmm, I don't think such UAPI as above is future-proof. In case we would want > a similar mechanism in future for other maps, we would need a whole new bpf > command or reuse BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES as a workaround though > the underlying map may not even be a map-to-map. Additionally, we don't have > any map object at hand in the above, so we couldn't make any finer grained > decisions either. Something like below would be more suitable and leaves room > for extending this further in future. YAGNI. Your proposed mechanism doesn't add anything under the current implementation. It's also not clear how a map-specific synchronization command is supposed to work in cases where we swap multiple map references. Do we synchronize_rcu multiple times? Why would we impose that inefficiency just for the sake of some non-specific future extensibility? Add some kind of batching layer? The current approach works for the anticipated use cases. While my preference is not to talk about map-to-maps at all in the user API and instead spec the thing as talking about map references in general, I'd rather have something that talks about references-to-maps-acquired-from-maps than this interface.
On Mon, 30 Jul 2018 03:25:43 -0700, Daniel Colascione wrote: > On Mon, Jul 30, 2018 at 3:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote: > > Hmm, I don't think such UAPI as above is future-proof. In case we would want > > a similar mechanism in future for other maps, we would need a whole new bpf > > command or reuse BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES as a workaround though > > the underlying map may not even be a map-to-map. Additionally, we don't have > > any map object at hand in the above, so we couldn't make any finer grained > > decisions either. Something like below would be more suitable and leaves room > > for extending this further in future. > > YAGNI. Your proposed mechanism doesn't add anything under the current > implementation. FWIW in case of HW offload targeting a particular map may allow users to avoid a potentially slow sync with all the devices on the system. > It's also not clear how a map-specific synchronization > command is supposed to work in cases where we swap multiple map > references. Do we synchronize_rcu multiple times? Why would we impose > that inefficiency just for the sake of some non-specific future > extensibility? Add some kind of batching layer? The current approach > works for the anticipated use cases. > > While my preference is not to talk about map-to-maps at all in the > user API and instead spec the thing as talking about map references in > general, I'd rather have something that talks about > references-to-maps-acquired-from-maps than this interface.
On Mon, Jul 30, 2018 at 5:26 PM, Jakub Kicinski <jakub.kicinski@netronome.com> wrote: > > On Mon, 30 Jul 2018 03:25:43 -0700, Daniel Colascione wrote: > > On Mon, Jul 30, 2018 at 3:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote: > > > Hmm, I don't think such UAPI as above is future-proof. In case we would want > > > a similar mechanism in future for other maps, we would need a whole new bpf > > > command or reuse BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES as a workaround though > > > the underlying map may not even be a map-to-map. Additionally, we don't have > > > any map object at hand in the above, so we couldn't make any finer grained > > > decisions either. Something like below would be more suitable and leaves room > > > for extending this further in future. > > > > YAGNI. Your proposed mechanism doesn't add anything under the current > > implementation. > > FWIW in case of HW offload targeting a particular map may allow users > to avoid a potentially slow sync with all the devices on the system. Sure. But such a thing doesn't exist right now (right?), and we can add that more-efficient-in-that-one-case BPF interface when it lands. I'd rather keep things simple for now.
On Mon, 30 Jul 2018 17:33:39 -0700, Daniel Colascione wrote: > On Mon, Jul 30, 2018 at 5:26 PM, Jakub Kicinski wrote: > > On Mon, 30 Jul 2018 03:25:43 -0700, Daniel Colascione wrote: > > > On Mon, Jul 30, 2018 at 3:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote: > > > > Hmm, I don't think such UAPI as above is future-proof. In case we would want > > > > a similar mechanism in future for other maps, we would need a whole new bpf > > > > command or reuse BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES as a workaround though > > > > the underlying map may not even be a map-to-map. Additionally, we don't have > > > > any map object at hand in the above, so we couldn't make any finer grained > > > > decisions either. Something like below would be more suitable and leaves room > > > > for extending this further in future. > > > > > > YAGNI. Your proposed mechanism doesn't add anything under the current > > > implementation. > > > > FWIW in case of HW offload targeting a particular map may allow users > > to avoid a potentially slow sync with all the devices on the system. > > Sure. But such a thing doesn't exist right now (right?), and we can > add that more-efficient-in-that-one-case BPF interface when it lands. > I'd rather keep things simple for now. You mean map-in-map offload doesn't exist today? True, but it's on the roadmap for Netronome. Tangentially it would be really useful for us to have a "the map has actually been freed" notification/barrier. We have complaints of users creating huge maps and then trying to free and recreate them quickly, and the creation part failing with -ENOMEM, because the free from a workqueue didn't run, yet :( It'd probably require a different API to solve than what's discussed here, but since we're talking about syncing things I thought I'd put it out there...
On Mon, Jul 30, 2018 at 5:45 PM, Jakub Kicinski <jakub.kicinski@netronome.com> wrote: > On Mon, 30 Jul 2018 17:33:39 -0700, Daniel Colascione wrote: >> On Mon, Jul 30, 2018 at 5:26 PM, Jakub Kicinski wrote: >> > On Mon, 30 Jul 2018 03:25:43 -0700, Daniel Colascione wrote: >> > > On Mon, Jul 30, 2018 at 3:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote: >> > > > Hmm, I don't think such UAPI as above is future-proof. In case we would want >> > > > a similar mechanism in future for other maps, we would need a whole new bpf >> > > > command or reuse BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES as a workaround though >> > > > the underlying map may not even be a map-to-map. Additionally, we don't have >> > > > any map object at hand in the above, so we couldn't make any finer grained >> > > > decisions either. Something like below would be more suitable and leaves room >> > > > for extending this further in future. >> > > >> > > YAGNI. Your proposed mechanism doesn't add anything under the current >> > > implementation. >> > >> > FWIW in case of HW offload targeting a particular map may allow users >> > to avoid a potentially slow sync with all the devices on the system. >> >> Sure. But such a thing doesn't exist right now (right?), and we can >> add that more-efficient-in-that-one-case BPF interface when it lands. >> I'd rather keep things simple for now. > > You mean map-in-map offload doesn't exist today? True, but it's on the > roadmap for Netronome. Sounds cool. I'd still wait until map-in-map offload lands until adding the map-specific API though. > Tangentially it would be really useful for us to have a "the map has > actually been freed" notification/barrier. We have complaints of users > creating huge maps and then trying to free and recreate them quickly, > and the creation part failing with -ENOMEM, because the free from a > workqueue didn't run, yet :( It'd probably require a different API to > solve than what's discussed here, but since we're talking about syncing > things I thought I'd put it out there... Yeah. What you're talking about is what I meant upthread when I briefly mentioned a "refcount draining approach" --- you'd block until all references except your own to a map disappeared.
On Mon, 30 Jul 2018 17:50:15 -0700, Daniel Colascione wrote: > > Tangentially it would be really useful for us to have a "the map has > > actually been freed" notification/barrier. We have complaints of users > > creating huge maps and then trying to free and recreate them quickly, > > and the creation part failing with -ENOMEM, because the free from a > > workqueue didn't run, yet :( It'd probably require a different API to > > solve than what's discussed here, but since we're talking about syncing > > things I thought I'd put it out there... > > Yeah. What you're talking about is what I meant upthread when I > briefly mentioned a "refcount draining approach" --- you'd block until > all references except your own to a map disappeared. I don't think so. The ref count goes to 0, work gets scheduled to call free, work runs, free gets called, device memory gets freed. Now the memory can be reused. As long as there are any refs we can't free memory, unless we want to make the semantics really ugly. But as I said, only superficially related to this patch. Perhaps solution will naturally fall out of an API to query the device capabilities and resources, if we ever get there.
On 07/31/2018 02:33 AM, Daniel Colascione wrote: > On Mon, Jul 30, 2018 at 5:26 PM, Jakub Kicinski > <jakub.kicinski@netronome.com> wrote: >> On Mon, 30 Jul 2018 03:25:43 -0700, Daniel Colascione wrote: >>> On Mon, Jul 30, 2018 at 3:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote: >>>> Hmm, I don't think such UAPI as above is future-proof. In case we would want >>>> a similar mechanism in future for other maps, we would need a whole new bpf >>>> command or reuse BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES as a workaround though >>>> the underlying map may not even be a map-to-map. Additionally, we don't have >>>> any map object at hand in the above, so we couldn't make any finer grained >>>> decisions either. Something like below would be more suitable and leaves room >>>> for extending this further in future. >>> >>> YAGNI. Your proposed mechanism doesn't add anything under the current >>> implementation. >> >> FWIW in case of HW offload targeting a particular map may allow users >> to avoid a potentially slow sync with all the devices on the system. > > Sure. But such a thing doesn't exist right now (right?), and we can > add that more-efficient-in-that-one-case BPF interface when it lands. > I'd rather keep things simple for now. I don't see a reason why that is even more complicated. An API command name such as BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES is simply non-generic, and exposes specific map details (here: map-in-map) into the UAPI whereas it should reside within a specific implementation instead similar to other ops we have for maps. If in future other maps would be added that would have similar mechanisms of inner objects they return to the BPF program, we'll be adding yet another command just for this. Also, union bpf_attr is extensible, e.g. additional members could be added in future whenever needed for this subcommand instead of forcing it to NULL as done here. E.g. having a high-level one like bpf(BPF_MAP_WAIT, { .map_fd = fd }) could later on also cover the use-case from Jakub by adding an 'event' member into union bpf_attr where we could add other map specific wakeups for user space while the current default of 0 would be BPF_MAP_WAIT_PENDING_REFS (or the like). All I'm saying is to keep it generic so it can be extended later.
On Tue, Jul 31, 2018 at 1:34 AM, Daniel Borkmann <daniel@iogearbox.net> wrote: > On 07/31/2018 02:33 AM, Daniel Colascione wrote: >> On Mon, Jul 30, 2018 at 5:26 PM, Jakub Kicinski >> <jakub.kicinski@netronome.com> wrote: >>> On Mon, 30 Jul 2018 03:25:43 -0700, Daniel Colascione wrote: >>>> On Mon, Jul 30, 2018 at 3:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote: >>>>> Hmm, I don't think such UAPI as above is future-proof. In case we would want >>>>> a similar mechanism in future for other maps, we would need a whole new bpf >>>>> command or reuse BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES as a workaround though >>>>> the underlying map may not even be a map-to-map. Additionally, we don't have >>>>> any map object at hand in the above, so we couldn't make any finer grained >>>>> decisions either. Something like below would be more suitable and leaves room >>>>> for extending this further in future. >>>> >>>> YAGNI. Your proposed mechanism doesn't add anything under the current >>>> implementation. >>> >>> FWIW in case of HW offload targeting a particular map may allow users >>> to avoid a potentially slow sync with all the devices on the system. >> >> Sure. But such a thing doesn't exist right now (right?), and we can >> add that more-efficient-in-that-one-case BPF interface when it lands. >> I'd rather keep things simple for now. > > I don't see a reason why that is even more complicated. Both the API and the implementation are much more complicated in the per-map ops version: just look at the patch size. The size argument isn't necessarily a dealbreaker, but I still don't see what the extra code size and complexity is buying. > An API command name > such as BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES is simply non-generic, and > exposes specific map details (here: map-in-map) into the UAPI whereas it > should reside within a specific implementation instead similar to other ops > we have for maps. But synchronize isn't conceptually a command that applies to a specific map. It waits on all references. Did you address my point about your proposed map-specific interface requiring redundant synchronize_rcu calls in the case where we swap multiple maps and want to wait for all the references to drain? Under my proposal, you'd just BPF_SYNCHRONIZE_WHATEVER and call schedule_rcu once. Under your proposal, we'd make it a per-map operation, so we'd issue one synchronize_rcu per map. > If in future other maps would be added that would have > similar mechanisms of inner objects they return to the BPF program, we'll > be adding yet another command just for this. And that's why my personal preference is to just calling this thing BPF_SYNCHRONIZE, which I'd define to wait for all such "inner objects". Alexei is the one who asked for the very specific naming, I believe. Anyway, we have a very simple patch that we could apply today. It addresses a real need, and it doesn't preclude adding something more specific later, when we know we need it. Besides, it's not as if adding a BPF command is particularly expensive. > Also, union bpf_attr is extensible, > e.g. additional members could be added in future whenever needed for this > subcommand instead of forcing it to NULL as done here. We fail with EINVAL when attr != NULL now, which means that we can safely accept a non-NULL attr-based subcommand later without breaking anyone. The interface is already extensible. > All I'm saying is to > keep it generic so it can be extended later. Sure, but no more extensible than it has to be. Prematurely-added extension points tend to cause trouble later.
On Thu, Aug 09, 2018 at 10:17:50AM -0700, Daniel Colascione wrote:
> Ping.
sorry for delay. have been off grid. let's continue in the other thread for full context
On Tue, Jul 31, 2018 at 02:36:39AM -0700, Daniel Colascione wrote: > > > An API command name > > such as BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES is simply non-generic, and > > exposes specific map details (here: map-in-map) into the UAPI whereas it > > should reside within a specific implementation instead similar to other ops > > we have for maps. > > But synchronize isn't conceptually a command that applies to a > specific map. It waits on all references. Did you address my point > about your proposed map-specific interface requiring redundant > synchronize_rcu calls in the case where we swap multiple maps and want > to wait for all the references to drain? Under my proposal, you'd just > BPF_SYNCHRONIZE_WHATEVER and call schedule_rcu once. Under your > proposal, we'd make it a per-map operation, so we'd issue one > synchronize_rcu per map. optimizing for multi-map sync sounds like premature optimization. In general I'd prefer DanielB proposal to make the sync logic map and fd specific, but before we argue about implementation further let's agree on the problem we're trying to solve. I believe the only issue being discussed is user space doesn't know when it's ok to start draining the inner map when it was replaced by bpf_map_update syscall command with another map, right? If we agree on that, should bpf_map_update handle it then? Wouldn't it be much easier to understand and use from user pov? No new commands to learn. map_update syscall replaced the map and old map is no longer accessed by the program via this given map-in-map. But if the replaced map is used directly or it sits in some other map-in-map slot the progs can still access it. My issue with DanielC SYNC cmd that it exposes implementation details and introduces complex 'synchronization' semantics. To majority of the users it won't be obvious what is being synchronized. My issue with DanielB WAIT_REF map_fd cmd that it needs to wait for all refs to this map to be dropped. I think combination of usercnt and refcnt can answer that, but feels dangerous to sleep potentially forever in a syscall until all prog->map references are gone, though such cmd is useful beyond map-in-map use case.
On Fri, Aug 10, 2018 at 3:52 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > On Tue, Jul 31, 2018 at 02:36:39AM -0700, Daniel Colascione wrote: >> >> > An API command name >> > such as BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES is simply non-generic, and >> > exposes specific map details (here: map-in-map) into the UAPI whereas it >> > should reside within a specific implementation instead similar to other ops >> > we have for maps. >> >> But synchronize isn't conceptually a command that applies to a >> specific map. It waits on all references. Did you address my point >> about your proposed map-specific interface requiring redundant >> synchronize_rcu calls in the case where we swap multiple maps and want >> to wait for all the references to drain? Under my proposal, you'd just >> BPF_SYNCHRONIZE_WHATEVER and call schedule_rcu once. Under your >> proposal, we'd make it a per-map operation, so we'd issue one >> synchronize_rcu per map. > > optimizing for multi-map sync sounds like premature optimization. Maybe, but the per-map proposal is less efficient *and* more complicated! I don't want to spend more code just to go slower. > I believe the only issue being discussed is user space doesn't know > when it's ok to start draining the inner map when it was replaced > by bpf_map_update syscall command with another map, right? Yes. > If we agree on that, should bpf_map_update handle it then? > Wouldn't it be much easier to understand and use from user pov? > No new commands to learn. map_update syscall replaced the map > and old map is no longer accessed by the program via this given map-in-map. Maybe with a new BPF_SYNCHRONIZE flag for BPF_MAP_UPDATE_ELEM and BPF_MAP_DELETE_ELEM. Otherwise, it seems wrong to make every user of these commands pay for synchronization that only a few will need. > But if the replaced map is used directly or it sits in some other > map-in-map slot the progs can still access it. > > My issue with DanielC SYNC cmd that it exposes implementation details What implementation details? The command semantics are defined entirely in terms of existing user-visible primitives. > and introduces complex 'synchronization' semantics. To majority of > the users it won't be obvious what is being synchronized. > > My issue with DanielB WAIT_REF map_fd cmd that it needs to wait for all refs > to this map to be dropped. I think combination of usercnt and refcnt > can answer that, but feels dangerous to sleep potentially forever > in a syscall until all prog->map references are gone, though such > cmd is useful beyond map-in-map use case. In what scenarios? In any case, can we submit _something_?
On Tue, Aug 14, 2018 at 01:37:12PM -0700, Daniel Colascione wrote: > > > If we agree on that, should bpf_map_update handle it then? > > Wouldn't it be much easier to understand and use from user pov? > > No new commands to learn. map_update syscall replaced the map > > and old map is no longer accessed by the program via this given map-in-map. > > Maybe with a new BPF_SYNCHRONIZE flag for BPF_MAP_UPDATE_ELEM and > BPF_MAP_DELETE_ELEM. Otherwise, it seems wrong to make every user of > these commands pay for synchronization that only a few will need. I don't think extra flag is needed. Extra sync_rcu() for map-in-map is useful for all users. I would consider it a bugfix, since users that examine deleted map have this race today and removing the race is always a good thing especially since the cost is small.
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b7db3261c62d..ca3cfca76edc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -75,6 +75,19 @@ struct bpf_lpm_trie_key { __u8 data[0]; /* Arbitrary size */ }; +/* BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES waits for the release of all + * references to maps active at the instant the + * BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES is + * issued. BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES waits only for the + * expiration of map references obtained by BPF programs from + * other maps. + * + * The purpose of this command is to provide a means for userspace to + * replace a BPF map with another, newer version, then ensure that no + * component is still using the "old" map before manipulating the + * "old" map in some way. + */ + /* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { BPF_MAP_CREATE, @@ -98,6 +111,7 @@ enum bpf_cmd { BPF_BTF_LOAD, BPF_BTF_GET_FD_BY_ID, BPF_TASK_FD_QUERY, + BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES, }; enum bpf_map_type { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a31a1ba0f8ea..bc9a0713f47d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2274,6 +2274,19 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) return -EPERM; + if (cmd == BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES) { + if (uattr != NULL || size != 0) + return -EINVAL; + err = security_bpf(cmd, NULL, 0); + if (err < 0) + return err; + /* BPF programs always enter a critical section while + * they have a map reference outstanding. + */ + synchronize_rcu(); + return 0; + } + err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); if (err) return err;
BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES waits for the release of all references to maps active at the instant the BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES is issued. BPF_SYNCHRONIZE_MAP_TO_MAP_REFERENCES waits only for the expiration of map references obtained by BPF programs from other maps. The purpose of this command is to provide a means for userspace to replace a BPF map with another, newer version, then ensure that no component is still using the "old" map before manipulating the "old" map in some way. Signed-off-by: Daniel Colascione <dancol@google.com> --- include/uapi/linux/bpf.h | 14 ++++++++++++++ kernel/bpf/syscall.c | 13 +++++++++++++ 2 files changed, 27 insertions(+)