Message ID | 20161229172855.14910-2-daniel@zonque.org |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
On 12/29/2016 06:28 PM, Daniel Mack wrote: > This trie implements a longest prefix match algorithm that can be used > to match IP addresses to a stored set of ranges. > > Internally, data is stored in an unbalanced trie of nodes that has a > maximum height of n, where n is the prefixlen the trie was created > with. > > Tries may be created with prefix lengths that are multiples of 8, in > the range from 8 to 2048. The key used for lookup and update operations > is a struct bpf_lpm_trie_key, and the value is a uint64_t. > > The code carries more information about the internal implementation. > > Signed-off-by: Daniel Mack <daniel@zonque.org> > Reviewed-by: David Herrmann <dh.herrmann@gmail.com> Thanks for working on it, and sorry for late reply. In addition to Alexei's earlier comments on the cover letter, a few comments inline: [...] > diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c > new file mode 100644 > index 0000000..8b6a61d > --- /dev/null > +++ b/kernel/bpf/lpm_trie.c > @@ -0,0 +1,468 @@ > +/* > + * Longest prefix match list implementation > + * > + * Copyright (c) 2016 Daniel Mack > + * Copyright (c) 2016 David Herrmann > + * > + * This file is subject to the terms and conditions of version 2 of the GNU > + * General Public License. See the file COPYING in the main directory of the > + * Linux distribution for more details. 
> + */ > + > +#include <linux/bpf.h> > +#include <linux/err.h> > +#include <linux/slab.h> > +#include <linux/spinlock.h> > +#include <linux/vmalloc.h> > +#include <net/ipv6.h> > + > +/* Intermediate node */ > +#define LPM_TREE_NODE_FLAG_IM BIT(0) > + > +struct lpm_trie_node; > + > +struct lpm_trie_node { > + struct rcu_head rcu; > + struct lpm_trie_node __rcu *child[2]; > + u32 prefixlen; > + u32 flags; > + u64 value; > + u8 data[0]; > +}; > + > +struct lpm_trie { > + struct bpf_map map; > + struct lpm_trie_node __rcu *root; > + size_t n_entries; > + size_t max_prefixlen; > + size_t data_size; > + spinlock_t lock; > +}; > + [...] > + > +static inline int extract_bit(const u8 *data, size_t index) > +{ > + return !!(data[index / 8] & (1 << (7 - (index % 8)))); > +} > + [...] > + > +static struct lpm_trie_node *lpm_trie_node_alloc(size_t data_size) > +{ > + return kmalloc(sizeof(struct lpm_trie_node) + data_size, > + GFP_ATOMIC | __GFP_NOWARN); > +} > + > +/* Called from syscall or from eBPF program */ > +static int trie_update_elem(struct bpf_map *map, > + void *_key, void *value, u64 flags) > +{ > + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); > + struct lpm_trie_node *node, *im_node, *new_node = NULL; > + struct lpm_trie_node __rcu **slot; > + struct bpf_lpm_trie_key *key = _key; > + unsigned int next_bit; > + size_t matchlen = 0; > + int ret = 0; We should guard for future map flags here: if (unlikely(flags > BPF_EXIST)) return -EINVAL; And further below we'd need to check for BPF_{NO,}EXIST when replacing resp. adding the node? > + if (key->prefixlen > trie->max_prefixlen) > + return -EINVAL; > + > + spin_lock(&trie->lock); That spin lock would need to be converted to a raw lock, see commit ac00881f9221 ("bpf: convert hashtab lock to raw lock"). The comment in htab also mentions that bpf_map_update_elem() can be called in irq context (I assume as a map from tracing side?), so we'd need to use the *_irqsave variants here as well. 
> + /* Allocate and fill a new node */ > + > + if (trie->n_entries == trie->map.max_entries) { > + ret = -ENOSPC; > + goto out; > + } > + > + new_node = lpm_trie_node_alloc(trie->data_size); > + if (!new_node) { > + ret = -ENOMEM; > + goto out; > + } > + > + trie->n_entries++; > + new_node->value = *(u64 *) value; > + new_node->prefixlen = key->prefixlen; > + new_node->flags = 0; > + new_node->child[0] = NULL; > + new_node->child[1] = NULL; Should this be ... RCU_INIT_POINTER(new_node->child[0], NULL); RCU_INIT_POINTER(new_node->child[1], NULL); > + memcpy(new_node->data, key->data, trie->data_size); > + > + /* > + * Now find a slot to attach the new node. To do that, walk the tree > + * from the root match as many bits as possible for each node until we > + * either find an empty slot or a slot that needs to be replaced by an > + * intermediate node. > + */ > + slot = &trie->root; > + > + while ((node = rcu_dereference_protected(*slot, > + lockdep_is_held(&trie->lock)))) { > + matchlen = longest_prefix_match(trie, node, key); > + > + if (node->prefixlen != matchlen || > + node->prefixlen == key->prefixlen || > + node->prefixlen == trie->max_prefixlen) > + break; > + > + next_bit = extract_bit(key->data, node->prefixlen); > + slot = &node->child[next_bit]; > + } > + > + /* > + * If the slot is empty (a free child pointer or an empty root), > + * simply assign the @new_node to that slot and be done. > + */ > + if (!node) { > + rcu_assign_pointer(*slot, new_node); > + goto out; > + } > + > + /* > + * If the slot we picked already exists, replace it with @new_node > + * which already has the correct data array and value set. 
> + */ > + if (node->prefixlen == matchlen) { > + new_node->child[0] = node->child[0]; > + new_node->child[1] = node->child[1]; > + > + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) > + trie->n_entries--; > + > + rcu_assign_pointer(*slot, new_node); > + kfree_rcu(node, rcu); > + > + goto out; > + } > + > + /* > + * If the new node matches the prefix completely, it must be an > + * inserted as an ancestor. Simply insert it between @node and @*slot. > + */ > + if (matchlen == key->prefixlen) { > + next_bit = extract_bit(node->data, matchlen); > + rcu_assign_pointer(new_node->child[next_bit], node); > + rcu_assign_pointer(*slot, new_node); > + goto out; > + } > + > + im_node = lpm_trie_node_alloc(trie->data_size); > + if (!im_node) { > + ret = -ENOMEM; > + goto out; > + } > + > + im_node->prefixlen = matchlen; > + im_node->flags |= LPM_TREE_NODE_FLAG_IM; > + memcpy(im_node->data, node->data, trie->data_size); > + > + /* Now determine which child to install in which slot */ > + if (extract_bit(key->data, matchlen)) { > + rcu_assign_pointer(im_node->child[0], node); > + rcu_assign_pointer(im_node->child[1], new_node); > + } else { > + rcu_assign_pointer(im_node->child[0], new_node); > + rcu_assign_pointer(im_node->child[1], node); > + } > + > + /* Finally, assign the intermediate node to the determined spot */ > + rcu_assign_pointer(*slot, im_node); > + > +out: > + if (ret) { > + if (new_node) > + trie->n_entries--; > + > + kfree(new_node); > + kfree(im_node); > + } > + > + spin_unlock(&trie->lock); > + > + return ret; > +} > + > +static struct bpf_map *trie_alloc(union bpf_attr *attr) > +{ > + struct lpm_trie *trie; > + > + /* check sanity of attributes */ > + if (attr->max_entries == 0 || attr->map_flags || > + attr->key_size < sizeof(struct bpf_lpm_trie_key) + 1 || > + attr->key_size > sizeof(struct bpf_lpm_trie_key) + 256 || > + attr->value_size != sizeof(u64)) > + return ERR_PTR(-EINVAL); The correct attr->map_flags test here would need to be ... 
attr->map_flags != BPF_F_NO_PREALLOC ... since in this case we don't have any prealloc pool, and should that come one day that test could be relaxed again. > + trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); > + if (!trie) > + return NULL; > + > + /* copy mandatory map attributes */ > + trie->map.map_type = attr->map_type; > + trie->map.key_size = attr->key_size; > + trie->map.value_size = attr->value_size; > + trie->map.max_entries = attr->max_entries; You also need to fill in trie->map.pages as that is eventually used to charge memory against in bpf_map_charge_memlock(), right now that would remain as 0 meaning the map is not accounted for. > + trie->data_size = attr->key_size - > + offsetof(struct bpf_lpm_trie_key, data); > + trie->max_prefixlen = trie->data_size * 8; > + > + spin_lock_init(&trie->lock); > + > + return &trie->map; > +} > + > +static void trie_free(struct bpf_map *map) > +{ > + struct lpm_trie_node __rcu **slot; > + struct lpm_trie_node *node; > + struct lpm_trie *trie = > + container_of(map, struct lpm_trie, map); > + > + spin_lock(&trie->lock); > + > + /* > + * Always start at the root and walk down to a node that has no > + * children. Then free that node, nullify its pointer in the parent, > + * then start over. > + */ > + > + for (;;) { > + slot = &trie->root; > + > + for (;;) { > + node = rcu_dereference_protected(*slot, > + lockdep_is_held(&trie->lock)); > + if (!node) > + goto out; > + > + if (node->child[0]) { rcu_access_pointer(node->child[0]) (at least to keep sparse happy?) > + slot = &node->child[0]; > + continue; > + } > + > + if (node->child[1]) { Here too? 
> + slot = &node->child[1]; > + continue; > + } > + > + kfree(node); > + rcu_assign_pointer(*slot, NULL); RCU_INIT_POINTER(*slot, NULL) > + break; > + } > + } > + > +out: > + spin_unlock(&trie->lock); > +} > + > +static const struct bpf_map_ops trie_ops = { > + .map_alloc = trie_alloc, > + .map_free = trie_free, > + .map_lookup_elem = trie_lookup_elem, > + .map_update_elem = trie_update_elem, delete ops still planned to add? > +}; > + > +static struct bpf_map_type_list trie_type __read_mostly = { > + .ops = &trie_ops, > + .type = BPF_MAP_TYPE_LPM_TRIE, > +}; > + > +static int __init register_trie_map(void) > +{ > + bpf_register_map_type(&trie_type); > + return 0; > +} > +late_initcall(register_trie_map); Thanks, Daniel
On 01/05/2017 05:25 PM, Daniel Borkmann wrote: > On 12/29/2016 06:28 PM, Daniel Mack wrote: >> This trie implements a longest prefix match algorithm that can be used >> to match IP addresses to a stored set of ranges. >> >> Internally, data is stored in an unbalanced trie of nodes that has a >> maximum height of n, where n is the prefixlen the trie was created >> with. >> >> Tries may be created with prefix lengths that are multiples of 8, in >> the range from 8 to 2048. The key used for lookup and update operations >> is a struct bpf_lpm_trie_key, and the value is a uint64_t. >> >> The code carries more information about the internal implementation. >> >> Signed-off-by: Daniel Mack <daniel@zonque.org> >> Reviewed-by: David Herrmann <dh.herrmann@gmail.com> > > Thanks for working on it, and sorry for late reply. In addition to > Alexei's earlier comments on the cover letter, a few comments inline: > > [...] >> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c >> new file mode 100644 >> index 0000000..8b6a61d >> --- /dev/null >> +++ b/kernel/bpf/lpm_trie.c >> @@ -0,0 +1,468 @@ >> +/* >> + * Longest prefix match list implementation >> + * >> + * Copyright (c) 2016 Daniel Mack >> + * Copyright (c) 2016 David Herrmann >> + * >> + * This file is subject to the terms and conditions of version 2 of the GNU >> + * General Public License. See the file COPYING in the main directory of the >> + * Linux distribution for more details. 
>> + */ >> + >> +#include <linux/bpf.h> >> +#include <linux/err.h> >> +#include <linux/slab.h> >> +#include <linux/spinlock.h> >> +#include <linux/vmalloc.h> >> +#include <net/ipv6.h> >> + >> +/* Intermediate node */ >> +#define LPM_TREE_NODE_FLAG_IM BIT(0) >> + >> +struct lpm_trie_node; >> + >> +struct lpm_trie_node { >> + struct rcu_head rcu; >> + struct lpm_trie_node __rcu *child[2]; >> + u32 prefixlen; >> + u32 flags; >> + u64 value; >> + u8 data[0]; >> +}; >> + >> +struct lpm_trie { >> + struct bpf_map map; >> + struct lpm_trie_node __rcu *root; >> + size_t n_entries; >> + size_t max_prefixlen; >> + size_t data_size; >> + spinlock_t lock; >> +}; >> + > [...] >> + >> +static inline int extract_bit(const u8 *data, size_t index) >> +{ >> + return !!(data[index / 8] & (1 << (7 - (index % 8)))); >> +} >> + > [...] >> + >> +static struct lpm_trie_node *lpm_trie_node_alloc(size_t data_size) >> +{ >> + return kmalloc(sizeof(struct lpm_trie_node) + data_size, >> + GFP_ATOMIC | __GFP_NOWARN); >> +} >> + >> +/* Called from syscall or from eBPF program */ >> +static int trie_update_elem(struct bpf_map *map, >> + void *_key, void *value, u64 flags) >> +{ >> + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); >> + struct lpm_trie_node *node, *im_node, *new_node = NULL; >> + struct lpm_trie_node __rcu **slot; >> + struct bpf_lpm_trie_key *key = _key; >> + unsigned int next_bit; >> + size_t matchlen = 0; >> + int ret = 0; > > We should guard for future map flags here: > > if (unlikely(flags > BPF_EXIST)) > return -EINVAL; > > And further below we'd need to check for BPF_{NO,}EXIST when replacing > resp. adding the node? > >> + if (key->prefixlen > trie->max_prefixlen) >> + return -EINVAL; >> + >> + spin_lock(&trie->lock); > > That spin lock would need to be converted to a raw lock, see commit > ac00881f9221 ("bpf: convert hashtab lock to raw lock"). 
The comment > in htab also mentions that bpf_map_update_elem() can be called in > irq context (I assume as a map from tracing side?), so we'd need to > use the *_irqsave variants here as well. > >> + /* Allocate and fill a new node */ >> + >> + if (trie->n_entries == trie->map.max_entries) { >> + ret = -ENOSPC; >> + goto out; >> + } >> + >> + new_node = lpm_trie_node_alloc(trie->data_size); >> + if (!new_node) { >> + ret = -ENOMEM; >> + goto out; >> + } >> + >> + trie->n_entries++; >> + new_node->value = *(u64 *) value; >> + new_node->prefixlen = key->prefixlen; >> + new_node->flags = 0; >> + new_node->child[0] = NULL; >> + new_node->child[1] = NULL; > > Should this be ... > > RCU_INIT_POINTER(new_node->child[0], NULL); > RCU_INIT_POINTER(new_node->child[1], NULL); > >> + memcpy(new_node->data, key->data, trie->data_size); >> + >> + /* >> + * Now find a slot to attach the new node. To do that, walk the tree >> + * from the root match as many bits as possible for each node until we >> + * either find an empty slot or a slot that needs to be replaced by an >> + * intermediate node. >> + */ >> + slot = &trie->root; >> + >> + while ((node = rcu_dereference_protected(*slot, >> + lockdep_is_held(&trie->lock)))) { >> + matchlen = longest_prefix_match(trie, node, key); >> + >> + if (node->prefixlen != matchlen || >> + node->prefixlen == key->prefixlen || >> + node->prefixlen == trie->max_prefixlen) >> + break; >> + >> + next_bit = extract_bit(key->data, node->prefixlen); >> + slot = &node->child[next_bit]; >> + } >> + >> + /* >> + * If the slot is empty (a free child pointer or an empty root), >> + * simply assign the @new_node to that slot and be done. >> + */ >> + if (!node) { >> + rcu_assign_pointer(*slot, new_node); >> + goto out; >> + } >> + >> + /* >> + * If the slot we picked already exists, replace it with @new_node >> + * which already has the correct data array and value set. 
>> + */ >> + if (node->prefixlen == matchlen) { >> + new_node->child[0] = node->child[0]; >> + new_node->child[1] = node->child[1]; >> + >> + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) >> + trie->n_entries--; >> + >> + rcu_assign_pointer(*slot, new_node); >> + kfree_rcu(node, rcu); >> + >> + goto out; >> + } >> + >> + /* >> + * If the new node matches the prefix completely, it must be an >> + * inserted as an ancestor. Simply insert it between @node and @*slot. >> + */ >> + if (matchlen == key->prefixlen) { >> + next_bit = extract_bit(node->data, matchlen); >> + rcu_assign_pointer(new_node->child[next_bit], node); >> + rcu_assign_pointer(*slot, new_node); >> + goto out; >> + } >> + >> + im_node = lpm_trie_node_alloc(trie->data_size); >> + if (!im_node) { >> + ret = -ENOMEM; >> + goto out; >> + } >> + >> + im_node->prefixlen = matchlen; >> + im_node->flags |= LPM_TREE_NODE_FLAG_IM; >> + memcpy(im_node->data, node->data, trie->data_size); >> + >> + /* Now determine which child to install in which slot */ >> + if (extract_bit(key->data, matchlen)) { >> + rcu_assign_pointer(im_node->child[0], node); >> + rcu_assign_pointer(im_node->child[1], new_node); >> + } else { >> + rcu_assign_pointer(im_node->child[0], new_node); >> + rcu_assign_pointer(im_node->child[1], node); >> + } >> + >> + /* Finally, assign the intermediate node to the determined spot */ >> + rcu_assign_pointer(*slot, im_node); >> + >> +out: >> + if (ret) { >> + if (new_node) >> + trie->n_entries--; >> + >> + kfree(new_node); >> + kfree(im_node); >> + } >> + >> + spin_unlock(&trie->lock); >> + >> + return ret; >> +} >> + >> +static struct bpf_map *trie_alloc(union bpf_attr *attr) >> +{ >> + struct lpm_trie *trie; >> + >> + /* check sanity of attributes */ >> + if (attr->max_entries == 0 || attr->map_flags || >> + attr->key_size < sizeof(struct bpf_lpm_trie_key) + 1 || >> + attr->key_size > sizeof(struct bpf_lpm_trie_key) + 256 || >> + attr->value_size != sizeof(u64)) >> + return ERR_PTR(-EINVAL); > > The 
correct attr->map_flags test here would need to be ... > > attr->map_flags != BPF_F_NO_PREALLOC > > ... since in this case we don't have any prealloc pool, and > should that come one day that test could be relaxed again. > >> + trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); >> + if (!trie) >> + return NULL; Ohh and this needs to be return ERR_PTR(-ENOMEM), otherwise find_and_alloc_map() will pass a NULL map onwards and we get a NULL ptr deref as a result. >> + >> + /* copy mandatory map attributes */ >> + trie->map.map_type = attr->map_type; >> + trie->map.key_size = attr->key_size; >> + trie->map.value_size = attr->value_size; >> + trie->map.max_entries = attr->max_entries; > > You also need to fill in trie->map.pages as that is eventually > used to charge memory against in bpf_map_charge_memlock(), right > now that would remain as 0 meaning the map is not accounted for. > >> + trie->data_size = attr->key_size - >> + offsetof(struct bpf_lpm_trie_key, data); >> + trie->max_prefixlen = trie->data_size * 8; >> + >> + spin_lock_init(&trie->lock); >> + >> + return &trie->map; >> +} >> + >> +static void trie_free(struct bpf_map *map) >> +{ >> + struct lpm_trie_node __rcu **slot; >> + struct lpm_trie_node *node; >> + struct lpm_trie *trie = >> + container_of(map, struct lpm_trie, map); >> + >> + spin_lock(&trie->lock); >> + >> + /* >> + * Always start at the root and walk down to a node that has no >> + * children. Then free that node, nullify its pointer in the parent, >> + * then start over. >> + */ >> + >> + for (;;) { >> + slot = &trie->root; >> + >> + for (;;) { >> + node = rcu_dereference_protected(*slot, >> + lockdep_is_held(&trie->lock)); >> + if (!node) >> + goto out; >> + >> + if (node->child[0]) { > > rcu_access_pointer(node->child[0]) (at least to keep sparse happy?) > >> + slot = &node->child[0]; >> + continue; >> + } >> + >> + if (node->child[1]) { > > Here too? 
> >> + slot = &node->child[1]; >> + continue; >> + } >> + >> + kfree(node); >> + rcu_assign_pointer(*slot, NULL); > > RCU_INIT_POINTER(*slot, NULL) > >> + break; >> + } >> + } >> + >> +out: >> + spin_unlock(&trie->lock); >> +} >> + >> +static const struct bpf_map_ops trie_ops = { >> + .map_alloc = trie_alloc, >> + .map_free = trie_free, >> + .map_lookup_elem = trie_lookup_elem, >> + .map_update_elem = trie_update_elem, > > delete ops still planned to add? > >> +}; >> + >> +static struct bpf_map_type_list trie_type __read_mostly = { >> + .ops = &trie_ops, >> + .type = BPF_MAP_TYPE_LPM_TRIE, >> +}; >> + >> +static int __init register_trie_map(void) >> +{ >> + bpf_register_map_type(&trie_type); >> + return 0; >> +} >> +late_initcall(register_trie_map); > > Thanks, > Daniel
On 01/05/2017 05:25 PM, Daniel Borkmann wrote: > On 12/29/2016 06:28 PM, Daniel Mack wrote: >> This trie implements a longest prefix match algorithm that can be used >> to match IP addresses to a stored set of ranges. >> >> Internally, data is stored in an unbalanced trie of nodes that has a >> maximum height of n, where n is the prefixlen the trie was created >> with. >> >> Tries may be created with prefix lengths that are multiples of 8, in >> the range from 8 to 2048. The key used for lookup and update operations >> is a struct bpf_lpm_trie_key, and the value is a uint64_t. >> >> The code carries more information about the internal implementation. >> >> Signed-off-by: Daniel Mack <daniel@zonque.org> >> Reviewed-by: David Herrmann <dh.herrmann@gmail.com> > > Thanks for working on it, and sorry for late reply. In addition to > Alexei's earlier comments on the cover letter, a few comments inline: > [...] >> +static struct bpf_map *trie_alloc(union bpf_attr *attr) >> +{ >> + struct lpm_trie *trie; >> + >> + /* check sanity of attributes */ >> + if (attr->max_entries == 0 || attr->map_flags || >> + attr->key_size < sizeof(struct bpf_lpm_trie_key) + 1 || >> + attr->key_size > sizeof(struct bpf_lpm_trie_key) + 256 || >> + attr->value_size != sizeof(u64)) >> + return ERR_PTR(-EINVAL); One more question on this regarding value size as u64 (perhaps I missed it along the way): reason this was chosen was because for keeping stats? Why not making user choose a size as in other maps, so also custom structs could be stored there? Thanks, Daniel
Hi Daniel, Thanks for your feedback! I agree on all points. Two questions below. On 01/05/2017 05:25 PM, Daniel Borkmann wrote: > On 12/29/2016 06:28 PM, Daniel Mack wrote: >> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c >> new file mode 100644 >> index 0000000..8b6a61d >> --- /dev/null >> +++ b/kernel/bpf/lpm_trie.c [..] >> +static struct bpf_map *trie_alloc(union bpf_attr *attr) >> +{ >> + struct lpm_trie *trie; >> + >> + /* check sanity of attributes */ >> + if (attr->max_entries == 0 || attr->map_flags || >> + attr->key_size < sizeof(struct bpf_lpm_trie_key) + 1 || >> + attr->key_size > sizeof(struct bpf_lpm_trie_key) + 256 || >> + attr->value_size != sizeof(u64)) >> + return ERR_PTR(-EINVAL); > > The correct attr->map_flags test here would need to be ... > > attr->map_flags != BPF_F_NO_PREALLOC > > ... since in this case we don't have any prealloc pool, and > should that come one day that test could be relaxed again. > >> + trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); >> + if (!trie) >> + return NULL; >> + >> + /* copy mandatory map attributes */ >> + trie->map.map_type = attr->map_type; >> + trie->map.key_size = attr->key_size; >> + trie->map.value_size = attr->value_size; >> + trie->map.max_entries = attr->max_entries; > > You also need to fill in trie->map.pages as that is eventually > used to charge memory against in bpf_map_charge_memlock(), right > now that would remain as 0 meaning the map is not accounted for. Hmm, okay. The nodes are, however, allocated dynamically at runtime in this case. That means that we have trie->map.pages on each allocation, right? >> +static void trie_free(struct bpf_map *map) >> +{ >> + struct lpm_trie_node __rcu **slot; >> + struct lpm_trie_node *node; >> + struct lpm_trie *trie = >> + container_of(map, struct lpm_trie, map); >> + >> + spin_lock(&trie->lock); >> + >> + /* >> + * Always start at the root and walk down to a node that has no >> + * children. 
Then free that node, nullify its pointer in the parent, >> + * then start over. >> + */ >> + >> + for (;;) { >> + slot = &trie->root; >> + >> + for (;;) { >> + node = rcu_dereference_protected(*slot, >> + lockdep_is_held(&trie->lock)); >> + if (!node) >> + goto out; >> + >> + if (node->child[0]) { > > rcu_access_pointer(node->child[0]) (at least to keep sparse happy?) Done, but sparse does not actually complain here. Thanks, Daniel
Hi, On 01/05/2017 09:01 PM, Daniel Borkmann wrote: > On 01/05/2017 05:25 PM, Daniel Borkmann wrote: >> On 12/29/2016 06:28 PM, Daniel Mack wrote: > [...] >>> +static struct bpf_map *trie_alloc(union bpf_attr *attr) >>> +{ >>> + struct lpm_trie *trie; >>> + >>> + /* check sanity of attributes */ >>> + if (attr->max_entries == 0 || attr->map_flags || >>> + attr->key_size < sizeof(struct bpf_lpm_trie_key) + 1 || >>> + attr->key_size > sizeof(struct bpf_lpm_trie_key) + 256 || >>> + attr->value_size != sizeof(u64)) >>> + return ERR_PTR(-EINVAL); > > One more question on this regarding value size as u64 (perhaps I > missed it along the way): reason this was chosen was because for > keeping stats? Why not making user choose a size as in other maps, > so also custom structs could be stored there? In my use case, the actual value of a node is in fact ignored, all that matters is whether a node exists in a trie or not. The test code uses u64 for its tests. I can change it around so that the value size can be defined by userspace, but ideally it would also support 0-byte lengths then. The bpf map syscall handler should handle the latter just fine if I read the code correctly? Thanks, Daniel
Hi Daniel, On 01/05/2017 09:04 PM, Daniel Mack wrote: > On 01/05/2017 05:25 PM, Daniel Borkmann wrote: >> On 12/29/2016 06:28 PM, Daniel Mack wrote: > >>> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c >>> new file mode 100644 >>> index 0000000..8b6a61d >>> --- /dev/null >>> +++ b/kernel/bpf/lpm_trie.c > > [..] > >>> +static struct bpf_map *trie_alloc(union bpf_attr *attr) >>> +{ >>> + struct lpm_trie *trie; >>> + >>> + /* check sanity of attributes */ >>> + if (attr->max_entries == 0 || attr->map_flags || >>> + attr->key_size < sizeof(struct bpf_lpm_trie_key) + 1 || >>> + attr->key_size > sizeof(struct bpf_lpm_trie_key) + 256 || >>> + attr->value_size != sizeof(u64)) >>> + return ERR_PTR(-EINVAL); >> >> The correct attr->map_flags test here would need to be ... >> >> attr->map_flags != BPF_F_NO_PREALLOC >> >> ... since in this case we don't have any prealloc pool, and >> should that come one day that test could be relaxed again. >> >>> + trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); >>> + if (!trie) >>> + return NULL; >>> + >>> + /* copy mandatory map attributes */ >>> + trie->map.map_type = attr->map_type; >>> + trie->map.key_size = attr->key_size; >>> + trie->map.value_size = attr->value_size; >>> + trie->map.max_entries = attr->max_entries; >> >> You also need to fill in trie->map.pages as that is eventually >> used to charge memory against in bpf_map_charge_memlock(), right >> now that would remain as 0 meaning the map is not accounted for. > > Hmm, okay. The nodes are, however, allocated dynamically at runtime in > this case. That means that we have trie->map.pages on each allocation, > right? The current scheme (f.e. htab_map_alloc() has some details, although probably not too obvious) that was done charges worst-case cost up front, so it would be in trie_alloc() where you fill map.pages and map_create() will later account for them. Thanks, Daniel
On 01/05/2017 09:14 PM, Daniel Mack wrote: [...] > In my use case, the actual value of a node is in fact ignored, all that > matters is whether a node exists in a trie or not. The test code uses > u64 for its tests. > > I can change it around so that the value size can be defined by > userspace, but ideally it would also support 0-byte lengths then. The > bpf map syscall handler should handle the latter just fine if I read the > code correctly? Right now no map is allowed to have value size of 0, but since kmalloc() would return ZERO_SIZE_PTR in such case, it looks like it should work^tm, although I haven't checked whether it's guaranteed that all the copy_{from,to}_user() implementations work with 0 size as well and whether ubsan would complain on the ZERO_SIZE_PTR for memcpy() etc. Perhaps better to reject value size of 0 initially and later on follow up with making the syscall code more robust for such cases (afaik, for the htab this was also on todo.)? Thanks, Daniel
On 1/6/17 2:43 AM, Daniel Borkmann wrote: > On 01/05/2017 09:14 PM, Daniel Mack wrote: > [...] >> In my use case, the actual value of a node is in fact ignored, all that >> matters is whether a node exists in a trie or not. The test code uses >> u64 for its tests. >> >> I can change it around so that the value size can be defined by >> userspace, but ideally it would also support 0-byte lengths then. The >> bpf map syscall handler should handle the latter just fine if I read the >> code correctly? > > Right now no map is allowed to have value size of 0, but since kmalloc() > would return ZERO_SIZE_PTR in such case, it looks like it should > work^tm, although I haven't checked whether it's guaranteed that all > the copy_{from,to}_user() implementations work with 0 size as well > and whether ubsan would complain on the ZERO_SIZE_PTR for memcpy() etc. > Perhaps better to reject value size of 0 initially and later on follow > up with making the syscall code more robust for such cases (afaik, for > the htab this was also on todo.)? yes. the support for value_size=0 was on todo list pretty much as soon as htab was introduced and early on the verifier was done the way to make sure such case should work as-is from bpf program point of view, but for syscall lookup/update commands I didn't want to add checks for zero value until it's actually needed. So definitely some work around syscall handling is needed. Also agree that for lpm I would check value_size > 0 initially and then relax it for hash and lpm together.
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0eb0e87..d564277 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -63,6 +63,12 @@ struct bpf_insn { __s32 imm; /* signed immediate constant */ }; +/* Key of a BPF_MAP_TYPE_LPM_TRIE entry */ +struct bpf_lpm_trie_key { + __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ + __u8 data[0]; /* Arbitrary size */ +}; + /* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { BPF_MAP_CREATE, @@ -89,6 +95,7 @@ enum bpf_map_type { BPF_MAP_TYPE_CGROUP_ARRAY, BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + BPF_MAP_TYPE_LPM_TRIE, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 1276474..e1ce4f4 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c new file mode 100644 index 0000000..8b6a61d --- /dev/null +++ b/kernel/bpf/lpm_trie.c @@ -0,0 +1,468 @@ +/* + * Longest prefix match list implementation + * + * Copyright (c) 2016 Daniel Mack + * Copyright (c) 2016 David Herrmann + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. 
+ */ + +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <net/ipv6.h> + +/* Intermediate node */ +#define LPM_TREE_NODE_FLAG_IM BIT(0) + +struct lpm_trie_node; + +struct lpm_trie_node { + struct rcu_head rcu; + struct lpm_trie_node __rcu *child[2]; + u32 prefixlen; + u32 flags; + u64 value; + u8 data[0]; +}; + +struct lpm_trie { + struct bpf_map map; + struct lpm_trie_node __rcu *root; + size_t n_entries; + size_t max_prefixlen; + size_t data_size; + spinlock_t lock; +}; + +/* + * This trie implements a longest prefix match algorithm that can be used to + * match IP addresses to a stored set of ranges. + * + * Data stored in @data of struct bpf_lpm_trie_key and struct lpm_trie_node is + * interpreted as big endian, so data[0] stores the most significant byte. + * + * Match ranges are internally stored in instances of struct lpm_trie_node + * which each contain their prefix length as well as two pointers that may + * lead to more nodes containing more specific matches. Each node also stores + * a value that is defined by and returned to userspace via the update_elem + * and lookup functions. + * + * For instance, let's start with a trie that was created with a prefix length + * of 32, so it can be used for IPv4 addresses, and one single element that + * matches 192.168.0.0/16. The data array would hence contain + * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will + * stick to IP-address notation for readability though. + * + * As the trie is empty initially, the new node (1) will be placed as root + * node, denoted as (R) in the example below. As there are no other nodes, both + * child pointers are %NULL. + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * + * Next, let's add a new node (2) matching 192.168.0.0/24. 
As there is already
+ * a node with the same data and a smaller prefix (ie, a less specific one),
+ * node (2) will become a child of (1). The child index depends on the next
+ * bit after the bits that (1) matches, and that bit is 0, so (2) will be
+ * child[0] of (1):
+ *
+ *              +----------------+
+ *              |       (1)  (R) |
+ *              | 192.168.0.0/16 |
+ *              |    value: 1    |
+ *              |   [0]    [1]   |
+ *              +----------------+
+ *                   |
+ *    +----------------+
+ *    |       (2)      |
+ *    | 192.168.0.0/24 |
+ *    |    value: 2    |
+ *    |   [0]    [1]   |
+ *    +----------------+
+ *
+ * The child[1] slot of (1) could be filled with another node which has bit #17
+ * (the next bit after the ones that (1) matches on) set to 1. For instance,
+ * 192.168.128.0/24:
+ *
+ *              +----------------+
+ *              |       (1)  (R) |
+ *              | 192.168.0.0/16 |
+ *              |    value: 1    |
+ *              |   [0]    [1]   |
+ *              +----------------+
+ *                   |      |
+ *    +----------------+  +------------------+
+ *    |       (2)      |  |        (3)       |
+ *    | 192.168.0.0/24 |  | 192.168.128.0/24 |
+ *    |    value: 2    |  |     value: 3     |
+ *    |   [0]    [1]   |  |    [0]    [1]    |
+ *    +----------------+  +------------------+
+ *
+ * Let's add another node (5) to the game for 192.168.1.0/24. In order to place
+ * it, node (1) is looked at first, and because of the semantics laid out
+ * above (bit #17 is 0), it would normally be attached to (1) as child[0].
+ * However, that slot is already allocated, so a new node is needed in between.
+ * That node does not have a value attached to it and it will never be
+ * returned to users as a result of a lookup. It is only there to differentiate
+ * the traversal further.
+ * It will get a prefix as wide as necessary to
+ * distinguish its two children:
+ *
+ *                  +----------------+
+ *                  |       (1)  (R) |
+ *                  | 192.168.0.0/16 |
+ *                  |    value: 1    |
+ *                  |   [0]    [1]   |
+ *                  +----------------+
+ *                       |      |
+ *        +----------------+  +------------------+
+ *        |       (4)  (I) |  |        (3)       |
+ *        | 192.168.0.0/23 |  | 192.168.128.0/24 |
+ *        |   value: ---   |  |     value: 3     |
+ *        |   [0]    [1]   |  |    [0]    [1]    |
+ *        +----------------+  +------------------+
+ *             |      |
+ *  +----------------+  +----------------+
+ *  |       (2)      |  |       (5)      |
+ *  | 192.168.0.0/24 |  | 192.168.1.0/24 |
+ *  |    value: 2    |  |    value: 5    |
+ *  |   [0]    [1]   |  |   [0]    [1]   |
+ *  +----------------+  +----------------+
+ *
+ * 192.168.1.1/32 would be a child of (5) etc.
+ *
+ * An intermediate node will be turned into a 'real' node on demand. In the
+ * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie.
+ *
+ * A fully populated trie would have a height of 32 nodes, as the trie was
+ * created with a prefix length of 32.
+ *
+ * The lookup starts at the root node. If the current node matches and if there
+ * is a child that can be used to become more specific, the trie is traversed
+ * downwards. The last node in the traversal that is a non-intermediate one is
+ * returned.
+ */
+
+/* Extract bit number @index from @data, counting from the MSB of data[0] */
+static inline int extract_bit(const u8 *data, size_t index)
+{
+	return !!(data[index / 8] & (1 << (7 - (index % 8))));
+}
+
+/**
+ * longest_prefix_match() - determine the longest prefix
+ * @trie: The trie to get internal sizes from
+ * @node: The node to operate on
+ * @key: The key to compare to @node
+ *
+ * Determine the longest prefix of @node that matches the bits in @key.
+ */
+static size_t longest_prefix_match(const struct lpm_trie *trie,
+				   const struct lpm_trie_node *node,
+				   const struct bpf_lpm_trie_key *key)
+{
+	size_t prefixlen = 0;
+	int i;
+
+	for (i = 0; i < trie->data_size; i++) {
+		size_t b;
+
+		/* 8 - fls(x ^ y) is the number of leading bits that are
+		 * equal in this byte pair (8 if the bytes match entirely).
+		 */
+		b = 8 - fls(node->data[i] ^ key->data[i]);
+		prefixlen += b;
+
+		if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen)
+			return min(node->prefixlen, key->prefixlen);
+
+		if (b < 8)
+			break;
+	}
+
+	return prefixlen;
+}
+
+/* Called from syscall or from eBPF program */
+static void *trie_lookup_elem(struct bpf_map *map, void *_key)
+{
+	struct lpm_trie_node *node, *found = NULL;
+	struct bpf_lpm_trie_key *key = _key;
+	struct lpm_trie *trie =
+		container_of(map, struct lpm_trie, map);
+
+	/* Start walking the trie from the root node ... */
+
+	for (node = rcu_dereference(trie->root); node;) {
+		unsigned int next_bit;
+		size_t matchlen;
+
+		/*
+		 * Determine the longest prefix of @node that matches @key.
+		 * If it's the maximum possible prefix for this trie, we have
+		 * an exact match and can return it directly.
+		 */
+		matchlen = longest_prefix_match(trie, node, key);
+		if (matchlen == trie->max_prefixlen)
+			return &node->value;
+
+		/*
+		 * If the number of bits that match is smaller than the prefix
+		 * length of @node, bail out and return the node we have seen
+		 * last in the traversal (ie, the parent).
+		 */
+		if (matchlen < node->prefixlen)
+			break;
+
+		/*
+		 * Consider this node as return candidate unless it is an
+		 * artificially added intermediate one.
+		 */
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+			found = node;
+
+		/*
+		 * If the node match is fully satisfied, let's see if we can
+		 * become more specific. Determine the next bit in the key and
+		 * traverse down.
+		 */
+		next_bit = extract_bit(key->data, node->prefixlen);
+		node = rcu_dereference(node->child[next_bit]);
+	}
+
+	return found ? &found->value : NULL;
+}
+
+/* Allocate a node with room for @data_size prefix bytes. The caller must
+ * initialize all fields; kmalloc() does not zero the memory.
+ */
+static struct lpm_trie_node *lpm_trie_node_alloc(size_t data_size)
+{
+	return kmalloc(sizeof(struct lpm_trie_node) + data_size,
+		       GFP_ATOMIC | __GFP_NOWARN);
+}
+
+/* Called from syscall or from eBPF program */
+static int trie_update_elem(struct bpf_map *map,
+			    void *_key, void *value, u64 flags)
+{
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
+	struct lpm_trie_node __rcu **slot;
+	struct bpf_lpm_trie_key *key = _key;
+	unsigned long irq_flags;
+	unsigned int next_bit;
+	size_t matchlen = 0;
+	int ret = 0;
+
+	/* Reject unknown update flags so they can be given a meaning later
+	 * without silently changing behavior for existing binaries.
+	 */
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+
+	if (key->prefixlen > trie->max_prefixlen)
+		return -EINVAL;
+
+	/* bpf_map_update_elem() can be called in irq context (e.g. from the
+	 * tracing side), so interrupts must be disabled while the lock is
+	 * held. NOTE(review): converting trie->lock to a raw_spinlock_t, as
+	 * done for the hashtab map, should follow in the struct definition.
+	 */
+	spin_lock_irqsave(&trie->lock, irq_flags);
+
+	/* Allocate and fill a new node */
+
+	if (trie->n_entries == trie->map.max_entries) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	new_node = lpm_trie_node_alloc(trie->data_size);
+	if (!new_node) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	trie->n_entries++;
+	new_node->value = *(u64 *)value;
+	new_node->prefixlen = key->prefixlen;
+	new_node->flags = 0;
+	new_node->child[0] = NULL;
+	new_node->child[1] = NULL;
+	memcpy(new_node->data, key->data, trie->data_size);
+
+	/*
+	 * Now find a slot to attach the new node. To do that, walk the tree
+	 * from the root and match as many bits as possible for each node
+	 * until we either find an empty slot or a slot that needs to be
+	 * replaced by an intermediate node.
+	 */
+	slot = &trie->root;
+
+	while ((node = rcu_dereference_protected(*slot,
+					lockdep_is_held(&trie->lock)))) {
+		matchlen = longest_prefix_match(trie, node, key);
+
+		if (node->prefixlen != matchlen ||
+		    node->prefixlen == key->prefixlen ||
+		    node->prefixlen == trie->max_prefixlen)
+			break;
+
+		next_bit = extract_bit(key->data, node->prefixlen);
+		slot = &node->child[next_bit];
+	}
+
+	/*
+	 * If the slot is empty (a free child pointer or an empty root),
+	 * simply assign the @new_node to that slot and be done.
+	 */
+	if (!node) {
+		rcu_assign_pointer(*slot, new_node);
+		goto out;
+	}
+
+	/*
+	 * If the slot we picked already exists, replace it with @new_node
+	 * which already has the correct data array and value set.
+	 */
+	if (node->prefixlen == matchlen) {
+		new_node->child[0] = node->child[0];
+		new_node->child[1] = node->child[1];
+
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+			trie->n_entries--;
+
+		rcu_assign_pointer(*slot, new_node);
+		kfree_rcu(node, rcu);
+
+		goto out;
+	}
+
+	/*
+	 * If the new node matches the prefix completely, it must be inserted
+	 * as an ancestor. Simply insert it between @node and @*slot.
+	 */
+	if (matchlen == key->prefixlen) {
+		next_bit = extract_bit(node->data, matchlen);
+		rcu_assign_pointer(new_node->child[next_bit], node);
+		rcu_assign_pointer(*slot, new_node);
+		goto out;
+	}
+
+	im_node = lpm_trie_node_alloc(trie->data_size);
+	if (!im_node) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	im_node->prefixlen = matchlen;
+	/* The node is kmalloc'ed, so assign the flags instead of OR'ing
+	 * into uninitialized memory, and give @value a defined content
+	 * even though it is never returned for intermediate nodes.
+	 */
+	im_node->flags = LPM_TREE_NODE_FLAG_IM;
+	im_node->value = 0;
+	memcpy(im_node->data, node->data, trie->data_size);
+
+	/* Now determine which child to install in which slot */
+	if (extract_bit(key->data, matchlen)) {
+		rcu_assign_pointer(im_node->child[0], node);
+		rcu_assign_pointer(im_node->child[1], new_node);
+	} else {
+		rcu_assign_pointer(im_node->child[0], new_node);
+		rcu_assign_pointer(im_node->child[1], node);
+	}
+
+	/* Finally, assign the intermediate node to the determined spot */
+	rcu_assign_pointer(*slot, im_node);
+
+out:
+	if (ret) {
+		if (new_node)
+			trie->n_entries--;
+
+		/* Both pointers are NULL-initialized, so kfree() is safe on
+		 * the error paths that never allocated them.
+		 */
+		kfree(new_node);
+		kfree(im_node);
+	}
+
+	spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+	return ret;
+}
+
+static struct bpf_map *trie_alloc(union bpf_attr *attr)
+{
+	struct lpm_trie *trie;
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->map_flags ||
+	    attr->key_size < sizeof(struct bpf_lpm_trie_key) + 1 ||
+	    attr->key_size > sizeof(struct bpf_lpm_trie_key) + 256 ||
+	    attr->value_size != sizeof(u64))
+		return ERR_PTR(-EINVAL);
+
+	trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN);
+	if (!trie)
+		/* The map_alloc contract is ERR_PTR, not NULL: callers check
+		 * the result with IS_ERR() and would dereference a NULL map.
+		 */
+		return ERR_PTR(-ENOMEM);
+
+	/* copy mandatory map attributes */
+	trie->map.map_type = attr->map_type;
+	trie->map.key_size = attr->key_size;
+	trie->map.value_size = attr->value_size;
+	trie->map.max_entries = attr->max_entries;
+	trie->data_size = attr->key_size -
+			  offsetof(struct bpf_lpm_trie_key, data);
+	trie->max_prefixlen = trie->data_size * 8;
+
+	spin_lock_init(&trie->lock);
+
+	return &trie->map;
+}
+
+static void trie_free(struct bpf_map *map)
+{
+	struct lpm_trie_node __rcu **slot;
+	struct lpm_trie_node *node;
+	struct lpm_trie *trie =
+		container_of(map, struct lpm_trie, map);
+
+	spin_lock(&trie->lock);
+
+	/*
+	 * Always start at the root and walk down to a node that has no
+	 * children. Then free that node, nullify its pointer in the parent,
+	 * then start over.
+	 */
+
+	for (;;) {
+		slot = &trie->root;
+
+		for (;;) {
+			node = rcu_dereference_protected(*slot,
+					lockdep_is_held(&trie->lock));
+			if (!node)
+				goto out;
+
+			if (node->child[0]) {
+				slot = &node->child[0];
+				continue;
+			}
+
+			if (node->child[1]) {
+				slot = &node->child[1];
+				continue;
+			}
+
+			/* Unlink from the parent before freeing */
+			RCU_INIT_POINTER(*slot, NULL);
+			kfree(node);
+			break;
+		}
+	}
+
+out:
+	spin_unlock(&trie->lock);
+}
+
+static const struct bpf_map_ops trie_ops = {
+	.map_alloc = trie_alloc,
+	.map_free = trie_free,
+	.map_lookup_elem = trie_lookup_elem,
+	.map_update_elem = trie_update_elem,
+};
+
+static struct bpf_map_type_list trie_type __read_mostly = {
+	.ops = &trie_ops,
+	.type = BPF_MAP_TYPE_LPM_TRIE,
+};
+
+static int __init register_trie_map(void)
+{
+	bpf_register_map_type(&trie_type);
+	return 0;
+}
+late_initcall(register_trie_map);