Message ID | 1587032223-49460-2-git-send-email-xiangxia.m.yue@gmail.com |
---|---|
State | Superseded |
Headers | show |
Series | expand meter tables and fix bug | expand |
On Sat, Apr 18, 2020 at 10:25 AM <xiangxia.m.yue@gmail.com> wrote: > > From: Tonghao Zhang <xiangxia.m.yue@gmail.com> > > In kernel datapath of Open vSwitch, there are only 1024 > buckets of meter in one dp. If installing more than 1024 > (e.g. 8192) meters, it may lead to the performance drop. > But in some case, for example, Open vSwitch used as edge > gateway, there should be 200,000+ at least, meters used for > IP address bandwidth limitation. > > [Open vSwitch userspace datapath has this issue too.] > > For more scalable meter, this patch expands the buckets > when necessary, so we can install more meters in the datapath. > Introducing the struct *dp_meter_instance*, it's easy to > expand meter though changing the *ti* point in the struct > *dp_meter_table*. > > Cc: Pravin B Shelar <pshelar@ovn.org> > Cc: Andy Zhou <azhou@ovn.org> > Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com> > --- > net/openvswitch/datapath.h | 2 +- > net/openvswitch/meter.c | 200 +++++++++++++++++++++++++++++-------- > net/openvswitch/meter.h | 15 ++- > 3 files changed, 169 insertions(+), 48 deletions(-) > > diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h > index e239a46c2f94..785105578448 100644 > --- a/net/openvswitch/datapath.h > +++ b/net/openvswitch/datapath.h > @@ -82,7 +82,7 @@ struct datapath { > u32 max_headroom; > > /* Switch meters. */ > - struct hlist_head *meters; > + struct dp_meter_table *meters; lets define it as part of this struct to avoid indirection. > }; > > /** > diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c > index 5010d1ddd4bd..494a0014ecd8 100644 > --- a/net/openvswitch/meter.c > +++ b/net/openvswitch/meter.c > @@ -19,8 +19,6 @@ > #include "datapath.h" > #include "meter.h" > > -#define METER_HASH_BUCKETS 1024 > - > static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { > [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, > [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, > @@ -39,6 +37,11 @@ static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { > [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, > }; > > +static u32 meter_hash(struct dp_meter_instance *ti, u32 id) > +{ > + return id % ti->n_meters; > +} > + > static void ovs_meter_free(struct dp_meter *meter) > { > if (!meter) > @@ -47,40 +50,141 @@ static void ovs_meter_free(struct dp_meter *meter) > kfree_rcu(meter, rcu); > } > > -static struct hlist_head *meter_hash_bucket(const struct datapath *dp, > - u32 meter_id) > -{ > - return &dp->meters[meter_id & (METER_HASH_BUCKETS - 1)]; > -} > - > /* Call with ovs_mutex or RCU read lock. */ > -static struct dp_meter *lookup_meter(const struct datapath *dp, > +static struct dp_meter *lookup_meter(const struct dp_meter_table *tbl, > u32 meter_id) > { > + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); > + u32 hash = meter_hash(ti, meter_id); > struct dp_meter *meter; > - struct hlist_head *head; > > - head = meter_hash_bucket(dp, meter_id); > - hlist_for_each_entry_rcu(meter, head, dp_hash_node, > - lockdep_ovsl_is_held()) { > - if (meter->id == meter_id) > - return meter; > - } > + meter = rcu_dereference_ovsl(ti->dp_meters[hash]); > + if (meter && likely(meter->id == meter_id)) > + return meter; > + > return NULL; > } > > -static void attach_meter(struct datapath *dp, struct dp_meter *meter) > +static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size) > +{ > + struct dp_meter_instance *ti; > + > + ti = kvzalloc(sizeof(*ti) + > + sizeof(struct dp_meter *) * size, > + GFP_KERNEL); > + if (!ti) > + return NULL; Given this is a kernel space array we need to have hard limit inplace. > + > + ti->n_meters = size; > + > + return ti; > +} > + > +static void dp_meter_instance_free(struct dp_meter_instance *ti) > +{ > + kvfree(ti); > +} > + > +static void dp_meter_instance_free_rcu(struct rcu_head *rcu) > +{ > + struct dp_meter_instance *ti; > + > + ti = container_of(rcu, struct dp_meter_instance, rcu); > + kvfree(ti); > +} > + > +static int > +dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size) > +{ > + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); > + int n_meters = min(size, ti->n_meters); > + struct dp_meter_instance *new_ti; > + int i; > + > + new_ti = dp_meter_instance_alloc(size); > + if (!new_ti) > + return -ENOMEM; > + > + for (i = 0; i < n_meters; i++) > + new_ti->dp_meters[i] = > + rcu_dereference_ovsl(ti->dp_meters[i]); > + > + rcu_assign_pointer(tbl->ti, new_ti); > + call_rcu(&ti->rcu, dp_meter_instance_free_rcu); > + > + return 0; > +} > + > +static void dp_meter_instance_insert(struct dp_meter_instance *ti, > + struct dp_meter *meter) > +{ > + u32 hash; > + > + hash = meter_hash(ti, meter->id); > + rcu_assign_pointer(ti->dp_meters[hash], meter); > +} > + > +static void dp_meter_instance_remove(struct dp_meter_instance *ti, > + struct dp_meter *meter) > { > - struct hlist_head *head = meter_hash_bucket(dp, meter->id); > + u32 hash; > > - hlist_add_head_rcu(&meter->dp_hash_node, head); > + hash = meter_hash(ti, meter->id); > + RCU_INIT_POINTER(ti->dp_meters[hash], NULL); > } > > -static void detach_meter(struct dp_meter *meter) > +static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) > { > + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); > + u32 hash = meter_hash(ti, meter->id); > + > + /* > + * In generally, slot selected should be empty, because > + * OvS uses id-pool to fetch a available id. > + */ > + if (unlikely(rcu_dereference_ovsl(ti->dp_meters[hash]))) > + return -EINVAL; we could return -EBUSY instead. > + > + dp_meter_instance_insert(ti, meter); > + > + /* That function is thread-safe. */ > + if (++tbl->count >= ti->n_meters) > + if (dp_meter_instance_realloc(tbl, ti->n_meters * 2)) > + goto expand_err; > + > + return 0; > + > +expand_err: > + dp_meter_instance_remove(ti, meter); > + tbl->count--; > + return -ENOMEM; > +} > + > +static void detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) > +{ > + struct dp_meter_instance *ti; > + > ASSERT_OVSL(); > - if (meter) > - hlist_del_rcu(&meter->dp_hash_node); > + if (!meter) > + return; > + > + ti = rcu_dereference_ovsl(tbl->ti); > + dp_meter_instance_remove(ti, meter); > + > + tbl->count--; > + > + /* Shrink the meter array if necessary. */ > + if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN && > + tbl->count <= (ti->n_meters / 4)) { > + int half_size = ti->n_meters / 2; > + int i; > + Lets add a comment about this. > + for (i = half_size; i < ti->n_meters; i++) > + if (rcu_dereference_ovsl(ti->dp_meters[i])) > + return; > + > + dp_meter_instance_realloc(tbl, half_size); > + } > } > > static struct sk_buff * > @@ -303,9 +407,13 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) > meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); > > /* Cannot fail after this. */ > - old_meter = lookup_meter(dp, meter_id); > - detach_meter(old_meter); > - attach_meter(dp, meter); > + old_meter = lookup_meter(dp->meters, meter_id); in new scheme this can fail due to hash collision, lets check for NULL. > + detach_meter(dp->meters, old_meter); > + > + err = attach_meter(dp->meters, meter); > + if (err) > + goto exit_unlock; > + > ovs_unlock(); > > /* Build response with the meter_id and stats from > @@ -365,7 +473,7 @@ static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) > } > > /* Locate meter, copy stats. */ > - meter = lookup_meter(dp, meter_id); > + meter = lookup_meter(dp->meters, meter_id); > if (!meter) { > err = -ENOENT; > goto exit_unlock; > @@ -416,13 +524,13 @@ static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) > goto exit_unlock; > } > > - old_meter = lookup_meter(dp, meter_id); > + old_meter = lookup_meter(dp->meters, meter_id); > if (old_meter) { > spin_lock_bh(&old_meter->lock); > err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); > WARN_ON(err); > spin_unlock_bh(&old_meter->lock); > - detach_meter(old_meter); > + detach_meter(dp->meters, old_meter); > } > ovs_unlock(); > ovs_meter_free(old_meter); > @@ -452,7 +560,7 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, > int i, band_exceeded_max = -1; > u32 band_exceeded_rate = 0; > > - meter = lookup_meter(dp, meter_id); > + meter = lookup_meter(dp->meters, meter_id); > /* Do not drop the packet when there is no meter. */ > if (!meter) > return false; > @@ -570,32 +678,36 @@ struct genl_family dp_meter_genl_family __ro_after_init = { > > int ovs_meters_init(struct datapath *dp) > { > - int i; > + struct dp_meter_instance *ti; > + struct dp_meter_table *tbl; > + > + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); > + if (!tbl) > + return -ENOMEM; > > - dp->meters = kmalloc_array(METER_HASH_BUCKETS, > - sizeof(struct hlist_head), GFP_KERNEL); > + tbl->count = 0; > > - if (!dp->meters) > + ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN); > + if (!ti) { > + kfree(tbl); > return -ENOMEM; > + } > > - for (i = 0; i < METER_HASH_BUCKETS; i++) > - INIT_HLIST_HEAD(&dp->meters[i]); > + rcu_assign_pointer(tbl->ti, ti); > + dp->meters = tbl; > > return 0; > } > > void ovs_meters_exit(struct datapath *dp) > { > + struct dp_meter_table *tbl = dp->meters; > + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); > int i; > > - for (i = 0; i < METER_HASH_BUCKETS; i++) { > - struct hlist_head *head = &dp->meters[i]; > - struct dp_meter *meter; > - struct hlist_node *n; > - > - hlist_for_each_entry_safe(meter, n, head, dp_hash_node) > - kfree(meter); > - } > + for (i = 0; i < ti->n_meters; i++) > + ovs_meter_free(ti->dp_meters[i]); > > - kfree(dp->meters); > + dp_meter_instance_free(ti); > + kfree(tbl); > } > diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h > index f645913870bd..d91940383bbe 100644 > --- a/net/openvswitch/meter.h > +++ b/net/openvswitch/meter.h > @@ -18,6 +18,7 @@ > struct datapath; > > #define DP_MAX_BANDS 1 > +#define DP_METER_ARRAY_SIZE_MIN (1ULL << 10) > > struct dp_meter_band { > u32 type; > @@ -30,9 +31,6 @@ struct dp_meter_band { > struct dp_meter { > spinlock_t lock; /* Per meter lock */ > struct rcu_head rcu; > - struct hlist_node dp_hash_node; /*Element in datapath->meters > - * hash table. > - */ > u32 id; > u16 kbps:1, keep_stats:1; > u16 n_bands; > @@ -42,6 +40,17 @@ struct dp_meter { > struct dp_meter_band bands[]; > }; > > +struct dp_meter_instance { > + struct rcu_head rcu; > + u32 n_meters; > + struct dp_meter __rcu *dp_meters[]; > +}; > + > +struct dp_meter_table { > + struct dp_meter_instance __rcu *ti; > + u32 count; > +}; > + > extern struct genl_family dp_meter_genl_family; > int ovs_meters_init(struct datapath *dp); > void ovs_meters_exit(struct datapath *dp); > -- > 2.23.0 >
On Mon, Apr 20, 2020 at 1:29 AM Pravin Shelar <pravin.ovn@gmail.com> wrote: > > On Sat, Apr 18, 2020 at 10:25 AM <xiangxia.m.yue@gmail.com> wrote: > > > > From: Tonghao Zhang <xiangxia.m.yue@gmail.com> > > > > In kernel datapath of Open vSwitch, there are only 1024 > > buckets of meter in one dp. If installing more than 1024 > > (e.g. 8192) meters, it may lead to the performance drop. > > But in some case, for example, Open vSwitch used as edge > > gateway, there should be 200,000+ at least, meters used for > > IP address bandwidth limitation. > > > > [Open vSwitch userspace datapath has this issue too.] > > > > For more scalable meter, this patch expands the buckets > > when necessary, so we can install more meters in the datapath. > > Introducing the struct *dp_meter_instance*, it's easy to > > expand meter though changing the *ti* point in the struct > > *dp_meter_table*. > > > > Cc: Pravin B Shelar <pshelar@ovn.org> > > Cc: Andy Zhou <azhou@ovn.org> > > Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com> > > --- > > net/openvswitch/datapath.h | 2 +- > > net/openvswitch/meter.c | 200 +++++++++++++++++++++++++++++-------- > > net/openvswitch/meter.h | 15 ++- > > 3 files changed, 169 insertions(+), 48 deletions(-) > > > > diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h > > index e239a46c2f94..785105578448 100644 > > --- a/net/openvswitch/datapath.h > > +++ b/net/openvswitch/datapath.h > > @@ -82,7 +82,7 @@ struct datapath { > > u32 max_headroom; > > > > /* Switch meters. */ > > - struct hlist_head *meters; > > + struct dp_meter_table *meters; > lets define it as part of this struct to avoid indirection. > > > }; > > > > /** > > diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c > > index 5010d1ddd4bd..494a0014ecd8 100644 > > --- a/net/openvswitch/meter.c > > +++ b/net/openvswitch/meter.c > > @@ -19,8 +19,6 @@ > > #include "datapath.h" > > #include "meter.h" > > > > -#define METER_HASH_BUCKETS 1024 > > - > > static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { > > [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, > > [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, > > @@ -39,6 +37,11 @@ static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { > > [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, > > }; > > > > +static u32 meter_hash(struct dp_meter_instance *ti, u32 id) > > +{ > > + return id % ti->n_meters; > > +} > > + > > static void ovs_meter_free(struct dp_meter *meter) > > { > > if (!meter) > > @@ -47,40 +50,141 @@ static void ovs_meter_free(struct dp_meter *meter) > > kfree_rcu(meter, rcu); > > } > > > > -static struct hlist_head *meter_hash_bucket(const struct datapath *dp, > > - u32 meter_id) > > -{ > > - return &dp->meters[meter_id & (METER_HASH_BUCKETS - 1)]; > > -} > > - > > /* Call with ovs_mutex or RCU read lock. */ > > -static struct dp_meter *lookup_meter(const struct datapath *dp, > > +static struct dp_meter *lookup_meter(const struct dp_meter_table *tbl, > > u32 meter_id) > > { > > + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); > > + u32 hash = meter_hash(ti, meter_id); > > struct dp_meter *meter; > > - struct hlist_head *head; > > > > - head = meter_hash_bucket(dp, meter_id); > > - hlist_for_each_entry_rcu(meter, head, dp_hash_node, > > - lockdep_ovsl_is_held()) { > > - if (meter->id == meter_id) > > - return meter; > > - } > > + meter = rcu_dereference_ovsl(ti->dp_meters[hash]); > > + if (meter && likely(meter->id == meter_id)) > > + return meter; > > + > > return NULL; > > } > > > > -static void attach_meter(struct datapath *dp, struct dp_meter *meter) > > +static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size) > > +{ > > + struct dp_meter_instance *ti; > > + > > + ti = kvzalloc(sizeof(*ti) + > > + sizeof(struct dp_meter *) * size, > > + GFP_KERNEL); > > + if (!ti) > > + return NULL; > Given this is a kernel space array we need to have hard limit inplace. In patch 2, I limited the meter number, should we add hard limit here ? > > + > > + ti->n_meters = size; > > + > > + return ti; > > +} > > + > > +static void dp_meter_instance_free(struct dp_meter_instance *ti) > > +{ > > + kvfree(ti); > > +} > > + > > +static void dp_meter_instance_free_rcu(struct rcu_head *rcu) > > +{ > > + struct dp_meter_instance *ti; > > + > > + ti = container_of(rcu, struct dp_meter_instance, rcu); > > + kvfree(ti); > > +} > > + > > +static int > > +dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size) > > +{ > > + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); > > + int n_meters = min(size, ti->n_meters); > > + struct dp_meter_instance *new_ti; > > + int i; > > + > > + new_ti = dp_meter_instance_alloc(size); > > + if (!new_ti) > > + return -ENOMEM; > > + > > + for (i = 0; i < n_meters; i++) > > + new_ti->dp_meters[i] = > > + rcu_dereference_ovsl(ti->dp_meters[i]); > > + > > + rcu_assign_pointer(tbl->ti, new_ti); > > + call_rcu(&ti->rcu, dp_meter_instance_free_rcu); > > + > > + return 0; > > +} > > + > > +static void dp_meter_instance_insert(struct dp_meter_instance *ti, > > + struct dp_meter *meter) > > +{ > > + u32 hash; > > + > > + hash = meter_hash(ti, meter->id); > > + rcu_assign_pointer(ti->dp_meters[hash], meter); > > +} > > + > > +static void dp_meter_instance_remove(struct dp_meter_instance *ti, > > + struct dp_meter *meter) > > { > > - struct hlist_head *head = meter_hash_bucket(dp, meter->id); > > + u32 hash; > > > > - hlist_add_head_rcu(&meter->dp_hash_node, head); > > + hash = meter_hash(ti, meter->id); > > + RCU_INIT_POINTER(ti->dp_meters[hash], NULL); > > } > > > > -static void detach_meter(struct dp_meter *meter) > > +static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) > > { > > + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); > > + u32 hash = meter_hash(ti, meter->id); > > + > > + /* > > + * In generally, slot selected should be empty, because > > + * OvS uses id-pool to fetch a available id. > > + */ > > + if (unlikely(rcu_dereference_ovsl(ti->dp_meters[hash]))) > > + return -EINVAL; > we could return -EBUSY instead. > > + > > + dp_meter_instance_insert(ti, meter); > > + > > + /* That function is thread-safe. */ > > + if (++tbl->count >= ti->n_meters) > > + if (dp_meter_instance_realloc(tbl, ti->n_meters * 2)) > > + goto expand_err; > > + > > + return 0; > > + > > +expand_err: > > + dp_meter_instance_remove(ti, meter); > > + tbl->count--; > > + return -ENOMEM; > > +} > > + > > +static void detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) > > +{ > > + struct dp_meter_instance *ti; > > + > > ASSERT_OVSL(); > > - if (meter) > > - hlist_del_rcu(&meter->dp_hash_node); > > + if (!meter) > > + return; > > + > > + ti = rcu_dereference_ovsl(tbl->ti); > > + dp_meter_instance_remove(ti, meter); > > + > > + tbl->count--; > > + > > + /* Shrink the meter array if necessary. */ > > + if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN && > > + tbl->count <= (ti->n_meters / 4)) { > > + int half_size = ti->n_meters / 2; > > + int i; > > + > Lets add a comment about this. > > + for (i = half_size; i < ti->n_meters; i++) > > + if (rcu_dereference_ovsl(ti->dp_meters[i])) > > + return; > > + > > + dp_meter_instance_realloc(tbl, half_size); > > + } > > } > > > > static struct sk_buff * > > @@ -303,9 +407,13 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) > > meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); > > > > /* Cannot fail after this. */ > > - old_meter = lookup_meter(dp, meter_id); > > - detach_meter(old_meter); > > - attach_meter(dp, meter); > > + old_meter = lookup_meter(dp->meters, meter_id); > in new scheme this can fail due to hash collision, lets check for NULL. If old_meter is NULL, detach_meter will do nothing. > > + detach_meter(dp->meters, old_meter); > > + > > + err = attach_meter(dp->meters, meter); > > + if (err) > > + goto exit_unlock; > > + > > ovs_unlock(); > > > > /* Build response with the meter_id and stats from > > @@ -365,7 +473,7 @@ static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) > > } > > > > /* Locate meter, copy stats. */ > > - meter = lookup_meter(dp, meter_id); > > + meter = lookup_meter(dp->meters, meter_id); > > if (!meter) { > > err = -ENOENT; > > goto exit_unlock; > > @@ -416,13 +524,13 @@ static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) > > goto exit_unlock; > > } > > > > - old_meter = lookup_meter(dp, meter_id); > > + old_meter = lookup_meter(dp->meters, meter_id); > > if (old_meter) { > > spin_lock_bh(&old_meter->lock); > > err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); > > WARN_ON(err); > > spin_unlock_bh(&old_meter->lock); > > - detach_meter(old_meter); > > + detach_meter(dp->meters, old_meter); > > } > > ovs_unlock(); > > ovs_meter_free(old_meter); > > @@ -452,7 +560,7 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, > > int i, band_exceeded_max = -1; > > u32 band_exceeded_rate = 0; > > > > - meter = lookup_meter(dp, meter_id); > > + meter = lookup_meter(dp->meters, meter_id); > > /* Do not drop the packet when there is no meter. */ > > if (!meter) > > return false; > > @@ -570,32 +678,36 @@ struct genl_family dp_meter_genl_family __ro_after_init = { > > > > int ovs_meters_init(struct datapath *dp) > > { > > - int i; > > + struct dp_meter_instance *ti; > > + struct dp_meter_table *tbl; > > + > > + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); > > + if (!tbl) > > + return -ENOMEM; > > > > - dp->meters = kmalloc_array(METER_HASH_BUCKETS, > > - sizeof(struct hlist_head), GFP_KERNEL); > > + tbl->count = 0; > > > > - if (!dp->meters) > > + ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN); > > + if (!ti) { > > + kfree(tbl); > > return -ENOMEM; > > + } > > > > - for (i = 0; i < METER_HASH_BUCKETS; i++) > > - INIT_HLIST_HEAD(&dp->meters[i]); > > + rcu_assign_pointer(tbl->ti, ti); > > + dp->meters = tbl; > > > > return 0; > > } > > > > void ovs_meters_exit(struct datapath *dp) > > { > > + struct dp_meter_table *tbl = dp->meters; > > + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); > > int i; > > > > - for (i = 0; i < METER_HASH_BUCKETS; i++) { > > - struct hlist_head *head = &dp->meters[i]; > > - struct dp_meter *meter; > > - struct hlist_node *n; > > - > > - hlist_for_each_entry_safe(meter, n, head, dp_hash_node) > > - kfree(meter); > > - } > > + for (i = 0; i < ti->n_meters; i++) > > + ovs_meter_free(ti->dp_meters[i]); > > > > - kfree(dp->meters); > > + dp_meter_instance_free(ti); > > + kfree(tbl); > > } > > diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h > > index f645913870bd..d91940383bbe 100644 > > --- a/net/openvswitch/meter.h > > +++ b/net/openvswitch/meter.h > > @@ -18,6 +18,7 @@ > > struct datapath; > > > > #define DP_MAX_BANDS 1 > > +#define DP_METER_ARRAY_SIZE_MIN (1ULL << 10) > > > > struct dp_meter_band { > > u32 type; > > @@ -30,9 +31,6 @@ struct dp_meter_band { > > struct dp_meter { > > spinlock_t lock; /* Per meter lock */ > > struct rcu_head rcu; > > - struct hlist_node dp_hash_node; /*Element in datapath->meters > > - * hash table. > > - */ > > u32 id; > > u16 kbps:1, keep_stats:1; > > u16 n_bands; > > @@ -42,6 +40,17 @@ struct dp_meter { > > struct dp_meter_band bands[]; > > }; > > > > +struct dp_meter_instance { > > + struct rcu_head rcu; > > + u32 n_meters; > > + struct dp_meter __rcu *dp_meters[]; > > +}; > > + > > +struct dp_meter_table { > > + struct dp_meter_instance __rcu *ti; > > + u32 count; > > +}; > > + > > extern struct genl_family dp_meter_genl_family; > > int ovs_meters_init(struct datapath *dp); > > void ovs_meters_exit(struct datapath *dp); > > -- > > 2.23.0 > >
On Sun, Apr 19, 2020 at 5:23 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote: > > On Mon, Apr 20, 2020 at 1:29 AM Pravin Shelar <pravin.ovn@gmail.com> wrote: > > > > On Sat, Apr 18, 2020 at 10:25 AM <xiangxia.m.yue@gmail.com> wrote: > > > > > > From: Tonghao Zhang <xiangxia.m.yue@gmail.com> > > > > > > In kernel datapath of Open vSwitch, there are only 1024 > > > buckets of meter in one dp. If installing more than 1024 > > > (e.g. 8192) meters, it may lead to the performance drop. > > > But in some case, for example, Open vSwitch used as edge > > > gateway, there should be 200,000+ at least, meters used for > > > IP address bandwidth limitation. > > > > > > [Open vSwitch userspace datapath has this issue too.] > > > > > > For more scalable meter, this patch expands the buckets > > > when necessary, so we can install more meters in the datapath. > > > Introducing the struct *dp_meter_instance*, it's easy to > > > expand meter though changing the *ti* point in the struct > > > *dp_meter_table*. > > > > > > Cc: Pravin B Shelar <pshelar@ovn.org> > > > Cc: Andy Zhou <azhou@ovn.org> > > > Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com> > > > --- > > > net/openvswitch/datapath.h | 2 +- > > > net/openvswitch/meter.c | 200 +++++++++++++++++++++++++++++-------- > > > net/openvswitch/meter.h | 15 ++- > > > 3 files changed, 169 insertions(+), 48 deletions(-) > > > > > > diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h > > > index e239a46c2f94..785105578448 100644 > > > --- a/net/openvswitch/datapath.h > > > +++ b/net/openvswitch/datapath.h > > > @@ -82,7 +82,7 @@ struct datapath { > > > u32 max_headroom; > > > > > > /* Switch meters. */ > > > - struct hlist_head *meters; > > > + struct dp_meter_table *meters; > > lets define it as part of this struct to avoid indirection. > > > > > }; > > > > > > /** > > > diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c > > > index 5010d1ddd4bd..494a0014ecd8 100644 > > > --- a/net/openvswitch/meter.c > > > +++ b/net/openvswitch/meter.c > > > @@ -19,8 +19,6 @@ > > > #include "datapath.h" > > > #include "meter.h" > > > > > > -#define METER_HASH_BUCKETS 1024 > > > - > > > static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { > > > [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, > > > [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, > > > @@ -39,6 +37,11 @@ static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { > > > [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, > > > }; > > > > > > +static u32 meter_hash(struct dp_meter_instance *ti, u32 id) > > > +{ > > > + return id % ti->n_meters; > > > +} > > > + > > > static void ovs_meter_free(struct dp_meter *meter) > > > { > > > if (!meter) > > > @@ -47,40 +50,141 @@ static void ovs_meter_free(struct dp_meter *meter) > > > kfree_rcu(meter, rcu); > > > } > > > > > > -static struct hlist_head *meter_hash_bucket(const struct datapath *dp, > > > - u32 meter_id) > > > -{ > > > - return &dp->meters[meter_id & (METER_HASH_BUCKETS - 1)]; > > > -} > > > - > > > /* Call with ovs_mutex or RCU read lock. */ > > > -static struct dp_meter *lookup_meter(const struct datapath *dp, > > > +static struct dp_meter *lookup_meter(const struct dp_meter_table *tbl, > > > u32 meter_id) > > > { > > > + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); > > > + u32 hash = meter_hash(ti, meter_id); > > > struct dp_meter *meter; > > > - struct hlist_head *head; > > > > > > - head = meter_hash_bucket(dp, meter_id); > > > - hlist_for_each_entry_rcu(meter, head, dp_hash_node, > > > - lockdep_ovsl_is_held()) { > > > - if (meter->id == meter_id) > > > - return meter; > > > - } > > > + meter = rcu_dereference_ovsl(ti->dp_meters[hash]); > > > + if (meter && likely(meter->id == meter_id)) > > > + return meter; > > > + > > > return NULL; > > > } > > > > > > -static void attach_meter(struct datapath *dp, struct dp_meter *meter) > > > +static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size) > > > +{ > > > + struct dp_meter_instance *ti; > > > + > > > + ti = kvzalloc(sizeof(*ti) + > > > + sizeof(struct dp_meter *) * size, > > > + GFP_KERNEL); > > > + if (!ti) > > > + return NULL; > > Given this is a kernel space array we need to have hard limit inplace. > In patch 2, I limited the meter number, should we add hard limit here ? I guess its not needed here. ... > > > static struct sk_buff * > > > @@ -303,9 +407,13 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) > > > meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); > > > > > > /* Cannot fail after this. */ > > > - old_meter = lookup_meter(dp, meter_id); > > > - detach_meter(old_meter); > > > - attach_meter(dp, meter); > > > + old_meter = lookup_meter(dp->meters, meter_id); > > in new scheme this can fail due to hash collision, lets check for NULL. > If old_meter is NULL, detach_meter will do nothing. Lets return error.
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index e239a46c2f94..785105578448 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -82,7 +82,7 @@ struct datapath { u32 max_headroom; /* Switch meters. */ - struct hlist_head *meters; + struct dp_meter_table *meters; }; /** diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 5010d1ddd4bd..494a0014ecd8 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -19,8 +19,6 @@ #include "datapath.h" #include "meter.h" -#define METER_HASH_BUCKETS 1024 - static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, @@ -39,6 +37,11 @@ static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, }; +static u32 meter_hash(struct dp_meter_instance *ti, u32 id) +{ + return id % ti->n_meters; +} + static void ovs_meter_free(struct dp_meter *meter) { if (!meter) @@ -47,40 +50,141 @@ static void ovs_meter_free(struct dp_meter *meter) kfree_rcu(meter, rcu); } -static struct hlist_head *meter_hash_bucket(const struct datapath *dp, - u32 meter_id) -{ - return &dp->meters[meter_id & (METER_HASH_BUCKETS - 1)]; -} - /* Call with ovs_mutex or RCU read lock. */ -static struct dp_meter *lookup_meter(const struct datapath *dp, +static struct dp_meter *lookup_meter(const struct dp_meter_table *tbl, u32 meter_id) { + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 hash = meter_hash(ti, meter_id); struct dp_meter *meter; - struct hlist_head *head; - head = meter_hash_bucket(dp, meter_id); - hlist_for_each_entry_rcu(meter, head, dp_hash_node, - lockdep_ovsl_is_held()) { - if (meter->id == meter_id) - return meter; - } + meter = rcu_dereference_ovsl(ti->dp_meters[hash]); + if (meter && likely(meter->id == meter_id)) + return meter; + return NULL; } -static void attach_meter(struct datapath *dp, struct dp_meter *meter) +static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size) +{ + struct dp_meter_instance *ti; + + ti = kvzalloc(sizeof(*ti) + + sizeof(struct dp_meter *) * size, + GFP_KERNEL); + if (!ti) + return NULL; + + ti->n_meters = size; + + return ti; +} + +static void dp_meter_instance_free(struct dp_meter_instance *ti) +{ + kvfree(ti); +} + +static void dp_meter_instance_free_rcu(struct rcu_head *rcu) +{ + struct dp_meter_instance *ti; + + ti = container_of(rcu, struct dp_meter_instance, rcu); + kvfree(ti); +} + +static int +dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size) +{ + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + int n_meters = min(size, ti->n_meters); + struct dp_meter_instance *new_ti; + int i; + + new_ti = dp_meter_instance_alloc(size); + if (!new_ti) + return -ENOMEM; + + for (i = 0; i < n_meters; i++) + new_ti->dp_meters[i] = + rcu_dereference_ovsl(ti->dp_meters[i]); + + rcu_assign_pointer(tbl->ti, new_ti); + call_rcu(&ti->rcu, dp_meter_instance_free_rcu); + + return 0; +} + +static void dp_meter_instance_insert(struct dp_meter_instance *ti, + struct dp_meter *meter) +{ + u32 hash; + + hash = meter_hash(ti, meter->id); + rcu_assign_pointer(ti->dp_meters[hash], meter); +} + +static void dp_meter_instance_remove(struct dp_meter_instance *ti, + struct dp_meter *meter) { - struct hlist_head *head = meter_hash_bucket(dp, meter->id); + u32 hash; - hlist_add_head_rcu(&meter->dp_hash_node, head); + hash = meter_hash(ti, meter->id); + RCU_INIT_POINTER(ti->dp_meters[hash], NULL); } -static void detach_meter(struct dp_meter *meter) +static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) { + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 hash = meter_hash(ti, meter->id); + + /* + * In generally, slot selected should be empty, because + * OvS uses id-pool to fetch a available id. + */ + if (unlikely(rcu_dereference_ovsl(ti->dp_meters[hash]))) + return -EINVAL; + + dp_meter_instance_insert(ti, meter); + + /* That function is thread-safe. */ + if (++tbl->count >= ti->n_meters) + if (dp_meter_instance_realloc(tbl, ti->n_meters * 2)) + goto expand_err; + + return 0; + +expand_err: + dp_meter_instance_remove(ti, meter); + tbl->count--; + return -ENOMEM; +} + +static void detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) +{ + struct dp_meter_instance *ti; + ASSERT_OVSL(); - if (meter) - hlist_del_rcu(&meter->dp_hash_node); + if (!meter) + return; + + ti = rcu_dereference_ovsl(tbl->ti); + dp_meter_instance_remove(ti, meter); + + tbl->count--; + + /* Shrink the meter array if necessary. */ + if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN && + tbl->count <= (ti->n_meters / 4)) { + int half_size = ti->n_meters / 2; + int i; + + for (i = half_size; i < ti->n_meters; i++) + if (rcu_dereference_ovsl(ti->dp_meters[i])) + return; + + dp_meter_instance_realloc(tbl, half_size); + } } static struct sk_buff * @@ -303,9 +407,13 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); /* Cannot fail after this. */ - old_meter = lookup_meter(dp, meter_id); - detach_meter(old_meter); - attach_meter(dp, meter); + old_meter = lookup_meter(dp->meters, meter_id); + detach_meter(dp->meters, old_meter); + + err = attach_meter(dp->meters, meter); + if (err) + goto exit_unlock; + ovs_unlock(); /* Build response with the meter_id and stats from @@ -365,7 +473,7 @@ static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) } /* Locate meter, copy stats. */ - meter = lookup_meter(dp, meter_id); + meter = lookup_meter(dp->meters, meter_id); if (!meter) { err = -ENOENT; goto exit_unlock; @@ -416,13 +524,13 @@ static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) goto exit_unlock; } - old_meter = lookup_meter(dp, meter_id); + old_meter = lookup_meter(dp->meters, meter_id); if (old_meter) { spin_lock_bh(&old_meter->lock); err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); WARN_ON(err); spin_unlock_bh(&old_meter->lock); - detach_meter(old_meter); + detach_meter(dp->meters, old_meter); } ovs_unlock(); ovs_meter_free(old_meter); @@ -452,7 +560,7 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, int i, band_exceeded_max = -1; u32 band_exceeded_rate = 0; - meter = lookup_meter(dp, meter_id); + meter = lookup_meter(dp->meters, meter_id); /* Do not drop the packet when there is no meter. */ if (!meter) return false; @@ -570,32 +678,36 @@ struct genl_family dp_meter_genl_family __ro_after_init = { int ovs_meters_init(struct datapath *dp) { - int i; + struct dp_meter_instance *ti; + struct dp_meter_table *tbl; + + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); + if (!tbl) + return -ENOMEM; - dp->meters = kmalloc_array(METER_HASH_BUCKETS, - sizeof(struct hlist_head), GFP_KERNEL); + tbl->count = 0; - if (!dp->meters) + ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN); + if (!ti) { + kfree(tbl); return -ENOMEM; + } - for (i = 0; i < METER_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dp->meters[i]); + rcu_assign_pointer(tbl->ti, ti); + dp->meters = tbl; return 0; } void ovs_meters_exit(struct datapath *dp) { + struct dp_meter_table *tbl = dp->meters; + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); int i; - for (i = 0; i < METER_HASH_BUCKETS; i++) { - struct hlist_head *head = &dp->meters[i]; - struct dp_meter *meter; - struct hlist_node *n; - - hlist_for_each_entry_safe(meter, n, head, dp_hash_node) - kfree(meter); - } + for (i = 0; i < ti->n_meters; i++) + ovs_meter_free(ti->dp_meters[i]); - kfree(dp->meters); + dp_meter_instance_free(ti); + kfree(tbl); } diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h index f645913870bd..d91940383bbe 100644 --- a/net/openvswitch/meter.h +++ b/net/openvswitch/meter.h @@ -18,6 +18,7 @@ struct datapath; #define DP_MAX_BANDS 1 +#define DP_METER_ARRAY_SIZE_MIN (1ULL << 10) struct dp_meter_band { u32 type; @@ -30,9 +31,6 @@ struct dp_meter_band { struct dp_meter { spinlock_t lock; /* Per meter lock */ struct rcu_head rcu; - struct hlist_node dp_hash_node; /*Element in datapath->meters - * hash table. - */ u32 id; u16 kbps:1, keep_stats:1; u16 n_bands; @@ -42,6 +40,17 @@ struct dp_meter { struct dp_meter_band bands[]; }; +struct dp_meter_instance { + struct rcu_head rcu; + u32 n_meters; + struct dp_meter __rcu *dp_meters[]; +}; + +struct dp_meter_table { + struct dp_meter_instance __rcu *ti; + u32 count; +}; + extern struct genl_family dp_meter_genl_family; int ovs_meters_init(struct datapath *dp); void ovs_meters_exit(struct datapath *dp);