diff mbox series

[ovs-dev,ovs,v2,1/4] dpif-netdev: Expand the meters supported number

Message ID 20200513133135.48474-2-xiangxia.m.yue@gmail.com
State Changes Requested
Headers show
Series expand the meter table and fix bug | expand

Commit Message

Tonghao Zhang May 13, 2020, 1:31 p.m. UTC
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

For now, ovs-vswitchd use the array of the dp_meter struct
to store meter's data, and at most, there are only 65536
(defined by MAX_METERS) meters that can be used. But in some
case, for example, in the edge gateway, we should use 200,000,
at least, meters for IP address bandwidth limitation.
Every one IP address will use two meters for its rx and tx
path[1]. In other way, ovs-vswitchd should support meter-offload
(rte_mtr_xxx api introduced by dpdk.), but there are more than
65536 meters in the hardware, such as Mellanox ConnectX-6.

This patch use array to manage the meter, but it can ben expanded.

[1].
$ in_port=p0,ip,ip_dst=1.1.1.x action=meter:n,output:p1
$ in_port=p1,ip,ip_src=1.1.1.x action=meter:m,output:p0

Cc: Ilya Maximets <i.maximets@ovn.org>
Cc: William Tu <u9012063@gmail.com>
Cc: Jarno Rajahalme <jarno@ovn.org>
Cc: Ben Pfaff <blp@ovn.org>
Cc: Andy Zhou <azhou@ovn.org>
Cc: Pravin Shelar <pshelar@ovn.org>
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
v2:
* add comments for dp_meter_instance
* change the log
* remove extra newline
* I don't move the dp_netdev_meter_init/destroy up. because
  them depends other meters function and put all meter function
  together may make the codes clean.
---
 lib/dpif-netdev.c | 319 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 250 insertions(+), 69 deletions(-)

Comments

Ilya Maximets May 15, 2020, 5:09 p.m. UTC | #1
On 5/13/20 3:31 PM, xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> 
> For now, ovs-vswitchd use the array of the dp_meter struct
> to store meter's data, and at most, there are only 65536
> (defined by MAX_METERS) meters that can be used. But in some
> case, for example, in the edge gateway, we should use 200,000,
> at least, meters for IP address bandwidth limitation.
> Every one IP address will use two meters for its rx and tx
> path[1]. In other way, ovs-vswitchd should support meter-offload
> (rte_mtr_xxx api introduced by dpdk.), but there are more than
> 65536 meters in the hardware, such as Mellanox ConnectX-6.
> 
> This patch use array to manage the meter, but it can ben expanded.
> 
> [1].
> $ in_port=p0,ip,ip_dst=1.1.1.x action=meter:n,output:p1
> $ in_port=p1,ip,ip_src=1.1.1.x action=meter:m,output:p0
> 
> Cc: Ilya Maximets <i.maximets@ovn.org>
> Cc: William Tu <u9012063@gmail.com>
> Cc: Jarno Rajahalme <jarno@ovn.org>
> Cc: Ben Pfaff <blp@ovn.org>
> Cc: Andy Zhou <azhou@ovn.org>
> Cc: Pravin Shelar <pshelar@ovn.org>
> Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> ---
> v2:
> * add comments for dp_meter_instance
> * change the log
> * remove extra newline
> * I don't move the dp_netdev_meter_init/destroy up. because
>   them depends other meters function and put all meter function
>   together may make the codes clean.
> ---

Hi.  Thanks for working on this!

This is not a full review, just a few things that I spotted on a quick glance.
I didn't review any thread safety/rcu aspects yet.

Best regards, Ilya Maximets.


>  lib/dpif-netdev.c | 319 ++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 250 insertions(+), 69 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index ef14e83b5f06..b5deaab31eb0 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -98,9 +98,12 @@ DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
>  
>  /* Configuration parameters. */
>  enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */
> -enum { MAX_METERS = 65536 };    /* Maximum number of meters. */
> -enum { MAX_BANDS = 8 };         /* Maximum number of bands / meter. */
> -enum { N_METER_LOCKS = 64 };    /* Maximum number of meters. */
> +
> +/* Maximum number of meters in the table. */
> +#define METER_ENTRY_MAX (200000ULL)
> +/* Maximum number of bands / meter. */
> +#define METER_BAND_MAX  (8)
> +#define DP_METER_ARRAY_SIZE_MIN (1ULL << 10)

Why we need to change enums to defines and also rename them?

>  
>  COVERAGE_DEFINE(datapath_drop_meter);
>  COVERAGE_DEFINE(datapath_drop_upcall_error);
> @@ -283,12 +286,26 @@ struct dp_meter {
>      uint16_t flags;
>      uint16_t n_bands;
>      uint32_t max_delta_t;
> +    uint32_t id;
> +    struct ovs_mutex lock;
>      uint64_t used;
>      uint64_t packet_count;
>      uint64_t byte_count;
>      struct dp_meter_band bands[];
>  };
>  
> +struct dp_meter_instance {
> +    uint32_t n_meters;

This should be called 'n_allocated' or smething like this.
'n_meters' makes me think that it's the number of actually used meters.

> +    /* Followed by struct dp_meter[n]; where n is the n_meters. */
> +    OVSRCU_TYPE(struct dp_meter *) dp_meters[];
> +};
> +
> +struct dp_meter_table {
> +    OVSRCU_TYPE(struct dp_meter_instance *) ti;

What does 'ti' mean?  I looked throught the code it always stands for meter instance,
but how 'meter instance' relates to 'ti'?  That is confusing.

> +    uint32_t count;

Why count is part of 'dp_meter_table'?  I think it should be part of 'dp_meter_instance'
and named something like 'n_used', or actually 'n_meters'.

> +    struct ovs_mutex lock;
> +};

Why we need this structure at all?  Can it be just 3 fields inside struct dp_netdev?
Why it is table?  It's not a table. 'instance' is a table.  Confusing.

> +
>  struct pmd_auto_lb {
>      bool auto_lb_requested;     /* Auto load balancing requested by user. */
>      bool is_enabled;            /* Current status of Auto load balancing. */
> @@ -329,8 +346,7 @@ struct dp_netdev {
>      atomic_uint32_t tx_flush_interval;
>  
>      /* Meters. */
> -    struct ovs_mutex meter_locks[N_METER_LOCKS];
> -    struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
> +    struct dp_meter_table meter_tbl;
>  
>      /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
>      OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
> @@ -378,19 +394,6 @@ struct dp_netdev {
>      struct pmd_auto_lb pmd_alb;
>  };
>  
> -static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
> -    OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
> -{
> -    ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
> -}
> -
> -static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
> -    OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
> -{
> -    ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
> -}
> -
> -
>  static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
>                                                      odp_port_t)
>      OVS_REQUIRES(dp->port_mutex);
> @@ -1523,6 +1526,9 @@ choose_port(struct dp_netdev *dp, const char *name)
>      return ODPP_NONE;
>  }
>  
> +static void dp_netdev_meter_init(struct dp_meter_table *tbl);
> +static void dp_netdev_meter_destroy(struct dp_meter_table *tbl);

These functions should be named dp_netdev_meter_table_{init,destroy}.

> +
>  static int
>  create_dp_netdev(const char *name, const struct dpif_class *class,
>                   struct dp_netdev **dpp)
> @@ -1556,9 +1562,7 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
>      dp->reconfigure_seq = seq_create();
>      dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
>  
> -    for (int i = 0; i < N_METER_LOCKS; ++i) {
> -        ovs_mutex_init_adaptive(&dp->meter_locks[i]);
> -    }
> +    dp_netdev_meter_init(&dp->meter_tbl);
>  
>      /* Disable upcalls by default. */
>      dp_netdev_disable_upcall(dp);
> @@ -1647,16 +1651,6 @@ dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
>      fat_rwlock_destroy(&dp->upcall_rwlock);
>  }
>  
> -static void
> -dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
> -    OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
> -{
> -    if (dp->meters[meter_id]) {
> -        free(dp->meters[meter_id]);
> -        dp->meters[meter_id] = NULL;
> -    }
> -}
> -
>  /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
>   * through the 'dp_netdevs' shash while freeing 'dp'. */
>  static void
> @@ -1694,16 +1688,7 @@ dp_netdev_free(struct dp_netdev *dp)
>      /* Upcalls must be disabled at this point */
>      dp_netdev_destroy_upcall_lock(dp);
>  
> -    int i;
> -
> -    for (i = 0; i < MAX_METERS; ++i) {
> -        meter_lock(dp, i);
> -        dp_delete_meter(dp, i);
> -        meter_unlock(dp, i);
> -    }
> -    for (i = 0; i < N_METER_LOCKS; ++i) {
> -        ovs_mutex_destroy(&dp->meter_locks[i]);
> -    }
> +    dp_netdev_meter_destroy(&dp->meter_tbl);
>  
>      free(dp->pmd_cmask);
>      free(CONST_CAST(char *, dp->name));
> @@ -5713,14 +5698,197 @@ dp_netdev_disable_upcall(struct dp_netdev *dp)
>  
>  
>  /* Meters */
> +static uint32_t
> +meter_hash(struct dp_meter_instance *ti, uint32_t id)
> +{
> +    uint32_t n_meters = ti->n_meters;
> +
> +    return id % n_meters;
> +}

Why we need a hash here in this implementation?
Below code will be broken if meter_hash will hash different ids to the
same hash value.  There should be no hash or there should be good collision
protection.

> +
> +static void
> +dp_meter_free(struct dp_meter *meter)
> +{
> +    ovs_mutex_destroy(&meter->lock);
> +    free(meter);
> +}
> +
> +static struct dp_meter_instance *
> +dp_meter_instance_alloc(const uint32_t size)
> +{
> +    struct dp_meter_instance *ti;
> +
> +    ti = xzalloc(sizeof(*ti) + sizeof(struct dp_meter *) * size);

Don't parenthesize argument of sizeof if it's a variable.

> +    ti->n_meters = size;
> +
> +    return ti;
> +}
> +
> +static void
> +dp_meter_instance_realloc(struct dp_meter_table *tbl, const uint32_t size)
> +{
> +    struct dp_meter_instance *new_ti;
> +    struct dp_meter_instance *ti;
> +    int n_meters;
> +    int i;
> +
> +    new_ti = dp_meter_instance_alloc(size);
> +
> +    ti = ovsrcu_get(struct dp_meter_instance *, &tbl->ti);
> +    n_meters = MIN(size, ti->n_meters);
> +
> +    for (i = 0; i < n_meters; i++) {
> +        if (ovsrcu_get(struct dp_meter *, &ti->dp_meters[i])) {
> +            new_ti->dp_meters[i] = ti->dp_meters[i];
> +        }
> +    }
> +
> +    ovsrcu_set(&tbl->ti, new_ti);
> +    ovsrcu_postpone(free, ti);
> +}
> +
> +static void
> +dp_meter_instance_insert(struct dp_meter_instance *ti,

'dp_meter_instance_insert' sounds like we're going to create a new
dp_meter_instance and insert it to dp_meter_table, but it's not the
case.

> +                         struct dp_meter *meter)
> +{
> +    uint32_t hash;
> +
> +    hash = meter_hash(ti, meter->id);
> +    ovsrcu_set(&ti->dp_meters[hash], meter);
> +}
> +
> +static void
> +dp_meter_instance_remove(struct dp_meter_instance *ti,
> +                         struct dp_meter *meter)
> +{
> +    uint32_t hash;
> +
> +    hash = meter_hash(ti, meter->id);
> +    ovsrcu_set(&ti->dp_meters[hash], NULL);
> +}
> +
> +static void
> +dp_netdev_meter_init(struct dp_meter_table *tbl)
> +{
> +    struct dp_meter_instance *ti;
> +
> +    ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN);
> +    ovsrcu_set(&tbl->ti, ti);
> +
> +    ovs_mutex_init(&tbl->lock);
> +    tbl->count = 0;
> +}
> +
> +static void
> +dp_netdev_meter_destroy(struct dp_meter_table *tbl)

'dp_netdev_meter_destroy' sounds like we're going to destroy a single meter.

> +{
> +    struct dp_meter_instance *ti;
> +    int i;
> +
> +    ti = ovsrcu_get(struct dp_meter_instance *, &tbl->ti);
> +    for (i = 0; i < ti->n_meters; i++) {
> +        struct dp_meter *meter;
> +
> +        meter = ovsrcu_get(struct dp_meter *, &ti->dp_meters[i]);
> +        if (meter) {
> +            ovsrcu_postpone(dp_meter_free, meter);
> +        }
> +    }
> +
> +    ovsrcu_postpone(free, ti);
> +    ovs_mutex_destroy(&tbl->lock);
> +}
> +
> +static struct dp_meter *
> +dp_meter_lookup(struct dp_meter_table *meter_tbl, uint32_t meter_id)
> +{
> +    struct dp_meter_instance *ti;
> +    struct dp_meter *meter;
> +    uint32_t hash;
> +
> +    ti = ovsrcu_get(struct dp_meter_instance *, &meter_tbl->ti);

After gitting rcu protected pointer, you have to check if it's a valid pointer.

> +    hash = meter_hash(ti, meter_id);
> +
> +    meter = ovsrcu_get(struct dp_meter *, &ti->dp_meters[hash]);
> +    if (meter && meter->id == meter_id) {
> +        return meter;
> +    }
> +
> +    return NULL;
> +}
> +
> +static void
> +dp_meter_detach_free(struct dp_meter_table *meter_tbl, uint32_t meter_id)
> +                     OVS_REQUIRES(meter_tbl->lock)

Please, keep thread safety annotations indented with 4 spaces from the left.

> +{
> +    struct dp_meter_instance *ti;
> +    struct dp_meter *meter;
> +
> +    meter = dp_meter_lookup(meter_tbl, meter_id);
> +    if (!meter) {
> +        return;
> +    }
> +
> +    ti = ovsrcu_get(struct dp_meter_instance *, &meter_tbl->ti);
> +    dp_meter_instance_remove(ti, meter);
> +    ovsrcu_postpone(dp_meter_free, meter);
> +
> +    meter_tbl->count--;
> +    /* Shrink the meter array if necessary. */
> +    if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN &&
> +        meter_tbl->count <= (ti->n_meters / 4)) {
> +        int half_size = ti->n_meters / 2;
> +        int i;
> +
> +        /* Avoid hash collision, don't move slots to other place.
> +         * Make sure there are no references of meters in array
> +         * which will be released.
> +         */
> +        for (i = half_size; i < ti->n_meters; i++) {
> +            if (ovsrcu_get(struct dp_meter *, &ti->dp_meters[i])) {
> +                return;
> +            }
> +        }
> +
> +        dp_meter_instance_realloc(meter_tbl, half_size);
> +    }
> +}
> +
> +static int
> +dp_meter_attach(struct dp_meter_table *meter_tbl, struct dp_meter *meter)
> +                OVS_REQUIRES(meter_tbl->lock)

ditto.

> +{
> +    struct dp_meter_instance *ti;
> +    uint32_t hash;
> +
> +    ti = ovsrcu_get(struct dp_meter_instance *, &meter_tbl->ti);
> +    hash = meter_hash(ti, meter->id);
> +
> +    if (OVS_UNLIKELY(ovsrcu_get(struct dp_meter *,
> +                                &ti->dp_meters[hash]))) {
> +        VLOG_WARN("Failed to attach meter id %u to slot %u/%u.\n",
> +                  meter->id, hash, ti->n_meters);
> +        return EBUSY;

How this could happen if you're always calling _detach_free before calling attach?

> +    }
> +
> +    dp_meter_instance_insert(ti, meter);
> +
> +    meter_tbl->count++;
> +    if (meter_tbl->count >= ti->n_meters) {
> +        dp_meter_instance_realloc(meter_tbl, ti->n_meters * 2);
> +    }
> +
> +    return 0;
> +}
> +
>  static void
>  dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
>                                 struct ofputil_meter_features *features)
>  {
> -    features->max_meters = MAX_METERS;
> +    features->max_meters = METER_ENTRY_MAX;
>      features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
>      features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
> -    features->max_bands = MAX_BANDS;
> +    features->max_bands = METER_BAND_MAX;
>      features->max_color = 0;
>  }
>  
> @@ -5742,14 +5910,13 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
>      uint32_t exceeded_rate[NETDEV_MAX_BURST];
>      int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
>  
> -    if (meter_id >= MAX_METERS) {
> +    if (meter_id >= METER_ENTRY_MAX) {
>          return;
>      }
>  
> -    meter_lock(dp, meter_id);
> -    meter = dp->meters[meter_id];
> +    meter = dp_meter_lookup(&dp->meter_tbl, meter_id);
>      if (!meter) {
> -        goto out;
> +        return;
>      }
>  
>      /* Initialize as negative values. */
> @@ -5757,6 +5924,7 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
>      /* Initialize as zeroes. */
>      memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
>  
> +    ovs_mutex_lock(&meter->lock);
>      /* All packets will hit the meter at the same time. */
>      long_delta_t = now / 1000 - meter->used / 1000; /* msec */
>  
> @@ -5874,8 +6042,8 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
>              dp_packet_batch_refill(packets_, packet, j);
>          }
>      }
> - out:
> -    meter_unlock(dp, meter_id);
> +
> +    ovs_mutex_unlock(&meter->lock);
>  }
>  
>  /* Meter set/get/del processing is still single-threaded. */
> @@ -5884,11 +6052,12 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
>                        struct ofputil_meter_config *config)
>  {
>      struct dp_netdev *dp = get_dp_netdev(dpif);
> +    struct dp_meter_table *meter_tbl = &dp->meter_tbl;
>      uint32_t mid = meter_id.uint32;
>      struct dp_meter *meter;
> -    int i;
> +    int err, i;
>  
> -    if (mid >= MAX_METERS) {
> +    if (mid >= METER_ENTRY_MAX) {
>          return EFBIG; /* Meter_id out of range. */
>      }
>  
> @@ -5896,7 +6065,7 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
>          return EBADF; /* Unsupported flags set */
>      }
>  
> -    if (config->n_bands > MAX_BANDS) {
> +    if (config->n_bands > METER_BAND_MAX) {
>          return EINVAL;
>      }
>  
> @@ -5917,6 +6086,8 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
>      meter->n_bands = config->n_bands;
>      meter->max_delta_t = 0;
>      meter->used = time_usec();
> +    meter->id = mid;
> +    ovs_mutex_init(&meter->lock);
>  
>      /* set up bands */
>      for (i = 0; i < config->n_bands; ++i) {
> @@ -5942,12 +6113,22 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
>          }
>      }
>  
> -    meter_lock(dp, mid);
> -    dp_delete_meter(dp, mid); /* Free existing meter, if any */
> -    dp->meters[mid] = meter;
> -    meter_unlock(dp, mid);
> +    ovs_mutex_lock(&meter_tbl->lock);
> +
> +    dp_meter_detach_free(meter_tbl, mid); /* Free existing meter, if any */

This doesn't look correct.  Why should we destroy some other meter to create
this new one?

> +    err = dp_meter_attach(meter_tbl, meter);
> +    if (err) {
> +        goto unlock_out;
> +    }
> +
> +    ovs_mutex_unlock(&meter_tbl->lock);
>  
>      return 0;
> +
> +unlock_out:
> +    ovs_mutex_unlock(&meter_tbl->lock);
> +    dp_meter_free(meter);
> +    return err;
>  }
>  
>  static int
> @@ -5955,23 +6136,23 @@ dpif_netdev_meter_get(const struct dpif *dpif,
>                        ofproto_meter_id meter_id_,
>                        struct ofputil_meter_stats *stats, uint16_t n_bands)
>  {
> -    const struct dp_netdev *dp = get_dp_netdev(dpif);
> +    struct dp_netdev *dp = get_dp_netdev(dpif);
>      uint32_t meter_id = meter_id_.uint32;
> -    int retval = 0;
> +    const struct dp_meter *meter;
>  
> -    if (meter_id >= MAX_METERS) {
> +    if (meter_id >= METER_ENTRY_MAX) {
>          return EFBIG;
>      }
>  
> -    meter_lock(dp, meter_id);
> -    const struct dp_meter *meter = dp->meters[meter_id];
> +    meter = dp_meter_lookup(&dp->meter_tbl, meter_id);
>      if (!meter) {
> -        retval = ENOENT;
> -        goto done;
> +        return ENOENT;
>      }
> +
>      if (stats) {
>          int i = 0;
>  
> +        ovs_mutex_lock(&meter->lock);
>          stats->packet_in_count = meter->packet_count;
>          stats->byte_in_count = meter->byte_count;
>  
> @@ -5979,13 +6160,12 @@ dpif_netdev_meter_get(const struct dpif *dpif,
>              stats->bands[i].packet_count = meter->bands[i].packet_count;
>              stats->bands[i].byte_count = meter->bands[i].byte_count;
>          }
> +        ovs_mutex_unlock(&meter->lock);
>  
>          stats->n_bands = i;
>      }
>  
> -done:
> -    meter_unlock(dp, meter_id);
> -    return retval;
> +    return 0;
>  }
>  
>  static int
> @@ -5994,15 +6174,16 @@ dpif_netdev_meter_del(struct dpif *dpif,
>                        struct ofputil_meter_stats *stats, uint16_t n_bands)
>  {
>      struct dp_netdev *dp = get_dp_netdev(dpif);
> +    struct dp_meter_table *meter_tbl = &dp->meter_tbl;
>      int error;
>  
>      error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
>      if (!error) {
>          uint32_t meter_id = meter_id_.uint32;
>  
> -        meter_lock(dp, meter_id);
> -        dp_delete_meter(dp, meter_id);
> -        meter_unlock(dp, meter_id);
> +        ovs_mutex_lock(&meter_tbl->lock);
> +        dp_meter_detach_free(meter_tbl, meter_id);
> +        ovs_mutex_unlock(&meter_tbl->lock);
>      }
>      return error;
>  }
>
Tonghao Zhang May 18, 2020, 6:02 a.m. UTC | #2
On Sat, May 16, 2020 at 1:09 AM Ilya Maximets <i.maximets@ovn.org> wrote:
>
> On 5/13/20 3:31 PM, xiangxia.m.yue@gmail.com wrote:
> > From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> >
> > For now, ovs-vswitchd use the array of the dp_meter struct
> > to store meter's data, and at most, there are only 65536
> > (defined by MAX_METERS) meters that can be used. But in some
> > case, for example, in the edge gateway, we should use 200,000,
> > at least, meters for IP address bandwidth limitation.
> > Every one IP address will use two meters for its rx and tx
> > path[1]. In other way, ovs-vswitchd should support meter-offload
> > (rte_mtr_xxx api introduced by dpdk.), but there are more than
> > 65536 meters in the hardware, such as Mellanox ConnectX-6.
> >
> > This patch use array to manage the meter, but it can ben expanded.
> >
> > [1].
> > $ in_port=p0,ip,ip_dst=1.1.1.x action=meter:n,output:p1
> > $ in_port=p1,ip,ip_src=1.1.1.x action=meter:m,output:p0
> >
> > Cc: Ilya Maximets <i.maximets@ovn.org>
> > Cc: William Tu <u9012063@gmail.com>
> > Cc: Jarno Rajahalme <jarno@ovn.org>
> > Cc: Ben Pfaff <blp@ovn.org>
> > Cc: Andy Zhou <azhou@ovn.org>
> > Cc: Pravin Shelar <pshelar@ovn.org>
> > Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> > ---
> > v2:
> > * add comments for dp_meter_instance
> > * change the log
> > * remove extra newline
> > * I don't move the dp_netdev_meter_init/destroy up. because
> >   them depends other meters function and put all meter function
> >   together may make the codes clean.
> > ---
>
> Hi.  Thanks for working on this!
>
> This is not a full review, just a few things that I spotted on a quick glance.
> I didn't review any thread safety/rcu aspects yet.
>
> Best regards, Ilya Maximets.
>
>
> >  lib/dpif-netdev.c | 319 ++++++++++++++++++++++++++++++++++++----------
> >  1 file changed, 250 insertions(+), 69 deletions(-)
> >
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index ef14e83b5f06..b5deaab31eb0 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -98,9 +98,12 @@ DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
> >
> >  /* Configuration parameters. */
> >  enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */
> > -enum { MAX_METERS = 65536 };    /* Maximum number of meters. */
> > -enum { MAX_BANDS = 8 };         /* Maximum number of bands / meter. */
> > -enum { N_METER_LOCKS = 64 };    /* Maximum number of meters. */
> > +
> > +/* Maximum number of meters in the table. */
> > +#define METER_ENTRY_MAX (200000ULL)
> > +/* Maximum number of bands / meter. */
> > +#define METER_BAND_MAX  (8)
> > +#define DP_METER_ARRAY_SIZE_MIN (1ULL << 10)
>
> Why we need to change enums to defines and also rename them?
Hi, thanks for reviews.
1. I send a patch which will remove the MAX_FLOWS, but may be
discussed again:
http://patchwork.ozlabs.org/project/openvswitch/patch/1584309363-15048-1-git-send-email-xiangxia.m.yue@gmail.com/
2. I introduce the DP_METER_ARRAY_SIZE_MIN which use the #define. so I
change the enums to defines, and make the name more readable.
> >
> >  COVERAGE_DEFINE(datapath_drop_meter);
> >  COVERAGE_DEFINE(datapath_drop_upcall_error);
> > @@ -283,12 +286,26 @@ struct dp_meter {
> >      uint16_t flags;
> >      uint16_t n_bands;
> >      uint32_t max_delta_t;
> > +    uint32_t id;
> > +    struct ovs_mutex lock;
> >      uint64_t used;
> >      uint64_t packet_count;
> >      uint64_t byte_count;
> >      struct dp_meter_band bands[];
> >  };
> >
> > +struct dp_meter_instance {
> > +    uint32_t n_meters;
>
> This should be called 'n_allocated' or smething like this.
> 'n_meters' makes me think that it's the number of actually used meters.
OK
> > +    /* Followed by struct dp_meter[n]; where n is the n_meters. */
> > +    OVSRCU_TYPE(struct dp_meter *) dp_meters[];
> > +};
> > +
> > +struct dp_meter_table {
> > +    OVSRCU_TYPE(struct dp_meter_instance *) ti;
>
> What does 'ti' mean?  I looked throught the code it always stands for meter instance,
Yes
> but how 'meter instance' relates to 'ti'?  That is confusing.
Change "ti" to "meter_instance" is better.
> > +    uint32_t count;
>
> Why count is part of 'dp_meter_table'?  I think it should be part of 'dp_meter_instance'
when we expand the table, just relloc the instance, instance is only a array.
> and named something like 'n_used', or actually 'n_meters'.
yes, we also can store it to instance.
> > +    struct ovs_mutex lock;
> > +};
>
> Why we need this structure at all?  Can it be just 3 fields inside struct dp_netdev?
I try to make the meter a separate module like kernel. and make the
code readable.
> Why it is table?  It's not a table. 'instance' is a table.  Confusing.
introduce the instance is for easily expanding and shrinking the table.
> > +
> >  struct pmd_auto_lb {
> >      bool auto_lb_requested;     /* Auto load balancing requested by user. */
> >      bool is_enabled;            /* Current status of Auto load balancing. */
> > @@ -329,8 +346,7 @@ struct dp_netdev {
> >      atomic_uint32_t tx_flush_interval;
> >
> >      /* Meters. */
> > -    struct ovs_mutex meter_locks[N_METER_LOCKS];
> > -    struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
> > +    struct dp_meter_table meter_tbl;
> >
> >      /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
> >      OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
> > @@ -378,19 +394,6 @@ struct dp_netdev {
> >      struct pmd_auto_lb pmd_alb;
> >  };
> >
> > -static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
> > -    OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
> > -{
> > -    ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
> > -}
> > -
> > -static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
> > -    OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
> > -{
> > -    ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
> > -}
> > -
> > -
> >  static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
> >                                                      odp_port_t)
> >      OVS_REQUIRES(dp->port_mutex);
> > @@ -1523,6 +1526,9 @@ choose_port(struct dp_netdev *dp, const char *name)
> >      return ODPP_NONE;
> >  }
> >
> > +static void dp_netdev_meter_init(struct dp_meter_table *tbl);
> > +static void dp_netdev_meter_destroy(struct dp_meter_table *tbl);
>
> These functions should be named dp_netdev_meter_table_{init,destroy}.
Ok
> > +
> >  static int
> >  create_dp_netdev(const char *name, const struct dpif_class *class,
> >                   struct dp_netdev **dpp)
> > @@ -1556,9 +1562,7 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
> >      dp->reconfigure_seq = seq_create();
> >      dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
> >
> > -    for (int i = 0; i < N_METER_LOCKS; ++i) {
> > -        ovs_mutex_init_adaptive(&dp->meter_locks[i]);
> > -    }
> > +    dp_netdev_meter_init(&dp->meter_tbl);
> >
> >      /* Disable upcalls by default. */
> >      dp_netdev_disable_upcall(dp);
> > @@ -1647,16 +1651,6 @@ dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
> >      fat_rwlock_destroy(&dp->upcall_rwlock);
> >  }
> >
> > -static void
> > -dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
> > -    OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
> > -{
> > -    if (dp->meters[meter_id]) {
> > -        free(dp->meters[meter_id]);
> > -        dp->meters[meter_id] = NULL;
> > -    }
> > -}
> > -
> >  /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
> >   * through the 'dp_netdevs' shash while freeing 'dp'. */
> >  static void
> > @@ -1694,16 +1688,7 @@ dp_netdev_free(struct dp_netdev *dp)
> >      /* Upcalls must be disabled at this point */
> >      dp_netdev_destroy_upcall_lock(dp);
> >
> > -    int i;
> > -
> > -    for (i = 0; i < MAX_METERS; ++i) {
> > -        meter_lock(dp, i);
> > -        dp_delete_meter(dp, i);
> > -        meter_unlock(dp, i);
> > -    }
> > -    for (i = 0; i < N_METER_LOCKS; ++i) {
> > -        ovs_mutex_destroy(&dp->meter_locks[i]);
> > -    }
> > +    dp_netdev_meter_destroy(&dp->meter_tbl);
> >
> >      free(dp->pmd_cmask);
> >      free(CONST_CAST(char *, dp->name));
> > @@ -5713,14 +5698,197 @@ dp_netdev_disable_upcall(struct dp_netdev *dp)
> >
> >
> >  /* Meters */
> > +static uint32_t
> > +meter_hash(struct dp_meter_instance *ti, uint32_t id)
> > +{
> > +    uint32_t n_meters = ti->n_meters;
> > +
> > +    return id % n_meters;
> > +}
>
> Why we need a hash here in this implementation?
> Below code will be broken if meter_hash will hash different ids to the
I guess that may avoid a bug which generate a meter id is larger than
"ti->n_meters", and then
access the memory which not allocated. That does not break the codes.
because "id % n_meters" always is id.
because meter id is generated by id pool.
> same hash value.  There should be no hash or there should be good collision
> protection.
Yes, a little confused.

> > +
> > +static void
> > +dp_meter_free(struct dp_meter *meter)
> > +{
> > +    ovs_mutex_destroy(&meter->lock);
> > +    free(meter);
> > +}
> > +
> > +static struct dp_meter_instance *
> > +dp_meter_instance_alloc(const uint32_t size)
> > +{
> > +    struct dp_meter_instance *ti;
> > +
> > +    ti = xzalloc(sizeof(*ti) + sizeof(struct dp_meter *) * size);
>
> Don't parenthesize argument of sizeof if it's a variable.
Ok
> > +    ti->n_meters = size;
> > +
> > +    return ti;
> > +}
> > +
> > +static void
> > +dp_meter_instance_realloc(struct dp_meter_table *tbl, const uint32_t size)
> > +{
> > +    struct dp_meter_instance *new_ti;
> > +    struct dp_meter_instance *ti;
> > +    int n_meters;
> > +    int i;
> > +
> > +    new_ti = dp_meter_instance_alloc(size);
> > +
> > +    ti = ovsrcu_get(struct dp_meter_instance *, &tbl->ti);
> > +    n_meters = MIN(size, ti->n_meters);
> > +
> > +    for (i = 0; i < n_meters; i++) {
> > +        if (ovsrcu_get(struct dp_meter *, &ti->dp_meters[i])) {
> > +            new_ti->dp_meters[i] = ti->dp_meters[i];
> > +        }
> > +    }
> > +
> > +    ovsrcu_set(&tbl->ti, new_ti);
> > +    ovsrcu_postpone(free, ti);
> > +}
> > +
> > +static void
> > +dp_meter_instance_insert(struct dp_meter_instance *ti,
>
> 'dp_meter_instance_insert' sounds like we're going to create a new
> dp_meter_instance and insert it to dp_meter_table, but it's not the
> case.
In this patch, dp_meter_instance_alloc/relloc will create the meter instance.
and dp_meter_instance_insert/remove will insert or remove the meter.
I guess we introduce the meter instance in table, will make the
expand/shrink the table
easy: realloc a instance and switch it protected by rcu lock. and I
don't introduce instance
to netdev datapath, because I think meter should be a separate mode,
like it implemented
in kernel datapath.
> > +                         struct dp_meter *meter)
> > +{
> > +    uint32_t hash;
> > +
> > +    hash = meter_hash(ti, meter->id);
> > +    ovsrcu_set(&ti->dp_meters[hash], meter);
> > +}
> > +
> > +static void
> > +dp_meter_instance_remove(struct dp_meter_instance *ti,
> > +                         struct dp_meter *meter)
> > +{
> > +    uint32_t hash;
> > +
> > +    hash = meter_hash(ti, meter->id);
> > +    ovsrcu_set(&ti->dp_meters[hash], NULL);
> > +}
> > +
> > +static void
> > +dp_netdev_meter_init(struct dp_meter_table *tbl)
> > +{
> > +    struct dp_meter_instance *ti;
> > +
> > +    ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN);
> > +    ovsrcu_set(&tbl->ti, ti);
> > +
> > +    ovs_mutex_init(&tbl->lock);
> > +    tbl->count = 0;
> > +}
> > +
> > +static void
> > +dp_netdev_meter_destroy(struct dp_meter_table *tbl)
>
> 'dp_netdev_meter_destroy' sounds like we're going to destroy a single meter.
will be updated in v3
> > +{
> > +    struct dp_meter_instance *ti;
> > +    int i;
> > +
> > +    ti = ovsrcu_get(struct dp_meter_instance *, &tbl->ti);
> > +    for (i = 0; i < ti->n_meters; i++) {
> > +        struct dp_meter *meter;
> > +
> > +        meter = ovsrcu_get(struct dp_meter *, &ti->dp_meters[i]);
> > +        if (meter) {
> > +            ovsrcu_postpone(dp_meter_free, meter);
> > +        }
> > +    }
> > +
> > +    ovsrcu_postpone(free, ti);
> > +    ovs_mutex_destroy(&tbl->lock);
> > +}
> > +
> > +static struct dp_meter *
> > +dp_meter_lookup(struct dp_meter_table *meter_tbl, uint32_t meter_id)
> > +{
> > +    struct dp_meter_instance *ti;
> > +    struct dp_meter *meter;
> > +    uint32_t hash;
> > +
> > +    ti = ovsrcu_get(struct dp_meter_instance *, &meter_tbl->ti);
>
> After gitting rcu protected pointer, you have to check if it's a valid pointer.
I guess meter_tbl->ti is set protected by rcu. there shoud  not a
NULL. but we can check it again.
> > +    hash = meter_hash(ti, meter_id);
> > +
> > +    meter = ovsrcu_get(struct dp_meter *, &ti->dp_meters[hash]);
> > +    if (meter && meter->id == meter_id) {
> > +        return meter;
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> > +static void
> > +dp_meter_detach_free(struct dp_meter_table *meter_tbl, uint32_t meter_id)
> > +                     OVS_REQUIRES(meter_tbl->lock)
>
> Please, keep thread safety annotations indented with 4 spaces from the left.
Ok
> > +{
> > +    struct dp_meter_instance *ti;
> > +    struct dp_meter *meter;
> > +
> > +    meter = dp_meter_lookup(meter_tbl, meter_id);
> > +    if (!meter) {
> > +        return;
> > +    }
> > +
> > +    ti = ovsrcu_get(struct dp_meter_instance *, &meter_tbl->ti);
> > +    dp_meter_instance_remove(ti, meter);
> > +    ovsrcu_postpone(dp_meter_free, meter);
> > +
> > +    meter_tbl->count--;
> > +    /* Shrink the meter array if necessary. */
> > +    if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN &&
> > +        meter_tbl->count <= (ti->n_meters / 4)) {
> > +        int half_size = ti->n_meters / 2;
> > +        int i;
> > +
> > +        /* Avoid hash collision, don't move slots to other place.
> > +         * Make sure there are no references of meters in array
> > +         * which will be released.
> > +         */
> > +        for (i = half_size; i < ti->n_meters; i++) {
> > +            if (ovsrcu_get(struct dp_meter *, &ti->dp_meters[i])) {
> > +                return;
> > +            }
> > +        }
> > +
> > +        dp_meter_instance_realloc(meter_tbl, half_size);
> > +    }
> > +}
> > +
> > +static int
> > +dp_meter_attach(struct dp_meter_table *meter_tbl, struct dp_meter *meter)
> > +                OVS_REQUIRES(meter_tbl->lock)
>
> ditto.
>
> > +{
> > +    struct dp_meter_instance *ti;
> > +    uint32_t hash;
> > +
> > +    ti = ovsrcu_get(struct dp_meter_instance *, &meter_tbl->ti);
> > +    hash = meter_hash(ti, meter->id);
> > +
> > +    if (OVS_UNLIKELY(ovsrcu_get(struct dp_meter *,
> > +                                &ti->dp_meters[hash]))) {
> > +        VLOG_WARN("Failed to attach meter id %u to slot %u/%u.\n",
> > +                  meter->id, hash, ti->n_meters);
> > +        return EBUSY;
>
> How this could happen if you're always calling _detach_free before calling attach?
I guess what you mean is that:
1. mod_meter comand
2. del the meter while adding the meter

* that code is checking the whether the slot is used, if used, there
may be a bug. but we log the info: meter id, and slot, and return
EBUSY.
* calling the _deatch_free  is for mod the meter case, in that case we
should delete it and then attach a new one.
* when we add/del/mod the meter, we will get the lock in the table.

> > +    }
> > +
> > +    dp_meter_instance_insert(ti, meter);
> > +
> > +    meter_tbl->count++;
> > +    if (meter_tbl->count >= ti->n_meters) {
> > +        dp_meter_instance_realloc(meter_tbl, ti->n_meters * 2);
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> >  static void
> >  dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
> >                                 struct ofputil_meter_features *features)
> >  {
> > -    features->max_meters = MAX_METERS;
> > +    features->max_meters = METER_ENTRY_MAX;
> >      features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
> >      features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
> > -    features->max_bands = MAX_BANDS;
> > +    features->max_bands = METER_BAND_MAX;
> >      features->max_color = 0;
> >  }
> >
> > @@ -5742,14 +5910,13 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
> >      uint32_t exceeded_rate[NETDEV_MAX_BURST];
> >      int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
> >
> > -    if (meter_id >= MAX_METERS) {
> > +    if (meter_id >= METER_ENTRY_MAX) {
> >          return;
> >      }
> >
> > -    meter_lock(dp, meter_id);
> > -    meter = dp->meters[meter_id];
> > +    meter = dp_meter_lookup(&dp->meter_tbl, meter_id);
> >      if (!meter) {
> > -        goto out;
> > +        return;
> >      }
> >
> >      /* Initialize as negative values. */
> > @@ -5757,6 +5924,7 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
> >      /* Initialize as zeroes. */
> >      memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
> >
> > +    ovs_mutex_lock(&meter->lock);
> >      /* All packets will hit the meter at the same time. */
> >      long_delta_t = now / 1000 - meter->used / 1000; /* msec */
> >
> > @@ -5874,8 +6042,8 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
> >              dp_packet_batch_refill(packets_, packet, j);
> >          }
> >      }
> > - out:
> > -    meter_unlock(dp, meter_id);
> > +
> > +    ovs_mutex_unlock(&meter->lock);
> >  }
> >
> >  /* Meter set/get/del processing is still single-threaded. */
> > @@ -5884,11 +6052,12 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
> >                        struct ofputil_meter_config *config)
> >  {
> >      struct dp_netdev *dp = get_dp_netdev(dpif);
> > +    struct dp_meter_table *meter_tbl = &dp->meter_tbl;
> >      uint32_t mid = meter_id.uint32;
> >      struct dp_meter *meter;
> > -    int i;
> > +    int err, i;
> >
> > -    if (mid >= MAX_METERS) {
> > +    if (mid >= METER_ENTRY_MAX) {
> >          return EFBIG; /* Meter_id out of range. */
> >      }
> >
> > @@ -5896,7 +6065,7 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
> >          return EBADF; /* Unsupported flags set */
> >      }
> >
> > -    if (config->n_bands > MAX_BANDS) {
> > +    if (config->n_bands > METER_BAND_MAX) {
> >          return EINVAL;
> >      }
> >
> > @@ -5917,6 +6086,8 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
> >      meter->n_bands = config->n_bands;
> >      meter->max_delta_t = 0;
> >      meter->used = time_usec();
> > +    meter->id = mid;
> > +    ovs_mutex_init(&meter->lock);
> >
> >      /* set up bands */
> >      for (i = 0; i < config->n_bands; ++i) {
> > @@ -5942,12 +6113,22 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
> >          }
> >      }
> >
> > -    meter_lock(dp, mid);
> > -    dp_delete_meter(dp, mid); /* Free existing meter, if any */
> > -    dp->meters[mid] = meter;
> > -    meter_unlock(dp, mid);
> > +    ovs_mutex_lock(&meter_tbl->lock);
> > +
> > +    dp_meter_detach_free(meter_tbl, mid); /* Free existing meter, if any */
>
> This doesn't look correct.  Why should we destroy some other meter to create
> this new one?
Think about modifying the meter case, do it as original logic. and
this patch does not change the logic.
> > +    err = dp_meter_attach(meter_tbl, meter);
> > +    if (err) {
> > +        goto unlock_out;
> > +    }
> > +
> > +    ovs_mutex_unlock(&meter_tbl->lock);
> >
> >      return 0;
> > +
> > +unlock_out:
> > +    ovs_mutex_unlock(&meter_tbl->lock);
> > +    dp_meter_free(meter);
> > +    return err;
> >  }
> >
> >  static int
> > @@ -5955,23 +6136,23 @@ dpif_netdev_meter_get(const struct dpif *dpif,
> >                        ofproto_meter_id meter_id_,
> >                        struct ofputil_meter_stats *stats, uint16_t n_bands)
> >  {
> > -    const struct dp_netdev *dp = get_dp_netdev(dpif);
> > +    struct dp_netdev *dp = get_dp_netdev(dpif);
> >      uint32_t meter_id = meter_id_.uint32;
> > -    int retval = 0;
> > +    const struct dp_meter *meter;
> >
> > -    if (meter_id >= MAX_METERS) {
> > +    if (meter_id >= METER_ENTRY_MAX) {
> >          return EFBIG;
> >      }
> >
> > -    meter_lock(dp, meter_id);
> > -    const struct dp_meter *meter = dp->meters[meter_id];
> > +    meter = dp_meter_lookup(&dp->meter_tbl, meter_id);
> >      if (!meter) {
> > -        retval = ENOENT;
> > -        goto done;
> > +        return ENOENT;
> >      }
> > +
> >      if (stats) {
> >          int i = 0;
> >
> > +        ovs_mutex_lock(&meter->lock);
> >          stats->packet_in_count = meter->packet_count;
> >          stats->byte_in_count = meter->byte_count;
> >
> > @@ -5979,13 +6160,12 @@ dpif_netdev_meter_get(const struct dpif *dpif,
> >              stats->bands[i].packet_count = meter->bands[i].packet_count;
> >              stats->bands[i].byte_count = meter->bands[i].byte_count;
> >          }
> > +        ovs_mutex_unlock(&meter->lock);
> >
> >          stats->n_bands = i;
> >      }
> >
> > -done:
> > -    meter_unlock(dp, meter_id);
> > -    return retval;
> > +    return 0;
> >  }
> >
> >  static int
> > @@ -5994,15 +6174,16 @@ dpif_netdev_meter_del(struct dpif *dpif,
> >                        struct ofputil_meter_stats *stats, uint16_t n_bands)
> >  {
> >      struct dp_netdev *dp = get_dp_netdev(dpif);
> > +    struct dp_meter_table *meter_tbl = &dp->meter_tbl;
> >      int error;
> >
> >      error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
> >      if (!error) {
> >          uint32_t meter_id = meter_id_.uint32;
> >
> > -        meter_lock(dp, meter_id);
> > -        dp_delete_meter(dp, meter_id);
> > -        meter_unlock(dp, meter_id);
> > +        ovs_mutex_lock(&meter_tbl->lock);
> > +        dp_meter_detach_free(meter_tbl, meter_id);
> > +        ovs_mutex_unlock(&meter_tbl->lock);
> >      }
> >      return error;
> >  }
> >
>
diff mbox series

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index ef14e83b5f06..b5deaab31eb0 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -98,9 +98,12 @@  DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
 
 /* Configuration parameters. */
 enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */
-enum { MAX_METERS = 65536 };    /* Maximum number of meters. */
-enum { MAX_BANDS = 8 };         /* Maximum number of bands / meter. */
-enum { N_METER_LOCKS = 64 };    /* Maximum number of meters. */
+
+/* Maximum number of meters in the table. */
+#define METER_ENTRY_MAX (200000ULL)
+/* Maximum number of bands / meter. */
+#define METER_BAND_MAX  (8)
+#define DP_METER_ARRAY_SIZE_MIN (1ULL << 10)
 
 COVERAGE_DEFINE(datapath_drop_meter);
 COVERAGE_DEFINE(datapath_drop_upcall_error);
@@ -283,12 +286,26 @@  struct dp_meter {
     uint16_t flags;
     uint16_t n_bands;
     uint32_t max_delta_t;
+    uint32_t id;
+    struct ovs_mutex lock;
     uint64_t used;
     uint64_t packet_count;
     uint64_t byte_count;
     struct dp_meter_band bands[];
 };
 
+struct dp_meter_instance {
+    uint32_t n_meters;
+    /* Followed by struct dp_meter[n]; where n is the n_meters. */
+    OVSRCU_TYPE(struct dp_meter *) dp_meters[];
+};
+
+struct dp_meter_table {
+    OVSRCU_TYPE(struct dp_meter_instance *) ti;
+    uint32_t count;
+    struct ovs_mutex lock;
+};
+
 struct pmd_auto_lb {
     bool auto_lb_requested;     /* Auto load balancing requested by user. */
     bool is_enabled;            /* Current status of Auto load balancing. */
@@ -329,8 +346,7 @@  struct dp_netdev {
     atomic_uint32_t tx_flush_interval;
 
     /* Meters. */
-    struct ovs_mutex meter_locks[N_METER_LOCKS];
-    struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
+    struct dp_meter_table meter_tbl;
 
     /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
     OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
@@ -378,19 +394,6 @@  struct dp_netdev {
     struct pmd_auto_lb pmd_alb;
 };
 
-static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
-    OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
-{
-    ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
-}
-
-static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
-    OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
-{
-    ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
-}
-
-
 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
                                                     odp_port_t)
     OVS_REQUIRES(dp->port_mutex);
@@ -1523,6 +1526,9 @@  choose_port(struct dp_netdev *dp, const char *name)
     return ODPP_NONE;
 }
 
+static void dp_netdev_meter_init(struct dp_meter_table *tbl);
+static void dp_netdev_meter_destroy(struct dp_meter_table *tbl);
+
 static int
 create_dp_netdev(const char *name, const struct dpif_class *class,
                  struct dp_netdev **dpp)
@@ -1556,9 +1562,7 @@  create_dp_netdev(const char *name, const struct dpif_class *class,
     dp->reconfigure_seq = seq_create();
     dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
 
-    for (int i = 0; i < N_METER_LOCKS; ++i) {
-        ovs_mutex_init_adaptive(&dp->meter_locks[i]);
-    }
+    dp_netdev_meter_init(&dp->meter_tbl);
 
     /* Disable upcalls by default. */
     dp_netdev_disable_upcall(dp);
@@ -1647,16 +1651,6 @@  dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
     fat_rwlock_destroy(&dp->upcall_rwlock);
 }
 
-static void
-dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
-    OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
-{
-    if (dp->meters[meter_id]) {
-        free(dp->meters[meter_id]);
-        dp->meters[meter_id] = NULL;
-    }
-}
-
 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
  * through the 'dp_netdevs' shash while freeing 'dp'. */
 static void
@@ -1694,16 +1688,7 @@  dp_netdev_free(struct dp_netdev *dp)
     /* Upcalls must be disabled at this point */
     dp_netdev_destroy_upcall_lock(dp);
 
-    int i;
-
-    for (i = 0; i < MAX_METERS; ++i) {
-        meter_lock(dp, i);
-        dp_delete_meter(dp, i);
-        meter_unlock(dp, i);
-    }
-    for (i = 0; i < N_METER_LOCKS; ++i) {
-        ovs_mutex_destroy(&dp->meter_locks[i]);
-    }
+    dp_netdev_meter_destroy(&dp->meter_tbl);
 
     free(dp->pmd_cmask);
     free(CONST_CAST(char *, dp->name));
@@ -5713,14 +5698,197 @@  dp_netdev_disable_upcall(struct dp_netdev *dp)
 
 
 /* Meters */
+static uint32_t
+meter_hash(struct dp_meter_instance *ti, uint32_t id)
+{
+    uint32_t n_meters = ti->n_meters;
+
+    return id % n_meters;
+}
+
+static void
+dp_meter_free(struct dp_meter *meter)
+{
+    ovs_mutex_destroy(&meter->lock);
+    free(meter);
+}
+
+static struct dp_meter_instance *
+dp_meter_instance_alloc(const uint32_t size)
+{
+    struct dp_meter_instance *ti;
+
+    ti = xzalloc(sizeof(*ti) + sizeof(struct dp_meter *) * size);
+    ti->n_meters = size;
+
+    return ti;
+}
+
+static void
+dp_meter_instance_realloc(struct dp_meter_table *tbl, const uint32_t size)
+{
+    struct dp_meter_instance *new_ti;
+    struct dp_meter_instance *ti;
+    int n_meters;
+    int i;
+
+    new_ti = dp_meter_instance_alloc(size);
+
+    ti = ovsrcu_get(struct dp_meter_instance *, &tbl->ti);
+    n_meters = MIN(size, ti->n_meters);
+
+    for (i = 0; i < n_meters; i++) {
+        if (ovsrcu_get(struct dp_meter *, &ti->dp_meters[i])) {
+            new_ti->dp_meters[i] = ti->dp_meters[i];
+        }
+    }
+
+    ovsrcu_set(&tbl->ti, new_ti);
+    ovsrcu_postpone(free, ti);
+}
+
+static void
+dp_meter_instance_insert(struct dp_meter_instance *ti,
+                         struct dp_meter *meter)
+{
+    uint32_t hash;
+
+    hash = meter_hash(ti, meter->id);
+    ovsrcu_set(&ti->dp_meters[hash], meter);
+}
+
+static void
+dp_meter_instance_remove(struct dp_meter_instance *ti,
+                         struct dp_meter *meter)
+{
+    uint32_t hash;
+
+    hash = meter_hash(ti, meter->id);
+    ovsrcu_set(&ti->dp_meters[hash], NULL);
+}
+
+static void
+dp_netdev_meter_init(struct dp_meter_table *tbl)
+{
+    struct dp_meter_instance *ti;
+
+    ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN);
+    ovsrcu_set(&tbl->ti, ti);
+
+    ovs_mutex_init(&tbl->lock);
+    tbl->count = 0;
+}
+
+static void
+dp_netdev_meter_destroy(struct dp_meter_table *tbl)
+{
+    struct dp_meter_instance *ti;
+    int i;
+
+    ti = ovsrcu_get(struct dp_meter_instance *, &tbl->ti);
+    for (i = 0; i < ti->n_meters; i++) {
+        struct dp_meter *meter;
+
+        meter = ovsrcu_get(struct dp_meter *, &ti->dp_meters[i]);
+        if (meter) {
+            ovsrcu_postpone(dp_meter_free, meter);
+        }
+    }
+
+    ovsrcu_postpone(free, ti);
+    ovs_mutex_destroy(&tbl->lock);
+}
+
+static struct dp_meter *
+dp_meter_lookup(struct dp_meter_table *meter_tbl, uint32_t meter_id)
+{
+    struct dp_meter_instance *ti;
+    struct dp_meter *meter;
+    uint32_t hash;
+
+    ti = ovsrcu_get(struct dp_meter_instance *, &meter_tbl->ti);
+    hash = meter_hash(ti, meter_id);
+
+    meter = ovsrcu_get(struct dp_meter *, &ti->dp_meters[hash]);
+    if (meter && meter->id == meter_id) {
+        return meter;
+    }
+
+    return NULL;
+}
+
+static void
+dp_meter_detach_free(struct dp_meter_table *meter_tbl, uint32_t meter_id)
+                     OVS_REQUIRES(meter_tbl->lock)
+{
+    struct dp_meter_instance *ti;
+    struct dp_meter *meter;
+
+    meter = dp_meter_lookup(meter_tbl, meter_id);
+    if (!meter) {
+        return;
+    }
+
+    ti = ovsrcu_get(struct dp_meter_instance *, &meter_tbl->ti);
+    dp_meter_instance_remove(ti, meter);
+    ovsrcu_postpone(dp_meter_free, meter);
+
+    meter_tbl->count--;
+    /* Shrink the meter array if necessary. */
+    if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN &&
+        meter_tbl->count <= (ti->n_meters / 4)) {
+        int half_size = ti->n_meters / 2;
+        int i;
+
+        /* Avoid hash collision, don't move slots to other place.
+         * Make sure there are no references of meters in array
+         * which will be released.
+         */
+        for (i = half_size; i < ti->n_meters; i++) {
+            if (ovsrcu_get(struct dp_meter *, &ti->dp_meters[i])) {
+                return;
+            }
+        }
+
+        dp_meter_instance_realloc(meter_tbl, half_size);
+    }
+}
+
+static int
+dp_meter_attach(struct dp_meter_table *meter_tbl, struct dp_meter *meter)
+                OVS_REQUIRES(meter_tbl->lock)
+{
+    struct dp_meter_instance *ti;
+    uint32_t hash;
+
+    ti = ovsrcu_get(struct dp_meter_instance *, &meter_tbl->ti);
+    hash = meter_hash(ti, meter->id);
+
+    if (OVS_UNLIKELY(ovsrcu_get(struct dp_meter *,
+                                &ti->dp_meters[hash]))) {
+        VLOG_WARN("Failed to attach meter id %u to slot %u/%u.\n",
+                  meter->id, hash, ti->n_meters);
+        return EBUSY;
+    }
+
+    dp_meter_instance_insert(ti, meter);
+
+    meter_tbl->count++;
+    if (meter_tbl->count >= ti->n_meters) {
+        dp_meter_instance_realloc(meter_tbl, ti->n_meters * 2);
+    }
+
+    return 0;
+}
+
 static void
 dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
                                struct ofputil_meter_features *features)
 {
-    features->max_meters = MAX_METERS;
+    features->max_meters = METER_ENTRY_MAX;
     features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
     features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
-    features->max_bands = MAX_BANDS;
+    features->max_bands = METER_BAND_MAX;
     features->max_color = 0;
 }
 
@@ -5742,14 +5910,13 @@  dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
     uint32_t exceeded_rate[NETDEV_MAX_BURST];
     int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
 
-    if (meter_id >= MAX_METERS) {
+    if (meter_id >= METER_ENTRY_MAX) {
         return;
     }
 
-    meter_lock(dp, meter_id);
-    meter = dp->meters[meter_id];
+    meter = dp_meter_lookup(&dp->meter_tbl, meter_id);
     if (!meter) {
-        goto out;
+        return;
     }
 
     /* Initialize as negative values. */
@@ -5757,6 +5924,7 @@  dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
     /* Initialize as zeroes. */
     memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
 
+    ovs_mutex_lock(&meter->lock);
     /* All packets will hit the meter at the same time. */
     long_delta_t = now / 1000 - meter->used / 1000; /* msec */
 
@@ -5874,8 +6042,8 @@  dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
             dp_packet_batch_refill(packets_, packet, j);
         }
     }
- out:
-    meter_unlock(dp, meter_id);
+
+    ovs_mutex_unlock(&meter->lock);
 }
 
 /* Meter set/get/del processing is still single-threaded. */
@@ -5884,11 +6052,12 @@  dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
                       struct ofputil_meter_config *config)
 {
     struct dp_netdev *dp = get_dp_netdev(dpif);
+    struct dp_meter_table *meter_tbl = &dp->meter_tbl;
     uint32_t mid = meter_id.uint32;
     struct dp_meter *meter;
-    int i;
+    int err, i;
 
-    if (mid >= MAX_METERS) {
+    if (mid >= METER_ENTRY_MAX) {
         return EFBIG; /* Meter_id out of range. */
     }
 
@@ -5896,7 +6065,7 @@  dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
         return EBADF; /* Unsupported flags set */
     }
 
-    if (config->n_bands > MAX_BANDS) {
+    if (config->n_bands > METER_BAND_MAX) {
         return EINVAL;
     }
 
@@ -5917,6 +6086,8 @@  dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
     meter->n_bands = config->n_bands;
     meter->max_delta_t = 0;
     meter->used = time_usec();
+    meter->id = mid;
+    ovs_mutex_init(&meter->lock);
 
     /* set up bands */
     for (i = 0; i < config->n_bands; ++i) {
@@ -5942,12 +6113,22 @@  dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
         }
     }
 
-    meter_lock(dp, mid);
-    dp_delete_meter(dp, mid); /* Free existing meter, if any */
-    dp->meters[mid] = meter;
-    meter_unlock(dp, mid);
+    ovs_mutex_lock(&meter_tbl->lock);
+
+    dp_meter_detach_free(meter_tbl, mid); /* Free existing meter, if any */
+    err = dp_meter_attach(meter_tbl, meter);
+    if (err) {
+        goto unlock_out;
+    }
+
+    ovs_mutex_unlock(&meter_tbl->lock);
 
     return 0;
+
+unlock_out:
+    ovs_mutex_unlock(&meter_tbl->lock);
+    dp_meter_free(meter);
+    return err;
 }
 
 static int
@@ -5955,23 +6136,23 @@  dpif_netdev_meter_get(const struct dpif *dpif,
                       ofproto_meter_id meter_id_,
                       struct ofputil_meter_stats *stats, uint16_t n_bands)
 {
-    const struct dp_netdev *dp = get_dp_netdev(dpif);
+    struct dp_netdev *dp = get_dp_netdev(dpif);
     uint32_t meter_id = meter_id_.uint32;
-    int retval = 0;
+    const struct dp_meter *meter;
 
-    if (meter_id >= MAX_METERS) {
+    if (meter_id >= METER_ENTRY_MAX) {
         return EFBIG;
     }
 
-    meter_lock(dp, meter_id);
-    const struct dp_meter *meter = dp->meters[meter_id];
+    meter = dp_meter_lookup(&dp->meter_tbl, meter_id);
     if (!meter) {
-        retval = ENOENT;
-        goto done;
+        return ENOENT;
     }
+
     if (stats) {
         int i = 0;
 
+        ovs_mutex_lock(&meter->lock);
         stats->packet_in_count = meter->packet_count;
         stats->byte_in_count = meter->byte_count;
 
@@ -5979,13 +6160,12 @@  dpif_netdev_meter_get(const struct dpif *dpif,
             stats->bands[i].packet_count = meter->bands[i].packet_count;
             stats->bands[i].byte_count = meter->bands[i].byte_count;
         }
+        ovs_mutex_unlock(&meter->lock);
 
         stats->n_bands = i;
     }
 
-done:
-    meter_unlock(dp, meter_id);
-    return retval;
+    return 0;
 }
 
 static int
@@ -5994,15 +6174,16 @@  dpif_netdev_meter_del(struct dpif *dpif,
                       struct ofputil_meter_stats *stats, uint16_t n_bands)
 {
     struct dp_netdev *dp = get_dp_netdev(dpif);
+    struct dp_meter_table *meter_tbl = &dp->meter_tbl;
     int error;
 
     error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
     if (!error) {
         uint32_t meter_id = meter_id_.uint32;
 
-        meter_lock(dp, meter_id);
-        dp_delete_meter(dp, meter_id);
-        meter_unlock(dp, meter_id);
+        ovs_mutex_lock(&meter_tbl->lock);
+        dp_meter_detach_free(meter_tbl, meter_id);
+        ovs_mutex_unlock(&meter_tbl->lock);
     }
     return error;
 }