diff mbox

[ovs-dev,ovs,V2,12/21] dpif-netlink: Use netdev flow put api to insert a flow

Message ID 1482665989-791-13-git-send-email-paulb@mellanox.com
State Changes Requested
Headers show

Commit Message

Paul Blakey Dec. 25, 2016, 11:39 a.m. UTC
Using the new netdev flow api operate will now try and
offload flows to the relevant netdev of the input port.
Other operate methods flows will come in later patches.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
---
 lib/dpif-netlink.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 228 insertions(+), 4 deletions(-)

Comments

Joe Stringer Jan. 5, 2017, 11:25 p.m. UTC | #1
On 25 December 2016 at 03:39, Paul Blakey <paulb@mellanox.com> wrote:
> Using the new netdev flow api operate will now try and
> offload flows to the relevant netdev of the input port.
> Other operate methods flows will come in later patches.
>
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> Reviewed-by: Roi Dayan <roid@mellanox.com>
> ---
>  lib/dpif-netlink.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 228 insertions(+), 4 deletions(-)
>
> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> index 3d8940e..717af90 100644
> --- a/lib/dpif-netlink.c
> +++ b/lib/dpif-netlink.c
> @@ -1908,15 +1908,239 @@ dpif_netlink_operate__(struct dpif_netlink *dpif,
>      return n_ops;
>  }
>
> +static int
> +parse_key_and_mask_to_match(const struct nlattr *key, size_t key_len,
> +                            const struct nlattr *mask, size_t mask_len,
> +                            struct match *match)
> +{
> +    enum odp_key_fitness fitness;
> +
> +    fitness = odp_flow_key_to_flow(key, key_len, &match->flow);
> +    if (fitness) {
> +        /* This should not happen: it indicates that odp_flow_key_from_flow()
> +         * and odp_flow_key_to_flow() disagree on the acceptable form of a
> +         * flow.  Log the problem as an error, with enough details to enable
> +         * debugging. */
> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> +
> +        if (!VLOG_DROP_ERR(&rl)) {
> +            struct ds s;
> +
> +            ds_init(&s);
> +            odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
> +            VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
> +            ds_destroy(&s);
> +        }
> +
> +        return EINVAL;
> +    }
> +
> +    fitness = odp_flow_key_to_mask(mask, mask_len, &match->wc, &match->flow);
> +    if (fitness) {
> +        /* This should not happen: it indicates that
> +         * odp_flow_key_from_mask() and odp_flow_key_to_mask()
> +         * disagree on the acceptable form of a mask.  Log the problem
> +         * as an error, with enough details to enable debugging. */
> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> +
> +        if (!VLOG_DROP_ERR(&rl)) {
> +            struct ds s;
> +
> +            VLOG_ERR("internal error parsing flow mask %s (%s)",
> +                     ds_cstr(&s), odp_key_fitness_to_string(fitness));
> +            ds_destroy(&s);
> +        }
> +
> +        return EINVAL;
> +    }
> +
> +    return 0;
> +}

Duplicated code. Please refactor and reuse. (Separate patch for
refactoring to keep the patches clear).

> +
> +static bool
> +parse_flow_put(struct dpif_netlink *dpif, struct dpif_flow_put *put)
> +{
> +    struct match match;
> +    odp_port_t in_port;
> +    const struct nlattr *nla;
> +    size_t left;
> +    int outputs = 0;
> +    struct ofpbuf buf;
> +    uint64_t act_stub[1024 / 8];
> +    size_t offset;
> +    struct nlattr *act;
> +    struct netdev *dev;
> +    int err;
> +
> +    /* 0x1234 - fake eth type sent to probe feature */
> +    if (put->flags & DPIF_FP_PROBE || match.flow.dl_type == htons(0x1234)) {
> +        return false;
> +    }

Shouldn't DPIF_FP_PROBE be sufficient?

> +
> +    if (parse_key_and_mask_to_match(put->key, put->key_len, put->mask,
> +                                put->mask_len, &match)) {
> +        return false;
> +    }
> +
> +    in_port = match.flow.in_port.odp_port;
> +    ofpbuf_use_stub(&buf, act_stub, sizeof act_stub);
> +    offset = nl_msg_start_nested(&buf, OVS_FLOW_ATTR_ACTIONS);
> +    NL_ATTR_FOR_EACH(nla, left, put->actions, put->actions_len) {
> +        if (nl_attr_type(nla) == OVS_ACTION_ATTR_OUTPUT) {
> +            struct netdev *outdev;
> +            int ifindex_out;
> +            const struct netdev_tunnel_config *tnl_cfg;
> +            size_t out_off;
> +            odp_port_t out_port;
> +
> +            outputs++;
> +            if (outputs > 1) {
> +                break;
> +            }
> +
> +            out_port = nl_attr_get_u32(nla);
> +            outdev = netdev_hmap_port_get(out_port, dpif->dpif.dpif_class);
> +            tnl_cfg = netdev_get_tunnel_config(outdev);
> +
> +            out_off = nl_msg_start_nested(&buf, OVS_ACTION_ATTR_OUTPUT);
> +            ifindex_out = netdev_get_ifindex(outdev);

Can this fail? (port not backed by device with ifindex?)

> +            nl_msg_put_u32(&buf, OVS_ACTION_ATTR_OUTPUT, ifindex_out);
> +            if (tnl_cfg && tnl_cfg->dst_port != 0) {
> +                nl_msg_put_u32(&buf, OVS_TUNNEL_KEY_ATTR_TP_DST, tnl_cfg->dst_port);
> +            }
> +            nl_msg_end_nested(&buf, out_off);
> +
> +            if (outdev) {
> +                netdev_close(outdev);
> +            }
> +        } else {
> +            nl_msg_put_unspec(&buf, nl_attr_type(nla), nl_attr_get(nla),
> +                              nl_attr_get_size(nla));
> +        }
> +    }
> +    nl_msg_end_nested(&buf, offset);

Hmm. I'm a bit uneasy about this whole copying/rewriting of the
actions. Firstly, the tunnel stuff just looks wrong. A quick "git
grep" shows that OVS_TUNNEL_KEY_ATTR_TP_DST is using the wrong type,
and looking closer at the other places it is used, it looks like this
function is shortcutting a whole bunch of the tunnel config.

I recognise that the ODP port number needs to be translated into an
ifindex. Do we need to do anything else?

Also, these actions are only going to end up getting translated again,
into the actual tc format so is there a way we can get rid of this
extra piece?

'buf' is used with ofpbuf_use_stub(), and the comment says that the
caller must call ofpbuf_uninit() on the buffer in case it was
reallocated with malloc'd memory.

> +
> +    if (outputs > 1) {
> +        return false;
> +    }
> +
> +    act = ofpbuf_at_assert(&buf, offset, sizeof(struct nlattr));
> +    dev = netdev_hmap_port_get(in_port, dpif->dpif.dpif_class);
> +    err = netdev_flow_put(dev, &match, CONST_CAST(struct nlattr *,
> +                                                  nl_attr_get(act)),
> +                          nl_attr_get_size(act), put->stats,
> +                          CONST_CAST(ovs_u128 *, put->ufid));
> +    netdev_close(dev);
> +
> +    if (!err) {
> +        if (put->flags & DPIF_FP_MODIFY) {
> +            struct dpif_op *opp;
> +            struct dpif_op op;
> +
> +            op.type = DPIF_OP_FLOW_DEL;
> +            op.u.flow_del.key = put->key;
> +            op.u.flow_del.key_len = put->key_len;
> +            op.u.flow_del.ufid = put->ufid;
> +            op.u.flow_del.pmd_id = put->pmd_id;
> +            op.u.flow_del.stats = NULL;
> +            op.u.flow_del.terse = false;
> +
> +            opp = &op;
> +            dpif_netlink_operate__(dpif, &opp, 1);

Won't this just delete the flow that was just added?

Is there a modify at the tc layer?

> +        }
> +        VLOG_DBG("added flow");
> +        return true;
> +    }
> +    VLOG_DBG("failed adding flow: %s", ovs_strerror(err));
> +
> +    return false;
> +}
> +
> +static void
> +dbg_print_flow(const struct nlattr *key, size_t key_len,
> +               const struct nlattr *mask, size_t mask_len,
> +               const struct nlattr *actions, size_t actions_len,
> +               const ovs_u128 *ufid,
> +               const char *op)
> +{
> +        struct ds s;
> +
> +        ds_init(&s);
> +        ds_put_cstr(&s, op);
> +        ds_put_cstr(&s, " (");
> +        odp_format_ufid(ufid, &s);
> +        ds_put_cstr(&s, ")");
> +        if (key_len) {
> +            ds_put_cstr(&s, "\nflow (verbose): ");
> +            odp_flow_format(key, key_len, mask, mask_len, NULL, &s, true);
> +            ds_put_cstr(&s, "\nflow: ");
> +            odp_flow_format(key, key_len, mask, mask_len, NULL, &s, false);
> +        }
> +        if (actions_len) {
> +            ds_put_cstr(&s, "\nactions: ");
> +            format_odp_actions(&s, actions, actions_len);
> +        }
> +        VLOG_DBG("\n%s", ds_cstr(&s));
> +        ds_destroy(&s);
> +}
> +
> +static bool
> +try_send_to_netdev(struct dpif_netlink *dpif, struct dpif_op *op)
> +{
> +    switch (op->type) {
> +    case DPIF_OP_FLOW_PUT: {
> +        struct dpif_flow_put *put = &op->u.flow_put;
> +
> +        if (!put->ufid) {
> +            return false;
> +        }
> +        dbg_print_flow(put->key, put->key_len, put->mask, put->mask_len,
> +                       put->actions, put->actions_len, put->ufid, "PUT");
> +        return parse_flow_put(dpif, put);
> +    }
> +    case DPIF_OP_FLOW_DEL:
> +    case DPIF_OP_FLOW_GET:
> +    case DPIF_OP_EXECUTE:
> +    default:

Do you never delete from hardware?

> +        break;
> +    }
> +    return false;
> +}
> +
>  static void
>  dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t n_ops)
>  {
>      struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
> +    struct dpif_op **new_ops;
> +    int n_new_ops = 0;
> +    int i = 0;
> +
> +    if (!netdev_flow_api_enabled) {

In general, it is easier to read if (foo) {...} else {...} rather than
if(!foo) {...} else {...}.

> +        new_ops = ops;
> +        n_new_ops = n_ops;
> +    } else {
> +        new_ops = xmalloc((sizeof *new_ops) * n_ops);
> +        n_new_ops = 0;
> +
> +        for (i = 0; i < n_ops; i++) {
> +            if (try_send_to_netdev(dpif, ops[i])) {
> +                ops[i]->error = 0;
> +            } else {
> +                new_ops[n_new_ops++] = ops[i];
> +            }
> +        }
> +        ops = new_ops;

Please don't overwrite function parameters.

> +    }
> +
> +    while (n_new_ops > 0) {
> +        size_t chunk = dpif_netlink_operate__(dpif, new_ops, n_new_ops);
>
> -    while (n_ops > 0) {
> -        size_t chunk = dpif_netlink_operate__(dpif, ops, n_ops);
> -        ops += chunk;
> -        n_ops -= chunk;
> +        new_ops += chunk;
> +        n_new_ops -= chunk;
> +    }
> +
> +    if (netdev_flow_api_enabled) {
> +       free(ops);
>      }
>  }

I think you can get rid of the malloc/free by using a style closer to
the one at the beginning of push_dp_ops() in ofproto-dpif-upcall.c.
Joe Stringer Jan. 5, 2017, 11:28 p.m. UTC | #2
On 25 December 2016 at 03:39, Paul Blakey <paulb@mellanox.com> wrote:
> Using the new netdev flow api operate will now try and
> offload flows to the relevant netdev of the input port.
> Other operate methods flows will come in later patches.
>
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> Reviewed-by: Roi Dayan <roid@mellanox.com>
> ---
>  lib/dpif-netlink.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 228 insertions(+), 4 deletions(-)
>
> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> index 3d8940e..717af90 100644
> --- a/lib/dpif-netlink.c
> +++ b/lib/dpif-netlink.c
> @@ -1908,15 +1908,239 @@ dpif_netlink_operate__(struct dpif_netlink *dpif,
>      return n_ops;
>  }
>
> +static int
> +parse_key_and_mask_to_match(const struct nlattr *key, size_t key_len,
> +                            const struct nlattr *mask, size_t mask_len,
> +                            struct match *match)
> +{
> +    enum odp_key_fitness fitness;
> +
> +    fitness = odp_flow_key_to_flow(key, key_len, &match->flow);
> +    if (fitness) {
> +        /* This should not happen: it indicates that odp_flow_key_from_flow()
> +         * and odp_flow_key_to_flow() disagree on the acceptable form of a
> +         * flow.  Log the problem as an error, with enough details to enable
> +         * debugging. */
> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> +
> +        if (!VLOG_DROP_ERR(&rl)) {
> +            struct ds s;
> +
> +            ds_init(&s);
> +            odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
> +            VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
> +            ds_destroy(&s);
> +        }
> +
> +        return EINVAL;
> +    }
> +
> +    fitness = odp_flow_key_to_mask(mask, mask_len, &match->wc, &match->flow);
> +    if (fitness) {
> +        /* This should not happen: it indicates that
> +         * odp_flow_key_from_mask() and odp_flow_key_to_mask()
> +         * disagree on the acceptable form of a mask.  Log the problem
> +         * as an error, with enough details to enable debugging. */
> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> +
> +        if (!VLOG_DROP_ERR(&rl)) {
> +            struct ds s;
> +
> +            VLOG_ERR("internal error parsing flow mask %s (%s)",
> +                     ds_cstr(&s), odp_key_fitness_to_string(fitness));
> +            ds_destroy(&s);
> +        }
> +
> +        return EINVAL;
> +    }
> +
> +    return 0;
> +}
> +
> +static bool
> +parse_flow_put(struct dpif_netlink *dpif, struct dpif_flow_put *put)
> +{
> +    struct match match;
> +    odp_port_t in_port;
> +    const struct nlattr *nla;
> +    size_t left;
> +    int outputs = 0;
> +    struct ofpbuf buf;
> +    uint64_t act_stub[1024 / 8];
> +    size_t offset;
> +    struct nlattr *act;
> +    struct netdev *dev;
> +    int err;
> +
> +    /* 0x1234 - fake eth type sent to probe feature */
> +    if (put->flags & DPIF_FP_PROBE || match.flow.dl_type == htons(0x1234)) {
> +        return false;
> +    }
> +
> +    if (parse_key_and_mask_to_match(put->key, put->key_len, put->mask,
> +                                put->mask_len, &match)) {
> +        return false;
> +    }
> +
> +    in_port = match.flow.in_port.odp_port;
> +    ofpbuf_use_stub(&buf, act_stub, sizeof act_stub);
> +    offset = nl_msg_start_nested(&buf, OVS_FLOW_ATTR_ACTIONS);
> +    NL_ATTR_FOR_EACH(nla, left, put->actions, put->actions_len) {
> +        if (nl_attr_type(nla) == OVS_ACTION_ATTR_OUTPUT) {
> +            struct netdev *outdev;
> +            int ifindex_out;
> +            const struct netdev_tunnel_config *tnl_cfg;
> +            size_t out_off;
> +            odp_port_t out_port;
> +
> +            outputs++;
> +            if (outputs > 1) {
> +                break;
> +            }
> +
> +            out_port = nl_attr_get_u32(nla);
> +            outdev = netdev_hmap_port_get(out_port, dpif->dpif.dpif_class);
> +            tnl_cfg = netdev_get_tunnel_config(outdev);
> +
> +            out_off = nl_msg_start_nested(&buf, OVS_ACTION_ATTR_OUTPUT);
> +            ifindex_out = netdev_get_ifindex(outdev);
> +            nl_msg_put_u32(&buf, OVS_ACTION_ATTR_OUTPUT, ifindex_out);
> +            if (tnl_cfg && tnl_cfg->dst_port != 0) {
> +                nl_msg_put_u32(&buf, OVS_TUNNEL_KEY_ATTR_TP_DST, tnl_cfg->dst_port);
> +            }
> +            nl_msg_end_nested(&buf, out_off);
> +
> +            if (outdev) {
> +                netdev_close(outdev);
> +            }
> +        } else {
> +            nl_msg_put_unspec(&buf, nl_attr_type(nla), nl_attr_get(nla),
> +                              nl_attr_get_size(nla));
> +        }
> +    }
> +    nl_msg_end_nested(&buf, offset);
> +
> +    if (outputs > 1) {
> +        return false;
> +    }
> +
> +    act = ofpbuf_at_assert(&buf, offset, sizeof(struct nlattr));
> +    dev = netdev_hmap_port_get(in_port, dpif->dpif.dpif_class);
> +    err = netdev_flow_put(dev, &match, CONST_CAST(struct nlattr *,
> +                                                  nl_attr_get(act)),
> +                          nl_attr_get_size(act), put->stats,
> +                          CONST_CAST(ovs_u128 *, put->ufid));
> +    netdev_close(dev);
> +
> +    if (!err) {
> +        if (put->flags & DPIF_FP_MODIFY) {
> +            struct dpif_op *opp;
> +            struct dpif_op op;
> +
> +            op.type = DPIF_OP_FLOW_DEL;
> +            op.u.flow_del.key = put->key;
> +            op.u.flow_del.key_len = put->key_len;
> +            op.u.flow_del.ufid = put->ufid;
> +            op.u.flow_del.pmd_id = put->pmd_id;
> +            op.u.flow_del.stats = NULL;
> +            op.u.flow_del.terse = false;
> +
> +            opp = &op;
> +            dpif_netlink_operate__(dpif, &opp, 1);
> +        }
> +        VLOG_DBG("added flow");
> +        return true;
> +    }
> +    VLOG_DBG("failed adding flow: %s", ovs_strerror(err));
> +
> +    return false;
> +}
> +
> +static void
> +dbg_print_flow(const struct nlattr *key, size_t key_len,
> +               const struct nlattr *mask, size_t mask_len,
> +               const struct nlattr *actions, size_t actions_len,
> +               const ovs_u128 *ufid,
> +               const char *op)
> +{
> +        struct ds s;
> +
> +        ds_init(&s);
> +        ds_put_cstr(&s, op);
> +        ds_put_cstr(&s, " (");
> +        odp_format_ufid(ufid, &s);
> +        ds_put_cstr(&s, ")");
> +        if (key_len) {
> +            ds_put_cstr(&s, "\nflow (verbose): ");
> +            odp_flow_format(key, key_len, mask, mask_len, NULL, &s, true);
> +            ds_put_cstr(&s, "\nflow: ");
> +            odp_flow_format(key, key_len, mask, mask_len, NULL, &s, false);
> +        }
> +        if (actions_len) {
> +            ds_put_cstr(&s, "\nactions: ");
> +            format_odp_actions(&s, actions, actions_len);
> +        }
> +        VLOG_DBG("\n%s", ds_cstr(&s));
> +        ds_destroy(&s);
> +}
> +
> +static bool
> +try_send_to_netdev(struct dpif_netlink *dpif, struct dpif_op *op)
> +{
> +    switch (op->type) {
> +    case DPIF_OP_FLOW_PUT: {
> +        struct dpif_flow_put *put = &op->u.flow_put;
> +
> +        if (!put->ufid) {
> +            return false;
> +        }
> +        dbg_print_flow(put->key, put->key_len, put->mask, put->mask_len,
> +                       put->actions, put->actions_len, put->ufid, "PUT");
> +        return parse_flow_put(dpif, put);
> +    }
> +    case DPIF_OP_FLOW_DEL:
> +    case DPIF_OP_FLOW_GET:
> +    case DPIF_OP_EXECUTE:
> +    default:
> +        break;
> +    }
> +    return false;
> +}
> +
>  static void
>  dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t n_ops)
>  {
>      struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
> +    struct dpif_op **new_ops;
> +    int n_new_ops = 0;
> +    int i = 0;
> +
> +    if (!netdev_flow_api_enabled) {
> +        new_ops = ops;
> +        n_new_ops = n_ops;
> +    } else {
> +        new_ops = xmalloc((sizeof *new_ops) * n_ops);
> +        n_new_ops = 0;
> +
> +        for (i = 0; i < n_ops; i++) {
> +            if (try_send_to_netdev(dpif, ops[i])) {
> +                ops[i]->error = 0;

What if the hardware returns EEXIST? Shouldn't we return EEXIST?

What if the hardware reaches some resource constraint? This isn't
required for an initial implementation, but it may be nice to have
some heuristic to try to cut down on the failed syscalls if userspace
has become aware that the hardware is out of resources. (Getting good
visibility on this would also matter if you tried to deploy this).

> +            } else {
> +                new_ops[n_new_ops++] = ops[i];
> +            }
> +        }
> +        ops = new_ops;
> +    }
> +
> +    while (n_new_ops > 0) {
> +        size_t chunk = dpif_netlink_operate__(dpif, new_ops, n_new_ops);
>
> -    while (n_ops > 0) {
> -        size_t chunk = dpif_netlink_operate__(dpif, ops, n_ops);
> -        ops += chunk;
> -        n_ops -= chunk;
> +        new_ops += chunk;
> +        n_new_ops -= chunk;
> +    }
> +
> +    if (netdev_flow_api_enabled) {
> +       free(ops);
>      }
>  }
>
> --
> 1.8.3.1
>
Paul Blakey Jan. 10, 2017, 2:36 p.m. UTC | #3
On 06/01/2017 01:28, Joe Stringer wrote:
> On 25 December 2016 at 03:39, Paul Blakey <paulb@mellanox.com> wrote:
>> Using the new netdev flow api operate will now try and
>> offload flows to the relevant netdev of the input port.
>> Other operate methods flows will come in later patches.
>>
>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>> Reviewed-by: Roi Dayan <roid@mellanox.com>
>> ---
>>   lib/dpif-netlink.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 228 insertions(+), 4 deletions(-)
>>
>> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
>> index 3d8940e..717af90 100644
>> --- a/lib/dpif-netlink.c
>> +++ b/lib/dpif-netlink.c
>> @@ -1908,15 +1908,239 @@ dpif_netlink_operate__(struct dpif_netlink *dpif,
>>       return n_ops;
>>   }
>>
>> +static int
>> +parse_key_and_mask_to_match(const struct nlattr *key, size_t key_len,
>> +                            const struct nlattr *mask, size_t mask_len,
>> +                            struct match *match)
>> +{
>> +    enum odp_key_fitness fitness;
>> +
>> +    fitness = odp_flow_key_to_flow(key, key_len, &match->flow);
>> +    if (fitness) {
>> +        /* This should not happen: it indicates that odp_flow_key_from_flow()
>> +         * and odp_flow_key_to_flow() disagree on the acceptable form of a
>> +         * flow.  Log the problem as an error, with enough details to enable
>> +         * debugging. */
>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>> +
>> +        if (!VLOG_DROP_ERR(&rl)) {
>> +            struct ds s;
>> +
>> +            ds_init(&s);
>> +            odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
>> +            VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
>> +            ds_destroy(&s);
>> +        }
>> +
>> +        return EINVAL;
>> +    }
>> +
>> +    fitness = odp_flow_key_to_mask(mask, mask_len, &match->wc, &match->flow);
>> +    if (fitness) {
>> +        /* This should not happen: it indicates that
>> +         * odp_flow_key_from_mask() and odp_flow_key_to_mask()
>> +         * disagree on the acceptable form of a mask.  Log the problem
>> +         * as an error, with enough details to enable debugging. */
>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>> +
>> +        if (!VLOG_DROP_ERR(&rl)) {
>> +            struct ds s;
>> +
>> +            VLOG_ERR("internal error parsing flow mask %s (%s)",
>> +                     ds_cstr(&s), odp_key_fitness_to_string(fitness));
>> +            ds_destroy(&s);
>> +        }
>> +
>> +        return EINVAL;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static bool
>> +parse_flow_put(struct dpif_netlink *dpif, struct dpif_flow_put *put)
>> +{
>> +    struct match match;
>> +    odp_port_t in_port;
>> +    const struct nlattr *nla;
>> +    size_t left;
>> +    int outputs = 0;
>> +    struct ofpbuf buf;
>> +    uint64_t act_stub[1024 / 8];
>> +    size_t offset;
>> +    struct nlattr *act;
>> +    struct netdev *dev;
>> +    int err;
>> +
>> +    /* 0x1234 - fake eth type sent to probe feature */
>> +    if (put->flags & DPIF_FP_PROBE || match.flow.dl_type == htons(0x1234)) {
>> +        return false;
>> +    }
>> +
>> +    if (parse_key_and_mask_to_match(put->key, put->key_len, put->mask,
>> +                                put->mask_len, &match)) {
>> +        return false;
>> +    }
>> +
>> +    in_port = match.flow.in_port.odp_port;
>> +    ofpbuf_use_stub(&buf, act_stub, sizeof act_stub);
>> +    offset = nl_msg_start_nested(&buf, OVS_FLOW_ATTR_ACTIONS);
>> +    NL_ATTR_FOR_EACH(nla, left, put->actions, put->actions_len) {
>> +        if (nl_attr_type(nla) == OVS_ACTION_ATTR_OUTPUT) {
>> +            struct netdev *outdev;
>> +            int ifindex_out;
>> +            const struct netdev_tunnel_config *tnl_cfg;
>> +            size_t out_off;
>> +            odp_port_t out_port;
>> +
>> +            outputs++;
>> +            if (outputs > 1) {
>> +                break;
>> +            }
>> +
>> +            out_port = nl_attr_get_u32(nla);
>> +            outdev = netdev_hmap_port_get(out_port, dpif->dpif.dpif_class);
>> +            tnl_cfg = netdev_get_tunnel_config(outdev);
>> +
>> +            out_off = nl_msg_start_nested(&buf, OVS_ACTION_ATTR_OUTPUT);
>> +            ifindex_out = netdev_get_ifindex(outdev);
>> +            nl_msg_put_u32(&buf, OVS_ACTION_ATTR_OUTPUT, ifindex_out);
>> +            if (tnl_cfg && tnl_cfg->dst_port != 0) {
>> +                nl_msg_put_u32(&buf, OVS_TUNNEL_KEY_ATTR_TP_DST, tnl_cfg->dst_port);
>> +            }
>> +            nl_msg_end_nested(&buf, out_off);
>> +
>> +            if (outdev) {
>> +                netdev_close(outdev);
>> +            }
>> +        } else {
>> +            nl_msg_put_unspec(&buf, nl_attr_type(nla), nl_attr_get(nla),
>> +                              nl_attr_get_size(nla));
>> +        }
>> +    }
>> +    nl_msg_end_nested(&buf, offset);
>> +
>> +    if (outputs > 1) {
>> +        return false;
>> +    }
>> +
>> +    act = ofpbuf_at_assert(&buf, offset, sizeof(struct nlattr));
>> +    dev = netdev_hmap_port_get(in_port, dpif->dpif.dpif_class);
>> +    err = netdev_flow_put(dev, &match, CONST_CAST(struct nlattr *,
>> +                                                  nl_attr_get(act)),
>> +                          nl_attr_get_size(act), put->stats,
>> +                          CONST_CAST(ovs_u128 *, put->ufid));
>> +    netdev_close(dev);
>> +
>> +    if (!err) {
>> +        if (put->flags & DPIF_FP_MODIFY) {
>> +            struct dpif_op *opp;
>> +            struct dpif_op op;
>> +
>> +            op.type = DPIF_OP_FLOW_DEL;
>> +            op.u.flow_del.key = put->key;
>> +            op.u.flow_del.key_len = put->key_len;
>> +            op.u.flow_del.ufid = put->ufid;
>> +            op.u.flow_del.pmd_id = put->pmd_id;
>> +            op.u.flow_del.stats = NULL;
>> +            op.u.flow_del.terse = false;
>> +
>> +            opp = &op;
>> +            dpif_netlink_operate__(dpif, &opp, 1);
>> +        }
>> +        VLOG_DBG("added flow");
>> +        return true;
>> +    }
>> +    VLOG_DBG("failed adding flow: %s", ovs_strerror(err));
>> +
>> +    return false;
>> +}
>> +
>> +static void
>> +dbg_print_flow(const struct nlattr *key, size_t key_len,
>> +               const struct nlattr *mask, size_t mask_len,
>> +               const struct nlattr *actions, size_t actions_len,
>> +               const ovs_u128 *ufid,
>> +               const char *op)
>> +{
>> +        struct ds s;
>> +
>> +        ds_init(&s);
>> +        ds_put_cstr(&s, op);
>> +        ds_put_cstr(&s, " (");
>> +        odp_format_ufid(ufid, &s);
>> +        ds_put_cstr(&s, ")");
>> +        if (key_len) {
>> +            ds_put_cstr(&s, "\nflow (verbose): ");
>> +            odp_flow_format(key, key_len, mask, mask_len, NULL, &s, true);
>> +            ds_put_cstr(&s, "\nflow: ");
>> +            odp_flow_format(key, key_len, mask, mask_len, NULL, &s, false);
>> +        }
>> +        if (actions_len) {
>> +            ds_put_cstr(&s, "\nactions: ");
>> +            format_odp_actions(&s, actions, actions_len);
>> +        }
>> +        VLOG_DBG("\n%s", ds_cstr(&s));
>> +        ds_destroy(&s);
>> +}
>> +
>> +static bool
>> +try_send_to_netdev(struct dpif_netlink *dpif, struct dpif_op *op)
>> +{
>> +    switch (op->type) {
>> +    case DPIF_OP_FLOW_PUT: {
>> +        struct dpif_flow_put *put = &op->u.flow_put;
>> +
>> +        if (!put->ufid) {
>> +            return false;
>> +        }
>> +        dbg_print_flow(put->key, put->key_len, put->mask, put->mask_len,
>> +                       put->actions, put->actions_len, put->ufid, "PUT");
>> +        return parse_flow_put(dpif, put);
>> +    }
>> +    case DPIF_OP_FLOW_DEL:
>> +    case DPIF_OP_FLOW_GET:
>> +    case DPIF_OP_EXECUTE:
>> +    default:
>> +        break;
>> +    }
>> +    return false;
>> +}
>> +
>>   static void
>>   dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t n_ops)
>>   {
>>       struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>> +    struct dpif_op **new_ops;
>> +    int n_new_ops = 0;
>> +    int i = 0;
>> +
>> +    if (!netdev_flow_api_enabled) {
>> +        new_ops = ops;
>> +        n_new_ops = n_ops;
>> +    } else {
>> +        new_ops = xmalloc((sizeof *new_ops) * n_ops);
>> +        n_new_ops = 0;
>> +
>> +        for (i = 0; i < n_ops; i++) {
>> +            if (try_send_to_netdev(dpif, ops[i])) {
>> +                ops[i]->error = 0;
> What if the hardware returns EEXIST? Shouldn't we return EEXIST?
Right it should, we'll fix that.
>
> What if the hardware reaches some resource constraint? This isn't
> required for an initial implementation, but it may be nice to have
> some heuristic to try to cut down on the failed syscalls if userspace
> has become aware that the hardware is out of resources. (Getting good
> visibility on this would also matter if you tried to deploy this).
Right, do you mean that if certain kinds of flow fail (a specific mask 
type), don't try again (with the same mask)?
Is it done in kernel?
>> +            } else {
>> +                new_ops[n_new_ops++] = ops[i];
>> +            }
>> +        }
>> +        ops = new_ops;
>> +    }
>> +
>> +    while (n_new_ops > 0) {
>> +        size_t chunk = dpif_netlink_operate__(dpif, new_ops, n_new_ops);
>>
>> -    while (n_ops > 0) {
>> -        size_t chunk = dpif_netlink_operate__(dpif, ops, n_ops);
>> -        ops += chunk;
>> -        n_ops -= chunk;
>> +        new_ops += chunk;
>> +        n_new_ops -= chunk;
>> +    }
>> +
>> +    if (netdev_flow_api_enabled) {
>> +       free(ops);
>>       }
>>   }
>>
>> --
>> 1.8.3.1
>>
Paul Blakey Jan. 10, 2017, 3:50 p.m. UTC | #4
On 06/01/2017 01:25, Joe Stringer wrote:
> On 25 December 2016 at 03:39, Paul Blakey <paulb@mellanox.com> wrote:
>> Using the new netdev flow api operate will now try and
>> offload flows to the relevant netdev of the input port.
>> Other operate methods flows will come in later patches.
>>
>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>> Reviewed-by: Roi Dayan <roid@mellanox.com>
>> ---
>>   lib/dpif-netlink.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 228 insertions(+), 4 deletions(-)
>>
>> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
>> index 3d8940e..717af90 100644
>> --- a/lib/dpif-netlink.c
>> +++ b/lib/dpif-netlink.c
>> @@ -1908,15 +1908,239 @@ dpif_netlink_operate__(struct dpif_netlink *dpif,
>>       return n_ops;
>>   }
>>
>> +static int
>> +parse_key_and_mask_to_match(const struct nlattr *key, size_t key_len,
>> +                            const struct nlattr *mask, size_t mask_len,
>> +                            struct match *match)
>> +{
>> +    enum odp_key_fitness fitness;
>> +
>> +    fitness = odp_flow_key_to_flow(key, key_len, &match->flow);
>> +    if (fitness) {
>> +        /* This should not happen: it indicates that odp_flow_key_from_flow()
>> +         * and odp_flow_key_to_flow() disagree on the acceptable form of a
>> +         * flow.  Log the problem as an error, with enough details to enable
>> +         * debugging. */
>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>> +
>> +        if (!VLOG_DROP_ERR(&rl)) {
>> +            struct ds s;
>> +
>> +            ds_init(&s);
>> +            odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
>> +            VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
>> +            ds_destroy(&s);
>> +        }
>> +
>> +        return EINVAL;
>> +    }
>> +
>> +    fitness = odp_flow_key_to_mask(mask, mask_len, &match->wc, &match->flow);
>> +    if (fitness) {
>> +        /* This should not happen: it indicates that
>> +         * odp_flow_key_from_mask() and odp_flow_key_to_mask()
>> +         * disagree on the acceptable form of a mask.  Log the problem
>> +         * as an error, with enough details to enable debugging. */
>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>> +
>> +        if (!VLOG_DROP_ERR(&rl)) {
>> +            struct ds s;
>> +
>> +            VLOG_ERR("internal error parsing flow mask %s (%s)",
>> +                     ds_cstr(&s), odp_key_fitness_to_string(fitness));
>> +            ds_destroy(&s);
>> +        }
>> +
>> +        return EINVAL;
>> +    }
>> +
>> +    return 0;
>> +}
> Duplicated code. Please refactor and reuse. (Separate patch for
> refactoring to keep the patches clear).
>
>> +
>> +static bool
>> +parse_flow_put(struct dpif_netlink *dpif, struct dpif_flow_put *put)
>> +{
>> +    struct match match;
>> +    odp_port_t in_port;
>> +    const struct nlattr *nla;
>> +    size_t left;
>> +    int outputs = 0;
>> +    struct ofpbuf buf;
>> +    uint64_t act_stub[1024 / 8];
>> +    size_t offset;
>> +    struct nlattr *act;
>> +    struct netdev *dev;
>> +    int err;
>> +
>> +    /* 0x1234 - fake eth type sent to probe feature */
>> +    if (put->flags & DPIF_FP_PROBE || match.flow.dl_type == htons(0x1234)) {
>> +        return false;
>> +    }
> Shouldn't DPIF_FP_PROBE be sufficient?
I guess, I think we added it because we still got some probe features. I've
checked it now and I couldn't find any.
>> +
>> +    if (parse_key_and_mask_to_match(put->key, put->key_len, put->mask,
>> +                                put->mask_len, &match)) {
>> +        return false;
>> +    }
>> +
>> +    in_port = match.flow.in_port.odp_port;
>> +    ofpbuf_use_stub(&buf, act_stub, sizeof act_stub);
>> +    offset = nl_msg_start_nested(&buf, OVS_FLOW_ATTR_ACTIONS);
>> +    NL_ATTR_FOR_EACH(nla, left, put->actions, put->actions_len) {
>> +        if (nl_attr_type(nla) == OVS_ACTION_ATTR_OUTPUT) {
>> +            struct netdev *outdev;
>> +            int ifindex_out;
>> +            const struct netdev_tunnel_config *tnl_cfg;
>> +            size_t out_off;
>> +            odp_port_t out_port;
>> +
>> +            outputs++;
>> +            if (outputs > 1) {
>> +                break;
>> +            }
>> +
>> +            out_port = nl_attr_get_u32(nla);
>> +            outdev = netdev_hmap_port_get(out_port, dpif->dpif.dpif_class);
>> +            tnl_cfg = netdev_get_tunnel_config(outdev);
>> +
>> +            out_off = nl_msg_start_nested(&buf, OVS_ACTION_ATTR_OUTPUT);
>> +            ifindex_out = netdev_get_ifindex(outdev);
> Can this fail? (port not backed by device with ifindex?)
>
>> +            nl_msg_put_u32(&buf, OVS_ACTION_ATTR_OUTPUT, ifindex_out);
>> +            if (tnl_cfg && tnl_cfg->dst_port != 0) {
>> +                nl_msg_put_u32(&buf, OVS_TUNNEL_KEY_ATTR_TP_DST, tnl_cfg->dst_port);
>> +            }
>> +            nl_msg_end_nested(&buf, out_off);
>> +
>> +            if (outdev) {
>> +                netdev_close(outdev);
>> +            }
>> +        } else {
>> +            nl_msg_put_unspec(&buf, nl_attr_type(nla), nl_attr_get(nla),
>> +                              nl_attr_get_size(nla));
>> +        }
>> +    }
>> +    nl_msg_end_nested(&buf, offset);
> Hmm. I'm a bit uneasy about this whole copying/rewriting of the
> actions. Firstly, the tunnel stuff just looks wrong. A quick "git
> grep" shows that OVS_TUNNEL_KEY_ATTR_TP_DST is using the wrong type,
> and looking closer at the other places it is used, it looks like this
> function is shortcutting a whole bunch of the tunnel config.
Besides the wrong type, I'm not sure what you mean, dpif_netlink_port_add__
seems to access the tunnel dst port the same way.
>
> I recognise that the ODP port number needs to be translated into an
> ifindex. Do we need to do anything else?
tunnel dst port for set action was missing in ACTION_SET ,  TUNNEL pair.
We needed that for offloading.
> Also, these actions are only going to end up getting translated again,
> into the actual tc format so is there a way we can get rid of this
> extra piece?
>
> 'buf' is used with ofpbuf_use_stub(), and the comment says that the
> caller must call ofpbuf_uninit() on the buffer in case it was
> reallocated with malloc'd memory.
>
>> +
>> +    if (outputs > 1) {
>> +        return false;
>> +    }
>> +
>> +    act = ofpbuf_at_assert(&buf, offset, sizeof(struct nlattr));
>> +    dev = netdev_hmap_port_get(in_port, dpif->dpif.dpif_class);
>> +    err = netdev_flow_put(dev, &match, CONST_CAST(struct nlattr *,
>> +                                                  nl_attr_get(act)),
>> +                          nl_attr_get_size(act), put->stats,
>> +                          CONST_CAST(ovs_u128 *, put->ufid));
>> +    netdev_close(dev);
>> +
>> +    if (!err) {
>> +        if (put->flags & DPIF_FP_MODIFY) {
>> +            struct dpif_op *opp;
>> +            struct dpif_op op;
>> +
>> +            op.type = DPIF_OP_FLOW_DEL;
>> +            op.u.flow_del.key = put->key;
>> +            op.u.flow_del.key_len = put->key_len;
>> +            op.u.flow_del.ufid = put->ufid;
>> +            op.u.flow_del.pmd_id = put->pmd_id;
>> +            op.u.flow_del.stats = NULL;
>> +            op.u.flow_del.terse = false;
>> +
>> +            opp = &op;
>> +            dpif_netlink_operate__(dpif, &opp, 1);
> Won't this just delete the flow that was just added?
The above FLOW_DEL calls the original operate (without offload) to delete it
if it from netlink kernel datapath. This will happen if we got a modify 
request and we now
can offload the flow and previously couldn't (so its in kernel datapath).
> Is there a modify at the tc layer?
Yes there is, and implemented by given a already used handle.
So if we find the ufid in the map (ufid to handle/prio) already, we 
modify it by using
the same handle.

>> +        }
>> +        VLOG_DBG("added flow");
>> +        return true;
>> +    }
>> +    VLOG_DBG("failed adding flow: %s", ovs_strerror(err));
>> +
>> +    return false;
>> +}
>> +
>> +static void
>> +dbg_print_flow(const struct nlattr *key, size_t key_len,
>> +               const struct nlattr *mask, size_t mask_len,
>> +               const struct nlattr *actions, size_t actions_len,
>> +               const ovs_u128 *ufid,
>> +               const char *op)
>> +{
>> +        struct ds s;
>> +
>> +        ds_init(&s);
>> +        ds_put_cstr(&s, op);
>> +        ds_put_cstr(&s, " (");
>> +        odp_format_ufid(ufid, &s);
>> +        ds_put_cstr(&s, ")");
>> +        if (key_len) {
>> +            ds_put_cstr(&s, "\nflow (verbose): ");
>> +            odp_flow_format(key, key_len, mask, mask_len, NULL, &s, true);
>> +            ds_put_cstr(&s, "\nflow: ");
>> +            odp_flow_format(key, key_len, mask, mask_len, NULL, &s, false);
>> +        }
>> +        if (actions_len) {
>> +            ds_put_cstr(&s, "\nactions: ");
>> +            format_odp_actions(&s, actions, actions_len);
>> +        }
>> +        VLOG_DBG("\n%s", ds_cstr(&s));
>> +        ds_destroy(&s);
>> +}
>> +
>> +static bool
>> +try_send_to_netdev(struct dpif_netlink *dpif, struct dpif_op *op)
>> +{
>> +    switch (op->type) {
>> +    case DPIF_OP_FLOW_PUT: {
>> +        struct dpif_flow_put *put = &op->u.flow_put;
>> +
>> +        if (!put->ufid) {
>> +            return false;
>> +        }
>> +        dbg_print_flow(put->key, put->key_len, put->mask, put->mask_len,
>> +                       put->actions, put->actions_len, put->ufid, "PUT");
>> +        return parse_flow_put(dpif, put);
>> +    }
>> +    case DPIF_OP_FLOW_DEL:
>> +    case DPIF_OP_FLOW_GET:
>> +    case DPIF_OP_EXECUTE:
>> +    default:
> Do you never delete from hardware?
Yes its in later patch in the patch set. we split those
logically.

>
>> +        break;
>> +    }
>> +    return false;
>> +}
>> +
>>   static void
>>   dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t n_ops)
>>   {
>>       struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>> +    struct dpif_op **new_ops;
>> +    int n_new_ops = 0;
>> +    int i = 0;
>> +
>> +    if (!netdev_flow_api_enabled) {
> In general, it is easier to read if (foo) {...} else {...} rather than
> if(!foo) {...} else {...}.
>
>> +        new_ops = ops;
>> +        n_new_ops = n_ops;
>> +    } else {
>> +        new_ops = xmalloc((sizeof *new_ops) * n_ops);
>> +        n_new_ops = 0;
>> +
>> +        for (i = 0; i < n_ops; i++) {
>> +            if (try_send_to_netdev(dpif, ops[i])) {
>> +                ops[i]->error = 0;
>> +            } else {
>> +                new_ops[n_new_ops++] = ops[i];
>> +            }
>> +        }
>> +        ops = new_ops;
> Please don't overwrite function parameters.
>
>> +    }
>> +
>> +    while (n_new_ops > 0) {
>> +        size_t chunk = dpif_netlink_operate__(dpif, new_ops, n_new_ops);
>>
>> -    while (n_ops > 0) {
>> -        size_t chunk = dpif_netlink_operate__(dpif, ops, n_ops);
>> -        ops += chunk;
>> -        n_ops -= chunk;
>> +        new_ops += chunk;
>> +        n_new_ops -= chunk;
>> +    }
>> +
>> +    if (netdev_flow_api_enabled) {
>> +       free(ops);
>>       }
>>   }
> I think you can get rid of the malloc/free by using a style closer to
> the one at the beginning of push_dp_ops() in ofproto-dpif-upcall.c.
Joe Stringer Jan. 10, 2017, 9:58 p.m. UTC | #5
On 10 January 2017 at 06:36, Paul Blakey <paulb@mellanox.com> wrote:
>
>
> On 06/01/2017 01:28, Joe Stringer wrote:
>>
>> On 25 December 2016 at 03:39, Paul Blakey <paulb@mellanox.com> wrote:
>>>
>>> Using the new netdev flow api operate will now try and
>>> offload flows to the relevant netdev of the input port.
>>> Other operate methods flows will come in later patches.
>>>
>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>> Reviewed-by: Roi Dayan <roid@mellanox.com>
>>> ---
>>>   lib/dpif-netlink.c | 232
>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>   1 file changed, 228 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
>>> index 3d8940e..717af90 100644
>>> --- a/lib/dpif-netlink.c
>>> +++ b/lib/dpif-netlink.c
>>> @@ -1908,15 +1908,239 @@ dpif_netlink_operate__(struct dpif_netlink
>>> *dpif,
>>>       return n_ops;
>>>   }
>>>
>>> +static int
>>> +parse_key_and_mask_to_match(const struct nlattr *key, size_t key_len,
>>> +                            const struct nlattr *mask, size_t mask_len,
>>> +                            struct match *match)
>>> +{
>>> +    enum odp_key_fitness fitness;
>>> +
>>> +    fitness = odp_flow_key_to_flow(key, key_len, &match->flow);
>>> +    if (fitness) {
>>> +        /* This should not happen: it indicates that
>>> odp_flow_key_from_flow()
>>> +         * and odp_flow_key_to_flow() disagree on the acceptable form of
>>> a
>>> +         * flow.  Log the problem as an error, with enough details to
>>> enable
>>> +         * debugging. */
>>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>>> +
>>> +        if (!VLOG_DROP_ERR(&rl)) {
>>> +            struct ds s;
>>> +
>>> +            ds_init(&s);
>>> +            odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
>>> +            VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
>>> +            ds_destroy(&s);
>>> +        }
>>> +
>>> +        return EINVAL;
>>> +    }
>>> +
>>> +    fitness = odp_flow_key_to_mask(mask, mask_len, &match->wc,
>>> &match->flow);
>>> +    if (fitness) {
>>> +        /* This should not happen: it indicates that
>>> +         * odp_flow_key_from_mask() and odp_flow_key_to_mask()
>>> +         * disagree on the acceptable form of a mask.  Log the problem
>>> +         * as an error, with enough details to enable debugging. */
>>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>>> +
>>> +        if (!VLOG_DROP_ERR(&rl)) {
>>> +            struct ds s;
>>> +
>>> +            VLOG_ERR("internal error parsing flow mask %s (%s)",
>>> +                     ds_cstr(&s), odp_key_fitness_to_string(fitness));
>>> +            ds_destroy(&s);
>>> +        }
>>> +
>>> +        return EINVAL;
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static bool
>>> +parse_flow_put(struct dpif_netlink *dpif, struct dpif_flow_put *put)
>>> +{
>>> +    struct match match;
>>> +    odp_port_t in_port;
>>> +    const struct nlattr *nla;
>>> +    size_t left;
>>> +    int outputs = 0;
>>> +    struct ofpbuf buf;
>>> +    uint64_t act_stub[1024 / 8];
>>> +    size_t offset;
>>> +    struct nlattr *act;
>>> +    struct netdev *dev;
>>> +    int err;
>>> +
>>> +    /* 0x1234 - fake eth type sent to probe feature */
>>> +    if (put->flags & DPIF_FP_PROBE || match.flow.dl_type ==
>>> htons(0x1234)) {
>>> +        return false;
>>> +    }
>>> +
>>> +    if (parse_key_and_mask_to_match(put->key, put->key_len, put->mask,
>>> +                                put->mask_len, &match)) {
>>> +        return false;
>>> +    }
>>> +
>>> +    in_port = match.flow.in_port.odp_port;
>>> +    ofpbuf_use_stub(&buf, act_stub, sizeof act_stub);
>>> +    offset = nl_msg_start_nested(&buf, OVS_FLOW_ATTR_ACTIONS);
>>> +    NL_ATTR_FOR_EACH(nla, left, put->actions, put->actions_len) {
>>> +        if (nl_attr_type(nla) == OVS_ACTION_ATTR_OUTPUT) {
>>> +            struct netdev *outdev;
>>> +            int ifindex_out;
>>> +            const struct netdev_tunnel_config *tnl_cfg;
>>> +            size_t out_off;
>>> +            odp_port_t out_port;
>>> +
>>> +            outputs++;
>>> +            if (outputs > 1) {
>>> +                break;
>>> +            }
>>> +
>>> +            out_port = nl_attr_get_u32(nla);
>>> +            outdev = netdev_hmap_port_get(out_port,
>>> dpif->dpif.dpif_class);
>>> +            tnl_cfg = netdev_get_tunnel_config(outdev);
>>> +
>>> +            out_off = nl_msg_start_nested(&buf, OVS_ACTION_ATTR_OUTPUT);
>>> +            ifindex_out = netdev_get_ifindex(outdev);
>>> +            nl_msg_put_u32(&buf, OVS_ACTION_ATTR_OUTPUT, ifindex_out);
>>> +            if (tnl_cfg && tnl_cfg->dst_port != 0) {
>>> +                nl_msg_put_u32(&buf, OVS_TUNNEL_KEY_ATTR_TP_DST,
>>> tnl_cfg->dst_port);
>>> +            }
>>> +            nl_msg_end_nested(&buf, out_off);
>>> +
>>> +            if (outdev) {
>>> +                netdev_close(outdev);
>>> +            }
>>> +        } else {
>>> +            nl_msg_put_unspec(&buf, nl_attr_type(nla), nl_attr_get(nla),
>>> +                              nl_attr_get_size(nla));
>>> +        }
>>> +    }
>>> +    nl_msg_end_nested(&buf, offset);
>>> +
>>> +    if (outputs > 1) {
>>> +        return false;
>>> +    }
>>> +
>>> +    act = ofpbuf_at_assert(&buf, offset, sizeof(struct nlattr));
>>> +    dev = netdev_hmap_port_get(in_port, dpif->dpif.dpif_class);
>>> +    err = netdev_flow_put(dev, &match, CONST_CAST(struct nlattr *,
>>> +                                                  nl_attr_get(act)),
>>> +                          nl_attr_get_size(act), put->stats,
>>> +                          CONST_CAST(ovs_u128 *, put->ufid));
>>> +    netdev_close(dev);
>>> +
>>> +    if (!err) {
>>> +        if (put->flags & DPIF_FP_MODIFY) {
>>> +            struct dpif_op *opp;
>>> +            struct dpif_op op;
>>> +
>>> +            op.type = DPIF_OP_FLOW_DEL;
>>> +            op.u.flow_del.key = put->key;
>>> +            op.u.flow_del.key_len = put->key_len;
>>> +            op.u.flow_del.ufid = put->ufid;
>>> +            op.u.flow_del.pmd_id = put->pmd_id;
>>> +            op.u.flow_del.stats = NULL;
>>> +            op.u.flow_del.terse = false;
>>> +
>>> +            opp = &op;
>>> +            dpif_netlink_operate__(dpif, &opp, 1);
>>> +        }
>>> +        VLOG_DBG("added flow");
>>> +        return true;
>>> +    }
>>> +    VLOG_DBG("failed adding flow: %s", ovs_strerror(err));
>>> +
>>> +    return false;
>>> +}
>>> +
>>> +static void
>>> +dbg_print_flow(const struct nlattr *key, size_t key_len,
>>> +               const struct nlattr *mask, size_t mask_len,
>>> +               const struct nlattr *actions, size_t actions_len,
>>> +               const ovs_u128 *ufid,
>>> +               const char *op)
>>> +{
>>> +        struct ds s;
>>> +
>>> +        ds_init(&s);
>>> +        ds_put_cstr(&s, op);
>>> +        ds_put_cstr(&s, " (");
>>> +        odp_format_ufid(ufid, &s);
>>> +        ds_put_cstr(&s, ")");
>>> +        if (key_len) {
>>> +            ds_put_cstr(&s, "\nflow (verbose): ");
>>> +            odp_flow_format(key, key_len, mask, mask_len, NULL, &s,
>>> true);
>>> +            ds_put_cstr(&s, "\nflow: ");
>>> +            odp_flow_format(key, key_len, mask, mask_len, NULL, &s,
>>> false);
>>> +        }
>>> +        if (actions_len) {
>>> +            ds_put_cstr(&s, "\nactions: ");
>>> +            format_odp_actions(&s, actions, actions_len);
>>> +        }
>>> +        VLOG_DBG("\n%s", ds_cstr(&s));
>>> +        ds_destroy(&s);
>>> +}
>>> +
>>> +static bool
>>> +try_send_to_netdev(struct dpif_netlink *dpif, struct dpif_op *op)
>>> +{
>>> +    switch (op->type) {
>>> +    case DPIF_OP_FLOW_PUT: {
>>> +        struct dpif_flow_put *put = &op->u.flow_put;
>>> +
>>> +        if (!put->ufid) {
>>> +            return false;
>>> +        }
>>> +        dbg_print_flow(put->key, put->key_len, put->mask, put->mask_len,
>>> +                       put->actions, put->actions_len, put->ufid,
>>> "PUT");
>>> +        return parse_flow_put(dpif, put);
>>> +    }
>>> +    case DPIF_OP_FLOW_DEL:
>>> +    case DPIF_OP_FLOW_GET:
>>> +    case DPIF_OP_EXECUTE:
>>> +    default:
>>> +        break;
>>> +    }
>>> +    return false;
>>> +}
>>> +
>>>   static void
>>>   dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t
>>> n_ops)
>>>   {
>>>       struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>>> +    struct dpif_op **new_ops;
>>> +    int n_new_ops = 0;
>>> +    int i = 0;
>>> +
>>> +    if (!netdev_flow_api_enabled) {
>>> +        new_ops = ops;
>>> +        n_new_ops = n_ops;
>>> +    } else {
>>> +        new_ops = xmalloc((sizeof *new_ops) * n_ops);
>>> +        n_new_ops = 0;
>>> +
>>> +        for (i = 0; i < n_ops; i++) {
>>> +            if (try_send_to_netdev(dpif, ops[i])) {
>>> +                ops[i]->error = 0;
>>
>> What if the hardware returns EEXIST? Shouldn't we return EEXIST?
>
> Right it should, we'll fix that.
>>
>>
>> What if the hardware reaches some resource constraint? This isn't
>> required for an initial implementation, but it may be nice to have
>> some heuristic to try to cut down on the failed syscalls if userspace
>> has become aware that the hardware is out of resources. (Getting good
>> visibility on this would also matter if you tried to deploy this).
>
> Right, do you mean that if certain kinds of flow fail (a specific mask
> type), don't try again (with the same mask)?
> Is it done in kernel?

There's a couple of things: On the side of resource constraints, there
is currently a ceiling of about 200,000 flows, above which userspace
will not attempt to install a new flow. This logic is in
ofproto-dpif-upcall. Depending on how complex your constraints are,
maybe there is a way to model the hardware resource in the userspace
so that once the limits are hit, userspace minimizes the number of
failed syscalls due to hardware constraints. Simplest model would be
something like, when TC starts returning error codes for ENOSPC or
whatever the "out of hardware resource" error codes are, then you take
the current number of hardware flows and choose that as the maximum
number of hardware flows. Future flow installs to hardware will fail
out early based on this "n_flows > max_flows" logic. Obviously
depending on how constrained your hardware is, and what the
constraints look like, this may be useful or useless. If hardware
supports 2K flows, then you're more likely to get benefit out of being
aware of this than if the hardware allows 200K arbitrary flows. Also I
recognise that hardware may arrange flows differently so two flows may
consume different amounts of hardware resources.

The second part is if it's certain kinds of flow. The "probe"s in
ofproto-dpif allow datapath feature detection at runtime, which can
then be used to change the behaviour. For instance, if there is no
support in datapath for a particular action, then the OpenFlow layer
will return errors when a controller attempts to use that action (as
we can't satisfy the flow mod request). For TC, it may be more like,
during datapath initialization we detect the supported features so
that flows may be checked against this feature support before going
down to the kernel to install the flow (which would fail if, for
instance, you tried to use one of the newer fields against an older
kernel that has only the initial flower support).

One thing I'm considering to avoid is where ovs-vswitchd logs end up
getting flooded with all of these "hardware flow failed to be
installed" errors, and you spend extra CPU attempting things that we
can easily detect and avoid. Something to consider.

While I remember, please check all the VLOG calls and use ratelimiters
for them. (I think I provided this feedback somewhere, but I put it
here as well just in case I forgot to).
Joe Stringer Jan. 10, 2017, 10:34 p.m. UTC | #6
On 10 January 2017 at 07:50, Paul Blakey <paulb@mellanox.com> wrote:
>>> +            nl_msg_put_u32(&buf, OVS_ACTION_ATTR_OUTPUT, ifindex_out);
>>> +            if (tnl_cfg && tnl_cfg->dst_port != 0) {
>>> +                nl_msg_put_u32(&buf, OVS_TUNNEL_KEY_ATTR_TP_DST,
>>> tnl_cfg->dst_port);
>>> +            }
>>> +            nl_msg_end_nested(&buf, out_off);
>>> +
>>> +            if (outdev) {
>>> +                netdev_close(outdev);
>>> +            }
>>> +        } else {
>>> +            nl_msg_put_unspec(&buf, nl_attr_type(nla), nl_attr_get(nla),
>>> +                              nl_attr_get_size(nla));
>>> +        }
>>> +    }
>>> +    nl_msg_end_nested(&buf, offset);
>>
>> Hmm. I'm a bit uneasy about this whole copying/rewriting of the
>> actions. Firstly, the tunnel stuff just looks wrong. A quick "git
>> grep" shows that OVS_TUNNEL_KEY_ATTR_TP_DST is using the wrong type,
>> and looking closer at the other places it is used, it looks like this
>> function is shortcutting a whole bunch of the tunnel config.
>
> Besides the wrong type, I'm not sure what you mean, dpif_netlink_port_add__
> seems to access the tunnel dst port the same way.

Yes, data type, bitwise differences compared to
dpif_netlink_port_add__(). Checking the ifindex.

Does tnl_cfg->exts need to be checked too? Are any of the other
'struct netdev_tunnel_config' fields relevant for output (or will
ignoring them potentially lead to a correctness issue)?

>>
>>
>> I recognise that the ODP port number needs to be translated into an
>> ifindex. Do we need to do anything else?
>
> tunnel dst port for set action was missing in ACTION_SET ,  TUNNEL pair.
> We needed that for offloading.

Broadly I was thinking that any and all fields that may be relevant to
tunnel outputting should be handled and checked - fail out if it can't
be interpreted or implemented on lower layers. Looking at 'enum
ovs_tunnel_key_attr', there's a whole bunch of fields which are
similar to OVS_TUNNEL_KEY_ATTR_TP_DST and it wasn't immediately
obvious to me why this function only serialises this one member of the
enum and doesn't need to handle any of the rest.

If I follow correctly, the answer is that for traditional OVS vports,
we configure the vport in kernel to include TP_DST (and a few other
things), then omit these from the flow. When the datapath performs
output action, it will fetch this information from the vport in
kernel. However, for the TC hardware offloads case there is no such
port configuration in kernel, so now we need to ensure that it is
encoded in the flow. Is that right?

>>> +
>>> +    if (outputs > 1) {
>>> +        return false;
>>> +    }
>>> +
>>> +    act = ofpbuf_at_assert(&buf, offset, sizeof(struct nlattr));
>>> +    dev = netdev_hmap_port_get(in_port, dpif->dpif.dpif_class);
>>> +    err = netdev_flow_put(dev, &match, CONST_CAST(struct nlattr *,
>>> +                                                  nl_attr_get(act)),
>>> +                          nl_attr_get_size(act), put->stats,
>>> +                          CONST_CAST(ovs_u128 *, put->ufid));
>>> +    netdev_close(dev);
>>> +
>>> +    if (!err) {
>>> +        if (put->flags & DPIF_FP_MODIFY) {
>>> +            struct dpif_op *opp;
>>> +            struct dpif_op op;
>>> +
>>> +            op.type = DPIF_OP_FLOW_DEL;
>>> +            op.u.flow_del.key = put->key;
>>> +            op.u.flow_del.key_len = put->key_len;
>>> +            op.u.flow_del.ufid = put->ufid;
>>> +            op.u.flow_del.pmd_id = put->pmd_id;
>>> +            op.u.flow_del.stats = NULL;
>>> +            op.u.flow_del.terse = false;
>>> +
>>> +            opp = &op;
>>> +            dpif_netlink_operate__(dpif, &opp, 1);
>>
>> Won't this just delete the flow that was just added?
>
> The above FLOW_DEL calls the original operate (without offload) to delete it
> if it from netlink kernel datapath. This will happen if we got a modify
> request and we now
> can offload the flow and previously couldn't (so its in kernel datapath).

Ah, I missed that it was the "dpif_netlink_operate__" variation. OK.

>>
>> Is there a modify at the tc layer?
>
> Yes there is, and implemented by given a already used handle.
> So if we find the ufid in the map (ufid to handle/prio) already, we modify
> it by using
> the same handle.

OK, this seems fine at a glance.
Paul Blakey Jan. 11, 2017, 9:40 a.m. UTC | #7
On 11/01/2017 00:34, Joe Stringer wrote:
> On 10 January 2017 at 07:50, Paul Blakey <paulb@mellanox.com> wrote:
>>>> +            nl_msg_put_u32(&buf, OVS_ACTION_ATTR_OUTPUT, ifindex_out);
>>>> +            if (tnl_cfg && tnl_cfg->dst_port != 0) {
>>>> +                nl_msg_put_u32(&buf, OVS_TUNNEL_KEY_ATTR_TP_DST,
>>>> tnl_cfg->dst_port);
>>>> +            }
>>>> +            nl_msg_end_nested(&buf, out_off);
>>>> +
>>>> +            if (outdev) {
>>>> +                netdev_close(outdev);
>>>> +            }
>>>> +        } else {
>>>> +            nl_msg_put_unspec(&buf, nl_attr_type(nla), nl_attr_get(nla),
>>>> +                              nl_attr_get_size(nla));
>>>> +        }
>>>> +    }
>>>> +    nl_msg_end_nested(&buf, offset);
>>>
>>> Hmm. I'm a bit uneasy about this whole copying/rewriting of the
>>> actions. Firstly, the tunnel stuff just looks wrong. A quick "git
>>> grep" shows that OVS_TUNNEL_KEY_ATTR_TP_DST is using the wrong type,
>>> and looking closer at the other places it is used, it looks like this
>>> function is shortcutting a whole bunch of the tunnel config.
>>
>> Besides the wrong type, I'm not sure what you mean, dpif_netlink_port_add__
>> seems to access the tunnel dst port the same way.
>
> Yes, data type, bitwise differences compared to
> dpif_netlink_port_add__(). Checking the ifindex.
>
> Does tnl_cfg->exts need to be checked too? Are any of the other
> 'struct netdev_tunnel_config' fields relevant for output (or will
> ignoring them potentially lead to a correctness issue)?
>
>>>
>>>
>>> I recognise that the ODP port number needs to be translated into an
>>> ifindex. Do we need to do anything else?
>>
>> tunnel dst port for set action was missing in ACTION_SET ,  TUNNEL pair.
>> We needed that for offloading.
>
> Broadly I was thinking that any and all fields that may be relevant to
> tunnel outputting should be handled and checked - fail out if it can't
> be interpreted or implemented on lower layers. Looking at 'enum
> ovs_tunnel_key_attr', there's a whole bunch of fields which are
> similar to OVS_TUNNEL_KEY_ATTR_TP_DST and it wasn't immediately
> obvious to me why this function only serialises this one member of the
> enum and doesn't need to handle any of the rest.
>
> If I follow correctly, the answer is that for traditional OVS vports,
> we configure the vport in kernel to include TP_DST (and a few other
> things), then omit these from the flow. When the datapath performs
> output action, it will fetch this information from the vport in
> kernel. However, for the TC hardware offloads case there is no such
> port configuration in kernel, so now we need to ensure that it is
> encoded in the flow. Is that right?
>

Right (though not sure whats the reason).

>>>> +
>>>> +    if (outputs > 1) {
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    act = ofpbuf_at_assert(&buf, offset, sizeof(struct nlattr));
>>>> +    dev = netdev_hmap_port_get(in_port, dpif->dpif.dpif_class);
>>>> +    err = netdev_flow_put(dev, &match, CONST_CAST(struct nlattr *,
>>>> +                                                  nl_attr_get(act)),
>>>> +                          nl_attr_get_size(act), put->stats,
>>>> +                          CONST_CAST(ovs_u128 *, put->ufid));
>>>> +    netdev_close(dev);
>>>> +
>>>> +    if (!err) {
>>>> +        if (put->flags & DPIF_FP_MODIFY) {
>>>> +            struct dpif_op *opp;
>>>> +            struct dpif_op op;
>>>> +
>>>> +            op.type = DPIF_OP_FLOW_DEL;
>>>> +            op.u.flow_del.key = put->key;
>>>> +            op.u.flow_del.key_len = put->key_len;
>>>> +            op.u.flow_del.ufid = put->ufid;
>>>> +            op.u.flow_del.pmd_id = put->pmd_id;
>>>> +            op.u.flow_del.stats = NULL;
>>>> +            op.u.flow_del.terse = false;
>>>> +
>>>> +            opp = &op;
>>>> +            dpif_netlink_operate__(dpif, &opp, 1);
>>>
>>> Won't this just delete the flow that was just added?
>>
>> The above FLOW_DEL calls the original operate (without offload) to delete it
>> if it from netlink kernel datapath. This will happen if we got a modify
>> request and we now
>> can offload the flow and previously couldn't (so its in kernel datapath).
>
> Ah, I missed that it was the "dpif_netlink_operate__" variation. OK.
>
>>>
>>> Is there a modify at the tc layer?
>>
>> Yes there is, and implemented by given a already used handle.
>> So if we find the ufid in the map (ufid to handle/prio) already, we modify
>> it by using
>> the same handle.
>
> OK, this seems fine at a glance.
>
Paul Blakey Jan. 11, 2017, 10:20 a.m. UTC | #8
On 10/01/2017 23:58, Joe Stringer wrote:
> On 10 January 2017 at 06:36, Paul Blakey <paulb@mellanox.com> wrote:
>>
>>
>> On 06/01/2017 01:28, Joe Stringer wrote:
>>>
>>> On 25 December 2016 at 03:39, Paul Blakey <paulb@mellanox.com> wrote:
>>>>
>>>> Using the new netdev flow api operate will now try and
>>>> offload flows to the relevant netdev of the input port.
>>>> Other operate methods flows will come in later patches.
>>>>
>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>>> Reviewed-by: Roi Dayan <roid@mellanox.com>
>>>> ---
>>>>   lib/dpif-netlink.c | 232
>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>>   1 file changed, 228 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
>>>> index 3d8940e..717af90 100644
>>>> --- a/lib/dpif-netlink.c
>>>> +++ b/lib/dpif-netlink.c
>>>> @@ -1908,15 +1908,239 @@ dpif_netlink_operate__(struct dpif_netlink
>>>> *dpif,
>>>>       return n_ops;
>>>>   }
>>>>
>>>> +static int
>>>> +parse_key_and_mask_to_match(const struct nlattr *key, size_t key_len,
>>>> +                            const struct nlattr *mask, size_t mask_len,
>>>> +                            struct match *match)
>>>> +{
>>>> +    enum odp_key_fitness fitness;
>>>> +
>>>> +    fitness = odp_flow_key_to_flow(key, key_len, &match->flow);
>>>> +    if (fitness) {
>>>> +        /* This should not happen: it indicates that
>>>> odp_flow_key_from_flow()
>>>> +         * and odp_flow_key_to_flow() disagree on the acceptable form of
>>>> a
>>>> +         * flow.  Log the problem as an error, with enough details to
>>>> enable
>>>> +         * debugging. */
>>>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>>>> +
>>>> +        if (!VLOG_DROP_ERR(&rl)) {
>>>> +            struct ds s;
>>>> +
>>>> +            ds_init(&s);
>>>> +            odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
>>>> +            VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
>>>> +            ds_destroy(&s);
>>>> +        }
>>>> +
>>>> +        return EINVAL;
>>>> +    }
>>>> +
>>>> +    fitness = odp_flow_key_to_mask(mask, mask_len, &match->wc,
>>>> &match->flow);
>>>> +    if (fitness) {
>>>> +        /* This should not happen: it indicates that
>>>> +         * odp_flow_key_from_mask() and odp_flow_key_to_mask()
>>>> +         * disagree on the acceptable form of a mask.  Log the problem
>>>> +         * as an error, with enough details to enable debugging. */
>>>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>>>> +
>>>> +        if (!VLOG_DROP_ERR(&rl)) {
>>>> +            struct ds s;
>>>> +
>>>> +            VLOG_ERR("internal error parsing flow mask %s (%s)",
>>>> +                     ds_cstr(&s), odp_key_fitness_to_string(fitness));
>>>> +            ds_destroy(&s);
>>>> +        }
>>>> +
>>>> +        return EINVAL;
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static bool
>>>> +parse_flow_put(struct dpif_netlink *dpif, struct dpif_flow_put *put)
>>>> +{
>>>> +    struct match match;
>>>> +    odp_port_t in_port;
>>>> +    const struct nlattr *nla;
>>>> +    size_t left;
>>>> +    int outputs = 0;
>>>> +    struct ofpbuf buf;
>>>> +    uint64_t act_stub[1024 / 8];
>>>> +    size_t offset;
>>>> +    struct nlattr *act;
>>>> +    struct netdev *dev;
>>>> +    int err;
>>>> +
>>>> +    /* 0x1234 - fake eth type sent to probe feature */
>>>> +    if (put->flags & DPIF_FP_PROBE || match.flow.dl_type ==
>>>> htons(0x1234)) {
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    if (parse_key_and_mask_to_match(put->key, put->key_len, put->mask,
>>>> +                                put->mask_len, &match)) {
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    in_port = match.flow.in_port.odp_port;
>>>> +    ofpbuf_use_stub(&buf, act_stub, sizeof act_stub);
>>>> +    offset = nl_msg_start_nested(&buf, OVS_FLOW_ATTR_ACTIONS);
>>>> +    NL_ATTR_FOR_EACH(nla, left, put->actions, put->actions_len) {
>>>> +        if (nl_attr_type(nla) == OVS_ACTION_ATTR_OUTPUT) {
>>>> +            struct netdev *outdev;
>>>> +            int ifindex_out;
>>>> +            const struct netdev_tunnel_config *tnl_cfg;
>>>> +            size_t out_off;
>>>> +            odp_port_t out_port;
>>>> +
>>>> +            outputs++;
>>>> +            if (outputs > 1) {
>>>> +                break;
>>>> +            }
>>>> +
>>>> +            out_port = nl_attr_get_u32(nla);
>>>> +            outdev = netdev_hmap_port_get(out_port,
>>>> dpif->dpif.dpif_class);
>>>> +            tnl_cfg = netdev_get_tunnel_config(outdev);
>>>> +
>>>> +            out_off = nl_msg_start_nested(&buf, OVS_ACTION_ATTR_OUTPUT);
>>>> +            ifindex_out = netdev_get_ifindex(outdev);
>>>> +            nl_msg_put_u32(&buf, OVS_ACTION_ATTR_OUTPUT, ifindex_out);
>>>> +            if (tnl_cfg && tnl_cfg->dst_port != 0) {
>>>> +                nl_msg_put_u32(&buf, OVS_TUNNEL_KEY_ATTR_TP_DST,
>>>> tnl_cfg->dst_port);
>>>> +            }
>>>> +            nl_msg_end_nested(&buf, out_off);
>>>> +
>>>> +            if (outdev) {
>>>> +                netdev_close(outdev);
>>>> +            }
>>>> +        } else {
>>>> +            nl_msg_put_unspec(&buf, nl_attr_type(nla), nl_attr_get(nla),
>>>> +                              nl_attr_get_size(nla));
>>>> +        }
>>>> +    }
>>>> +    nl_msg_end_nested(&buf, offset);
>>>> +
>>>> +    if (outputs > 1) {
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    act = ofpbuf_at_assert(&buf, offset, sizeof(struct nlattr));
>>>> +    dev = netdev_hmap_port_get(in_port, dpif->dpif.dpif_class);
>>>> +    err = netdev_flow_put(dev, &match, CONST_CAST(struct nlattr *,
>>>> +                                                  nl_attr_get(act)),
>>>> +                          nl_attr_get_size(act), put->stats,
>>>> +                          CONST_CAST(ovs_u128 *, put->ufid));
>>>> +    netdev_close(dev);
>>>> +
>>>> +    if (!err) {
>>>> +        if (put->flags & DPIF_FP_MODIFY) {
>>>> +            struct dpif_op *opp;
>>>> +            struct dpif_op op;
>>>> +
>>>> +            op.type = DPIF_OP_FLOW_DEL;
>>>> +            op.u.flow_del.key = put->key;
>>>> +            op.u.flow_del.key_len = put->key_len;
>>>> +            op.u.flow_del.ufid = put->ufid;
>>>> +            op.u.flow_del.pmd_id = put->pmd_id;
>>>> +            op.u.flow_del.stats = NULL;
>>>> +            op.u.flow_del.terse = false;
>>>> +
>>>> +            opp = &op;
>>>> +            dpif_netlink_operate__(dpif, &opp, 1);
>>>> +        }
>>>> +        VLOG_DBG("added flow");
>>>> +        return true;
>>>> +    }
>>>> +    VLOG_DBG("failed adding flow: %s", ovs_strerror(err));
>>>> +
>>>> +    return false;
>>>> +}
>>>> +
>>>> +static void
>>>> +dbg_print_flow(const struct nlattr *key, size_t key_len,
>>>> +               const struct nlattr *mask, size_t mask_len,
>>>> +               const struct nlattr *actions, size_t actions_len,
>>>> +               const ovs_u128 *ufid,
>>>> +               const char *op)
>>>> +{
>>>> +        struct ds s;
>>>> +
>>>> +        ds_init(&s);
>>>> +        ds_put_cstr(&s, op);
>>>> +        ds_put_cstr(&s, " (");
>>>> +        odp_format_ufid(ufid, &s);
>>>> +        ds_put_cstr(&s, ")");
>>>> +        if (key_len) {
>>>> +            ds_put_cstr(&s, "\nflow (verbose): ");
>>>> +            odp_flow_format(key, key_len, mask, mask_len, NULL, &s,
>>>> true);
>>>> +            ds_put_cstr(&s, "\nflow: ");
>>>> +            odp_flow_format(key, key_len, mask, mask_len, NULL, &s,
>>>> false);
>>>> +        }
>>>> +        if (actions_len) {
>>>> +            ds_put_cstr(&s, "\nactions: ");
>>>> +            format_odp_actions(&s, actions, actions_len);
>>>> +        }
>>>> +        VLOG_DBG("\n%s", ds_cstr(&s));
>>>> +        ds_destroy(&s);
>>>> +}
>>>> +
>>>> +static bool
>>>> +try_send_to_netdev(struct dpif_netlink *dpif, struct dpif_op *op)
>>>> +{
>>>> +    switch (op->type) {
>>>> +    case DPIF_OP_FLOW_PUT: {
>>>> +        struct dpif_flow_put *put = &op->u.flow_put;
>>>> +
>>>> +        if (!put->ufid) {
>>>> +            return false;
>>>> +        }
>>>> +        dbg_print_flow(put->key, put->key_len, put->mask, put->mask_len,
>>>> +                       put->actions, put->actions_len, put->ufid,
>>>> "PUT");
>>>> +        return parse_flow_put(dpif, put);
>>>> +    }
>>>> +    case DPIF_OP_FLOW_DEL:
>>>> +    case DPIF_OP_FLOW_GET:
>>>> +    case DPIF_OP_EXECUTE:
>>>> +    default:
>>>> +        break;
>>>> +    }
>>>> +    return false;
>>>> +}
>>>> +
>>>>   static void
>>>>   dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t
>>>> n_ops)
>>>>   {
>>>>       struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>>>> +    struct dpif_op **new_ops;
>>>> +    int n_new_ops = 0;
>>>> +    int i = 0;
>>>> +
>>>> +    if (!netdev_flow_api_enabled) {
>>>> +        new_ops = ops;
>>>> +        n_new_ops = n_ops;
>>>> +    } else {
>>>> +        new_ops = xmalloc((sizeof *new_ops) * n_ops);
>>>> +        n_new_ops = 0;
>>>> +
>>>> +        for (i = 0; i < n_ops; i++) {
>>>> +            if (try_send_to_netdev(dpif, ops[i])) {
>>>> +                ops[i]->error = 0;
>>>
>>> What if the hardware returns EEXIST? Shouldn't we return EEXIST?
>>
>> Right it should, we'll fix that.
>>>
>>>
>>> What if the hardware reaches some resource constraint? This isn't
>>> required for an initial implementation, but it may be nice to have
>>> some heuristic to try to cut down on the failed syscalls if userspace
>>> has become aware that the hardware is out of resources. (Getting good
>>> visibility on this would also matter if you tried to deploy this).
>>
>> Right, do you mean that if certain kinds of flow fail (a specific mask
>> type), don't try again (with the same mask)?
>> Is it done in kernel?
>
> There's a couple of things: On the side of resource constraints, there
> is currently a ceiling of about 200,000 flows, above which userspace
> will not attempt to install a new flow. This logic is in
> ofproto-dpif-upcall. Depending on how complex your constraints are,
> maybe there is a way to model the hardware resource in the userspace
> so that once the limits are hit, userspace minimizes the number of
> failed syscalls due to hardware constraints. Simplest model would be
> something like, when TC starts returning error codes for ENOSPC or
> whatever the "out of hardware resource" error codes are, then you take
> the current number of hardware flows and choose that as the maximum
> number of hardware flows. Future flow installs to hardware will fail
> out early based on this "n_flows > max_flows" logic. Obviously
> depending on how constrained your hardware is, and what the
> constraints look like, this may be useful or useless. If hardware
> supports 2K flows, then you're more likely to get benefit out of being
> aware of this than if the hardware allows 200K arbitrary flows. Also I
> recognise that hardware may arrange flows differently so two flows may
> consume different amounts of hardware resources.
>
> The second part is if it's certain kinds of flow. The "probe"s in
> ofproto-dpif allow datapath feature detection at runtime, which can
> then be used to change the behaviour. For instance, if there is no
> support in datapath for a particular action, then the OpenFlow layer
> will return errors when a controller attempts to use that action (as
> we can't satisfy the flow mod request). For TC, it may be more like,
> during datapath initialization we detect the supported features so
> that flows may be checked against this feature support before going
> down to the kernel to install the flow (which would fail if, for
> instance, you tried to use one of the newer fields against an older
> kernel that has only the initial flower support).
>

Thanks for the suggestions. We need to look into that but I think
we can postpone this for the current patch set, and do these 
optimizations later.


> One thing I'm considering to avoid is where ovs-vswitchd logs end up
> getting flooded with all of these "hardware flow failed to be
> installed" errors, and you spend extra CPU attempting things that we
> can easily detect and avoid. Something to consider.
>
> While I remember, please check all the VLOG calls and use ratelimiters
> for them. (I think I provided this feedback somewhere, but I put it
> here as well just in case I forgot to).
>
Joe Stringer Jan. 11, 2017, 10:37 p.m. UTC | #9
On 11 January 2017 at 02:20, Paul Blakey <paulb@mellanox.com> wrote:
>
>
> On 10/01/2017 23:58, Joe Stringer wrote:
>>
>> On 10 January 2017 at 06:36, Paul Blakey <paulb@mellanox.com> wrote:
>>>
>>>
>>>
>>> On 06/01/2017 01:28, Joe Stringer wrote:
>>>>
>>>>
>>>> On 25 December 2016 at 03:39, Paul Blakey <paulb@mellanox.com> wrote:
>>>>>
>>>>>
>>>>> Using the new netdev flow api operate will now try and
>>>>> offload flows to the relevant netdev of the input port.
>>>>> Other operate methods flows will come in later patches.
>>>>>
>>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>>>> Reviewed-by: Roi Dayan <roid@mellanox.com>
>>>>> ---
>>>>>   lib/dpif-netlink.c | 232
>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>>>   1 file changed, 228 insertions(+), 4 deletions(-)
>>>>>
>>>>> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
>>>>> index 3d8940e..717af90 100644
>>>>> --- a/lib/dpif-netlink.c
>>>>> +++ b/lib/dpif-netlink.c
>>>>> @@ -1908,15 +1908,239 @@ dpif_netlink_operate__(struct dpif_netlink
>>>>> *dpif,
>>>>>       return n_ops;
>>>>>   }
>>>>>
>>>>> +static int
>>>>> +parse_key_and_mask_to_match(const struct nlattr *key, size_t key_len,
>>>>> +                            const struct nlattr *mask, size_t
>>>>> mask_len,
>>>>> +                            struct match *match)
>>>>> +{
>>>>> +    enum odp_key_fitness fitness;
>>>>> +
>>>>> +    fitness = odp_flow_key_to_flow(key, key_len, &match->flow);
>>>>> +    if (fitness) {
>>>>> +        /* This should not happen: it indicates that
>>>>> odp_flow_key_from_flow()
>>>>> +         * and odp_flow_key_to_flow() disagree on the acceptable form
>>>>> of
>>>>> a
>>>>> +         * flow.  Log the problem as an error, with enough details to
>>>>> enable
>>>>> +         * debugging. */
>>>>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>>>>> +
>>>>> +        if (!VLOG_DROP_ERR(&rl)) {
>>>>> +            struct ds s;
>>>>> +
>>>>> +            ds_init(&s);
>>>>> +            odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
>>>>> +            VLOG_ERR("internal error parsing flow key %s",
>>>>> ds_cstr(&s));
>>>>> +            ds_destroy(&s);
>>>>> +        }
>>>>> +
>>>>> +        return EINVAL;
>>>>> +    }
>>>>> +
>>>>> +    fitness = odp_flow_key_to_mask(mask, mask_len, &match->wc,
>>>>> &match->flow);
>>>>> +    if (fitness) {
>>>>> +        /* This should not happen: it indicates that
>>>>> +         * odp_flow_key_from_mask() and odp_flow_key_to_mask()
>>>>> +         * disagree on the acceptable form of a mask.  Log the problem
>>>>> +         * as an error, with enough details to enable debugging. */
>>>>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>>>>> +
>>>>> +        if (!VLOG_DROP_ERR(&rl)) {
>>>>> +            struct ds s;
>>>>> +
>>>>> +            VLOG_ERR("internal error parsing flow mask %s (%s)",
>>>>> +                     ds_cstr(&s), odp_key_fitness_to_string(fitness));
>>>>> +            ds_destroy(&s);
>>>>> +        }
>>>>> +
>>>>> +        return EINVAL;
>>>>> +    }
>>>>> +
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>> +static bool
>>>>> +parse_flow_put(struct dpif_netlink *dpif, struct dpif_flow_put *put)
>>>>> +{
>>>>> +    struct match match;
>>>>> +    odp_port_t in_port;
>>>>> +    const struct nlattr *nla;
>>>>> +    size_t left;
>>>>> +    int outputs = 0;
>>>>> +    struct ofpbuf buf;
>>>>> +    uint64_t act_stub[1024 / 8];
>>>>> +    size_t offset;
>>>>> +    struct nlattr *act;
>>>>> +    struct netdev *dev;
>>>>> +    int err;
>>>>> +
>>>>> +    /* 0x1234 - fake eth type sent to probe feature */
>>>>> +    if (put->flags & DPIF_FP_PROBE || match.flow.dl_type ==
>>>>> htons(0x1234)) {
>>>>> +        return false;
>>>>> +    }
>>>>> +
>>>>> +    if (parse_key_and_mask_to_match(put->key, put->key_len, put->mask,
>>>>> +                                put->mask_len, &match)) {
>>>>> +        return false;
>>>>> +    }
>>>>> +
>>>>> +    in_port = match.flow.in_port.odp_port;
>>>>> +    ofpbuf_use_stub(&buf, act_stub, sizeof act_stub);
>>>>> +    offset = nl_msg_start_nested(&buf, OVS_FLOW_ATTR_ACTIONS);
>>>>> +    NL_ATTR_FOR_EACH(nla, left, put->actions, put->actions_len) {
>>>>> +        if (nl_attr_type(nla) == OVS_ACTION_ATTR_OUTPUT) {
>>>>> +            struct netdev *outdev;
>>>>> +            int ifindex_out;
>>>>> +            const struct netdev_tunnel_config *tnl_cfg;
>>>>> +            size_t out_off;
>>>>> +            odp_port_t out_port;
>>>>> +
>>>>> +            outputs++;
>>>>> +            if (outputs > 1) {
>>>>> +                break;
>>>>> +            }
>>>>> +
>>>>> +            out_port = nl_attr_get_u32(nla);
>>>>> +            outdev = netdev_hmap_port_get(out_port,
>>>>> dpif->dpif.dpif_class);
>>>>> +            tnl_cfg = netdev_get_tunnel_config(outdev);
>>>>> +
>>>>> +            out_off = nl_msg_start_nested(&buf,
>>>>> OVS_ACTION_ATTR_OUTPUT);
>>>>> +            ifindex_out = netdev_get_ifindex(outdev);
>>>>> +            nl_msg_put_u32(&buf, OVS_ACTION_ATTR_OUTPUT, ifindex_out);
>>>>> +            if (tnl_cfg && tnl_cfg->dst_port != 0) {
>>>>> +                nl_msg_put_u32(&buf, OVS_TUNNEL_KEY_ATTR_TP_DST,
>>>>> tnl_cfg->dst_port);
>>>>> +            }
>>>>> +            nl_msg_end_nested(&buf, out_off);
>>>>> +
>>>>> +            if (outdev) {
>>>>> +                netdev_close(outdev);
>>>>> +            }
>>>>> +        } else {
>>>>> +            nl_msg_put_unspec(&buf, nl_attr_type(nla),
>>>>> nl_attr_get(nla),
>>>>> +                              nl_attr_get_size(nla));
>>>>> +        }
>>>>> +    }
>>>>> +    nl_msg_end_nested(&buf, offset);
>>>>> +
>>>>> +    if (outputs > 1) {
>>>>> +        return false;
>>>>> +    }
>>>>> +
>>>>> +    act = ofpbuf_at_assert(&buf, offset, sizeof(struct nlattr));
>>>>> +    dev = netdev_hmap_port_get(in_port, dpif->dpif.dpif_class);
>>>>> +    err = netdev_flow_put(dev, &match, CONST_CAST(struct nlattr *,
>>>>> +                                                  nl_attr_get(act)),
>>>>> +                          nl_attr_get_size(act), put->stats,
>>>>> +                          CONST_CAST(ovs_u128 *, put->ufid));
>>>>> +    netdev_close(dev);
>>>>> +
>>>>> +    if (!err) {
>>>>> +        if (put->flags & DPIF_FP_MODIFY) {
>>>>> +            struct dpif_op *opp;
>>>>> +            struct dpif_op op;
>>>>> +
>>>>> +            op.type = DPIF_OP_FLOW_DEL;
>>>>> +            op.u.flow_del.key = put->key;
>>>>> +            op.u.flow_del.key_len = put->key_len;
>>>>> +            op.u.flow_del.ufid = put->ufid;
>>>>> +            op.u.flow_del.pmd_id = put->pmd_id;
>>>>> +            op.u.flow_del.stats = NULL;
>>>>> +            op.u.flow_del.terse = false;
>>>>> +
>>>>> +            opp = &op;
>>>>> +            dpif_netlink_operate__(dpif, &opp, 1);
>>>>> +        }
>>>>> +        VLOG_DBG("added flow");
>>>>> +        return true;
>>>>> +    }
>>>>> +    VLOG_DBG("failed adding flow: %s", ovs_strerror(err));
>>>>> +
>>>>> +    return false;
>>>>> +}
>>>>> +
>>>>> +static void
>>>>> +dbg_print_flow(const struct nlattr *key, size_t key_len,
>>>>> +               const struct nlattr *mask, size_t mask_len,
>>>>> +               const struct nlattr *actions, size_t actions_len,
>>>>> +               const ovs_u128 *ufid,
>>>>> +               const char *op)
>>>>> +{
>>>>> +        struct ds s;
>>>>> +
>>>>> +        ds_init(&s);
>>>>> +        ds_put_cstr(&s, op);
>>>>> +        ds_put_cstr(&s, " (");
>>>>> +        odp_format_ufid(ufid, &s);
>>>>> +        ds_put_cstr(&s, ")");
>>>>> +        if (key_len) {
>>>>> +            ds_put_cstr(&s, "\nflow (verbose): ");
>>>>> +            odp_flow_format(key, key_len, mask, mask_len, NULL, &s,
>>>>> true);
>>>>> +            ds_put_cstr(&s, "\nflow: ");
>>>>> +            odp_flow_format(key, key_len, mask, mask_len, NULL, &s,
>>>>> false);
>>>>> +        }
>>>>> +        if (actions_len) {
>>>>> +            ds_put_cstr(&s, "\nactions: ");
>>>>> +            format_odp_actions(&s, actions, actions_len);
>>>>> +        }
>>>>> +        VLOG_DBG("\n%s", ds_cstr(&s));
>>>>> +        ds_destroy(&s);
>>>>> +}
>>>>> +
>>>>> +static bool
>>>>> +try_send_to_netdev(struct dpif_netlink *dpif, struct dpif_op *op)
>>>>> +{
>>>>> +    switch (op->type) {
>>>>> +    case DPIF_OP_FLOW_PUT: {
>>>>> +        struct dpif_flow_put *put = &op->u.flow_put;
>>>>> +
>>>>> +        if (!put->ufid) {
>>>>> +            return false;
>>>>> +        }
>>>>> +        dbg_print_flow(put->key, put->key_len, put->mask,
>>>>> put->mask_len,
>>>>> +                       put->actions, put->actions_len, put->ufid,
>>>>> "PUT");
>>>>> +        return parse_flow_put(dpif, put);
>>>>> +    }
>>>>> +    case DPIF_OP_FLOW_DEL:
>>>>> +    case DPIF_OP_FLOW_GET:
>>>>> +    case DPIF_OP_EXECUTE:
>>>>> +    default:
>>>>> +        break;
>>>>> +    }
>>>>> +    return false;
>>>>> +}
>>>>> +
>>>>>   static void
>>>>>   dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t
>>>>> n_ops)
>>>>>   {
>>>>>       struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>>>>> +    struct dpif_op **new_ops;
>>>>> +    int n_new_ops = 0;
>>>>> +    int i = 0;
>>>>> +
>>>>> +    if (!netdev_flow_api_enabled) {
>>>>> +        new_ops = ops;
>>>>> +        n_new_ops = n_ops;
>>>>> +    } else {
>>>>> +        new_ops = xmalloc((sizeof *new_ops) * n_ops);
>>>>> +        n_new_ops = 0;
>>>>> +
>>>>> +        for (i = 0; i < n_ops; i++) {
>>>>> +            if (try_send_to_netdev(dpif, ops[i])) {
>>>>> +                ops[i]->error = 0;
>>>>
>>>>
>>>> What if the hardware returns EEXIST? Shouldn't we return EEXIST?
>>>
>>>
>>> Right it should, we'll fix that.
>>>>
>>>>
>>>>
>>>> What if the hardware reaches some resource constraint? This isn't
>>>> required for an initial implementation, but it may be nice to have
>>>> some heuristic to try to cut down on the failed syscalls if userspace
>>>> has become aware that the hardware is out of resources. (Getting good
>>>> visibility on this would also matter if you tried to deploy this).
>>>
>>>
>>> Right, do you mean that if certain kinds of flow fail (a specific mask
>>> type), don't try again (with the same mask)?
>>> Is it done in kernel?
>>
>>
>> There's a couple of things: On the side of resource constraints, there
>> is currently a ceiling of about 200,000 flows, above which userspace
>> will not attempt to install a new flow. This logic is in
>> ofproto-dpif-upcall. Depending on how complex your constraints are,
>> maybe there is a way to model the hardware resource in the userspace
>> so that once the limits are hit, userspace minimizes the number of
>> failed syscalls due to hardware constraints. Simplest model would be
>> something like, when TC starts returning error codes for ENOSPC or
>> whatever the "out of hardware resource" error codes are, then you take
>> the current number of hardware flows and choose that as the maximum
>> number of hardware flows. Future flow installs to hardware will fail
>> out early based on this "n_flows > max_flows" logic. Obviously
>> depending on how constrained your hardware is, and what the
>> constraints look like, this may be useful or useless. If hardware
>> supports 2K flows, then you're more likely to get benefit out of being
>> aware of this than if the hardware allows 200K arbitrary flows. Also I
>> recognise that hardware may arrange flows differently so two flows may
>> consume different amounts of hardware resources.
>>
>> The second part is if it's certain kinds of flow. The "probe"s in
>> ofproto-dpif allow datapath feature detection at runtime, which can
>> then be used to change the behaviour. For instance, if there is no
>> support in datapath for a particular action, then the OpenFlow layer
>> will return errors when a controller attempts to use that action (as
>> we can't satisfy the flow mod request). For TC, it may be more like,
>> during datapath initialization we detect the supported features so
>> that flows may be checked against this feature support before going
>> down to the kernel to install the flow (which would fail if, for
>> instance, you tried to use one of the newer fields against an older
>> kernel that has only the initial flower support).
>>
>
> Thanks for the suggestions. We need to look into that but I think
> we can postpone this for the current patch set, and do these optimizations
> later.

That seems reasonable.
diff mbox

Patch

diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index 3d8940e..717af90 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -1908,15 +1908,239 @@  dpif_netlink_operate__(struct dpif_netlink *dpif,
     return n_ops;
 }
 
+static int
+parse_key_and_mask_to_match(const struct nlattr *key, size_t key_len,
+                            const struct nlattr *mask, size_t mask_len,
+                            struct match *match)
+{
+    enum odp_key_fitness fitness;
+
+    fitness = odp_flow_key_to_flow(key, key_len, &match->flow);
+    if (fitness) {
+        /* This should not happen: it indicates that odp_flow_key_from_flow()
+         * and odp_flow_key_to_flow() disagree on the acceptable form of a
+         * flow.  Log the problem as an error, with enough details to enable
+         * debugging. */
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+
+        if (!VLOG_DROP_ERR(&rl)) {
+            struct ds s;
+
+            ds_init(&s);
+            odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
+            VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
+            ds_destroy(&s);
+        }
+
+        return EINVAL;
+    }
+
+    fitness = odp_flow_key_to_mask(mask, mask_len, &match->wc, &match->flow);
+    if (fitness) {
+        /* This should not happen: it indicates that
+         * odp_flow_key_from_mask() and odp_flow_key_to_mask()
+         * disagree on the acceptable form of a mask.  Log the problem
+         * as an error, with enough details to enable debugging. */
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+
+        if (!VLOG_DROP_ERR(&rl)) {
+            struct ds s;
+
+            VLOG_ERR("internal error parsing flow mask %s (%s)",
+                     ds_cstr(&s), odp_key_fitness_to_string(fitness));
+            ds_destroy(&s);
+        }
+
+        return EINVAL;
+    }
+
+    return 0;
+}
+
+static bool
+parse_flow_put(struct dpif_netlink *dpif, struct dpif_flow_put *put)
+{
+    struct match match;
+    odp_port_t in_port;
+    const struct nlattr *nla;
+    size_t left;
+    int outputs = 0;
+    struct ofpbuf buf;
+    uint64_t act_stub[1024 / 8];
+    size_t offset;
+    struct nlattr *act;
+    struct netdev *dev;
+    int err;
+
+    /* 0x1234 - fake eth type sent to probe feature */
+    if (put->flags & DPIF_FP_PROBE || match.flow.dl_type == htons(0x1234)) {
+        return false;
+    }
+
+    if (parse_key_and_mask_to_match(put->key, put->key_len, put->mask,
+                                put->mask_len, &match)) {
+        return false;
+    }
+
+    in_port = match.flow.in_port.odp_port;
+    ofpbuf_use_stub(&buf, act_stub, sizeof act_stub);
+    offset = nl_msg_start_nested(&buf, OVS_FLOW_ATTR_ACTIONS);
+    NL_ATTR_FOR_EACH(nla, left, put->actions, put->actions_len) {
+        if (nl_attr_type(nla) == OVS_ACTION_ATTR_OUTPUT) {
+            struct netdev *outdev;
+            int ifindex_out;
+            const struct netdev_tunnel_config *tnl_cfg;
+            size_t out_off;
+            odp_port_t out_port;
+
+            outputs++;
+            if (outputs > 1) {
+                break;
+            }
+
+            out_port = nl_attr_get_u32(nla);
+            outdev = netdev_hmap_port_get(out_port, dpif->dpif.dpif_class);
+            tnl_cfg = netdev_get_tunnel_config(outdev);
+
+            out_off = nl_msg_start_nested(&buf, OVS_ACTION_ATTR_OUTPUT);
+            ifindex_out = netdev_get_ifindex(outdev);
+            nl_msg_put_u32(&buf, OVS_ACTION_ATTR_OUTPUT, ifindex_out);
+            if (tnl_cfg && tnl_cfg->dst_port != 0) {
+                nl_msg_put_u32(&buf, OVS_TUNNEL_KEY_ATTR_TP_DST, tnl_cfg->dst_port);
+            }
+            nl_msg_end_nested(&buf, out_off);
+
+            if (outdev) {
+                netdev_close(outdev);
+            }
+        } else {
+            nl_msg_put_unspec(&buf, nl_attr_type(nla), nl_attr_get(nla),
+                              nl_attr_get_size(nla));
+        }
+    }
+    nl_msg_end_nested(&buf, offset);
+
+    if (outputs > 1) {
+        return false;
+    }
+
+    act = ofpbuf_at_assert(&buf, offset, sizeof(struct nlattr));
+    dev = netdev_hmap_port_get(in_port, dpif->dpif.dpif_class);
+    err = netdev_flow_put(dev, &match, CONST_CAST(struct nlattr *,
+                                                  nl_attr_get(act)),
+                          nl_attr_get_size(act), put->stats,
+                          CONST_CAST(ovs_u128 *, put->ufid));
+    netdev_close(dev);
+
+    if (!err) {
+        if (put->flags & DPIF_FP_MODIFY) {
+            struct dpif_op *opp;
+            struct dpif_op op;
+
+            op.type = DPIF_OP_FLOW_DEL;
+            op.u.flow_del.key = put->key;
+            op.u.flow_del.key_len = put->key_len;
+            op.u.flow_del.ufid = put->ufid;
+            op.u.flow_del.pmd_id = put->pmd_id;
+            op.u.flow_del.stats = NULL;
+            op.u.flow_del.terse = false;
+
+            opp = &op;
+            dpif_netlink_operate__(dpif, &opp, 1);
+        }
+        VLOG_DBG("added flow");
+        return true;
+    }
+    VLOG_DBG("failed adding flow: %s", ovs_strerror(err));
+
+    return false;
+}
+
+static void
+dbg_print_flow(const struct nlattr *key, size_t key_len,
+               const struct nlattr *mask, size_t mask_len,
+               const struct nlattr *actions, size_t actions_len,
+               const ovs_u128 *ufid,
+               const char *op)
+{
+        struct ds s;
+
+        ds_init(&s);
+        ds_put_cstr(&s, op);
+        ds_put_cstr(&s, " (");
+        odp_format_ufid(ufid, &s);
+        ds_put_cstr(&s, ")");
+        if (key_len) {
+            ds_put_cstr(&s, "\nflow (verbose): ");
+            odp_flow_format(key, key_len, mask, mask_len, NULL, &s, true);
+            ds_put_cstr(&s, "\nflow: ");
+            odp_flow_format(key, key_len, mask, mask_len, NULL, &s, false);
+        }
+        if (actions_len) {
+            ds_put_cstr(&s, "\nactions: ");
+            format_odp_actions(&s, actions, actions_len);
+        }
+        VLOG_DBG("\n%s", ds_cstr(&s));
+        ds_destroy(&s);
+}
+
+static bool
+try_send_to_netdev(struct dpif_netlink *dpif, struct dpif_op *op)
+{
+    switch (op->type) {
+    case DPIF_OP_FLOW_PUT: {
+        struct dpif_flow_put *put = &op->u.flow_put;
+
+        if (!put->ufid) {
+            return false;
+        }
+        dbg_print_flow(put->key, put->key_len, put->mask, put->mask_len,
+                       put->actions, put->actions_len, put->ufid, "PUT");
+        return parse_flow_put(dpif, put);
+    }
+    case DPIF_OP_FLOW_DEL:
+    case DPIF_OP_FLOW_GET:
+    case DPIF_OP_EXECUTE:
+    default:
+        break;
+    }
+    return false;
+}
+
 static void
 dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t n_ops)
 {
     struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
+    struct dpif_op **new_ops;
+    int n_new_ops = 0;
+    int i = 0;
+
+    if (!netdev_flow_api_enabled) {
+        new_ops = ops;
+        n_new_ops = n_ops;
+    } else {
+        new_ops = xmalloc((sizeof *new_ops) * n_ops);
+        n_new_ops = 0;
+
+        for (i = 0; i < n_ops; i++) {
+            if (try_send_to_netdev(dpif, ops[i])) {
+                ops[i]->error = 0;
+            } else {
+                new_ops[n_new_ops++] = ops[i];
+            }
+        }
+        ops = new_ops;
+    }
+
+    while (n_new_ops > 0) {
+        size_t chunk = dpif_netlink_operate__(dpif, new_ops, n_new_ops);
 
-    while (n_ops > 0) {
-        size_t chunk = dpif_netlink_operate__(dpif, ops, n_ops);
-        ops += chunk;
-        n_ops -= chunk;
+        new_ops += chunk;
+        n_new_ops -= chunk;
+    }
+
+    if (netdev_flow_api_enabled) {
+       free(ops);
     }
 }