Message ID | 1482665989-791-10-git-send-email-paulb@mellanox.com |
---|---|
State | Changes Requested |
Headers | show |
On 25 December 2016 at 03:39, Paul Blakey <paulb@mellanox.com> wrote: > While dumping flows, dump flows that were offloaded to > netdev and parse them back to dpif flow. > > Signed-off-by: Paul Blakey <paulb@mellanox.com> > Reviewed-by: Roi Dayan <roid@mellanox.com> > --- > lib/dpif-netlink.c | 179 +++++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 179 insertions(+) > > diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c > index 36f2888..3d8940e 100644 > --- a/lib/dpif-netlink.c > +++ b/lib/dpif-netlink.c > @@ -38,6 +38,7 @@ > #include "flow.h" > #include "fat-rwlock.h" > #include "netdev.h" > +#include "netdev-provider.h" > #include "netdev-linux.h" > #include "netdev-vport.h" > #include "netlink-conntrack.h" > @@ -55,6 +56,7 @@ > #include "unaligned.h" > #include "util.h" > #include "openvswitch/vlog.h" > +#include "openvswitch/match.h" > > VLOG_DEFINE_THIS_MODULE(dpif_netlink); > #ifdef _WIN32 > @@ -68,6 +70,8 @@ enum { MAX_PORTS = USHRT_MAX }; > * missing if we have old headers. */ > #define ETH_FLAG_LRO (1 << 15) /* LRO is enabled */ > > +#define FLOW_DUMP_MAX_BATCH 50 > + > struct dpif_netlink_dp { > /* Generic Netlink header. */ > uint8_t cmd; > @@ -1355,6 +1359,10 @@ struct dpif_netlink_flow_dump { > struct dpif_flow_dump up; > struct nl_dump nl_dump; > atomic_int status; > + struct netdev_flow_dump **netdev_dumps; > + int netdev_num; > + int netdev_given; > + struct ovs_mutex netdev_lock; Could you add a brief comment above these variables that describes their use? (It's also common in OVS code to mention that, eg, netdev_lock protects the following elements. ) If there's a more descriptive name than "netdev_num", like netdev_max_dumps or something then please use that instead. At a glance, "given" and "num" don't provide particularly much context about how they relate to each other or to the dump. 
> }; > > static struct dpif_netlink_flow_dump * > @@ -1363,6 +1371,34 @@ dpif_netlink_flow_dump_cast(struct dpif_flow_dump *dump) > return CONTAINER_OF(dump, struct dpif_netlink_flow_dump, up); > } > > +static void start_netdev_dump(const struct dpif *dpif_, > + struct dpif_netlink_flow_dump *dump) { > + > + if (!netdev_flow_api_enabled) { > + dump->netdev_num = 0; > + return; > + } Typically for style we still place all variable declarations at the top of a function, in a christmas tree long lines to short lines, before functional code like this. > + > + struct netdev_list_element *element; > + struct ovs_list port_list; > + int ports = netdev_hmap_port_get_list(dpif_->dpif_class, &port_list); > + int i = 0; > + > + dump->netdev_dumps = > + ports ? xzalloc(sizeof(struct netdev_flow_dump *) * ports) : 0; Can this be sizeof(dump->netdev_dumps)? > + dump->netdev_num = ports; > + dump->netdev_given = 0; > + > + LIST_FOR_EACH(element, node, &port_list) { > + dump->netdev_dumps[i] = netdev_flow_dump_create(element->netdev); > + dump->netdev_dumps[i]->port = element->port_no; > + i++; > + } As a matter of style, it's easier to see that this loop is bounded by 'ports' (and that number is correct) if it's structured as for (i = 0; i < ports; i++) { element = get_next_node; ... } Also, it seems that even if the netdev doesn't support flow_dump, we allocate a netdev_flow_dump and add it to the netdev_dumps here.. perhaps we could/should skip it for these netdevs instead? > + netdev_port_list_del(&port_list); > + > + ovs_mutex_init(&dump->netdev_lock); I don't see a corresponding ovs_mutex_destroy() call for this. 
> +} > + > static struct dpif_flow_dump * > dpif_netlink_flow_dump_create(const struct dpif *dpif_, bool terse) > { > @@ -1387,6 +1423,8 @@ dpif_netlink_flow_dump_create(const struct dpif *dpif_, bool terse) > atomic_init(&dump->status, 0); > dump->up.terse = terse; > > + start_netdev_dump(dpif_, dump); > + > return &dump->up; > } > > @@ -1397,6 +1435,16 @@ dpif_netlink_flow_dump_destroy(struct dpif_flow_dump *dump_) > unsigned int nl_status = nl_dump_done(&dump->nl_dump); > int dump_status; > > + if (netdev_flow_api_enabled) { > + for (int i = 0; i < dump->netdev_num; i++) { > + int err = netdev_flow_dump_destroy(dump->netdev_dumps[i]); > + if (err != 0 && err != EOPNOTSUPP) { > + VLOG_ERR("failed dumping netdev: %s", ovs_strerror(err)); > + } > + } > + free(dump->netdev_dumps); > + } You don't really need to check for netdev_flow_api_enabled here; netdev_num will be 0 if it is disabled, so that for loop turns into a no-op; then you could initialize dump->netdev_dumps to NULL in that case and unconditionally free it. It's a bit simpler to read the code if you don't have to think about whether or not hardware offloads are enabled. > + > /* No other thread has access to 'dump' at this point. */ > atomic_read_relaxed(&dump->status, &dump_status); > free(dump); > @@ -1410,6 +1458,11 @@ struct dpif_netlink_flow_dump_thread { > struct dpif_flow_stats stats; > struct ofpbuf nl_flows; /* Always used to store flows. */ > struct ofpbuf *nl_actions; /* Used if kernel does not supply actions. */ > + struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH]; > + struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH]; > + struct odputil_keybuf actbuf[FLOW_DUMP_MAX_BATCH]; > + int netdev_cur_dump; > + bool netdev_done; I wonder if it's worthwhile to reuse 'nl_flows' to store all of these netlink-formatted key/mask/acts instead of having these keybufs? It seems that it is currently unused for the first half of the dpif_netlink_flow_dump while the flows are being dumped from the netdev. 
Regardless of the above question, I also question whether FLOW_DUMP_MAX_BATCH is too big for dumping from the kernel. How many tc flows will we really get from the kernel at once? > }; > > static struct dpif_netlink_flow_dump_thread * > @@ -1429,6 +1482,8 @@ dpif_netlink_flow_dump_thread_create(struct dpif_flow_dump *dump_) > thread->dump = dump; > ofpbuf_init(&thread->nl_flows, NL_DUMP_BUFSIZE); > thread->nl_actions = NULL; > + thread->netdev_cur_dump = 0; > + thread->netdev_done = !(thread->netdev_cur_dump < dump->netdev_num); > > return &thread->up; > } > @@ -1466,6 +1521,90 @@ dpif_netlink_flow_to_dpif_flow(struct dpif *dpif, struct dpif_flow *dpif_flow, > dpif_netlink_flow_get_stats(datapath_flow, &dpif_flow->stats); > } > > +static void > +dpif_netlink_advance_netdev_dump(struct dpif_netlink_flow_dump_thread *thread) > +{ > + struct dpif_netlink_flow_dump *dump = thread->dump; > + > + ovs_mutex_lock(&dump->netdev_lock); > + /* if we haven't finished (dumped everything) */ > + if (dump->netdev_given < dump->netdev_num) { > + /* if we are the first to find that given dump is finished > + * (for race condition, e.g 3 finish dump 0 at the same time) */ Why is there a race condition here if this is executed under netdev_lock? > + if (thread->netdev_cur_dump == dump->netdev_given) { > + thread->netdev_cur_dump = ++dump->netdev_given; > + /* did we just finish the last dump? done. 
*/ > + if (dump->netdev_given == dump->netdev_num) { > + thread->netdev_done = true; > + } > + } else { > + /* otherwise, we are behind, catch up */ > + thread->netdev_cur_dump = dump->netdev_given; > + } > + } else { > + /* some other thread finished */ > + thread->netdev_done = true; > + } > + ovs_mutex_unlock(&dump->netdev_lock); > +} > + > +static struct odp_support netdev_flow_support = { > + .max_mpls_depth = SIZE_MAX, > + .recirc = false, > + .ct_state = false, > + .ct_zone = false, > + .ct_mark = false, > + .ct_label = false, > +}; > + > +static int > +dpif_netlink_netdev_match_to_dpif_flow(struct match *match, > + struct ofpbuf *key_buf, > + struct ofpbuf *mask_buf, > + struct nlattr *actions, > + struct dpif_flow_stats *stats, > + ovs_u128 *ufid, > + struct dpif_flow *flow, > + bool terse OVS_UNUSED) > +{ > + > + struct odp_flow_key_parms odp_parms = { > + .flow = &match->flow, > + .mask = &match->wc.masks, > + .support = netdev_flow_support, There's also 'key_buf' field in parms that may be needed. 
> + }; > + size_t offset; > + > + memset(flow, 0, sizeof *flow); > + > + /* Key */ > + offset = key_buf->size; > + flow->key = ofpbuf_tail(key_buf); > + odp_flow_key_from_flow(&odp_parms, key_buf); > + flow->key_len = key_buf->size - offset; > + > + /* Mask */ > + offset = mask_buf->size; > + flow->mask = ofpbuf_tail(mask_buf); > + odp_parms.key_buf = key_buf; > + odp_flow_key_from_mask(&odp_parms, mask_buf); > + flow->mask_len = mask_buf->size - offset; > + > + /* Actions */ > + flow->actions = nl_attr_get(actions); > + flow->actions_len = nl_attr_get_size(actions); > + > + /* Stats */ > + memcpy(&flow->stats, stats, sizeof *stats); > + > + /* UFID */ > + flow->ufid_present = true; > + flow->ufid = *ufid; > + > + flow->pmd_id = PMD_ID_NULL; > + return 0; > +} > + > static int > dpif_netlink_flow_dump_next(struct dpif_flow_dump_thread *thread_, > struct dpif_flow *flows, int max_flows) > @@ -1475,11 +1614,51 @@ dpif_netlink_flow_dump_next(struct dpif_flow_dump_thread *thread_, > struct dpif_netlink_flow_dump *dump = thread->dump; > struct dpif_netlink *dpif = dpif_netlink_cast(thread->up.dpif); > int n_flows; > + int i = 0; > > ofpbuf_delete(thread->nl_actions); > thread->nl_actions = NULL; > > n_flows = 0; > + > + while (!thread->netdev_done && n_flows < max_flows > + && i < FLOW_DUMP_MAX_BATCH) { > + struct odputil_keybuf *maskbuf = &thread->maskbuf[i]; > + struct odputil_keybuf *keybuf = &thread->keybuf[i]; > + struct odputil_keybuf *actbuf = &thread->actbuf[i]; > + struct ofpbuf key, mask, act; > + struct dpif_flow *f = &flows[n_flows]; > + int cur = thread->netdev_cur_dump; > + struct netdev_flow_dump *netdev_dump = dump->netdev_dumps[cur]; > + struct match match; > + struct nlattr *actions; > + struct dpif_flow_stats stats; > + ovs_u128 ufid; > + bool has_next; > + > + ofpbuf_use_stack(&key, keybuf, sizeof *keybuf); > + ofpbuf_use_stack(&act, actbuf, sizeof *actbuf); > + ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf); > + has_next = 
netdev_flow_dump_next(netdev_dump, &match, > + &actions, &stats, > + &ufid, > + &thread->nl_flows, > + &act); > + if (has_next) { > + dpif_netlink_netdev_match_to_dpif_flow(&match, > + &key, &mask, > + actions, > + &stats, > + &ufid, > + f, > + dump->up.terse); > + n_flows++; > + i++; Seems like 'i' and 'n_flows' are trying to achieve the same objective. Can we just drop 'i'? > + } else { > + dpif_netlink_advance_netdev_dump(thread); > + } > + } > + > while (!n_flows > || (n_flows < max_flows && thread->nl_flows.size)) { > struct dpif_netlink_flow datapath_flow; > -- > 1.8.3.1 >
On 05/01/2017 23:27, Joe Stringer wrote: > On 25 December 2016 at 03:39, Paul Blakey <paulb@mellanox.com> wrote: >> While dumping flows, dump flows that were offloaded to >> netdev and parse them back to dpif flow. >> >> Signed-off-by: Paul Blakey <paulb@mellanox.com> >> Reviewed-by: Roi Dayan <roid@mellanox.com> >> --- >> lib/dpif-netlink.c | 179 +++++++++++++++++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 179 insertions(+) >> >> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c >> index 36f2888..3d8940e 100644 >> --- a/lib/dpif-netlink.c >> +++ b/lib/dpif-netlink.c >> @@ -38,6 +38,7 @@ >> #include "flow.h" >> #include "fat-rwlock.h" >> #include "netdev.h" >> +#include "netdev-provider.h" >> #include "netdev-linux.h" >> #include "netdev-vport.h" >> #include "netlink-conntrack.h" >> @@ -55,6 +56,7 @@ >> #include "unaligned.h" >> #include "util.h" >> #include "openvswitch/vlog.h" >> +#include "openvswitch/match.h" >> >> VLOG_DEFINE_THIS_MODULE(dpif_netlink); >> #ifdef _WIN32 >> @@ -68,6 +70,8 @@ enum { MAX_PORTS = USHRT_MAX }; >> * missing if we have old headers. */ >> #define ETH_FLAG_LRO (1 << 15) /* LRO is enabled */ >> >> +#define FLOW_DUMP_MAX_BATCH 50 >> + >> struct dpif_netlink_dp { >> /* Generic Netlink header. */ >> uint8_t cmd; >> @@ -1355,6 +1359,10 @@ struct dpif_netlink_flow_dump { >> struct dpif_flow_dump up; >> struct nl_dump nl_dump; >> atomic_int status; >> + struct netdev_flow_dump **netdev_dumps; >> + int netdev_num; >> + int netdev_given; >> + struct ovs_mutex netdev_lock; > Could you add a brief comment above these variables that describes > their use? (It's also common in OVS code to mention that, eg, > netdev_lock protects the following elements. ) > > If there's a more descriptive name than "netdev_num", like > netdev_max_dumps or something then please use that instead. At a > glance, "given" and "num" don't provide particularly much context > about how they relate to each other or to the dump. sure, thanks. 
> >> }; >> >> static struct dpif_netlink_flow_dump * >> @@ -1363,6 +1371,34 @@ dpif_netlink_flow_dump_cast(struct dpif_flow_dump *dump) >> return CONTAINER_OF(dump, struct dpif_netlink_flow_dump, up); >> } >> >> +static void start_netdev_dump(const struct dpif *dpif_, >> + struct dpif_netlink_flow_dump *dump) { >> + >> + if (!netdev_flow_api_enabled) { >> + dump->netdev_num = 0; >> + return; >> + } > Typically for style we still place all variable declarations at the > top of a function, in a christmas tree long lines to short lines, > before functional code like this. > >> + >> + struct netdev_list_element *element; >> + struct ovs_list port_list; >> + int ports = netdev_hmap_port_get_list(dpif_->dpif_class, &port_list); >> + int i = 0; >> + >> + dump->netdev_dumps = >> + ports ? xzalloc(sizeof(struct netdev_flow_dump *) * ports) : 0; > Can this be sizeof(dump->netdev_dumps)? Do you mean sizeof(*dump-netdev_dumps), or sizeof(dump->netdev_dumps[0]), if so yes. >> + dump->netdev_num = ports; >> + dump->netdev_given = 0; >> + >> + LIST_FOR_EACH(element, node, &port_list) { >> + dump->netdev_dumps[i] = netdev_flow_dump_create(element->netdev); >> + dump->netdev_dumps[i]->port = element->port_no; >> + i++; >> + } > As a matter of style, it's easier to see that this loop is bounded by > 'ports' (and that number is correct) if it's structured as > > for (i = 0; i < ports; i++) { > element = get_next_node; > ... > } > > Also, it seems that even if the netdev doesn't support flow_dump, we > allocate a netdev_flow_dump and add it to the netdev_dumps here.. > perhaps we could/should skip it for these netdevs instead? > >> + netdev_port_list_del(&port_list); >> + >> + ovs_mutex_init(&dump->netdev_lock); > I don't see a corresponding ovs_mutex_destroy() call for this. Good catch, thanks. 
>> +} >> + >> static struct dpif_flow_dump * >> dpif_netlink_flow_dump_create(const struct dpif *dpif_, bool terse) >> { >> @@ -1387,6 +1423,8 @@ dpif_netlink_flow_dump_create(const struct dpif *dpif_, bool terse) >> atomic_init(&dump->status, 0); >> dump->up.terse = terse; >> >> + start_netdev_dump(dpif_, dump); >> + >> return &dump->up; >> } >> >> @@ -1397,6 +1435,16 @@ dpif_netlink_flow_dump_destroy(struct dpif_flow_dump *dump_) >> unsigned int nl_status = nl_dump_done(&dump->nl_dump); >> int dump_status; >> >> + if (netdev_flow_api_enabled) { >> + for (int i = 0; i < dump->netdev_num; i++) { >> + int err = netdev_flow_dump_destroy(dump->netdev_dumps[i]); >> + if (err != 0 && err != EOPNOTSUPP) { >> + VLOG_ERR("failed dumping netdev: %s", ovs_strerror(err)); >> + } >> + } >> + free(dump->netdev_dumps); >> + } > You don't really need to check for netdev_flow_api_enabled here; > netdev_num will be 0 if it is disabled, so that for loop turns into a > no-op; then you could initialize dump->netdev_dumps to NULL in that > case and unconditionally free it. It's a bit simpler to read the code > if you don't have to think about whether or not hardware offloads are > enabled. > >> + >> /* No other thread has access to 'dump' at this point. */ >> atomic_read_relaxed(&dump->status, &dump_status); >> free(dump); >> @@ -1410,6 +1458,11 @@ struct dpif_netlink_flow_dump_thread { >> struct dpif_flow_stats stats; >> struct ofpbuf nl_flows; /* Always used to store flows. */ >> struct ofpbuf *nl_actions; /* Used if kernel does not supply actions. */ >> + struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH]; >> + struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH]; >> + struct odputil_keybuf actbuf[FLOW_DUMP_MAX_BATCH]; >> + int netdev_cur_dump; >> + bool netdev_done; > I wonder if it's worthwhile to reuse 'nl_flows' to store all of these > netlink-formatted key/mask/acts instead of having these keybufs? 
It > seems that it is currently unused for the first half of the > dpif_netlink_flow_dump while the flows are being dumped from the > netdev. > > Regardless of the above question, I also question whether > FLOW_DUMP_MAX_BATCH is too big for dumping from the kernel. How many > tc flows will we really get from the kernel at once? > >> }; >> >> static struct dpif_netlink_flow_dump_thread * >> @@ -1429,6 +1482,8 @@ dpif_netlink_flow_dump_thread_create(struct dpif_flow_dump *dump_) >> thread->dump = dump; >> ofpbuf_init(&thread->nl_flows, NL_DUMP_BUFSIZE); >> thread->nl_actions = NULL; >> + thread->netdev_cur_dump = 0; >> + thread->netdev_done = !(thread->netdev_cur_dump < dump->netdev_num); >> >> return &thread->up; >> } >> @@ -1466,6 +1521,90 @@ dpif_netlink_flow_to_dpif_flow(struct dpif *dpif, struct dpif_flow *dpif_flow, >> dpif_netlink_flow_get_stats(datapath_flow, &dpif_flow->stats); >> } >> >> +static void >> +dpif_netlink_advance_netdev_dump(struct dpif_netlink_flow_dump_thread *thread) >> +{ >> + struct dpif_netlink_flow_dump *dump = thread->dump; >> + >> + ovs_mutex_lock(&dump->netdev_lock); >> + /* if we haven't finished (dumped everything) */ >> + if (dump->netdev_given < dump->netdev_num) { >> + /* if we are the first to find that given dump is finished >> + * (for race condition, e.g 3 finish dump 0 at the same time) */ > Why is there a race condition here if this is executed under netdev_lock? The design is such that all threads are working together on the first dump to the last, in order. (at first they all on dump 0), and when one thread finds that the given dump is finished, they all move to the next. As the comment tried to explain, if 3 (or 2+) threads are working on the first dump, dump 0, (thread->netdev_cur_dump == 0) and finish at the same time, they all call advance func. Now the first one to get the lock advances the shared given dump, which signify which highest dump we have given (and all lower dumps have finished). 
The rest now enter and we check if the dump they have found to be finished is higher then the new one that was given, if not they catch up, so now all of them will work on dump 1. The race is that if 2 or more threads worked on the same dump and finished at the same time, if we just increased netdev_given without checking (thread->cur == given) for both of them, we would have increased given twice and skip one dump. > >> + if (thread->netdev_cur_dump == dump->netdev_given) { >> + thread->netdev_cur_dump = ++dump->netdev_given; >> + /* did we just finish the last dump? done. */ >> + if (dump->netdev_given == dump->netdev_num) { >> + thread->netdev_done = true; >> + } >> + } else { >> + /* otherwise, we are behind, catch up */ >> + thread->netdev_cur_dump = dump->netdev_given; >> + } >> + } else { >> + /* some other thread finished */ >> + thread->netdev_done = true; >> + } >> + ovs_mutex_unlock(&dump->netdev_lock); >> +} >> + >> +static struct odp_support netdev_flow_support = { >> + .max_mpls_depth = SIZE_MAX, >> + .recirc = false, >> + .ct_state = false, >> + .ct_zone = false, >> + .ct_mark = false, >> + .ct_label = false, >> +}; >> + >> +static int >> +dpif_netlink_netdev_match_to_dpif_flow(struct match *match, >> + struct ofpbuf *key_buf, >> + struct ofpbuf *mask_buf, >> + struct nlattr *actions, >> + struct dpif_flow_stats *stats, >> + ovs_u128 *ufid, >> + struct dpif_flow *flow, >> + bool terse OVS_UNUSED) >> +{ >> + >> + struct odp_flow_key_parms odp_parms = { >> + .flow = &match->flow, >> + .mask = &match->wc.masks, >> + .support = netdev_flow_support, > There's also 'key_buf' field in parms that may be needed. 
> >> + }; >> + size_t offset; >> + >> + memset(flow, 0, sizeof *flow); >> + >> + /* Key */ >> + offset = key_buf->size; >> + flow->key = ofpbuf_tail(key_buf); >> + odp_flow_key_from_flow(&odp_parms, key_buf); >> + flow->key_len = key_buf->size - offset; >> + >> + /* Mask */ >> + offset = mask_buf->size; >> + flow->mask = ofpbuf_tail(mask_buf); >> + odp_parms.key_buf = key_buf; >> + odp_flow_key_from_mask(&odp_parms, mask_buf); >> + flow->mask_len = mask_buf->size - offset; >> + >> + /* Actions */ >> + flow->actions = nl_attr_get(actions); >> + flow->actions_len = nl_attr_get_size(actions); >> + >> + /* Stats */ >> + memcpy(&flow->stats, stats, sizeof *stats); >> + >> + /* UFID */ >> + flow->ufid_present = true; >> + flow->ufid = *ufid; >> + >> + flow->pmd_id = PMD_ID_NULL; >> + return 0; >> +} >> + >> static int >> dpif_netlink_flow_dump_next(struct dpif_flow_dump_thread *thread_, >> struct dpif_flow *flows, int max_flows) >> @@ -1475,11 +1614,51 @@ dpif_netlink_flow_dump_next(struct dpif_flow_dump_thread *thread_, >> struct dpif_netlink_flow_dump *dump = thread->dump; >> struct dpif_netlink *dpif = dpif_netlink_cast(thread->up.dpif); >> int n_flows; >> + int i = 0; >> >> ofpbuf_delete(thread->nl_actions); >> thread->nl_actions = NULL; >> >> n_flows = 0; >> + >> + while (!thread->netdev_done && n_flows < max_flows >> + && i < FLOW_DUMP_MAX_BATCH) { >> + struct odputil_keybuf *maskbuf = &thread->maskbuf[i]; >> + struct odputil_keybuf *keybuf = &thread->keybuf[i]; >> + struct odputil_keybuf *actbuf = &thread->actbuf[i]; >> + struct ofpbuf key, mask, act; >> + struct dpif_flow *f = &flows[n_flows]; >> + int cur = thread->netdev_cur_dump; >> + struct netdev_flow_dump *netdev_dump = dump->netdev_dumps[cur]; >> + struct match match; >> + struct nlattr *actions; >> + struct dpif_flow_stats stats; >> + ovs_u128 ufid; >> + bool has_next; >> + >> + ofpbuf_use_stack(&key, keybuf, sizeof *keybuf); >> + ofpbuf_use_stack(&act, actbuf, sizeof *actbuf); >> + 
ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf); >> + has_next = netdev_flow_dump_next(netdev_dump, &match, >> + &actions, &stats, >> + &ufid, >> + &thread->nl_flows, >> + &act); >> + if (has_next) { >> + dpif_netlink_netdev_match_to_dpif_flow(&match, >> + &key, &mask, >> + actions, >> + &stats, >> + &ufid, >> + f, >> + dump->up.terse); >> + n_flows++; >> + i++; > Seems like 'i' and 'n_flows' are trying to achieve the same objective. > Can we just drop 'i'? > >> + } else { >> + dpif_netlink_advance_netdev_dump(thread); >> + } >> + } >> + >> while (!n_flows >> || (n_flows < max_flows && thread->nl_flows.size)) { >> struct dpif_netlink_flow datapath_flow; >> -- >> 1.8.3.1 >>
On 10 January 2017 at 03:45, Paul Blakey <paulb@mellanox.com> wrote: >>> + struct netdev_list_element *element; >>> + struct ovs_list port_list; >>> + int ports = netdev_hmap_port_get_list(dpif_->dpif_class, >>> &port_list); >>> + int i = 0; >>> + >>> + dump->netdev_dumps = >>> + ports ? xzalloc(sizeof(struct netdev_flow_dump *) * ports) : 0; >> >> Can this be sizeof(dump->netdev_dumps)? > > Do you mean sizeof(*dump-netdev_dumps), or sizeof(dump->netdev_dumps[0]), if > so yes. Yes that's what I meant. <snip> >>> @@ -1466,6 +1521,90 @@ dpif_netlink_flow_to_dpif_flow(struct dpif *dpif, >>> struct dpif_flow *dpif_flow, >>> dpif_netlink_flow_get_stats(datapath_flow, &dpif_flow->stats); >>> } >>> >>> +static void >>> +dpif_netlink_advance_netdev_dump(struct dpif_netlink_flow_dump_thread >>> *thread) >>> +{ >>> + struct dpif_netlink_flow_dump *dump = thread->dump; >>> + >>> + ovs_mutex_lock(&dump->netdev_lock); >>> + /* if we haven't finished (dumped everything) */ >>> + if (dump->netdev_given < dump->netdev_num) { >>> + /* if we are the first to find that given dump is finished >>> + * (for race condition, e.g 3 finish dump 0 at the same time) */ >> >> Why is there a race condition here if this is executed under netdev_lock? > > The design is such that all threads are working together on the first dump > to the last, in order. (at first they all on dump 0), > and when one thread finds that the given dump is finished, they all move to > the next. > As the comment tried to explain, if 3 (or 2+) threads are working on the > first dump, dump 0, > (thread->netdev_cur_dump == 0) and finish at the same time, they all call > advance func. > Now the first one to get the lock advances the shared given dump, which > signify which highest dump we have given > (and all lower dumps have finished). The rest now enter and we check if the > dump they have found to be > finished is higher then the new one that was given, if not they catch up, so > now all of them will work on dump 1. 
> > The race is that if 2 or more threads worked on the same dump and finished > at the same time, > if we just increased netdev_given without checking (thread->cur == given) > for both of them, > we would have increased given twice and skipped one dump. Thanks for the explanation. I think that the code would benefit from having this written somewhere - for instance, above the dpif_netlink_advance_netdev_dump() function.
On 05/01/2017 23:27, Joe Stringer wrote: > On 25 December 2016 at 03:39, Paul Blakey <paulb@mellanox.com> wrote: >> While dumping flows, dump flows that were offloaded to >> netdev and parse them back to dpif flow. >> >> Signed-off-by: Paul Blakey <paulb@mellanox.com> >> Reviewed-by: Roi Dayan <roid@mellanox.com> >> --- >> lib/dpif-netlink.c | 179 +++++++++++++++++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 179 insertions(+) >> >> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c >> index 36f2888..3d8940e 100644 >> --- a/lib/dpif-netlink.c >> +++ b/lib/dpif-netlink.c >> @@ -38,6 +38,7 @@ >> #include "flow.h" >> #include "fat-rwlock.h" >> #include "netdev.h" >> +#include "netdev-provider.h" >> #include "netdev-linux.h" >> #include "netdev-vport.h" >> #include "netlink-conntrack.h" >> @@ -55,6 +56,7 @@ >> #include "unaligned.h" >> #include "util.h" >> #include "openvswitch/vlog.h" >> +#include "openvswitch/match.h" >> >> VLOG_DEFINE_THIS_MODULE(dpif_netlink); >> #ifdef _WIN32 >> @@ -68,6 +70,8 @@ enum { MAX_PORTS = USHRT_MAX }; >> * missing if we have old headers. */ >> #define ETH_FLAG_LRO (1 << 15) /* LRO is enabled */ >> >> +#define FLOW_DUMP_MAX_BATCH 50 >> + >> struct dpif_netlink_dp { >> /* Generic Netlink header. */ >> uint8_t cmd; >> @@ -1355,6 +1359,10 @@ struct dpif_netlink_flow_dump { >> struct dpif_flow_dump up; >> struct nl_dump nl_dump; >> atomic_int status; >> + struct netdev_flow_dump **netdev_dumps; >> + int netdev_num; >> + int netdev_given; >> + struct ovs_mutex netdev_lock; > > Could you add a brief comment above these variables that describes > their use? (It's also common in OVS code to mention that, eg, > netdev_lock protects the following elements. ) > > If there's a more descriptive name than "netdev_num", like > netdev_max_dumps or something then please use that instead. At a > glance, "given" and "num" don't provide particularly much context > about how they relate to each other or to the dump. 
> >> }; >> >> static struct dpif_netlink_flow_dump * >> @@ -1363,6 +1371,34 @@ dpif_netlink_flow_dump_cast(struct dpif_flow_dump *dump) >> return CONTAINER_OF(dump, struct dpif_netlink_flow_dump, up); >> } >> >> +static void start_netdev_dump(const struct dpif *dpif_, >> + struct dpif_netlink_flow_dump *dump) { >> + >> + if (!netdev_flow_api_enabled) { >> + dump->netdev_num = 0; >> + return; >> + } > > Typically for style we still place all variable declarations at the > top of a function, in a christmas tree long lines to short lines, > before functional code like this. > >> + >> + struct netdev_list_element *element; >> + struct ovs_list port_list; >> + int ports = netdev_hmap_port_get_list(dpif_->dpif_class, &port_list); >> + int i = 0; >> + >> + dump->netdev_dumps = >> + ports ? xzalloc(sizeof(struct netdev_flow_dump *) * ports) : 0; > > Can this be sizeof(dump->netdev_dumps)? > >> + dump->netdev_num = ports; >> + dump->netdev_given = 0; >> + >> + LIST_FOR_EACH(element, node, &port_list) { >> + dump->netdev_dumps[i] = netdev_flow_dump_create(element->netdev); >> + dump->netdev_dumps[i]->port = element->port_no; >> + i++; >> + } > > As a matter of style, it's easier to see that this loop is bounded by > 'ports' (and that number is correct) if it's structured as > > for (i = 0; i < ports; i++) { > element = get_next_node; > ... > } > > Also, it seems that even if the netdev doesn't support flow_dump, we > allocate a netdev_flow_dump and add it to the netdev_dumps here.. > perhaps we could/should skip it for these netdevs instead? > >> + netdev_port_list_del(&port_list); >> + >> + ovs_mutex_init(&dump->netdev_lock); > > I don't see a corresponding ovs_mutex_destroy() call for this. 
> >> +} >> + >> static struct dpif_flow_dump * >> dpif_netlink_flow_dump_create(const struct dpif *dpif_, bool terse) >> { >> @@ -1387,6 +1423,8 @@ dpif_netlink_flow_dump_create(const struct dpif *dpif_, bool terse) >> atomic_init(&dump->status, 0); >> dump->up.terse = terse; >> >> + start_netdev_dump(dpif_, dump); >> + >> return &dump->up; >> } >> >> @@ -1397,6 +1435,16 @@ dpif_netlink_flow_dump_destroy(struct dpif_flow_dump *dump_) >> unsigned int nl_status = nl_dump_done(&dump->nl_dump); >> int dump_status; >> >> + if (netdev_flow_api_enabled) { >> + for (int i = 0; i < dump->netdev_num; i++) { >> + int err = netdev_flow_dump_destroy(dump->netdev_dumps[i]); >> + if (err != 0 && err != EOPNOTSUPP) { >> + VLOG_ERR("failed dumping netdev: %s", ovs_strerror(err)); >> + } >> + } >> + free(dump->netdev_dumps); >> + } > > You don't really need to check for netdev_flow_api_enabled here; > netdev_num will be 0 if it is disabled, so that for loop turns into a > no-op; then you could initialize dump->netdev_dumps to NULL in that > case and unconditionally free it. It's a bit simpler to read the code > if you don't have to think about whether or not hardware offloads are > enabled. > >> + >> /* No other thread has access to 'dump' at this point. */ >> atomic_read_relaxed(&dump->status, &dump_status); >> free(dump); >> @@ -1410,6 +1458,11 @@ struct dpif_netlink_flow_dump_thread { >> struct dpif_flow_stats stats; >> struct ofpbuf nl_flows; /* Always used to store flows. */ >> struct ofpbuf *nl_actions; /* Used if kernel does not supply actions. */ >> + struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH]; >> + struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH]; >> + struct odputil_keybuf actbuf[FLOW_DUMP_MAX_BATCH]; >> + int netdev_cur_dump; >> + bool netdev_done; > > I wonder if it's worthwhile to reuse 'nl_flows' to store all of these > netlink-formatted key/mask/acts instead of having these keybufs? 
It > seems that it is currently unused for the first half of the > dpif_netlink_flow_dump while the flows are being dumped from the > netdev. > > Regardless of the above question, I also question whether > FLOW_DUMP_MAX_BATCH is too big for dumping from the kernel. How many > tc flows will we really get from the kernel at once? Hi, Unlike dpif_netlink dumping nl_flows, which already gets the correct netlink stream format and can just take pointers to it, we need temporary buffers to store the conversion from struct match to dpif_flow key/mask/actions netlink attributes. So that's a KEY_SIZE*2*DUMP_MAX (size*(key+mask)*max) right there, same with actions (but probably can be smaller). We can't reuse nl_flows for that because that is used for nl_dump_next to keep the buffer filled with multiple replies. The flow described in kernel datapath netlink format and tc format should be about the same size, but tc doesn't support getting only stats (terse). NL_DUMP_BUFSIZE should be enough for ~30 flows (20 bytes for struct tcmsg, 32 for macs & masks, 12 for eth_type and ip_proto, 12 per port, actions and stats.... not all are present). But doesn't nl_dump_next refill it anyway (it won't just stop at the buffer size)? <snip>
diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index 36f2888..3d8940e 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -38,6 +38,7 @@ #include "flow.h" #include "fat-rwlock.h" #include "netdev.h" +#include "netdev-provider.h" #include "netdev-linux.h" #include "netdev-vport.h" #include "netlink-conntrack.h" @@ -55,6 +56,7 @@ #include "unaligned.h" #include "util.h" #include "openvswitch/vlog.h" +#include "openvswitch/match.h" VLOG_DEFINE_THIS_MODULE(dpif_netlink); #ifdef _WIN32 @@ -68,6 +70,8 @@ enum { MAX_PORTS = USHRT_MAX }; * missing if we have old headers. */ #define ETH_FLAG_LRO (1 << 15) /* LRO is enabled */ +#define FLOW_DUMP_MAX_BATCH 50 + struct dpif_netlink_dp { /* Generic Netlink header. */ uint8_t cmd; @@ -1355,6 +1359,10 @@ struct dpif_netlink_flow_dump { struct dpif_flow_dump up; struct nl_dump nl_dump; atomic_int status; + struct netdev_flow_dump **netdev_dumps; + int netdev_num; + int netdev_given; + struct ovs_mutex netdev_lock; }; static struct dpif_netlink_flow_dump * @@ -1363,6 +1371,34 @@ dpif_netlink_flow_dump_cast(struct dpif_flow_dump *dump) return CONTAINER_OF(dump, struct dpif_netlink_flow_dump, up); } +static void start_netdev_dump(const struct dpif *dpif_, + struct dpif_netlink_flow_dump *dump) { + + if (!netdev_flow_api_enabled) { + dump->netdev_num = 0; + return; + } + + struct netdev_list_element *element; + struct ovs_list port_list; + int ports = netdev_hmap_port_get_list(dpif_->dpif_class, &port_list); + int i = 0; + + dump->netdev_dumps = + ports ? 
xzalloc(sizeof(struct netdev_flow_dump *) * ports) : 0; + dump->netdev_num = ports; + dump->netdev_given = 0; + + LIST_FOR_EACH(element, node, &port_list) { + dump->netdev_dumps[i] = netdev_flow_dump_create(element->netdev); + dump->netdev_dumps[i]->port = element->port_no; + i++; + } + netdev_port_list_del(&port_list); + + ovs_mutex_init(&dump->netdev_lock); +} + static struct dpif_flow_dump * dpif_netlink_flow_dump_create(const struct dpif *dpif_, bool terse) { @@ -1387,6 +1423,8 @@ dpif_netlink_flow_dump_create(const struct dpif *dpif_, bool terse) atomic_init(&dump->status, 0); dump->up.terse = terse; + start_netdev_dump(dpif_, dump); + return &dump->up; } @@ -1397,6 +1435,16 @@ dpif_netlink_flow_dump_destroy(struct dpif_flow_dump *dump_) unsigned int nl_status = nl_dump_done(&dump->nl_dump); int dump_status; + if (netdev_flow_api_enabled) { + for (int i = 0; i < dump->netdev_num; i++) { + int err = netdev_flow_dump_destroy(dump->netdev_dumps[i]); + if (err != 0 && err != EOPNOTSUPP) { + VLOG_ERR("failed dumping netdev: %s", ovs_strerror(err)); + } + } + free(dump->netdev_dumps); + } + /* No other thread has access to 'dump' at this point. */ atomic_read_relaxed(&dump->status, &dump_status); free(dump); @@ -1410,6 +1458,11 @@ struct dpif_netlink_flow_dump_thread { struct dpif_flow_stats stats; struct ofpbuf nl_flows; /* Always used to store flows. */ struct ofpbuf *nl_actions; /* Used if kernel does not supply actions. 
*/ + struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH]; + struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH]; + struct odputil_keybuf actbuf[FLOW_DUMP_MAX_BATCH]; + int netdev_cur_dump; + bool netdev_done; }; static struct dpif_netlink_flow_dump_thread * @@ -1429,6 +1482,8 @@ dpif_netlink_flow_dump_thread_create(struct dpif_flow_dump *dump_) thread->dump = dump; ofpbuf_init(&thread->nl_flows, NL_DUMP_BUFSIZE); thread->nl_actions = NULL; + thread->netdev_cur_dump = 0; + thread->netdev_done = !(thread->netdev_cur_dump < dump->netdev_num); return &thread->up; } @@ -1466,6 +1521,90 @@ dpif_netlink_flow_to_dpif_flow(struct dpif *dpif, struct dpif_flow *dpif_flow, dpif_netlink_flow_get_stats(datapath_flow, &dpif_flow->stats); } +static void +dpif_netlink_advance_netdev_dump(struct dpif_netlink_flow_dump_thread *thread) +{ + struct dpif_netlink_flow_dump *dump = thread->dump; + + ovs_mutex_lock(&dump->netdev_lock); + /* if we haven't finished (dumped everything) */ + if (dump->netdev_given < dump->netdev_num) { + /* if we are the first to find that given dump is finished + * (for race condition, e.g 3 finish dump 0 at the same time) */ + if (thread->netdev_cur_dump == dump->netdev_given) { + thread->netdev_cur_dump = ++dump->netdev_given; + /* did we just finish the last dump? done. 
*/ + if (dump->netdev_given == dump->netdev_num) { + thread->netdev_done = true; + } + } else { + /* otherwise, we are behind, catch up */ + thread->netdev_cur_dump = dump->netdev_given; + } + } else { + /* some other thread finished */ + thread->netdev_done = true; + } + ovs_mutex_unlock(&dump->netdev_lock); +} + +static struct odp_support netdev_flow_support = { + .max_mpls_depth = SIZE_MAX, + .recirc = false, + .ct_state = false, + .ct_zone = false, + .ct_mark = false, + .ct_label = false, +}; + +static int +dpif_netlink_netdev_match_to_dpif_flow(struct match *match, + struct ofpbuf *key_buf, + struct ofpbuf *mask_buf, + struct nlattr *actions, + struct dpif_flow_stats *stats, + ovs_u128 *ufid, + struct dpif_flow *flow, + bool terse OVS_UNUSED) +{ + + struct odp_flow_key_parms odp_parms = { + .flow = &match->flow, + .mask = &match->wc.masks, + .support = netdev_flow_support, + }; + size_t offset; + + memset(flow, 0, sizeof *flow); + + /* Key */ + offset = key_buf->size; + flow->key = ofpbuf_tail(key_buf); + odp_flow_key_from_flow(&odp_parms, key_buf); + flow->key_len = key_buf->size - offset; + + /* Mask */ + offset = mask_buf->size; + flow->mask = ofpbuf_tail(mask_buf); + odp_parms.key_buf = key_buf; + odp_flow_key_from_mask(&odp_parms, mask_buf); + flow->mask_len = mask_buf->size - offset; + + /* Actions */ + flow->actions = nl_attr_get(actions); + flow->actions_len = nl_attr_get_size(actions); + + /* Stats */ + memcpy(&flow->stats, stats, sizeof *stats); + + /* UFID */ + flow->ufid_present = true; + flow->ufid = *ufid; + + flow->pmd_id = PMD_ID_NULL; + return 0; +} + static int dpif_netlink_flow_dump_next(struct dpif_flow_dump_thread *thread_, struct dpif_flow *flows, int max_flows) @@ -1475,11 +1614,51 @@ dpif_netlink_flow_dump_next(struct dpif_flow_dump_thread *thread_, struct dpif_netlink_flow_dump *dump = thread->dump; struct dpif_netlink *dpif = dpif_netlink_cast(thread->up.dpif); int n_flows; + int i = 0; ofpbuf_delete(thread->nl_actions); 
thread->nl_actions = NULL; n_flows = 0; + + while (!thread->netdev_done && n_flows < max_flows + && i < FLOW_DUMP_MAX_BATCH) { + struct odputil_keybuf *maskbuf = &thread->maskbuf[i]; + struct odputil_keybuf *keybuf = &thread->keybuf[i]; + struct odputil_keybuf *actbuf = &thread->actbuf[i]; + struct ofpbuf key, mask, act; + struct dpif_flow *f = &flows[n_flows]; + int cur = thread->netdev_cur_dump; + struct netdev_flow_dump *netdev_dump = dump->netdev_dumps[cur]; + struct match match; + struct nlattr *actions; + struct dpif_flow_stats stats; + ovs_u128 ufid; + bool has_next; + + ofpbuf_use_stack(&key, keybuf, sizeof *keybuf); + ofpbuf_use_stack(&act, actbuf, sizeof *actbuf); + ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf); + has_next = netdev_flow_dump_next(netdev_dump, &match, + &actions, &stats, + &ufid, + &thread->nl_flows, + &act); + if (has_next) { + dpif_netlink_netdev_match_to_dpif_flow(&match, + &key, &mask, + actions, + &stats, + &ufid, + f, + dump->up.terse); + n_flows++; + i++; + } else { + dpif_netlink_advance_netdev_dump(thread); + } + } + while (!n_flows || (n_flows < max_flows && thread->nl_flows.size)) { struct dpif_netlink_flow datapath_flow;