diff mbox series

[ovs-dev,branch-3.3] ofproto-dpif-upcall: Avoid stale ukeys leaks.

Message ID 20240910151342.918730-1-aconole@redhat.com
State Accepted
Commit c64206c6e04e00487b584eb551564b6921710cc7
Delegated to: Eelco Chaudron
Headers show
Series [ovs-dev,branch-3.3] ofproto-dpif-upcall: Avoid stale ukeys leaks. | expand

Checks

Context Check Description
ovsrobot/apply-robot warning apply and check: warning
ovsrobot/github-robot-_Build_and_Test fail github build: failed

Commit Message

Aaron Conole Sept. 10, 2024, 3:13 p.m. UTC
From: Eelco Chaudron <echaudro@redhat.com>

It is observed in some environments that there are much more ukeys than
actual DP flows. For example:

$ ovs-appctl upcall/show
system@ovs-system:
flows : (current 7) (avg 6) (max 117) (limit 2125)
offloaded flows : 525
dump duration : 1063ms
ufid enabled : true

23: (keys 3612)
24: (keys 3625)
25: (keys 3485)

The revalidator threads are busy revalidating the stale ukeys leading to
high CPU and long dump duration.

This patch tracks the number of consecutive missed dumps. If four dumps
are missed in a row, it is assumed that the datapath flow no longer
exists, and the ukey can be deleted.

Reported-by: Roi Dayan <roid@nvidia.com>
Co-authored-by: Han Zhou <hzhou@ovn.org>
Co-authored-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: Aaron Conole <aconole@redhat.com>

NOTE: Backported a portion of
600125b2c380 ("ofproto: Add ofproto/detrace command to map UFIDs to OpenFlow.")
to provide `strip_dp_hash` macro.
---
 ofproto/ofproto-dpif-upcall.c | 16 +++++++++++++
 tests/ofproto-dpif.at         | 45 +++++++++++++++++++++++++++++++++++
 tests/ofproto-macros.at       |  5 ++++
 3 files changed, 66 insertions(+)

Comments

Roi Dayan Sept. 11, 2024, 7:48 a.m. UTC | #1
On 9/10/24 18:13, Aaron Conole wrote:
> From: Eelco Chaudron <echaudro@redhat.com>
> 
> It is observed in some environments that there are much more ukeys than
> actual DP flows. For example:
> 
> $ ovs-appctl upcall/show
> system@ovs-system:
> flows : (current 7) (avg 6) (max 117) (limit 2125)
> offloaded flows : 525
> dump duration : 1063ms
> ufid enabled : true
> 
> 23: (keys 3612)
> 24: (keys 3625)
> 25: (keys 3485)
> 
> The revalidator threads are busy revalidating the stale ukeys leading to
> high CPU and long dump duration.
> 
> This patch tracks the number of consecutive missed dumps. If four dumps
> are missed in a row, it is assumed that the datapath flow no longer
> exists, and the ukey can be deleted.
> 
> Reported-by: Roi Dayan <roid@nvidia.com>
> Co-authored-by: Han Zhou <hzhou@ovn.org>
> Co-authored-by: Roi Dayan <roid@nvidia.com>
> Signed-off-by: Han Zhou <hzhou@ovn.org>
> Signed-off-by: Roi Dayan <roid@nvidia.com>
> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
> Signed-off-by: Aaron Conole <aconole@redhat.com>
> 
> NOTE: Backported a portion of
> 600125b2c380 ("ofproto: Add ofproto/detrace command to map UFIDs to OpenFlow.")
> to provide `strip_dp_hash` macro.
> ---
>   ofproto/ofproto-dpif-upcall.c | 16 +++++++++++++
>   tests/ofproto-dpif.at         | 45 +++++++++++++++++++++++++++++++++++
>   tests/ofproto-macros.at       |  5 ++++
>   3 files changed, 66 insertions(+)
> 
> diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
> index a046f8a339..f122b47f1c 100644
> --- a/ofproto/ofproto-dpif-upcall.c
> +++ b/ofproto/ofproto-dpif-upcall.c
> @@ -57,6 +57,7 @@ COVERAGE_DEFINE(dumped_inconsistent_flow);
>   COVERAGE_DEFINE(dumped_new_flow);
>   COVERAGE_DEFINE(handler_duplicate_upcall);
>   COVERAGE_DEFINE(revalidate_missed_dp_flow);
> +COVERAGE_DEFINE(revalidate_missing_dp_flow);
>   COVERAGE_DEFINE(ukey_dp_change);
>   COVERAGE_DEFINE(ukey_invalid_stat_reset);
>   COVERAGE_DEFINE(ukey_replace_contention);
> @@ -302,6 +303,7 @@ struct udpif_key {
>       uint64_t dump_seq OVS_GUARDED;            /* Tracks udpif->dump_seq. */
>       uint64_t reval_seq OVS_GUARDED;           /* Tracks udpif->reval_seq. */
>       enum ukey_state state OVS_GUARDED;        /* Tracks ukey lifetime. */
> +    uint32_t missed_dumps OVS_GUARDED;        /* Missed consecutive dumps. */
>   
>       /* 'state' debug information. */
>       unsigned int state_thread OVS_GUARDED;    /* Thread that transitions. */
> @@ -2995,6 +2997,20 @@ revalidator_sweep__(struct revalidator *revalidator, bool purge)
>                       result = revalidate_ukey(udpif, ukey, &stats, &odp_actions,
>                                                reval_seq, &recircs);
>                   }
> +
> +                if (ukey->dump_seq != dump_seq) {
> +                    ukey->missed_dumps++;
> +                    if (ukey->missed_dumps >= 4) {
> +                        /* If the flow was not dumped for 4 revalidator rounds,
> +                         * we can assume the datapath flow no longer exists
> +                         * and the ukey should be deleted. */
> +                        COVERAGE_INC(revalidate_missing_dp_flow);
> +                        result = UKEY_DELETE;
> +                    }
> +                } else {
> +                    ukey->missed_dumps = 0;
> +                }
> +
>                   if (result != UKEY_KEEP) {
>                       /* Clears 'recircs' if filled by revalidate_ukey(). */
>                       reval_op_init(&ops[n_ops++], result, udpif, ukey, &recircs,
> diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
> index 0b23fd6c5e..d5aa9145c7 100644
> --- a/tests/ofproto-dpif.at
> +++ b/tests/ofproto-dpif.at
> @@ -12136,3 +12136,48 @@ AT_CHECK([test 1 = `ovs-ofctl parse-pcap p2-tx.pcap | wc -l`])
>   
>   OVS_VSWITCHD_STOP
>   AT_CLEANUP
> +
> +AT_SETUP([ofproto-dpif - Cleanup missing datapath flows])
> +
> +OVS_VSWITCHD_START
> +add_of_ports br0 1 2
> +
> +m4_define([ICMP_PKT], [m4_join([,],
> +    [eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800)],
> +    [ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no)],
> +    [icmp(type=8,code=0)])])
> +
> +AT_CHECK([ovs-ofctl del-flows br0])
> +AT_CHECK([ovs-ofctl add-flow br0 'actions=normal' ])
> +
> +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ICMP_PKT'])
> +
> +AT_CHECK([ovs-appctl dpctl/dump-flows --names | strip_used | strip_stats | dnl
> +          strip_duration | strip_dp_hash | sort], [0], [dnl
> +flow-dump from the main thread:
> +recirc_id(0),in_port(p1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:never, actions:br0,p2
> +])
> +
> +dnl Make sure the ukey exists.
> +AT_CHECK([ovs-appctl upcall/show | grep '(keys' | awk '{print $3}' | \
> +            grep -q '1)'], [0])
> +
> +dnl Delete all datapath flows, and make sure they are gone.
> +AT_CHECK([ovs-appctl dpctl/del-flows])
> +AT_CHECK([ovs-appctl dpctl/dump-flows --names ], [0], [])
> +
> +dnl Move forward in time and make sure we have at least 4 * 500ms.
> +AT_CHECK([ovs-appctl time/warp 3000 300], [0], [ignore])
> +
> +dnl Make sure no more ukeys exists.
> +AT_CHECK([ovs-appctl upcall/show | grep '(keys' | awk '{print $3}' | \
> +            grep -qv '0)'], [1])
> +
> +dnl Verify coverage counter was hit.
> +AT_CHECK([ovs-appctl coverage/read-counter revalidate_missing_dp_flow], [0],
> +         [dnl
> +1
> +])
> +
> +OVS_VSWITCHD_STOP(["/failed to flow_del (No such file or directory)/d"])
> +AT_CLEANUP
> diff --git a/tests/ofproto-macros.at b/tests/ofproto-macros.at
> index c22fb3c79c..3795ca7149 100644
> --- a/tests/ofproto-macros.at
> +++ b/tests/ofproto-macros.at
> @@ -169,6 +169,11 @@ strip_recirc() {
>           s/recirc_id=[[x0-9]]*/recirc_id=<recirc>/
>           s/recirc([[x0-9]]*)/recirc(<recirc>)/'
>   }
> +
> +# Strips dp_hash from output.
> +strip_dp_hash() {
> +    sed 's/dp_hash([[0-9a-fx/]]*),//'
> +}
>   m4_divert_pop([PREPARE_TESTS])
>   
>   m4_define([TESTABLE_LOG], [-vPATTERN:ANY:'%c|%p|%m'])

thanks
Acked-by: Roi Dayan <roid@nvidia.com>
Eelco Chaudron Sept. 11, 2024, 8:46 a.m. UTC | #2
On 10 Sep 2024, at 17:13, Aaron Conole wrote:

> From: Eelco Chaudron <echaudro@redhat.com>
>
> It is observed in some environments that there are much more ukeys than
> actual DP flows. For example:
>
> $ ovs-appctl upcall/show
> system@ovs-system:
> flows : (current 7) (avg 6) (max 117) (limit 2125)
> offloaded flows : 525
> dump duration : 1063ms
> ufid enabled : true
>
> 23: (keys 3612)
> 24: (keys 3625)
> 25: (keys 3485)
>
> The revalidator threads are busy revalidating the stale ukeys leading to
> high CPU and long dump duration.
>
> This patch tracks the number of consecutive missed dumps. If four dumps
> are missed in a row, it is assumed that the datapath flow no longer
> exists, and the ukey can be deleted.
>
> Reported-by: Roi Dayan <roid@nvidia.com>
> Co-authored-by: Han Zhou <hzhou@ovn.org>
> Co-authored-by: Roi Dayan <roid@nvidia.com>
> Signed-off-by: Han Zhou <hzhou@ovn.org>
> Signed-off-by: Roi Dayan <roid@nvidia.com>
> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
> Signed-off-by: Aaron Conole <aconole@redhat.com>
>
> NOTE: Backported a portion of
> 600125b2c380 ("ofproto: Add ofproto/detrace command to map UFIDs to OpenFlow.")
> to provide `strip_dp_hash` macro.

Thanks Aaron for doing all the backport patches on my behalf.

Acked-by: Eelco Chaudron <echaudro@redhat.com>
diff mbox series

Patch

diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
index a046f8a339..f122b47f1c 100644
--- a/ofproto/ofproto-dpif-upcall.c
+++ b/ofproto/ofproto-dpif-upcall.c
@@ -57,6 +57,7 @@  COVERAGE_DEFINE(dumped_inconsistent_flow);
 COVERAGE_DEFINE(dumped_new_flow);
 COVERAGE_DEFINE(handler_duplicate_upcall);
 COVERAGE_DEFINE(revalidate_missed_dp_flow);
+COVERAGE_DEFINE(revalidate_missing_dp_flow);
 COVERAGE_DEFINE(ukey_dp_change);
 COVERAGE_DEFINE(ukey_invalid_stat_reset);
 COVERAGE_DEFINE(ukey_replace_contention);
@@ -302,6 +303,7 @@  struct udpif_key {
     uint64_t dump_seq OVS_GUARDED;            /* Tracks udpif->dump_seq. */
     uint64_t reval_seq OVS_GUARDED;           /* Tracks udpif->reval_seq. */
     enum ukey_state state OVS_GUARDED;        /* Tracks ukey lifetime. */
+    uint32_t missed_dumps OVS_GUARDED;        /* Missed consecutive dumps. */
 
     /* 'state' debug information. */
     unsigned int state_thread OVS_GUARDED;    /* Thread that transitions. */
@@ -2995,6 +2997,20 @@  revalidator_sweep__(struct revalidator *revalidator, bool purge)
                     result = revalidate_ukey(udpif, ukey, &stats, &odp_actions,
                                              reval_seq, &recircs);
                 }
+
+                if (ukey->dump_seq != dump_seq) {
+                    ukey->missed_dumps++;
+                    if (ukey->missed_dumps >= 4) {
+                        /* If the flow was not dumped for 4 revalidator rounds,
+                         * we can assume the datapath flow no longer exists
+                         * and the ukey should be deleted. */
+                        COVERAGE_INC(revalidate_missing_dp_flow);
+                        result = UKEY_DELETE;
+                    }
+                } else {
+                    ukey->missed_dumps = 0;
+                }
+
                 if (result != UKEY_KEEP) {
                     /* Clears 'recircs' if filled by revalidate_ukey(). */
                     reval_op_init(&ops[n_ops++], result, udpif, ukey, &recircs,
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index 0b23fd6c5e..d5aa9145c7 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -12136,3 +12136,48 @@  AT_CHECK([test 1 = `ovs-ofctl parse-pcap p2-tx.pcap | wc -l`])
 
 OVS_VSWITCHD_STOP
 AT_CLEANUP
+
+AT_SETUP([ofproto-dpif - Cleanup missing datapath flows])
+
+OVS_VSWITCHD_START
+add_of_ports br0 1 2
+
+m4_define([ICMP_PKT], [m4_join([,],
+    [eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800)],
+    [ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no)],
+    [icmp(type=8,code=0)])])
+
+AT_CHECK([ovs-ofctl del-flows br0])
+AT_CHECK([ovs-ofctl add-flow br0 'actions=normal' ])
+
+AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ICMP_PKT'])
+
+AT_CHECK([ovs-appctl dpctl/dump-flows --names | strip_used | strip_stats | dnl
+          strip_duration | strip_dp_hash | sort], [0], [dnl
+flow-dump from the main thread:
+recirc_id(0),in_port(p1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:never, actions:br0,p2
+])
+
+dnl Make sure the ukey exists.
+AT_CHECK([ovs-appctl upcall/show | grep '(keys' | awk '{print $3}' | \
+            grep -q '1)'], [0])
+
+dnl Delete all datapath flows, and make sure they are gone.
+AT_CHECK([ovs-appctl dpctl/del-flows])
+AT_CHECK([ovs-appctl dpctl/dump-flows --names ], [0], [])
+
+dnl Move forward in time and make sure we have at least 4 * 500ms.
+AT_CHECK([ovs-appctl time/warp 3000 300], [0], [ignore])
+
+dnl Make sure no more ukeys exists.
+AT_CHECK([ovs-appctl upcall/show | grep '(keys' | awk '{print $3}' | \
+            grep -qv '0)'], [1])
+
+dnl Verify coverage counter was hit.
+AT_CHECK([ovs-appctl coverage/read-counter revalidate_missing_dp_flow], [0],
+         [dnl
+1
+])
+
+OVS_VSWITCHD_STOP(["/failed to flow_del (No such file or directory)/d"])
+AT_CLEANUP
diff --git a/tests/ofproto-macros.at b/tests/ofproto-macros.at
index c22fb3c79c..3795ca7149 100644
--- a/tests/ofproto-macros.at
+++ b/tests/ofproto-macros.at
@@ -169,6 +169,11 @@  strip_recirc() {
         s/recirc_id=[[x0-9]]*/recirc_id=<recirc>/
         s/recirc([[x0-9]]*)/recirc(<recirc>)/'
 }
+
+# Strips dp_hash from output.
+strip_dp_hash() {
+    sed 's/dp_hash([[0-9a-fx/]]*),//'
+}
 m4_divert_pop([PREPARE_TESTS])
 
 m4_define([TESTABLE_LOG], [-vPATTERN:ANY:'%c|%p|%m'])