[ovs-dev,v2,5/5] ovsdb: raft: Fix inability to join after leadership change round trip.

Message ID 20240326172717.1454071-6-i.maximets@ovn.org
State Accepted
Commit 5b9feacfc8ffc06837e965c5f17269e62e0a1668
Series ovsdb: raft: Fixes for cluster joining state.

Checks

Context                                Check    Description
ovsrobot/apply-robot                   success  apply and check: success
ovsrobot/github-robot-_Build_and_Test  success  github build: passed
ovsrobot/intel-ovs-compilation         success  test: success

Commit Message

Ilya Maximets March 26, 2024, 5:27 p.m. UTC
Consider the following sequence of events:

 1. Cluster with 2 nodes - A and B.  A is a leader.
 2. C connects to A and sends a join request.
 3. A sends an append request to C.  C is in CATCHUP phase for A.
 4. A loses leadership to B.  Sends join failure notification to C.
 5. C sends append reply to A.
 6. A discards append reply (not leader).
 7. B loses leadership back to A.
 8. C sends a new join request to A.
 9. A replies with failure (already in progress).
 10. GoTo step 8.

At this point, A is waiting for an append reply that it already
discarded at step 6, so it rejects every new attempt by C to join with
an 'already in progress' verdict.  C stays in the joining state forever
and remains in the CATCHUP phase from A's perspective.
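
As a rough illustration of the loop above, the following minimal,
self-contained C sketch models the pre-fix leader behavior.  All names
here (joining_set, handle_join_request, become_follower) are
hypothetical simplifications for illustration only; the real code
tracks joining servers in the 'raft->add_servers' hmap shown in the
patch below.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define MAX_JOINING 4

struct joining_set {
    const char *sids[MAX_JOINING];  /* Servers being added, akin to
                                     * 'raft->add_servers'. */
    int n;
};

static bool
is_joining(const struct joining_set *js, const char *sid)
{
    for (int i = 0; i < js->n; i++) {
        if (!strcmp(js->sids[i], sid)) {
            return true;
        }
    }
    return false;
}

/* Steps 2-3: the leader starts adding 'sid', or rejects a duplicate. */
static void
handle_join_request(struct joining_set *js, const char *sid)
{
    if (is_joining(js, sid)) {
        printf("reject %s: addition already in progress\n", sid);
    } else if (js->n < MAX_JOINING) {
        js->sids[js->n++] = sid;
        printf("start adding %s (CATCHUP)\n", sid);
    }
}

/* Step 4: leadership is lost.  Pre-fix behavior: the failure reply is
 * sent, but the entry is left in the set, so the server is still
 * considered "in progress" if leadership ever comes back. */
static void
become_follower(struct joining_set *js)
{
    for (int i = 0; i < js->n; i++) {
        printf("notify %s: lost leadership\n", js->sids[i]);
    }
}

int
main(void)
{
    struct joining_set js = { .n = 0 };

    handle_join_request(&js, "C");  /* Steps 2-3: C enters CATCHUP.      */
    become_follower(&js);           /* Steps 4-7: A loses and regains.   */
    handle_join_request(&js, "C");  /* Steps 8-9: rejected.              */
    handle_join_request(&js, "C");  /* Step 10: rejected again, forever. */
    return 0;
}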

This is a similar case to a sudden disconnect from a leader fixed in
commit 999ba294fb4f ("ovsdb: raft: Fix inability to join the cluster
after interrupted attempt."), but since we're not disconnecting, the
servers are not getting destroyed.

Fix that by destroying all the servers that are not yet part of the
configuration after leadership is lost.  This way, server C will be
able to simply re-start the joining process from scratch.
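
The mechanics of that cleanup are in the first raft.c hunk below: the
plain HMAP_FOR_EACH over 'raft->add_servers' becomes HMAP_FOR_EACH_POP
plus raft_server_destroy().  HMAP_FOR_EACH_POP is OVS's
iterate-and-remove variant, so the map is left empty and a later join
request from the same server starts from scratch.  A small standalone
sketch of that semantics (assuming it is built inside the OVS tree,
since it uses the lib/ hmap helpers):

#include <stdlib.h>

#include "hash.h"
#include "openvswitch/hmap.h"
#include "util.h"

struct item {
    struct hmap_node hmap_node;
    int id;
};

static void
drain_demo(void)
{
    struct hmap map = HMAP_INITIALIZER(&map);

    for (int i = 0; i < 3; i++) {
        struct item *it = xmalloc(sizeof *it);

        it->id = i;
        hmap_insert(&map, &it->hmap_node, hash_int(i, 0));
    }

    /* HMAP_FOR_EACH would leave all the nodes linked into 'map';
     * HMAP_FOR_EACH_POP unlinks each node as it is visited, so the map
     * drains while the entries are freed -- which is what
     * raft_become_follower() needs for 'raft->add_servers'. */
    struct item *it;
    HMAP_FOR_EACH_POP (it, hmap_node, &map) {
        free(it);
    }

    ovs_assert(hmap_is_empty(&map));
    hmap_destroy(&map);
}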

A new failure test command is added in order to simulate a leadership
change before the append reply is received, so that the reply gets
discarded.  A new cluster test is added to exercise this scenario.
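
The new command follows the existing two-stage failure-test pattern in
raft.c: the unixctl handler only arms a flag, and the add-server path
downgrades it to the plain transfer-leadership test once the initial
append request has been sent, so leadership changes before the reply
arrives.  A condensed sketch of that pattern (simplified placeholders,
not the exact raft.c code):

#include <string.h>

enum failure_test {
    FT_NO_TEST,
    FT_TRANSFER_LEADERSHIP,
    FT_TRANSFER_LEADERSHIP_AFTER_STARTING_TO_ADD,
};

static enum failure_test failure_test = FT_NO_TEST;

/* Stage 1: armed via "ovs-appctl -t ... cluster/failure-test <name>". */
static void
arm_failure_test(const char *name)
{
    if (!strcmp(name, "transfer-leadership-after-starting-to-add")) {
        failure_test = FT_TRANSFER_LEADERSHIP_AFTER_STARTING_TO_ADD;
    }
}

/* Stage 2: called right after the leader sends the first append request
 * to the joining server; from here on the generic transfer-leadership
 * test takes over and triggers the actual leadership change. */
static void
after_starting_to_add(void)
{
    if (failure_test == FT_TRANSFER_LEADERSHIP_AFTER_STARTING_TO_ADD) {
        failure_test = FT_TRANSFER_LEADERSHIP;
    }
}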

Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered databases.")
Reported-at: https://github.com/ovn-org/ovn/issues/235
Acked-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
---
 ovsdb/raft.c           | 16 ++++++++++++-
 tests/ovsdb-cluster.at | 53 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 1 deletion(-)

Patch

diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index 0c171b754..d81a1758a 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -81,6 +81,7 @@  enum raft_failure_test {
     FT_STOP_RAFT_RPC,
     FT_TRANSFER_LEADERSHIP,
     FT_TRANSFER_LEADERSHIP_AFTER_SEND_APPEND_REQ,
+    FT_TRANSFER_LEADERSHIP_AFTER_STARTING_TO_ADD,
 };
 static enum raft_failure_test failure_test;
 
@@ -2740,15 +2741,22 @@  raft_become_follower(struct raft *raft)
      * new configuration.  Our AppendEntries processing will properly update
      * the server configuration later, if necessary.
      *
+     * However, since we're sending replies about a failure to add, those new
+     * servers have to be cleaned up.  Otherwise, they will be stuck in the
+     * 'CATCHUP' phase in case this server regains leadership before they join
+     * through the current new leader.  They are not yet in 'raft->servers',
+     * so they are not part of the shared configuration.
+     *
      * Also we do not complete commands here, as they can still be completed
      * if their log entries have already been replicated to other servers.
      * If the entries were actually committed according to the new leader, our
      * AppendEntries processing will complete the corresponding commands.
      */
     struct raft_server *s;
-    HMAP_FOR_EACH (s, hmap_node, &raft->add_servers) {
+    HMAP_FOR_EACH_POP (s, hmap_node, &raft->add_servers) {
         raft_send_add_server_reply__(raft, &s->sid, s->address, false,
                                      RAFT_SERVER_LOST_LEADERSHIP);
+        raft_server_destroy(s);
     }
     if (raft->remove_server) {
         raft_send_remove_server_reply__(raft, &raft->remove_server->sid,
@@ -4023,6 +4031,10 @@  raft_handle_add_server_request(struct raft *raft,
                  "to cluster "CID_FMT, s->nickname, SID_ARGS(&s->sid),
                  rq->address, CID_ARGS(&raft->cid));
     raft_send_append_request(raft, s, 0, "initialize new server");
+
+    if (failure_test == FT_TRANSFER_LEADERSHIP_AFTER_STARTING_TO_ADD) {
+        failure_test = FT_TRANSFER_LEADERSHIP;
+    }
 }
 
 static void
@@ -5148,6 +5160,8 @@  raft_unixctl_failure_test(struct unixctl_conn *conn OVS_UNUSED,
     } else if (!strcmp(test,
                        "transfer-leadership-after-sending-append-request")) {
         failure_test = FT_TRANSFER_LEADERSHIP_AFTER_SEND_APPEND_REQ;
+    } else if (!strcmp(test, "transfer-leadership-after-starting-to-add")) {
+        failure_test = FT_TRANSFER_LEADERSHIP_AFTER_STARTING_TO_ADD;
     } else if (!strcmp(test, "transfer-leadership")) {
         failure_test = FT_TRANSFER_LEADERSHIP;
     } else if (!strcmp(test, "clear")) {
diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
index 482e4e02d..9d8b4d06a 100644
--- a/tests/ovsdb-cluster.at
+++ b/tests/ovsdb-cluster.at
@@ -525,6 +525,59 @@  for i in $(seq $n); do
     OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid])
 done
 
+AT_CLEANUP
+
+AT_SETUP([OVSDB cluster - leadership change before replication while joining])
+AT_KEYWORDS([ovsdb server negative unix cluster join])
+
+n=5
+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db dnl
+              $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
+cid=$(ovsdb-tool db-cid s1.db)
+schema_name=$(ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema)
+for i in $(seq 2 $n); do
+    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
+done
+
+on_exit 'kill $(cat *.pid)'
+on_exit "
+  for i in \$(ls $(pwd)/s[[0-$n]]); do
+    ovs-appctl --timeout 1 -t \$i cluster/status $schema_name;
+  done
+"
+
+dnl Start servers one by one, asking all existing servers to transfer
+dnl leadership right after starting to add a server.  The joining server
+dnl will need to find a new leader that will also transfer leadership.
+dnl This continues until some server becomes a leader for the second
+dnl time and is finally able to add the new server.
+for i in $(seq $n); do
+    dnl Make sure that all already started servers joined the cluster.
+    for j in $(seq $((i - 1)) ); do
+        AT_CHECK([ovsdb_client_wait unix:s$j.ovsdb $schema_name connected])
+    done
+    for j in $(seq $((i - 1)) ); do
+        OVS_WAIT_UNTIL([ovs-appctl -t "$(pwd)"/s$j \
+                          cluster/failure-test \
+                            transfer-leadership-after-starting-to-add \
+                        | grep -q "engaged"])
+    done
+
+    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \
+                           --detach --no-chdir --log-file=s$i.log \
+                           --pidfile=s$i.pid --unixctl=s$i \
+                           --remote=punix:s$i.ovsdb s$i.db])
+done
+
+dnl Make sure that all servers joined the cluster.
+for i in $(seq $n); do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+for i in $(seq $n); do
+    OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid])
+done
+
 AT_CLEANUP