From a3f7956d9b44e7c0a965fc2e3031d2064134de7d Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 12 Mar 2024 19:39:11 +0200 Subject: [PATCH] scylla_cluster, scylla_node: watch_log_for_alive before watch_rest_for_alive https://github.com/scylladb/scylla-ccm/pull/462 introduced watch_rest_for_alive that replaced the calls to watch_log_for_alive on the scylla node(s) start path. But if a node is killed and then restarted, and other nodes miss the kill event, `watch_rest_for_alive` will consider that node as already up, as seen by the other nodes, whereas previously watch_log_for_alive waited until other nodes discovered this node as up again, based on marks taken right before (re)starting that node. This change brings this call back. Fixes scylladb/scylla-ccm#563 Signed-off-by: Benny Halevy --- ccmlib/scylla_cluster.py | 3 ++- ccmlib/scylla_node.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ccmlib/scylla_cluster.py b/ccmlib/scylla_cluster.py index 1dd56b20..9309f9f5 100644 --- a/ccmlib/scylla_cluster.py +++ b/ccmlib/scylla_cluster.py @@ -156,9 +156,10 @@ def start_nodes(self, nodes=None, no_wait=False, verbose=False, wait_for_binary_ verbose=verbose, from_mark=mark) if wait_other_notice: - for old_node, _ in marks: + for old_node, mark in marks: for node, _, _ in started: if old_node is not node: + old_node.watch_log_for_alive(node, from_mark=mark, timeout=self.default_wait_other_notice_timeout) old_node.watch_rest_for_alive(node, timeout=self.default_wait_other_notice_timeout, wait_normal_token_owner=wait_normal_token_owner) diff --git a/ccmlib/scylla_node.py b/ccmlib/scylla_node.py index 1e6d1cb8..b712ebde 100644 --- a/ccmlib/scylla_node.py +++ b/ccmlib/scylla_node.py @@ -332,8 +332,9 @@ def _start_scylla(self, args, marks, update_pid, self.wait_for_binary_interface(from_mark=from_mark, process=self._process_scylla, timeout=t) if wait_other_notice: - for node, _ in marks: + for node, mark in marks: t = 
self.cluster.default_wait_other_notice_timeout + node.watch_log_for_alive(self, from_mark=mark, timeout=t) node.watch_rest_for_alive(self, timeout=t, wait_normal_token_owner=wait_normal_token_owner) self.watch_rest_for_alive(node, timeout=t, wait_normal_token_owner=wait_normal_token_owner)