diff --git a/ccmlib/scylla_cluster.py b/ccmlib/scylla_cluster.py
index e758d1d8..511b505a 100644
--- a/ccmlib/scylla_cluster.py
+++ b/ccmlib/scylla_cluster.py
@@ -143,10 +143,10 @@ def start_nodes(self, nodes=None, no_wait=False, verbose=False, wait_for_binary_
                                 verbose=verbose, from_mark=mark)
 
         if wait_other_notice:
-            for old_node, mark in marks:
+            for old_node, _ in marks:
                 for node, _, _ in started:
                     if old_node is not node:
-                        old_node.watch_log_for_alive(node, from_mark=mark)
+                        old_node.watch_rest_for_alive(node)
 
         return started
 
diff --git a/ccmlib/scylla_node.py b/ccmlib/scylla_node.py
index d0ae76d6..8ce4323e 100644
--- a/ccmlib/scylla_node.py
+++ b/ccmlib/scylla_node.py
@@ -18,6 +18,7 @@
 import yaml
 import glob
 import re
+import requests
 
 from ccmlib.common import CASSANDRA_SH, BIN_DIR, wait_for, copy_directory
 
@@ -307,9 +308,10 @@ def _start_scylla(self, args, marks, update_pid, wait_other_notice,
                             self._process_scylla)
 
         if wait_other_notice:
-            for node, mark in marks:
+            for node, _ in marks:
                 t = timeout if timeout is not None else 120 if self.cluster.scylla_mode != 'debug' else 360
-                node.watch_log_for_alive(self, from_mark=mark, timeout=t)
+                node.watch_rest_for_alive(self, timeout=t)
+                self.watch_rest_for_alive(node, timeout=t)
 
         if wait_for_binary_proto:
             t = timeout * 4 if timeout is not None else 420 if self.cluster.scylla_mode != 'debug' else 900
@@ -1337,6 +1339,45 @@ def upgrade(self, upgrade_to_version):
     def rollback(self, upgrade_to_version):
         self.upgrader.upgrade(upgrade_version=upgrade_to_version, recover_system_tables=True)
 
+    def watch_rest_for_alive(self, nodes, timeout=120):
+        """
+        Use the REST API to wait until this node detects that the nodes listed
+        in "nodes" become fully operational and this node knows their tokens.
+        This is similar to watch_log_for_alive but uses ScyllaDB's REST API
+        instead of the log file and waits for the node to be really usable,
+        not just "UP" (see issue #461).
+        """
+        tofind = nodes if isinstance(nodes, list) else [nodes]
+        tofind = set([node.address() for node in tofind])
+        url_live = f"http://{self.address()}:10000/gossiper/endpoint/live"
+        url_joining = f"http://{self.address()}:10000/storage_service/nodes/joining"
+        url_tokens = f"http://{self.address()}:10000/storage_service/tokens/"
+        endtime = time.time() + timeout
+        while time.time() < endtime:
+            live = set()
+            response = requests.get(url=url_live)
+            if response.text:
+                live = set(response.json())
+            response = requests.get(url=url_joining)
+            if response.text:
+                live = live - set(response.json())
+            # Verify that the node knows not only about the existence of the
+            # other node, but also its tokens:
+            if tofind.issubset(live):
+                # This node thinks that all given nodes are alive and not
+                # "joining"; we're almost done, but still need to verify
+                # that the node knows the others' tokens.
+                check = tofind
+                tofind = set()
+                for n in check:
+                    response = requests.get(url=url_tokens + n)
+                    if response.text == '[]':
+                        tofind.add(n)
+                if not tofind:
+                    return
+            time.sleep(0.1)
+        raise TimeoutError(f"watch_rest_for_alive() timeout after {timeout} seconds")
+
     @property
     def gnutls_config_file(self):
         candidates = [
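
For reference, the same readiness check can be exercised by hand against a running node's REST API. The standalone sketch below mirrors the polling logic the patch adds; the function name and the plain address strings are illustrative assumptions, while the port, endpoints, and checks come from the patch itself:

    import time
    import requests

    def wait_until_node_sees(observer_addr, target_addr, timeout=120):
        """Poll observer_addr's REST API until it reports target_addr as
        live, not joining, and with a non-empty token list."""
        base = f"http://{observer_addr}:10000"
        deadline = time.time() + timeout
        while time.time() < deadline:
            response = requests.get(f"{base}/gossiper/endpoint/live")
            live = set(response.json()) if response.text else set()
            response = requests.get(f"{base}/storage_service/nodes/joining")
            if response.text:
                live -= set(response.json())
            if target_addr in live:
                # Live and not joining; also require that the observer
                # already knows the target's tokens.
                response = requests.get(f"{base}/storage_service/tokens/{target_addr}")
                if response.text != '[]':
                    return
            time.sleep(0.1)
        raise TimeoutError(f"{observer_addr} did not see {target_addr} within {timeout} seconds")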