scylladb · fruch · Jul 19, 2023 · May 28, 2023 · Jun 6, 2023 · Jun 6, 2023
diff --git a/ccmlib/scylla_cluster.py b/ccmlib/scylla_cluster.py
@@ -143,10 +143,10 @@ def start_nodes(self, nodes=None, no_wait=False, verbose=False, wait_for_binary_
                                    verbose=verbose, from_mark=mark)
 
         if wait_other_notice:
-            for old_node, mark in marks:
+            for old_node, _ in marks:
                 for node, _, _ in started:
                     if old_node is not node:
-                        old_node.watch_log_for_alive(node, from_mark=mark)
+                        old_node.watch_rest_for_alive(node)
 
         return started
 

diff --git a/ccmlib/scylla_node.py b/ccmlib/scylla_node.py
@@ -18,6 +18,7 @@
 import yaml
 import glob
 import re
+import requests
 
 from ccmlib.common import CASSANDRA_SH, BIN_DIR, wait_for, copy_directory
 
@@ -307,9 +308,10 @@ def _start_scylla(self, args, marks, update_pid, wait_other_notice,
                                 self._process_scylla)
 
         if wait_other_notice:
-            for node, mark in marks:
+            for node, _ in marks:
                 t = timeout if timeout is not None else 120 if self.cluster.scylla_mode != 'debug' else 360
-                node.watch_log_for_alive(self, from_mark=mark, timeout=t)
+                node.watch_rest_for_alive(self, timeout=t)
+                self.watch_rest_for_alive(node, timeout=t)
 
         if wait_for_binary_proto:
             t = timeout * 4 if timeout is not None else 420 if self.cluster.scylla_mode != 'debug' else 900
@@ -1337,6 +1339,45 @@ def upgrade(self, upgrade_to_version):
     def rollback(self, upgrade_to_version):
         self.upgrader.upgrade(upgrade_version=upgrade_to_version, recover_system_tables=True)
 
+    def watch_rest_for_alive(self, nodes, timeout=120):
+        """
+        Use the REST API to wait until this node detects that the nodes listed
+        in "nodes" become fully operational and knows of its tokens.
+        This is similar to watch_log_for_alive but uses ScyllaDB's REST API
+        instead of the log file and waits for the node to be really useable,
+        not just "UP" (see issue #461)
+        """
+        tofind = nodes if isinstance(nodes, list) else [nodes]
+        tofind = set([node.address() for node in tofind])
+        url_live = f"http://{self.address()}:10000/gossiper/endpoint/live"
+        url_joining = f"http://{self.address()}:10000/storage_service/nodes/joining"
+        url_tokens = f"http://{self.address()}:10000/storage_service/tokens/"
+        endtime = time.time() + timeout
+        while time.time() < endtime:
+            live = set()
+            response = requests.get(url=url_live)
+            if response.text:
+                live = set(response.json())
+            response = requests.get(url=url_joining)
+            if response.text:
+                live = live - set(response.json())
+            # Verify that node knows not only about the existance of the
+            # other node, but also its tokens:
+            if tofind.issubset(live):
+                # This node thinks that all given nodes are alive and not
+                # "joining", we're almost done, but still need to verify
+                # that the node knows the others' tokens.
+                check = tofind
+                tofind = set()
+                for n in check:
+                    response = requests.get(url=url_tokens+n)
+                    if response.text == '[]':
+                        tofind.add(n)
+            if not tofind:
+                return
+            time.sleep(0.1)
+        raise TimeoutError(f"watch_rest_for_alive() timeout after {timeout} seconds")
+
     @property
     def gnutls_config_file(self):
         candidates = [