From ca9352d54ad686b2738e2e46c841dc8432cab701 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 8 Nov 2023 18:07:02 +0200 Subject: [PATCH] scylla_node: watch_rest_for_alive: wait for others to be considered normal token owners It is not enough for a node to know about other nodes' tokens, as they might not be reflected in the token_metadata map. Instead, check the `/storage_service/host_id` API that provides a list of nodes that are normal token owners and ready to be used by queries. Refs https://github.com/scylladb/scylladb/issues/15146 Signed-off-by: Benny Halevy --- ccmlib/scylla_node.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/ccmlib/scylla_node.py b/ccmlib/scylla_node.py index a41bdde4..14d9638b 100644 --- a/ccmlib/scylla_node.py +++ b/ccmlib/scylla_node.py @@ -1344,18 +1344,21 @@ def hostid(self, timeout=60, force_refresh=False): def watch_rest_for_alive(self, nodes, timeout=120): """ Use the REST API to wait until this node detects that the nodes listed - in "nodes" become fully operational and knows of its tokens. + in "nodes" become fully operational as normal token owners. 
This is similar to watch_log_for_alive but uses ScyllaDB's REST API instead of the log file and waits for the node to be really useable, not just "UP" (see issue #461) """ logging.getLogger('urllib3.connectionpool').disabled = True try: - tofind = nodes if isinstance(nodes, list) else [nodes] - tofind = set([node.address() for node in tofind]) + nodes_tofind = nodes if isinstance(nodes, list) else [nodes] + tofind = set([node.address() for node in nodes_tofind]) + tofind_host_id_map = dict([(node.address(), node.hostid()) for node in nodes_tofind]) + found = set() + host_id_map = dict() url_live = f"http://{self.address()}:10000/gossiper/endpoint/live" url_joining = f"http://{self.address()}:10000/storage_service/nodes/joining" - url_tokens = f"http://{self.address()}:10000/storage_service/tokens/" + url_host_ids = f"http://{self.address()}:10000/storage_service/host_id" endtime = time.time() + timeout while time.time() < endtime: live = set() @@ -1366,20 +1369,25 @@ def watch_rest_for_alive(self, nodes, timeout=120): if response.status_code == requests.codes.ok: live = live - set(response.json()) # Verify that node knows not only about the existance of the - # other node, but also its tokens: + # other node, but also its host_id as a normal token owner: if tofind.issubset(live): # This node thinks that all given nodes are alive and not # "joining", we're almost done, but still need to verify - # that the node knows the others' tokens. - check = tofind - tofind = set() - for n in check: - response = requests.get(url=url_tokens+n) - if response.text == '[]': - tofind.add(n) - if not tofind: - return + # that the node knows that the others' are normal token owners. 
+ host_id_map = dict() + response = requests.get(url=url_host_ids) + if response.status_code == requests.codes.ok: + for r in response.json(): + host_id_map[r['key']] = r['value'] + # Verify that the other nodes are considered normal token owners on this node + # and their host_id matches the host_id the client knows about + found = set([addr for addr, id in host_id_map.items() \ + if addr in tofind_host_id_map and \ + (id == tofind_host_id_map[addr] or not tofind_host_id_map[addr])]) + if found == tofind: + return time.sleep(0.1) + self.debug(f"watch_rest_for_alive: found={found} tofind={tofind}: host_id_map={host_id_map} tofind_host_id_map={tofind_host_id_map}") raise TimeoutError(f"watch_rest_for_alive() timeout after {timeout} seconds") finally: logging.getLogger('urllib3.connectionpool').disabled = False