Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scylla_cluster: fix handling of wait_other_notice #462

Merged
merged 3 commits into from
Jul 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions ccmlib/scylla_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,10 @@ def start_nodes(self, nodes=None, no_wait=False, verbose=False, wait_for_binary_
verbose=verbose, from_mark=mark)

if wait_other_notice:
for old_node, mark in marks:
for old_node, _ in marks:
for node, _, _ in started:
if old_node is not node:
old_node.watch_log_for_alive(node, from_mark=mark)
old_node.watch_rest_for_alive(node)

return started

Expand Down
45 changes: 43 additions & 2 deletions ccmlib/scylla_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import yaml
import glob
import re
import requests

from ccmlib.common import CASSANDRA_SH, BIN_DIR, wait_for, copy_directory

Expand Down Expand Up @@ -307,9 +308,10 @@ def _start_scylla(self, args, marks, update_pid, wait_other_notice,
self._process_scylla)

if wait_other_notice:
for node, mark in marks:
for node, _ in marks:
t = timeout if timeout is not None else 120 if self.cluster.scylla_mode != 'debug' else 360
node.watch_log_for_alive(self, from_mark=mark, timeout=t)
node.watch_rest_for_alive(self, timeout=t)
self.watch_rest_for_alive(node, timeout=t)
fruch marked this conversation as resolved.
Show resolved Hide resolved

if wait_for_binary_proto:
t = timeout * 4 if timeout is not None else 420 if self.cluster.scylla_mode != 'debug' else 900
Expand Down Expand Up @@ -1337,6 +1339,45 @@ def upgrade(self, upgrade_to_version):
def rollback(self, upgrade_to_version):
self.upgrader.upgrade(upgrade_version=upgrade_to_version, recover_system_tables=True)

def watch_rest_for_alive(self, nodes, timeout=120):
"""
Use the REST API to wait until this node detects that the nodes listed
in "nodes" become fully operational and knows of its tokens.
This is similar to watch_log_for_alive but uses ScyllaDB's REST API
instead of the log file and waits for the node to be really useable,
not just "UP" (see issue #461)
"""
tofind = nodes if isinstance(nodes, list) else [nodes]
tofind = set([node.address() for node in tofind])
url_live = f"http://{self.address()}:10000/gossiper/endpoint/live"
url_joining = f"http://{self.address()}:10000/storage_service/nodes/joining"
url_tokens = f"http://{self.address()}:10000/storage_service/tokens/"
endtime = time.time() + timeout
while time.time() < endtime:
live = set()
response = requests.get(url=url_live)
if response.text:
live = set(response.json())
response = requests.get(url=url_joining)
if response.text:
live = live - set(response.json())
# Verify that node knows not only about the existance of the
# other node, but also its tokens:
if tofind.issubset(live):
# This node thinks that all given nodes are alive and not
# "joining", we're almost done, but still need to verify
# that the node knows the others' tokens.
check = tofind
tofind = set()
for n in check:
response = requests.get(url=url_tokens+n)
if response.text == '[]':
tofind.add(n)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder how this would work with topology-related tests that currently track the log to inject some event during e.g. bootstrap or replace.
Although they can pass wait_other_notice=False and implement their own state tracking but they might break with this change.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know what kind of tests you mean, but we can see if any tests starts failing with this patch.
I think that if a certain test wants to capture some exact moment during the middle of the boot process, it can't ask ccm to wait until the boot is ove.

if not tofind:
return
time.sleep(0.1)
fruch marked this conversation as resolved.
Show resolved Hide resolved
raise TimeoutError(f"watch_rest_for_alive() timeout after {timeout} seconds")

@property
def gnutls_config_file(self):
candidates = [
Expand Down