Skip to content

Commit

Permalink
improvement(setup): improve caching of local tarballs
Browse files Browse the repository at this point in the history
If the test version is unstable (not a release, possibly a private branch) and the installation folder exists,
we want to check whether this version was already downloaded and installed in the past and has
changed since.
In that case the version should be downloaded and installed again.
Compare the hash of the saved version (it is saved in the 'scylla-core-package/source.txt' file) with the hash
of the new package.
If they are the same, skip the download. If not, remove the existing folder and download again.
  • Loading branch information
juliayakovlev committed Feb 1, 2024
1 parent c2bba25 commit ae4bc72
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 14 deletions.
23 changes: 19 additions & 4 deletions ccmlib/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@
import pathlib
from itertools import zip_longest
from typing import Callable, Optional, TextIO, Union, List
from pathlib import Path

import yaml
from boto3.session import Session
from botocore import UNSIGNED
from botocore.client import Config


BIN_DIR = "bin"
CASSANDRA_CONF_DIR = "conf"
DSE_CASSANDRA_CONF_DIR = "resources/cassandra/conf"
Expand Down Expand Up @@ -644,14 +646,15 @@ def scylla_extract_mode(path):


def scylla_extract_install_dir_and_mode(install_dir):
from ccmlib.scylla_repository import CORE_PACKAGE_DIR_NAME, SOURCE_FILE_NAME # to prevent failure due to a circular import
scylla_mode = scylla_extract_mode(install_dir)
if scylla_mode:
install_dir = str(os.path.join(install_dir, os.pardir, os.pardir))
else:
scylla_mode = 'release'
if os.path.exists(os.path.join(install_dir, 'scylla-core-package')):
if os.path.exists(os.path.join(install_dir, CORE_PACKAGE_DIR_NAME)):
try:
f = open(os.path.join(install_dir, 'scylla-core-package', 'source.txt'), 'r')
f = open(os.path.join(install_dir, CORE_PACKAGE_DIR_NAME, SOURCE_FILE_NAME), 'r')
for l in f.readlines():
if l.startswith('url='):
scylla_mode = scylla_extract_mode(l) or scylla_mode
Expand Down Expand Up @@ -685,8 +688,6 @@ def wait_for(func: Callable, timeout: int, first: float = 0.0, step: float = 1.0
return False




def validate_install_dir(install_dir):
if install_dir is None:
raise ArgumentError('Undefined installation directory')
Expand Down Expand Up @@ -973,3 +974,17 @@ def print_if_standalone(*args, debug_callback=None, end='\n', **kwargs):
print(*args, *kwargs, end=end)
else:
debug_callback(*args, **kwargs)


def get_installed_scylla_package_hash(source_file: Union[Path, str]) -> str:
    """Return the package hash recorded in an installed package's source file.

    The file is scanned for lines starting with ``hash=``; the text after
    that prefix (surrounding whitespace stripped) is the hash. If several
    such lines are present, the last one wins.

    :param source_file: path of the source file to read (``Path`` or ``str``).
    :return: the recorded hash, or an empty string when the file does not
        exist or contains no ``hash=`` line.
    """
    prefix = "hash="
    current_hash = ""

    # Open directly instead of checking existence first: the file may be removed
    # between an existence check and the open. Without the file there is simply
    # no hash to compare against.
    try:
        with open(source_file, 'r') as f:
            for line in f:
                if line.startswith(prefix):
                    # Drop only the leading prefix; the value itself is kept verbatim.
                    current_hash = line[len(prefix):].strip()
    except (FileNotFoundError, NotADirectoryError):
        return ""
    return current_hash
53 changes: 46 additions & 7 deletions ccmlib/scylla_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,15 @@

from ccmlib.common import (
ArgumentError, CCMError, get_default_path, rmdirs, validate_install_dir, get_scylla_version, aws_bucket_ls,
DOWNLOAD_IN_PROGRESS_FILE, print_if_standalone, LockFile)
from ccmlib.utils.download import download_file, download_version_from_s3, get_url_hash
DOWNLOAD_IN_PROGRESS_FILE, print_if_standalone, LockFile, get_installed_scylla_package_hash)
from ccmlib.utils.download import download_file, download_version_from_s3, get_url_hash, save_source_file
from ccmlib.utils.version import parse_version

GIT_REPO = "http://github.com/scylladb/scylla.git"

CORE_PACKAGE_DIR_NAME = 'scylla-core-package'
SCYLLA_VERSION_FILE = 'SCYLLA-VERSION-FILE'
SOURCE_FILE_NAME = "source.txt"

RELOCATABLE_URLS_BASE = ['https://s3.amazonaws.com/downloads.scylladb.com/unstable/scylla/{0}/relocatable/{1}',
'https://s3.amazonaws.com/downloads.scylladb.com/unstable/scylla-enterprise/{0}/relocatable/{1}',
Expand Down Expand Up @@ -244,7 +245,14 @@ def setup(version, verbose=True, skip_downloads=False):
type_n_version = version.split(os.path.sep, 1)
version_dir = version_directory(version) if not skip_downloads else None

if len(type_n_version) == 2 and version_dir is None:
# If the test version is unstable (not a release, possibly a private branch) and the installation folder exists,
# we want to check whether this version was already downloaded and installed in the past and has changed since.
# In that case the version should be downloaded and installed again.
# Compare the hash of the saved version (it is saved in the 'scylla-core-package/source.txt' file) with the hash of the new package.
# If they are the same, skip the download. If not, remove the existing folder and download again.
validate_by_hash = version_dir is not None and type_n_version[0] != "release"

if len(type_n_version) == 2 and (version_dir is None or validate_by_hash):
s3_version = type_n_version[1]

if type_n_version[0] == 'release':
Expand Down Expand Up @@ -303,6 +311,32 @@ def setup(version, verbose=True, skip_downloads=False):
if skip_downloads:
return directory_name(version), packages

if validate_by_hash and packages:
"""
Validate if packages hash was changed and the new package(s) have to be downloaded
"""
map_field_to_dir_name = {"scylla_unified_package": CORE_PACKAGE_DIR_NAME,
"scylla_package": CORE_PACKAGE_DIR_NAME,
"scylla_tools_package": "scylla-tools-java",
"scylla_jmx_package": "scylla-jmx"
}
for package in zip(packages._fields, packages):
if not package[1]:
continue

new_hash = get_url_hash(package[1])
package_dir = map_field_to_dir_name[package[0]]
current_hash = get_installed_scylla_package_hash(source_file=Path(version_dir) / package_dir / SOURCE_FILE_NAME)
# The current hash may be missing. This can be due to an incomplete download and installation,
# or because it is an old installation that did not save the hash yet.
# In any case a new download and installation should be performed.
# For this we need to remove the existing folder and start downloading again.
if not (new_hash and current_hash) or new_hash != current_hash:
# remove version_dir
rmdirs(version_dir)
version_dir = None
break

if version_dir is None:
# Create version folder and add placeholder file to prevent parallel downloading from another test.
version_dir = directory_name(version)
Expand Down Expand Up @@ -490,10 +524,15 @@ def download_version(version, url=None, verbose=False, target_dir=None, unified=
# add breadcrumb so we could list the origin of each part easily for debugging
# for example listing all the version we have in ccm scylla-repository
# find ~/.ccm/scylla-repository/*/ -iname source.txt | xargs cat
source_breadcrumb_file = os.path.join(target_dir, 'source.txt')
with open(source_breadcrumb_file, 'w') as f:
f.write(f"version={version}\n")
f.write(f"url={url}\n")
source_breadcrumb_file = os.path.join(target_dir, SOURCE_FILE_NAME)
# To improve caching of local tarballs, save hash of current package.
# In case the relocatable package was downloaded in the past and saved locally, by comparing of package hash we can decide
# if the package was changed and we need to download it again
url_hash = get_url_hash(url=url)
save_source_file(source_file=source_breadcrumb_file,
version=version,
url=url,
url_hash=url_hash)

return package_version
except urllib.error.URLError as e:
Expand Down
10 changes: 9 additions & 1 deletion ccmlib/utils/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,8 @@ def get_url_hash(url: str) -> str:
"""

if os.path.exists(url): # if file/dir is local, hash based on the path
return hashlib.md5(url).hexdigest()
# Strings must be encoded before hashing
return hashlib.md5(url.encode('utf-8')).hexdigest()

# first try is on s3
parts = urllib.parse.urlparse(url)
Expand All @@ -185,3 +186,10 @@ def get_url_hash(url: str) -> str:
except botocore.client.ClientError:
# fallback to http
return requests.head(url).headers.get('ETag')[1:-1]


def save_source_file(source_file: str, version: str, url: str, url_hash: str):
    """Write the source breadcrumb file for a package.

    The file is (re)created with three ``key=value`` lines, in this order:
    ``version``, ``url`` and ``hash``.

    :param source_file: path of the file to write; existing content is overwritten.
    :param version: value stored on the ``version=`` line.
    :param url: value stored on the ``url=`` line.
    :param url_hash: value stored on the ``hash=`` line.
    """
    breadcrumb_lines = (
        f"version={version}\n",
        f"url={url}\n",
        f"hash={url_hash}\n",
    )
    with open(source_file, 'w') as out:
        out.writelines(breadcrumb_lines)
63 changes: 61 additions & 2 deletions tests/test_scylla_repository.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import time
import typing
from pathlib import Path
import random

import pytest

from ccmlib.scylla_repository import setup as scylla_setup
from ccmlib.scylla_repository import setup as scylla_setup, CORE_PACKAGE_DIR_NAME, SOURCE_FILE_NAME
from ccmlib.scylla_repository import (
get_manager_release_url,
get_manager_latest_reloc_url,
Architecture,
)
from ccmlib.utils.download import get_url_hash


@pytest.mark.repo_tests
Expand Down Expand Up @@ -42,7 +46,6 @@ def test_setup_unstable_enterprise_new_url(self):
assert version == '2023.3.0-dev'



class TestScyllaRepositoryRelease:
@pytest.mark.parametrize(argnames=['version', 'expected_cdir'], argvalues=[
("release:5.1", 'release/5.1'),
Expand Down Expand Up @@ -115,6 +118,62 @@ def test_setup_unstable_master_new_url(self):
assert packages.scylla_tools_package == 'https://s3.amazonaws.com/downloads.scylladb.com/unstable/scylla/master/relocatable/2021-01-18T15:48:13Z/scylla-tools-package.tar.gz'
assert packages.scylla_jmx_package == 'https://s3.amazonaws.com/downloads.scylladb.com/unstable/scylla/master/relocatable/2021-01-18T15:48:13Z/scylla-jmx-package.tar.gz'

@staticmethod
def corrupt_hash_value(source_file):
    """Rewrite *source_file* in place, inserting "123" right after every "hash=".

    :param source_file: path object exposing ``read_text`` / ``write_text``.
    """
    original_text = source_file.read_text()
    source_file.write_text(original_text.replace("hash=", "hash=123"))

def test_setup_unstable_master_no_unified_package_reinstall(self):
    """
    Validate that if a package hash has changed, the packages are downloaded again.
    - download the Scylla packages. Each package hash is saved in the "source.txt" file under the relevant
      package folder
    - change the hash to be wrong for one of the packages (the package is chosen randomly). No matter which
      package has the wrong hash - all packages should be re-downloaded
    - run setup again. It is expected that the packages will be downloaded again, so the second setup should
      not be short: the test requires it to take more than 20 seconds.
      NOTE(review): the original note quoted about 5 without download and about 35 with download and
      labelled them "ms"; presumably seconds, since the threshold is compared to a duration in seconds -
      TODO confirm.
    """
    cdir, version = scylla_setup(version="unstable/master:2021-01-18T15:48:13Z", verbose=True, skip_downloads=False)
    assert '2021-01-18T15_48_13Z' in cdir
    assert version == '4.4.dev'

    package_to_corrupt = random.choice([CORE_PACKAGE_DIR_NAME, "scylla-tools-java", "scylla-jmx"])
    self.corrupt_hash_value(Path(cdir) / package_to_corrupt / SOURCE_FILE_NAME)

    # Use the monotonic clock for the duration: the wall clock can be adjusted
    # while the download is running, which would distort the measurement.
    start_time = time.monotonic()
    cdir, version = scylla_setup(version="unstable/master:2021-01-18T15:48:13Z", verbose=True, skip_downloads=False)
    elapsed = time.monotonic() - start_time
    assert elapsed > 20, f"setup took {elapsed:.1f}s, packages were probably not downloaded again"

    assert '2021-01-18T15_48_13Z' in cdir
    assert version == '4.4.dev'

def test_setup_unstable_master_unified_package_reinstall(self):
    """
    Validate that if the package hash has changed, the package is downloaded again.
    - download the unified package. The package hash is saved in the "source.txt" file
    - change the hash to be wrong
    - run setup again. It is expected that the package will be downloaded again, so the second setup should
      not be short: the test requires it to take more than 5 seconds.
      NOTE(review): the original note quoted less than 3 without download and about 9 with download,
      labelled them "ms" and mentioned a threshold of 20, while the assertion uses 5; presumably seconds -
      TODO confirm.
    """
    cdir, version = scylla_setup(version="unstable/master:2023-04-03T22:38:18Z", verbose=True, skip_downloads=False)
    assert '2023-04-03T22_38_18Z' in cdir
    assert version == '5.3.0-dev'

    self.corrupt_hash_value(Path(cdir) / CORE_PACKAGE_DIR_NAME / SOURCE_FILE_NAME)

    # Use the monotonic clock for the duration: the wall clock can be adjusted
    # while the download is running, which would distort the measurement.
    start_time = time.monotonic()
    cdir, version = scylla_setup(version="unstable/master:2023-04-03T22:38:18Z", verbose=True, skip_downloads=False)
    elapsed = time.monotonic() - start_time
    assert elapsed > 5, f"setup took {elapsed:.1f}s, package was probably not downloaded again"
    assert '2023-04-03T22_38_18Z' in cdir
    assert version == '5.3.0-dev'

def test_get_local_tarball_hash(self):
    """
    Validate the hash returned for a local tarball.

    For a path that exists locally, get_url_hash hashes the path string itself, so the expected value
    depends on where the repository is checked out. The expectation is therefore derived from the
    fixture path instead of being hard-coded, and the fixture is located relative to this test file
    rather than the current working directory.
    """
    import hashlib  # local import: only this test needs it

    tarball = Path(__file__).resolve().parent / "test_data" / "scylla_unified_master_2023_04_03.tar.gz"
    # If the fixture is missing, get_url_hash would not take the local-file branch at all.
    assert tarball.exists(), f"missing test fixture: {tarball}"

    url_hash = get_url_hash(url=str(tarball))
    assert url_hash == hashlib.md5(str(tarball).encode('utf-8')).hexdigest()


@pytest.mark.parametrize('architecture', argvalues=typing.get_args(Architecture))
class TestGetManagerFunctions:
Expand Down

0 comments on commit ae4bc72

Please sign in to comment.