From ae4bc7240d39ded9a9f892a39d278df471d8c93c Mon Sep 17 00:00:00 2001 From: Julia Yakovlev Date: Sun, 28 Jan 2024 12:41:34 +0200 Subject: [PATCH] improvement(setup): improve caching of local tarballs If the test version is unstable (not release, maybe private branch) and installation folder exists, we want to check if this version was downloaded and installed already in the past and was changed. In this case the version should be downloaded and installed again. Compare hash of saved version (it is saved in the 'scylla-core-package/source.txt' file) and hash of a new package. If it is the same - skip the download. If not - remove existing folder and download again. --- ccmlib/common.py | 23 +++++++++--- ccmlib/scylla_repository.py | 53 +++++++++++++++++++++++---- ccmlib/utils/download.py | 10 +++++- tests/test_scylla_repository.py | 63 +++++++++++++++++++++++++++++++-- 4 files changed, 135 insertions(+), 14 deletions(-) diff --git a/ccmlib/common.py b/ccmlib/common.py index 4edd4def..043a9987 100644 --- a/ccmlib/common.py +++ b/ccmlib/common.py @@ -18,12 +18,14 @@ import pathlib from itertools import zip_longest from typing import Callable, Optional, TextIO, Union, List +from pathlib import Path import yaml from boto3.session import Session from botocore import UNSIGNED from botocore.client import Config + BIN_DIR = "bin" CASSANDRA_CONF_DIR = "conf" DSE_CASSANDRA_CONF_DIR = "resources/cassandra/conf" @@ -644,14 +646,15 @@ def scylla_extract_mode(path): def scylla_extract_install_dir_and_mode(install_dir): + from ccmlib.scylla_repository import CORE_PACKAGE_DIR_NAME, SOURCE_FILE_NAME # to prevent failure due to a circular import scylla_mode = scylla_extract_mode(install_dir) if scylla_mode: install_dir = str(os.path.join(install_dir, os.pardir, os.pardir)) else: scylla_mode = 'release' - if os.path.exists(os.path.join(install_dir, 'scylla-core-package')): + if os.path.exists(os.path.join(install_dir, CORE_PACKAGE_DIR_NAME)): try: - f = open(os.path.join(install_dir, 
'scylla-core-package', 'source.txt'), 'r') + f = open(os.path.join(install_dir, CORE_PACKAGE_DIR_NAME, SOURCE_FILE_NAME), 'r') for l in f.readlines(): if l.startswith('url='): scylla_mode = scylla_extract_mode(l) or scylla_mode @@ -685,8 +688,6 @@ def wait_for(func: Callable, timeout: int, first: float = 0.0, step: float = 1.0 return False - - def validate_install_dir(install_dir): if install_dir is None: raise ArgumentError('Undefined installation directory') @@ -973,3 +974,17 @@ def print_if_standalone(*args, debug_callback=None, end='\n', **kwargs): print(*args, *kwargs, end=end) else: debug_callback(*args, **kwargs) + + +def get_installed_scylla_package_hash(source_file: Path): + current_hash = "" + + # If the source file does not exist, we cannot check the hash of the existing package + if source_file.exists(): + with open(source_file, 'r') as f: + lines = f.readlines() + # get hash from file + for line in lines: + if line.startswith("hash="): + current_hash = line.replace("hash=", "").strip() + return current_hash diff --git a/ccmlib/scylla_repository.py b/ccmlib/scylla_repository.py index b9e16f9f..13443393 100644 --- a/ccmlib/scylla_repository.py +++ b/ccmlib/scylla_repository.py @@ -19,14 +19,15 @@ from ccmlib.common import ( ArgumentError, CCMError, get_default_path, rmdirs, validate_install_dir, get_scylla_version, aws_bucket_ls, - DOWNLOAD_IN_PROGRESS_FILE, print_if_standalone, LockFile) -from ccmlib.utils.download import download_file, download_version_from_s3, get_url_hash + DOWNLOAD_IN_PROGRESS_FILE, print_if_standalone, LockFile, get_installed_scylla_package_hash) +from ccmlib.utils.download import download_file, download_version_from_s3, get_url_hash, save_source_file from ccmlib.utils.version import parse_version GIT_REPO = "http://github.com/scylladb/scylla.git" CORE_PACKAGE_DIR_NAME = 'scylla-core-package' SCYLLA_VERSION_FILE = 'SCYLLA-VERSION-FILE' +SOURCE_FILE_NAME = "source.txt" RELOCATABLE_URLS_BASE = 
['https://s3.amazonaws.com/downloads.scylladb.com/unstable/scylla/{0}/relocatable/{1}', 'https://s3.amazonaws.com/downloads.scylladb.com/unstable/scylla-enterprise/{0}/relocatable/{1}', @@ -244,7 +245,14 @@ def setup(version, verbose=True, skip_downloads=False): type_n_version = version.split(os.path.sep, 1) version_dir = version_directory(version) if not skip_downloads else None - if len(type_n_version) == 2 and version_dir is None: + # If the test version is unstable (not release, maybe private branch) and installation folder exists, + # we want to check if this version was downloaded and installed already in the past and was changed. + # In this case the version should be downloaded and installed again. + # Compare hash of saved version (it is saved in the 'scylla-core-package/source.txt' file) and hash of a new package. + # If it is the same - skip the download. If not - remove existing folder and download again. + validate_by_hash = version_dir is not None and type_n_version[0] != "release" + + if len(type_n_version) == 2 and (version_dir is None or validate_by_hash): s3_version = type_n_version[1] if type_n_version[0] == 'release': @@ -303,6 +311,32 @@ def setup(version, verbose=True, skip_downloads=False): if skip_downloads: return directory_name(version), packages + if validate_by_hash and packages: + """ + Validate if packages hash was changed and the new package(s) have to be downloaded + """ + map_field_to_dir_name = {"scylla_unified_package": CORE_PACKAGE_DIR_NAME, + "scylla_package": CORE_PACKAGE_DIR_NAME, + "scylla_tools_package": "scylla-tools-java", + "scylla_jmx_package": "scylla-jmx" + } + for package in zip(packages._fields, packages): + if not package[1]: + continue + + new_hash = get_url_hash(package[1]) + package_dir = map_field_to_dir_name[package[0]] + current_hash = get_installed_scylla_package_hash(source_file=Path(version_dir) / package_dir / SOURCE_FILE_NAME) + # Current hash may be empty. 
It may be due to an incomplete download and installation. + # Or because it is an old installation that did not save the hash yet. + # In any case, a new download and installation should be performed. + # To do that, we need to remove the existing folder and start downloading again + if not (new_hash and current_hash) or new_hash != current_hash: + # remove version_dir + rmdirs(version_dir) + version_dir = None + break + if version_dir is None: # Create version folder and add placeholder file to prevent parallel downloading from another test. version_dir = directory_name(version) @@ -490,10 +524,15 @@ def download_version(version, url=None, verbose=False, target_dir=None, unified= # add breadcrumb so we could list the origin of each part easily for debugging # for example listing all the version we have in ccm scylla-repository # find ~/.ccm/scylla-repository/*/ -iname source.txt | xargs cat - source_breadcrumb_file = os.path.join(target_dir, 'source.txt') - with open(source_breadcrumb_file, 'w') as f: - f.write(f"version={version}\n") - f.write(f"url={url}\n") + source_breadcrumb_file = os.path.join(target_dir, SOURCE_FILE_NAME) + # To improve caching of local tarballs, save hash of current package. 
+ # In case the relocatable package was downloaded in the past and saved locally, by comparing of package hash we can decide + # if the package was changed and we need to download it again + url_hash = get_url_hash(url=url) + save_source_file(source_file=source_breadcrumb_file, + version=version, + url=url, + url_hash=url_hash) return package_version except urllib.error.URLError as e: diff --git a/ccmlib/utils/download.py b/ccmlib/utils/download.py index 493eccf3..ec9013d3 100644 --- a/ccmlib/utils/download.py +++ b/ccmlib/utils/download.py @@ -172,7 +172,8 @@ def get_url_hash(url: str) -> str: """ if os.path.exists(url): # if file/dir is local, hash based on the path - return hashlib.md5(url).hexdigest() + # Strings must be encoded before hashing + return hashlib.md5(url.encode('utf-8')).hexdigest() # first try is on s3 parts = urllib.parse.urlparse(url) @@ -185,3 +186,10 @@ def get_url_hash(url: str) -> str: except botocore.client.ClientError: # fallback to http return requests.head(url).headers.get('ETag')[1:-1] + + +def save_source_file(source_file: str, version: str, url: str, url_hash: str): + with open(source_file, 'w') as f: + f.write(f"version={version}\n") + f.write(f"url={url}\n") + f.write(f"hash={url_hash}\n") diff --git a/tests/test_scylla_repository.py b/tests/test_scylla_repository.py index 554366e1..1f1fe71a 100644 --- a/tests/test_scylla_repository.py +++ b/tests/test_scylla_repository.py @@ -1,13 +1,17 @@ +import time import typing +from pathlib import Path +import random import pytest -from ccmlib.scylla_repository import setup as scylla_setup +from ccmlib.scylla_repository import setup as scylla_setup, CORE_PACKAGE_DIR_NAME, SOURCE_FILE_NAME from ccmlib.scylla_repository import ( get_manager_release_url, get_manager_latest_reloc_url, Architecture, ) +from ccmlib.utils.download import get_url_hash @pytest.mark.repo_tests @@ -42,7 +46,6 @@ def test_setup_unstable_enterprise_new_url(self): assert version == '2023.3.0-dev' - class 
TestScyllaRepositoryRelease: @pytest.mark.parametrize(argnames=['version', 'expected_cdir'], argvalues=[ ("release:5.1", 'release/5.1'), @@ -115,6 +118,62 @@ def test_setup_unstable_master_new_url(self): assert packages.scylla_tools_package == 'https://s3.amazonaws.com/downloads.scylladb.com/unstable/scylla/master/relocatable/2021-01-18T15:48:13Z/scylla-tools-package.tar.gz' assert packages.scylla_jmx_package == 'https://s3.amazonaws.com/downloads.scylladb.com/unstable/scylla/master/relocatable/2021-01-18T15:48:13Z/scylla-jmx-package.tar.gz' + @staticmethod + def corrupt_hash_value(source_file): + file_text = source_file.read_text() + file_text = file_text.replace("hash=", "hash=123") + source_file.write_text(file_text) + + def test_setup_unstable_master_no_unified_package_reinstall(self): + """ + Validate that if package hash is changed, new package will be downloaded. + - download the Scylla packages. Packages hash will be saved in the "source.txt" file under relevant package folder + - change the hash to be wrong for one of the packages (choose the package randomly). No matter hash of which package is wrong - + all packages should be re-downloaded + - run setup again. It expected that the packages will be downloaded again. The download time should be not short. + Actually time without download should be around 5 ms, and with download about 35 ms. 
I put here more than 20 + """ + cdir, version = scylla_setup(version="unstable/master:2021-01-18T15:48:13Z", verbose=True, skip_downloads=False) + assert '2021-01-18T15_48_13Z' in cdir + assert version == '4.4.dev' + + package_to_corrupt = random.choice([CORE_PACKAGE_DIR_NAME, "scylla-tools-java", "scylla-jmx"]) + self.corrupt_hash_value(Path(cdir) / package_to_corrupt / SOURCE_FILE_NAME) + + start_time = time.time() + cdir, version = scylla_setup(version="unstable/master:2021-01-18T15:48:13Z", verbose=True, skip_downloads=False) + end_time = time.time() + assert (end_time - start_time) > 20 + + assert '2021-01-18T15_48_13Z' in cdir + assert version == '4.4.dev' + + def test_setup_unstable_master_unified_package_reinstall(self): + """ + Validate that if package hash is changed, new package will be downloaded. + - download the unified package. Package hash will be saved in the "source.txt" file + - change the hash to be wrong + - run setup again. It expected that the package will be downloaded again. The download time should be not short. + Actually time without download should be less than 3 ms, and with download about 9 ms. 
I put here more than 5 + """ + cdir, version = scylla_setup(version="unstable/master:2023-04-03T22:38:18Z", verbose=True, skip_downloads=False) + assert '2023-04-03T22_38_18Z' in cdir + assert version == '5.3.0-dev' + + self.corrupt_hash_value(Path(cdir) / CORE_PACKAGE_DIR_NAME / SOURCE_FILE_NAME) + + start_time = time.time() + cdir, version = scylla_setup(version="unstable/master:2023-04-03T22:38:18Z", verbose=True, skip_downloads=False) + end_time = time.time() + assert (end_time - start_time) > 5 + assert '2023-04-03T22_38_18Z' in cdir + assert version == '5.3.0-dev' + + def test_get_local_tarball_hash(self): + this_path = Path().resolve() + url_hash = get_url_hash(url=str(this_path / "tests" / "test_data" / "scylla_unified_master_2023_04_03.tar.gz")) + assert url_hash == '6dbfe395e99c04787bb8e6bcef0283fc' + @pytest.mark.parametrize('architecture', argvalues=typing.get_args(Architecture)) class TestGetManagerFunctions: