Skip to content

Commit

Permalink
Merge pull request #9881 from neondatabase/rc/release/2024-11-25--2
Browse files Browse the repository at this point in the history
Fixup Storage & Compute Release 2024-11-25
  • Loading branch information
problame authored Nov 25, 2024
2 parents aada2ee + 166f33f commit 23e579d
Show file tree
Hide file tree
Showing 22 changed files with 183 additions and 75 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/_build-and-test-locally.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ on:
description: 'debug or release'
required: true
type: string
pg-versions:
description: 'a json array of postgres versions to run regression tests on'
test-cfg:
description: 'a json object of postgres versions and lfc states to run regression tests on'
required: true
type: string

Expand Down Expand Up @@ -276,14 +276,14 @@ jobs:
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
strategy:
fail-fast: false
matrix:
pg_version: ${{ fromJson(inputs.pg-versions) }}
matrix: ${{ fromJSON(format('{{"include":{0}}}', inputs.test-cfg)) }}
steps:
- uses: actions/checkout@v4
with:
submodules: true

- name: Pytest regression tests
continue-on-error: ${{ matrix.lfc_state == 'with-lfc' }}
uses: ./.github/actions/run-python-test-set
timeout-minutes: 60
with:
Expand All @@ -300,6 +300,7 @@ jobs:
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ inputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}

# Temporarily disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
Expand Down
9 changes: 8 additions & 1 deletion .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,14 @@ jobs:
build-tag: ${{ needs.tag.outputs.build-tag }}
build-type: ${{ matrix.build-type }}
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds
pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }}
# All versions run without LFC; an additional run with LFC is done on v17 release only
test-cfg: |
${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "without-lfc"},
{"pg_version":"v15", "lfc_state": "without-lfc"},
{"pg_version":"v16", "lfc_state": "without-lfc"},
{"pg_version":"v17", "lfc_state": "without-lfc"},
{"pg_version":"v17", "lfc_state": "with-lfc"}]'
|| '[{"pg_version":"v17", "lfc_state": "without-lfc"}]' }}
secrets: inherit

# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
Expand Down
48 changes: 24 additions & 24 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions deny.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ reason = "the marvin attack only affects private key decryption, not public key
[licenses]
allow = [
"Apache-2.0",
"Artistic-2.0",
"BSD-2-Clause",
"BSD-3-Clause",
"CC0-1.0",
Expand Down Expand Up @@ -67,7 +66,7 @@ registries = []
# More documentation about the 'bans' section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html
[bans]
multiple-versions = "warn"
multiple-versions = "allow"
wildcards = "allow"
highlight = "all"
workspace-default-features = "allow"
Expand Down
4 changes: 4 additions & 0 deletions scripts/ingest_regress_test_result-new-format.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
duration INT NOT NULL,
flaky BOOLEAN NOT NULL,
arch arch DEFAULT 'X64',
lfc BOOLEAN DEFAULT false NOT NULL,
build_type TEXT NOT NULL,
pg_version INT NOT NULL,
run_id BIGINT NOT NULL,
Expand All @@ -54,6 +55,7 @@ class Row:
duration: int
flaky: bool
arch: str
lfc: bool
build_type: str
pg_version: int
run_id: int
Expand Down Expand Up @@ -132,6 +134,7 @@ def ingest_test_result(
if p["name"].startswith("__")
}
arch = parameters.get("arch", "UNKNOWN").strip("'")
lfc = parameters.get("lfc", "False") == "True"

build_type, pg_version, unparametrized_name = parse_test_name(test["name"])
labels = {label["name"]: label["value"] for label in test["labels"]}
Expand All @@ -145,6 +148,7 @@ def ingest_test_result(
duration=test["time"]["duration"],
flaky=test["flaky"] or test["retriesStatusChange"],
arch=arch,
lfc=lfc,
build_type=build_type,
pg_version=pg_version,
run_id=run_id,
Expand Down
82 changes: 77 additions & 5 deletions test_runner/fixtures/neon_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,12 @@
from fixtures.utils import (
ATTACHMENT_NAME_REGEX,
COMPONENT_BINARIES,
USE_LFC,
allure_add_grafana_links,
assert_no_errors,
get_dir_size,
print_gc_result,
size_to_bytes,
subprocess_capture,
wait_until,
)
Expand Down Expand Up @@ -3742,12 +3744,45 @@ def create(
self.pgdata_dir = self.env.repo_dir / path
self.logfile = self.endpoint_path() / "compute.log"

config_lines = config_lines or []

# set small 'max_replication_write_lag' to enable backpressure
# and make tests more stable.
config_lines = ["max_replication_write_lag=15MB"] + config_lines

# Delete file cache if it exists (and we're recreating the endpoint)
if USE_LFC:
if (lfc_path := Path(self.lfc_path())).exists():
lfc_path.unlink()
else:
lfc_path.parent.mkdir(parents=True, exist_ok=True)
for line in config_lines:
if (
line.find("neon.max_file_cache_size") > -1
or line.find("neon.file_cache_size_limit") > -1
):
m = re.search(r"=\s*(\S+)", line)
assert m is not None, f"malformed config line {line}"
size = m.group(1)
assert size_to_bytes(size) >= size_to_bytes(
"1MB"
), "LFC size cannot be set less than 1MB"
# shared_buffers = 512kB to make postgres use LFC intensively
# neon.max_file_cache_size and neon.file_cache_size_limit are
# set to 1MB because a small LFC is better for testing (helps to find more problems)
config_lines = [
"shared_buffers = 512kB",
f"neon.file_cache_path = '{self.lfc_path()}'",
"neon.max_file_cache_size = 1MB",
"neon.file_cache_size_limit = 1MB",
] + config_lines
else:
for line in config_lines:
assert (
line.find("neon.max_file_cache_size") == -1
), "Setting LFC parameters is not allowed when LFC is disabled"
assert (
line.find("neon.file_cache_size_limit") == -1
), "Setting LFC parameters is not allowed when LFC is disabled"

self.config(config_lines)

return self
Expand Down Expand Up @@ -3781,6 +3816,9 @@ def start(
basebackup_request_tries=basebackup_request_tries,
)
self._running.release(1)
self.log_config_value("shared_buffers")
self.log_config_value("neon.max_file_cache_size")
self.log_config_value("neon.file_cache_size_limit")

return self

Expand All @@ -3806,6 +3844,10 @@ def config_file_path(self) -> Path:
"""Path to the postgresql.conf in the endpoint directory (not the one in pgdata)"""
return self.endpoint_path() / "postgresql.conf"

def lfc_path(self) -> Path:
    """Return the location of the local file cache (LFC) file of this endpoint.

    The file lives at ``file_cache/file.cache`` below the endpoint directory.
    """
    cache_dir = self.endpoint_path() / "file_cache"
    return cache_dir / "file.cache"

def config(self, lines: list[str]) -> Self:
"""
Add lines to postgresql.conf.
Expand Down Expand Up @@ -3984,16 +4026,46 @@ def get_pg_wal_size(self):
assert self.pgdata_dir is not None # please mypy
return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024

def clear_shared_buffers(self, cursor: Any | None = None):
def clear_buffers(self, cursor: Any | None = None):
"""
Best-effort way to clear postgres buffers. Pinned buffers will not be 'cleared.'
Might also clear LFC.
It clears LFC as well by setting neon.file_cache_size_limit to 0 and then returning it to the previous value,
if LFC is enabled
"""
if cursor is not None:
cursor.execute("select clear_buffer_cache()")
if not USE_LFC:
return
cursor.execute("SHOW neon.file_cache_size_limit")
res = cursor.fetchone()
assert res, "Cannot get neon.file_cache_size_limit"
file_cache_size_limit = res[0]
if file_cache_size_limit == 0:
return
cursor.execute("ALTER SYSTEM SET neon.file_cache_size_limit=0")
cursor.execute("SELECT pg_reload_conf()")
cursor.execute(f"ALTER SYSTEM SET neon.file_cache_size_limit='{file_cache_size_limit}'")
cursor.execute("SELECT pg_reload_conf()")
else:
self.safe_psql("select clear_buffer_cache()")
if not USE_LFC:
return
file_cache_size_limit = self.safe_psql_scalar(
"SHOW neon.file_cache_size_limit", log_query=False
)
if file_cache_size_limit == 0:
return
self.safe_psql("ALTER SYSTEM SET neon.file_cache_size_limit=0")
self.safe_psql("SELECT pg_reload_conf()")
self.safe_psql(f"ALTER SYSTEM SET neon.file_cache_size_limit='{file_cache_size_limit}'")
self.safe_psql("SELECT pg_reload_conf()")

def log_config_value(self, param):
    """
    Look up the current value of the setting `param` with SHOW and write
    it to the log as "<param> = <value>".
    """
    show_query = f"SHOW {param}"
    current_value = self.safe_psql_scalar(show_query, log_query=False)
    log.info("%s = %s", param, current_value)


class EndpointFactory:
Expand Down
Loading

1 comment on commit 23e579d

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

6230 tests run: 5960 passed, 0 failed, 270 skipped (full report)


Flaky tests (1)

Postgres 17

  • test_ondemand_wal_download_in_replication_slot_funcs: release-arm64

Test coverage report is not available

The comment gets automatically updated with the latest test results
23e579d at 2024-11-25T17:44:02.072Z :recycle:

Please sign in to comment.