Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experiment: cache results in sqlite db #198

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ test:
deps:
python -m pessimist -c 'python -m $(SRCS).tests' --requirements= --fast .

.PHONY: benchmark
benchmark:
python -m $(SRCS).tests.benchmark

.PHONY: html
html: .venv README.md docs/*.rst docs/conf.py
source $(ACTIVATE) && sphinx-build -ab html docs html
Expand Down
58 changes: 43 additions & 15 deletions ufmt/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,13 @@
UsortConfig,
UsortConfigFactory,
)
from .util import make_black_config, normalize_result, read_file, write_file
from .util import (
make_black_config,
normalize_result,
read_file,
ResultCache,
write_file,
)

LOG = logging.getLogger(__name__)

Expand Down Expand Up @@ -184,24 +190,28 @@ def ufmt_file(
the skip exception, or ``True`` if no message is given.
"""
path = path.resolve()
black_config = (black_config_factory or make_black_config)(path)
usort_config = (usort_config_factory or UsortConfig.find)(path)

LOG.debug(f"Checking {path}")

cache = ResultCache()
result = Result(path)

try:
src_contents, encoding, newline = read_file(path)
dst_contents = ufmt_bytes(
path,
src_contents,
encoding=encoding,
black_config=black_config,
usort_config=usort_config,
pre_processor=pre_processor,
post_processor=post_processor,
)
if cache.check(path, src_contents):
result.cached = True
dst_contents = src_contents
else:
black_config = (black_config_factory or make_black_config)(path)
usort_config = (usort_config_factory or UsortConfig.find)(path)
dst_contents = ufmt_bytes(
path,
src_contents,
encoding=encoding,
black_config=black_config,
usort_config=usort_config,
pre_processor=pre_processor,
post_processor=post_processor,
)
except SkipFormatting as e:
dst_contents = src_contents
result.skipped = str(e) or True
Expand All @@ -219,7 +229,13 @@ def ufmt_file(
result.before = src_result
result.after = dst_result

if src_contents != dst_contents:
if result.cached:
pass

elif src_contents == dst_contents:
cache.mark(path, src_contents)

else:
result.changed = True

if diff:
Expand All @@ -234,6 +250,7 @@ def ufmt_file(
try:
write_file(path, dst_contents, newline=newline)
result.written = True
cache.mark(path, dst_contents)
except Exception as e:
result.error = e

Expand Down Expand Up @@ -373,6 +390,9 @@ def ufmt_paths(
Trailrunner() if concurrency is None else Trailrunner(concurrency=concurrency)
)

cache = ResultCache()
cache.prepare()

def generate_paths() -> Generator[Path, None, None]:
"""
yield paths to format, using trailrunner to walk directories and exclude paths
Expand All @@ -382,7 +402,13 @@ def generate_paths() -> Generator[Path, None, None]:
LOG.warning("Cannot mix stdin ('-') with normal paths, ignoring")
continue
config = ufmt_config(path, root)
yield from runner.walk(path, excludes=config.excludes)
for p in runner.walk(path, excludes=config.excludes):
p = p.resolve()
content, _, _ = read_file(p)
if cache.check(p, content):
continue

yield p

fn = partial(
ufmt_file,
Expand Down Expand Up @@ -426,3 +452,5 @@ def generate_paths() -> Generator[Path, None, None]:
combined = chain([first, second], gen) # combine first, second, and the rest
for _, result in runner.run_iter(combined, fn):
yield result

cache.cleanup()
90 changes: 90 additions & 0 deletions ufmt/tests/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Copyright Amethyst Reese, Tim Hatch
# Licensed under the MIT license

import time
from pathlib import Path
from typing import Any, List

from typing_extensions import Self

from ufmt import ufmt_file, ufmt_paths

# Directory three levels above this file; benchmark() requires it to contain
# a ``ufmt`` directory.
ROOT = Path(__file__).parent.parent.parent


class Timer:
def __init__(self, name: str) -> None:
self.name = name
self.totals: List[int] = []

@classmethod
def fields(self) -> str:
headline = f"{'name':^40} {'min':^10} {'mean':^10} {'max':^10}"
underline = "-" * len(headline)
return f"{headline}\n{underline}"

def __str__(self) -> str:
short = min(self.totals)
long = max(self.totals)
avg = sum(self.totals) // len(self.totals)
fields = " ".join(f"{value // 1000:>7} µs" for value in (short, avg, long))
return f"{self.name + ':':<40} {fields}"

def __enter__(self) -> Self:
self.before = time.monotonic_ns()
return self

def __exit__(self, *args: Any) -> None:
after = time.monotonic_ns()
self.totals.append(after - self.before)


def benchmark() -> None:
    """
    Time ``ufmt_file`` and ``ufmt_paths`` and print a min/mean/max table.

    ``ufmt_file`` is run against ``ROOT / "ufmt" / "core.py"`` and
    ``ufmt_paths`` against the ``ROOT / "ufmt"`` directory. Each scenario is
    executed five times with ``dry_run=True``, optionally adding ``diff=True``
    or ``return_content=True``, and reported as one row.

    Raises:
        AssertionError: if ``ROOT / "ufmt"`` is not a directory.
    """
    print("starting benchmark...")

    ufmt_dir = ROOT / "ufmt"
    ufmt_core = ufmt_dir / "core.py"
    assert ufmt_dir.is_dir(), f"{ufmt_dir} not found, must run benchmark from repo"

    print()
    print(Timer.fields())

    # (row label, zero-argument callable to time). ufmt_paths yields results,
    # so list() is needed to make it do all of its work inside the timer.
    scenarios = [
        ("ufmt_file", lambda: ufmt_file(ufmt_core, dry_run=True)),
        (
            "ufmt_file, diff=True",
            lambda: ufmt_file(ufmt_core, dry_run=True, diff=True),
        ),
        (
            "ufmt_file, return_content=True",
            lambda: ufmt_file(ufmt_core, dry_run=True, return_content=True),
        ),
        ("ufmt_paths", lambda: list(ufmt_paths([ufmt_dir], dry_run=True))),
        (
            "ufmt_paths, diff=True",
            lambda: list(ufmt_paths([ufmt_dir], dry_run=True, diff=True)),
        ),
        (
            # Previously passed diff=True here, duplicating the row above
            # instead of measuring what the label says.
            "ufmt_paths, return_content=True",
            lambda: list(ufmt_paths([ufmt_dir], dry_run=True, return_content=True)),
        ),
    ]

    for label, run in scenarios:
        timer = Timer(label)
        for _ in range(5):
            with timer:
                run()
        print(timer)


# Run the benchmark when this module is executed as a script.
if __name__ == "__main__":
    benchmark()
2 changes: 2 additions & 0 deletions ufmt/tests/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@
)

FAKE_CONFIG = """

[tool.ufmt]
excludes = [
"foo/frob/",
"__init__.py",
]

"""

POORLY_FORMATTED_CODE = """\
Expand Down
1 change: 1 addition & 0 deletions ufmt/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class Result:
path: Path
changed: bool = False
written: bool = False
cached: bool = False
skipped: Union[bool, str] = False
diff: Optional[str] = None
error: Optional[Exception] = None
Expand Down
73 changes: 71 additions & 2 deletions ufmt/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,17 @@
# Licensed under the MIT license

import os
import sqlite3
import time
import tokenize
import zlib
from contextlib import closing
from pathlib import Path
from typing import Tuple
from typing import Optional, Tuple

from black import find_pyproject_toml, parse_pyproject_toml, TargetVersion

from .types import BlackConfig, Encoding, FileContent, Newline
from .types import BlackConfig, Encoding, FileContent, Newline, SkipFormatting


def make_black_config(path: Path) -> BlackConfig:
Expand Down Expand Up @@ -97,3 +101,68 @@ def enable_libcst_native() -> None:
os.environ["LIBCST_PARSER_TYPE"] = "native"
except ImportError: # pragma: nocover
pass


class ResultCache:
    """
    SQLite-backed record of file contents already known to be clean.

    The ``clean`` table stores one row per (path, checksum) pair together with
    the Unix time the pair was last seen. :meth:`mark` records a pair,
    :meth:`check` tests for one and refreshes its timestamp, and
    :meth:`cleanup` drops pairs not seen within ``threshold`` seconds.

    Every method opens its own short-lived connection and creates the table
    on demand, so calling :meth:`prepare` first is optional.

    NOTE(review): the checksum is ``zlib.adler32``, which is only 32 bits
    wide; two different contents of the same path can collide and be reported
    as cached. Confirm this is acceptable or switch to a stronger digest.
    """

    _SCHEMA = """
        create table if not exists clean (
            `path` text,
            `crc` integer,
            `seen` integer,
            unique(`path`, `crc`)
        )"""

    def __init__(
        self,
        cache_path: Optional[Path] = None,
        threshold: int = 7 * 86400,
    ) -> None:
        """
        Args:
            cache_path: database file to use; defaults to
                ``.ufmt_cache/cache.db`` under the current working directory.
            threshold: age in seconds after which :meth:`cleanup` removes an
                entry (default: seven days).
        """
        if cache_path is None:
            cache_path = Path.cwd() / ".ufmt_cache" / "cache.db"
        # sqlite3.connect() cannot create missing directories, so make sure
        # the parent exists for caller-supplied paths as well.
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        self.cache_path = cache_path
        self.threshold = threshold

    def _connect(self) -> sqlite3.Connection:
        """Open the database and make sure the ``clean`` table exists."""
        db = sqlite3.connect(self.cache_path)
        try:
            with db:
                db.execute(self._SCHEMA)
        except sqlite3.Error:
            db.close()
            raise
        return db

    @staticmethod
    def _key(path: Path, content: "FileContent") -> Tuple[str, int]:
        """Return the (posix path, Adler-32 checksum) pair identifying a row."""
        return path.as_posix(), zlib.adler32(content)

    def prepare(self) -> None:
        """Create the database file and the ``clean`` table if missing."""
        with closing(self._connect()):
            pass

    def cleanup(self) -> None:
        """Delete entries last seen more than ``threshold`` seconds ago."""
        cutoff = int(time.time()) - self.threshold
        with closing(self._connect()) as db, db:
            db.execute("delete from clean where `seen` < ?", (cutoff,))

    def check(self, path: Path, content: "FileContent") -> bool:
        """
        Return ``True`` if this path/content pair was previously marked.

        A hit also refreshes the entry's ``seen`` time so that
        :meth:`cleanup` keeps it.
        """
        path_str, crc = self._key(path, content)
        with closing(self._connect()) as db, db:
            # Probe with a read first so that a miss never issues a write.
            found = db.execute(
                "select 1 from clean where `path` = ? and `crc` = ?",
                (path_str, crc),
            ).fetchone()
            if found is None:
                return False
            db.execute(
                "update clean set `seen` = ? where `path` = ? and `crc` = ?",
                (int(time.time()), path_str, crc),
            )
            return True

    def mark(self, path: Path, content: "FileContent") -> None:
        """Record this path/content pair as clean, or refresh its ``seen`` time."""
        path_str, crc = self._key(path, content)
        with closing(self._connect()) as db, db:
            db.execute(
                "insert into clean (`path`, `crc`, `seen`) values (?, ?, ?) "
                "on conflict (`path`, `crc`) do update set `seen` = excluded.`seen`",
                (path_str, crc, int(time.time())),
            )