diff --git a/.readthedocs.yml b/.readthedocs.yml
index f41ab7436..08ac5176d 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -3,6 +3,12 @@
 # Required
 version: 2
 
+# Set the OS, Python version and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.12"
+
 # Build documentation in the docs/ directory with Sphinx
 sphinx:
   configuration: docs/conf.py
@@ -16,7 +22,6 @@ formats: all
 
 # Optionally set the version of Python and requirements required to build your docs
 python:
-  version: 3.7
   install:
     - requirements: docs/requirements.txt
     - method: pip
diff --git a/dedupe/__init__.py b/dedupe/__init__.py
index cf75318a5..726836a72 100644
--- a/dedupe/__init__.py
+++ b/dedupe/__init__.py
@@ -1,7 +1,15 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-from pkgutil import extend_path
-
-__path__ = extend_path(__path__, __name__)
-
-from dedupe._init import *  # noqa
+from dedupe.api import (  # noqa: F401
+    Dedupe,
+    Gazetteer,
+    RecordLink,
+    StaticDedupe,
+    StaticGazetteer,
+    StaticRecordLink,
+)
+from dedupe.convenience import (  # noqa: F401
+    canonicalize,
+    console_label,
+    training_data_dedupe,
+    training_data_link,
+)
+from dedupe.serializer import read_training, write_training  # noqa: F401
diff --git a/dedupe/_init.py b/dedupe/_init.py
deleted file mode 100644
index 726836a72..000000000
--- a/dedupe/_init.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from dedupe.api import (  # noqa: F401
-    Dedupe,
-    Gazetteer,
-    RecordLink,
-    StaticDedupe,
-    StaticGazetteer,
-    StaticRecordLink,
-)
-from dedupe.convenience import (  # noqa: F401
-    canonicalize,
-    console_label,
-    training_data_dedupe,
-    training_data_link,
-)
-from dedupe.serializer import read_training, write_training  # noqa: F401
diff --git a/dedupe/branch_and_bound.py b/dedupe/branch_and_bound.py
new file mode 100644
index 000000000..6a6004504
--- /dev/null
+++ b/dedupe/branch_and_bound.py
@@ -0,0 +1,132 @@
+from __future__ import annotations
+
+import functools
+import warnings
+from typing import Any, Iterable, Mapping, Sequence, Tuple
+
+from ._typing import Cover
+from .predicates import Predicate
+
+Partial = Tuple[Predicate, ...]
+
+
+def _reachable(dupe_cover: Mapping[Any, frozenset[int]]) -> int:
+    """Return how many distinct items the covers in ``dupe_cover`` span."""
+    return len(frozenset.union(*dupe_cover.values())) if dupe_cover else 0
+
+
+def _remove_dominated(coverage: Cover, dominator: Predicate) -> Cover:
+    """Return a copy of ``coverage`` without predicates ``dominator`` dominates.
+
+    A predicate is dominated when its ``cover_count`` is at least the
+    dominator's and its cover is a subset of the dominator's cover; this
+    always includes ``dominator`` itself.
+    """
+    dominant_cover = coverage[dominator]
+
+    return {
+        pred: cover
+        for pred, cover in coverage.items()
+        if not (dominator.cover_count <= pred.cover_count and dominant_cover >= cover)
+    }
+
+
+def _uncovered_by(
+    coverage: Mapping[Any, frozenset[int]], covered: frozenset[int]
+) -> dict[Any, frozenset[int]]:
+    """Subtract ``covered`` from every cover, dropping covers left empty."""
+    remaining = {}
+    for predicate, uncovered in coverage.items():
+        still_uncovered = uncovered - covered
+        if still_uncovered:
+            remaining[predicate] = still_uncovered
+
+    return remaining
+
+
+def _order_by(
+    candidates: Mapping[Predicate, Sequence[Any]], p: Predicate
+) -> tuple[int, float]:
+    """Ranking key for ``p``: bigger cover wins, ties go to lower ``cover_count``."""
+    return (len(candidates[p]), -p.cover_count)
+
+
+def _score(partial: Iterable[Predicate]) -> float:
+    """Return the summed ``cover_count`` of the predicates in ``partial``."""
+    return sum(p.cover_count for p in partial)
+
+
+def _suppress_recursion_error(func):
+    """Wrap ``func`` so a ``RecursionError`` becomes a warning and ``None``."""
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except RecursionError:
+            warnings.warn("Recursion limit reached while searching for predicates")
+
+    return wrapper
+
+
+def search(candidates: Cover, target: int, max_calls: int) -> Partial:
+    """Search for a cheap set of predicates covering at least ``target`` items.
+
+    Explores combinations of ``candidates`` depth first, spending at most
+    ``max_calls`` calls, and returns the lowest-scoring combination found
+    (summed ``cover_count``) that reaches ``target``, or ``()`` if none did.
+    """
+    calls = max_calls
+
+    cheapest_score = float("inf")
+    cheapest: Partial = ()
+
+    original_cover = candidates.copy()
+
+    def _covered(partial: Partial) -> int:
+        return (
+            len(frozenset.union(*(original_cover[p] for p in partial)))
+            if partial
+            else 0
+        )
+
+    @_suppress_recursion_error
+    def walk(candidates: Cover, partial: Partial = ()) -> None:
+        nonlocal calls
+        nonlocal cheapest
+        nonlocal cheapest_score
+
+        if calls <= 0:
+            return
+
+        calls -= 1
+
+        covered = _covered(partial)
+        score = _score(partial)
+
+        if covered < target:
+            window = cheapest_score - score
+            candidates = {
+                p: cover for p, cover in candidates.items() if p.cover_count < window
+            }
+
+            reachable = _reachable(candidates) + covered
+
+            if candidates and reachable >= target:
+                order_by = functools.partial(_order_by, candidates)
+                best = max(candidates, key=order_by)
+
+                remaining = _uncovered_by(candidates, candidates[best])
+                walk(remaining, partial + (best,))
+                del remaining
+
+                reduced = _remove_dominated(candidates, best)
+                walk(reduced, partial)
+                del reduced
+
+        elif score < cheapest_score:
+            cheapest = partial
+            cheapest_score = score
+
+    walk(candidates)
+    return cheapest
diff --git a/dedupe/training.py b/dedupe/training.py
index b38d5c732..56a078149 100644
--- a/dedupe/training.py
+++ b/dedupe/training.py
@@ -10,10 +10,10 @@
 from typing import TYPE_CHECKING, overload
 from warnings import warn
 
-from . import blocking
+from . import blocking, branch_and_bound
 
 if TYPE_CHECKING:
-    from typing import Any, Iterable, Mapping, Sequence
+    from typing import Iterable, Sequence
 
     from ._typing import (
         ComparisonCover,
@@ -75,8 +75,7 @@ def learn(
         else:
             raise ValueError("candidate_type is not valid")
 
-        searcher = BranchBound(target_cover, 2500)
-        final_predicates = searcher.search(candidate_cover)
+        final_predicates = branch_and_bound.search(candidate_cover, target_cover, 2500)
 
         logger.info("Final predicate set:")
         for predicate in final_predicates:
@@ -329,113 +328,6 @@ def coveredPairs(self, blocker, records_1, records_2):
         return pair_cover
 
 
-class BranchBound(object):
-    def __init__(self, target: int, max_calls: int) -> None:
-        self.target: int = target
-        self.calls: int = max_calls
-
-        self.cheapest_score: float = float("inf")
-        self.original_cover: Cover = {}
-        self.cheapest: tuple[Predicate, ...] = ()
-
-    def search(
-        self, candidates: Cover, partial: tuple[Predicate, ...] = ()
-    ) -> tuple[Predicate, ...]:
-        if self.calls <= 0:
-            return self.cheapest
-
-        if not self.original_cover:
-            self.original_cover = candidates.copy()
-
-        self.calls -= 1
-
-        covered = self.covered(partial)
-        score = self.score(partial)
-
-        if covered >= self.target:
-            if score < self.cheapest_score:
-                self.cheapest = partial
-                self.cheapest_score = score
-
-        else:
-            window = self.cheapest_score - score
-
-            candidates = {
-                p: cover for p, cover in candidates.items() if p.cover_count < window
-            }
-
-            reachable = self.reachable(candidates) + covered
-
-            if candidates and reachable >= self.target:
-                order_by = functools.partial(self.order_by, candidates)
-
-                best = max(candidates, key=order_by)
-
-                remaining = self.uncovered_by(candidates, candidates[best])
-                try:
-                    self.search(remaining, partial + (best,))
-                except RecursionError:
-                    return self.cheapest
-
-                del remaining
-
-                reduced = self.remove_dominated(candidates, best)
-
-                try:
-                    self.search(reduced, partial)
-                except RecursionError:
-                    return self.cheapest
-
-                del reduced
-
-        return self.cheapest
-
-    @staticmethod
-    def order_by(
-        candidates: Mapping[Predicate, Sequence[Any]], p: Predicate
-    ) -> tuple[int, float]:
-        return (len(candidates[p]), -p.cover_count)
-
-    @staticmethod
-    def score(partial: Iterable[Predicate]) -> float:
-        return sum(p.cover_count for p in partial)
-
-    def covered(self, partial: tuple[Predicate, ...]) -> int:
-        if partial:
-            return len(frozenset.union(*(self.original_cover[p] for p in partial)))
-        else:
-            return 0
-
-    @staticmethod
-    def reachable(dupe_cover: Mapping[Any, frozenset[int]]) -> int:
-        if dupe_cover:
-            return len(frozenset.union(*dupe_cover.values()))
-        else:
-            return 0
-
-    @staticmethod
-    def remove_dominated(coverage: Cover, dominator: Predicate) -> Cover:
-        dominant_cover = coverage[dominator]
-
-        for pred, cover in coverage.copy().items():
-            if dominator.cover_count <= pred.cover_count and dominant_cover >= cover:
-                del coverage[pred]
-
-        return coverage
-
-    @staticmethod
-    def uncovered_by(
-        coverage: Mapping[Any, frozenset[int]], covered: frozenset[int]
-    ) -> dict[Any, frozenset[int]]:
-        remaining = {}
-        for predicate, uncovered in coverage.items():
-            still_uncovered = uncovered - covered
-            if still_uncovered:
-                remaining[predicate] = still_uncovered
-
-        return remaining
-
-
 class InfiniteSet(object):
     def __and__(self, item):
         return item
diff --git a/pyproject.toml b/pyproject.toml
index 5f1e7d721..508cac74e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,6 @@ dependencies = [
     "scikit-learn",
     "affinegap>=1.3",
     "categorical-distance>=1.9",
-    "dedupe-variable-datetime",
     "numpy>=1.20",
     "doublemetaphone",
     "highered>=0.2.0",
diff --git a/tests/test_blocking.py b/tests/test_blocking.py
index 59a1f674e..6e7af98d8 100644
--- a/tests/test_blocking.py
+++ b/tests/test_blocking.py
@@ -1,8 +1,6 @@
 import unittest
 from collections import defaultdict
 
-from future.utils import viewitems, viewvalues
-
 import dedupe
 
 
@@ -54,7 +52,7 @@ def test_unconstrained_inverted_index(self):
             [dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")]
         )
 
-        blocker.index(set(record["name"] for record in viewvalues(self.data_d)), "name")
+        blocker.index(set(record["name"] for record in self.data_d.values()), "name")
 
         blocks = defaultdict(set)
 
@@ -87,13 +85,13 @@ def setUp(self):
 
         self.records_1 = dict(
             (record_id, record)
-            for record_id, record in viewitems(data_d)
+            for record_id, record in data_d.items()
             if record["dataset"] == 0
         )
 
         self.fields_2 = dict(
             (record_id, record["name"])
-            for record_id, record in viewitems(data_d)
+            for record_id, record in data_d.items()
             if record["dataset"] == 1
         )
 
diff --git a/tests/test_predicate_functions.py b/tests/test_predicate_functions.py
index ac2ce6728..9567eb7d8 100644
--- a/tests/test_predicate_functions.py
+++ b/tests/test_predicate_functions.py
@@ -1,7 +1,5 @@
 import unittest
 
-from future.builtins import str
-
 from dedupe import predicate_functions as fn
 from dedupe.cpredicates import ngrams
 
diff --git a/tests/test_predicates.py b/tests/test_predicates.py
index b491dc704..bcae4fa60 100644
--- a/tests/test_predicates.py
+++ b/tests/test_predicates.py
@@ -1,7 +1,5 @@
 import unittest
 
-from future.builtins import str
-
 from dedupe import predicates
 
 
diff --git a/tests/test_training.py b/tests/test_training.py
index 52500134e..b908dde0c 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -1,6 +1,7 @@
 import unittest
 
 import dedupe
+import dedupe.branch_and_bound as branch_and_bound
 import dedupe.training as training
 
 
@@ -67,8 +68,8 @@ def test_uncovered_by(self):
 
         before_copy = before.copy()
 
-        assert training.BranchBound.uncovered_by(before, frozenset()) == before
-        assert training.BranchBound.uncovered_by(before, frozenset({3})) == after
+        assert branch_and_bound._uncovered_by(before, frozenset()) == before
+        assert branch_and_bound._uncovered_by(before, frozenset({3})) == after
         assert before == before_copy
 
     def test_covered_pairs(self):