From 689105a9110c32e261e33d099ce32b1c0542822c Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Mon, 18 Dec 2023 16:38:29 -0500 Subject: [PATCH 01/11] black --- dedupe/__init__.py | 22 ++++--- dedupe/_init.py | 15 ----- dedupe/branch_and_bound.py | 111 ++++++++++++++++++++++++++++++++++++ dedupe/training.py | 114 +------------------------------------ pyproject.toml | 1 - 5 files changed, 129 insertions(+), 134 deletions(-) delete mode 100644 dedupe/_init.py create mode 100644 dedupe/branch_and_bound.py diff --git a/dedupe/__init__.py b/dedupe/__init__.py index cf75318a5..726836a72 100644 --- a/dedupe/__init__.py +++ b/dedupe/__init__.py @@ -1,7 +1,15 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- -from pkgutil import extend_path - -__path__ = extend_path(__path__, __name__) - -from dedupe._init import * # noqa +from dedupe.api import ( # noqa: F401 + Dedupe, + Gazetteer, + RecordLink, + StaticDedupe, + StaticGazetteer, + StaticRecordLink, +) +from dedupe.convenience import ( # noqa: F401 + canonicalize, + console_label, + training_data_dedupe, + training_data_link, +) +from dedupe.serializer import read_training, write_training # noqa: F401 diff --git a/dedupe/_init.py b/dedupe/_init.py deleted file mode 100644 index 726836a72..000000000 --- a/dedupe/_init.py +++ /dev/null @@ -1,15 +0,0 @@ -from dedupe.api import ( # noqa: F401 - Dedupe, - Gazetteer, - RecordLink, - StaticDedupe, - StaticGazetteer, - StaticRecordLink, -) -from dedupe.convenience import ( # noqa: F401 - canonicalize, - console_label, - training_data_dedupe, - training_data_link, -) -from dedupe.serializer import read_training, write_training # noqa: F401 diff --git a/dedupe/branch_and_bound.py b/dedupe/branch_and_bound.py new file mode 100644 index 000000000..f9ee0dba6 --- /dev/null +++ b/dedupe/branch_and_bound.py @@ -0,0 +1,111 @@ +import functools +import warnings +from typing import Any, Iterable, Mapping, Sequence + +from ._typing import Cover +from .predicates import Predicate + +Partial = tuple[Predicate, ...] + + +def _reachable(dupe_cover: Mapping[Any, frozenset[int]]) -> int: + return len(frozenset.union(*dupe_cover.values())) if dupe_cover else 0 + + +def _remove_dominated(coverage: Cover, dominator: Predicate) -> Cover: + dominant_cover = coverage[dominator] + + return { + pred: cover + for pred, cover in coverage.items() + if not (dominator.cover_count <= pred.cover_count and dominant_cover >= cover) + } + + +def _uncovered_by( + coverage: Mapping[Any, frozenset[int]], covered: frozenset[int] +) -> dict[Any, frozenset[int]]: + remaining = {} + for predicate, uncovered in coverage.items(): + still_uncovered = uncovered - covered + if still_uncovered: + remaining[predicate] = still_uncovered + + return remaining + + +def _order_by( + candidates: Mapping[Predicate, Sequence[Any]], p: Predicate +) -> tuple[int, float]: + return (len(candidates[p]), -p.cover_count) + + +def _score(partial: Iterable[Predicate]) -> float: + return sum(p.cover_count for p in partial) + + +def _suppress_recursion_wrapper(func): + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except RecursionError: + warnings.warn("Recursion limit eached while searching for predicates") + + return wrapper + + +def search(candidates, target: int, max_calls: int) -> Partial: + calls = max_calls + + cheapest_score = float("inf") + cheapest: Partial = () + + original_cover = candidates.copy() + + def _covered(partial: Partial) -> int: + return ( + len(frozenset.union(*(original_cover[p] for p in partial))) + if partial + else 0 + ) + + @_suppress_recursion_wrapper + def walk(candidates: Cover, partial: Partial = ()) -> None: + nonlocal calls + nonlocal cheapest + nonlocal cheapest_score + + if calls <= 0: + return + + calls -= 1 + + covered = _covered(partial) + score = _score(partial) + + if covered < target: + window = cheapest_score - score + candidates = { + p: cover for p, cover in candidates.items() if p.cover_count < window + } + + reachable = _reachable(candidates) + covered + + if candidates and reachable >= target: + order_by = functools.partial(_order_by, candidates) + best = max(candidates, key=order_by) + + remaining = _uncovered_by(candidates, candidates[best]) + walk(remaining, partial + (best,)) + del remaining + + reduced = _remove_dominated(candidates, best) + walk(reduced, partial) + del reduced + + elif score < cheapest_score: + cheapest = partial + cheapest_score = score + + walk(candidates) + return cheapest diff --git a/dedupe/training.py b/dedupe/training.py index b38d5c732..56a078149 100644 --- a/dedupe/training.py +++ b/dedupe/training.py @@ -10,10 +10,10 @@ from typing import TYPE_CHECKING, overload from warnings import warn -from . import blocking +from . import blocking, branch_and_bound if TYPE_CHECKING: - from typing import Any, Iterable, Mapping, Sequence + from typing import Iterable, Sequence from ._typing import ( ComparisonCover, @@ -75,8 +75,7 @@ def learn( else: raise ValueError("candidate_type is not valid") - searcher = BranchBound(target_cover, 2500) - final_predicates = searcher.search(candidate_cover) + final_predicates = branch_and_bound.search(candidate_cover, target_cover, 2500) logger.info("Final predicate set:") for predicate in final_predicates: @@ -329,113 +328,6 @@ def coveredPairs(self, blocker, records_1, records_2): return pair_cover -class BranchBound(object): - def __init__(self, target: int, max_calls: int) -> None: - self.target: int = target - self.calls: int = max_calls - - self.cheapest_score: float = float("inf") - self.original_cover: Cover = {} - self.cheapest: tuple[Predicate, ...] = () - - def search( - self, candidates: Cover, partial: tuple[Predicate, ...] = () - ) -> tuple[Predicate, ...]: - if self.calls <= 0: - return self.cheapest - - if not self.original_cover: - self.original_cover = candidates.copy() - - self.calls -= 1 - - covered = self.covered(partial) - score = self.score(partial) - - if covered >= self.target: - if score < self.cheapest_score: - self.cheapest = partial - self.cheapest_score = score - - else: - window = self.cheapest_score - score - - candidates = { - p: cover for p, cover in candidates.items() if p.cover_count < window - } - - reachable = self.reachable(candidates) + covered - - if candidates and reachable >= self.target: - order_by = functools.partial(self.order_by, candidates) - - best = max(candidates, key=order_by) - - remaining = self.uncovered_by(candidates, candidates[best]) - try: - self.search(remaining, partial + (best,)) - except RecursionError: - return self.cheapest - - del remaining - - reduced = self.remove_dominated(candidates, best) - - try: - self.search(reduced, partial) - except RecursionError: - return self.cheapest - - del reduced - - return self.cheapest - - @staticmethod - def order_by( - candidates: Mapping[Predicate, Sequence[Any]], p: Predicate - ) -> tuple[int, float]: - return (len(candidates[p]), -p.cover_count) - - @staticmethod - def score(partial: Iterable[Predicate]) -> float: - return sum(p.cover_count for p in partial) - - def covered(self, partial: tuple[Predicate, ...]) -> int: - if partial: - return len(frozenset.union(*(self.original_cover[p] for p in partial))) - else: - return 0 - - @staticmethod - def reachable(dupe_cover: Mapping[Any, frozenset[int]]) -> int: - if dupe_cover: - return len(frozenset.union(*dupe_cover.values())) - else: - return 0 - - @staticmethod - def remove_dominated(coverage: Cover, dominator: Predicate) -> Cover: - dominant_cover = coverage[dominator] - - for pred, cover in coverage.copy().items(): - if dominator.cover_count <= pred.cover_count and dominant_cover >= cover: - del coverage[pred] - - return coverage - - @staticmethod - def uncovered_by( - coverage: Mapping[Any, frozenset[int]], covered: frozenset[int] - ) -> dict[Any, frozenset[int]]: - remaining = {} - for predicate, uncovered in coverage.items(): - still_uncovered = uncovered - covered - if still_uncovered: - remaining[predicate] = still_uncovered - - return remaining - - class InfiniteSet(object): def __and__(self, item): return item diff --git a/pyproject.toml b/pyproject.toml index 5f1e7d721..508cac74e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,6 @@ dependencies = [ "scikit-learn", "affinegap>=1.3", "categorical-distance>=1.9", - "dedupe-variable-datetime", "numpy>=1.20", "doublemetaphone", "highered>=0.2.0", From 9e70bebf028d8462f34bc8b65a34ace892573b9e Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Mon, 18 Dec 2023 16:40:17 -0500 Subject: [PATCH 02/11] better name for decorator --- dedupe/branch_and_bound.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dedupe/branch_and_bound.py b/dedupe/branch_and_bound.py index f9ee0dba6..ef49830a4 100644 --- a/dedupe/branch_and_bound.py +++ b/dedupe/branch_and_bound.py @@ -44,7 +44,7 @@ def _score(partial: Iterable[Predicate]) -> float: return sum(p.cover_count for p in partial) -def _suppress_recursion_wrapper(func): +def _suppress_recursion_error(func): def wrapper(*args, **kwargs): try: return func(*args, **kwargs) @@ -69,7 +69,7 @@ def _covered(partial: Partial) -> int: else 0 ) - @_suppress_recursion_wrapper + @_suppress_recursion_error def walk(candidates: Cover, partial: Partial = ()) -> None: nonlocal calls nonlocal cheapest From 487930d1b90b2d1a8955639a49defefd465c272f Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Mon, 18 Dec 2023 16:43:11 -0500 Subject: [PATCH 03/11] add os option --- .readthedocs.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.readthedocs.yml b/.readthedocs.yml index f41ab7436..50a7541f8 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -3,6 +3,12 @@ # Required version: 2 +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.12" + # Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/conf.py From 3bf9964a76bc3d17caa5104d31b4501aeed25a46 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Mon, 18 Dec 2023 16:44:28 -0500 Subject: [PATCH 04/11] add os option --- .readthedocs.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index 50a7541f8..08ac5176d 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -22,7 +22,6 @@ formats: all # Optionally set the version of Python and requirements required to build your docs python: - version: 3.7 install: - requirements: docs/requirements.txt - method: pip From 97ed385b645e278ed29d7ea82be5d5435d22218d Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Mon, 18 Dec 2023 16:47:00 -0500 Subject: [PATCH 05/11] remove future mention --- tests/test_predicates.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_predicates.py b/tests/test_predicates.py index b491dc704..bcae4fa60 100644 --- a/tests/test_predicates.py +++ b/tests/test_predicates.py @@ -1,7 +1,5 @@ import unittest -from future.builtins import str - from dedupe import predicates From 037286eaae83249d4085152a30e571f8fa52c322 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Mon, 18 Dec 2023 17:26:45 -0500 Subject: [PATCH 06/11] Update test_predicate_functions.py --- tests/test_predicate_functions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_predicate_functions.py b/tests/test_predicate_functions.py index ac2ce6728..9567eb7d8 100644 --- a/tests/test_predicate_functions.py +++ b/tests/test_predicate_functions.py @@ -1,7 +1,5 @@ import unittest -from future.builtins import str - from dedupe import predicate_functions as fn from dedupe.cpredicates import ngrams From 307adb477aad73f01c687aa0bf6738ff1faf2cac Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Mon, 18 Dec 2023 19:34:38 -0500 Subject: [PATCH 07/11] remove future mentions --- tests/test_blocking.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/test_blocking.py b/tests/test_blocking.py index 59a1f674e..6e7af98d8 100644 --- a/tests/test_blocking.py +++ b/tests/test_blocking.py @@ -1,8 +1,6 @@ import unittest from collections import defaultdict -from future.utils import viewitems, viewvalues - import dedupe @@ -54,7 +52,7 @@ def test_unconstrained_inverted_index(self): [dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")] ) - blocker.index(set(record["name"] for record in viewvalues(self.data_d)), "name") + blocker.index(set(record["name"] for record in self.data_d.values()), "name") blocks = defaultdict(set) @@ -87,13 +85,13 @@ def setUp(self): self.records_1 = dict( (record_id, record) - for record_id, record in viewitems(data_d) + for record_id, record in data_d.items() if record["dataset"] == 0 ) self.fields_2 = dict( (record_id, record["name"]) - for record_id, record in viewitems(data_d) + for record_id, record in data_d.items() if record["dataset"] == 1 ) From 830dd21d0346d740919d0cde2ddba232f1e2b261 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Mon, 18 Dec 2023 19:37:31 -0500 Subject: [PATCH 08/11] adjust tests --- tests/test_training.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_training.py b/tests/test_training.py index 52500134e..b908dde0c 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -1,6 +1,7 @@ import unittest import dedupe +import dedupe.branch_and_bound as branch_and_bound import dedupe.training as training @@ -67,8 +68,8 @@ def test_uncovered_by(self): before_copy = before.copy() - assert training.BranchBound.uncovered_by(before, frozenset()) == before - assert training.BranchBound.uncovered_by(before, frozenset({3})) == after + assert branch_and_bound._uncovered_by(before, frozenset()) == before + assert branch_and_bound._uncovered_by(before, frozenset({3})) == after assert before == before_copy def test_covered_pairs(self): From 69a710c4384c9d0ada5941b3fe404a315e34f5a2 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Mon, 18 Dec 2023 19:42:33 -0500 Subject: [PATCH 09/11] typing --- dedupe/branch_and_bound.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dedupe/branch_and_bound.py b/dedupe/branch_and_bound.py index ef49830a4..ad814c2c8 100644 --- a/dedupe/branch_and_bound.py +++ b/dedupe/branch_and_bound.py @@ -1,11 +1,11 @@ import functools import warnings -from typing import Any, Iterable, Mapping, Sequence +from typing import Any, Iterable, Mapping, Sequence, Tuple from ._typing import Cover from .predicates import Predicate -Partial = tuple[Predicate, ...] +Partial = Tuple[Predicate, ...] def _reachable(dupe_cover: Mapping[Any, frozenset[int]]) -> int: From 4ec39779f1f21e7159290d5d9162978aa3a48ebf Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Mon, 18 Dec 2023 19:47:04 -0500 Subject: [PATCH 10/11] typing --- dedupe/branch_and_bound.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dedupe/branch_and_bound.py b/dedupe/branch_and_bound.py index ad814c2c8..eaa8c17e2 100644 --- a/dedupe/branch_and_bound.py +++ b/dedupe/branch_and_bound.py @@ -1,11 +1,13 @@ +from __future__ import annotations + import functools import warnings -from typing import Any, Iterable, Mapping, Sequence, Tuple +from typing import Any, Iterable, Mapping, Sequence from ._typing import Cover from .predicates import Predicate -Partial = Tuple[Predicate, ...] +Partial = tuple[Predicate, ...] def _reachable(dupe_cover: Mapping[Any, frozenset[int]]) -> int: From 3b159a264c8589451153b54d5584071bccd4a2dd Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Mon, 18 Dec 2023 19:50:46 -0500 Subject: [PATCH 11/11] typing --- dedupe/branch_and_bound.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dedupe/branch_and_bound.py b/dedupe/branch_and_bound.py index eaa8c17e2..6a6004504 100644 --- a/dedupe/branch_and_bound.py +++ b/dedupe/branch_and_bound.py @@ -2,12 +2,12 @@ import functools import warnings -from typing import Any, Iterable, Mapping, Sequence +from typing import Any, Iterable, Mapping, Sequence, Tuple from ._typing import Cover from .predicates import Predicate -Partial = tuple[Predicate, ...] +Partial = Tuple[Predicate, ...] def _reachable(dupe_cover: Mapping[Any, frozenset[int]]) -> int: