Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: closure branch and bound #1174

Merged
merged 11 commits into from
Dec 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .readthedocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
# Required
version: 2

# Set the OS, Python version and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.12"

# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py
Expand All @@ -16,7 +22,6 @@ formats: all

# Optionally set the version of Python and requirements required to build your docs
python:
version: 3.7
install:
- requirements: docs/requirements.txt
- method: pip
Expand Down
22 changes: 15 additions & 7 deletions dedupe/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
from pkgutil import extend_path

__path__ = extend_path(__path__, __name__)

from dedupe._init import * # noqa
from dedupe.api import ( # noqa: F401

Check warning on line 1 in dedupe/__init__.py

View check run for this annotation

Codecov / codecov/patch

dedupe/__init__.py#L1

Added line #L1 was not covered by tests
Dedupe,
Gazetteer,
RecordLink,
StaticDedupe,
StaticGazetteer,
StaticRecordLink,
)
from dedupe.convenience import ( # noqa: F401

Check warning on line 9 in dedupe/__init__.py

View check run for this annotation

Codecov / codecov/patch

dedupe/__init__.py#L9

Added line #L9 was not covered by tests
canonicalize,
console_label,
training_data_dedupe,
training_data_link,
)
from dedupe.serializer import read_training, write_training # noqa: F401

Check warning on line 15 in dedupe/__init__.py

View check run for this annotation

Codecov / codecov/patch

dedupe/__init__.py#L15

Added line #L15 was not covered by tests
15 changes: 0 additions & 15 deletions dedupe/_init.py

This file was deleted.

113 changes: 113 additions & 0 deletions dedupe/branch_and_bound.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
from __future__ import annotations

Check warning on line 1 in dedupe/branch_and_bound.py

View check run for this annotation

Codecov / codecov/patch

dedupe/branch_and_bound.py#L1

Added line #L1 was not covered by tests

import functools
import warnings
from typing import Any, Iterable, Mapping, Sequence, Tuple

Check warning on line 5 in dedupe/branch_and_bound.py

View check run for this annotation

Codecov / codecov/patch

dedupe/branch_and_bound.py#L3-L5

Added lines #L3 - L5 were not covered by tests

from ._typing import Cover
from .predicates import Predicate

Check warning on line 8 in dedupe/branch_and_bound.py

View check run for this annotation

Codecov / codecov/patch

dedupe/branch_and_bound.py#L7-L8

Added lines #L7 - L8 were not covered by tests

Partial = Tuple[Predicate, ...]

Check warning on line 10 in dedupe/branch_and_bound.py

View check run for this annotation

Codecov / codecov/patch

dedupe/branch_and_bound.py#L10

Added line #L10 was not covered by tests


def _reachable(dupe_cover: Mapping[Any, frozenset[int]]) -> int:
return len(frozenset.union(*dupe_cover.values())) if dupe_cover else 0

Check warning on line 14 in dedupe/branch_and_bound.py

View check run for this annotation

Codecov / codecov/patch

dedupe/branch_and_bound.py#L13-L14

Added lines #L13 - L14 were not covered by tests


def _remove_dominated(coverage: Cover, dominator: Predicate) -> Cover:
dominant_cover = coverage[dominator]

Check warning on line 18 in dedupe/branch_and_bound.py

View check run for this annotation

Codecov / codecov/patch

dedupe/branch_and_bound.py#L17-L18

Added lines #L17 - L18 were not covered by tests

return {

Check warning on line 20 in dedupe/branch_and_bound.py

View check run for this annotation

Codecov / codecov/patch

dedupe/branch_and_bound.py#L20

Added line #L20 was not covered by tests
pred: cover
for pred, cover in coverage.items()
if not (dominator.cover_count <= pred.cover_count and dominant_cover >= cover)
}


def _uncovered_by(

Check warning on line 27 in dedupe/branch_and_bound.py

View check run for this annotation

Codecov / codecov/patch

dedupe/branch_and_bound.py#L27

Added line #L27 was not covered by tests
coverage: Mapping[Any, frozenset[int]], covered: frozenset[int]
) -> dict[Any, frozenset[int]]:
remaining = {}
for predicate, uncovered in coverage.items():
still_uncovered = uncovered - covered
if still_uncovered:
remaining[predicate] = still_uncovered

Check warning on line 34 in dedupe/branch_and_bound.py

View check run for this annotation

Codecov / codecov/patch

dedupe/branch_and_bound.py#L30-L34

Added lines #L30 - L34 were not covered by tests

return remaining

Check warning on line 36 in dedupe/branch_and_bound.py

View check run for this annotation

Codecov / codecov/patch

dedupe/branch_and_bound.py#L36

Added line #L36 was not covered by tests


def _order_by(

Check warning on line 39 in dedupe/branch_and_bound.py

View check run for this annotation

Codecov / codecov/patch

dedupe/branch_and_bound.py#L39

Added line #L39 was not covered by tests
candidates: Mapping[Predicate, Sequence[Any]], p: Predicate
) -> tuple[int, float]:
return (len(candidates[p]), -p.cover_count)

Check warning on line 42 in dedupe/branch_and_bound.py

View check run for this annotation

Codecov / codecov/patch

dedupe/branch_and_bound.py#L42

Added line #L42 was not covered by tests


def _score(partial: Iterable[Predicate]) -> float:
return sum(p.cover_count for p in partial)

Check warning on line 46 in dedupe/branch_and_bound.py

View check run for this annotation

Codecov / codecov/patch

dedupe/branch_and_bound.py#L45-L46

Added lines #L45 - L46 were not covered by tests


def _suppress_recursion_error(func):
def wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except RecursionError:
warnings.warn("Recursion limit eached while searching for predicates")

Check warning on line 54 in dedupe/branch_and_bound.py

View check run for this annotation

Codecov / codecov/patch

dedupe/branch_and_bound.py#L49-L54

Added lines #L49 - L54 were not covered by tests

return wrapper

Check warning on line 56 in dedupe/branch_and_bound.py

View check run for this annotation

Codecov / codecov/patch

dedupe/branch_and_bound.py#L56

Added line #L56 was not covered by tests


def search(candidates, target: int, max_calls: int) -> Partial:
    """Branch-and-bound search for a cheap set of predicates covering ``target`` items.

    Args:
        candidates: Mapping from predicate to the frozenset of item ids it
            covers.  It is copied up front, so the caller's mapping is only
            read.
        target: Number of distinct items a set of predicates must cover to
            count as a solution.
        max_calls: Budget of ``walk`` invocations; once exhausted, the search
            stops expanding.

    Returns:
        The lowest-scoring tuple of predicates found that covers at least
        ``target`` items, or ``()`` if none was found within the budget.
    """
    budget = max_calls

    best_score = float("inf")
    best_partial: Partial = ()

    # Coverage is always measured against the untouched original sets, since
    # the mappings handed down the recursion have covered items stripped out.
    full_cover = candidates.copy()

    def _covered(partial: Partial) -> int:
        if not partial:
            return 0
        return len(frozenset.union(*(full_cover[p] for p in partial)))

    @_suppress_recursion_error
    def walk(candidates: Cover, partial: Partial = ()) -> None:
        nonlocal budget
        nonlocal best_partial
        nonlocal best_score

        if budget <= 0:
            return

        budget -= 1

        covered = _covered(partial)
        score = _score(partial)

        if covered >= target:
            # A complete solution: keep it only if it beats the incumbent.
            if score < best_score:
                best_partial = partial
                best_score = score
            return

        # Bound: a predicate whose cost alone uses up the remaining window
        # cannot lead to a solution cheaper than the incumbent.
        window = best_score - score
        candidates = {
            p: cover for p, cover in candidates.items() if p.cover_count < window
        }

        reachable = _reachable(candidates) + covered

        if not candidates or reachable < target:
            return

        best = max(candidates, key=functools.partial(_order_by, candidates))

        # Branch 1: take ``best`` and continue with what it leaves uncovered.
        remaining = _uncovered_by(candidates, candidates[best])
        walk(remaining, partial + (best,))
        del remaining

        # Branch 2: skip ``best`` along with everything it dominates.
        reduced = _remove_dominated(candidates, best)
        walk(reduced, partial)
        del reduced

    walk(candidates)
    return best_partial

Check warning on line 113 in dedupe/branch_and_bound.py

View check run for this annotation

Codecov / codecov/patch

dedupe/branch_and_bound.py#L112-L113

Added lines #L112 - L113 were not covered by tests
114 changes: 3 additions & 111 deletions dedupe/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
from typing import TYPE_CHECKING, overload
from warnings import warn

from . import blocking
from . import blocking, branch_and_bound

Check warning on line 13 in dedupe/training.py

View check run for this annotation

Codecov / codecov/patch

dedupe/training.py#L13

Added line #L13 was not covered by tests

if TYPE_CHECKING:
from typing import Any, Iterable, Mapping, Sequence
from typing import Iterable, Sequence

Check warning on line 16 in dedupe/training.py

View check run for this annotation

Codecov / codecov/patch

dedupe/training.py#L16

Added line #L16 was not covered by tests

from ._typing import (
ComparisonCover,
Expand Down Expand Up @@ -75,8 +75,7 @@
else:
raise ValueError("candidate_type is not valid")

searcher = BranchBound(target_cover, 2500)
final_predicates = searcher.search(candidate_cover)
final_predicates = branch_and_bound.search(candidate_cover, target_cover, 2500)

Check warning on line 78 in dedupe/training.py

View check run for this annotation

Codecov / codecov/patch

dedupe/training.py#L78

Added line #L78 was not covered by tests

logger.info("Final predicate set:")
for predicate in final_predicates:
Expand Down Expand Up @@ -329,113 +328,6 @@
return pair_cover


class BranchBound(object):
    """Branch-and-bound search for a cheap set of predicates reaching a target cover.

    An instance carries the search state (remaining call budget, best solution
    so far and its score) across the recursive ``search`` calls.
    """

    def __init__(self, target: int, max_calls: int) -> None:
        """Set the coverage ``target`` and the budget of ``search`` calls."""
        self.target: int = target
        # Remaining number of ``search`` invocations; decremented on each call.
        self.calls: int = max_calls

        self.cheapest_score: float = float("inf")
        # Snapshot of the first non-empty candidate mapping passed to
        # ``search``; ``covered`` measures coverage against it.
        self.original_cover: Cover = {}
        self.cheapest: tuple[Predicate, ...] = ()

    def search(
        self, candidates: Cover, partial: tuple[Predicate, ...] = ()
    ) -> tuple[Predicate, ...]:
        """Explore ``candidates`` and return the cheapest covering set found.

        Args:
            candidates: Mapping from predicate to the frozenset of item ids
                it (still) covers.
            partial: Predicates already chosen on the current branch.

        Returns:
            The lowest-scoring tuple of predicates found so far that covers at
            least ``self.target`` items, or ``()`` if none has been found.
        """
        if self.calls <= 0:
            return self.cheapest

        if not self.original_cover:
            self.original_cover = candidates.copy()

        self.calls -= 1

        covered = self.covered(partial)
        score = self.score(partial)

        if covered >= self.target:
            # A complete solution: keep it only if it beats the incumbent.
            if score < self.cheapest_score:
                self.cheapest = partial
                self.cheapest_score = score
            return self.cheapest

        # Bound: drop predicates whose cost alone uses up the window left
        # before the incumbent's score.
        window = self.cheapest_score - score
        candidates = {
            p: cover for p, cover in candidates.items() if p.cover_count < window
        }

        reachable = self.reachable(candidates) + covered

        if not candidates or reachable < self.target:
            return self.cheapest

        best = max(candidates, key=functools.partial(self.order_by, candidates))

        # Branch 1: take ``best`` and continue with what it leaves uncovered.
        remaining = self.uncovered_by(candidates, candidates[best])
        try:
            self.search(remaining, partial + (best,))
        except RecursionError:
            return self.cheapest

        del remaining

        # Branch 2: skip ``best`` along with everything it dominates.
        reduced = self.remove_dominated(candidates, best)
        try:
            self.search(reduced, partial)
        except RecursionError:
            return self.cheapest

        del reduced

        return self.cheapest

    @staticmethod
    def order_by(
        candidates: Mapping[Predicate, Sequence[Any]], p: Predicate
    ) -> tuple[int, float]:
        """Sort key for ``max``: larger coverage first, then lower ``cover_count``."""
        return (len(candidates[p]), -p.cover_count)

    @staticmethod
    def score(partial: Iterable[Predicate]) -> float:
        """Return the summed ``cover_count`` of the predicates in ``partial``."""
        return sum(p.cover_count for p in partial)

    def covered(self, partial: tuple[Predicate, ...]) -> int:
        """Return how many distinct items ``partial`` covers in ``original_cover``."""
        # An empty ``partial`` yields the empty union, i.e. 0.
        return len(frozenset().union(*(self.original_cover[p] for p in partial)))

    @staticmethod
    def reachable(dupe_cover: Mapping[Any, frozenset[int]]) -> int:
        """Return the number of distinct items covered by any entry of ``dupe_cover``."""
        return len(frozenset().union(*dupe_cover.values()))

    @staticmethod
    def remove_dominated(coverage: Cover, dominator: Predicate) -> Cover:
        """Return ``coverage`` without the entries dominated by ``dominator``.

        An entry is dominated when ``dominator`` costs no more than it and
        covers a superset of its items; the dominator itself qualifies and is
        dropped too.  A new dict is returned and ``coverage`` is not mutated.
        """
        dominant_cover = coverage[dominator]

        return {
            pred: cover
            for pred, cover in coverage.items()
            if not (
                dominator.cover_count <= pred.cover_count and dominant_cover >= cover
            )
        }

    @staticmethod
    def uncovered_by(
        coverage: Mapping[Any, frozenset[int]], covered: frozenset[int]
    ) -> dict[Any, frozenset[int]]:
        """Return ``coverage`` with ``covered`` items removed and empty entries dropped."""
        remaining = {}
        for predicate, uncovered in coverage.items():
            still_uncovered = uncovered - covered
            if still_uncovered:
                remaining[predicate] = still_uncovered

        return remaining


class InfiniteSet(object):
def __and__(self, item):
return item
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ dependencies = [
"scikit-learn",
"affinegap>=1.3",
"categorical-distance>=1.9",
"dedupe-variable-datetime",
"numpy>=1.20",
"doublemetaphone",
"highered>=0.2.0",
Expand Down
8 changes: 3 additions & 5 deletions tests/test_blocking.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import unittest
from collections import defaultdict

from future.utils import viewitems, viewvalues

import dedupe


Expand Down Expand Up @@ -54,7 +52,7 @@ def test_unconstrained_inverted_index(self):
[dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")]
)

blocker.index(set(record["name"] for record in viewvalues(self.data_d)), "name")
blocker.index(set(record["name"] for record in self.data_d.values()), "name")

blocks = defaultdict(set)

Expand Down Expand Up @@ -87,13 +85,13 @@ def setUp(self):

self.records_1 = dict(
(record_id, record)
for record_id, record in viewitems(data_d)
for record_id, record in data_d.items()
if record["dataset"] == 0
)

self.fields_2 = dict(
(record_id, record["name"])
for record_id, record in viewitems(data_d)
for record_id, record in data_d.items()
if record["dataset"] == 1
)

Expand Down
2 changes: 0 additions & 2 deletions tests/test_predicate_functions.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import unittest

from future.builtins import str

from dedupe import predicate_functions as fn
from dedupe.cpredicates import ngrams

Expand Down
2 changes: 0 additions & 2 deletions tests/test_predicates.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import unittest

from future.builtins import str

from dedupe import predicates


Expand Down
5 changes: 3 additions & 2 deletions tests/test_training.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import unittest

import dedupe
import dedupe.branch_and_bound as branch_and_bound
import dedupe.training as training


Expand Down Expand Up @@ -67,8 +68,8 @@ def test_uncovered_by(self):

before_copy = before.copy()

assert training.BranchBound.uncovered_by(before, frozenset()) == before
assert training.BranchBound.uncovered_by(before, frozenset({3})) == after
assert branch_and_bound._uncovered_by(before, frozenset()) == before
assert branch_and_bound._uncovered_by(before, frozenset({3})) == after
assert before == before_copy

def test_covered_pairs(self):
Expand Down
Loading