Skip to content

Commit

Permalink
parseratorvariable for new dedupe pattern
Browse files Browse the repository at this point in the history
  • Loading branch information
fgregg committed Jun 26, 2024
1 parent 0d5100e commit 0c6b65c
Show file tree
Hide file tree
Showing 4 changed files with 174 additions and 107 deletions.
42 changes: 42 additions & 0 deletions .github/workflows/python-publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# CI + release workflow.
# NOTE(review): the file is named python-publish.yml but the workflow is named
# "tests" and runs on every push — publishing usually belongs behind a tag or
# release trigger; confirm the intent before changing `on:`.
name: tests

on: [push]

jobs:
  test:
    # 3.10 failures are tolerated without failing the whole matrix.
    continue-on-error: ${{ matrix.python-version == '3.10' }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [windows-latest, macos-latest, ubuntu-latest]
        python-version: [3.8, 3.9, "3.10", "3.11", "3.12"]

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e .

  wheels:
    needs: test
    name: Build wheels
    runs-on: ubuntu-latest
    steps:
      # Match the action versions used by the test job (v1/v2 are deprecated).
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
      - name: Build sdist
        run: |
          pip install wheel setuptools build
          python -m build
      - name: Publish wheels to PyPI
        env:
          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: |
          pip install twine
          # --skip-existing tolerates re-runs against an already-published
          # version instead of blanket continue-on-error, which also hid
          # genuine upload failures (bad credentials, malformed dist, etc.).
          twine upload --skip-existing dist/*
202 changes: 110 additions & 92 deletions parseratorvariable/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from collections import OrderedDict
import functools
from collections import OrderedDict

import numpy
from dedupe.variables.base import DerivedType
from dedupe.variables.string import BaseStringType, StringType, crfEd, affineGap
from dedupe.variables.string import BaseStringType, StringType, affineGap, crfEd
from probableparsing import RepeatedLabelError

from . import predicates
Expand All @@ -13,27 +13,42 @@
except ImportError:
from backports.functools_lru_cache import lru_cache

class ParseratorType(BaseStringType) :

class ParseratorType(BaseStringType):
type = None

_predicate_functions = StringType._predicate_functions
_index_predicates = StringType._index_predicates
_index_thresholds = StringType._index_thresholds
_partial_index_predicates = (predicates.PTNCPredicate, predicates.PTNSPredicate,
predicates.PTTCPredicate, predicates.PTTSPredicate)

def __len__(self) :
_partial_index_predicates = (
predicates.PTNCPredicate,
predicates.PTNSPredicate,
predicates.PTTCPredicate,
predicates.PTTSPredicate,
)

def __len__(self):
return self.expanded_size

def __init__(self, definition, tagger=None, block_parts=()) :
super(ParseratorType, self).__init__(definition)
def __init__(
self, field, tagger=None, block_parts=(), crf=False, log_file=None, **kwargs
):
super().__init__(field, **kwargs)

if definition.get('crf', False) == True :
if crf:
self._string_comparison = crfEd
else :
else:
self._string_comparison = affineGap

self._definition = definition
# setting up some information for pickling this variable
self._definition = {
"field": field,
"tagger": tagger,
"block_parts": block_parts,
"crf": crf,
"log_file": log_file,
}
self._definition.update(**kwargs)

self.variable_types, self.variable_parts = comparisons(self.components)

Expand All @@ -42,22 +57,24 @@ def __init__(self, definition, tagger=None, block_parts=()) :

# missing? + ambiguous? + same_type? + len(indicator) + ...
# + full_string
self.expanded_size = (1 + 1 + 1 + self.n_type_indicators
+ 2 * self.n_parts + 1)
self.expanded_size = 1 + 1 + 1 + self.n_type_indicators + 2 * self.n_parts + 1

fields = self.fields(definition['field'])

self.higher_vars = [DerivedType({'name' : variable,
'type' : field_type})
for variable, field_type in fields]
fields = self.fields(field)

self.log_file = definition.get('log file', None)
self.higher_vars = [
DerivedType(variable, field_type)
for variable, field_type in fields
]

self.log_file = log_file

self._tagger = tagger

for part in block_parts:
for pred in self._predicate_functions:
partial_pred = predicates.PartialString(pred, self.field, part, self.tag)
partial_pred = predicates.PartialString(
pred, self.field, part, self.tag
)
self.predicates.append(partial_pred)
for pred in self._partial_index_predicates:
for threshold in self._index_thresholds:
Expand All @@ -73,164 +90,165 @@ def tag(self, field, *args):
try:
result = self._tagger(field, *args)
except RepeatedLabelError as e:
if self.log_file :
if self.log_file:
import csv
with open(self.log_file, 'a') as f :

with open(self.log_file, "a") as f:
writer = csv.writer(f)
writer.writerow([e.original_string.encode('utf8')])
writer.writerow([e.original_string.encode("utf8")])
result = None

return result


def __getstate__(self) :
def __getstate__(self):
return self._definition.copy()

def __setstate__(self, d) :
self.__init__(d)

def __setstate__(self, d):
self.__init__(**d)

def comparator(self, field_1, field_2) :
def comparator(self, field_1, field_2):
distances = numpy.zeros(self.expanded_size)
i = 0

if not (field_1 and field_2) :
if not (field_1 and field_2):
return distances

distances[i] = 1
i += 1

try :
parsed_variable_1, variable_type_1 = self.tagger(field_1)
parsed_variable_2, variable_type_2 = self.tagger(field_2)
try:
parsed_variable_1, variable_type_1 = self.tagger(field_1)
parsed_variable_2, variable_type_2 = self.tagger(field_2)
except TypeError:
distances[i:3] = [1, 0]
distances[-1] = self.compareString(field_1, field_2)
return distances

if 'Ambiguous' in (variable_type_1, variable_type_2) :
if "Ambiguous" in (variable_type_1, variable_type_2):
distances[i:3] = [1, 0]
distances[-1] = self.compareString(field_1, field_2)
return distances
elif variable_type_1 != variable_type_2 :
elif variable_type_1 != variable_type_2:
distances[i:3] = [0, 0]
distances[-1] = self.compareString(field_1, field_2)
return distances
elif variable_type_1 == variable_type_2 :
elif variable_type_1 == variable_type_2:
distances[i:3] = [0, 1]

i += 2

variable_type = self.variable_types[variable_type_1]

distances[i:i+self.n_type_indicators] = variable_type['indicator']
distances[i : i + self.n_type_indicators] = variable_type["indicator"]
i += self.n_type_indicators

i += variable_type['offset']
for j, dist in enumerate(variable_type['compare'](parsed_variable_1,
parsed_variable_2),
i) :
i += variable_type["offset"]
for j, dist in enumerate(
variable_type["compare"](parsed_variable_1, parsed_variable_2), i
):
distances[j] = dist

unobserved_parts = numpy.isnan(distances[i:j+1])
distances[i:j+1][unobserved_parts] = 0
unobserved_parts = numpy.isnan(distances[i : j + 1])
distances[i : j + 1][unobserved_parts] = 0
unobserved_parts = (~unobserved_parts).astype(int)
distances[(i + self.n_parts):(j + 1 + self.n_parts)] = unobserved_parts
distances[(i + self.n_parts) : (j + 1 + self.n_parts)] = unobserved_parts

return distances

def fields(self, field) :
fields = [('%s: Not Missing' % field, 'Dummy'),
('ambiguous', 'Dummy'),
('same name type?', 'Dummy')]
def fields(self, field):
fields = [
("%s: Not Missing" % field, "Dummy"),
("ambiguous", "Dummy"),
("same name type?", "Dummy"),
]

fields += [(k.lower(), 'Dummy')
for k in list(self.variable_types.keys())[1:]]
fields += [(k.lower(), "Dummy") for k in list(self.variable_types.keys())[1:]]

fields += [(part, 'Derived')
for part in self.variable_parts]
fields += [(part, "Derived") for part in self.variable_parts]

fields += [('%s: Not Missing' % (part,),
'Not Missing')
for part in self.variable_parts]
fields += [
("%s: Not Missing" % (part,), "Not Missing") for part in self.variable_parts
]

fields += [('full string', 'String')]
fields += [("full string", "String")]

return fields

def compareFields(self, parts, field_1, field_2) :
joinParts = functools.partial(consolidate, components=parts)
for part_1, part_2 in zip(*map(joinParts, [field_1, field_2])) :
def compareFields(self, parts, field_1, field_2):
joinParts = functools.partial(consolidate, components=parts)
for part_1, part_2 in zip(*map(joinParts, [field_1, field_2])):
yield self.compareString(part_1, part_2)

def comparePermutable(self, tags_1, tags_2, field_1, field_2) :
def comparePermutable(self, tags_1, tags_2, field_1, field_2):

section_1A = tuple(consolidate(field_1, tags_1))
section_1B = tuple(consolidate(field_1, tags_2))
whole_2 = tuple(consolidate(field_2, tags_1 + tags_2))

straight_distances = [self.compareString(part_1, part_2)
for part_1, part_2
in zip(section_1A + section_1B, whole_2)]
straight_distances = [
self.compareString(part_1, part_2)
for part_1, part_2 in zip(section_1A + section_1B, whole_2)
]

permuted_distances = [self.compareString(part_1, part_2)
for part_1, part_2
in zip(section_1B + section_1A, whole_2)]
permuted_distances = [
self.compareString(part_1, part_2)
for part_1, part_2 in zip(section_1B + section_1A, whole_2)
]

if numpy.nansum(straight_distances) <= numpy.nansum(permuted_distances) :
if numpy.nansum(straight_distances) <= numpy.nansum(permuted_distances):
return straight_distances
else :
else:
return permuted_distances

def compareString(self, string_1, string_2) :
if string_1 and string_2 :
def compareString(self, string_1, string_2):
if string_1 and string_2:
return self._string_comparison(string_1, string_2)
else :
else:
return numpy.nan


def comparisons(components) :
def comparisons(components):
variable_types = OrderedDict()
tag_names = []
offset = 0

n_components = len(components)
for i, component in enumerate(components) :

for i, component in enumerate(components):
key, compare_func, parts = component[0], component[1], component[2:]

args = []
for part in parts :
for part in parts:
names, tags = list(zip(*part))
tag_names.extend(names)
args.append(tags)

comparison = {'compare' : functools.partial(compare_func, *args),
'indicator' : indicatorVector(i, n_components),
'offset' : offset }
comparison = {
"compare": functools.partial(compare_func, *args),
"indicator": indicatorVector(i, n_components),
"offset": offset,
}

variable_types[key] = comparison

offset = len(tag_names)

return variable_types, tag_names


return variable_types, tag_names


def consolidate(d, components) :
for component in components :
def consolidate(d, components):
for component in components:
available_component = [part for part in component if part in d]
# Sometimes we want to return non strings so we have to avoid
# join
if len(available_component) == 1 :
if len(available_component) == 1:
yield d[available_component[0]]
else :
yield ' '.join(d[part] for part in available_component)
else:
yield " ".join(d[part] for part in available_component)



def indicatorVector(value, n_categories) :
response = numpy.zeros(n_categories-1)
if value :
def indicatorVector(value, n_categories):
response = numpy.zeros(n_categories - 1)
if value:
response[value - 1] = 1
return response
Loading

0 comments on commit 0c6b65c

Please sign in to comment.