diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
new file mode 100644
index 0000000..ba914b9
--- /dev/null
+++ b/.github/workflows/python-publish.yml
@@ -0,0 +1,42 @@
+name: tests
+
+on: [push]
+
+jobs:
+  test:
+    continue-on-error: ${{ matrix.python-version == '3.10' }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [windows-latest, macos-latest, ubuntu-latest]
+        python-version: [3.8, 3.9, "3.10", "3.11", "3.12"]
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -e .
+  wheels:
+    needs: test
+    name: Build wheels
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v1
+      - uses: actions/setup-python@v2
+      - name: Build sdist
+        run: |
+          pip install wheel setuptools build
+          python -m build
+      - name: Publish wheels to PyPI
+        env:
+          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+        run: |
+          pip install twine
+          twine upload dist/*
+        continue-on-error: true
diff --git a/parseratorvariable/__init__.py b/parseratorvariable/__init__.py
index 37da9b8..8c63d1d 100644
--- a/parseratorvariable/__init__.py
+++ b/parseratorvariable/__init__.py
@@ -1,9 +1,9 @@
-from collections import OrderedDict
 import functools
+from collections import OrderedDict
 
 import numpy
 from dedupe.variables.base import DerivedType
-from dedupe.variables.string import BaseStringType, StringType, crfEd, affineGap
+from dedupe.variables.string import BaseStringType, StringType, affineGap, crfEd
 from probableparsing import RepeatedLabelError
 
 from . import predicates
@@ -13,27 +13,42 @@
 except ImportError:
     from backports.functools_lru_cache import lru_cache
 
-class ParseratorType(BaseStringType) :
+
+class ParseratorType(BaseStringType):
     type = None
 
     _predicate_functions = StringType._predicate_functions
     _index_predicates = StringType._index_predicates
     _index_thresholds = StringType._index_thresholds
-    _partial_index_predicates = (predicates.PTNCPredicate, predicates.PTNSPredicate,
-                                 predicates.PTTCPredicate, predicates.PTTSPredicate)
-
-    def __len__(self) :
+    _partial_index_predicates = (
+        predicates.PTNCPredicate,
+        predicates.PTNSPredicate,
+        predicates.PTTCPredicate,
+        predicates.PTTSPredicate,
+    )
+
+    def __len__(self):
         return self.expanded_size
 
-    def __init__(self, definition, tagger=None, block_parts=()) :
-        super(ParseratorType, self).__init__(definition)
+    def __init__(
+        self, field, tagger=None, block_parts=(), crf=False, log_file=None, **kwargs
+    ):
+        super().__init__(field, **kwargs)
 
-        if definition.get('crf', False) == True :
+        if crf:
             self._string_comparison = crfEd
-        else :
+        else:
             self._string_comparison = affineGap
 
-        self._definition = definition
+        # setting up some information for pickling this variable
+        self._definition = {
+            "field": field,
+            "tagger": tagger,
+            "block_parts": block_parts,
+            "crf": crf,
+            "log_file": log_file,
+        }
+        self._definition.update(**kwargs)
 
         self.variable_types, self.variable_parts = comparisons(self.components)
@@ -42,22 +57,24 @@ def __init__(self, definition, tagger=None, block_parts=()) :
         # missing? + ambiguous? + same_type? + len(indicator) + ...
         # + full_string
-        self.expanded_size = (1 + 1 + 1 + self.n_type_indicators
-                              + 2 * self.n_parts + 1)
+        self.expanded_size = 1 + 1 + 1 + self.n_type_indicators + 2 * self.n_parts + 1
 
-        fields = self.fields(definition['field'])
-
-        self.higher_vars = [DerivedType({'name' : variable,
-                                         'type' : field_type})
-                            for variable, field_type in fields]
+        fields = self.fields(field)
 
-        self.log_file = definition.get('log file', None)
+        self.higher_vars = [
+            DerivedType(variable, field_type)
+            for variable, field_type in fields
+        ]
+
+        self.log_file = log_file
         self._tagger = tagger
 
         for part in block_parts:
             for pred in self._predicate_functions:
-                partial_pred = predicates.PartialString(pred, self.field, part, self.tag)
+                partial_pred = predicates.PartialString(
+                    pred, self.field, part, self.tag
+                )
                 self.predicates.append(partial_pred)
             for pred in self._partial_index_predicates:
                 for threshold in self._index_thresholds:
@@ -73,164 +90,165 @@ def tag(self, field, *args):
         try:
             result = self._tagger(field, *args)
         except RepeatedLabelError as e:
-            if self.log_file :
+            if self.log_file:
                 import csv
-                with open(self.log_file, 'a') as f :
+
+                with open(self.log_file, "a") as f:
                     writer = csv.writer(f)
-                    writer.writerow([e.original_string.encode('utf8')])
+                    writer.writerow([e.original_string.encode("utf8")])
             result = None
 
         return result
 
-
-    def __getstate__(self) :
+    def __getstate__(self):
         return self._definition.copy()
 
-    def __setstate__(self, d) :
-        self.__init__(d)
-
+    def __setstate__(self, d):
+        self.__init__(**d)
 
-    def comparator(self, field_1, field_2) :
+    def comparator(self, field_1, field_2):
         distances = numpy.zeros(self.expanded_size)
         i = 0
 
-        if not (field_1 and field_2) :
+        if not (field_1 and field_2):
             return distances
-
+
         distances[i] = 1
         i += 1
 
-        try :
-            parsed_variable_1, variable_type_1 = self.tagger(field_1)
-            parsed_variable_2, variable_type_2 = self.tagger(field_2)
+        try:
+            parsed_variable_1, variable_type_1 = self.tagger(field_1)
+            parsed_variable_2, variable_type_2 = self.tagger(field_2)
         except TypeError:
             distances[i:3] = [1, 0]
             distances[-1] = self.compareString(field_1, field_2)
             return distances
 
-        if 'Ambiguous' in (variable_type_1, variable_type_2) :
+        if "Ambiguous" in (variable_type_1, variable_type_2):
             distances[i:3] = [1, 0]
             distances[-1] = self.compareString(field_1, field_2)
             return distances
-        elif variable_type_1 != variable_type_2 :
+        elif variable_type_1 != variable_type_2:
             distances[i:3] = [0, 0]
             distances[-1] = self.compareString(field_1, field_2)
             return distances
-        elif variable_type_1 == variable_type_2 :
+        elif variable_type_1 == variable_type_2:
             distances[i:3] = [0, 1]
             i += 2
 
             variable_type = self.variable_types[variable_type_1]
 
-            distances[i:i+self.n_type_indicators] = variable_type['indicator']
+            distances[i : i + self.n_type_indicators] = variable_type["indicator"]
             i += self.n_type_indicators
 
-            i += variable_type['offset']
+            i += variable_type["offset"]
 
-            for j, dist in enumerate(variable_type['compare'](parsed_variable_1,
-                                                              parsed_variable_2),
-                                      i) :
+            for j, dist in enumerate(
+                variable_type["compare"](parsed_variable_1, parsed_variable_2), i
+            ):
                 distances[j] = dist
 
-            unobserved_parts = numpy.isnan(distances[i:j+1])
-            distances[i:j+1][unobserved_parts] = 0
+            unobserved_parts = numpy.isnan(distances[i : j + 1])
+            distances[i : j + 1][unobserved_parts] = 0
             unobserved_parts = (~unobserved_parts).astype(int)
-            distances[(i + self.n_parts):(j + 1 + self.n_parts)] = unobserved_parts
+            distances[(i + self.n_parts) : (j + 1 + self.n_parts)] = unobserved_parts
 
         return distances
 
-    def fields(self, field) :
-        fields = [('%s: Not Missing' % field, 'Dummy'),
-                  ('ambiguous', 'Dummy'),
-                  ('same name type?', 'Dummy')]
+    def fields(self, field):
+        fields = [
+            ("%s: Not Missing" % field, "Dummy"),
+            ("ambiguous", "Dummy"),
+            ("same name type?", "Dummy"),
+        ]
 
-        fields += [(k.lower(), 'Dummy')
-                   for k in list(self.variable_types.keys())[1:]]
+        fields += [(k.lower(), "Dummy") for k in list(self.variable_types.keys())[1:]]
 
-        fields += [(part, 'Derived')
-                   for part in self.variable_parts]
+        fields += [(part, "Derived") for part in self.variable_parts]
 
-        fields += [('%s: Not Missing' % (part,),
-                    'Not Missing')
-                   for part in self.variable_parts]
+        fields += [
+            ("%s: Not Missing" % (part,), "Not Missing") for part in self.variable_parts
+        ]
 
-        fields += [('full string', 'String')]
+        fields += [("full string", "String")]
 
         return fields
 
-    def compareFields(self, parts, field_1, field_2) :
-        joinParts = functools.partial(consolidate, components=parts)
-        for part_1, part_2 in zip(*map(joinParts, [field_1, field_2])) :
+    def compareFields(self, parts, field_1, field_2):
+        joinParts = functools.partial(consolidate, components=parts)
+        for part_1, part_2 in zip(*map(joinParts, [field_1, field_2])):
             yield self.compareString(part_1, part_2)
 
-    def comparePermutable(self, tags_1, tags_2, field_1, field_2) :
+    def comparePermutable(self, tags_1, tags_2, field_1, field_2):
         section_1A = tuple(consolidate(field_1, tags_1))
         section_1B = tuple(consolidate(field_1, tags_2))
         whole_2 = tuple(consolidate(field_2, tags_1 + tags_2))
 
-        straight_distances = [self.compareString(part_1, part_2)
-                              for part_1, part_2
-                              in zip(section_1A + section_1B, whole_2)]
+        straight_distances = [
+            self.compareString(part_1, part_2)
+            for part_1, part_2 in zip(section_1A + section_1B, whole_2)
+        ]
 
-        permuted_distances = [self.compareString(part_1, part_2)
-                              for part_1, part_2
-                              in zip(section_1B + section_1A, whole_2)]
+        permuted_distances = [
+            self.compareString(part_1, part_2)
+            for part_1, part_2 in zip(section_1B + section_1A, whole_2)
+        ]
 
-        if numpy.nansum(straight_distances) <= numpy.nansum(permuted_distances) :
+        if numpy.nansum(straight_distances) <= numpy.nansum(permuted_distances):
             return straight_distances
-        else :
+        else:
             return permuted_distances
 
-    def compareString(self, string_1, string_2) :
-        if string_1 and string_2 :
+    def compareString(self, string_1, string_2):
+        if string_1 and string_2:
             return self._string_comparison(string_1, string_2)
-        else :
+        else:
             return numpy.nan
 
 
-def comparisons(components) :
+def comparisons(components):
     variable_types = OrderedDict()
     tag_names = []
 
     offset = 0
     n_components = len(components)
-
-    for i, component in enumerate(components) :
+
+    for i, component in enumerate(components):
         key, compare_func, parts = component[0], component[1], component[2:]
         args = []
-        for part in parts :
+        for part in parts:
             names, tags = list(zip(*part))
             tag_names.extend(names)
             args.append(tags)
 
-        comparison = {'compare' : functools.partial(compare_func, *args),
-                      'indicator' : indicatorVector(i, n_components),
-                      'offset' : offset }
+        comparison = {
+            "compare": functools.partial(compare_func, *args),
+            "indicator": indicatorVector(i, n_components),
+            "offset": offset,
+        }
 
         variable_types[key] = comparison
 
         offset = len(tag_names)
-
-    return variable_types, tag_names
-
+
+    return variable_types, tag_names
+
 
-def consolidate(d, components) :
-    for component in components :
+def consolidate(d, components):
+    for component in components:
         available_component = [part for part in component if part in d]
         # Sometimes we want to return non strings so we have to avoid
         # join
-        if len(available_component) == 1 :
+        if len(available_component) == 1:
             yield d[available_component[0]]
-        else :
-            yield ' '.join(d[part] for part in available_component)
+        else:
+            yield " ".join(d[part] for part in available_component)
+
 
-
-def indicatorVector(value, n_categories) :
-    response = numpy.zeros(n_categories-1)
-    if value :
+def indicatorVector(value, n_categories):
+    response = numpy.zeros(n_categories - 1)
+    if value:
         response[value - 1] = 1
     return response
diff --git a/parseratorvariable/predicates.py b/parseratorvariable/predicates.py
index 15f9131..3ecbddd 100644
--- a/parseratorvariable/predicates.py
+++ b/parseratorvariable/predicates.py
@@ -1,43 +1,51 @@
-from probableparsing import RepeatedLabelError
 from dedupe import predicates
+from probableparsing import RepeatedLabelError
+
 
 class PartialIndex(object):
     def __init__(self, *args, **kwargs):
-        self.part = kwargs.pop('part')
-        self.tag = kwargs.pop('tag')
+        self.part = kwargs.pop("part")
+        self.tag = kwargs.pop("tag")
         super(PartialIndex, self).__init__(*args, **kwargs)
-        self.__name__ = '(%s, %s, %s)' % (self.threshold, self.field, self.part)
-
+        self.__name__ = "(%s, %s, %s)" % (self.threshold, self.field, self.part)
+
     def preprocess(self, doc):
         try:
             tags, _ = self.tag(doc)
         except TypeError:
-            part = ''
+            part = ""
         else:
-            part = tags.get(self.part, '')
+            part = tags.get(self.part, "")
 
         return super(PartialIndex, self).preprocess(part)
 
+
 class PLCPredicate(PartialIndex, predicates.LevenshteinCanopyPredicate):
     type = "PartialIndexLevenshteinCanopyPredicate"
 
+
 class PLSPredicate(PartialIndex, predicates.LevenshteinSearchPredicate):
     type = "PartialIndexLevenshteinSearchPredicate"
 
+
 class PTNCPredicate(PartialIndex, predicates.TfidfNGramCanopyPredicate):
     type = "PartialIndexTfidfNGramCanopyPredicate"
 
+
 class PTNSPredicate(PartialIndex, predicates.TfidfNGramSearchPredicate):
     type = "PartialIndexTfidfNGramSearchPredicate"
 
+
 class PTTCPredicate(PartialIndex, predicates.TfidfTextCanopyPredicate):
     type = "PartialIndexTfidfTextCanopyPredicate"
 
+
 class PTTSPredicate(PartialIndex, predicates.TfidfTextSearchPredicate):
     type = "PartialIndexTfidfTextSearchPredicate"
 
+
 class PartialString(predicates.StringPredicate):
-    type = 'PartialPredicate'
-
+    type = "PartialPredicate"
+
     def __init__(self, func, field, part, tag):
         self.func = func
         self.__name__ = "(%s, %s, %s)" % (func.__name__, field, part)
@@ -45,9 +53,9 @@ def __init__(self, func, field, part, tag):
         self.part = part
         self.tag = tag
 
-    def __call__(self, record, **kwargs) :
+    def __call__(self, record, **kwargs):
         column = record[self.field]
-        if not column :
+        if not column:
             return ()
 
         try:
@@ -55,6 +63,6 @@ def __call__(self, record, **kwargs) :
         except TypeError:
             return ()
         else:
-            part = tags.get(self.part, '')
+            part = tags.get(self.part, "")
 
             return super(PartialString, self).__call__({self.field: part})
diff --git a/setup.py b/setup.py
index f798e56..d5561e6 100644
--- a/setup.py
+++ b/setup.py
@@ -6,11 +6,10 @@
 setup(
     name='parseratorvariable',
     url='https://github.com/datamade/parseratorvariables',
-    version='0.0.17',
+    version='1.0.0',
     description='Structured variable type for dedupe',
     packages=['parseratorvariable'],
-    install_requires=['dedupe>=1.0.0',
-                      "backports.functools_lru_cache; python_version<'3.2'",
+    install_requires=['dedupe>=3.0.0',
                       'numpy',
                       'probableparsing'],
     license='The MIT License: http://www.opensource.org/licenses/mit-license.php'
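
Note for reviewers: the breaking change in this patch is the constructor. ParseratorType
subclasses are no longer built from a definition dict; the field name and options are now
keyword arguments, which __getstate__/__setstate__ replay when pickling. A minimal sketch
of the new calling convention follows. MyParsedType, MY_COMPONENTS, and my_tagger are
hypothetical stand-ins for a downstream parserator variable, not part of this repository:

    import pickle

    from parseratorvariable import ParseratorType

    class MyParsedType(ParseratorType):
        type = "MyParsed"
        components = MY_COMPONENTS  # (key, compare_func, parts...) tuples, as consumed by comparisons()

    # before (0.x): MyParsedType({'field': 'address', 'crf': True, 'log file': 'errors.csv'})
    # after (1.0):
    variable = MyParsedType("address", tagger=my_tagger, crf=True, log_file="errors.csv")

    # __getstate__ returns the saved kwargs and __setstate__ re-runs __init__(**d),
    # so instances should round-trip through pickle:
    restored = pickle.loads(pickle.dumps(variable))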
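
For the two module-level helpers touched above, a quick sanity check; the expected values
below are worked out by hand from the code in this diff, not taken from a test run:

    from parseratorvariable import consolidate, indicatorVector

    # indicatorVector one-hot encodes a component index against n - 1
    # reference-coded categories; index 0 is the all-zeros baseline.
    indicatorVector(0, 4)  # array([0., 0., 0.])
    indicatorVector(2, 4)  # array([0., 1., 0.])

    # consolidate joins the tagged parts present in a parse, one string per
    # component group; a lone part is yielded as-is, without joining.
    list(consolidate({"city": "Chicago", "state": "IL"}, [("city", "state")]))
    # -> ["Chicago IL"]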