From 05da6cdd9df02b55002d44e8a87da39de9d5cfe6 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Wed, 7 Feb 2024 14:50:41 +0100 Subject: [PATCH 1/2] ruff format . && ruff --select=I --fix . --- benchmark/enron.py | 7 +- benchmark/marc21.py | 9 +- benchmark/reuters.py | 3 +- docs/source/conf.py | 4 +- scripts/make_checkpoint.py | 8 +- scripts/read_checkpoint.py | 2 +- src/whoosh/analysis/__init__.py | 6 +- src/whoosh/analysis/acore.py | 29 +- src/whoosh/analysis/analyzers.py | 20 +- src/whoosh/analysis/filters.py | 3 +- src/whoosh/analysis/intraword.py | 3 +- src/whoosh/analysis/ngrams.py | 8 +- src/whoosh/analysis/tokenizers.py | 9 +- src/whoosh/automata/fsa.py | 3 +- src/whoosh/automata/fst.py | 31 +- src/whoosh/automata/glob.py | 1 - src/whoosh/automata/reg.py | 1 - src/whoosh/classify.py | 4 +- src/whoosh/codec/base.py | 3 +- src/whoosh/codec/plaintext.py | 14 +- src/whoosh/codec/whoosh2.py | 58 +- src/whoosh/codec/whoosh3.py | 35 +- src/whoosh/collectors.py | 1 - src/whoosh/columns.py | 11 +- src/whoosh/externalsort.py | 4 +- src/whoosh/fields.py | 19 +- src/whoosh/filedb/compound.py | 4 +- src/whoosh/filedb/fileindex.py | 4 +- src/whoosh/filedb/filepostings.py | 8 +- src/whoosh/filedb/filereading.py | 15 +- src/whoosh/filedb/filestore.py | 7 +- src/whoosh/filedb/filetables.py | 10 +- src/whoosh/filedb/filewriting.py | 15 +- src/whoosh/filedb/gae.py | 2 +- src/whoosh/filedb/structfile.py | 43 +- src/whoosh/formats.py | 17 +- src/whoosh/highlight.py | 13 +- src/whoosh/idsets.py | 1 - src/whoosh/index.py | 6 +- src/whoosh/lang/__init__.py | 115 +- src/whoosh/lang/isri.py | 361 ++- src/whoosh/lang/lovins.py | 178 +- src/whoosh/lang/morph_en.py | 6 +- src/whoosh/lang/paicehusk.py | 39 +- src/whoosh/lang/porter.py | 77 +- src/whoosh/lang/porter2.py | 233 +- src/whoosh/lang/snowball/__init__.py | 32 +- src/whoosh/lang/snowball/bases.py | 14 +- src/whoosh/lang/snowball/danish.py | 4 +- src/whoosh/lang/snowball/dutch.py | 4 +- src/whoosh/lang/snowball/english.py | 4 +- src/whoosh/lang/snowball/finnish.py | 4 +- src/whoosh/lang/snowball/french.py | 4 +- src/whoosh/lang/snowball/german.py | 4 +- src/whoosh/lang/snowball/italian.py | 4 +- src/whoosh/lang/snowball/norwegian.py | 4 +- src/whoosh/lang/snowball/portugese.py | 4 +- src/whoosh/lang/snowball/romanian.py | 4 +- src/whoosh/lang/snowball/spanish.py | 4 +- src/whoosh/lang/snowball/swedish.py | 4 +- src/whoosh/lang/stopwords.py | 105 +- src/whoosh/lang/wordnet.py | 2 +- src/whoosh/matching/__init__.py | 4 +- src/whoosh/matching/combo.py | 1 + src/whoosh/matching/mcore.py | 4 +- src/whoosh/multiproc.py | 5 +- src/whoosh/qparser/common.py | 3 +- src/whoosh/qparser/dateparse.py | 13 +- src/whoosh/qparser/default.py | 3 +- src/whoosh/qparser/plugins.py | 5 +- src/whoosh/qparser/syntax.py | 7 +- src/whoosh/qparser/taggers.py | 1 - src/whoosh/query/__init__.py | 10 +- src/whoosh/query/compound.py | 2 +- src/whoosh/query/positional.py | 5 +- src/whoosh/query/qcore.py | 5 +- src/whoosh/query/ranges.py | 2 +- src/whoosh/query/spans.py | 5 +- src/whoosh/query/terms.py | 1 + src/whoosh/query/wrappers.py | 1 + src/whoosh/reading.py | 8 +- src/whoosh/scoring.py | 2 +- src/whoosh/searching.py | 5 +- src/whoosh/sorting.py | 4 +- src/whoosh/spelling.py | 3 +- src/whoosh/support/base85.py | 1 - src/whoosh/support/bench.py | 16 +- src/whoosh/support/bitstream.py | 1 - src/whoosh/support/bitvector.py | 391 ++- src/whoosh/support/charset.py | 4 +- src/whoosh/support/pyparsing.py | 3485 +++++++++++++++---------- src/whoosh/support/relativedelta.py | 381 +-- 
src/whoosh/support/unicode.py | 1 - src/whoosh/system.py | 1 - src/whoosh/util/__init__.py | 8 +- src/whoosh/util/cache.py | 2 +- src/whoosh/util/filelock.py | 10 +- src/whoosh/util/numeric.py | 26 +- src/whoosh/util/numlists.py | 13 +- src/whoosh/util/text.py | 6 +- src/whoosh/util/varints.py | 1 - src/whoosh/writing.py | 5 +- stress/test_bigindex.py | 2 +- stress/test_bigsort.py | 6 +- stress/test_bigtable.py | 5 +- stress/test_hugeindex.py | 2 +- stress/test_threading.py | 7 +- stress/test_update.py | 2 +- tests/test_analysis.py | 6 +- tests/test_automata.py | 3 +- tests/test_codecs.py | 9 +- tests/test_collector.py | 1 - tests/test_columns.py | 18 +- tests/test_dateparse.py | 1 - tests/test_fields.py | 1 - tests/test_flexible.py | 2 +- tests/test_highlighting.py | 3 +- tests/test_indexing.py | 12 +- tests/test_matching.py | 7 +- tests/test_misc.py | 11 +- tests/test_mpwriter.py | 6 +- tests/test_parse_plugins.py | 5 +- tests/test_parsing.py | 2 +- tests/test_postings.py | 13 +- tests/test_quality.py | 5 +- tests/test_queries.py | 1 - tests/test_reading.py | 10 +- tests/test_results.py | 5 +- tests/test_searching.py | 7 +- tests/test_sorting.py | 9 +- tests/test_spans.py | 6 +- tests/test_spelling.py | 2 +- tests/test_stem.py | 2 +- tests/test_weightings.py | 9 +- tests/test_writing.py | 8 +- 135 files changed, 3761 insertions(+), 2529 deletions(-) diff --git a/benchmark/enron.py b/benchmark/enron.py index 0c3d5d41..f3447a4a 100644 --- a/benchmark/enron.py +++ b/benchmark/enron.py @@ -1,5 +1,7 @@ from __future__ import division -import os.path, tarfile + +import os.path +import tarfile from email import message_from_string from marshal import dump, load from zlib import compress, decompress @@ -10,11 +12,10 @@ pass from whoosh import analysis, fields -from whoosh.compat import urlretrieve, next +from whoosh.compat import next, urlretrieve from whoosh.support.bench import Bench, Spec from whoosh.util import now - # Benchmark class diff --git a/benchmark/marc21.py b/benchmark/marc21.py index 49c04277..a2f89ab0 100644 --- a/benchmark/marc21.py +++ b/benchmark/marc21.py @@ -1,11 +1,14 @@ -from __future__ import with_statement, print_function -import fnmatch, logging, os.path, re +from __future__ import print_function, with_statement + +import fnmatch +import logging +import os.path +import re from whoosh import analysis, fields, index, qparser, query, scoring from whoosh.compat import range from whoosh.util import now - log = logging.getLogger(__name__) diff --git a/benchmark/reuters.py b/benchmark/reuters.py index 0aaa3276..dde05363 100644 --- a/benchmark/reuters.py +++ b/benchmark/reuters.py @@ -1,4 +1,5 @@ -import gzip, os.path +import gzip +import os.path from whoosh import analysis, fields, index, qparser, query from whoosh.support.bench import Bench, Spec diff --git a/docs/source/conf.py b/docs/source/conf.py index 77011a03..a8a3f8ae 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,4 +1,6 @@ -import sys, os, os.path +import os +import os.path +import sys sys.path.append(os.path.abspath("../../src")) import whoosh diff --git a/scripts/make_checkpoint.py b/scripts/make_checkpoint.py index d4318b65..2553bbac 100644 --- a/scripts/make_checkpoint.py +++ b/scripts/make_checkpoint.py @@ -4,12 +4,14 @@ # version of Whoosh from __future__ import print_function, with_statement -import os.path, random, sys + +import os.path +import random +import sys from datetime import datetime from whoosh import fields, index -from whoosh.compat import u, range - +from whoosh.compat import 
range, u if len(sys.argv) < 2: print("USAGE: make_checkpoint.py ") diff --git a/scripts/read_checkpoint.py b/scripts/read_checkpoint.py index 1385637d..2f75df53 100644 --- a/scripts/read_checkpoint.py +++ b/scripts/read_checkpoint.py @@ -3,12 +3,12 @@ # Read a "checkpoint" index, to check backwards compatibility from __future__ import print_function, with_statement + import sys from whoosh import index, query from whoosh.compat import u - if len(sys.argv) < 2: print("USAGE: read_checkpoint.py ") sys.exit(1) diff --git a/src/whoosh/analysis/__init__.py b/src/whoosh/analysis/__init__.py index 66293bc1..cce0ae43 100644 --- a/src/whoosh/analysis/__init__.py +++ b/src/whoosh/analysis/__init__.py @@ -61,9 +61,9 @@ """ from whoosh.analysis.acore import * -from whoosh.analysis.tokenizers import * +from whoosh.analysis.analyzers import * from whoosh.analysis.filters import * -from whoosh.analysis.morph import * from whoosh.analysis.intraword import * +from whoosh.analysis.morph import * from whoosh.analysis.ngrams import * -from whoosh.analysis.analyzers import * +from whoosh.analysis.tokenizers import * diff --git a/src/whoosh/analysis/acore.py b/src/whoosh/analysis/acore.py index b1a01493..f6ccee3d 100644 --- a/src/whoosh/analysis/acore.py +++ b/src/whoosh/analysis/acore.py @@ -27,23 +27,24 @@ from whoosh.compat import iteritems - # Exceptions + class CompositionError(Exception): pass # Utility functions + def unstopped(tokenstream): - """Removes tokens from a token stream where token.stopped = True. - """ + """Removes tokens from a token stream where token.stopped = True.""" return (t for t in tokenstream if not t.stopped) -def entoken(textstream, positions=False, chars=False, start_pos=0, - start_char=0, **kwargs): +def entoken( + textstream, positions=False, chars=False, start_pos=0, start_char=0, **kwargs +): """Takes a sequence of unicode strings and yields a series of Token objects (actually the same Token object over and over, for performance reasons), with the attributes filled in with reasonable values (for example, if @@ -72,6 +73,7 @@ def entoken(textstream, positions=False, chars=False, start_pos=0, # Token object + class Token(object): """ Represents a "token" (usually a word) extracted from the source text being @@ -101,8 +103,9 @@ def RemoveDuplicatesFilter(self, stream): ...or, call token.copy() to get a copy of the token object. """ - def __init__(self, positions=False, chars=False, removestops=True, mode='', - **kwargs): + def __init__( + self, positions=False, chars=False, removestops=True, mode="", **kwargs + ): """ :param positions: Whether tokens should have the token position in the 'pos' attribute. 
@@ -123,8 +126,9 @@ def __init__(self, positions=False, chars=False, removestops=True, mode='', self.__dict__.update(kwargs) def __repr__(self): - parms = ", ".join(f"{name}={value!r}" - for name, value in iteritems(self.__dict__)) + parms = ", ".join( + f"{name}={value!r}" for name, value in iteritems(self.__dict__) + ) return f"{self.__class__.__name__}({parms})" def copy(self): @@ -134,6 +138,7 @@ def copy(self): # Composition support + class Composable(object): is_morph = False @@ -147,9 +152,9 @@ def __or__(self, other): def __repr__(self): attrs = "" if self.__dict__: - attrs = ", ".join(f"{key}={value!r}" - for key, value - in iteritems(self.__dict__)) + attrs = ", ".join( + f"{key}={value!r}" for key, value in iteritems(self.__dict__) + ) return self.__class__.__name__ + f"({attrs})" def has_morph(self): diff --git a/src/whoosh/analysis/analyzers.py b/src/whoosh/analysis/analyzers.py index 5c218fdf..1738be18 100644 --- a/src/whoosh/analysis/analyzers.py +++ b/src/whoosh/analysis/analyzers.py @@ -26,19 +26,19 @@ # policies, either expressed or implied, of Matt Chaput. from whoosh.analysis.acore import Composable, CompositionError -from whoosh.analysis.tokenizers import Tokenizer -from whoosh.analysis.filters import LowercaseFilter -from whoosh.analysis.filters import StopFilter, STOP_WORDS -from whoosh.analysis.morph import StemFilter +from whoosh.analysis.filters import STOP_WORDS, LowercaseFilter, StopFilter from whoosh.analysis.intraword import IntraWordFilter -from whoosh.analysis.tokenizers import default_pattern -from whoosh.analysis.tokenizers import CommaSeparatedTokenizer -from whoosh.analysis.tokenizers import IDTokenizer -from whoosh.analysis.tokenizers import RegexTokenizer -from whoosh.analysis.tokenizers import SpaceSeparatedTokenizer +from whoosh.analysis.morph import StemFilter +from whoosh.analysis.tokenizers import ( + CommaSeparatedTokenizer, + IDTokenizer, + RegexTokenizer, + SpaceSeparatedTokenizer, + Tokenizer, + default_pattern, +) from whoosh.lang.porter import stem - # Analyzers diff --git a/src/whoosh/analysis/filters.py b/src/whoosh/analysis/filters.py index 8818b373..6a134549 100644 --- a/src/whoosh/analysis/filters.py +++ b/src/whoosh/analysis/filters.py @@ -29,11 +29,10 @@ from itertools import chain -from whoosh.compat import next from whoosh.analysis.acore import Composable +from whoosh.compat import next from whoosh.util.text import rcompile - # Default list of stop words (words so common it's usually wasteful to index # them). This list is used by the StopFilter class, which allows you to supply # an optional list to override this one. diff --git a/src/whoosh/analysis/intraword.py b/src/whoosh/analysis/intraword.py index 85355f11..a6f4b28d 100644 --- a/src/whoosh/analysis/intraword.py +++ b/src/whoosh/analysis/intraword.py @@ -28,9 +28,8 @@ import re from collections import deque -from whoosh.compat import u, text_type -from whoosh.compat import range from whoosh.analysis.filters import Filter +from whoosh.compat import range, text_type, u class CompoundWordFilter(Filter): diff --git a/src/whoosh/analysis/ngrams.py b/src/whoosh/analysis/ngrams.py index 47819c73..5236fb5a 100644 --- a/src/whoosh/analysis/ngrams.py +++ b/src/whoosh/analysis/ngrams.py @@ -25,12 +25,10 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
-from whoosh.compat import text_type -from whoosh.compat import range from whoosh.analysis.acore import Token from whoosh.analysis.filters import Filter, LowercaseFilter -from whoosh.analysis.tokenizers import Tokenizer, RegexTokenizer - +from whoosh.analysis.tokenizers import RegexTokenizer, Tokenizer +from whoosh.compat import range, text_type # Tokenizer @@ -79,7 +77,7 @@ def __call__( start_pos=0, start_char=0, mode="", - **kwargs + **kwargs, ): assert isinstance(value, text_type), f"{value!r} is not unicode" diff --git a/src/whoosh/analysis/tokenizers.py b/src/whoosh/analysis/tokenizers.py index e2654a23..b76e7df5 100644 --- a/src/whoosh/analysis/tokenizers.py +++ b/src/whoosh/analysis/tokenizers.py @@ -25,11 +25,10 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from whoosh.compat import u, text_type from whoosh.analysis.acore import Composable, Token +from whoosh.compat import text_type, u from whoosh.util.text import rcompile - default_pattern = rcompile(r"[\w\*]+(\.?[\w\*]+)*") @@ -62,7 +61,7 @@ def __call__( start_pos=0, start_char=0, mode="", - **kwargs + **kwargs, ): assert isinstance(value, text_type), f"{value!r} is not unicode" t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs) @@ -117,7 +116,7 @@ def __call__( start_char=0, tokenize=True, mode="", - **kwargs + **kwargs, ): """ :param value: The unicode string to tokenize. @@ -250,7 +249,7 @@ def __call__( start_char=0, tokenize=True, mode="", - **kwargs + **kwargs, ): """ :param value: The unicode string to tokenize. diff --git a/src/whoosh/automata/fsa.py b/src/whoosh/automata/fsa.py index c9e1f547..c621c1fc 100644 --- a/src/whoosh/automata/fsa.py +++ b/src/whoosh/automata/fsa.py @@ -5,8 +5,7 @@ import sys from bisect import bisect_left -from whoosh.compat import iteritems, next, text_type, unichr, range - +from whoosh.compat import iteritems, next, range, text_type, unichr unull = unichr(0) diff --git a/src/whoosh/automata/fst.py b/src/whoosh/automata/fst.py index 29ea24fa..36f1d1b7 100644 --- a/src/whoosh/automata/fst.py +++ b/src/whoosh/automata/fst.py @@ -39,18 +39,33 @@ """ -import sys, copy +import copy +import sys from array import array from hashlib import sha1 # type: ignore @UnresolvedImport -from whoosh.compat import b, u, BytesIO -from whoosh.compat import range, iteritems, iterkeys, izip, array_tobytes -from whoosh.compat import bytes_type, text_type +from whoosh.compat import ( + BytesIO, + array_tobytes, + b, + bytes_type, + iteritems, + iterkeys, + izip, + range, + text_type, + u, +) from whoosh.filedb.structfile import StructFile -from whoosh.system import _INT_SIZE -from whoosh.system import pack_byte, pack_int, pack_uint, pack_long -from whoosh.system import emptybytes -from whoosh.util.text import utf8encode, utf8decode +from whoosh.system import ( + _INT_SIZE, + emptybytes, + pack_byte, + pack_int, + pack_long, + pack_uint, +) +from whoosh.util.text import utf8decode, utf8encode from whoosh.util.varints import varint diff --git a/src/whoosh/automata/glob.py b/src/whoosh/automata/glob.py index 32573afa..c41074c7 100644 --- a/src/whoosh/automata/glob.py +++ b/src/whoosh/automata/glob.py @@ -27,7 +27,6 @@ from whoosh.automata.fsa import ANY, EPSILON, NFA - # Constants for glob _LIT = 0 _STAR = 1 diff --git a/src/whoosh/automata/reg.py b/src/whoosh/automata/reg.py index f70f68b4..e60ab1c0 100644 --- a/src/whoosh/automata/reg.py +++ b/src/whoosh/automata/reg.py @@ -27,7 +27,6 @@ from 
whoosh.automata.fsa import ANY, EPSILON, NFA - # Operator precedence CHOICE = ("|",) ops = () diff --git a/src/whoosh/classify.py b/src/whoosh/classify.py index 37898c77..beab3462 100644 --- a/src/whoosh/classify.py +++ b/src/whoosh/classify.py @@ -30,12 +30,12 @@ """ from __future__ import division + import random from collections import defaultdict from math import log -from whoosh.compat import range, iteritems - +from whoosh.compat import iteritems, range # Expansion models diff --git a/src/whoosh/codec/base.py b/src/whoosh/codec/base.py index 56ce4bbc..ba1da0e5 100644 --- a/src/whoosh/codec/base.py +++ b/src/whoosh/codec/base.py @@ -33,12 +33,11 @@ from whoosh import columns from whoosh.automata import lev -from whoosh.compat import abstractmethod, izip, unichr, range +from whoosh.compat import abstractmethod, izip, range, unichr from whoosh.filedb.compound import CompoundStorage from whoosh.system import emptybytes from whoosh.util import random_name - # Exceptions diff --git a/src/whoosh/codec/plaintext.py b/src/whoosh/codec/plaintext.py index 9d5b3b84..e6b024d9 100644 --- a/src/whoosh/codec/plaintext.py +++ b/src/whoosh/codec/plaintext.py @@ -27,13 +27,21 @@ from ast import literal_eval -from whoosh.compat import b, bytes_type, text_type, integer_types, PY3 -from whoosh.compat import iteritems, dumps, loads, range from whoosh.codec import base +from whoosh.compat import ( + PY3, + b, + bytes_type, + dumps, + integer_types, + iteritems, + loads, + range, + text_type, +) from whoosh.matching import ListMatcher from whoosh.reading import TermInfo, TermNotFound - if not PY3: class memoryview: diff --git a/src/whoosh/codec/whoosh2.py b/src/whoosh/codec/whoosh2.py index 5d949672..0b5e4860 100644 --- a/src/whoosh/codec/whoosh2.py +++ b/src/whoosh/codec/whoosh2.py @@ -25,7 +25,8 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
-import struct, sys +import struct +import sys from array import array from binascii import crc32 from collections import defaultdict @@ -38,28 +39,48 @@ except ImportError: zlib = None -from whoosh.compat import b, PY3 -from whoosh.compat import loads, dumps -from whoosh.compat import range, iteritems -from whoosh.compat import bytes_type, text_type, string_type, integer_types -from whoosh.compat import array_frombytes, array_tobytes +from whoosh.automata.fst import GraphReader, GraphWriter from whoosh.codec import base +from whoosh.compat import ( + PY3, + array_frombytes, + array_tobytes, + b, + bytes_type, + dumps, + integer_types, + iteritems, + loads, + range, + string_type, + text_type, +) from whoosh.filedb.filestore import Storage -from whoosh.matching import ListMatcher, ReadTooFar, LeafMatcher +from whoosh.matching import LeafMatcher, ListMatcher, ReadTooFar from whoosh.reading import NoGraphError, TermInfo, TermNotFound -from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE, IS_LITTLE -from whoosh.system import emptybytes -from whoosh.system import pack_byte -from whoosh.system import pack_ushort, unpack_ushort, pack_long, unpack_long - -from whoosh.automata.fst import GraphWriter, GraphReader -from whoosh.util.numeric import byte_to_length, length_to_byte -from whoosh.util.numeric import to_sortable, from_sortable, NaN +from whoosh.system import ( + _FLOAT_SIZE, + _INT_SIZE, + _LONG_SIZE, + IS_LITTLE, + emptybytes, + pack_byte, + pack_long, + pack_ushort, + unpack_long, + unpack_ushort, +) +from whoosh.util.numeric import ( + NaN, + byte_to_length, + from_sortable, + length_to_byte, + to_sortable, +) from whoosh.util.numlists import GrowableArray -from whoosh.util.text import utf8encode, utf8decode +from whoosh.util.text import utf8decode, utf8encode from whoosh.util.times import datetime_to_long, long_to_datetime - # Old hash file implementations _4GB = 4 * 1024 * 1024 * 1024 @@ -2122,6 +2143,7 @@ def __init__(self, stored=False, unique=False): def to_text(self, x, shift=0): from datetime import datetime + from whoosh.util.times import floor try: @@ -2242,7 +2264,7 @@ def text_to_float(text, signed=True): # Functions for converting sortable representations to and from text. 
-from whoosh.support.base85 import to_base85, from_base85 +from whoosh.support.base85 import from_base85, to_base85 def sortable_int_to_text(x, shift=0): diff --git a/src/whoosh/codec/whoosh3.py b/src/whoosh/codec/whoosh3.py index 4216b8e9..16107445 100644 --- a/src/whoosh/codec/whoosh3.py +++ b/src/whoosh/codec/whoosh3.py @@ -34,18 +34,35 @@ from collections import defaultdict from whoosh import columns, formats -from whoosh.compat import b, bytes_type, string_type, integer_types -from whoosh.compat import dumps, loads, iteritems, range from whoosh.codec import base +from whoosh.compat import ( + b, + bytes_type, + dumps, + integer_types, + iteritems, + loads, + range, + string_type, +) from whoosh.filedb import compound, filetables -from whoosh.matching import ListMatcher, ReadTooFar, LeafMatcher +from whoosh.matching import LeafMatcher, ListMatcher, ReadTooFar from whoosh.reading import TermInfo, TermNotFound -from whoosh.system import emptybytes -from whoosh.system import _SHORT_SIZE, _INT_SIZE, _LONG_SIZE, _FLOAT_SIZE -from whoosh.system import pack_ushort, unpack_ushort -from whoosh.system import pack_int, unpack_int, pack_long, unpack_long -from whoosh.util.numlists import delta_encode, delta_decode -from whoosh.util.numeric import length_to_byte, byte_to_length +from whoosh.system import ( + _FLOAT_SIZE, + _INT_SIZE, + _LONG_SIZE, + _SHORT_SIZE, + emptybytes, + pack_int, + pack_long, + pack_ushort, + unpack_int, + unpack_long, + unpack_ushort, +) +from whoosh.util.numeric import byte_to_length, length_to_byte +from whoosh.util.numlists import delta_decode, delta_encode try: import zlib diff --git a/src/whoosh/collectors.py b/src/whoosh/collectors.py index 50b71c1e..939b3319 100644 --- a/src/whoosh/collectors.py +++ b/src/whoosh/collectors.py @@ -87,7 +87,6 @@ def collect(self, sub_docnum): from whoosh.searching import Results, TimeLimit from whoosh.util import now - # Functions diff --git a/src/whoosh/columns.py b/src/whoosh/columns.py index 5585f479..d15ec6c6 100644 --- a/src/whoosh/columns.py +++ b/src/whoosh/columns.py @@ -47,7 +47,9 @@ """ from __future__ import division, with_statement -import struct, warnings + +import struct +import warnings from array import array from bisect import bisect_right @@ -56,16 +58,13 @@ except ImportError: zlib = None -from whoosh.compat import b, bytes_type, BytesIO -from whoosh.compat import array_tobytes, range -from whoosh.compat import dumps, loads +from whoosh.compat import BytesIO, array_tobytes, b, bytes_type, dumps, loads, range from whoosh.filedb.structfile import StructFile from whoosh.idsets import BitSet, OnDiskBitSet from whoosh.system import emptybytes from whoosh.util.numeric import typecode_max, typecode_min from whoosh.util.numlists import GrowableArray -from whoosh.util.varints import varint, read_varint - +from whoosh.util.varints import read_varint, varint # Base classes diff --git a/src/whoosh/externalsort.py b/src/whoosh/externalsort.py index f339271e..510441a8 100644 --- a/src/whoosh/externalsort.py +++ b/src/whoosh/externalsort.py @@ -31,12 +31,12 @@ from __future__ import with_statement -import os, tempfile +import os +import tempfile from heapq import heapify, heappop, heapreplace from whoosh.compat import dump, load - ## Python 3.2 had a bug that make marshal.load unusable # if (hasattr(platform, "python_implementation") # and platform.python_implementation() == "CPython" diff --git a/src/whoosh/fields.py b/src/whoosh/fields.py index 0339bc45..763a5a02 100644 --- a/src/whoosh/fields.py +++ b/src/whoosh/fields.py @@ 
-29,22 +29,21 @@ Contains functions and classes related to fields. """ -import datetime, fnmatch, re, struct, sys +import datetime +import fnmatch +import re +import struct +import sys from array import array from decimal import Decimal from whoosh import analysis, columns, formats -from whoosh.compat import with_metaclass -from whoosh.compat import itervalues -from whoosh.compat import bytes_type, string_type, text_type -from whoosh.system import emptybytes -from whoosh.system import pack_byte -from whoosh.util.numeric import to_sortable, from_sortable -from whoosh.util.numeric import typecode_max, NaN -from whoosh.util.text import utf8encode, utf8decode +from whoosh.compat import bytes_type, itervalues, string_type, text_type, with_metaclass +from whoosh.system import emptybytes, pack_byte +from whoosh.util.numeric import NaN, from_sortable, to_sortable, typecode_max +from whoosh.util.text import utf8decode, utf8encode from whoosh.util.times import datetime_to_long, long_to_datetime - # Exceptions diff --git a/src/whoosh/filedb/compound.py b/src/whoosh/filedb/compound.py index e65c1b17..3840ddab 100644 --- a/src/whoosh/filedb/compound.py +++ b/src/whoosh/filedb/compound.py @@ -28,8 +28,8 @@ import errno import os import sys -from threading import Lock from shutil import copyfileobj +from threading import Lock try: import mmap @@ -37,8 +37,8 @@ mmap = None from whoosh.compat import BytesIO, memoryview_ -from whoosh.filedb.structfile import BufferFile, StructFile from whoosh.filedb.filestore import FileStorage, StorageError +from whoosh.filedb.structfile import BufferFile, StructFile from whoosh.system import emptybytes from whoosh.util import random_name diff --git a/src/whoosh/filedb/fileindex.py b/src/whoosh/filedb/fileindex.py index 33c91f5a..cd1e9160 100644 --- a/src/whoosh/filedb/fileindex.py +++ b/src/whoosh/filedb/fileindex.py @@ -15,12 +15,12 @@ # =============================================================================== import os +import pickle import re from bisect import bisect_right from threading import Lock from time import time -import pickle from whoosh import __version__ from whoosh.fields import Schema from whoosh.index import ( @@ -28,9 +28,9 @@ EmptyIndexError, Index, IndexVersionError, + LockError, OutOfDateError, ) -from whoosh.index import LockError from whoosh.support.bitvector import BitVector from whoosh.system import _FLOAT_SIZE, _INT_SIZE diff --git a/src/whoosh/filedb/filepostings.py b/src/whoosh/filedb/filepostings.py index 3daf19bf..fbd0c0a0 100644 --- a/src/whoosh/filedb/filepostings.py +++ b/src/whoosh/filedb/filepostings.py @@ -17,12 +17,12 @@ import types from array import array from struct import Struct -from whoosh.support import unicode -from whoosh.writing import PostingWriter from whoosh.matching import Matcher, ReadTooFar -from whoosh.system import _INT_SIZE, _FLOAT_SIZE -from whoosh.util import utf8encode, utf8decode, length_to_byte, byte_to_length +from whoosh.support import unicode +from whoosh.system import _FLOAT_SIZE, _INT_SIZE +from whoosh.util import byte_to_length, length_to_byte, utf8decode, utf8encode +from whoosh.writing import PostingWriter class BlockInfo(object): diff --git a/src/whoosh/filedb/filereading.py b/src/whoosh/filedb/filereading.py index 4773b1df..f7e8bf43 100644 --- a/src/whoosh/filedb/filereading.py +++ b/src/whoosh/filedb/filereading.py @@ -14,23 +14,22 @@ # limitations under the License. 
# =============================================================================== -from threading import Lock from marshal import loads +from threading import Lock from whoosh.fields import FieldConfigurationError +from whoosh.filedb import misc from whoosh.filedb.filepostings import FilePostingReader from whoosh.filedb.filetables import ( - FileTableReader, FileListReader, - StructHashReader, + FileTableReader, LengthReader, + StructHashReader, ) -from whoosh.filedb import misc # from whoosh.postings import Exclude from whoosh.reading import IndexReader, TermNotFound -from whoosh.util import protected, byte_to_length - +from whoosh.util import byte_to_length, protected # Reader class @@ -242,8 +241,6 @@ def vector(self, docnum, fieldid): self._open_vectors() offset = self.vectorindex.get((docnum, fieldnum)) if offset is None: - raise Exception( - f"No vector found for document {docnum} field {fieldid!r}" - ) + raise Exception(f"No vector found for document {docnum} field {fieldid!r}") return FilePostingReader(self.vpostfile, offset, vformat, stringids=True) diff --git a/src/whoosh/filedb/filestore.py b/src/whoosh/filedb/filestore.py index 524000b3..4a793e35 100644 --- a/src/whoosh/filedb/filestore.py +++ b/src/whoosh/filedb/filestore.py @@ -26,7 +26,11 @@ # policies, either expressed or implied, of Matt Chaput. from __future__ import with_statement -import errno, os, sys, tempfile + +import errno +import os +import sys +import tempfile from threading import Lock from whoosh.compat import BytesIO, memoryview_ @@ -35,7 +39,6 @@ from whoosh.util import random_name from whoosh.util.filelock import FileLock - # Exceptions diff --git a/src/whoosh/filedb/filetables.py b/src/whoosh/filedb/filetables.py index c5ed48d7..53c4d4d6 100644 --- a/src/whoosh/filedb/filetables.py +++ b/src/whoosh/filedb/filetables.py @@ -30,15 +30,15 @@ D. J. Bernstein's CDB format (http://cr.yp.to/cdb.html). 
""" -import os, struct, sys +import os +import struct +import sys from binascii import crc32 from hashlib import md5 # type: ignore @UnresolvedImport -from whoosh.compat import b, bytes_type -from whoosh.compat import range -from whoosh.util.numlists import GrowableArray +from whoosh.compat import b, bytes_type, range from whoosh.system import _INT_SIZE, emptybytes - +from whoosh.util.numlists import GrowableArray # Exceptions diff --git a/src/whoosh/filedb/filewriting.py b/src/whoosh/filedb/filewriting.py index 5b1f3aed..36060a68 100644 --- a/src/whoosh/filedb/filewriting.py +++ b/src/whoosh/filedb/filewriting.py @@ -16,25 +16,24 @@ from collections import defaultdict from marshal import dumps -from whoosh.support import unicode from whoosh.fields import UnknownFieldError -from whoosh.filedb.fileindex import SegmentDeletionMixin, Segment, SegmentSet +from whoosh.filedb import misc +from whoosh.filedb.fileindex import Segment, SegmentDeletionMixin, SegmentSet from whoosh.filedb.filepostings import FilePostingWriter from whoosh.filedb.filetables import ( FileListWriter, FileTableWriter, - StructHashWriter, LengthWriter, + StructHashWriter, ) -from whoosh.filedb import misc -from whoosh.filedb.pools import TempfilePool, MultiPool +from whoosh.filedb.pools import MultiPool, TempfilePool from whoosh.index import LockError -from whoosh.util.filelock import try_for +from whoosh.support import unicode from whoosh.util import fib +from whoosh.util.filelock import try_for from whoosh.writing import IndexWriter - # Merge policies # A merge policy is a callable that takes the Index object, the SegmentWriter @@ -87,7 +86,7 @@ def __init__( blocklimit=128, timeout=0.0, delay=0.1, - **poolargs + **poolargs, ): self.lock = ix.storage.lock(ix.indexname + "_LOCK") if not try_for(self.lock.acquire, timeout=timeout, delay=delay): diff --git a/src/whoosh/filedb/gae.py b/src/whoosh/filedb/gae.py index 2e13d846..c79b400c 100644 --- a/src/whoosh/filedb/gae.py +++ b/src/whoosh/filedb/gae.py @@ -23,9 +23,9 @@ from google.appengine.ext import db # type: ignore @UnresolvedImport from whoosh.compat import BytesIO -from whoosh.index import TOC, FileIndex, _DEF_INDEX_NAME from whoosh.filedb.filestore import ReadOnlyError, Storage from whoosh.filedb.structfile import StructFile +from whoosh.index import _DEF_INDEX_NAME, TOC, FileIndex class DatastoreFile(db.Model): diff --git a/src/whoosh/filedb/structfile.py b/src/whoosh/filedb/structfile.py index 50f54537..84ffa39d 100644 --- a/src/whoosh/filedb/structfile.py +++ b/src/whoosh/filedb/structfile.py @@ -29,22 +29,37 @@ from copy import copy from struct import calcsize -from whoosh.compat import BytesIO, bytes_type +from whoosh.compat import BytesIO, array_frombytes, array_tobytes, bytes_type from whoosh.compat import dump as dump_pickle from whoosh.compat import load as load_pickle -from whoosh.compat import array_frombytes, array_tobytes -from whoosh.system import _INT_SIZE, _SHORT_SIZE, _FLOAT_SIZE, _LONG_SIZE -from whoosh.system import IS_LITTLE -from whoosh.system import pack_byte, unpack_byte, pack_sbyte, unpack_sbyte -from whoosh.system import pack_ushort, unpack_ushort -from whoosh.system import pack_ushort_le, unpack_ushort_le -from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint -from whoosh.system import pack_uint_le, unpack_uint_le -from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong -from whoosh.system import pack_float, unpack_float -from whoosh.util.varints import varint, read_varint -from whoosh.util.varints import 
signed_varint, decode_signed_varint - +from whoosh.system import ( + _FLOAT_SIZE, + _INT_SIZE, + _LONG_SIZE, + _SHORT_SIZE, + IS_LITTLE, + pack_byte, + pack_float, + pack_int, + pack_long, + pack_sbyte, + pack_uint, + pack_uint_le, + pack_ulong, + pack_ushort, + pack_ushort_le, + unpack_byte, + unpack_float, + unpack_int, + unpack_long, + unpack_sbyte, + unpack_uint, + unpack_uint_le, + unpack_ulong, + unpack_ushort, + unpack_ushort_le, +) +from whoosh.util.varints import decode_signed_varint, read_varint, signed_varint, varint _SIZEMAP = dict((typecode, calcsize(typecode)) for typecode in "bBiIhHqQf") _ORDERMAP = {"little": "<", "big": ">"} diff --git a/src/whoosh/formats.py b/src/whoosh/formats.py index 77bc3f06..23162aed 100644 --- a/src/whoosh/formats.py +++ b/src/whoosh/formats.py @@ -33,12 +33,17 @@ from collections import defaultdict -from whoosh.analysis import unstopped, entoken -from whoosh.compat import iteritems, dumps, loads, b -from whoosh.system import emptybytes -from whoosh.system import _INT_SIZE, _FLOAT_SIZE -from whoosh.system import pack_uint, unpack_uint, pack_float, unpack_float - +from whoosh.analysis import entoken, unstopped +from whoosh.compat import b, dumps, iteritems, loads +from whoosh.system import ( + _FLOAT_SIZE, + _INT_SIZE, + emptybytes, + pack_float, + pack_uint, + unpack_float, + unpack_uint, +) # Format base class diff --git a/src/whoosh/highlight.py b/src/whoosh/highlight.py index a7b15b29..254ae3d2 100644 --- a/src/whoosh/highlight.py +++ b/src/whoosh/highlight.py @@ -241,9 +241,7 @@ def set_matched_filter_phrases(tokens, text, terms, phrases): """ text_sub = text[ current_word_index + 1 : current_word_index + 1 + slop - ][ - ::-1 - ] # Substring to scan (reversed) + ][::-1] # Substring to scan (reversed) len_sub = len(text_sub) next_word_index = ( len_sub - text_sub.index(word) - 1 @@ -864,8 +862,13 @@ def __init__(self, qname="strong", between="..."): self.qname = qname self.between = between - from genshi.core import START, END, TEXT # type: ignore @UnresolvedImport - from genshi.core import Attrs, Stream # type: ignore @UnresolvedImport + from genshi.core import ( # type: ignore @UnresolvedImport # type: ignore @UnresolvedImport + END, + START, + TEXT, + Attrs, + Stream, + ) self.START, self.END, self.TEXT = START, END, TEXT self.Attrs, self.Stream = Attrs, Stream diff --git a/src/whoosh/idsets.py b/src/whoosh/idsets.py index bceafaa6..d6c2c1bd 100644 --- a/src/whoosh/idsets.py +++ b/src/whoosh/idsets.py @@ -9,7 +9,6 @@ from whoosh.compat import izip, izip_longest, next, range from whoosh.util.numeric import bytes_for_bits - # Number of '1' bits in each byte (0-255) _1SPERBYTE = array( "B", diff --git a/src/whoosh/index.py b/src/whoosh/index.py index 2182b5be..c2966d4b 100644 --- a/src/whoosh/index.py +++ b/src/whoosh/index.py @@ -34,13 +34,13 @@ import os.path import re import sys -from time import time, sleep +from time import sleep, time from whoosh import __version__ from whoosh.compat import pickle, string_type from whoosh.fields import ensure_schema from whoosh.legacy import toc_loaders -from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE +from whoosh.system import _FLOAT_SIZE, _INT_SIZE, _LONG_SIZE _DEF_INDEX_NAME = "MAIN" _CURRENT_TOC_VERSION = -111 @@ -498,7 +498,7 @@ def version(self): def _reader(cls, storage, schema, segments, generation, reuse=None): # Returns a reader for the given segments, possibly reusing already # opened readers - from whoosh.reading import SegmentReader, MultiReader, EmptyReader + from whoosh.reading 
import EmptyReader, MultiReader, SegmentReader if reuse: # Merge segments with reuse segments diff --git a/src/whoosh/lang/__init__.py b/src/whoosh/lang/__init__.py index 2ab9b067..b4cff58c 100644 --- a/src/whoosh/lang/__init__.py +++ b/src/whoosh/lang/__init__.py @@ -30,6 +30,7 @@ # Exceptions + class NoStemmer(Exception): pass @@ -40,50 +41,76 @@ class NoStopWords(Exception): # Data and functions for language names -languages = ("ar", "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "pt", - "ro", "ru", "es", "sv", "tr") +languages = ( + "ar", + "da", + "nl", + "en", + "fi", + "fr", + "de", + "hu", + "it", + "no", + "pt", + "ro", + "ru", + "es", + "sv", + "tr", +) aliases = { - # By ISO 639-1 three letter codes - "ara": "ar", - "dan": "da", "nld": "nl", "eng": "en", "fin": "fi", "fra": "fr", - "deu": "de", "hun": "hu", "ita": "it", "nor": "no", "por": "pt", - "ron": "ro", "rus": "ru", "spa": "es", "swe": "sv", "tur": "tr", - - # By name in English - "arabic": "ar", - "danish": "da", - "dutch": "nl", - "english": "en", - "finnish": "fi", - "french": "fr", - "german": "de", - "hungarian": "hu", - "italian": "it", - "norwegian": "no", - "portuguese": "pt", - "romanian": "ro", - "russian": "ru", - "spanish": "es", - "swedish": "sv", - "turkish": "tr", - - # By name in own language - "العربية": "ar", - "dansk": "da", - "nederlands": "nl", - "suomi": "fi", - "français": "fr", - "deutsch": "de", - "magyar": "hu", - "italiano": "it", - "norsk": "no", - "português": "pt", - "русский язык": "ru", - "español": "es", - "svenska": "sv", - "türkçe": "tr", - } + # By ISO 639-1 three letter codes + "ara": "ar", + "dan": "da", + "nld": "nl", + "eng": "en", + "fin": "fi", + "fra": "fr", + "deu": "de", + "hun": "hu", + "ita": "it", + "nor": "no", + "por": "pt", + "ron": "ro", + "rus": "ru", + "spa": "es", + "swe": "sv", + "tur": "tr", + # By name in English + "arabic": "ar", + "danish": "da", + "dutch": "nl", + "english": "en", + "finnish": "fi", + "french": "fr", + "german": "de", + "hungarian": "hu", + "italian": "it", + "norwegian": "no", + "portuguese": "pt", + "romanian": "ro", + "russian": "ru", + "spanish": "es", + "swedish": "sv", + "turkish": "tr", + # By name in own language + "العربية": "ar", + "dansk": "da", + "nederlands": "nl", + "suomi": "fi", + "français": "fr", + "deutsch": "de", + "magyar": "hu", + "italiano": "it", + "norsk": "no", + "português": "pt", + "русский язык": "ru", + "español": "es", + "svenska": "sv", + "türkçe": "tr", +} def two_letter_code(name): @@ -96,6 +123,7 @@ def two_letter_code(name): # Getter functions + def has_stemmer(lang): try: return bool(stemmer_for_language(lang)) @@ -115,15 +143,18 @@ def stemmer_for_language(lang): # Original porter stemming algorithm is several times faster than the # more correct porter2 algorithm in snowball package from .porter import stem as porter_stem + return porter_stem tlc = two_letter_code(lang) if tlc == "ar": from .isri import ISRIStemmer + return ISRIStemmer().stem from .snowball import classes as snowball_classes + if tlc in snowball_classes: return snowball_classes[tlc]().stem diff --git a/src/whoosh/lang/isri.py b/src/whoosh/lang/isri.py index dafffd8b..0e79d15c 100644 --- a/src/whoosh/lang/isri.py +++ b/src/whoosh/lang/isri.py @@ -35,11 +35,12 @@ """ from __future__ import unicode_literals + import re class ISRIStemmer(object): - ''' + """ ISRI Arabic stemmer based on algorithm: Arabic Stemming without a root dictionary. Information Science Research Institute. University of Nevada, Las Vegas, USA. 
@@ -51,93 +52,136 @@ class ISRIStemmer(object): The ISRI Stemmer requires that all tokens have Unicode string types. If you use Python IDLE on Arabic Windows you have to decode text first using Arabic '1256' coding. - ''' + """ def __init__(self): - self.stm = 'defult none' - - self.p3 = ['\u0643\u0627\u0644', '\u0628\u0627\u0644', - '\u0648\u0644\u0644', '\u0648\u0627\u0644'] # length three prefixes - self.p2 = ['\u0627\u0644', '\u0644\u0644'] # length two prefixes - self.p1 = ['\u0644', '\u0628', '\u0641', '\u0633', '\u0648', - '\u064a', '\u062a', '\u0646', '\u0627'] # length one prefixes - - self.s3 = ['\u062a\u0645\u0644', '\u0647\u0645\u0644', - '\u062a\u0627\u0646', '\u062a\u064a\u0646', - '\u0643\u0645\u0644'] # length three suffixes - self.s2 = ['\u0648\u0646', '\u0627\u062a', '\u0627\u0646', - '\u064a\u0646', '\u062a\u0646', '\u0643\u0645', - '\u0647\u0646', '\u0646\u0627', '\u064a\u0627', - '\u0647\u0627', '\u062a\u0645', '\u0643\u0646', - '\u0646\u064a', '\u0648\u0627', '\u0645\u0627', - '\u0647\u0645'] # length two suffixes - self.s1 = ['\u0629', '\u0647', '\u064a', '\u0643', '\u062a', - '\u0627', '\u0646'] # length one suffixes - - self.pr4 = {0: ['\u0645'], 1:['\u0627'], - 2: ['\u0627', '\u0648', '\u064A'], 3:['\u0629']} # groups of length four patterns - self.pr53 = {0: ['\u0627', '\u062a'], - 1: ['\u0627', '\u064a', '\u0648'], - 2: ['\u0627', '\u062a', '\u0645'], - 3: ['\u0645', '\u064a', '\u062a'], - 4: ['\u0645', '\u062a'], - 5: ['\u0627', '\u0648'], - 6: ['\u0627', '\u0645']} # Groups of length five patterns and length three roots - - self.re_short_vowels = re.compile('[\u064B-\u0652]') - self.re_hamza = re.compile('[\u0621\u0624\u0626]') - self.re_intial_hamza = re.compile('^[\u0622\u0623\u0625]') - - self.stop_words = ['\u064a\u0643\u0648\u0646', - '\u0648\u0644\u064a\u0633', - '\u0648\u0643\u0627\u0646', - '\u0643\u0630\u0644\u0643', - '\u0627\u0644\u062a\u064a', - '\u0648\u0628\u064a\u0646', - '\u0639\u0644\u064a\u0647\u0627', - '\u0645\u0633\u0627\u0621', - '\u0627\u0644\u0630\u064a', - '\u0648\u0643\u0627\u0646\u062a', - '\u0648\u0644\u0643\u0646', - '\u0648\u0627\u0644\u062a\u064a', - '\u062a\u0643\u0648\u0646', - '\u0627\u0644\u064a\u0648\u0645', - '\u0627\u0644\u0644\u0630\u064a\u0646', - '\u0639\u0644\u064a\u0647', - '\u0643\u0627\u0646\u062a', - '\u0644\u0630\u0644\u0643', - '\u0623\u0645\u0627\u0645', - '\u0647\u0646\u0627\u0643', - '\u0645\u0646\u0647\u0627', - '\u0645\u0627\u0632\u0627\u0644', - '\u0644\u0627\u0632\u0627\u0644', - '\u0644\u0627\u064a\u0632\u0627\u0644', - '\u0645\u0627\u064a\u0632\u0627\u0644', - '\u0627\u0635\u0628\u062d', - '\u0623\u0635\u0628\u062d', - '\u0623\u0645\u0633\u0649', - '\u0627\u0645\u0633\u0649', - '\u0623\u0636\u062d\u0649', - '\u0627\u0636\u062d\u0649', - '\u0645\u0627\u0628\u0631\u062d', - '\u0645\u0627\u0641\u062a\u0626', - '\u0645\u0627\u0627\u0646\u0641\u0643', - '\u0644\u0627\u0633\u064a\u0645\u0627', - '\u0648\u0644\u0627\u064a\u0632\u0627\u0644', - '\u0627\u0644\u062d\u0627\u0644\u064a', - '\u0627\u0644\u064a\u0647\u0627', - '\u0627\u0644\u0630\u064a\u0646', - '\u0641\u0627\u0646\u0647', - '\u0648\u0627\u0644\u0630\u064a', - '\u0648\u0647\u0630\u0627', - '\u0644\u0647\u0630\u0627', - '\u0641\u0643\u0627\u0646', - '\u0633\u062a\u0643\u0648\u0646', - '\u0627\u0644\u064a\u0647', - '\u064a\u0645\u0643\u0646', - '\u0628\u0647\u0630\u0627', - '\u0627\u0644\u0630\u0649'] - + self.stm = "defult none" + + self.p3 = [ + "\u0643\u0627\u0644", + "\u0628\u0627\u0644", + "\u0648\u0644\u0644", + "\u0648\u0627\u0644", + ] # length 
three prefixes + self.p2 = ["\u0627\u0644", "\u0644\u0644"] # length two prefixes + self.p1 = [ + "\u0644", + "\u0628", + "\u0641", + "\u0633", + "\u0648", + "\u064a", + "\u062a", + "\u0646", + "\u0627", + ] # length one prefixes + + self.s3 = [ + "\u062a\u0645\u0644", + "\u0647\u0645\u0644", + "\u062a\u0627\u0646", + "\u062a\u064a\u0646", + "\u0643\u0645\u0644", + ] # length three suffixes + self.s2 = [ + "\u0648\u0646", + "\u0627\u062a", + "\u0627\u0646", + "\u064a\u0646", + "\u062a\u0646", + "\u0643\u0645", + "\u0647\u0646", + "\u0646\u0627", + "\u064a\u0627", + "\u0647\u0627", + "\u062a\u0645", + "\u0643\u0646", + "\u0646\u064a", + "\u0648\u0627", + "\u0645\u0627", + "\u0647\u0645", + ] # length two suffixes + self.s1 = [ + "\u0629", + "\u0647", + "\u064a", + "\u0643", + "\u062a", + "\u0627", + "\u0646", + ] # length one suffixes + + self.pr4 = { + 0: ["\u0645"], + 1: ["\u0627"], + 2: ["\u0627", "\u0648", "\u064A"], + 3: ["\u0629"], + } # groups of length four patterns + self.pr53 = { + 0: ["\u0627", "\u062a"], + 1: ["\u0627", "\u064a", "\u0648"], + 2: ["\u0627", "\u062a", "\u0645"], + 3: ["\u0645", "\u064a", "\u062a"], + 4: ["\u0645", "\u062a"], + 5: ["\u0627", "\u0648"], + 6: ["\u0627", "\u0645"], + } # Groups of length five patterns and length three roots + + self.re_short_vowels = re.compile("[\u064B-\u0652]") + self.re_hamza = re.compile("[\u0621\u0624\u0626]") + self.re_intial_hamza = re.compile("^[\u0622\u0623\u0625]") + + self.stop_words = [ + "\u064a\u0643\u0648\u0646", + "\u0648\u0644\u064a\u0633", + "\u0648\u0643\u0627\u0646", + "\u0643\u0630\u0644\u0643", + "\u0627\u0644\u062a\u064a", + "\u0648\u0628\u064a\u0646", + "\u0639\u0644\u064a\u0647\u0627", + "\u0645\u0633\u0627\u0621", + "\u0627\u0644\u0630\u064a", + "\u0648\u0643\u0627\u0646\u062a", + "\u0648\u0644\u0643\u0646", + "\u0648\u0627\u0644\u062a\u064a", + "\u062a\u0643\u0648\u0646", + "\u0627\u0644\u064a\u0648\u0645", + "\u0627\u0644\u0644\u0630\u064a\u0646", + "\u0639\u0644\u064a\u0647", + "\u0643\u0627\u0646\u062a", + "\u0644\u0630\u0644\u0643", + "\u0623\u0645\u0627\u0645", + "\u0647\u0646\u0627\u0643", + "\u0645\u0646\u0647\u0627", + "\u0645\u0627\u0632\u0627\u0644", + "\u0644\u0627\u0632\u0627\u0644", + "\u0644\u0627\u064a\u0632\u0627\u0644", + "\u0645\u0627\u064a\u0632\u0627\u0644", + "\u0627\u0635\u0628\u062d", + "\u0623\u0635\u0628\u062d", + "\u0623\u0645\u0633\u0649", + "\u0627\u0645\u0633\u0649", + "\u0623\u0636\u062d\u0649", + "\u0627\u0636\u062d\u0649", + "\u0645\u0627\u0628\u0631\u062d", + "\u0645\u0627\u0641\u062a\u0626", + "\u0645\u0627\u0627\u0646\u0641\u0643", + "\u0644\u0627\u0633\u064a\u0645\u0627", + "\u0648\u0644\u0627\u064a\u0632\u0627\u0644", + "\u0627\u0644\u062d\u0627\u0644\u064a", + "\u0627\u0644\u064a\u0647\u0627", + "\u0627\u0644\u0630\u064a\u0646", + "\u0641\u0627\u0646\u0647", + "\u0648\u0627\u0644\u0630\u064a", + "\u0648\u0647\u0630\u0627", + "\u0644\u0647\u0630\u0627", + "\u0641\u0643\u0627\u0646", + "\u0633\u062a\u0643\u0648\u0646", + "\u0627\u0644\u064a\u0647", + "\u064a\u0645\u0643\u0646", + "\u0628\u0647\u0630\u0627", + "\u0627\u0644\u0630\u0649", + ] def stem(self, token): """ @@ -145,26 +189,28 @@ def stem(self, token): """ self.stm = token - self.norm(1) # remove diacritics which representing Arabic short vowels - if self.stm in self.stop_words: return self.stm # exclude stop words from being processed - self.pre32() # remove length three and length two prefixes in this order - self.suf32() # remove length three and length two suffixes in this order - self.waw() # remove 
connective ‘و’ if it precedes a word beginning with ‘و’ - self.norm(2) # normalize initial hamza to bare alif - if len(self.stm) <= 3: return self.stm # return stem if less than or equal to three - - if len(self.stm) == 4: # length 4 word + self.norm(1) # remove diacritics which representing Arabic short vowels + if self.stm in self.stop_words: + return self.stm # exclude stop words from being processed + self.pre32() # remove length three and length two prefixes in this order + self.suf32() # remove length three and length two suffixes in this order + self.waw() # remove connective ‘و’ if it precedes a word beginning with ‘و’ + self.norm(2) # normalize initial hamza to bare alif + if len(self.stm) <= 3: + return self.stm # return stem if less than or equal to three + + if len(self.stm) == 4: # length 4 word self.pro_w4() return self.stm - elif len(self.stm) == 5: # length 5 word + elif len(self.stm) == 5: # length 5 word self.pro_w53() self.end_w5() return self.stm - elif len(self.stm) == 6: # length 6 word + elif len(self.stm) == 6: # length 6 word self.pro_w6() self.end_w6() return self.stm - elif len(self.stm) == 7: # length 7 word + elif len(self.stm) == 7: # length 7 word self.suf1() if len(self.stm) == 7: self.pre1() @@ -172,7 +218,7 @@ def stem(self, token): self.pro_w6() self.end_w6() return self.stm - return self.stm # if word length >7 , then no stemming + return self.stm # if word length >7 , then no stemming def norm(self, num): """ @@ -184,14 +230,14 @@ def norm(self, num): self.k = num if self.k == 1: - self.stm = self.re_short_vowels.sub('', self.stm) + self.stm = self.re_short_vowels.sub("", self.stm) return self.stm elif self.k == 2: - self.stm = self.re_intial_hamza.sub('\u0627', self.stm) + self.stm = self.re_intial_hamza.sub("\u0627", self.stm) return self.stm elif self.k == 3: - self.stm = self.re_short_vowels.sub('', self.stm) - self.stm = self.re_intial_hamza.sub('\u0627', self.stm) + self.stm = self.re_short_vowels.sub("", self.stm) + self.stm = self.re_intial_hamza.sub("\u0627", self.stm) return self.stm def pre32(self): @@ -220,95 +266,108 @@ def suf32(self): self.stm = self.stm[:-2] return self.stm - def waw(self): - """remove connective ‘و’ if it precedes a word beginning with ‘و’ """ - if (len(self.stm) >= 4) & (self.stm[:2] == '\u0648\u0648'): + """remove connective ‘و’ if it precedes a word beginning with ‘و’""" + if (len(self.stm) >= 4) & (self.stm[:2] == "\u0648\u0648"): self.stm = self.stm[1:] return self.stm def pro_w4(self): """process length four patterns and extract length three roots""" - if self.stm[0] in self.pr4[0]: # مفعل + if self.stm[0] in self.pr4[0]: # مفعل self.stm = self.stm[1:] return self.stm - elif self.stm[1] in self.pr4[1]: # فاعل + elif self.stm[1] in self.pr4[1]: # فاعل self.stm = self.stm[0] + self.stm[2:] return self.stm - elif self.stm[2] in self.pr4[2]: # فعال - فعول - فعيل + elif self.stm[2] in self.pr4[2]: # فعال - فعول - فعيل self.stm = self.stm[:2] + self.stm[3] return self.stm - elif self.stm[3] in self.pr4[3]: # فعلة + elif self.stm[3] in self.pr4[3]: # فعلة self.stm = self.stm[:-1] return self.stm else: - self.suf1() # do - normalize short sufix + self.suf1() # do - normalize short sufix if len(self.stm) == 4: - self.pre1() # do - normalize short prefix + self.pre1() # do - normalize short prefix return self.stm def pro_w53(self): """process length five patterns and extract length three roots""" - if ((self.stm[2] in self.pr53[0]) & (self.stm[0] == '\u0627')): # افتعل - افاعل + if (self.stm[2] in self.pr53[0]) & ( + 
self.stm[0] == "\u0627" + ): # افتعل - افاعل self.stm = self.stm[1] + self.stm[3:] return self.stm - elif ((self.stm[3] in self.pr53[1]) & (self.stm[0] == '\u0645')): # مفعول - مفعال - مفعيل + elif (self.stm[3] in self.pr53[1]) & ( + self.stm[0] == "\u0645" + ): # مفعول - مفعال - مفعيل self.stm = self.stm[1:3] + self.stm[4] return self.stm - elif ((self.stm[0] in self.pr53[2]) & (self.stm[4] == '\u0629')): # مفعلة - تفعلة - افعلة + elif (self.stm[0] in self.pr53[2]) & ( + self.stm[4] == "\u0629" + ): # مفعلة - تفعلة - افعلة self.stm = self.stm[1:4] return self.stm - elif ((self.stm[0] in self.pr53[3]) & (self.stm[2] == '\u062a')): # مفتعل - يفتعل - تفتعل + elif (self.stm[0] in self.pr53[3]) & ( + self.stm[2] == "\u062a" + ): # مفتعل - يفتعل - تفتعل self.stm = self.stm[1] + self.stm[3:] return self.stm - elif ((self.stm[0] in self.pr53[4]) & (self.stm[2] == '\u0627')): #مفاعل - تفاعل + elif (self.stm[0] in self.pr53[4]) & ( + self.stm[2] == "\u0627" + ): # مفاعل - تفاعل self.stm = self.stm[1] + self.stm[3:] return self.stm - elif ((self.stm[2] in self.pr53[5]) & (self.stm[4] == '\u0629')): # فعولة - فعالة + elif (self.stm[2] in self.pr53[5]) & ( + self.stm[4] == "\u0629" + ): # فعولة - فعالة self.stm = self.stm[:2] + self.stm[3] return self.stm - elif ((self.stm[0] in self.pr53[6]) & (self.stm[1] == '\u0646')): # انفعل - منفعل + elif (self.stm[0] in self.pr53[6]) & ( + self.stm[1] == "\u0646" + ): # انفعل - منفعل self.stm = self.stm[2:] return self.stm - elif ((self.stm[3] == '\u0627') & (self.stm[0] == '\u0627')): # افعال + elif (self.stm[3] == "\u0627") & (self.stm[0] == "\u0627"): # افعال self.stm = self.stm[1:3] + self.stm[4] return self.stm - elif ((self.stm[4] == '\u0646') & (self.stm[3] == '\u0627')): # فعلان + elif (self.stm[4] == "\u0646") & (self.stm[3] == "\u0627"): # فعلان self.stm = self.stm[:3] return self.stm - elif ((self.stm[3] == '\u064a') & (self.stm[0] == '\u062a')): # تفعيل + elif (self.stm[3] == "\u064a") & (self.stm[0] == "\u062a"): # تفعيل self.stm = self.stm[1:3] + self.stm[4] return self.stm - elif ((self.stm[3] == '\u0648') & (self.stm[1] == '\u0627')): # فاعول + elif (self.stm[3] == "\u0648") & (self.stm[1] == "\u0627"): # فاعول self.stm = self.stm[0] + self.stm[2] + self.stm[4] return self.stm - elif ((self.stm[2] == '\u0627') & (self.stm[1] == '\u0648')): # فواعل + elif (self.stm[2] == "\u0627") & (self.stm[1] == "\u0648"): # فواعل self.stm = self.stm[0] + self.stm[3:] return self.stm - elif ((self.stm[3] == '\u0626') & (self.stm[2] == '\u0627')): # فعائل + elif (self.stm[3] == "\u0626") & (self.stm[2] == "\u0627"): # فعائل self.stm = self.stm[:2] + self.stm[4] return self.stm - elif ((self.stm[4] == '\u0629') & (self.stm[1] == '\u0627')): # فاعلة + elif (self.stm[4] == "\u0629") & (self.stm[1] == "\u0627"): # فاعلة self.stm = self.stm[0] + self.stm[2:4] return self.stm - elif ((self.stm[4] == '\u064a') & (self.stm[2] == '\u0627')): # فعالي + elif (self.stm[4] == "\u064a") & (self.stm[2] == "\u0627"): # فعالي self.stm = self.stm[:2] + self.stm[3] return self.stm else: - self.suf1() # do - normalize short sufix + self.suf1() # do - normalize short sufix if len(self.stm) == 5: - self.pre1() # do - normalize short prefix + self.pre1() # do - normalize short prefix return self.stm def pro_w54(self): """process length five patterns and extract length four roots""" - if (self.stm[0] in self.pr53[2]): #تفعلل - افعلل - مفعلل + if self.stm[0] in self.pr53[2]: # تفعلل - افعلل - مفعلل self.stm = self.stm[1:] return self.stm - elif (self.stm[4] == '\u0629'): # فعللة + elif 
self.stm[4] == "\u0629": # فعللة self.stm = self.stm[:4] return self.stm - elif (self.stm[2] == '\u0627'): # فعالل + elif self.stm[2] == "\u0627": # فعالل self.stm = self.stm[:2] + self.stm[3:] return self.stm @@ -325,33 +384,51 @@ def end_w5(self): def pro_w6(self): """process length six patterns and extract length three roots""" - if ((self.stm.startswith('\u0627\u0633\u062a')) or (self.stm.startswith('\u0645\u0633\u062a'))): # مستفعل - استفعل + if (self.stm.startswith("\u0627\u0633\u062a")) or ( + self.stm.startswith("\u0645\u0633\u062a") + ): # مستفعل - استفعل self.stm = self.stm[3:] return self.stm - elif (self.stm[0] == '\u0645' and self.stm[3] == '\u0627' and self.stm[5] == '\u0629'): # مفعالة + elif ( + self.stm[0] == "\u0645" + and self.stm[3] == "\u0627" + and self.stm[5] == "\u0629" + ): # مفعالة self.stm = self.stm[1:3] + self.stm[4] return self.stm - elif (self.stm[0] == '\u0627' and self.stm[2] == '\u062a' and self.stm[4] == '\u0627'): # افتعال + elif ( + self.stm[0] == "\u0627" + and self.stm[2] == "\u062a" + and self.stm[4] == "\u0627" + ): # افتعال self.stm = self.stm[1] + self.stm[3] + self.stm[5] return self.stm - elif (self.stm[0] == '\u0627' and self.stm[3] == '\u0648' and self.stm[2] == self.stm[4]): # افعوعل + elif ( + self.stm[0] == "\u0627" + and self.stm[3] == "\u0648" + and self.stm[2] == self.stm[4] + ): # افعوعل self.stm = self.stm[1] + self.stm[4:] return self.stm - elif (self.stm[0] == '\u062a' and self.stm[2] == '\u0627' and self.stm[4] == '\u064a'): # تفاعيل new pattern + elif ( + self.stm[0] == "\u062a" + and self.stm[2] == "\u0627" + and self.stm[4] == "\u064a" + ): # تفاعيل new pattern self.stm = self.stm[1] + self.stm[3] + self.stm[5] return self.stm else: - self.suf1() # do - normalize short sufix + self.suf1() # do - normalize short sufix if len(self.stm) == 6: - self.pre1() # do - normalize short prefix + self.pre1() # do - normalize short prefix return self.stm def pro_w64(self): """process length six patterns and extract length four roots""" - if (self.stm[0] and self.stm[4]) == '\u0627': # افعلال + if (self.stm[0] and self.stm[4]) == "\u0627": # افعلال self.stm = self.stm[1:4] + self.stm[5] return self.stm - elif (self.stm.startswith('\u0645\u062a')): # متفعلل + elif self.stm.startswith("\u0645\u062a"): # متفعلل self.stm = self.stm[2:] return self.stm @@ -363,7 +440,7 @@ def end_w6(self): self.pro_w53() self.end_w5() return self.stm - elif len (self.stm) == 6: + elif len(self.stm) == 6: self.pro_w64() return self.stm diff --git a/src/whoosh/lang/lovins.py b/src/whoosh/lang/lovins.py index 1e5a933a..e3b114ba 100644 --- a/src/whoosh/lang/lovins.py +++ b/src/whoosh/lang/lovins.py @@ -6,9 +6,9 @@ from collections import defaultdict - # Conditions + def A(base): # A No restrictions on stem return True @@ -159,14 +159,22 @@ def a(base): # a Remove ending only after d, f, ph, th, l, er, or, es or t c = base[-1] l2 = base[-2:] - return (c == "d" or c == "f" or l2 == "ph" or l2 == "th" or c == "l" - or l2 == "er" or l2 == "or" or l2 == "es" or c == "t") + return ( + c == "d" + or c == "f" + or l2 == "ph" + or l2 == "th" + or c == "l" + or l2 == "er" + or l2 == "or" + or l2 == "es" + or c == "t" + ) def b(base): # b Minimum stem length = 3 and do not remove ending after met or ryst - return len(base) > 2 and not (base.endswith("met") - or base.endswith("ryst")) + return len(base) > 2 and not (base.endswith("met") or base.endswith("ryst")) def c(base): @@ -178,16 +186,12 @@ def c(base): m = [None] * 12 -m[11] = dict(( - ("alistically", B), - ("arizability", 
A), - ("izationally", B))) -m[10] = dict(( - ("antialness", A), - ("arisations", A), - ("arizations", A), - ("entialness", A))) -m[9] = dict(( +m[11] = dict((("alistically", B), ("arizability", A), ("izationally", B))) +m[10] = dict( + (("antialness", A), ("arisations", A), ("arizations", A), ("entialness", A)) +) +m[9] = dict( + ( ("allically", C), ("antaneous", A), ("antiality", A), @@ -204,8 +208,11 @@ def c(base): ("istically", A), ("itousness", A), ("izability", A), - ("izational", A))) -m[8] = dict(( + ("izational", A), + ) +) +m[8] = dict( + ( ("ableness", A), ("arizable", A), ("entation", A), @@ -218,8 +225,11 @@ def c(base): ("ionalize", A), ("iousness", A), ("izations", A), - ("lessness", A))) -m[7] = dict(( + ("lessness", A), + ) +) +m[7] = dict( + ( ("ability", A), ("aically", A), ("alistic", B), @@ -259,8 +269,11 @@ def c(base): ("ization", F), ("izement", A), ("oidally", A), - ("ousness", A))) -m[6] = dict(( + ("ousness", A), + ) +) +m[6] = dict( + ( ("aceous", A), ("acious", B), ("action", G), @@ -299,8 +312,11 @@ def c(base): ("izable", E), ("lessly", A), ("nesses", A), - ("oidism", A))) -m[5] = dict(( + ("oidism", A), + ) +) +m[5] = dict( + ( ("acies", A), ("acity", A), ("aging", B), @@ -367,8 +383,11 @@ def c(base): ("oidal", A), ("oides", A), ("otide", A), - ("ously", A))) -m[4] = dict(( + ("ously", A), + ) +) +m[4] = dict( + ( ("able", A), ("ably", A), ("ages", B), @@ -416,8 +435,11 @@ def c(base): ("ward", A), ("wise", A), ("ying", B), - ("yish", A))) -m[3] = dict(( + ("yish", A), + ) +) +m[3] = dict( + ( ("acy", A), ("age", B), ("aic", A), @@ -456,8 +478,11 @@ def c(base): ("ize", F), ("oid", A), ("one", R), - ("ous", A))) -m[2] = dict(( + ("ous", A), + ) +) +m[2] = dict( + ( ("ae", A), ("al", b), ("ar", X), @@ -475,14 +500,10 @@ def c(base): ("us", V), ("yl", R), ("s'", A), - ("'s", A))) -m[1] = dict(( - ("a", A), - ("e", A), - ("i", A), - ("o", A), - ("s", W), - ("y", B))) + ("'s", A), + ) +) +m[1] = dict((("a", A), ("e", A), ("i", A), ("o", A), ("s", W), ("y", B))) def remove_ending(word): @@ -490,52 +511,54 @@ def remove_ending(word): el = 11 while el > 0: if length - el > 1: - ending = word[length - el:] + ending = word[length - el :] cond = m[el].get(ending) if cond: - base = word[:length - el] + base = word[: length - el] if cond(base): return base el -= 1 return word -_endings = (("iev", "ief"), - ("uct", "uc"), - ("iev", "ief"), - ("uct", "uc"), - ("umpt", "um"), - ("rpt", "rb"), - ("urs", "ur"), - ("istr", "ister"), - ("metr", "meter"), - ("olv", "olut"), - ("ul", "l", "aoi"), - ("bex", "bic"), - ("dex", "dic"), - ("pex", "pic"), - ("tex", "tic"), - ("ax", "ac"), - ("ex", "ec"), - ("ix", "ic"), - ("lux", "luc"), - ("uad", "uas"), - ("vad", "vas"), - ("cid", "cis"), - ("lid", "lis"), - ("erid", "eris"), - ("pand", "pans"), - ("end", "ens", "s"), - ("ond", "ons"), - ("lud", "lus"), - ("rud", "rus"), - ("her", "hes", "pt"), - ("mit", "mis"), - ("ent", "ens", "m"), - ("ert", "ers"), - ("et", "es", "n"), - ("yt", "ys"), - ("yz", "ys")) +_endings = ( + ("iev", "ief"), + ("uct", "uc"), + ("iev", "ief"), + ("uct", "uc"), + ("umpt", "um"), + ("rpt", "rb"), + ("urs", "ur"), + ("istr", "ister"), + ("metr", "meter"), + ("olv", "olut"), + ("ul", "l", "aoi"), + ("bex", "bic"), + ("dex", "dic"), + ("pex", "pic"), + ("tex", "tic"), + ("ax", "ac"), + ("ex", "ec"), + ("ix", "ic"), + ("lux", "luc"), + ("uad", "uas"), + ("vad", "vas"), + ("cid", "cis"), + ("lid", "lis"), + ("erid", "eris"), + ("pand", "pans"), + ("end", "ens", "s"), + ("ond", "ons"), + ("lud", "lus"), + ("rud", 
"rus"), + ("her", "hes", "pt"), + ("mit", "mis"), + ("ent", "ens", "m"), + ("ert", "ers"), + ("et", "es", "n"), + ("yt", "ys"), + ("yz", "ys"), +) # Hash the ending rules by the last letter of the target ending @@ -559,12 +582,11 @@ def fix_ending(word): if c in exceptafter: return word - return word[:0 - len(target)] + newend + return word[: 0 - len(target)] + newend return word def stem(word): - """Returns the stemmed version of the argument string. - """ + """Returns the stemmed version of the argument string.""" return fix_ending(remove_ending(word)) diff --git a/src/whoosh/lang/morph_en.py b/src/whoosh/lang/morph_en.py index c4e10952..29e9b58c 100644 --- a/src/whoosh/lang/morph_en.py +++ b/src/whoosh/lang/morph_en.py @@ -8,7 +8,7 @@ class of Sun's `Minion search engine `_. import re -from whoosh.compat import range, iteritems +from whoosh.compat import iteritems, range # Rule exceptions @@ -1102,9 +1102,7 @@ class of Sun's `Minion search engine `_. for p in range(0, len(rules) // _partition_size + 1): start = p * _partition_size end = (p + 1) * _partition_size - pattern = "|".join( - f"(?P<_g{i}>{r[0]})$" for i, r in enumerate(rules[start:end]) - ) + pattern = "|".join(f"(?P<_g{i}>{r[0]})$" for i, r in enumerate(rules[start:end])) _partitions.append(re.compile(pattern)) diff --git a/src/whoosh/lang/paicehusk.py b/src/whoosh/lang/paicehusk.py index 78732863..d3631cff 100644 --- a/src/whoosh/lang/paicehusk.py +++ b/src/whoosh/lang/paicehusk.py @@ -19,16 +19,18 @@ class PaiceHuskStemmer(object): - """Implements the Paice-Husk stemming algorithm. - """ + """Implements the Paice-Husk stemming algorithm.""" - rule_expr = re.compile(r""" + rule_expr = re.compile( + r""" ^(?P\w+) (?P[*]?) (?P\d+) (?P\w*) (?P[.>]) - """, re.UNICODE | re.VERBOSE) + """, + re.UNICODE | re.VERBOSE, + ) stem_expr = re.compile(r"^\w+", re.UNICODE) @@ -63,23 +65,30 @@ def read_rules(self, ruletable): raise Exception(f"Bad rule: {line!r}") def first_vowel(self, word): - vp = min([p for p in [word.find(v) for v in "aeiou"] - if p > -1]) + vp = min([p for p in [word.find(v) for v in "aeiou"] if p > -1]) yp = word.find("y") if yp > 0 and yp < vp: return yp return vp def strip_prefix(self, word): - for prefix in ("kilo", "micro", "milli", "intra", "ultra", "mega", - "nano", "pico", "pseudo"): + for prefix in ( + "kilo", + "micro", + "milli", + "intra", + "ultra", + "mega", + "nano", + "pico", + "pseudo", + ): if word.startswith(prefix): - return word[len(prefix):] + return word[len(prefix) :] return word def stem(self, word): - """Returns a stemmed version of the argument string. - """ + """Returns a stemmed version of the argument string.""" rules = self.rules match = self.stem_expr.match(word) @@ -102,21 +111,21 @@ def stem(self, word): continue newlen = len(stem) - num + len(append) - if ((pfv == 0 and newlen < 2) - or (pfv > 0 and newlen < 3)): + if (pfv == 0 and newlen < 2) or (pfv > 0 and newlen < 3): # If word starts with vowel, minimum stem length is 2. # If word starts with consonant, minimum stem length is # 3. 
- continue + continue is_intact = False - stem = stem[:0 - num] + append + stem = stem[: 0 - num] + append continuing = cont break return stem + # The default rules for the Paice-Husk stemming algorithm defaultrules = """ diff --git a/src/whoosh/lang/porter.py b/src/whoosh/lang/porter.py index 65d169a9..15d06a57 100644 --- a/src/whoosh/lang/porter.py +++ b/src/whoosh/lang/porter.py @@ -12,38 +12,38 @@ # Suffix replacement lists _step2list = { - "ational": "ate", - "tional": "tion", - "enci": "ence", - "anci": "ance", - "izer": "ize", - "bli": "ble", - "alli": "al", - "entli": "ent", - "eli": "e", - "ousli": "ous", - "ization": "ize", - "ation": "ate", - "ator": "ate", - "alism": "al", - "iveness": "ive", - "fulness": "ful", - "ousness": "ous", - "aliti": "al", - "iviti": "ive", - "biliti": "ble", - "logi": "log", - } + "ational": "ate", + "tional": "tion", + "enci": "ence", + "anci": "ance", + "izer": "ize", + "bli": "ble", + "alli": "al", + "entli": "ent", + "eli": "e", + "ousli": "ous", + "ization": "ize", + "ation": "ate", + "ator": "ate", + "alism": "al", + "iveness": "ive", + "fulness": "ful", + "ousness": "ous", + "aliti": "al", + "iviti": "ive", + "biliti": "ble", + "logi": "log", +} _step3list = { - "icate": "ic", - "ative": "", - "alize": "al", - "iciti": "ic", - "ical": "ic", - "ful": "", - "ness": "", - } + "icate": "ic", + "ative": "", + "alize": "al", + "iciti": "ic", + "ical": "ic", + "ful": "", + "ness": "", +} _cons = "[^aeiou]" @@ -54,9 +54,13 @@ # m > 0 _mgr0 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq) # m == 0 -_meq1 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + "(" + _vowel_seq + ")?$") +_meq1 = re.compile( + "^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + "(" + _vowel_seq + ")?$" +) # m > 1 -_mgr1 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + _vowel_seq + _cons_seq) +_mgr1 = re.compile( + "^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + _vowel_seq + _cons_seq +) # vowel in stem _s_v = re.compile("^(" + _cons_seq + ")?" + _vowel) # ??? @@ -67,15 +71,20 @@ _ed_ing = re.compile("^(.*)(ed|ing)$") _at_bl_iz = re.compile("(at|bl|iz)$") _step1b = re.compile("([^aeiouylsz])\\1$") -_step2 = re.compile("^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$") +_step2 = re.compile( + "^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$" +) _step3 = re.compile("^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$") -_step4_1 = re.compile("^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$") +_step4_1 = re.compile( + "^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$" +) _step4_2 = re.compile("^(.+?)(s|t)(ion)$") _step5 = re.compile("^(.+?)e$") # Stemming function + def stem(w): """Uses the Porter stemming algorithm to remove suffixes from English words. 
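
Since this change is intended to be a pure "ruff format" and import-sorting
pass, the stemmer modules should behave identically before and after
reformatting. As an illustrative sanity check only (not part of the patch),
one might spot-check the Porter stemmer, whose stem() function the hunk above
reformats, against a few stems from Porter's original paper:

    # Hypothetical spot check: the reformatted porter.stem() should still
    # return the classic stems. stem() is defined in src/whoosh/lang/porter.py.
    from whoosh.lang.porter import stem

    assert stem("caresses") == "caress"    # step 1a: "sses" -> "ss"
    assert stem("running") == "run"        # step 1b: drop "ing", then undouble "nn"
    assert stem("relational") == "relat"   # step 2: "ational" -> "ate"; step 5 drops the "e"
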
diff --git a/src/whoosh/lang/porter2.py b/src/whoosh/lang/porter2.py index 4d669752..896b7393 100644 --- a/src/whoosh/lang/porter2.py +++ b/src/whoosh/lang/porter2.py @@ -20,9 +20,9 @@ def get_r1(word): # exceptional forms - if word.startswith('gener') or word.startswith('arsen'): + if word.startswith("gener") or word.startswith("arsen"): return 5 - if word.startswith('commun'): + if word.startswith("commun"): return 6 # normal form @@ -62,9 +62,9 @@ def remove_initial_apostrophe(word): def capitalize_consonant_ys(word): - if word.startswith('y'): - word = 'Y' + word[1:] - return ccy_exp.sub(r'\g<1>Y', word) + if word.startswith("y"): + word = "Y" + word[1:] + return ccy_exp.sub(r"\g<1>Y", word) def step_0(word): @@ -78,16 +78,16 @@ def step_0(word): def step_1a(word): - if word.endswith('sses'): - return word[:-4] + 'ss' - if word.endswith('ied') or word.endswith('ies'): + if word.endswith("sses"): + return word[:-4] + "ss" + if word.endswith("ied") or word.endswith("ies"): if len(word) > 4: - return word[:-3] + 'i' + return word[:-3] + "i" else: - return word[:-3] + 'ie' - if word.endswith('us') or word.endswith('ss'): + return word[:-3] + "ie" + if word.endswith("us") or word.endswith("ss"): return word - if word.endswith('s'): + if word.endswith("s"): preceding = word[:-1] if s1a_exp.search(preceding): return preceding @@ -95,7 +95,7 @@ def step_1a(word): return word -doubles = ('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt') +doubles = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt") def ends_with_double(word): @@ -106,31 +106,31 @@ def ends_with_double(word): def step_1b_helper(word): - if word.endswith('at') or word.endswith('bl') or word.endswith('iz'): - return word + 'e' + if word.endswith("at") or word.endswith("bl") or word.endswith("iz"): + return word + "e" if ends_with_double(word): return word[:-1] if is_short_word(word): - return word + 'e' + return word + "e" return word -s1b_suffixes = ('ed', 'edly', 'ing', 'ingly') +s1b_suffixes = ("ed", "edly", "ing", "ingly") def step_1b(word, r1): - if word.endswith('eedly'): + if word.endswith("eedly"): if len(word) - 5 >= r1: return word[:-3] return word - if word.endswith('eed'): + if word.endswith("eed"): if len(word) - 3 >= r1: return word[:-1] return word for suffix in s1b_suffixes: if word.endswith(suffix): - preceding = word[:-len(suffix)] + preceding = word[: -len(suffix)] if s1b_exp.search(preceding): return step_1b_helper(preceding) return word @@ -139,49 +139,51 @@ def step_1b(word, r1): def step_1c(word): - if word.endswith('y') or word.endswith('Y') and len(word) > 1: - if word[-2] not in 'aeiouy': + if word.endswith("y") or word.endswith("Y") and len(word) > 1: + if word[-2] not in "aeiouy": if len(word) > 2: - return word[:-1] + 'i' + return word[:-1] + "i" return word def step_2_helper(word, r1, end, repl, prev): - if word.endswith(end): - if len(word) - len(end) >= r1: - if prev == []: - return word[:-len(end)] + repl - for p in prev: - if word[:-len(end)].endswith(p): - return word[:-len(end)] + repl - return word - return None - - -s2_triples = (('ization', 'ize', []), - ('ational', 'ate', []), - ('fulness', 'ful', []), - ('ousness', 'ous', []), - ('iveness', 'ive', []), - ('tional', 'tion', []), - ('biliti', 'ble', []), - ('lessli', 'less', []), - ('entli', 'ent', []), - ('ation', 'ate', []), - ('alism', 'al', []), - ('aliti', 'al', []), - ('ousli', 'ous', []), - ('iviti', 'ive', []), - ('fulli', 'ful', []), - ('enci', 'ence', []), - ('anci', 'ance', []), - ('abli', 'able', []), - ('izer', 'ize', []), 
- ('ator', 'ate', []), - ('alli', 'al', []), - ('bli', 'ble', []), - ('ogi', 'og', ['l']), - ('li', '', ['c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'])) + if word.endswith(end): + if len(word) - len(end) >= r1: + if prev == []: + return word[: -len(end)] + repl + for p in prev: + if word[: -len(end)].endswith(p): + return word[: -len(end)] + repl + return word + return None + + +s2_triples = ( + ("ization", "ize", []), + ("ational", "ate", []), + ("fulness", "ful", []), + ("ousness", "ous", []), + ("iveness", "ive", []), + ("tional", "tion", []), + ("biliti", "ble", []), + ("lessli", "less", []), + ("entli", "ent", []), + ("ation", "ate", []), + ("alism", "al", []), + ("aliti", "al", []), + ("ousli", "ous", []), + ("iviti", "ive", []), + ("fulli", "ful", []), + ("enci", "ence", []), + ("anci", "ance", []), + ("abli", "able", []), + ("izer", "ize", []), + ("ator", "ate", []), + ("alli", "al", []), + ("bli", "ble", []), + ("ogi", "og", ["l"]), + ("li", "", ["c", "d", "e", "g", "h", "k", "m", "n", "r", "t"]), +) def step_2(word, r1): @@ -196,23 +198,25 @@ def step_3_helper(word, r1, r2, end, repl, r2_necessary): if word.endswith(end): if len(word) - len(end) >= r1: if not r2_necessary: - return word[:-len(end)] + repl + return word[: -len(end)] + repl else: if len(word) - len(end) >= r2: - return word[:-len(end)] + repl + return word[: -len(end)] + repl return word return None -s3_triples = (('ational', 'ate', False), - ('tional', 'tion', False), - ('alize', 'al', False), - ('icate', 'ic', False), - ('iciti', 'ic', False), - ('ative', '', True), - ('ical', 'ic', False), - ('ness', '', False), - ('ful', '', False)) +s3_triples = ( + ("ational", "ate", False), + ("tional", "tion", False), + ("alize", "al", False), + ("icate", "ic", False), + ("iciti", "ic", False), + ("ative", "", True), + ("ical", "ic", False), + ("ness", "", False), + ("ful", "", False), +) def step_3(word, r1, r2): @@ -223,18 +227,35 @@ def step_3(word, r1, r2): return word -s4_delete_list = ('al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement', - 'ment', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize') +s4_delete_list = ( + "al", + "ance", + "ence", + "er", + "ic", + "able", + "ible", + "ant", + "ement", + "ment", + "ent", + "ism", + "ate", + "iti", + "ous", + "ive", + "ize", +) def step_4(word, r2): for end in s4_delete_list: if word.endswith(end): if len(word) - len(end) >= r2: - return word[:-len(end)] + return word[: -len(end)] return word - if word.endswith('sion') or word.endswith('tion'): + if word.endswith("sion") or word.endswith("tion"): if len(word) - 3 >= r2: return word[:-3] @@ -242,12 +263,12 @@ def step_4(word, r2): def step_5(word, r1, r2): - if word.endswith('l'): - if len(word) - 1 >= r2 and word[-2] == 'l': + if word.endswith("l"): + if len(word) - 1 >= r2 and word[-2] == "l": return word[:-1] return word - if word.endswith('e'): + if word.endswith("e"): if len(word) - 1 >= r2: return word[:-1] if len(word) - 1 >= r1 and not ends_with_short_syllable(word[:-1]): @@ -257,30 +278,42 @@ def step_5(word, r1, r2): def normalize_ys(word): - return word.replace('Y', 'y') - - -exceptional_forms = {'skis': 'ski', - 'skies': 'sky', - 'dying': 'die', - 'lying': 'lie', - 'tying': 'tie', - 'idly': 'idl', - 'gently': 'gentl', - 'ugly': 'ugli', - 'early': 'earli', - 'only': 'onli', - 'singly': 'singl', - 'sky': 'sky', - 'news': 'news', - 'howe': 'howe', - 'atlas': 'atlas', - 'cosmos': 'cosmos', - 'bias': 'bias', - 'andes': 'andes'} - -exceptional_early_exit_post_1a = frozenset(['inning', 'outing', 
'canning', 'herring', - 'earring', 'proceed', 'exceed', 'succeed']) + return word.replace("Y", "y") + + +exceptional_forms = { + "skis": "ski", + "skies": "sky", + "dying": "die", + "lying": "lie", + "tying": "tie", + "idly": "idl", + "gently": "gentl", + "ugly": "ugli", + "early": "earli", + "only": "onli", + "singly": "singl", + "sky": "sky", + "news": "news", + "howe": "howe", + "atlas": "atlas", + "cosmos": "cosmos", + "bias": "bias", + "andes": "andes", +} + +exceptional_early_exit_post_1a = frozenset( + [ + "inning", + "outing", + "canning", + "herring", + "earring", + "proceed", + "exceed", + "succeed", + ] +) def stem(word): diff --git a/src/whoosh/lang/snowball/__init__.py b/src/whoosh/lang/snowball/__init__.py index d450288c..4b99cfbc 100644 --- a/src/whoosh/lang/snowball/__init__.py +++ b/src/whoosh/lang/snowball/__init__.py @@ -54,21 +54,21 @@ from .spanish import SpanishStemmer from .swedish import SwedishStemmer - # Map two-letter codes to stemming classes -classes = {"da": DanishStemmer, - "nl": DutchStemmer, - "en": EnglishStemmer, - "fi": FinnishStemmer, - "fr": FrenchStemmer, - "de": GermanStemmer, - "hu": HungarianStemmer, - "it": ItalianStemmer, - "no": NorwegianStemmer, - "pt": PortugueseStemmer, - "ro": RomanianStemmer, - "ru": RussianStemmer, - "es": SpanishStemmer, - "sv": SwedishStemmer, - } +classes = { + "da": DanishStemmer, + "nl": DutchStemmer, + "en": EnglishStemmer, + "fi": FinnishStemmer, + "fr": FrenchStemmer, + "de": GermanStemmer, + "hu": HungarianStemmer, + "it": ItalianStemmer, + "no": NorwegianStemmer, + "pt": PortugueseStemmer, + "ro": RomanianStemmer, + "ru": RussianStemmer, + "es": SpanishStemmer, + "sv": SwedishStemmer, +} diff --git a/src/whoosh/lang/snowball/bases.py b/src/whoosh/lang/snowball/bases.py index 0602385d..776e94aa 100644 --- a/src/whoosh/lang/snowball/bases.py +++ b/src/whoosh/lang/snowball/bases.py @@ -33,10 +33,10 @@ def _r1_scandinavian(self, word, vowels): r1 = "" for i in range(1, len(word)): if word[i] not in vowels and word[i - 1] in vowels: - if len(word[:i + 1]) < 3 and len(word[:i + 1]) > 0: + if len(word[: i + 1]) < 3 and len(word[: i + 1]) > 0: r1 = word[3:] - elif len(word[:i + 1]) >= 3: - r1 = word[i + 1:] + elif len(word[: i + 1]) >= 3: + r1 = word[i + 1 :] else: return word break @@ -82,12 +82,12 @@ def _r1r2_standard(self, word, vowels): r2 = "" for i in range(1, len(word)): if word[i] not in vowels and word[i - 1] in vowels: - r1 = word[i + 1:] + r1 = word[i + 1 :] break for i in range(1, len(r1)): if r1[i] not in vowels and r1[i - 1] in vowels: - r2 = r1[i + 1:] + r2 = r1[i + 1 :] break return (r1, r2) @@ -119,13 +119,13 @@ def _rv_standard(self, word, vowels): if word[1] not in vowels: for i in range(2, len(word)): if word[i] in vowels: - rv = word[i + 1:] + rv = word[i + 1 :] break elif word[:2] in vowels: for i in range(2, len(word)): if word[i] not in vowels: - rv = word[i + 1:] + rv = word[i + 1 :] break else: rv = word[3:] diff --git a/src/whoosh/lang/snowball/danish.py b/src/whoosh/lang/snowball/danish.py index 8c4f4878..9a4351af 100644 --- a/src/whoosh/lang/snowball/danish.py +++ b/src/whoosh/lang/snowball/danish.py @@ -1,7 +1,7 @@ -from .bases import _ScandinavianStemmer - from whoosh.compat import u +from .bases import _ScandinavianStemmer + class DanishStemmer(_ScandinavianStemmer): """ diff --git a/src/whoosh/lang/snowball/dutch.py b/src/whoosh/lang/snowball/dutch.py index 0d683649..8f73195a 100644 --- a/src/whoosh/lang/snowball/dutch.py +++ b/src/whoosh/lang/snowball/dutch.py @@ -1,7 +1,7 @@ -from 
.bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class DutchStemmer(_StandardStemmer): """ diff --git a/src/whoosh/lang/snowball/english.py b/src/whoosh/lang/snowball/english.py index a2567dab..f7b5b092 100644 --- a/src/whoosh/lang/snowball/english.py +++ b/src/whoosh/lang/snowball/english.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class EnglishStemmer(_StandardStemmer): """ diff --git a/src/whoosh/lang/snowball/finnish.py b/src/whoosh/lang/snowball/finnish.py index 63f5a752..b9eb9205 100644 --- a/src/whoosh/lang/snowball/finnish.py +++ b/src/whoosh/lang/snowball/finnish.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class FinnishStemmer(_StandardStemmer): """ diff --git a/src/whoosh/lang/snowball/french.py b/src/whoosh/lang/snowball/french.py index f204adf3..d63a5b33 100644 --- a/src/whoosh/lang/snowball/french.py +++ b/src/whoosh/lang/snowball/french.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class FrenchStemmer(_StandardStemmer): diff --git a/src/whoosh/lang/snowball/german.py b/src/whoosh/lang/snowball/german.py index 1c5f94f3..263b4972 100644 --- a/src/whoosh/lang/snowball/german.py +++ b/src/whoosh/lang/snowball/german.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class GermanStemmer(_StandardStemmer): diff --git a/src/whoosh/lang/snowball/italian.py b/src/whoosh/lang/snowball/italian.py index daadac9a..2165a8d5 100644 --- a/src/whoosh/lang/snowball/italian.py +++ b/src/whoosh/lang/snowball/italian.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class ItalianStemmer(_StandardStemmer): diff --git a/src/whoosh/lang/snowball/norwegian.py b/src/whoosh/lang/snowball/norwegian.py index 4bc0f7b0..c011ca94 100644 --- a/src/whoosh/lang/snowball/norwegian.py +++ b/src/whoosh/lang/snowball/norwegian.py @@ -1,7 +1,7 @@ -from .bases import _ScandinavianStemmer - from whoosh.compat import u +from .bases import _ScandinavianStemmer + class NorwegianStemmer(_ScandinavianStemmer): diff --git a/src/whoosh/lang/snowball/portugese.py b/src/whoosh/lang/snowball/portugese.py index 54dcb5aa..bed4e943 100644 --- a/src/whoosh/lang/snowball/portugese.py +++ b/src/whoosh/lang/snowball/portugese.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class PortugueseStemmer(_StandardStemmer): diff --git a/src/whoosh/lang/snowball/romanian.py b/src/whoosh/lang/snowball/romanian.py index 89a96de6..c33b0d90 100644 --- a/src/whoosh/lang/snowball/romanian.py +++ b/src/whoosh/lang/snowball/romanian.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class RomanianStemmer(_StandardStemmer): diff --git a/src/whoosh/lang/snowball/spanish.py b/src/whoosh/lang/snowball/spanish.py index ccb21871..f1e50ed2 100644 --- a/src/whoosh/lang/snowball/spanish.py +++ b/src/whoosh/lang/snowball/spanish.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class SpanishStemmer(_StandardStemmer): diff --git a/src/whoosh/lang/snowball/swedish.py b/src/whoosh/lang/snowball/swedish.py 
index 9303e3f7..cb46fbfd 100644 --- a/src/whoosh/lang/snowball/swedish.py +++ b/src/whoosh/lang/snowball/swedish.py @@ -1,7 +1,7 @@ -from .bases import _ScandinavianStemmer - from whoosh.compat import u +from .bases import _ScandinavianStemmer + class SwedishStemmer(_ScandinavianStemmer): diff --git a/src/whoosh/lang/stopwords.py b/src/whoosh/lang/stopwords.py index 8fc1703d..fab7b61c 100644 --- a/src/whoosh/lang/stopwords.py +++ b/src/whoosh/lang/stopwords.py @@ -15,11 +15,11 @@ # ===== # This module was generated from the original files using the following script -#import os.path -#import textwrap +# import os.path +# import textwrap # -#names = os.listdir("stopwords") -#for name in names: +# names = os.listdir("stopwords") +# for name in names: # f = open("stopwords/" + name) # wordls = [line.strip() for line in f] # words = " ".join(wordls) @@ -30,16 +30,18 @@ stoplists = { - "da": frozenset(""" + "da": frozenset( + """ og i jeg det at en den til er som på de med han af for ikke der var mig sig men et har om vi min havde ham hun nu over da fra du ud sin dem os op man hans hvor eller hvad skal selv her alle vil blev kunne ind når være dog noget ville jo deres efter ned skulle denne end dette mit også under have dig anden hende mine alt meget sit sine vor mod disse hvis din nogle hos blive mange ad bliver hendes været thi jer sådan - """.split()), - - "nl": frozenset(""" + """.split() + ), + "nl": frozenset( + """ de en van ik te dat die in een hij het niet zijn is was op aan met als voor had er maar om hem dan zou of wat mijn men dit zo door over ze zich bij ook tot je mij uit der daar haar naar heb hoe heeft hebben deze u @@ -47,9 +49,10 @@ doen toen moet ben zonder kan hun dus alles onder ja eens hier wie werd altijd doch wordt wezen kunnen ons zelf tegen na reeds wil kon niets uw iemand geweest andere - """.split()), - - "en": frozenset(""" + """.split() + ), + "en": frozenset( + """ i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are @@ -59,9 +62,10 @@ out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now - """.split()), - - "fi": frozenset(""" + """.split() + ), + "fi": frozenset( + """ olla olen olet on olemme olette ovat ole oli olisi olisit olisin olisimme olisitte olisivat olit olin olimme olitte olivat ollut olleet en et ei emme ette eivät minä minun minut minua minussa minusta minuun @@ -85,9 +89,10 @@ joita joissa joista joihin joilla joilta joille joina joiksi että ja jos koska kuin mutta niin sekä sillä tai vaan vai vaikka kanssa mukaan noin poikki yli kun niin nyt itse - """.split()), - - "fr": frozenset(""" + """.split() + ), + "fr": frozenset( + """ au aux avec ce ces dans de des du elle en et eux il je la le leur lui ma mais me même mes moi mon ne nos notre nous on ou par pas pour qu que qui sa se ses son sur ta te tes toi ton tu un une vos votre vous c d j l @@ -100,9 +105,10 @@ auront aurais aurait aurions auriez auraient avais avait avions aviez avaient eut eûmes eûtes eurent aie aies ait ayons ayez aient eusse eusses eût eussions eussiez eussent - """.split()), - - "de": frozenset(""" + """.split() + ), + "de": frozenset( + """ aber alle allem allen aller alles als also am an ander andere anderem anderen anderer anderes anderm andern anderr anders auch auf aus bei 
bin bis bist da damit dann der den des dem die das daß derselbe derselben @@ -122,9 +128,10 @@ unter viel vom von vor während war waren warst was weg weil weiter welche welchem welchen welcher welches wenn werde werden wie wieder will wir wird wirst wo wollen wollte würde würden zu zum zur zwar zwischen - """.split()), - - "hu": frozenset(""" + """.split() + ), + "hu": frozenset( + """ a ahogy ahol aki akik akkor alatt által általában amely amelyek amelyekben amelyeket amelyet amelynek ami amit amolyan amíg amikor át abban ahhoz annak arra arról az azok azon azt azzal azért aztán @@ -143,9 +150,10 @@ több úgy ugyanis új újabb újra után utána utolsó vagy vagyis valaki valami valamint való vagyok van vannak volt voltam voltak voltunk vissza vele viszont volna - """.split()), - - "it": frozenset(""" + """.split() + ), + "it": frozenset( + """ ad al allo ai agli all agl alla alle con col coi da dal dallo dai dagli dall dagl dalla dalle di del dello dei degli dell degl della delle in nel nello nei negli nell negl nella nelle su sul sullo sui sugli sull @@ -170,9 +178,10 @@ staresti starebbe staremmo stareste starebbero stavo stavi stava stavamo stavate stavano stetti stesti stette stemmo steste stettero stessi stesse stessimo stessero stando - """.split()), - - "no": frozenset(""" + """.split() + ), + "no": frozenset( + """ og i jeg det at en et den til er som på de med han av ikke ikkje der så var meg seg men ett har om vi min mitt ha hadde hun nå over da ved fra du ut sin dem oss opp man kan hans hvor eller hva skal selv sjøl @@ -185,9 +194,10 @@ hennes hoss hossen ikkje ingi inkje korleis korso kva kvar kvarhelst kven kvi kvifor me medan mi mine mykje no nokon noka nokor noko nokre si sia sidan so somt somme um upp vere vore verte vort varte vart - """.split()), - - "pt": frozenset(""" + """.split() + ), + "pt": frozenset( + """ de a o que e do da em um para com não uma os no se na por mais as dos como mas ao ele das à seu sua ou quando muito nos já eu também só pelo pela até isso ela entre depois sem mesmo aos seus quem nas me esse @@ -207,9 +217,10 @@ tivemos tiveram tivera tivéramos tenha tenhamos tenham tivesse tivéssemos tivessem tiver tivermos tiverem terei terá teremos terão teria teríamos teriam - """.split()), - - "ru": frozenset(""" + """.split() + ), + "ru": frozenset( + """ и в во не что он на я с со как а то все она так его но да ты к у же вы за бы по только ее мне было вот от меня еще нет о из ему @@ -228,9 +239,10 @@ впрочем хорошо свою этой перед иногда лучше чуть том нельзя такой им более всегда конечно всю между - """.split()), - - "es": frozenset(""" + """.split() + ), + "es": frozenset( + """ de la que el en y a los del se las por un para con no una su al lo como más pero sus le ya o este sí porque esta entre cuando muy sin sobre también me hasta hay donde quien desde todo nos durante todos uno les @@ -263,9 +275,10 @@ tuvieron tuviera tuvieras tuviéramos tuvierais tuvieran tuviese tuvieses tuviésemos tuvieseis tuviesen teniendo tenido tenida tenidos tenidas tened - """.split()), - - "sv": frozenset(""" + """.split() + ), + "sv": frozenset( + """ och det att i en jag hon som han på den med var sig för så till är men ett om hade de av icke mig du henne då sin nu har inte hans honom skulle hennes där min man ej vid kunde något från ut när efter upp @@ -274,12 +287,14 @@ mitt ni bli blev oss din dessa några deras blir mina samma vilken er sådan vår blivit dess inom mellan sådant varför varje vilka ditt vem vilket sitta sådana vart dina vars vårt våra ert era vilkas - 
""".split()), - - "tr": frozenset(""" + """.split() + ), + "tr": frozenset( + """ acaba ama aslında az bazı belki biri birkaç birşey biz bu çok çünkü da daha de defa diye eğer en gibi hem hep hepsi her hiç için ile ise kez ki kim mı mu mü nasıl ne neden nerde nerede nereye niçin niye o sanki şey siz şu tüm ve veya ya yani - """.split()), + """.split() + ), } diff --git a/src/whoosh/lang/wordnet.py b/src/whoosh/lang/wordnet.py index 843da196..bf859cc1 100644 --- a/src/whoosh/lang/wordnet.py +++ b/src/whoosh/lang/wordnet.py @@ -35,7 +35,7 @@ from collections import defaultdict from whoosh.compat import iterkeys, text_type -from whoosh.fields import Schema, ID, STORED +from whoosh.fields import ID, STORED, Schema def parse_file(f): diff --git a/src/whoosh/matching/__init__.py b/src/whoosh/matching/__init__.py index 3f826b98..e07a0320 100644 --- a/src/whoosh/matching/__init__.py +++ b/src/whoosh/matching/__init__.py @@ -25,7 +25,7 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from whoosh.matching.mcore import * from whoosh.matching.binary import * -from whoosh.matching.wrappers import * from whoosh.matching.combo import * +from whoosh.matching.mcore import * +from whoosh.matching.wrappers import * diff --git a/src/whoosh/matching/combo.py b/src/whoosh/matching/combo.py index e642feec..64cdb43d 100644 --- a/src/whoosh/matching/combo.py +++ b/src/whoosh/matching/combo.py @@ -26,6 +26,7 @@ # policies, either expressed or implied, of Matt Chaput. from __future__ import division + from array import array from whoosh.compat import range diff --git a/src/whoosh/matching/mcore.py b/src/whoosh/matching/mcore.py index df5db6b8..a13a0d6e 100644 --- a/src/whoosh/matching/mcore.py +++ b/src/whoosh/matching/mcore.py @@ -51,9 +51,7 @@ from itertools import repeat -from whoosh.compat import izip -from whoosh.compat import abstractmethod - +from whoosh.compat import abstractmethod, izip # Exceptions diff --git a/src/whoosh/multiproc.py b/src/whoosh/multiproc.py index c1caf00f..6d7bcc4a 100644 --- a/src/whoosh/multiproc.py +++ b/src/whoosh/multiproc.py @@ -26,13 +26,14 @@ # policies, either expressed or implied, of Matt Chaput. from __future__ import with_statement + from multiprocessing import Process, Queue, cpu_count -from whoosh.compat import queue, range, pickle from whoosh.codec import base -from whoosh.writing import SegmentWriter +from whoosh.compat import pickle, queue, range from whoosh.externalsort import imerge from whoosh.util import random_name +from whoosh.writing import SegmentWriter def finish_subsegment(writer, k=64): diff --git a/src/whoosh/qparser/common.py b/src/whoosh/qparser/common.py index 8e257801..a10d0d68 100644 --- a/src/whoosh/qparser/common.py +++ b/src/whoosh/qparser/common.py @@ -40,8 +40,7 @@ def __init__(self, cause, msg=None): def get_single_text(field, text, **kwargs): - """Returns the first token from an analyzer's output. 
- """ + """Returns the first token from an analyzer's output.""" for t in field.process_text(text, mode="query", **kwargs): return t diff --git a/src/whoosh/qparser/dateparse.py b/src/whoosh/qparser/dateparse.py index 37def074..2ee463f0 100644 --- a/src/whoosh/qparser/dateparse.py +++ b/src/whoosh/qparser/dateparse.py @@ -29,14 +29,19 @@ import sys from datetime import datetime, timedelta -from whoosh.compat import string_type, iteritems +from whoosh.compat import iteritems, string_type from whoosh.qparser import plugins, syntax from whoosh.qparser.taggers import Tagger from whoosh.support.relativedelta import relativedelta from whoosh.util.text import rcompile -from whoosh.util.times import adatetime, timespan -from whoosh.util.times import fill_in, is_void, relative_days -from whoosh.util.times import TimeError +from whoosh.util.times import ( + TimeError, + adatetime, + fill_in, + is_void, + relative_days, + timespan, +) class DateParseError(Exception): diff --git a/src/whoosh/qparser/default.py b/src/whoosh/qparser/default.py index 499683d4..e783147f 100644 --- a/src/whoosh/qparser/default.py +++ b/src/whoosh/qparser/default.py @@ -30,8 +30,7 @@ from whoosh import query from whoosh.compat import text_type from whoosh.qparser import syntax -from whoosh.qparser.common import print_debug, QueryParserError - +from whoosh.qparser.common import QueryParserError, print_debug # Query parser object diff --git a/src/whoosh/qparser/plugins.py b/src/whoosh/qparser/plugins.py index 1bc463d4..c382ca43 100644 --- a/src/whoosh/qparser/plugins.py +++ b/src/whoosh/qparser/plugins.py @@ -28,11 +28,10 @@ import copy from whoosh import query -from whoosh.compat import u -from whoosh.compat import iteritems, range +from whoosh.compat import iteritems, range, u from whoosh.qparser import syntax from whoosh.qparser.common import attach -from whoosh.qparser.taggers import RegexTagger, FnTagger +from whoosh.qparser.taggers import FnTagger, RegexTagger from whoosh.util.text import rcompile diff --git a/src/whoosh/qparser/syntax.py b/src/whoosh/qparser/syntax.py index ac383b5f..0a51f9d8 100644 --- a/src/whoosh/qparser/syntax.py +++ b/src/whoosh/qparser/syntax.py @@ -25,10 +25,11 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -import sys, weakref +import sys +import weakref from whoosh import query -from whoosh.qparser.common import get_single_text, QueryParserError, attach +from whoosh.qparser.common import QueryParserError, attach, get_single_text class SyntaxNode(object): @@ -227,7 +228,7 @@ def apply(self, fn): self.type, [fn(node) for node in self.nodes], boost=self.boost, - **self.kwargs + **self.kwargs, ) def query(self, parser): diff --git a/src/whoosh/qparser/taggers.py b/src/whoosh/qparser/taggers.py index d26caeee..2c7d46c1 100644 --- a/src/whoosh/qparser/taggers.py +++ b/src/whoosh/qparser/taggers.py @@ -27,7 +27,6 @@ from whoosh.util.text import rcompile - # Tagger objects diff --git a/src/whoosh/query/__init__.py b/src/whoosh/query/__init__.py index 97e34a40..8a9ae0af 100644 --- a/src/whoosh/query/__init__.py +++ b/src/whoosh/query/__init__.py @@ -25,12 +25,12 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
-from whoosh.query.qcore import * -from whoosh.query.terms import * from whoosh.query.compound import * -from whoosh.query.positional import * -from whoosh.query.ranges import * -from whoosh.query.wrappers import * from whoosh.query.nested import * +from whoosh.query.positional import * from whoosh.query.qcolumns import * +from whoosh.query.qcore import * +from whoosh.query.ranges import * from whoosh.query.spans import * +from whoosh.query.terms import * +from whoosh.query.wrappers import * diff --git a/src/whoosh/query/compound.py b/src/whoosh/query/compound.py index 05ca46d0..afa3e056 100644 --- a/src/whoosh/query/compound.py +++ b/src/whoosh/query/compound.py @@ -118,7 +118,7 @@ def estimate_min_size(self, ixreader): return 0 def normalize(self): - from whoosh.query import Every, TermRange, NumericRange + from whoosh.query import Every, NumericRange, TermRange # Normalize subqueries and merge nested instances of this class subqueries = [] diff --git a/src/whoosh/query/positional.py b/src/whoosh/query/positional.py index ab78d5ac..f21076cd 100644 --- a/src/whoosh/query/positional.py +++ b/src/whoosh/query/positional.py @@ -26,12 +26,13 @@ # policies, either expressed or implied, of Matt Chaput. from __future__ import division + import copy from whoosh import matching from whoosh.analysis import Token from whoosh.compat import u -from whoosh.query import qcore, terms, compound +from whoosh.query import compound, qcore, terms class Sequence(compound.CompoundQuery): @@ -244,7 +245,7 @@ def estimate_min_size(self, ixreader): return self._and_query().estimate_min_size(ixreader) def matcher(self, searcher, context=None): - from whoosh.query import Term, SpanNear2 + from whoosh.query import SpanNear2, Term fieldname = self.fieldname if fieldname not in searcher.schema: diff --git a/src/whoosh/query/qcore.py b/src/whoosh/query/qcore.py index d4845b49..73c52080 100644 --- a/src/whoosh/query/qcore.py +++ b/src/whoosh/query/qcore.py @@ -26,14 +26,13 @@ # policies, either expressed or implied, of Matt Chaput. from __future__ import division + import copy from array import array from whoosh import matching -from whoosh.compat import u +from whoosh.compat import methodcaller, u from whoosh.reading import TermNotFound -from whoosh.compat import methodcaller - # Exceptions diff --git a/src/whoosh/query/ranges.py b/src/whoosh/query/ranges.py index 5136b455..cd96e063 100644 --- a/src/whoosh/query/ranges.py +++ b/src/whoosh/query/ranges.py @@ -27,7 +27,7 @@ from whoosh.compat import b, u -from whoosh.query import qcore, terms, compound, wrappers +from whoosh.query import compound, qcore, terms, wrappers from whoosh.util.times import datetime_to_long diff --git a/src/whoosh/query/spans.py b/src/whoosh/query/spans.py index 460c99f9..5e89db75 100644 --- a/src/whoosh/query/spans.py +++ b/src/whoosh/query/spans.py @@ -43,11 +43,10 @@ """ -from whoosh.matching import mcore, wrappers, binary -from whoosh.query import Query, And, AndMaybe, Or, Term +from whoosh.matching import binary, mcore, wrappers +from whoosh.query import And, AndMaybe, Or, Query, Term from whoosh.util import make_binary_tree - # Span class diff --git a/src/whoosh/query/terms.py b/src/whoosh/query/terms.py index 98c7dee6..67b00a19 100644 --- a/src/whoosh/query/terms.py +++ b/src/whoosh/query/terms.py @@ -26,6 +26,7 @@ # policies, either expressed or implied, of Matt Chaput. 
from __future__ import division + import copy import fnmatch import re diff --git a/src/whoosh/query/wrappers.py b/src/whoosh/query/wrappers.py index 9b51128e..58d8ecb7 100644 --- a/src/whoosh/query/wrappers.py +++ b/src/whoosh/query/wrappers.py @@ -26,6 +26,7 @@ # policies, either expressed or implied, of Matt Chaput. from __future__ import division + from array import array from whoosh import matching diff --git a/src/whoosh/reading.py b/src/whoosh/reading.py index 94ac59dd..9b41c8c8 100644 --- a/src/whoosh/reading.py +++ b/src/whoosh/reading.py @@ -28,21 +28,19 @@ """This module contains classes that allow reading from an index. """ -from math import log from bisect import bisect_right -from heapq import heapify, heapreplace, heappop, nlargest +from heapq import heapify, heappop, heapreplace, nlargest +from math import log from cached_property import cached_property from whoosh import columns -from whoosh.compat import abstractmethod -from whoosh.compat import zip_, next, iteritems +from whoosh.compat import abstractmethod, iteritems, next, zip_ from whoosh.filedb.filestore import OverlayStorage from whoosh.matching import MultiMatcher from whoosh.support.levenshtein import distance from whoosh.system import emptybytes - # Exceptions diff --git a/src/whoosh/scoring.py b/src/whoosh/scoring.py index 40de76e5..1d92e061 100644 --- a/src/whoosh/scoring.py +++ b/src/whoosh/scoring.py @@ -30,11 +30,11 @@ """ from __future__ import division + from math import log, pi from whoosh.compat import iteritems - # Base classes diff --git a/src/whoosh/searching.py b/src/whoosh/searching.py index cdbdd20c..805e99d2 100644 --- a/src/whoosh/searching.py +++ b/src/whoosh/searching.py @@ -30,13 +30,14 @@ from __future__ import division + import copy import weakref from math import ceil from whoosh import classify, highlight, query, scoring -from whoosh.compat import iteritems, itervalues, iterkeys, range -from whoosh.idsets import DocIdSet, BitSet +from whoosh.compat import iteritems, iterkeys, itervalues, range +from whoosh.idsets import BitSet, DocIdSet from whoosh.reading import TermNotFound diff --git a/src/whoosh/sorting.py b/src/whoosh/sorting.py index 83bbbaf5..79671331 100644 --- a/src/whoosh/sorting.py +++ b/src/whoosh/sorting.py @@ -28,9 +28,7 @@ from array import array from collections import defaultdict -from whoosh.compat import string_type -from whoosh.compat import iteritems, izip, range - +from whoosh.compat import iteritems, izip, range, string_type # Faceting objects diff --git a/src/whoosh/spelling.py b/src/whoosh/spelling.py index 3a9365b1..79019402 100644 --- a/src/whoosh/spelling.py +++ b/src/whoosh/spelling.py @@ -35,7 +35,6 @@ from whoosh import highlight from whoosh.compat import iteritems, range - # Corrector objects @@ -125,8 +124,8 @@ def __init__(self, wordlist): self.wordlist = wordlist def _suggestions(self, text, maxdist, prefix): - from whoosh.automata.lev import levenshtein_automaton from whoosh.automata.fsa import find_all_matches + from whoosh.automata.lev import levenshtein_automaton seen = set() for mxd in range(1, maxdist + 1): diff --git a/src/whoosh/support/base85.py b/src/whoosh/support/base85.py index 66e7915c..620d5783 100644 --- a/src/whoosh/support/base85.py +++ b/src/whoosh/support/base85.py @@ -11,7 +11,6 @@ from whoosh.compat import range - # Instead of using the character set from the ascii85 algorithm, I put the # characters in order so that the encoded text sorts properly (my life would be # a lot easier if they had just done that from the start) diff --git 
a/src/whoosh/support/bench.py b/src/whoosh/support/bench.py index b3146a76..66b20988 100644 --- a/src/whoosh/support/bench.py +++ b/src/whoosh/support/bench.py @@ -26,12 +26,13 @@ # policies, either expressed or implied, of Matt Chaput. from __future__ import division + import os.path from optparse import OptionParser from shutil import rmtree from whoosh import index, qparser, query, scoring -from whoosh.util import now, find_object +from whoosh.util import find_object, now try: import xappy # type: ignore @@ -320,10 +321,12 @@ def findterms(self, terms): class ZcatalogModule(Module): def indexer(self, **kwargs): - from ZODB.FileStorage import FileStorage # type: ignore # type: ignore @UnresolvedImport - from ZODB.DB import DB # type: ignore # type: ignore @UnresolvedImport - from zcatalog import catalog # type: ignore # type: ignore @UnresolvedImport import transaction # type: ignore # type: ignore @UnresolvedImport + from zcatalog import catalog # type: ignore # type: ignore @UnresolvedImport + from ZODB.DB import DB # type: ignore # type: ignore @UnresolvedImport + from ZODB.FileStorage import ( + FileStorage, # type: ignore # type: ignore @UnresolvedImport + ) dir = os.path.join(self.options.dir, f"{self.options.indexname}_zcatalog") if os.path.exists(dir): @@ -360,8 +363,10 @@ def finish(self, **kwargs): del self.zcatalog_count def searcher(self): - from ZODB.FileStorage import FileStorage # type: ignore # type: ignore @UnresolvedImport from ZODB.DB import DB # type: ignore # type: ignore @UnresolvedImport + from ZODB.FileStorage import ( + FileStorage, # type: ignore # type: ignore @UnresolvedImport + ) path = os.path.join( self.options.dir, f"{self.options.indexname}_zcatalog", "index" @@ -393,6 +398,7 @@ def results(self, r): class NucularModule(Module): def indexer(self, create=True): import shutil + from nucular import Nucular # type: ignore # type: ignore @UnresolvedImport dir = os.path.join(self.options.dir, f"{self.options.indexname}_nucular") diff --git a/src/whoosh/support/bitstream.py b/src/whoosh/support/bitstream.py index 682afbb8..d32ccd6d 100644 --- a/src/whoosh/support/bitstream.py +++ b/src/whoosh/support/bitstream.py @@ -8,7 +8,6 @@ from whoosh.system import _LONG_SIZE - _bitsperlong = _LONG_SIZE * 8 diff --git a/src/whoosh/support/bitvector.py b/src/whoosh/support/bitvector.py index e84943e9..45f491ec 100644 --- a/src/whoosh/support/bitvector.py +++ b/src/whoosh/support/bitvector.py @@ -6,182 +6,429 @@ from array import array #: Table of the number of '1' bits in each byte (0-255) -BYTE_COUNTS = array('B', [ - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8]) +BYTE_COUNTS = array( + "B", + [ + 0, + 1, + 1, + 2, + 1, + 2, + 2, + 3, + 1, + 2, + 2, + 3, + 2, + 3, + 3, + 4, + 1, + 2, + 2, + 3, + 2, + 3, + 3, + 4, + 2, + 3, + 3, + 4, + 
3, + 4, + 4, + 5, + 1, + 2, + 2, + 3, + 2, + 3, + 3, + 4, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 1, + 2, + 2, + 3, + 2, + 3, + 3, + 4, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 4, + 5, + 5, + 6, + 5, + 6, + 6, + 7, + 1, + 2, + 2, + 3, + 2, + 3, + 3, + 4, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 4, + 5, + 5, + 6, + 5, + 6, + 6, + 7, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 4, + 5, + 5, + 6, + 5, + 6, + 6, + 7, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 4, + 5, + 5, + 6, + 5, + 6, + 6, + 7, + 4, + 5, + 5, + 6, + 5, + 6, + 6, + 7, + 5, + 6, + 6, + 7, + 6, + 7, + 7, + 8, + ], +) class BitVector(object): """ Implements a memory-efficient array of bits. - + >>> bv = BitVector(10) >>> bv >>> bv[5] = True >>> bv - + You can initialize the BitVector using an iterable of integers representing bit positions to turn on. - + >>> bv2 = BitVector(10, [2, 4, 7]) >>> bv2 >>> bv[2] True - + BitVector supports bit-wise logic operations & (and), | (or), and ^ (xor) between itself and another BitVector of equal size, or itself and a collection of integers (usually a set() or frozenset()). - + >>> bv | bv2 - + Note that ``BitVector.__len__()`` returns the number of "on" bits, not the size of the bit array. This is to make BitVector interchangeable with a set()/frozenset() of integers. To get the size, use BitVector.size. """ - + def __init__(self, size, source=None, bits=None): self.size = size - + if bits: self.bits = bits else: self.bits = array("B", ([0x00] * ((size >> 3) + 1))) - + if source: set = self.set for num in source: set(num) - + self.bcount = None - + def __eq__(self, other): if isinstance(other, BitVector): return self.bits == other.bits return False - + def __repr__(self): return f"" - + def __len__(self): # This returns the count of "on" bits instead of the size to # make BitVector exchangeable with a set() object. 
return self.count() - + def __contains__(self, index): return self[index] - + def __iter__(self): get = self.__getitem__ for i in range(0, self.size): if get(i): yield i - + def __str__(self): get = self.__getitem__ - return "".join("1" if get(i) else "0" - for i in range(0, self.size)) - + return "".join("1" if get(i) else "0" for i in range(0, self.size)) + def __nonzero__(self): return self.count() > 0 - + def __getitem__(self, index): return self.bits[index >> 3] & (1 << (index & 7)) != 0 - + def __setitem__(self, index, value): if value: self.set(index) else: self.clear(index) - + def _logic(self, op, bitv): if self.size != bitv.size: raise ValueError("Can't combine bitvectors of different sizes") res = BitVector(size=self.size) lpb = map(op, self.bits, bitv.bits) - res.bits = array('B', lpb) + res.bits = array("B", lpb) return res - + def union(self, other): return self.__or__(other) - + def intersection(self, other): return self.__and__(other) - + def __and__(self, other): if not isinstance(other, BitVector): other = BitVector(self.size, source=other) return self._logic(operator.__and__, other) - + def __or__(self, other): if not isinstance(other, BitVector): other = BitVector(self.size, source=other) return self._logic(operator.__or__, other) - + def __ror__(self, other): return self.__or__(other) - + def __rand__(self, other): return self.__and__(other) - + def __xor__(self, other): if not isinstance(other, BitVector): other = BitVector(self.size, source=other) return self._logic(operator.__xor__, other) - + def __invert__(self): - return BitVector(self.size, source=(x for x in range(self.size) if x not in self)) - + return BitVector( + self.size, source=(x for x in range(self.size) if x not in self) + ) + def count(self): """Returns the number of "on" bits in the bit array.""" - + if self.bcount is None: self.bcount = sum(BYTE_COUNTS[b & 0xFF] for b in self.bits) return self.bcount - + def set(self, index): """Turns the bit at the given position on.""" - + if index >= self.size: - raise IndexError(f"Position {repr(index)} greater than the size of the vector") + raise IndexError( + f"Position {repr(index)} greater than the size of the vector" + ) self.bits[index >> 3] |= 1 << (index & 7) self.bcount = None - + def clear(self, index): """Turns the bit at the given position off.""" - + self.bits[index >> 3] &= ~(1 << (index & 7)) self.bcount = None - + def set_from(self, iterable): """Takes an iterable of integers representing positions, and turns on the bits at those positions. """ - + set = self.set for index in iterable: set(index) - + def copy(self): """Returns a copy of this BitArray.""" - + return BitVector(self.size, bits=self.bits) @@ -189,23 +436,23 @@ class BitSet(object): """A set-like object for holding positive integers. It is dynamically backed by either a set or BitVector depending on how many numbers are in the set. - + Provides ``add``, ``remove``, ``union``, ``intersection``, ``__contains__``, ``__len__``, ``__iter__``, ``__and__``, ``__or__``, and ``__nonzero__`` methods. 
""" - + def __init__(self, size, source=None): self.size = size - + self._back = () self._switch(size > 256) - + if source: add = self.add for num in source: add(num) - + def _switch(self, toset): if toset: self._back = set(self._back) @@ -215,7 +462,7 @@ def _switch(self, toset): self._back = BitVector() self.add = self._back.set self.remove = self._vec_remove - + self.__contains__ = self._back.__contains__ self.__len__ = self._back.__len__ self.__iter__ = self._back.__iter__ @@ -226,13 +473,13 @@ def as_set(self): def union(self, other): return self.__or__(other) - + def intersection(self, other): return self.__and__(other) def __and__(self, other): self._back = self._back.intersection(other) - + def __or__(self, other): self._back = self._back.union(other) @@ -240,14 +487,8 @@ def _set_add(self, num): self._back.add(num) if len(self._back) * 4 > self.size // 8 + 32: self._switch(False) - + def _vec_remove(self, num): self._back.clear(num) if len(self._back) * 4 < self.size // 8 - 32: self._switch(True) - - - - - - diff --git a/src/whoosh/support/charset.py b/src/whoosh/support/charset.py index 530c6724..a15f6e51 100644 --- a/src/whoosh/support/charset.py +++ b/src/whoosh/support/charset.py @@ -5,10 +5,10 @@ See :class:`whoosh.analysis.CharsetTokenizer` and :class:`whoosh.analysis.CharsetFilter`. """ -from collections import defaultdict import re +from collections import defaultdict -from whoosh.compat import izip, u, iteritems, unichr, range +from whoosh.compat import iteritems, izip, range, u, unichr # This is a straightforward accent-folding charset taken from Carlos Bueno's # article "Accent Folding for Auto-Complete", for use with CharsetFilter. diff --git a/src/whoosh/support/pyparsing.py b/src/whoosh/support/pyparsing.py index 90bc23ab..9841d875 100644 --- a/src/whoosh/support/pyparsing.py +++ b/src/whoosh/support/pyparsing.py @@ -21,10 +21,9 @@ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # -#from __future__ import generators +# from __future__ import generators -__doc__ = \ -""" +__doc__ = """ pyparsing module - Classes and methods to define and execute parsing grammars The pyparsing module is an alternative approach to creating and executing simple grammars, @@ -62,35 +61,120 @@ class names, and the use of '+', '|' and '^' operators. 
__versionTime__ = "17 February 2009 19:45" __author__ = "Paul McGuire " -import string -from weakref import ref as wkref import copy -import sys -import warnings import re import sre_constants +import string +import sys +import warnings +from weakref import ref as wkref from whoosh.support import unicode -#~ sys.stderr.write( "testing pyparsing module, version %s, %s\n" % (__version__,__versionTime__ ) ) + +# ~ sys.stderr.write( "testing pyparsing module, version %s, %s\n" % (__version__,__versionTime__ ) ) __all__ = [ -'And', 'CaselessKeyword', 'CaselessLiteral', 'CharsNotIn', 'Combine', 'Dict', 'Each', 'Empty', -'FollowedBy', 'Forward', 'GoToColumn', 'Group', 'Keyword', 'LineEnd', 'LineStart', 'Literal', -'MatchFirst', 'NoMatch', 'NotAny', 'OneOrMore', 'OnlyOnce', 'Optional', 'Or', -'ParseBaseException', 'ParseElementEnhance', 'ParseException', 'ParseExpression', 'ParseFatalException', -'ParseResults', 'ParseSyntaxException', 'ParserElement', 'QuotedString', 'RecursiveGrammarException', -'Regex', 'SkipTo', 'StringEnd', 'StringStart', 'Suppress', 'Token', 'TokenConverter', 'Upcase', -'White', 'Word', 'WordEnd', 'WordStart', 'ZeroOrMore', -'alphanums', 'alphas', 'alphas8bit', 'anyCloseTag', 'anyOpenTag', 'cStyleComment', 'col', -'commaSeparatedList', 'commonHTMLEntity', 'countedArray', 'cppStyleComment', 'dblQuotedString', -'dblSlashComment', 'delimitedList', 'dictOf', 'downcaseTokens', 'empty', 'getTokensEndLoc', 'hexnums', -'htmlComment', 'javaStyleComment', 'keepOriginalText', 'line', 'lineEnd', 'lineStart', 'lineno', -'makeHTMLTags', 'makeXMLTags', 'matchOnlyAtCol', 'matchPreviousExpr', 'matchPreviousLiteral', -'nestedExpr', 'nullDebugAction', 'nums', 'oneOf', 'opAssoc', 'operatorPrecedence', 'printables', -'punc8bit', 'pythonStyleComment', 'quotedString', 'removeQuotes', 'replaceHTMLEntity', -'replaceWith', 'restOfLine', 'sglQuotedString', 'srange', 'stringEnd', -'stringStart', 'traceParseAction', 'unicodeString', 'upcaseTokens', 'withAttribute', -'indentedBlock', 'originalTextFor', + "And", + "CaselessKeyword", + "CaselessLiteral", + "CharsNotIn", + "Combine", + "Dict", + "Each", + "Empty", + "FollowedBy", + "Forward", + "GoToColumn", + "Group", + "Keyword", + "LineEnd", + "LineStart", + "Literal", + "MatchFirst", + "NoMatch", + "NotAny", + "OneOrMore", + "OnlyOnce", + "Optional", + "Or", + "ParseBaseException", + "ParseElementEnhance", + "ParseException", + "ParseExpression", + "ParseFatalException", + "ParseResults", + "ParseSyntaxException", + "ParserElement", + "QuotedString", + "RecursiveGrammarException", + "Regex", + "SkipTo", + "StringEnd", + "StringStart", + "Suppress", + "Token", + "TokenConverter", + "Upcase", + "White", + "Word", + "WordEnd", + "WordStart", + "ZeroOrMore", + "alphanums", + "alphas", + "alphas8bit", + "anyCloseTag", + "anyOpenTag", + "cStyleComment", + "col", + "commaSeparatedList", + "commonHTMLEntity", + "countedArray", + "cppStyleComment", + "dblQuotedString", + "dblSlashComment", + "delimitedList", + "dictOf", + "downcaseTokens", + "empty", + "getTokensEndLoc", + "hexnums", + "htmlComment", + "javaStyleComment", + "keepOriginalText", + "line", + "lineEnd", + "lineStart", + "lineno", + "makeHTMLTags", + "makeXMLTags", + "matchOnlyAtCol", + "matchPreviousExpr", + "matchPreviousLiteral", + "nestedExpr", + "nullDebugAction", + "nums", + "oneOf", + "opAssoc", + "operatorPrecedence", + "printables", + "punc8bit", + "pythonStyleComment", + "quotedString", + "removeQuotes", + "replaceHTMLEntity", + "replaceWith", + "restOfLine", + "sglQuotedString", + 
"srange", + "stringEnd", + "stringStart", + "traceParseAction", + "unicodeString", + "upcaseTokens", + "withAttribute", + "indentedBlock", + "originalTextFor", ] @@ -107,12 +191,13 @@ class names, and the use of '+', '|' and '^' operators. _MAX_INT = sys.maxint if not _PY3K: + def _ustr(obj): """Drop-in replacement for str(obj) that tries to be Unicode friendly. It first tries - str(obj). If that fails with a UnicodeEncodeError, then it tries unicode(obj). It - then < returns the unicode object | encodes it with the default encoding | ... >. + str(obj). If that fails with a UnicodeEncodeError, then it tries unicode(obj). It + then < returns the unicode object | encodes it with the default encoding | ... >. """ - if isinstance(obj,unicode): + if isinstance(obj, unicode): return obj try: @@ -129,48 +214,54 @@ def _ustr(obj): return unicode(obj) # Else encode it... but how? There are many choices... :) # Replace unprintables with escape codes? - #return unicode(obj).encode(sys.getdefaultencoding(), 'backslashreplace_errors') + # return unicode(obj).encode(sys.getdefaultencoding(), 'backslashreplace_errors') # Replace unprintables with question marks? - #return unicode(obj).encode(sys.getdefaultencoding(), 'replace') + # return unicode(obj).encode(sys.getdefaultencoding(), 'replace') # ... else: _ustr = str unichr = chr if not _PY3K: + def _str2dict(strg): - return dict( [(c,0) for c in strg] ) + return dict([(c, 0) for c in strg]) else: _str2dict = set + def _xml_escape(data): """Escape &, <, >, ", ', etc. in a string of data.""" # ampersand must be replaced first - from_symbols = '&><"\'' - to_symbols = ['&'+s+';' for s in "amp gt lt quot apos".split()] - for from_,to_ in zip(from_symbols, to_symbols): + from_symbols = "&><\"'" + to_symbols = ["&" + s + ";" for s in "amp gt lt quot apos".split()] + for from_, to_ in zip(from_symbols, to_symbols): data = data.replace(from_, to_) return data + class _Constants(object): pass + if not _PY3K: - alphas = string.lowercase + string.uppercase + alphas = string.lowercase + string.uppercase else: - alphas = string.ascii_lowercase + string.ascii_uppercase -nums = string.digits -hexnums = nums + "ABCDEFabcdef" -alphanums = alphas + nums + alphas = string.ascii_lowercase + string.ascii_uppercase +nums = string.digits +hexnums = nums + "ABCDEFabcdef" +alphanums = alphas + nums _bslash = chr(92) -printables = "".join( [ c for c in string.printable if c not in string.whitespace ] ) +printables = "".join([c for c in string.printable if c not in string.whitespace]) + class ParseBaseException(Exception): """base exception class for all parsing runtime exceptions""" + # Performance tuning: we construct a *lot* of these, so keep this # constructor as small and fast as possible - def __init__( self, pstr, loc=0, msg=None, elem=None ): + def __init__(self, pstr, loc=0, msg=None, elem=None): self.loc = loc if msg is None: self.msg = pstr @@ -180,101 +271,136 @@ def __init__( self, pstr, loc=0, msg=None, elem=None ): self.pstr = pstr self.parserElement = elem - def __getattr__( self, aname ): + def __getattr__(self, aname): """supported attributes by name are: - - lineno - returns the line number of the exception text - - col - returns the column number of the exception text - - line - returns the line containing the exception text + - lineno - returns the line number of the exception text + - col - returns the column number of the exception text + - line - returns the line containing the exception text """ - if( aname == "lineno" ): - return lineno( self.loc, 
self.pstr ) - elif( aname in ("col", "column") ): - return col( self.loc, self.pstr ) - elif( aname == "line" ): - return line( self.loc, self.pstr ) + if aname == "lineno": + return lineno(self.loc, self.pstr) + elif aname in ("col", "column"): + return col(self.loc, self.pstr) + elif aname == "line": + return line(self.loc, self.pstr) else: raise AttributeError(aname) - def __str__( self ): - return "%s (at char %d), (line:%d, col:%d)" % \ - ( self.msg, self.loc, self.lineno, self.column ) - def __repr__( self ): + def __str__(self): + return "%s (at char %d), (line:%d, col:%d)" % ( + self.msg, + self.loc, + self.lineno, + self.column, + ) + + def __repr__(self): return _ustr(self) - def markInputline( self, markerString = ">!<" ): + + def markInputline(self, markerString=">!<"): """Extracts the exception line from the input string, and marks - the location of the exception with a special symbol. + the location of the exception with a special symbol. """ line_str = self.line line_column = self.column - 1 if markerString: - line_str = "".join( [line_str[:line_column], - markerString, line_str[line_column:]]) + line_str = "".join( + [line_str[:line_column], markerString, line_str[line_column:]] + ) return line_str.strip() + def __dir__(self): - return "loc msg pstr parserElement lineno col line " \ - "markInputLine __str__ __repr__".split() + return ( + "loc msg pstr parserElement lineno col line " + "markInputLine __str__ __repr__".split() + ) + class ParseException(ParseBaseException): """exception thrown when parse expressions don't match class; - supported attributes by name are: - - lineno - returns the line number of the exception text - - col - returns the column number of the exception text - - line - returns the line containing the exception text + supported attributes by name are: + - lineno - returns the line number of the exception text + - col - returns the column number of the exception text + - line - returns the line containing the exception text """ + pass + class ParseFatalException(ParseBaseException): """user-throwable exception thrown when inconsistent parse content - is found; stops all parsing immediately""" + is found; stops all parsing immediately""" + pass + class ParseSyntaxException(ParseFatalException): """just like ParseFatalException, but thrown internally when an - ErrorStop indicates that parsing is to stop immediately because - an unbacktrackable syntax error has been found""" + ErrorStop indicates that parsing is to stop immediately because + an unbacktrackable syntax error has been found""" + def __init__(self, pe): super(ParseSyntaxException, self).__init__( - pe.pstr, pe.loc, pe.msg, pe.parserElement) - -#~ class ReparseException(ParseBaseException): - #~ """Experimental class - parse actions can raise this exception to cause - #~ pyparsing to reparse the input string: - #~ - with a modified input string, and/or - #~ - with a modified start location - #~ Set the values of the ReparseException in the constructor, and raise the - #~ exception in a parse action to cause pyparsing to use the new string/location. - #~ Setting the values as None causes no change to be made. 
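For reference, ParseBaseException above synthesizes the lineno, col and line attributes on demand through __getattr__, and markInputline renders the failure point for display. A small usage sketch against the vendored module this patch touches:

    from whoosh.support.pyparsing import ParseException, Word, nums

    try:
        Word(nums).parseString("abc")
    except ParseException as err:
        # lineno/col/line are computed lazily by __getattr__ above
        print(err.lineno, err.col)   # -> 1 1
        print(err.markInputline())   # -> >!<abc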
- #~ """ - #~ def __init_( self, newstring, restartLoc ): - #~ self.newParseText = newstring - #~ self.reparseLoc = restartLoc + pe.pstr, pe.loc, pe.msg, pe.parserElement + ) + + +# ~ class ReparseException(ParseBaseException): +# ~ """Experimental class - parse actions can raise this exception to cause +# ~ pyparsing to reparse the input string: +# ~ - with a modified input string, and/or +# ~ - with a modified start location +# ~ Set the values of the ReparseException in the constructor, and raise the +# ~ exception in a parse action to cause pyparsing to use the new string/location. +# ~ Setting the values as None causes no change to be made. +# ~ """ +# ~ def __init_( self, newstring, restartLoc ): +# ~ self.newParseText = newstring +# ~ self.reparseLoc = restartLoc + class RecursiveGrammarException(Exception): """exception thrown by validate() if the grammar could be improperly recursive""" - def __init__( self, parseElementList ): + + def __init__(self, parseElementList): self.parseElementTrace = parseElementList - def __str__( self ): + def __str__(self): return f"RecursiveGrammarException: {self.parseElementTrace}" + class _ParseResultsWithOffset(object): - def __init__(self,p1,p2): - self.tup = (p1,p2) - def __getitem__(self,i): + def __init__(self, p1, p2): + self.tup = (p1, p2) + + def __getitem__(self, i): return self.tup[i] + def __repr__(self): return repr(self.tup) - def setOffset(self,i): - self.tup = (self.tup[0],i) + + def setOffset(self, i): + self.tup = (self.tup[0], i) + class ParseResults(object): """Structured parse results, to provide multiple means of access to the parsed data: - - as a list (len(results)) - - by list index (results[0], results[1], etc.) - - by attribute (results.) - """ - __slots__ = ( "__toklist", "__tokdict", "__doinit", "__name", "__parent", "__accumNames", "__weakref__" ) - def __new__(cls, toklist, name=None, asList=True, modal=True ): + - as a list (len(results)) + - by list index (results[0], results[1], etc.) + - by attribute (results.) 
+ """ + + __slots__ = ( + "__toklist", + "__tokdict", + "__doinit", + "__name", + "__parent", + "__accumNames", + "__weakref__", + ) + + def __new__(cls, toklist, name=None, asList=True, modal=True): if isinstance(toklist, cls): return toklist retobj = object.__new__(cls) @@ -283,7 +409,7 @@ def __new__(cls, toklist, name=None, asList=True, modal=True ): # Performance tuning: we construct a *lot* of these, so keep this # constructor as small and fast as possible - def __init__( self, toklist, name=None, asList=True, modal=True ): + def __init__(self, toklist, name=None, asList=True, modal=True): if self.__doinit: self.__doinit = False self.__name = None @@ -298,56 +424,62 @@ def __init__( self, toklist, name=None, asList=True, modal=True ): if name: if not modal: self.__accumNames[name] = 0 - if isinstance(name,int): - name = _ustr(name) # will always return a str, but use _ustr for consistency + if isinstance(name, int): + name = _ustr( + name + ) # will always return a str, but use _ustr for consistency self.__name = name - if not toklist in (None,'',[]): - if isinstance(toklist,basestring): - toklist = [ toklist ] + if not toklist in (None, "", []): + if isinstance(toklist, basestring): + toklist = [toklist] if asList: - if isinstance(toklist,ParseResults): - self[name] = _ParseResultsWithOffset(toklist.copy(),0) + if isinstance(toklist, ParseResults): + self[name] = _ParseResultsWithOffset(toklist.copy(), 0) else: - self[name] = _ParseResultsWithOffset(ParseResults(toklist[0]),0) + self[name] = _ParseResultsWithOffset( + ParseResults(toklist[0]), 0 + ) self[name].__name = name else: try: self[name] = toklist[0] - except (KeyError,TypeError,IndexError): + except (KeyError, TypeError, IndexError): self[name] = toklist - def __getitem__( self, i ): - if isinstance( i, (int,slice) ): + def __getitem__(self, i): + if isinstance(i, (int, slice)): return self.__toklist[i] else: if i not in self.__accumNames: return self.__tokdict[i][-1][0] else: - return ParseResults([ v[0] for v in self.__tokdict[i] ]) + return ParseResults([v[0] for v in self.__tokdict[i]]) - def __setitem__( self, k, v ): - if isinstance(v,_ParseResultsWithOffset): - self.__tokdict[k] = self.__tokdict.get(k,list()) + [v] + def __setitem__(self, k, v): + if isinstance(v, _ParseResultsWithOffset): + self.__tokdict[k] = self.__tokdict.get(k, list()) + [v] sub = v[0] - elif isinstance(k,int): + elif isinstance(k, int): self.__toklist[k] = v sub = v else: - self.__tokdict[k] = self.__tokdict.get(k,list()) + [_ParseResultsWithOffset(v,0)] + self.__tokdict[k] = self.__tokdict.get(k, list()) + [ + _ParseResultsWithOffset(v, 0) + ] sub = v - if isinstance(sub,ParseResults): + if isinstance(sub, ParseResults): sub.__parent = wkref(self) - def __delitem__( self, i ): - if isinstance(i,(int,slice)): - mylen = len( self.__toklist ) + def __delitem__(self, i): + if isinstance(i, (int, slice)): + mylen = len(self.__toklist) del self.__toklist[i] # convert int to slice if isinstance(i, int): if i < 0: i += mylen - i = slice(i, i+1) + i = slice(i, i + 1) # get removed indices removed = list(range(*i.indices(mylen))) removed.reverse() @@ -356,91 +488,107 @@ def __delitem__( self, i ): occurrences = self.__tokdict[name] for j in removed: for k, (value, position) in enumerate(occurrences): - occurrences[k] = _ParseResultsWithOffset(value, position - (position > j)) + occurrences[k] = _ParseResultsWithOffset( + value, position - (position > j) + ) else: del self.__tokdict[i] - def __contains__( self, k ): + def __contains__(self, k): return k 
in self.__tokdict - def __len__( self ): return len( self.__toklist ) - def __bool__(self): return len( self.__toklist ) > 0 + def __len__(self): + return len(self.__toklist) + + def __bool__(self): + return len(self.__toklist) > 0 + __nonzero__ = __bool__ - def __iter__( self ): return iter( self.__toklist ) - def __reversed__( self ): return iter( reversed(self.__toklist) ) - def keys( self ): + + def __iter__(self): + return iter(self.__toklist) + + def __reversed__(self): + return iter(reversed(self.__toklist)) + + def keys(self): """Returns all named result keys.""" return self.__tokdict.keys() - def pop( self, index=-1 ): + def pop(self, index=-1): """Removes and returns item at specified index (default=last). - Will work with either numeric indices or dict-key indicies.""" + Will work with either numeric indices or dict-key indicies.""" ret = self[index] del self[index] return ret def get(self, key, defaultValue=None): """Returns named result matching the given key, or if there is no - such name, then returns the given defaultValue or None if no - defaultValue is specified.""" + such name, then returns the given defaultValue or None if no + defaultValue is specified.""" if key in self: return self[key] else: return defaultValue - def insert( self, index, insStr ): + def insert(self, index, insStr): self.__toklist.insert(index, insStr) # fixup indices in token dictionary for name in self.__tokdict: occurrences = self.__tokdict[name] for k, (value, position) in enumerate(occurrences): - occurrences[k] = _ParseResultsWithOffset(value, position + (position > index)) + occurrences[k] = _ParseResultsWithOffset( + value, position + (position > index) + ) - def items( self ): + def items(self): """Returns all named result keys and values as a list of tuples.""" - return [(k,self[k]) for k in self.__tokdict] + return [(k, self[k]) for k in self.__tokdict] - def values( self ): + def values(self): """Returns all named result values.""" - return [ v[-1][0] for v in self.__tokdict.values() ] + return [v[-1][0] for v in self.__tokdict.values()] - def __getattr__( self, name ): + def __getattr__(self, name): if name not in self.__slots__: if name in self.__tokdict: if name not in self.__accumNames: return self.__tokdict[name][-1][0] else: - return ParseResults([ v[0] for v in self.__tokdict[name] ]) + return ParseResults([v[0] for v in self.__tokdict[name]]) else: return "" return None - def __add__( self, other ): + def __add__(self, other): ret = self.copy() ret += other return ret - def __iadd__( self, other ): + def __iadd__(self, other): if other.__tokdict: offset = len(self.__toklist) - addoffset = ( lambda a: (a<0 and offset) or (a+offset) ) + addoffset = lambda a: (a < 0 and offset) or (a + offset) otheritems = other.__tokdict.items() - otherdictitems = [(k, _ParseResultsWithOffset(v[0],addoffset(v[1])) ) - for (k,vlist) in otheritems for v in vlist] - for k,v in otherdictitems: + otherdictitems = [ + (k, _ParseResultsWithOffset(v[0], addoffset(v[1]))) + for (k, vlist) in otheritems + for v in vlist + ] + for k, v in otherdictitems: self[k] = v - if isinstance(v[0],ParseResults): + if isinstance(v[0], ParseResults): v[0].__parent = wkref(self) - + self.__toklist += other.__toklist - self.__accumNames.update( other.__accumNames ) + self.__accumNames.update(other.__accumNames) del other return self - def __repr__( self ): + def __repr__(self): return f"({repr(self.__toklist)}, {repr(self.__tokdict)})" - def __str__( self ): + def __str__(self): out = "[" sep = "" for i in self.__toklist: @@ 
-452,46 +600,47 @@ def __str__( self ): out += "]" return out - def _asStringList( self, sep='' ): + def _asStringList(self, sep=""): out = [] for item in self.__toklist: if out and sep: out.append(sep) - if isinstance( item, ParseResults ): + if isinstance(item, ParseResults): out += item._asStringList() else: - out.append( _ustr(item) ) + out.append(_ustr(item)) return out - def asList( self ): + def asList(self): """Returns the parse results as a nested list of matching tokens, all converted to strings.""" out = [] for res in self.__toklist: - if isinstance(res,ParseResults): - out.append( res.asList() ) + if isinstance(res, ParseResults): + out.append(res.asList()) else: - out.append( res ) + out.append(res) return out - def asDict( self ): + def asDict(self): """Returns the named parse results as dictionary.""" - return dict( self.items() ) + return dict(self.items()) - def copy( self ): + def copy(self): """Returns a new copy of a ParseResults object.""" - ret = ParseResults( self.__toklist ) + ret = ParseResults(self.__toklist) ret.__tokdict = self.__tokdict.copy() ret.__parent = self.__parent - ret.__accumNames.update( self.__accumNames ) + ret.__accumNames.update(self.__accumNames) ret.__name = self.__name return ret - def asXML( self, doctag=None, namedItemsOnly=False, indent="", formatted=True ): + def asXML(self, doctag=None, namedItemsOnly=False, indent="", formatted=True): """Returns the parse results as XML. Tags are created for tokens and lists that have defined results names.""" nl = "\n" out = [] - namedItems = dict( [ (v[1],k) for (k,vlist) in self.__tokdict.items() - for v in vlist ] ) + namedItems = dict( + [(v[1], k) for (k, vlist) in self.__tokdict.items() for v in vlist] + ) nextLevelIndent = indent + " " # collapse out indents if formatting is not desired @@ -513,21 +662,29 @@ def asXML( self, doctag=None, namedItemsOnly=False, indent="", formatted=True ): else: selfTag = "ITEM" - out += [ nl, indent, "<", selfTag, ">" ] + out += [nl, indent, "<", selfTag, ">"] worklist = self.__toklist - for i,res in enumerate(worklist): - if isinstance(res,ParseResults): + for i, res in enumerate(worklist): + if isinstance(res, ParseResults): if i in namedItems: - out += [ res.asXML(namedItems[i], - namedItemsOnly and doctag is None, - nextLevelIndent, - formatted)] + out += [ + res.asXML( + namedItems[i], + namedItemsOnly and doctag is None, + nextLevelIndent, + formatted, + ) + ] else: - out += [ res.asXML(None, - namedItemsOnly and doctag is None, - nextLevelIndent, - formatted)] + out += [ + res.asXML( + None, + namedItemsOnly and doctag is None, + nextLevelIndent, + formatted, + ) + ] else: # individual token, see if there is a name for it resTag = None @@ -539,16 +696,24 @@ def asXML( self, doctag=None, namedItemsOnly=False, indent="", formatted=True ): else: resTag = "ITEM" xmlBodyText = _xml_escape(_ustr(res)) - out += [ nl, nextLevelIndent, "<", resTag, ">", - xmlBodyText, - "" ] - - out += [ nl, indent, "" ] + out += [ + nl, + nextLevelIndent, + "<", + resTag, + ">", + xmlBodyText, + "", + ] + + out += [nl, indent, ""] return "".join(out) - def __lookup(self,sub): - for k,vlist in self.__tokdict.items(): - for v,loc in vlist: + def __lookup(self, sub): + for k, vlist in self.__tokdict.items(): + for v, loc in vlist: if sub is v: return k return None @@ -563,51 +728,54 @@ def getName(self): return par.__lookup(self) else: return None - elif (len(self) == 1 and - len(self.__tokdict) == 1 and - self.__tokdict.values()[0][0][1] in (0,-1)): + elif ( + len(self) == 1 + and 
len(self.__tokdict) == 1 + and self.__tokdict.values()[0][0][1] in (0, -1) + ): return self.__tokdict.keys()[0] else: return None - def dump(self,indent='',depth=0): + def dump(self, indent="", depth=0): """Diagnostic method for listing out the contents of a ParseResults. - Accepts an optional indent argument so that this string can be embedded - in a nested display of other data.""" + Accepts an optional indent argument so that this string can be embedded + in a nested display of other data.""" out = [] - out.append( indent+_ustr(self.asList()) ) + out.append(indent + _ustr(self.asList())) keys = self.items() keys.sort() - for k,v in keys: + for k, v in keys: if out: - out.append('\n') - out.append( f"{indent}{' ' * depth}- {k}: " ) - if isinstance(v,ParseResults): + out.append("\n") + out.append(f"{indent}{' ' * depth}- {k}: ") + if isinstance(v, ParseResults): if v.keys(): - #~ out.append('\n') - out.append( v.dump(indent,depth+1) ) - #~ out.append('\n') + # ~ out.append('\n') + out.append(v.dump(indent, depth + 1)) + # ~ out.append('\n') else: out.append(_ustr(v)) else: out.append(_ustr(v)) - #~ out.append('\n') + # ~ out.append('\n') return "".join(out) # add support for pickle protocol def __getstate__(self): - return ( self.__toklist, - ( self.__tokdict.copy(), - self.__parent is not None and self.__parent() or None, - self.__accumNames, - self.__name ) ) - - def __setstate__(self,state): + return ( + self.__toklist, + ( + self.__tokdict.copy(), + self.__parent is not None and self.__parent() or None, + self.__accumNames, + self.__name, + ), + ) + + def __setstate__(self, state): self.__toklist = state[0] - self.__tokdict, \ - par, \ - inAccumNames, \ - self.__name = state[1] + self.__tokdict, par, inAccumNames, self.__name = state[1] self.__accumNames = {} self.__accumNames.update(inAccumNames) if par is not None: @@ -616,144 +784,163 @@ def __setstate__(self,state): self.__parent = None def __dir__(self): - return dir(super(ParseResults,self)) + self.keys() + return dir(super(ParseResults, self)) + self.keys() -def col (loc,strg): + +def col(loc, strg): """Returns current column within a string, counting newlines as line separators. - The first column is number 1. + The first column is number 1. - Note: the default parsing behavior is to expand tabs in the input string - before starting the parsing process. See L{I{ParserElement.parseString}} for more information - on parsing strings containing s, and suggested methods to maintain a - consistent view of the parsed string, the parse location, and line and column - positions within the parsed string. - """ - return (loc} for more information + on parsing strings containing s, and suggested methods to maintain a + consistent view of the parsed string, the parse location, and line and column + positions within the parsed string. + """ + return ( + (loc < len(strg) and strg[loc] == "\n") and 1 or loc - strg.rfind("\n", 0, loc) + ) -def lineno(loc,strg): + +def lineno(loc, strg): """Returns current line number within a string, counting newlines as line separators. - The first line is number 1. - - Note: the default parsing behavior is to expand tabs in the input string - before starting the parsing process. See L{I{ParserElement.parseString}} for more information - on parsing strings containing s, and suggested methods to maintain a - consistent view of the parsed string, the parse location, and line and column - positions within the parsed string. 
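The ParseResults hunks above (through __getstate__/__setstate__) give three views of the same match: positional, key-based and attribute-based, plus the asList()/asDict() exports. A quick demonstration:

    from whoosh.support.pyparsing import Word, alphas, nums

    name = Word(alphas).setResultsName("name")
    num = Word(nums).setResultsName("num")
    res = (name + num).parseString("whoosh 42")
    print(res[0], res["num"], res.name)   # -> whoosh 42 whoosh
    print(res.asList())                   # -> ['whoosh', '42']
    print(res.asDict())                   # -> {'name': 'whoosh', 'num': '42'}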
- """ - return strg.count("\n",0,loc) + 1 - -def line( loc, strg ): - """Returns the line of text containing loc within a string, counting newlines as line separators. - """ + The first line is number 1. + + Note: the default parsing behavior is to expand tabs in the input string + before starting the parsing process. See L{I{ParserElement.parseString}} for more information + on parsing strings containing s, and suggested methods to maintain a + consistent view of the parsed string, the parse location, and line and column + positions within the parsed string. + """ + return strg.count("\n", 0, loc) + 1 + + +def line(loc, strg): + """Returns the line of text containing loc within a string, counting newlines as line separators.""" lastCR = strg.rfind("\n", 0, loc) nextCR = strg.find("\n", loc) if nextCR > 0: - return strg[lastCR+1:nextCR] + return strg[lastCR + 1 : nextCR] else: - return strg[lastCR+1:] + return strg[lastCR + 1 :] + + +def _defaultStartDebugAction(instring, loc, expr): + print( + "Match " + + _ustr(expr) + + " at loc " + + _ustr(loc) + + "(%d,%d)" % (lineno(loc, instring), col(loc, instring)) + ) + -def _defaultStartDebugAction( instring, loc, expr ): - print ("Match " + _ustr(expr) + " at loc " + _ustr(loc) + "(%d,%d)" % ( lineno(loc,instring), col(loc,instring) )) +def _defaultSuccessDebugAction(instring, startloc, endloc, expr, toks): + print("Matched " + _ustr(expr) + " -> " + str(toks.asList())) -def _defaultSuccessDebugAction( instring, startloc, endloc, expr, toks ): - print ("Matched " + _ustr(expr) + " -> " + str(toks.asList())) -def _defaultExceptionDebugAction( instring, loc, expr, exc ): - print ("Exception raised:" + _ustr(exc)) +def _defaultExceptionDebugAction(instring, loc, expr, exc): + print("Exception raised:" + _ustr(exc)) + def nullDebugAction(*args): """'Do-nothing' debug action, to suppress debugging output during parsing.""" pass + class ParserElement(object): """Abstract base level parser element class.""" + DEFAULT_WHITE_CHARS = " \n\t\r" - def setDefaultWhitespaceChars( chars ): - """Overrides the default whitespace chars - """ + def setDefaultWhitespaceChars(chars): + """Overrides the default whitespace chars""" ParserElement.DEFAULT_WHITE_CHARS = chars + setDefaultWhitespaceChars = staticmethod(setDefaultWhitespaceChars) - def __init__( self, savelist=False ): + def __init__(self, savelist=False): self.parseAction = list() self.failAction = None - #~ self.name = "" # don't define self.name, let subclasses try/except upcall + # ~ self.name = "" # don't define self.name, let subclasses try/except upcall self.strRepr = None self.resultsName = None self.saveAsList = savelist self.skipWhitespace = True self.whiteChars = ParserElement.DEFAULT_WHITE_CHARS self.copyDefaultWhiteChars = True - self.mayReturnEmpty = False # used when checking for left-recursion + self.mayReturnEmpty = False # used when checking for left-recursion self.keepTabs = False self.ignoreExprs = list() self.debug = False self.streamlined = False - self.mayIndexError = True # used to optimize exception handling for subclasses that don't advance parse index + self.mayIndexError = True # used to optimize exception handling for subclasses that don't advance parse index self.errmsg = "" - self.modalResults = True # used to mark results names as modal (report only last) or cumulative (list all) - self.debugActions = ( None, None, None ) #custom debug actions + self.modalResults = True # used to mark results names as modal (report only last) or cumulative (list all) + self.debugActions = 
(None, None, None) # custom debug actions self.re = None - self.callPreparse = True # used to avoid redundant calls to preParse + self.callPreparse = True # used to avoid redundant calls to preParse self.callDuringTry = False - def copy( self ): + def copy(self): """Make a copy of this ParserElement. Useful for defining different parse actions - for the same parsing pattern, using copies of the original parse element.""" - cpy = copy.copy( self ) + for the same parsing pattern, using copies of the original parse element.""" + cpy = copy.copy(self) cpy.parseAction = self.parseAction[:] cpy.ignoreExprs = self.ignoreExprs[:] if self.copyDefaultWhiteChars: cpy.whiteChars = ParserElement.DEFAULT_WHITE_CHARS return cpy - def setName( self, name ): + def setName(self, name): """Define name for this expression, for use in debugging.""" self.name = name self.errmsg = "Expected " + self.name - if hasattr(self,"exception"): + if hasattr(self, "exception"): self.exception.msg = self.errmsg return self - def setResultsName( self, name, listAllMatches=False ): + def setResultsName(self, name, listAllMatches=False): """Define name for referencing matching tokens as a nested attribute - of the returned parse results. - NOTE: this returns a *copy* of the original ParserElement object; - this is so that the client can define a basic element, such as an - integer, and reference it in multiple places with different names. + of the returned parse results. + NOTE: this returns a *copy* of the original ParserElement object; + this is so that the client can define a basic element, such as an + integer, and reference it in multiple places with different names. """ newself = self.copy() newself.resultsName = name newself.modalResults = not listAllMatches return newself - def setBreak(self,breakFlag = True): + def setBreak(self, breakFlag=True): """Method to invoke the Python pdb debugger when this element is - about to be parsed. Set breakFlag to True to enable, False to - disable. + about to be parsed. Set breakFlag to True to enable, False to + disable. 
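As the setResultsName docstring above stresses, the method returns a copy of the element, so one base expression can carry different names at different places in a grammar:

    from whoosh.support.pyparsing import Word, nums

    integer = Word(nums)
    date = (integer.setResultsName("year") + "-"
            + integer.setResultsName("month"))   # two copies, two names
    res = date.parseString("2009-02")
    print(res.year, res.month)   # -> 2009 02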
""" if breakFlag: _parseMethod = self._parse + def breaker(instring, loc, doActions=True, callPreParse=True): import pdb + pdb.set_trace() - return _parseMethod( instring, loc, doActions, callPreParse ) + return _parseMethod(instring, loc, doActions, callPreParse) + breaker._originalParseMethod = _parseMethod self._parse = breaker else: - if hasattr(self._parse,"_originalParseMethod"): + if hasattr(self._parse, "_originalParseMethod"): self._parse = self._parse._originalParseMethod return self - def _normalizeParseActionArgs( f ): + def _normalizeParseActionArgs(f): """Internal method used to decorate parse actions that take fewer than 3 arguments, - so that all parse actions can be called as f(s,l,t).""" + so that all parse actions can be called as f(s,l,t).""" STAR_ARGS = 4 try: restore = None - if isinstance(f,type): + if isinstance(f, type): restore = f f = f.__init__ if not _PY3K: @@ -764,10 +951,10 @@ def _normalizeParseActionArgs( f ): return f numargs = codeObj.co_argcount if not _PY3K: - if hasattr(f,"im_self"): + if hasattr(f, "im_self"): numargs -= 1 else: - if hasattr(f,"__self__"): + if hasattr(f, "__self__"): numargs -= 1 if restore: f = restore @@ -784,10 +971,10 @@ def _normalizeParseActionArgs( f ): return f numargs = call_im_func_code.co_argcount if not _PY3K: - if hasattr(f.__call__,"im_self"): + if hasattr(f.__call__, "im_self"): numargs -= 1 else: - if hasattr(f.__call__,"__self__"): + if hasattr(f.__call__, "__self__"): numargs -= 0 except AttributeError: if not _PY3K: @@ -799,103 +986,111 @@ def _normalizeParseActionArgs( f ): return f numargs = call_func_code.co_argcount if not _PY3K: - if hasattr(f.__call__,"im_self"): + if hasattr(f.__call__, "im_self"): numargs -= 1 else: - if hasattr(f.__call__,"__self__"): + if hasattr(f.__call__, "__self__"): numargs -= 1 - - #~ print ("adding function %s with %d args" % (f.func_name,numargs)) + # ~ print ("adding function %s with %d args" % (f.func_name,numargs)) if numargs == 3: return f else: if numargs > 3: - def tmp(s,l,t): - return f(f.__call__.__self__, s,l,t) + + def tmp(s, l, t): + return f(f.__call__.__self__, s, l, t) + if numargs == 2: - def tmp(s,l,t): - return f(l,t) + + def tmp(s, l, t): + return f(l, t) elif numargs == 1: - def tmp(s,l,t): + + def tmp(s, l, t): return f(t) - else: #~ numargs == 0: - def tmp(s,l,t): + else: # ~ numargs == 0: + + def tmp(s, l, t): return f() + try: tmp.__name__ = f.__name__ - except (AttributeError,TypeError): + except (AttributeError, TypeError): # no need for special handling if attribute doesnt exist pass try: tmp.__doc__ = f.__doc__ - except (AttributeError,TypeError): + except (AttributeError, TypeError): # no need for special handling if attribute doesnt exist pass try: tmp.__dict__.update(f.__dict__) - except (AttributeError,TypeError): + except (AttributeError, TypeError): # no need for special handling if attribute doesnt exist pass return tmp + _normalizeParseActionArgs = staticmethod(_normalizeParseActionArgs) - def setParseAction( self, *fns, **kwargs ): + def setParseAction(self, *fns, **kwargs): """Define action to perform when successfully matching parse element definition. 
- Parse action fn is a callable method with 0-3 arguments, called as fn(s,loc,toks), - fn(loc,toks), fn(toks), or just fn(), where: - - s = the original string being parsed (see note below) - - loc = the location of the matching substring - - toks = a list of the matched tokens, packaged as a ParseResults object - If the functions in fns modify the tokens, they can return them as the return - value from fn, and the modified list of tokens will replace the original. - Otherwise, fn does not need to return any value. - - Note: the default parsing behavior is to expand tabs in the input string - before starting the parsing process. See L{I{parseString}} for more information - on parsing strings containing s, and suggested methods to maintain a - consistent view of the parsed string, the parse location, and line and column - positions within the parsed string. - """ + Parse action fn is a callable method with 0-3 arguments, called as fn(s,loc,toks), + fn(loc,toks), fn(toks), or just fn(), where: + - s = the original string being parsed (see note below) + - loc = the location of the matching substring + - toks = a list of the matched tokens, packaged as a ParseResults object + If the functions in fns modify the tokens, they can return them as the return + value from fn, and the modified list of tokens will replace the original. + Otherwise, fn does not need to return any value. + + Note: the default parsing behavior is to expand tabs in the input string + before starting the parsing process. See L{I{parseString}} for more information + on parsing strings containing s, and suggested methods to maintain a + consistent view of the parsed string, the parse location, and line and column + positions within the parsed string. + """ self.parseAction = list(map(self._normalizeParseActionArgs, list(fns))) - self.callDuringTry = ("callDuringTry" in kwargs and kwargs["callDuringTry"]) + self.callDuringTry = "callDuringTry" in kwargs and kwargs["callDuringTry"] return self - def addParseAction( self, *fns, **kwargs ): + def addParseAction(self, *fns, **kwargs): """Add parse action to expression's list of parse actions. See L{I{setParseAction}}.""" self.parseAction += list(map(self._normalizeParseActionArgs, list(fns))) - self.callDuringTry = self.callDuringTry or ("callDuringTry" in kwargs and kwargs["callDuringTry"]) + self.callDuringTry = self.callDuringTry or ( + "callDuringTry" in kwargs and kwargs["callDuringTry"] + ) return self - def setFailAction( self, fn ): + def setFailAction(self, fn): """Define action to perform if parsing fails at this expression. - Fail acton fn is a callable function that takes the arguments - fn(s,loc,expr,err) where: - - s = string being parsed - - loc = location where expression match was attempted and failed - - expr = the parse expression that failed - - err = the exception thrown - The function returns no value. It may throw ParseFatalException - if it is desired to stop parsing immediately.""" + Fail acton fn is a callable function that takes the arguments + fn(s,loc,expr,err) where: + - s = string being parsed + - loc = location where expression match was attempted and failed + - expr = the parse expression that failed + - err = the exception thrown + The function returns no value. 
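setParseAction above accepts callables of zero to three arguments (the _normalizeParseActionArgs shim adapts the arity to the full fn(s,l,t) form), and any non-None return value replaces the matched tokens:

    from whoosh.support.pyparsing import Word, nums

    integer = Word(nums)
    integer.setParseAction(lambda toks: int(toks[0]))   # the fn(toks) form
    print(integer.parseString("42")[0] + 1)             # -> 43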
It may throw ParseFatalException + if it is desired to stop parsing immediately.""" self.failAction = fn return self - def _skipIgnorables( self, instring, loc ): + def _skipIgnorables(self, instring, loc): exprsFound = True while exprsFound: exprsFound = False for e in self.ignoreExprs: try: while 1: - loc,dummy = e._parse( instring, loc ) + loc, dummy = e._parse(instring, loc) exprsFound = True except ParseException: pass return loc - def preParse( self, instring, loc ): + def preParse(self, instring, loc): if self.ignoreExprs: - loc = self._skipIgnorables( instring, loc ) + loc = self._skipIgnorables(instring, loc) if self.skipWhitespace: wt = self.whiteChars @@ -905,188 +1100,200 @@ def preParse( self, instring, loc ): return loc - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): return loc, [] - def postParse( self, instring, loc, tokenlist ): + def postParse(self, instring, loc, tokenlist): return tokenlist - #~ @profile - def _parseNoCache( self, instring, loc, doActions=True, callPreParse=True ): - debugging = ( self.debug ) #and doActions ) + # ~ @profile + def _parseNoCache(self, instring, loc, doActions=True, callPreParse=True): + debugging = self.debug # and doActions ) if debugging or self.failAction: - #~ print ("Match",self,"at loc",loc,"(%d,%d)" % ( lineno(loc,instring), col(loc,instring) )) - if (self.debugActions[0] ): - self.debugActions[0]( instring, loc, self ) + # ~ print ("Match",self,"at loc",loc,"(%d,%d)" % ( lineno(loc,instring), col(loc,instring) )) + if self.debugActions[0]: + self.debugActions[0](instring, loc, self) if callPreParse and self.callPreparse: - preloc = self.preParse( instring, loc ) + preloc = self.preParse(instring, loc) else: preloc = loc tokensStart = loc try: try: - loc,tokens = self.parseImpl( instring, preloc, doActions ) + loc, tokens = self.parseImpl(instring, preloc, doActions) except IndexError: - raise ParseException( instring, len(instring), self.errmsg, self ) + raise ParseException(instring, len(instring), self.errmsg, self) except ParseBaseException as err: - #~ print ("Exception raised:", err) + # ~ print ("Exception raised:", err) if self.debugActions[2]: - self.debugActions[2]( instring, tokensStart, self, err ) + self.debugActions[2](instring, tokensStart, self, err) if self.failAction: - self.failAction( instring, tokensStart, self, err ) + self.failAction(instring, tokensStart, self, err) raise else: if callPreParse and self.callPreparse: - preloc = self.preParse( instring, loc ) + preloc = self.preParse(instring, loc) else: preloc = loc tokensStart = loc if self.mayIndexError or loc >= len(instring): try: - loc,tokens = self.parseImpl( instring, preloc, doActions ) + loc, tokens = self.parseImpl(instring, preloc, doActions) except IndexError: - raise ParseException( instring, len(instring), self.errmsg, self ) + raise ParseException(instring, len(instring), self.errmsg, self) else: - loc,tokens = self.parseImpl( instring, preloc, doActions ) + loc, tokens = self.parseImpl(instring, preloc, doActions) - tokens = self.postParse( instring, loc, tokens ) + tokens = self.postParse(instring, loc, tokens) - retTokens = ParseResults( tokens, self.resultsName, asList=self.saveAsList, modal=self.modalResults ) + retTokens = ParseResults( + tokens, self.resultsName, asList=self.saveAsList, modal=self.modalResults + ) if self.parseAction and (doActions or self.callDuringTry): if debugging: try: for fn in self.parseAction: - tokens = fn( instring, tokensStart, retTokens ) + tokens 
= fn(instring, tokensStart, retTokens) if tokens is not None: - retTokens = ParseResults( tokens, - self.resultsName, - asList=self.saveAsList and isinstance(tokens,(ParseResults,list)), - modal=self.modalResults ) + retTokens = ParseResults( + tokens, + self.resultsName, + asList=self.saveAsList + and isinstance(tokens, (ParseResults, list)), + modal=self.modalResults, + ) except ParseBaseException as err: # print ("Exception raised in user parse action:", err) - if (self.debugActions[2] ): - self.debugActions[2]( instring, tokensStart, self, err ) + if self.debugActions[2]: + self.debugActions[2](instring, tokensStart, self, err) raise else: for fn in self.parseAction: - tokens = fn( instring, tokensStart, retTokens ) + tokens = fn(instring, tokensStart, retTokens) if tokens is not None: - retTokens = ParseResults( tokens, - self.resultsName, - asList=self.saveAsList and isinstance(tokens,(ParseResults,list)), - modal=self.modalResults ) + retTokens = ParseResults( + tokens, + self.resultsName, + asList=self.saveAsList + and isinstance(tokens, (ParseResults, list)), + modal=self.modalResults, + ) if debugging: - #~ print ("Matched",self,"->",retTokens.asList()) - if (self.debugActions[1] ): - self.debugActions[1]( instring, tokensStart, loc, self, retTokens ) + # ~ print ("Matched",self,"->",retTokens.asList()) + if self.debugActions[1]: + self.debugActions[1](instring, tokensStart, loc, self, retTokens) return loc, retTokens - def tryParse( self, instring, loc ): + def tryParse(self, instring, loc): try: - return self._parse( instring, loc, doActions=False )[0] + return self._parse(instring, loc, doActions=False)[0] except ParseFatalException: - raise ParseException( instring, loc, self.errmsg, self) + raise ParseException(instring, loc, self.errmsg, self) # this method gets repeatedly called during backtracking with the same arguments - # we can cache these arguments and save ourselves the trouble of re-parsing the contained expression - def _parseCache( self, instring, loc, doActions=True, callPreParse=True ): - lookup = (self,instring,loc,callPreParse,doActions) + def _parseCache(self, instring, loc, doActions=True, callPreParse=True): + lookup = (self, instring, loc, callPreParse, doActions) if lookup in ParserElement._exprArgCache: - value = ParserElement._exprArgCache[ lookup ] - if isinstance(value,Exception): + value = ParserElement._exprArgCache[lookup] + if isinstance(value, Exception): raise value return value else: try: - value = self._parseNoCache( instring, loc, doActions, callPreParse ) - ParserElement._exprArgCache[ lookup ] = (value[0],value[1].copy()) + value = self._parseNoCache(instring, loc, doActions, callPreParse) + ParserElement._exprArgCache[lookup] = (value[0], value[1].copy()) return value except ParseBaseException as pe: - ParserElement._exprArgCache[ lookup ] = pe + ParserElement._exprArgCache[lookup] = pe raise _parse = _parseNoCache # argument cache for optimizing repeated calls when backtracking through recursive expressions _exprArgCache = {} + def resetCache(): ParserElement._exprArgCache.clear() + resetCache = staticmethod(resetCache) _packratEnabled = False + def enablePackrat(): """Enables "packrat" parsing, which adds memoizing to the parsing logic. - Repeated parse attempts at the same string location (which happens - often in many complex grammars) can immediately return a cached value, - instead of re-executing parsing/validating code. Memoizing is done of - both valid results and parsing exceptions. 
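_parseCache above is the core of packrat parsing: calls are keyed by (expression, string, location, flags), and both successful results and raised exceptions are cached and replayed. Stripped to its essentials, as an illustrative sketch rather than the module's own code:

    _cache = {}

    def cached_parse(expr, instring, loc):
        # cache hits replay a stored success or re-raise a stored failure
        # (the real _parseCache also copies the token list before caching)
        key = (id(expr), instring, loc)
        if key in _cache:
            value = _cache[key]
            if isinstance(value, Exception):
                raise value
            return value
        try:
            _cache[key] = value = expr._parse(instring, loc)
            return value
        except Exception as err:
            _cache[key] = err
            raise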
- - This speedup may break existing programs that use parse actions that - have side-effects. For this reason, packrat parsing is disabled when - you first import pyparsing. To activate the packrat feature, your - program must call the class method ParserElement.enablePackrat(). If - your program uses psyco to "compile as you go", you must call - enablePackrat before calling psyco.full(). If you do not do this, - Python will crash. For best results, call enablePackrat() immediately - after importing pyparsing. + Repeated parse attempts at the same string location (which happens + often in many complex grammars) can immediately return a cached value, + instead of re-executing parsing/validating code. Memoizing is done of + both valid results and parsing exceptions. + + This speedup may break existing programs that use parse actions that + have side-effects. For this reason, packrat parsing is disabled when + you first import pyparsing. To activate the packrat feature, your + program must call the class method ParserElement.enablePackrat(). If + your program uses psyco to "compile as you go", you must call + enablePackrat before calling psyco.full(). If you do not do this, + Python will crash. For best results, call enablePackrat() immediately + after importing pyparsing. """ if not ParserElement._packratEnabled: ParserElement._packratEnabled = True ParserElement._parse = ParserElement._parseCache + enablePackrat = staticmethod(enablePackrat) - def parseString( self, instring, parseAll=False ): + def parseString(self, instring, parseAll=False): """Execute the parse expression with the given string. - This is the main interface to the client code, once the complete - expression has been built. - - If you want the grammar to require that the entire input string be - successfully parsed, then set parseAll to True (equivalent to ending - the grammar with StringEnd()). - - Note: parseString implicitly calls expandtabs() on the input string, - in order to report proper column numbers in parse actions. - If the input string contains tabs and - the grammar uses parse actions that use the loc argument to index into the - string being parsed, you can ensure you have a consistent view of the input - string by: - - calling parseWithTabs on your grammar before calling parseString - (see L{I{parseWithTabs}}) - - define your parse action using the full (s,loc,toks) signature, and - reference the input string using the parse action's s argument - - explictly expand the tabs in your input string before calling - parseString + This is the main interface to the client code, once the complete + expression has been built. + + If you want the grammar to require that the entire input string be + successfully parsed, then set parseAll to True (equivalent to ending + the grammar with StringEnd()). + + Note: parseString implicitly calls expandtabs() on the input string, + in order to report proper column numbers in parse actions. 
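Per the enablePackrat docstring above, memoization is opt-in; switch it on right after import, before building grammars (and before psyco.full(), if used):

    from whoosh.support.pyparsing import ParserElement

    ParserElement.enablePackrat()   # swaps ParserElement._parse to _parseCache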
+ If the input string contains tabs and + the grammar uses parse actions that use the loc argument to index into the + string being parsed, you can ensure you have a consistent view of the input + string by: + - calling parseWithTabs on your grammar before calling parseString + (see L{I{parseWithTabs}}) + - define your parse action using the full (s,loc,toks) signature, and + reference the input string using the parse action's s argument + - explictly expand the tabs in your input string before calling + parseString """ ParserElement.resetCache() if not self.streamlined: self.streamline() - #~ self.saveAsList = True + # ~ self.saveAsList = True for e in self.ignoreExprs: e.streamline() if not self.keepTabs: instring = instring.expandtabs() try: - loc, tokens = self._parse( instring, 0 ) + loc, tokens = self._parse(instring, 0) if parseAll: - loc = self.preParse( instring, loc ) - StringEnd()._parse( instring, loc ) + loc = self.preParse(instring, loc) + StringEnd()._parse(instring, loc) except ParseBaseException as exc: # catch and re-raise exception from here, clears out pyparsing internal stack trace raise exc else: return tokens - def scanString( self, instring, maxMatches=_MAX_INT ): + def scanString(self, instring, maxMatches=_MAX_INT): """Scan the input string for expression matches. Each match will return the - matching tokens, start location, and end location. May be called with optional - maxMatches argument, to clip scanning after 'n' matches are found. + matching tokens, start location, and end location. May be called with optional + maxMatches argument, to clip scanning after 'n' matches are found. - Note that the start and end locations are reported relative to the string - being parsed. See L{I{parseString}} for more information on parsing - strings with embedded tabs.""" + Note that the start and end locations are reported relative to the string + being parsed. See L{I{parseString}} for more information on parsing + strings with embedded tabs.""" if not self.streamlined: self.streamline() for e in self.ignoreExprs: @@ -1103,10 +1310,10 @@ def scanString( self, instring, maxMatches=_MAX_INT ): try: while loc <= instrlen and matches < maxMatches: try: - preloc = preparseFn( instring, loc ) - nextLoc,tokens = parseFn( instring, preloc, callPreParse=False ) + preloc = preparseFn(instring, loc) + nextLoc, tokens = parseFn(instring, preloc, callPreParse=False) except ParseException: - loc = preloc+1 + loc = preloc + 1 else: matches += 1 yield tokens, preloc, nextLoc @@ -1114,288 +1321,335 @@ def scanString( self, instring, maxMatches=_MAX_INT ): except ParseBaseException as pe: raise pe - def transformString( self, instring ): + def transformString(self, instring): """Extension to scanString, to modify matching text with modified tokens that may - be returned from a parse action. To use transformString, define a grammar and - attach a parse action to it that modifies the returned token list. - Invoking transformString() on a target string will then scan for matches, - and replace the matched text patterns according to the logic in the parse - action. transformString() returns the resulting transformed string.""" + be returned from a parse action. To use transformString, define a grammar and + attach a parse action to it that modifies the returned token list. + Invoking transformString() on a target string will then scan for matches, + and replace the matched text patterns according to the logic in the parse + action. 
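scanString above yields (tokens, start, end) triples for every match in a string, and transformString builds on it to rewrite matched spans through parse actions:

    from whoosh.support.pyparsing import Word, nums

    integer = Word(nums)
    for tokens, start, end in integer.scanString("a 12 b 345"):
        print(tokens[0], start, end)   # -> 12 2 4, then 345 7 10

    integer.setParseAction(lambda t: "#")
    print(integer.transformString("a 12 b 345"))   # -> a # b #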
transformString() returns the resulting transformed string.""" out = [] lastE = 0 # force preservation of s, to minimize unwanted transformation of string, and to # keep string locs straight between transformString and scanString self.keepTabs = True try: - for t,s,e in self.scanString( instring ): - out.append( instring[lastE:s] ) + for t, s, e in self.scanString(instring): + out.append(instring[lastE:s]) if t: - if isinstance(t,ParseResults): + if isinstance(t, ParseResults): out += t.asList() - elif isinstance(t,list): + elif isinstance(t, list): out += t else: out.append(t) lastE = e out.append(instring[lastE:]) - return "".join(map(_ustr,out)) + return "".join(map(_ustr, out)) except ParseBaseException as pe: raise pe - def searchString( self, instring, maxMatches=_MAX_INT ): + def searchString(self, instring, maxMatches=_MAX_INT): """Another extension to scanString, simplifying the access to the tokens found - to match the given parse expression. May be called with optional - maxMatches argument, to clip searching after 'n' matches are found. + to match the given parse expression. May be called with optional + maxMatches argument, to clip searching after 'n' matches are found. """ try: - return ParseResults([ t for t,s,e in self.scanString( instring, maxMatches ) ]) + return ParseResults( + [t for t, s, e in self.scanString(instring, maxMatches)] + ) except ParseBaseException as pe: raise pe - def __add__(self, other ): + def __add__(self, other): """Implementation of + operator - returns And""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn(f"Cannot combine element of type {type(other)} with ParserElement", - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None - return And( [ self, other ] ) + return And([self, other]) - def __radd__(self, other ): + def __radd__(self, other): """Implementation of + operator when left operand is not a ParserElement""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn(f"Cannot combine element of type {type(other)} with ParserElement", - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None return other + self def __sub__(self, other): """Implementation of - operator, returns And with error stop""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn(f"Cannot combine element of type {type(other)} with ParserElement", - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None - return And( [ self, And._ErrorStop(), other ] ) + return And([self, And._ErrorStop(), other]) - def __rsub__(self, other ): + def __rsub__(self, other): """Implementation of - operator when left operand is not a ParserElement""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, 
ParserElement ): - warnings.warn(f"Cannot combine element of type {type(other)} with ParserElement", - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None return other - self - def __mul__(self,other): - if isinstance(other,int): - minElements, optElements = other,0 - elif isinstance(other,tuple): + def __mul__(self, other): + if isinstance(other, int): + minElements, optElements = other, 0 + elif isinstance(other, tuple): other = (other + (None, None))[:2] if other[0] is None: other = (0, other[1]) - if isinstance(other[0],int) and other[1] is None: + if isinstance(other[0], int) and other[1] is None: if other[0] == 0: return ZeroOrMore(self) if other[0] == 1: return OneOrMore(self) else: - return self*other[0] + ZeroOrMore(self) - elif isinstance(other[0],int) and isinstance(other[1],int): + return self * other[0] + ZeroOrMore(self) + elif isinstance(other[0], int) and isinstance(other[1], int): minElements, optElements = other optElements -= minElements else: - raise TypeError("cannot multiply 'ParserElement' and ('%s','%s') objects", type(other[0]),type(other[1])) + raise TypeError( + "cannot multiply 'ParserElement' and ('%s','%s') objects", + type(other[0]), + type(other[1]), + ) else: - raise TypeError("cannot multiply 'ParserElement' and '%s' objects", type(other)) + raise TypeError( + "cannot multiply 'ParserElement' and '%s' objects", type(other) + ) if minElements < 0: raise ValueError("cannot multiply ParserElement by negative value") if optElements < 0: - raise ValueError("second tuple value must be greater or equal to first tuple value") + raise ValueError( + "second tuple value must be greater or equal to first tuple value" + ) if minElements == optElements == 0: raise ValueError("cannot multiply ParserElement by 0 or (0,0)") - if (optElements): + if optElements: + def makeOptionalList(n): - if n>1: - return Optional(self + makeOptionalList(n-1)) + if n > 1: + return Optional(self + makeOptionalList(n - 1)) else: return Optional(self) + if minElements: if minElements == 1: ret = self + makeOptionalList(optElements) else: - ret = And([self]*minElements) + makeOptionalList(optElements) + ret = And([self] * minElements) + makeOptionalList(optElements) else: ret = makeOptionalList(optElements) else: if minElements == 1: ret = self else: - ret = And([self]*minElements) + ret = And([self] * minElements) return ret def __rmul__(self, other): return self.__mul__(other) - def __or__(self, other ): + def __or__(self, other): """Implementation of | operator - returns MatchFirst""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn(f"Cannot combine element of type {type(other)} with ParserElement", - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None - return MatchFirst( [ self, other ] ) + return MatchFirst([self, other]) - def __ror__(self, other ): + def __ror__(self, other): """Implementation of | operator when left operand is not a ParserElement""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn(f"Cannot 
combine element of type {type(other)} with ParserElement", - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None return other | self - def __xor__(self, other ): + def __xor__(self, other): """Implementation of ^ operator - returns Or""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn(f"Cannot combine element of type {type(other)} with ParserElement", - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None - return Or( [ self, other ] ) + return Or([self, other]) - def __rxor__(self, other ): + def __rxor__(self, other): """Implementation of ^ operator when left operand is not a ParserElement""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn(f"Cannot combine element of type {type(other)} with ParserElement", - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None return other ^ self - def __and__(self, other ): + def __and__(self, other): """Implementation of & operator - returns Each""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn(f"Cannot combine element of type {type(other)} with ParserElement", - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None - return Each( [ self, other ] ) + return Each([self, other]) - def __rand__(self, other ): + def __rand__(self, other): """Implementation of & operator when left operand is not a ParserElement""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn(f"Cannot combine element of type {type(other)} with ParserElement", - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None return other & self - def __invert__( self ): + def __invert__(self): """Implementation of ~ operator - returns NotAny""" - return NotAny( self ) + return NotAny(self) def __call__(self, name): """Shortcut for setResultsName, with listAllMatches=default:: - userdata = Word(alphas).setResultsName("name") + Word(nums+"-").setResultsName("socsecno") - could be written as:: - userdata = Word(alphas)("name") + Word(nums+"-")("socsecno") - """ + userdata = Word(alphas).setResultsName("name") + Word(nums+"-").setResultsName("socsecno") + could be written as:: + userdata = Word(alphas)("name") + Word(nums+"-")("socsecno") + """ return self.setResultsName(name) - def suppress( self ): + def suppress(self): """Suppresses 
the output of this ParserElement; useful to keep punctuation from - cluttering up returned output. + cluttering up returned output. """ - return Suppress( self ) + return Suppress(self) - def leaveWhitespace( self ): + def leaveWhitespace(self): """Disables the skipping of whitespace before matching the characters in the - ParserElement's defined pattern. This is normally only used internally by - the pyparsing module, but may be needed in some whitespace-sensitive grammars. + ParserElement's defined pattern. This is normally only used internally by + the pyparsing module, but may be needed in some whitespace-sensitive grammars. """ self.skipWhitespace = False return self - def setWhitespaceChars( self, chars ): - """Overrides the default whitespace chars - """ + def setWhitespaceChars(self, chars): + """Overrides the default whitespace chars""" self.skipWhitespace = True self.whiteChars = chars self.copyDefaultWhiteChars = False return self - def parseWithTabs( self ): + def parseWithTabs(self): """Overrides default behavior to expand <TAB>s to spaces before parsing the input string. - Must be called before parseString when the input grammar contains elements that - match <TAB> characters.""" + Must be called before parseString when the input grammar contains elements that + match <TAB> characters.""" self.keepTabs = True return self - def ignore( self, other ): + def ignore(self, other): """Define expression to be ignored (e.g., comments) while doing pattern - matching; may be called repeatedly, to define multiple comment or other - ignorable patterns. + matching; may be called repeatedly, to define multiple comment or other + ignorable patterns. """ - if isinstance( other, Suppress ): + if isinstance(other, Suppress): if other not in self.ignoreExprs: - self.ignoreExprs.append( other ) + self.ignoreExprs.append(other) else: - self.ignoreExprs.append( Suppress( other ) ) + self.ignoreExprs.append(Suppress(other)) return self - def setDebugActions( self, startAction, successAction, exceptionAction ): + def setDebugActions(self, startAction, successAction, exceptionAction): """Enable display of debugging messages while doing pattern matching.""" - self.debugActions = (startAction or _defaultStartDebugAction, - successAction or _defaultSuccessDebugAction, - exceptionAction or _defaultExceptionDebugAction) + self.debugActions = ( + startAction or _defaultStartDebugAction, + successAction or _defaultSuccessDebugAction, + exceptionAction or _defaultExceptionDebugAction, + ) self.debug = True return self - def setDebug( self, flag=True ): + def setDebug(self, flag=True): """Enable display of debugging messages while doing pattern matching. 
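For example, matching can be traced during grammar development (a minimal sketch, assuming the vendored module imports as whoosh.support.pyparsing):

    from whoosh.support.pyparsing import Word, alphas

    # setName() labels the element in the debug output; setDebug() turns it on.
    wd = Word(alphas).setName("word").setDebug()
    wd.parseString("hello")  # prints match-attempt and match-success messages
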
- Set flag to True to enable, False to disable.""" + Set flag to True to enable, False to disable.""" if flag: - self.setDebugActions( _defaultStartDebugAction, _defaultSuccessDebugAction, _defaultExceptionDebugAction ) + self.setDebugActions( + _defaultStartDebugAction, + _defaultSuccessDebugAction, + _defaultExceptionDebugAction, + ) else: self.debug = False return self - def __str__( self ): + def __str__(self): return self.name - def __repr__( self ): + def __repr__(self): return _ustr(self) - def streamline( self ): + def streamline(self): self.streamlined = True self.strRepr = None return self - def checkRecursion( self, parseElementList ): + def checkRecursion(self, parseElementList): pass - def validate( self, validateTrace=[] ): + def validate(self, validateTrace=[]): """Check defined expressions for valid structure, check for infinite recursive definitions.""" - self.checkRecursion( [] ) + self.checkRecursion([]) - def parseFile( self, file_or_filename, parseAll=False ): + def parseFile(self, file_or_filename, parseAll=False): """Execute the parse expression on the given file or filename. - If a filename is specified (instead of a file object), - the entire file is opened, read, and closed before parsing. + If a filename is specified (instead of a file object), + the entire file is opened, read, and closed before parsing. """ try: file_contents = file_or_filename.read() @@ -1410,16 +1664,16 @@ def parseFile( self, file_or_filename, parseAll=False ): raise exc def getException(self): - return ParseException("",0,self.errmsg,self) + return ParseException("", 0, self.errmsg, self) - def __getattr__(self,aname): + def __getattr__(self, aname): if aname == "myException": - self.myException = ret = self.getException(); - return ret; + self.myException = ret = self.getException() + return ret else: raise AttributeError("no such attribute " + aname) - def __eq__(self,other): + def __eq__(self, other): if isinstance(other, ParserElement): return self is other or self.__dict__ == other.__dict__ elif isinstance(other, basestring): @@ -1429,38 +1683,40 @@ def __eq__(self,other): except ParseBaseException: return False else: - return super(ParserElement,self)==other + return super(ParserElement, self) == other - def __ne__(self,other): + def __ne__(self, other): return not (self == other) def __hash__(self): return hash(id(self)) - def __req__(self,other): + def __req__(self, other): return self == other - def __rne__(self,other): + def __rne__(self, other): return not (self == other) class Token(ParserElement): """Abstract ParserElement subclass, for defining atomic matching patterns.""" - def __init__( self ): - super(Token,self).__init__( savelist=False ) - #self.myException = ParseException("",0,"",self) + + def __init__(self): + super(Token, self).__init__(savelist=False) + # self.myException = ParseException("",0,"",self) def setName(self, name): - s = super(Token,self).setName(name) + s = super(Token, self).setName(name) self.errmsg = "Expected " + self.name - #s.myException.msg = self.errmsg + # s.myException.msg = self.errmsg return s class Empty(Token): """An empty token, will always match.""" - def __init__( self ): - super(Empty,self).__init__() + + def __init__(self): + super(Empty, self).__init__() self.name = "Empty" self.mayReturnEmpty = True self.mayIndexError = False @@ -1468,15 +1724,16 @@ def __init__( self ): class NoMatch(Token): """A token that will never match.""" - def __init__( self ): - super(NoMatch,self).__init__() + + def __init__(self): + super(NoMatch, 
self).__init__() self.name = "NoMatch" self.mayReturnEmpty = True self.mayIndexError = False self.errmsg = "Unmatchable token" - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): exc = self.myException exc.loc = loc exc.pstr = instring @@ -1485,62 +1742,74 @@ def parseImpl( self, instring, loc, doActions=True ): class Literal(Token): """Token to exactly match a specified string.""" - def __init__( self, matchString ): - super(Literal,self).__init__() + + def __init__(self, matchString): + super(Literal, self).__init__() self.match = matchString self.matchLen = len(matchString) try: self.firstMatchChar = matchString[0] except IndexError: - warnings.warn("null string passed to Literal; use Empty() instead", - SyntaxWarning, stacklevel=2) + warnings.warn( + "null string passed to Literal; use Empty() instead", + SyntaxWarning, + stacklevel=2, + ) self.__class__ = Empty self.name = f'"{_ustr(self.match)}"' self.errmsg = "Expected " + self.name self.mayReturnEmpty = False - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg self.mayIndexError = False # Performance tuning: this routine gets called a *lot* # if this is a single character match string and the first character matches, # short-circuit as quickly as possible, and avoid calling startswith - #~ @profile - def parseImpl( self, instring, loc, doActions=True ): - if (instring[loc] == self.firstMatchChar and - (self.matchLen==1 or instring.startswith(self.match,loc)) ): - return loc+self.matchLen, self.match - #~ raise ParseException( instring, loc, self.errmsg ) + # ~ @profile + def parseImpl(self, instring, loc, doActions=True): + if instring[loc] == self.firstMatchChar and ( + self.matchLen == 1 or instring.startswith(self.match, loc) + ): + return loc + self.matchLen, self.match + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring raise exc + + _L = Literal + class Keyword(Token): """Token to exactly match a specified string as a keyword, that is, it must be - immediately followed by a non-keyword character. Compare with Literal:: - Literal("if") will match the leading 'if' in 'ifAndOnlyIf'. - Keyword("if") will not; it will only match the leading 'if in 'if x=1', or 'if(y==2)' - Accepts two optional constructor arguments in addition to the keyword string: - identChars is a string of characters that would be valid identifier characters, - defaulting to all alphanumerics + "_" and "$"; caseless allows case-insensitive - matching, default is False. + immediately followed by a non-keyword character. Compare with Literal:: + Literal("if") will match the leading 'if' in 'ifAndOnlyIf'. + Keyword("if") will not; it will only match the leading 'if in 'if x=1', or 'if(y==2)' + Accepts two optional constructor arguments in addition to the keyword string: + identChars is a string of characters that would be valid identifier characters, + defaulting to all alphanumerics + "_" and "$"; caseless allows case-insensitive + matching, default is False. 
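A minimal sketch of the difference, assuming the vendored module imports as whoosh.support.pyparsing:

    from whoosh.support.pyparsing import Keyword, Literal

    # Literal matches wherever the characters line up, even inside a longer word:
    print(Literal("if").parseString("ifAndOnlyIf"))  # -> ['if']
    # Keyword also requires a non-keyword character (or end of input) to follow:
    print(Keyword("if").parseString("if x == 1"))    # -> ['if']
    # Keyword("if").parseString("ifAndOnlyIf") raises ParseException
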
""" - DEFAULT_KEYWORD_CHARS = alphanums+"_$" - def __init__( self, matchString, identChars=DEFAULT_KEYWORD_CHARS, caseless=False ): - super(Keyword,self).__init__() + DEFAULT_KEYWORD_CHARS = alphanums + "_$" + + def __init__(self, matchString, identChars=DEFAULT_KEYWORD_CHARS, caseless=False): + super(Keyword, self).__init__() self.match = matchString self.matchLen = len(matchString) try: self.firstMatchChar = matchString[0] except IndexError: - warnings.warn("null string passed to Keyword; use Empty() instead", - SyntaxWarning, stacklevel=2) + warnings.warn( + "null string passed to Keyword; use Empty() instead", + SyntaxWarning, + stacklevel=2, + ) self.name = f'"{self.match}"' self.errmsg = "Expected " + self.name self.mayReturnEmpty = False - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg self.mayIndexError = False self.caseless = caseless if caseless: @@ -1548,85 +1817,104 @@ def __init__( self, matchString, identChars=DEFAULT_KEYWORD_CHARS, caseless=Fals identChars = identChars.upper() self.identChars = _str2dict(identChars) - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): if self.caseless: - if ( (instring[ loc:loc+self.matchLen ].upper() == self.caselessmatch) and - (loc >= len(instring)-self.matchLen or instring[loc+self.matchLen].upper() not in self.identChars) and - (loc == 0 or instring[loc-1].upper() not in self.identChars) ): - return loc+self.matchLen, self.match + if ( + (instring[loc : loc + self.matchLen].upper() == self.caselessmatch) + and ( + loc >= len(instring) - self.matchLen + or instring[loc + self.matchLen].upper() not in self.identChars + ) + and (loc == 0 or instring[loc - 1].upper() not in self.identChars) + ): + return loc + self.matchLen, self.match else: - if (instring[loc] == self.firstMatchChar and - (self.matchLen==1 or instring.startswith(self.match,loc)) and - (loc >= len(instring)-self.matchLen or instring[loc+self.matchLen] not in self.identChars) and - (loc == 0 or instring[loc-1] not in self.identChars) ): - return loc+self.matchLen, self.match - #~ raise ParseException( instring, loc, self.errmsg ) + if ( + instring[loc] == self.firstMatchChar + and (self.matchLen == 1 or instring.startswith(self.match, loc)) + and ( + loc >= len(instring) - self.matchLen + or instring[loc + self.matchLen] not in self.identChars + ) + and (loc == 0 or instring[loc - 1] not in self.identChars) + ): + return loc + self.matchLen, self.match + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring raise exc def copy(self): - c = super(Keyword,self).copy() + c = super(Keyword, self).copy() c.identChars = Keyword.DEFAULT_KEYWORD_CHARS return c - def setDefaultKeywordChars( chars ): - """Overrides the default Keyword chars - """ + def setDefaultKeywordChars(chars): + """Overrides the default Keyword chars""" Keyword.DEFAULT_KEYWORD_CHARS = chars + setDefaultKeywordChars = staticmethod(setDefaultKeywordChars) + class CaselessLiteral(Literal): """Token to match a specified string, ignoring case of letters. - Note: the matched results will always be in the case of the given - match string, NOT the case of the input text. + Note: the matched results will always be in the case of the given + match string, NOT the case of the input text. 
""" - def __init__( self, matchString ): - super(CaselessLiteral,self).__init__( matchString.upper() ) + + def __init__(self, matchString): + super(CaselessLiteral, self).__init__(matchString.upper()) # Preserve the defining literal. self.returnString = matchString self.name = f"'{self.returnString}'" self.errmsg = "Expected " + self.name - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg - def parseImpl( self, instring, loc, doActions=True ): - if instring[ loc:loc+self.matchLen ].upper() == self.match: - return loc+self.matchLen, self.returnString - #~ raise ParseException( instring, loc, self.errmsg ) + def parseImpl(self, instring, loc, doActions=True): + if instring[loc : loc + self.matchLen].upper() == self.match: + return loc + self.matchLen, self.returnString + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring raise exc + class CaselessKeyword(Keyword): - def __init__( self, matchString, identChars=Keyword.DEFAULT_KEYWORD_CHARS ): - super(CaselessKeyword,self).__init__( matchString, identChars, caseless=True ) - - def parseImpl( self, instring, loc, doActions=True ): - if ( (instring[ loc:loc+self.matchLen ].upper() == self.caselessmatch) and - (loc >= len(instring)-self.matchLen or instring[loc+self.matchLen].upper() not in self.identChars) ): - return loc+self.matchLen, self.match - #~ raise ParseException( instring, loc, self.errmsg ) + def __init__(self, matchString, identChars=Keyword.DEFAULT_KEYWORD_CHARS): + super(CaselessKeyword, self).__init__(matchString, identChars, caseless=True) + + def parseImpl(self, instring, loc, doActions=True): + if (instring[loc : loc + self.matchLen].upper() == self.caselessmatch) and ( + loc >= len(instring) - self.matchLen + or instring[loc + self.matchLen].upper() not in self.identChars + ): + return loc + self.matchLen, self.match + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring raise exc + class Word(Token): """Token for matching words composed of allowed character sets. - Defined with string containing all allowed initial characters, - an optional string containing allowed body characters (if omitted, - defaults to the initial character set), and an optional minimum, - maximum, and/or exact length. The default value for min is 1 (a - minimum value < 1 is not valid); the default values for max and exact - are 0, meaning no maximum or exact length restriction. + Defined with string containing all allowed initial characters, + an optional string containing allowed body characters (if omitted, + defaults to the initial character set), and an optional minimum, + maximum, and/or exact length. The default value for min is 1 (a + minimum value < 1 is not valid); the default values for max and exact + are 0, meaning no maximum or exact length restriction. 
""" - def __init__( self, initChars, bodyChars=None, min=1, max=0, exact=0, asKeyword=False ): - super(Word,self).__init__() + + def __init__( + self, initChars, bodyChars=None, min=1, max=0, exact=0, asKeyword=False + ): + super(Word, self).__init__() self.initCharsOrig = initChars self.initChars = _str2dict(initChars) - if bodyChars : + if bodyChars: self.bodyCharsOrig = bodyChars self.bodyChars = _str2dict(bodyChars) else: @@ -1636,7 +1924,9 @@ def __init__( self, initChars, bodyChars=None, min=1, max=0, exact=0, asKeyword= self.maxSpecified = max > 0 if min < 1: - raise ValueError("cannot specify a minimum length < 1; use Optional(Word()) if zero-length word is permitted") + raise ValueError( + "cannot specify a minimum length < 1; use Optional(Word()) if zero-length word is permitted" + ) self.minLen = min @@ -1651,31 +1941,35 @@ def __init__( self, initChars, bodyChars=None, min=1, max=0, exact=0, asKeyword= self.name = _ustr(self) self.errmsg = "Expected " + self.name - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg self.mayIndexError = False self.asKeyword = asKeyword - if ' ' not in self.initCharsOrig+self.bodyCharsOrig and (min==1 and max==0 and exact==0): + if " " not in self.initCharsOrig + self.bodyCharsOrig and ( + min == 1 and max == 0 and exact == 0 + ): if self.bodyCharsOrig == self.initCharsOrig: self.reString = f"[{_escapeRegexRangeChars(self.initCharsOrig)}]+" elif len(self.bodyCharsOrig) == 1: - self.reString = "%s[%s]*" % \ - (re.escape(self.initCharsOrig), - _escapeRegexRangeChars(self.bodyCharsOrig),) + self.reString = "%s[%s]*" % ( + re.escape(self.initCharsOrig), + _escapeRegexRangeChars(self.bodyCharsOrig), + ) else: - self.reString = "[%s][%s]*" % \ - (_escapeRegexRangeChars(self.initCharsOrig), - _escapeRegexRangeChars(self.bodyCharsOrig),) + self.reString = "[%s][%s]*" % ( + _escapeRegexRangeChars(self.initCharsOrig), + _escapeRegexRangeChars(self.bodyCharsOrig), + ) if self.asKeyword: - self.reString = r"\b"+self.reString+r"\b" + self.reString = r"\b" + self.reString + r"\b" try: - self.re = re.compile( self.reString ) + self.re = re.compile(self.reString) except: self.re = None - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): if self.re: - result = self.re.match(instring,loc) + result = self.re.match(instring, loc) if not result: exc = self.myException exc.loc = loc @@ -1683,10 +1977,10 @@ def parseImpl( self, instring, loc, doActions=True ): raise exc loc = result.end() - return loc,result.group() + return loc, result.group() - if not(instring[ loc ] in self.initChars): - #~ raise ParseException( instring, loc, self.errmsg ) + if not (instring[loc] in self.initChars): + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring @@ -1696,7 +1990,7 @@ def parseImpl( self, instring, loc, doActions=True ): instrlen = len(instring) bodychars = self.bodyChars maxloc = start + self.maxLen - maxloc = min( maxloc, instrlen ) + maxloc = min(maxloc, instrlen) while loc < maxloc and instring[loc] in bodychars: loc += 1 @@ -1706,11 +2000,13 @@ def parseImpl( self, instring, loc, doActions=True ): if self.maxSpecified and loc < instrlen and instring[loc] in bodychars: throwException = True if self.asKeyword: - if (start>0 and instring[start-1] in bodychars) or (loc 0 and instring[start - 1] in bodychars) or ( + loc < instrlen and instring[loc] in bodychars + ): throwException = True if throwException: - #~ raise ParseException( 
instring, loc, self.errmsg ) + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring @@ -1718,22 +2014,21 @@ def parseImpl( self, instring, loc, doActions=True ): return loc, instring[start:loc] - def __str__( self ): + def __str__(self): try: - return super(Word,self).__str__() + return super(Word, self).__str__() except: pass - if self.strRepr is None: def charsAsStr(s): - if len(s)>4: - return s[:4]+"..." + if len(s) > 4: + return s[:4] + "..." else: return s - if ( self.initCharsOrig != self.bodyCharsOrig ): + if self.initCharsOrig != self.bodyCharsOrig: self.strRepr = f"W:({charsAsStr(self.initCharsOrig)},{charsAsStr(self.bodyCharsOrig)})" else: self.strRepr = f"W:({charsAsStr(self.initCharsOrig)})" @@ -1743,15 +2038,19 @@ def charsAsStr(s): class Regex(Token): """Token for matching strings that match a given regular expression. - Defined with string specifying the regular expression in a form recognized by the inbuilt Python re module. + Defined with string specifying the regular expression in a form recognized by the inbuilt Python re module. """ - def __init__( self, pattern, flags=0): + + def __init__(self, pattern, flags=0): """The parameters pattern and flags are passed to the re.compile() function as-is. See the Python re module for an explanation of the acceptable patterns and flags.""" - super(Regex,self).__init__() + super(Regex, self).__init__() if len(pattern) == 0: - warnings.warn("null string passed to Regex; use Empty() instead", - SyntaxWarning, stacklevel=2) + warnings.warn( + "null string passed to Regex; use Empty() instead", + SyntaxWarning, + stacklevel=2, + ) self.pattern = pattern self.flags = flags @@ -1760,18 +2059,21 @@ def __init__( self, pattern, flags=0): self.re = re.compile(self.pattern, self.flags) self.reString = self.pattern except sre_constants.error: - warnings.warn(f"invalid pattern ({pattern}) passed to Regex", - SyntaxWarning, stacklevel=2) + warnings.warn( + f"invalid pattern ({pattern}) passed to Regex", + SyntaxWarning, + stacklevel=2, + ) raise self.name = _ustr(self) self.errmsg = "Expected " + self.name - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg self.mayIndexError = False self.mayReturnEmpty = True - def parseImpl( self, instring, loc, doActions=True ): - result = self.re.match(instring,loc) + def parseImpl(self, instring, loc, doActions=True): + result = self.re.match(instring, loc) if not result: exc = self.myException exc.loc = loc @@ -1784,11 +2086,11 @@ def parseImpl( self, instring, loc, doActions=True ): if d: for k in d: ret[k] = d[k] - return loc,ret + return loc, ret - def __str__( self ): + def __str__(self): try: - return super(Regex,self).__str__() + return super(Regex, self).__str__() except: pass @@ -1799,24 +2101,34 @@ def __str__( self ): class QuotedString(Token): - """Token for matching strings that are delimited by quoting characters. 
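A minimal usage sketch, assuming the vendored import path whoosh.support.pyparsing:

    from whoosh.support.pyparsing import QuotedString

    # SQL-style single quotes, with '' escaping an embedded quote:
    sql_string = QuotedString("'", escQuote="''")
    print(sql_string.parseString("'it''s'"))  # -> ["it's"]
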
- """ - def __init__( self, quoteChar, escChar=None, escQuote=None, multiline=False, unquoteResults=True, endQuoteChar=None): + """Token for matching strings that are delimited by quoting characters.""" + + def __init__( + self, + quoteChar, + escChar=None, + escQuote=None, + multiline=False, + unquoteResults=True, + endQuoteChar=None, + ): """ - Defined with the following parameters: - - quoteChar - string of one or more characters defining the quote delimiting string - - escChar - character to escape quotes, typically backslash (default=None) - - escQuote - special quote sequence to escape an embedded quote string (such as SQL's "" to escape an embedded ") (default=None) - - multiline - boolean indicating whether quotes can span multiple lines (default=False) - - unquoteResults - boolean indicating whether the matched text should be unquoted (default=True) - - endQuoteChar - string of one or more characters defining the end of the quote delimited string (default=None => same as quoteChar) + Defined with the following parameters: + - quoteChar - string of one or more characters defining the quote delimiting string + - escChar - character to escape quotes, typically backslash (default=None) + - escQuote - special quote sequence to escape an embedded quote string (such as SQL's "" to escape an embedded ") (default=None) + - multiline - boolean indicating whether quotes can span multiple lines (default=False) + - unquoteResults - boolean indicating whether the matched text should be unquoted (default=True) + - endQuoteChar - string of one or more characters defining the end of the quote delimited string (default=None => same as quoteChar) """ - super(QuotedString,self).__init__() + super(QuotedString, self).__init__() # remove white space from quote chars - wont work anyway quoteChar = quoteChar.strip() if len(quoteChar) == 0: - warnings.warn("quoteChar cannot be the empty string",SyntaxWarning,stacklevel=2) + warnings.warn( + "quoteChar cannot be the empty string", SyntaxWarning, stacklevel=2 + ) raise SyntaxError() if endQuoteChar is None: @@ -1824,7 +2136,11 @@ def __init__( self, quoteChar, escChar=None, escQuote=None, multiline=False, unq else: endQuoteChar = endQuoteChar.strip() if len(endQuoteChar) == 0: - warnings.warn("endQuoteChar cannot be the empty string",SyntaxWarning,stacklevel=2) + warnings.warn( + "endQuoteChar cannot be the empty string", + SyntaxWarning, + stacklevel=2, + ) raise SyntaxError() self.quoteChar = quoteChar @@ -1838,45 +2154,63 @@ def __init__( self, quoteChar, escChar=None, escQuote=None, multiline=False, unq if multiline: self.flags = re.MULTILINE | re.DOTALL - self.pattern = r'%s(?:[^%s%s]' % \ - ( re.escape(self.quoteChar), - _escapeRegexRangeChars(self.endQuoteChar[0]), - (escChar is not None and _escapeRegexRangeChars(escChar) or '') ) + self.pattern = r"%s(?:[^%s%s]" % ( + re.escape(self.quoteChar), + _escapeRegexRangeChars(self.endQuoteChar[0]), + (escChar is not None and _escapeRegexRangeChars(escChar) or ""), + ) else: self.flags = 0 - self.pattern = r'%s(?:[^%s\n\r%s]' % \ - ( re.escape(self.quoteChar), - _escapeRegexRangeChars(self.endQuoteChar[0]), - (escChar is not None and _escapeRegexRangeChars(escChar) or '') ) + self.pattern = r"%s(?:[^%s\n\r%s]" % ( + re.escape(self.quoteChar), + _escapeRegexRangeChars(self.endQuoteChar[0]), + (escChar is not None and _escapeRegexRangeChars(escChar) or ""), + ) if len(self.endQuoteChar) > 1: self.pattern += ( - '|(?:' + ')|(?:'.join(["%s[^%s]" % (re.escape(self.endQuoteChar[:i]), - 
_escapeRegexRangeChars(self.endQuoteChar[i])) - for i in range(len(self.endQuoteChar)-1,0,-1)]) + ')' + "|(?:" + + ")|(?:".join( + [ + "%s[^%s]" + % ( + re.escape(self.endQuoteChar[:i]), + _escapeRegexRangeChars(self.endQuoteChar[i]), + ) + for i in range(len(self.endQuoteChar) - 1, 0, -1) + ] ) + + ")" + ) if escQuote: - self.pattern += (r'|(?:%s)' % re.escape(escQuote)) + self.pattern += r"|(?:%s)" % re.escape(escQuote) if escChar: - self.pattern += (r'|(?:%s.)' % re.escape(escChar)) - self.escCharReplacePattern = re.escape(self.escChar)+"(.)" - self.pattern += (r')*%s' % re.escape(self.endQuoteChar)) + self.pattern += r"|(?:%s.)" % re.escape(escChar) + self.escCharReplacePattern = re.escape(self.escChar) + "(.)" + self.pattern += r")*%s" % re.escape(self.endQuoteChar) try: self.re = re.compile(self.pattern, self.flags) self.reString = self.pattern except sre_constants.error: - warnings.warn(f"invalid pattern ({self.pattern}) passed to Regex", - SyntaxWarning, stacklevel=2) + warnings.warn( + f"invalid pattern ({self.pattern}) passed to Regex", + SyntaxWarning, + stacklevel=2, + ) raise self.name = _ustr(self) self.errmsg = "Expected " + self.name - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg self.mayIndexError = False self.mayReturnEmpty = True - def parseImpl( self, instring, loc, doActions=True ): - result = instring[loc] == self.firstQuoteChar and self.re.match(instring,loc) or None + def parseImpl(self, instring, loc, doActions=True): + result = ( + instring[loc] == self.firstQuoteChar + and self.re.match(instring, loc) + or None + ) if not result: exc = self.myException exc.loc = loc @@ -1887,14 +2221,13 @@ def parseImpl( self, instring, loc, doActions=True ): ret = result.group() if self.unquoteResults: - # strip off quotes - ret = ret[self.quoteCharLen:-self.endQuoteCharLen] + ret = ret[self.quoteCharLen : -self.endQuoteCharLen] - if isinstance(ret,basestring): + if isinstance(ret, basestring): # replace escaped characters if self.escChar: - ret = re.sub(self.escCharReplacePattern,"\g<1>",ret) + ret = re.sub(self.escCharReplacePattern, "\g<1>", ret) # replace escaped quotes if self.escQuote: @@ -1902,9 +2235,9 @@ def parseImpl( self, instring, loc, doActions=True ): return loc, ret - def __str__( self ): + def __str__(self): try: - return super(QuotedString,self).__str__() + return super(QuotedString, self).__str__() except: pass @@ -1916,18 +2249,21 @@ def __str__( self ): class CharsNotIn(Token): """Token for matching words composed of characters *not* in a given set. - Defined with string containing all disallowed characters, and an optional - minimum, maximum, and/or exact length. The default value for min is 1 (a - minimum value < 1 is not valid); the default values for max and exact - are 0, meaning no maximum or exact length restriction. + Defined with string containing all disallowed characters, and an optional + minimum, maximum, and/or exact length. The default value for min is 1 (a + minimum value < 1 is not valid); the default values for max and exact + are 0, meaning no maximum or exact length restriction. 
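For example (a minimal sketch, same import assumption):

    from whoosh.support.pyparsing import CharsNotIn

    # One comma-delimited field: any run of characters that are not commas.
    field = CharsNotIn(",")
    print(field.parseString("red,green,blue"))  # -> ['red']
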
""" - def __init__( self, notChars, min=1, max=0, exact=0 ): - super(CharsNotIn,self).__init__() + + def __init__(self, notChars, min=1, max=0, exact=0): + super(CharsNotIn, self).__init__() self.skipWhitespace = False self.notChars = notChars if min < 1: - raise ValueError("cannot specify a minimum length < 1; use Optional(CharsNotIn()) if zero-length char group is permitted") + raise ValueError( + "cannot specify a minimum length < 1; use Optional(CharsNotIn()) if zero-length char group is permitted" + ) self.minLen = min @@ -1942,13 +2278,13 @@ def __init__( self, notChars, min=1, max=0, exact=0 ): self.name = _ustr(self) self.errmsg = "Expected " + self.name - self.mayReturnEmpty = ( self.minLen == 0 ) - #self.myException.msg = self.errmsg + self.mayReturnEmpty = self.minLen == 0 + # self.myException.msg = self.errmsg self.mayIndexError = False - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): if instring[loc] in self.notChars: - #~ raise ParseException( instring, loc, self.errmsg ) + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring @@ -1957,13 +2293,12 @@ def parseImpl( self, instring, loc, doActions=True ): start = loc loc += 1 notchars = self.notChars - maxlen = min( start+self.maxLen, len(instring) ) - while loc < maxlen and \ - (instring[loc] not in notchars): + maxlen = min(start + self.maxLen, len(instring)) + while loc < maxlen and (instring[loc] not in notchars): loc += 1 if loc - start < self.minLen: - #~ raise ParseException( instring, loc, self.errmsg ) + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring @@ -1971,7 +2306,7 @@ def parseImpl( self, instring, loc, doActions=True ): return loc, instring[start:loc] - def __str__( self ): + def __str__(self): try: return super(CharsNotIn, self).__str__() except: @@ -1985,28 +2320,33 @@ def __str__( self ): return self.strRepr + class White(Token): """Special matching class for matching whitespace. Normally, whitespace is ignored - by pyparsing grammars. This class is included when some whitespace structures - are significant. Define with a string containing the whitespace characters to be - matched; default is " \\t\\r\\n". Also takes optional min, max, and exact arguments, - as defined for the Word class.""" + by pyparsing grammars. This class is included when some whitespace structures + are significant. Define with a string containing the whitespace characters to be + matched; default is " \\t\\r\\n". 
Also takes optional min, max, and exact arguments, + as defined for the Word class.""" + whiteStrs = { - " " : "<SPC>", + " ": "<SPC>", "\t": "<TAB>", "\n": "<LF>", "\r": "<CR>", "\f": "<FF>", - } + } + def __init__(self, ws=" \t\r\n", min=1, max=0, exact=0): - super(White,self).__init__() + super(White, self).__init__() self.matchWhite = ws - self.setWhitespaceChars( "".join([c for c in self.whiteChars if c not in self.matchWhite]) ) - #~ self.leaveWhitespace() - self.name = ("".join([White.whiteStrs[c] for c in self.matchWhite])) + self.setWhitespaceChars( + "".join([c for c in self.whiteChars if c not in self.matchWhite]) + ) + # ~ self.leaveWhitespace() + self.name = "".join([White.whiteStrs[c] for c in self.matchWhite]) self.mayReturnEmpty = True self.errmsg = "Expected " + self.name - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg self.minLen = min @@ -2019,9 +2359,9 @@ def __init__(self, ws=" \t\r\n", min=1, max=0, exact=0): self.maxLen = exact self.minLen = exact - def parseImpl( self, instring, loc, doActions=True ): - if not(instring[ loc ] in self.matchWhite): - #~ raise ParseException( instring, loc, self.errmsg ) + def parseImpl(self, instring, loc, doActions=True): + if not (instring[loc] in self.matchWhite): + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring @@ -2029,12 +2369,12 @@ def parseImpl( self, instring, loc, doActions=True ): start = loc loc += 1 maxloc = start + self.maxLen - maxloc = min( maxloc, len(instring) ) + maxloc = min(maxloc, len(instring)) while loc < maxloc and instring[loc] in self.matchWhite: loc += 1 if loc - start < self.minLen: - #~ raise ParseException( instring, loc, self.errmsg ) + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring @@ -2044,120 +2384,136 @@ class _PositionToken(Token): - def __init__( self ): - super(_PositionToken,self).__init__() - self.name=self.__class__.__name__ + def __init__(self): + super(_PositionToken, self).__init__() + self.name = self.__class__.__name__ self.mayReturnEmpty = True self.mayIndexError = False + class GoToColumn(_PositionToken): """Token to advance to a specific column of input text; useful for tabular report scraping.""" - def __init__( self, colno ): - super(GoToColumn,self).__init__() + + def __init__(self, colno): + super(GoToColumn, self).__init__() self.col = colno - def preParse( self, instring, loc ): - if col(loc,instring) != self.col: + def preParse(self, instring, loc): + if col(loc, instring) != self.col: instrlen = len(instring) if self.ignoreExprs: - loc = self._skipIgnorables( instring, loc ) - while loc < instrlen and instring[loc].isspace() and col( loc, instring ) != self.col : + loc = self._skipIgnorables(instring, loc) + while ( + loc < instrlen + and instring[loc].isspace() + and col(loc, instring) != self.col + ): loc += 1 return loc - def parseImpl( self, instring, loc, doActions=True ): - thiscol = col( loc, instring ) + def parseImpl(self, instring, loc, doActions=True): + thiscol = col(loc, instring) if thiscol > self.col: - raise ParseException( instring, loc, "Text not in expected column", self ) + raise ParseException(instring, loc, "Text not in expected column", self) newloc = loc + self.col - thiscol - ret = instring[ loc: newloc ] + ret = instring[loc:newloc] return newloc, ret + class LineStart(_PositionToken): """Matches if current position is at the beginning of a line within the parse 
string""" - def __init__( self ): - super(LineStart,self).__init__() - self.setWhitespaceChars( ParserElement.DEFAULT_WHITE_CHARS.replace("\n","") ) + + def __init__(self): + super(LineStart, self).__init__() + self.setWhitespaceChars(ParserElement.DEFAULT_WHITE_CHARS.replace("\n", "")) self.errmsg = "Expected start of line" - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg - def preParse( self, instring, loc ): - preloc = super(LineStart,self).preParse(instring,loc) + def preParse(self, instring, loc): + preloc = super(LineStart, self).preParse(instring, loc) if instring[preloc] == "\n": loc += 1 return loc - def parseImpl( self, instring, loc, doActions=True ): - if not( loc==0 or - (loc == self.preParse( instring, 0 )) or - (instring[loc-1] == "\n") ): #col(loc, instring) != 1: - #~ raise ParseException( instring, loc, "Expected start of line" ) + def parseImpl(self, instring, loc, doActions=True): + if not ( + loc == 0 + or (loc == self.preParse(instring, 0)) + or (instring[loc - 1] == "\n") + ): # col(loc, instring) != 1: + # ~ raise ParseException( instring, loc, "Expected start of line" ) exc = self.myException exc.loc = loc exc.pstr = instring raise exc return loc, [] + class LineEnd(_PositionToken): """Matches if current position is at the end of a line within the parse string""" - def __init__( self ): - super(LineEnd,self).__init__() - self.setWhitespaceChars( ParserElement.DEFAULT_WHITE_CHARS.replace("\n","") ) + + def __init__(self): + super(LineEnd, self).__init__() + self.setWhitespaceChars(ParserElement.DEFAULT_WHITE_CHARS.replace("\n", "")) self.errmsg = "Expected end of line" - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg - def parseImpl( self, instring, loc, doActions=True ): - if loc len(instring): return loc, [] else: @@ -2166,47 +2522,55 @@ def parseImpl( self, instring, loc, doActions=True ): exc.pstr = instring raise exc + class WordStart(_PositionToken): """Matches if the current position is at the beginning of a Word, and - is not preceded by any character in a given set of wordChars - (default=printables). To emulate the \b behavior of regular expressions, - use WordStart(alphanums). WordStart will also match at the beginning of - the string being parsed, or at the beginning of a line. + is not preceded by any character in a given set of wordChars + (default=printables). To emulate the \b behavior of regular expressions, + use WordStart(alphanums). WordStart will also match at the beginning of + the string being parsed, or at the beginning of a line. """ - def __init__(self, wordChars = printables): - super(WordStart,self).__init__() + + def __init__(self, wordChars=printables): + super(WordStart, self).__init__() self.wordChars = _str2dict(wordChars) self.errmsg = "Not at the start of a word" - def parseImpl(self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): if loc != 0: - if (instring[loc-1] in self.wordChars or - instring[loc] not in self.wordChars): + if ( + instring[loc - 1] in self.wordChars + or instring[loc] not in self.wordChars + ): exc = self.myException exc.loc = loc exc.pstr = instring raise exc return loc, [] + class WordEnd(_PositionToken): """Matches if the current position is at the end of a Word, and - is not followed by any character in a given set of wordChars - (default=printables). To emulate the \b behavior of regular expressions, - use WordEnd(alphanums). 
WordEnd will also match at the end of - the string being parsed, or at the end of a line. + is not followed by any character in a given set of wordChars + (default=printables). To emulate the \b behavior of regular expressions, + use WordEnd(alphanums). WordEnd will also match at the end of + the string being parsed, or at the end of a line. """ - def __init__(self, wordChars = printables): - super(WordEnd,self).__init__() + + def __init__(self, wordChars=printables): + super(WordEnd, self).__init__() self.wordChars = _str2dict(wordChars) self.skipWhitespace = False self.errmsg = "Not at the end of a word" - def parseImpl(self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): instrlen = len(instring) - if instrlen>0 and loc<instrlen: - if (instring[loc] in self.wordChars or - instring[loc-1] not in self.wordChars): - #~ raise ParseException( instring, loc, "Expected end of word" ) + if instrlen > 0 and loc < instrlen: + if ( + instring[loc] in self.wordChars + or instring[loc - 1] not in self.wordChars + ): + # ~ raise ParseException( instring, loc, "Expected end of word" ) exc = self.myException exc.loc = loc exc.pstr = instring @@ -2216,51 +2580,52 @@ def parseImpl(self, instring, loc, doActions=True ): class ParseExpression(ParserElement): """Abstract subclass of ParserElement, for combining and post-processing parsed tokens.""" - def __init__( self, exprs, savelist = False ): - super(ParseExpression,self).__init__(savelist) - if isinstance( exprs, list ): + + def __init__(self, exprs, savelist=False): + super(ParseExpression, self).__init__(savelist) + if isinstance(exprs, list): self.exprs = exprs - elif isinstance( exprs, basestring ): - self.exprs = [ Literal( exprs ) ] + elif isinstance(exprs, basestring): + self.exprs = [Literal(exprs)] else: try: - self.exprs = list( exprs ) + self.exprs = list(exprs) except TypeError: - self.exprs = [ exprs ] + self.exprs = [exprs] self.callPreparse = False - def __getitem__( self, i ): + def __getitem__(self, i): return self.exprs[i] - def append( self, other ): - self.exprs.append( other ) + def append(self, other): + self.exprs.append(other) self.strRepr = None return self - def leaveWhitespace( self ): + def leaveWhitespace(self): """Extends leaveWhitespace defined in base class, and also invokes leaveWhitespace on - all contained expressions.""" + all contained expressions.""" self.skipWhitespace = False - self.exprs = [ e.copy() for e in self.exprs ] + self.exprs = [e.copy() for e in self.exprs] for e in self.exprs: e.leaveWhitespace() return self - def ignore( self, other ): - if isinstance( other, Suppress ): + def ignore(self, other): + if isinstance(other, Suppress): if other not in self.ignoreExprs: - super( ParseExpression, self).ignore( other ) + super(ParseExpression, self).ignore(other) for e in self.exprs: - e.ignore( self.ignoreExprs[-1] ) + e.ignore(self.ignoreExprs[-1]) else: - super( ParseExpression, self).ignore( other ) + super(ParseExpression, self).ignore(other) for e in self.exprs: - e.ignore( self.ignoreExprs[-1] ) + e.ignore(self.ignoreExprs[-1]) return self - def __str__( self ): + def __str__(self): try: - return super(ParseExpression,self).__str__() + return super(ParseExpression, self).__str__() except: pass @@ -2268,8 +2633,8 @@ def __str__( self ): self.strRepr = f"{self.__class__.__name__}:({_ustr(self.exprs)})" return self.strRepr - def streamline( self ): - super(ParseExpression,self).streamline() + def streamline(self): + super(ParseExpression, self).streamline() for e in self.exprs: e.streamline() @@ -2277,65 +2642,72 @@ def streamline( self ): # collapse nested And's of the form And( And( And( a,b), c), d) to And( a,b,c,d ) # but only if 
there are no parse actions or resultsNames on the nested And's # (likewise for Or's and MatchFirst's) - if ( len(self.exprs) == 2 ): + if len(self.exprs) == 2: other = self.exprs[0] - if ( isinstance( other, self.__class__ ) and - not(other.parseAction) and - other.resultsName is None and - not other.debug ): - self.exprs = other.exprs[:] + [ self.exprs[1] ] + if ( + isinstance(other, self.__class__) + and not (other.parseAction) + and other.resultsName is None + and not other.debug + ): + self.exprs = other.exprs[:] + [self.exprs[1]] self.strRepr = None self.mayReturnEmpty |= other.mayReturnEmpty - self.mayIndexError |= other.mayIndexError + self.mayIndexError |= other.mayIndexError other = self.exprs[-1] - if ( isinstance( other, self.__class__ ) and - not(other.parseAction) and - other.resultsName is None and - not other.debug ): + if ( + isinstance(other, self.__class__) + and not (other.parseAction) + and other.resultsName is None + and not other.debug + ): self.exprs = self.exprs[:-1] + other.exprs[:] self.strRepr = None self.mayReturnEmpty |= other.mayReturnEmpty - self.mayIndexError |= other.mayIndexError + self.mayIndexError |= other.mayIndexError return self - def setResultsName( self, name, listAllMatches=False ): - ret = super(ParseExpression,self).setResultsName(name,listAllMatches) + def setResultsName(self, name, listAllMatches=False): + ret = super(ParseExpression, self).setResultsName(name, listAllMatches) return ret - def validate( self, validateTrace=[] ): - tmp = validateTrace[:]+[self] + def validate(self, validateTrace=[]): + tmp = validateTrace[:] + [self] for e in self.exprs: e.validate(tmp) - self.checkRecursion( [] ) + self.checkRecursion([]) + class And(ParseExpression): """Requires all given ParseExpressions to be found in the given order. - Expressions may be separated by whitespace. - May be constructed using the '+' operator. + Expressions may be separated by whitespace. + May be constructed using the '+' operator. 
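For example (a minimal sketch, assuming the vendored import path whoosh.support.pyparsing):

    from whoosh.support.pyparsing import Literal, Word, alphas, nums

    # '+' builds an And: every piece must appear, in order.
    assignment = Word(alphas) + Literal("=") + Word(nums)
    print(assignment.parseString("x = 10"))  # -> ['x', '=', '10']
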
""" class _ErrorStop(Empty): def __init__(self, *args, **kwargs): - super(Empty,self).__init__(*args, **kwargs) + super(Empty, self).__init__(*args, **kwargs) self.leaveWhitespace() - def __init__( self, exprs, savelist = True ): - super(And,self).__init__(exprs, savelist) + def __init__(self, exprs, savelist=True): + super(And, self).__init__(exprs, savelist) self.mayReturnEmpty = True for e in self.exprs: if not e.mayReturnEmpty: self.mayReturnEmpty = False break - self.setWhitespaceChars( exprs[0].whiteChars ) + self.setWhitespaceChars(exprs[0].whiteChars) self.skipWhitespace = exprs[0].skipWhitespace self.callPreparse = True - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): # pass False as last arg to _parse for first element, since we already # pre-parsed the string as part of our And pre-parsing - loc, resultlist = self.exprs[0]._parse( instring, loc, doActions, callPreParse=False ) + loc, resultlist = self.exprs[0]._parse( + instring, loc, doActions, callPreParse=False + ) errorStop = False for e in self.exprs[1:]: if isinstance(e, And._ErrorStop): @@ -2343,68 +2715,73 @@ def parseImpl( self, instring, loc, doActions=True ): continue if errorStop: try: - loc, exprtokens = e._parse( instring, loc, doActions ) + loc, exprtokens = e._parse(instring, loc, doActions) except ParseSyntaxException: raise except ParseBaseException as pe: raise ParseSyntaxException(pe) except IndexError as ie: - raise ParseSyntaxException( ParseException(instring, len(instring), self.errmsg, self) ) + raise ParseSyntaxException( + ParseException(instring, len(instring), self.errmsg, self) + ) else: - loc, exprtokens = e._parse( instring, loc, doActions ) + loc, exprtokens = e._parse(instring, loc, doActions) if exprtokens or exprtokens.keys(): resultlist += exprtokens return loc, resultlist - def __iadd__(self, other ): - if isinstance( other, basestring ): - other = Literal( other ) - return self.append( other ) #And( [ self, other ] ) + def __iadd__(self, other): + if isinstance(other, basestring): + other = Literal(other) + return self.append(other) # And( [ self, other ] ) - def checkRecursion( self, parseElementList ): - subRecCheckList = parseElementList[:] + [ self ] + def checkRecursion(self, parseElementList): + subRecCheckList = parseElementList[:] + [self] for e in self.exprs: - e.checkRecursion( subRecCheckList ) + e.checkRecursion(subRecCheckList) if not e.mayReturnEmpty: break - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name if self.strRepr is None: - self.strRepr = "{" + " ".join( [ _ustr(e) for e in self.exprs ] ) + "}" + self.strRepr = "{" + " ".join([_ustr(e) for e in self.exprs]) + "}" return self.strRepr class Or(ParseExpression): """Requires that at least one ParseExpression is found. - If two expressions match, the expression that matches the longest string will be used. - May be constructed using the '^' operator. + If two expressions match, the expression that matches the longest string will be used. + May be constructed using the '^' operator. 
""" - def __init__( self, exprs, savelist = False ): - super(Or,self).__init__(exprs, savelist) + + def __init__(self, exprs, savelist=False): + super(Or, self).__init__(exprs, savelist) self.mayReturnEmpty = False for e in self.exprs: if e.mayReturnEmpty: self.mayReturnEmpty = True break - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): maxExcLoc = -1 maxMatchLoc = -1 maxException = None for e in self.exprs: try: - loc2 = e.tryParse( instring, loc ) + loc2 = e.tryParse(instring, loc) except ParseException as err: if err.loc > maxExcLoc: maxException = err maxExcLoc = err.loc except IndexError: if len(instring) > maxExcLoc: - maxException = ParseException(instring,len(instring),e.errmsg,self) + maxException = ParseException( + instring, len(instring), e.errmsg, self + ) maxExcLoc = len(instring) else: if loc2 > maxMatchLoc: @@ -2415,37 +2792,40 @@ def parseImpl( self, instring, loc, doActions=True ): if maxException is not None: raise maxException else: - raise ParseException(instring, loc, "no defined alternatives to match", self) + raise ParseException( + instring, loc, "no defined alternatives to match", self + ) - return maxMatchExp._parse( instring, loc, doActions ) + return maxMatchExp._parse(instring, loc, doActions) - def __ixor__(self, other ): - if isinstance( other, basestring ): - other = Literal( other ) - return self.append( other ) #Or( [ self, other ] ) + def __ixor__(self, other): + if isinstance(other, basestring): + other = Literal(other) + return self.append(other) # Or( [ self, other ] ) - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name if self.strRepr is None: - self.strRepr = "{" + " ^ ".join( [ _ustr(e) for e in self.exprs ] ) + "}" + self.strRepr = "{" + " ^ ".join([_ustr(e) for e in self.exprs]) + "}" return self.strRepr - def checkRecursion( self, parseElementList ): - subRecCheckList = parseElementList[:] + [ self ] + def checkRecursion(self, parseElementList): + subRecCheckList = parseElementList[:] + [self] for e in self.exprs: - e.checkRecursion( subRecCheckList ) + e.checkRecursion(subRecCheckList) class MatchFirst(ParseExpression): """Requires that at least one ParseExpression is found. - If two expressions match, the first one listed is the one that will match. - May be constructed using the '|' operator. + If two expressions match, the first one listed is the one that will match. + May be constructed using the '|' operator. 
""" - def __init__( self, exprs, savelist = False ): - super(MatchFirst,self).__init__(exprs, savelist) + + def __init__(self, exprs, savelist=False): + super(MatchFirst, self).__init__(exprs, savelist) if exprs: self.mayReturnEmpty = False for e in self.exprs: @@ -2455,12 +2835,12 @@ def __init__( self, exprs, savelist = False ): else: self.mayReturnEmpty = True - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): maxExcLoc = -1 maxException = None for e in self.exprs: try: - ret = e._parse( instring, loc, doActions ) + ret = e._parse(instring, loc, doActions) return ret except ParseException as err: if err.loc > maxExcLoc: @@ -2468,7 +2848,9 @@ def parseImpl( self, instring, loc, doActions=True ): maxExcLoc = err.loc except IndexError: if len(instring) > maxExcLoc: - maxException = ParseException(instring,len(instring),e.errmsg,self) + maxException = ParseException( + instring, len(instring), e.errmsg, self + ) maxExcLoc = len(instring) # only got here if no expression matched, raise exception for match that made it the furthest @@ -2476,35 +2858,38 @@ def parseImpl( self, instring, loc, doActions=True ): if maxException is not None: raise maxException else: - raise ParseException(instring, loc, "no defined alternatives to match", self) + raise ParseException( + instring, loc, "no defined alternatives to match", self + ) - def __ior__(self, other ): - if isinstance( other, basestring ): - other = Literal( other ) - return self.append( other ) #MatchFirst( [ self, other ] ) + def __ior__(self, other): + if isinstance(other, basestring): + other = Literal(other) + return self.append(other) # MatchFirst( [ self, other ] ) - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name if self.strRepr is None: - self.strRepr = "{" + " | ".join( [ _ustr(e) for e in self.exprs ] ) + "}" + self.strRepr = "{" + " | ".join([_ustr(e) for e in self.exprs]) + "}" return self.strRepr - def checkRecursion( self, parseElementList ): - subRecCheckList = parseElementList[:] + [ self ] + def checkRecursion(self, parseElementList): + subRecCheckList = parseElementList[:] + [self] for e in self.exprs: - e.checkRecursion( subRecCheckList ) + e.checkRecursion(subRecCheckList) class Each(ParseExpression): """Requires all given ParseExpressions to be found, but in any order. - Expressions may be separated by whitespace. - May be constructed using the '&' operator. + Expressions may be separated by whitespace. + May be constructed using the '&' operator. 
""" - def __init__( self, exprs, savelist = True ): - super(Each,self).__init__(exprs, savelist) + + def __init__(self, exprs, savelist=True): + super(Each, self).__init__(exprs, savelist) self.mayReturnEmpty = True for e in self.exprs: if not e.mayReturnEmpty: @@ -2513,17 +2898,25 @@ def __init__( self, exprs, savelist = True ): self.skipWhitespace = True self.initExprGroups = True - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): if self.initExprGroups: - self.optionals = [ e.expr for e in self.exprs if isinstance(e,Optional) ] - self.multioptionals = [ e.expr for e in self.exprs if isinstance(e,ZeroOrMore) ] - self.multirequired = [ e.expr for e in self.exprs if isinstance(e,OneOrMore) ] - self.required = [ e for e in self.exprs if not isinstance(e,(Optional,ZeroOrMore,OneOrMore)) ] + self.optionals = [e.expr for e in self.exprs if isinstance(e, Optional)] + self.multioptionals = [ + e.expr for e in self.exprs if isinstance(e, ZeroOrMore) + ] + self.multirequired = [ + e.expr for e in self.exprs if isinstance(e, OneOrMore) + ] + self.required = [ + e + for e in self.exprs + if not isinstance(e, (Optional, ZeroOrMore, OneOrMore)) + ] self.required += self.multirequired self.initExprGroups = False tmpLoc = loc tmpReqd = self.required[:] - tmpOpt = self.optionals[:] + tmpOpt = self.optionals[:] matchOrder = [] keepMatching = True @@ -2532,7 +2925,7 @@ def parseImpl( self, instring, loc, doActions=True ): failed = [] for e in tmpExprs: try: - tmpLoc = e.tryParse( instring, tmpLoc ) + tmpLoc = e.tryParse(instring, tmpLoc) except ParseException: failed.append(e) else: @@ -2545,15 +2938,19 @@ def parseImpl( self, instring, loc, doActions=True ): keepMatching = False if tmpReqd: - missing = ", ".join( [ _ustr(e) for e in tmpReqd ] ) - raise ParseException(instring,loc,f"Missing one or more required elements ({missing})" ) + missing = ", ".join([_ustr(e) for e in tmpReqd]) + raise ParseException( + instring, loc, f"Missing one or more required elements ({missing})" + ) # add any unmatched Optionals, in case they have default values defined - matchOrder += list(e for e in self.exprs if isinstance(e,Optional) and e.expr in tmpOpt) + matchOrder += list( + e for e in self.exprs if isinstance(e, Optional) and e.expr in tmpOpt + ) resultlist = [] for e in matchOrder: - loc,results = e._parse(instring,loc,doActions) + loc, results = e._parse(instring, loc, doActions) resultlist.append(results) finalResults = ParseResults([]) @@ -2565,89 +2962,90 @@ def parseImpl( self, instring, loc, doActions=True ): tmp += ParseResults(r[k]) dups[k] = tmp finalResults += ParseResults(r) - for k,v in dups.items(): + for k, v in dups.items(): finalResults[k] = v return loc, finalResults - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name if self.strRepr is None: - self.strRepr = "{" + " & ".join( [ _ustr(e) for e in self.exprs ] ) + "}" + self.strRepr = "{" + " & ".join([_ustr(e) for e in self.exprs]) + "}" return self.strRepr - def checkRecursion( self, parseElementList ): - subRecCheckList = parseElementList[:] + [ self ] + def checkRecursion(self, parseElementList): + subRecCheckList = parseElementList[:] + [self] for e in self.exprs: - e.checkRecursion( subRecCheckList ) + e.checkRecursion(subRecCheckList) class ParseElementEnhance(ParserElement): """Abstract subclass of ParserElement, for combining and post-processing parsed tokens.""" - def __init__( self, expr, savelist=False ): - 
super(ParseElementEnhance,self).__init__(savelist) - if isinstance( expr, basestring ): + + def __init__(self, expr, savelist=False): + super(ParseElementEnhance, self).__init__(savelist) + if isinstance(expr, basestring): expr = Literal(expr) self.expr = expr self.strRepr = None if expr is not None: self.mayIndexError = expr.mayIndexError self.mayReturnEmpty = expr.mayReturnEmpty - self.setWhitespaceChars( expr.whiteChars ) + self.setWhitespaceChars(expr.whiteChars) self.skipWhitespace = expr.skipWhitespace self.saveAsList = expr.saveAsList self.callPreparse = expr.callPreparse self.ignoreExprs.extend(expr.ignoreExprs) - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): if self.expr is not None: - return self.expr._parse( instring, loc, doActions, callPreParse=False ) + return self.expr._parse(instring, loc, doActions, callPreParse=False) else: - raise ParseException("",loc,self.errmsg,self) + raise ParseException("", loc, self.errmsg, self) - def leaveWhitespace( self ): + def leaveWhitespace(self): self.skipWhitespace = False self.expr = self.expr.copy() if self.expr is not None: self.expr.leaveWhitespace() return self - def ignore( self, other ): - if isinstance( other, Suppress ): + def ignore(self, other): + if isinstance(other, Suppress): if other not in self.ignoreExprs: - super( ParseElementEnhance, self).ignore( other ) + super(ParseElementEnhance, self).ignore(other) if self.expr is not None: - self.expr.ignore( self.ignoreExprs[-1] ) + self.expr.ignore(self.ignoreExprs[-1]) else: - super( ParseElementEnhance, self).ignore( other ) + super(ParseElementEnhance, self).ignore(other) if self.expr is not None: - self.expr.ignore( self.ignoreExprs[-1] ) + self.expr.ignore(self.ignoreExprs[-1]) return self - def streamline( self ): - super(ParseElementEnhance,self).streamline() + def streamline(self): + super(ParseElementEnhance, self).streamline() if self.expr is not None: self.expr.streamline() return self - def checkRecursion( self, parseElementList ): + def checkRecursion(self, parseElementList): if self in parseElementList: - raise RecursiveGrammarException( parseElementList+[self] ) - subRecCheckList = parseElementList[:] + [ self ] + raise RecursiveGrammarException(parseElementList + [self]) + subRecCheckList = parseElementList[:] + [self] if self.expr is not None: - self.expr.checkRecursion( subRecCheckList ) + self.expr.checkRecursion(subRecCheckList) - def validate( self, validateTrace=[] ): - tmp = validateTrace[:]+[self] + def validate(self, validateTrace=[]): + tmp = validateTrace[:] + [self] if self.expr is not None: self.expr.validate(tmp) - self.checkRecursion( [] ) + self.checkRecursion([]) - def __str__( self ): + def __str__(self): try: - return super(ParseElementEnhance,self).__str__() + return super(ParseElementEnhance, self).__str__() except: pass @@ -2661,12 +3059,13 @@ class FollowedBy(ParseElementEnhance): does *not* advance the parsing position within the input string, it only verifies that the specified parse expression matches at the current position. 
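A minimal lookahead sketch, same import assumption:

    from whoosh.support.pyparsing import FollowedBy, Word, alphas, nums

    # Match a word only when a number follows, without consuming the number:
    label = Word(alphas) + FollowedBy(Word(nums))
    print(label.parseString("width 100"))  # -> ['width']
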
FollowedBy always returns a null token list.""" - def __init__( self, expr ): - super(FollowedBy,self).__init__(expr) + + def __init__(self, expr): + super(FollowedBy, self).__init__(expr) self.mayReturnEmpty = True - def parseImpl( self, instring, loc, doActions=True ): - self.expr.tryParse( instring, loc ) + def parseImpl(self, instring, loc, doActions=True): + self.expr.tryParse(instring, loc) return loc, [] @@ -2676,29 +3075,32 @@ class NotAny(ParseElementEnhance): verifies that the specified parse expression does *not* match at the current position. Also, NotAny does *not* skip over leading whitespace. NotAny always returns a null token list. May be constructed using the '~' operator.""" - def __init__( self, expr ): - super(NotAny,self).__init__(expr) - #~ self.leaveWhitespace() - self.skipWhitespace = False # do NOT use self.leaveWhitespace(), don't want to propagate to exprs + + def __init__(self, expr): + super(NotAny, self).__init__(expr) + # ~ self.leaveWhitespace() + self.skipWhitespace = ( + False # do NOT use self.leaveWhitespace(), don't want to propagate to exprs + ) self.mayReturnEmpty = True - self.errmsg = "Found unwanted token, "+_ustr(self.expr) - #self.myException = ParseException("",0,self.errmsg,self) + self.errmsg = "Found unwanted token, " + _ustr(self.expr) + # self.myException = ParseException("",0,self.errmsg,self) - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): try: - self.expr.tryParse( instring, loc ) - except (ParseException,IndexError): + self.expr.tryParse(instring, loc) + except (ParseException, IndexError): pass else: - #~ raise ParseException(instring, loc, self.errmsg ) + # ~ raise ParseException(instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring raise exc return loc, [] - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name if self.strRepr is None: @@ -2709,30 +3111,31 @@ def __str__( self ): class ZeroOrMore(ParseElementEnhance): """Optional repetition of zero or more of the given expression.""" - def __init__( self, expr ): - super(ZeroOrMore,self).__init__(expr) + + def __init__(self, expr): + super(ZeroOrMore, self).__init__(expr) self.mayReturnEmpty = True - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): tokens = [] try: - loc, tokens = self.expr._parse( instring, loc, doActions, callPreParse=False ) - hasIgnoreExprs = ( len(self.ignoreExprs) > 0 ) + loc, tokens = self.expr._parse(instring, loc, doActions, callPreParse=False) + hasIgnoreExprs = len(self.ignoreExprs) > 0 while 1: if hasIgnoreExprs: - preloc = self._skipIgnorables( instring, loc ) + preloc = self._skipIgnorables(instring, loc) else: preloc = loc - loc, tmptokens = self.expr._parse( instring, preloc, doActions ) + loc, tmptokens = self.expr._parse(instring, preloc, doActions) if tmptokens or tmptokens.keys(): tokens += tmptokens - except (ParseException,IndexError): + except (ParseException, IndexError): pass return loc, tokens - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name if self.strRepr is None: @@ -2740,34 +3143,35 @@ def __str__( self ): return self.strRepr - def setResultsName( self, name, listAllMatches=False ): - ret = super(ZeroOrMore,self).setResultsName(name,listAllMatches) + def setResultsName(self, name, listAllMatches=False): + ret = super(ZeroOrMore, self).setResultsName(name, 
listAllMatches) ret.saveAsList = True return ret class OneOrMore(ParseElementEnhance): """Repetition of one or more of the given expression.""" - def parseImpl( self, instring, loc, doActions=True ): + + def parseImpl(self, instring, loc, doActions=True): # must be at least one - loc, tokens = self.expr._parse( instring, loc, doActions, callPreParse=False ) + loc, tokens = self.expr._parse(instring, loc, doActions, callPreParse=False) try: - hasIgnoreExprs = ( len(self.ignoreExprs) > 0 ) + hasIgnoreExprs = len(self.ignoreExprs) > 0 while 1: if hasIgnoreExprs: - preloc = self._skipIgnorables( instring, loc ) + preloc = self._skipIgnorables(instring, loc) else: preloc = loc - loc, tmptokens = self.expr._parse( instring, preloc, doActions ) + loc, tmptokens = self.expr._parse(instring, preloc, doActions) if tmptokens or tmptokens.keys(): tokens += tmptokens - except (ParseException,IndexError): + except (ParseException, IndexError): pass return loc, tokens - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name if self.strRepr is None: @@ -2775,45 +3179,52 @@ def __str__( self ): return self.strRepr - def setResultsName( self, name, listAllMatches=False ): - ret = super(OneOrMore,self).setResultsName(name,listAllMatches) + def setResultsName(self, name, listAllMatches=False): + ret = super(OneOrMore, self).setResultsName(name, listAllMatches) ret.saveAsList = True return ret + class _NullToken(object): def __bool__(self): return False + __nonzero__ = __bool__ + def __str__(self): return "" + _optionalNotMatched = _NullToken() + + class Optional(ParseElementEnhance): """Optional matching of the given expression. - A default return string can also be specified, if the optional expression - is not found. + A default return string can also be specified, if the optional expression + is not found. """ - def __init__( self, exprs, default=_optionalNotMatched ): - super(Optional,self).__init__( exprs, savelist=False ) + + def __init__(self, exprs, default=_optionalNotMatched): + super(Optional, self).__init__(exprs, savelist=False) self.defaultValue = default self.mayReturnEmpty = True - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): try: - loc, tokens = self.expr._parse( instring, loc, doActions, callPreParse=False ) - except (ParseException,IndexError): + loc, tokens = self.expr._parse(instring, loc, doActions, callPreParse=False) + except (ParseException, IndexError): if self.defaultValue is not _optionalNotMatched: if self.expr.resultsName: - tokens = ParseResults([ self.defaultValue ]) + tokens = ParseResults([self.defaultValue]) tokens[self.expr.resultsName] = self.defaultValue else: - tokens = [ self.defaultValue ] + tokens = [self.defaultValue] else: tokens = [] return loc, tokens - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name if self.strRepr is None: @@ -2824,13 +3235,14 @@ def __str__( self ): class SkipTo(ParseElementEnhance): """Token for skipping over all undefined text until the matched expression is found. - If include is set to true, the matched expression is also parsed (the skipped text - and matched expression are returned as a 2-element list). The ignore - argument is used to define grammars (typically quoted strings and comments) that - might contain false matches. 
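# Minimal SkipTo sketch (assumed usage; output illustrative): with include=True
# the skipped text and the matched terminator come back as a 2-element group.
from whoosh.support.pyparsing import SkipTo

stmt = SkipTo(";", include=True)
print(stmt.parseString("x = 1;"))  # -> [['x = 1', ';']]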
+ If include is set to true, the matched expression is also parsed (the skipped text + and matched expression are returned as a 2-element list). The ignore + argument is used to define grammars (typically quoted strings and comments) that + might contain false matches. """ - def __init__( self, other, include=False, ignore=None, failOn=None ): - super( SkipTo, self ).__init__( other ) + + def __init__(self, other, include=False, ignore=None, failOn=None): + super(SkipTo, self).__init__(other) self.ignoreExpr = ignore self.mayReturnEmpty = True self.mayIndexError = False @@ -2840,10 +3252,10 @@ def __init__( self, other, include=False, ignore=None, failOn=None ): self.failOn = Literal(failOn) else: self.failOn = failOn - self.errmsg = "No match found for "+_ustr(self.expr) - #self.myException = ParseException("",0,self.errmsg,self) + self.errmsg = "No match found for " + _ustr(self.expr) + # self.myException = ParseException("",0,self.errmsg,self) - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): startLoc = loc instrlen = len(instring) expr = self.expr @@ -2857,28 +3269,30 @@ def parseImpl( self, instring, loc, doActions=True ): pass else: failParse = True - raise ParseException(instring, loc, "Found expression " + str(self.failOn)) + raise ParseException( + instring, loc, "Found expression " + str(self.failOn) + ) failParse = False if self.ignoreExpr is not None: while 1: try: - loc = self.ignoreExpr.tryParse(instring,loc) - print ("found ignoreExpr, advance to", loc) + loc = self.ignoreExpr.tryParse(instring, loc) + print("found ignoreExpr, advance to", loc) except ParseBaseException: break - expr._parse( instring, loc, doActions=False, callPreParse=False ) + expr._parse(instring, loc, doActions=False, callPreParse=False) skipText = instring[startLoc:loc] if self.includeMatch: - loc,mat = expr._parse(instring,loc,doActions,callPreParse=False) + loc, mat = expr._parse(instring, loc, doActions, callPreParse=False) if mat: - skipRes = ParseResults( skipText ) + skipRes = ParseResults(skipText) skipRes += mat - return loc, [ skipRes ] + return loc, [skipRes] else: - return loc, [ skipText ] + return loc, [skipText] else: - return loc, [ skipText ] - except (ParseException,IndexError): + return loc, [skipText] + except (ParseException, IndexError): if failParse: raise else: @@ -2888,57 +3302,59 @@ def parseImpl( self, instring, loc, doActions=True ): exc.pstr = instring raise exc + class Forward(ParseElementEnhance): """Forward declaration of an expression to be defined later - - used for recursive grammars, such as algebraic infix notation. - When the expression is known, it is assigned to the Forward variable using the '<<' operator. - - Note: take care when assigning to Forward not to overlook precedence of operators. - Specifically, '|' has a lower precedence than '<<', so that:: - fwdExpr << a | b | c - will actually be evaluated as:: - (fwdExpr << a) | b | c - thereby leaving b and c out as parseable alternatives. It is recommended that you - explicitly group the values inserted into the Forward:: - fwdExpr << (a | b | c) + used for recursive grammars, such as algebraic infix notation. + When the expression is known, it is assigned to the Forward variable using the '<<' operator. + + Note: take care when assigning to Forward not to overlook precedence of operators. 
+ Specifically, '|' has a lower precedence than '<<', so that:: + fwdExpr << a | b | c + will actually be evaluated as:: + (fwdExpr << a) | b | c + thereby leaving b and c out as parseable alternatives. It is recommended that you + explicitly group the values inserted into the Forward:: + fwdExpr << (a | b | c) """ - def __init__( self, other=None ): - super(Forward,self).__init__( other, savelist=False ) - def __lshift__( self, other ): - if isinstance( other, basestring ): + def __init__(self, other=None): + super(Forward, self).__init__(other, savelist=False) + + def __lshift__(self, other): + if isinstance(other, basestring): other = Literal(other) self.expr = other self.mayReturnEmpty = other.mayReturnEmpty self.strRepr = None self.mayIndexError = self.expr.mayIndexError self.mayReturnEmpty = self.expr.mayReturnEmpty - self.setWhitespaceChars( self.expr.whiteChars ) + self.setWhitespaceChars(self.expr.whiteChars) self.skipWhitespace = self.expr.skipWhitespace self.saveAsList = self.expr.saveAsList self.ignoreExprs.extend(self.expr.ignoreExprs) return None - def leaveWhitespace( self ): + def leaveWhitespace(self): self.skipWhitespace = False return self - def streamline( self ): + def streamline(self): if not self.streamlined: self.streamlined = True if self.expr is not None: self.expr.streamline() return self - def validate( self, validateTrace=[] ): + def validate(self, validateTrace=[]): if self not in validateTrace: - tmp = validateTrace[:]+[self] + tmp = validateTrace[:] + [self] if self.expr is not None: self.expr.validate(tmp) self.checkRecursion([]) - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name self._revertClass = self.__class__ @@ -2954,40 +3370,49 @@ def __str__( self ): def copy(self): if self.expr is not None: - return super(Forward,self).copy() + return super(Forward, self).copy() else: ret = Forward() ret << self return ret + class _ForwardNoRecurse(Forward): - def __str__( self ): + def __str__(self): return "..." + class TokenConverter(ParseElementEnhance): """Abstract subclass of ParseExpression, for converting parsed results.""" - def __init__( self, expr, savelist=False ): - super(TokenConverter,self).__init__( expr )#, savelist ) + + def __init__(self, expr, savelist=False): + super(TokenConverter, self).__init__(expr) # , savelist ) self.saveAsList = False + class Upcase(TokenConverter): """Converter to upper case all matching tokens.""" + def __init__(self, *args): - super(Upcase,self).__init__(*args) - warnings.warn("Upcase class is deprecated, use upcaseTokens parse action instead", - DeprecationWarning,stacklevel=2) + super(Upcase, self).__init__(*args) + warnings.warn( + "Upcase class is deprecated, use upcaseTokens parse action instead", + DeprecationWarning, + stacklevel=2, + ) - def postParse( self, instring, loc, tokenlist ): - return list(map( string.upper, tokenlist )) + def postParse(self, instring, loc, tokenlist): + return list(map(string.upper, tokenlist)) class Combine(TokenConverter): """Converter to concatenate all matching tokens to a single string. - By default, the matching patterns must also be contiguous in the input string; - this can be disabled by specifying 'adjacent=False' in the constructor. + By default, the matching patterns must also be contiguous in the input string; + this can be disabled by specifying 'adjacent=False' in the constructor. 
""" - def __init__( self, expr, joinString="", adjacent=True ): - super(Combine,self).__init__( expr ) + + def __init__(self, expr, joinString="", adjacent=True): + super(Combine, self).__init__(expr) # suppress whitespace-stripping in contained parse expressions, but re-enable it on the Combine itself if adjacent: self.leaveWhitespace() @@ -2995,388 +3420,474 @@ def __init__( self, expr, joinString="", adjacent=True ): self.skipWhitespace = True self.joinString = joinString - def ignore( self, other ): + def ignore(self, other): if self.adjacent: ParserElement.ignore(self, other) else: - super( Combine, self).ignore( other ) + super(Combine, self).ignore(other) return self - def postParse( self, instring, loc, tokenlist ): + def postParse(self, instring, loc, tokenlist): retToks = tokenlist.copy() del retToks[:] - retToks += ParseResults([ "".join(tokenlist._asStringList(self.joinString)) ], modal=self.modalResults) + retToks += ParseResults( + ["".join(tokenlist._asStringList(self.joinString))], modal=self.modalResults + ) - if self.resultsName and len(retToks.keys())>0: - return [ retToks ] + if self.resultsName and len(retToks.keys()) > 0: + return [retToks] else: return retToks + class Group(TokenConverter): """Converter to return the matched tokens as a list - useful for returning tokens of ZeroOrMore and OneOrMore expressions.""" - def __init__( self, expr ): - super(Group,self).__init__( expr ) + + def __init__(self, expr): + super(Group, self).__init__(expr) self.saveAsList = True - def postParse( self, instring, loc, tokenlist ): - return [ tokenlist ] + def postParse(self, instring, loc, tokenlist): + return [tokenlist] + class Dict(TokenConverter): """Converter to return a repetitive expression as a list, but also as a dictionary. - Each element can also be referenced using the first token in the expression as its key. - Useful for tabular report scraping when the first column can be used as a item key. + Each element can also be referenced using the first token in the expression as its key. + Useful for tabular report scraping when the first column can be used as a item key. 
""" - def __init__( self, exprs ): - super(Dict,self).__init__( exprs ) + + def __init__(self, exprs): + super(Dict, self).__init__(exprs) self.saveAsList = True - def postParse( self, instring, loc, tokenlist ): - for i,tok in enumerate(tokenlist): + def postParse(self, instring, loc, tokenlist): + for i, tok in enumerate(tokenlist): if len(tok) == 0: continue ikey = tok[0] - if isinstance(ikey,int): + if isinstance(ikey, int): ikey = _ustr(tok[0]).strip() - if len(tok)==1: - tokenlist[ikey] = _ParseResultsWithOffset("",i) - elif len(tok)==2 and not isinstance(tok[1],ParseResults): - tokenlist[ikey] = _ParseResultsWithOffset(tok[1],i) + if len(tok) == 1: + tokenlist[ikey] = _ParseResultsWithOffset("", i) + elif len(tok) == 2 and not isinstance(tok[1], ParseResults): + tokenlist[ikey] = _ParseResultsWithOffset(tok[1], i) else: - dictvalue = tok.copy() #ParseResults(i) + dictvalue = tok.copy() # ParseResults(i) del dictvalue[0] - if len(dictvalue)!= 1 or (isinstance(dictvalue,ParseResults) and dictvalue.keys()): - tokenlist[ikey] = _ParseResultsWithOffset(dictvalue,i) + if len(dictvalue) != 1 or ( + isinstance(dictvalue, ParseResults) and dictvalue.keys() + ): + tokenlist[ikey] = _ParseResultsWithOffset(dictvalue, i) else: - tokenlist[ikey] = _ParseResultsWithOffset(dictvalue[0],i) + tokenlist[ikey] = _ParseResultsWithOffset(dictvalue[0], i) if self.resultsName: - return [ tokenlist ] + return [tokenlist] else: return tokenlist class Suppress(TokenConverter): """Converter for ignoring the results of a parsed expression.""" - def postParse( self, instring, loc, tokenlist ): + + def postParse(self, instring, loc, tokenlist): return [] - def suppress( self ): + def suppress(self): return self class OnlyOnce(object): """Wrapper for parse actions, to ensure they are only called once.""" + def __init__(self, methodCall): self.callable = ParserElement._normalizeParseActionArgs(methodCall) self.called = False - def __call__(self,s,l,t): + + def __call__(self, s, l, t): if not self.called: - results = self.callable(s,l,t) + results = self.callable(s, l, t) self.called = True return results - raise ParseException(s,l,"") + raise ParseException(s, l, "") + def reset(self): self.called = False + def traceParseAction(f): """Decorator for debugging parse actions.""" f = ParserElement._normalizeParseActionArgs(f) + def z(*paArgs): thisFunc = f.func_name - s,l,t = paArgs[-3:] - if len(paArgs)>3: - thisFunc = paArgs[0].__class__.__name__ + '.' + thisFunc - sys.stderr.write( ">>entering %s(line: '%s', %d, %s)\n" % (thisFunc,line(l,s),l,t) ) + s, l, t = paArgs[-3:] + if len(paArgs) > 3: + thisFunc = paArgs[0].__class__.__name__ + "." 
+ thisFunc + sys.stderr.write( + ">>entering %s(line: '%s', %d, %s)\n" % (thisFunc, line(l, s), l, t) + ) try: ret = f(*paArgs) except ValueError as exc: - sys.stderr.write( f"<", "|".join( [ _escapeRegexChars(sym) for sym in symbols] )) + # ~ print (strs,"->", "|".join( [ _escapeRegexChars(sym) for sym in symbols] )) try: - if len(symbols)==len("".join(symbols)): - return Regex( f"[{''.join([_escapeRegexRangeChars(sym) for sym in symbols])}]" ) + if len(symbols) == len("".join(symbols)): + return Regex( + f"[{''.join([_escapeRegexRangeChars(sym) for sym in symbols])}]" + ) else: - return Regex( "|".join( [ re.escape(sym) for sym in symbols] ) ) + return Regex("|".join([re.escape(sym) for sym in symbols])) except: - warnings.warn("Exception creating Regex for oneOf, building MatchFirst", - SyntaxWarning, stacklevel=2) - + warnings.warn( + "Exception creating Regex for oneOf, building MatchFirst", + SyntaxWarning, + stacklevel=2, + ) # last resort, just use MatchFirst - return MatchFirst( [ parseElementClass(sym) for sym in symbols ] ) + return MatchFirst([parseElementClass(sym) for sym in symbols]) + -def dictOf( key, value ): +def dictOf(key, value): """Helper to easily and clearly define a dictionary by specifying the respective patterns - for the key and value. Takes care of defining the Dict, ZeroOrMore, and Group tokens - in the proper order. The key pattern can include delimiting markers or punctuation, - as long as they are suppressed, thereby leaving the significant key text. The value - pattern can include named results, so that the Dict results can include named token - fields. + for the key and value. Takes care of defining the Dict, ZeroOrMore, and Group tokens + in the proper order. The key pattern can include delimiting markers or punctuation, + as long as they are suppressed, thereby leaving the significant key text. The value + pattern can include named results, so that the Dict results can include named token + fields. """ - return Dict( ZeroOrMore( Group ( key + value ) ) ) + return Dict(ZeroOrMore(Group(key + value))) + def originalTextFor(expr, asString=True): """Helper to return the original, untokenized text for a given expression. Useful to - restore the parsed fields of an HTML start tag into the raw tag text itself, or to - revert separate tokens with intervening whitespace back to the original matching - input text. Simpler to use than the parse action keepOriginalText, and does not - require the inspect module to chase up the call stack. By default, returns a - string containing the original parsed text. - - If the optional asString argument is passed as False, then the return value is a - ParseResults containing any results names that were originally matched, and a - single token containing the original matched text from the input string. So if - the expression passed to originalTextFor contains expressions with defined - results names, you must set asString to False if you want to preserve those - results name values.""" - locMarker = Empty().setParseAction(lambda s,loc,t: loc) + restore the parsed fields of an HTML start tag into the raw tag text itself, or to + revert separate tokens with intervening whitespace back to the original matching + input text. Simpler to use than the parse action keepOriginalText, and does not + require the inspect module to chase up the call stack. By default, returns a + string containing the original parsed text. 
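# originalTextFor sketch (the tag name is an arbitrary example; output illustrative):
# recover the exact source slice spanned by a match instead of its tokens.
from whoosh.support.pyparsing import SkipTo, makeHTMLTags, originalTextFor

b_open, b_close = makeHTMLTags("b")
bold = originalTextFor(b_open + SkipTo(b_close) + b_close)
print(bold.parseString("<b>bold text</b>"))  # -> ['<b>bold text</b>']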
+ + If the optional asString argument is passed as False, then the return value is a + ParseResults containing any results names that were originally matched, and a + single token containing the original matched text from the input string. So if + the expression passed to originalTextFor contains expressions with defined + results names, you must set asString to False if you want to preserve those + results name values.""" + locMarker = Empty().setParseAction(lambda s, loc, t: loc) matchExpr = locMarker("_original_start") + expr + locMarker("_original_end") if asString: - extractText = lambda s,l,t: s[t._original_start:t._original_end] + extractText = lambda s, l, t: s[t._original_start : t._original_end] else: - def extractText(s,l,t): + + def extractText(s, l, t): del t[:] - t.insert(0, s[t._original_start:t._original_end]) + t.insert(0, s[t._original_start : t._original_end]) del t["_original_start"] del t["_original_end"] + matchExpr.setParseAction(extractText) return matchExpr - + + # convenience constants for positional expressions -empty = Empty().setName("empty") -lineStart = LineStart().setName("lineStart") -lineEnd = LineEnd().setName("lineEnd") +empty = Empty().setName("empty") +lineStart = LineStart().setName("lineStart") +lineEnd = LineEnd().setName("lineEnd") stringStart = StringStart().setName("stringStart") -stringEnd = StringEnd().setName("stringEnd") - -_escapedPunc = Word( _bslash, r"\[]-*.$+^?()~ ", exact=2 ).setParseAction(lambda s,l,t:t[0][1]) -_printables_less_backslash = "".join([ c for c in printables if c not in r"\]" ]) -_escapedHexChar = Combine( Suppress(_bslash + "0x") + Word(hexnums) ).setParseAction(lambda s,l,t:unichr(int(t[0],16))) -_escapedOctChar = Combine( Suppress(_bslash) + Word("0","01234567") ).setParseAction(lambda s,l,t:unichr(int(t[0],8))) -_singleChar = _escapedPunc | _escapedHexChar | _escapedOctChar | Word(_printables_less_backslash,exact=1) +stringEnd = StringEnd().setName("stringEnd") + +_escapedPunc = Word(_bslash, r"\[]-*.$+^?()~ ", exact=2).setParseAction( + lambda s, l, t: t[0][1] +) +_printables_less_backslash = "".join([c for c in printables if c not in r"\]"]) +_escapedHexChar = Combine(Suppress(_bslash + "0x") + Word(hexnums)).setParseAction( + lambda s, l, t: unichr(int(t[0], 16)) +) +_escapedOctChar = Combine(Suppress(_bslash) + Word("0", "01234567")).setParseAction( + lambda s, l, t: unichr(int(t[0], 8)) +) +_singleChar = ( + _escapedPunc + | _escapedHexChar + | _escapedOctChar + | Word(_printables_less_backslash, exact=1) +) _charRange = Group(_singleChar + Suppress("-") + _singleChar) -_reBracketExpr = Literal("[") + Optional("^").setResultsName("negate") + Group( OneOrMore( _charRange | _singleChar ) ).setResultsName("body") + "]" +_reBracketExpr = ( + Literal("[") + + Optional("^").setResultsName("negate") + + Group(OneOrMore(_charRange | _singleChar)).setResultsName("body") + + "]" +) + +_expanded = lambda p: ( + isinstance(p, ParseResults) + and "".join([unichr(c) for c in range(ord(p[0]), ord(p[1]) + 1)]) + or p +) -_expanded = lambda p: (isinstance(p,ParseResults) and ''.join([ unichr(c) for c in range(ord(p[0]),ord(p[1])+1) ]) or p) def srange(s): r"""Helper to easily define string ranges for use in Word construction. 
Borrows - syntax from regexp '[]' string range definitions:: - srange("[0-9]") -> "0123456789" - srange("[a-z]") -> "abcdefghijklmnopqrstuvwxyz" - srange("[a-z$_]") -> "abcdefghijklmnopqrstuvwxyz$_" - The input string must be enclosed in []'s, and the returned string is the expanded - character set joined into a single string. - The values enclosed in the []'s may be:: - a single character - an escaped character with a leading backslash (such as \- or \]) - an escaped hex character with a leading '\0x' (\0x21, which is a '!' character) - an escaped octal character with a leading '\0' (\041, which is a '!' character) - a range of any of the above, separated by a dash ('a-z', etc.) - any combination of the above ('aeiouy', 'a-zA-Z0-9_$', etc.) + syntax from regexp '[]' string range definitions:: + srange("[0-9]") -> "0123456789" + srange("[a-z]") -> "abcdefghijklmnopqrstuvwxyz" + srange("[a-z$_]") -> "abcdefghijklmnopqrstuvwxyz$_" + The input string must be enclosed in []'s, and the returned string is the expanded + character set joined into a single string. + The values enclosed in the []'s may be:: + a single character + an escaped character with a leading backslash (such as \- or \]) + an escaped hex character with a leading '\0x' (\0x21, which is a '!' character) + an escaped octal character with a leading '\0' (\041, which is a '!' character) + a range of any of the above, separated by a dash ('a-z', etc.) + any combination of the above ('aeiouy', 'a-zA-Z0-9_$', etc.) """ try: return "".join([_expanded(part) for part in _reBracketExpr.parseString(s).body]) except: return "" + def matchOnlyAtCol(n): """Helper method for defining parse actions that require matching at a specific - column in the input text. + column in the input text. """ - def verifyCol(strg,locn,toks): - if col(locn,strg) != n: - raise ParseException(strg,locn,"matched token not at column %d" % n) + + def verifyCol(strg, locn, toks): + if col(locn, strg) != n: + raise ParseException(strg, locn, "matched token not at column %d" % n) + return verifyCol + def replaceWith(replStr): """Helper method for common parse actions that simply return a literal value. Especially - useful when used with transformString(). + useful when used with transformString(). """ + def _replFunc(*args): return [replStr] + return _replFunc -def removeQuotes(s,l,t): + +def removeQuotes(s, l, t): """Helper parse action for removing quotation marks from parsed quoted strings. 
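# Concrete removeQuotes sketch (output illustrative), matching the usage note below:
from whoosh.support.pyparsing import quotedString, removeQuotes

qs = quotedString.copy()
qs.setParseAction(removeQuotes)
print(qs.parseString('"hello"'))  # -> ['hello'], quotes stripped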
- To use, add this parse action to quoted string using:: - quotedString.setParseAction( removeQuotes ) + To use, add this parse action to quoted string using:: + quotedString.setParseAction( removeQuotes ) """ return t[0][1:-1] -def upcaseTokens(s,l,t): + +def upcaseTokens(s, l, t): """Helper parse action to convert tokens to upper case.""" - return [ tt.upper() for tt in map(_ustr,t) ] + return [tt.upper() for tt in map(_ustr, t)] -def downcaseTokens(s,l,t): + +def downcaseTokens(s, l, t): """Helper parse action to convert tokens to lower case.""" - return [ tt.lower() for tt in map(_ustr,t) ] + return [tt.lower() for tt in map(_ustr, t)] + -def keepOriginalText(s,startLoc,t): +def keepOriginalText(s, startLoc, t): """Helper parse action to preserve original parsed text, - overriding any nested parse actions.""" + overriding any nested parse actions.""" try: endloc = getTokensEndLoc() except ParseException: - raise ParseFatalException("incorrect usage of keepOriginalText - may only be called as a parse action") + raise ParseFatalException( + "incorrect usage of keepOriginalText - may only be called as a parse action" + ) del t[:] t += ParseResults(s[startLoc:endloc]) return t + def getTokensEndLoc(): """Method to be called from within a parse action to determine the end - location of the parsed tokens.""" + location of the parsed tokens.""" import inspect + fstack = inspect.stack() try: # search up the stack (through intervening argument normalizers) for correct calling routine @@ -3385,268 +3896,368 @@ def getTokensEndLoc(): endloc = f[0].f_locals["loc"] return endloc else: - raise ParseFatalException("incorrect usage of getTokensEndLoc - may only be called from within a parse action") + raise ParseFatalException( + "incorrect usage of getTokensEndLoc - may only be called from within a parse action" + ) finally: del fstack + def _makeTags(tagStr, xml): """Internal helper to construct opening and closing tag expressions, given a tag name""" - if isinstance(tagStr,basestring): + if isinstance(tagStr, basestring): resname = tagStr tagStr = Keyword(tagStr, caseless=not xml) else: resname = tagStr.name - tagAttrName = Word(alphas,alphanums+"_-:") - if (xml): - tagAttrValue = dblQuotedString.copy().setParseAction( removeQuotes ) - openTag = Suppress("<") + tagStr + \ - Dict(ZeroOrMore(Group( tagAttrName + Suppress("=") + tagAttrValue ))) + \ - Optional("/",default=[False]).setResultsName("empty").setParseAction(lambda s,l,t:t[0]=='/') + Suppress(">") + tagAttrName = Word(alphas, alphanums + "_-:") + if xml: + tagAttrValue = dblQuotedString.copy().setParseAction(removeQuotes) + openTag = ( + Suppress("<") + + tagStr + + Dict(ZeroOrMore(Group(tagAttrName + Suppress("=") + tagAttrValue))) + + Optional("/", default=[False]) + .setResultsName("empty") + .setParseAction(lambda s, l, t: t[0] == "/") + + Suppress(">") + ) else: - printablesLessRAbrack = "".join( [ c for c in printables if c not in ">" ] ) - tagAttrValue = quotedString.copy().setParseAction( removeQuotes ) | Word(printablesLessRAbrack) - openTag = Suppress("<") + tagStr + \ - Dict(ZeroOrMore(Group( tagAttrName.setParseAction(downcaseTokens) + \ - Optional( Suppress("=") + tagAttrValue ) ))) + \ - Optional("/",default=[False]).setResultsName("empty").setParseAction(lambda s,l,t:t[0]=='/') + Suppress(">") + printablesLessRAbrack = "".join([c for c in printables if c not in ">"]) + tagAttrValue = quotedString.copy().setParseAction(removeQuotes) | Word( + printablesLessRAbrack + ) + openTag = ( + Suppress("<") + + tagStr + + Dict( + 
ZeroOrMore(
+                Group(
+                    tagAttrName.setParseAction(downcaseTokens)
+                    + Optional(Suppress("=") + tagAttrValue)
+                )
+            )
+        )
+        + Optional("/", default=[False])
+        .setResultsName("empty")
+        .setParseAction(lambda s, l, t: t[0] == "/")
+        + Suppress(">")
+    )
     closeTag = Combine(_L("</") + tagStr + ">")
-    openTag = openTag.setResultsName("start"+"".join(resname.replace(":"," ").title().split())).setName(f"<{tagStr}>")
-    closeTag = closeTag.setResultsName("end"+"".join(resname.replace(":"," ").title().split())).setName(f"</{tagStr}>")
+    openTag = openTag.setResultsName(
+        "start" + "".join(resname.replace(":", " ").title().split())
+    ).setName(f"<{tagStr}>")
+    closeTag = closeTag.setResultsName(
+        "end" + "".join(resname.replace(":", " ").title().split())
+    ).setName(f"</{tagStr}>")
     return openTag, closeTag

+
 def makeHTMLTags(tagStr):
     """Helper to construct opening and closing tag expressions for HTML, given a tag name"""
-    return _makeTags( tagStr, False )
+    return _makeTags(tagStr, False)

+
 def makeXMLTags(tagStr):
     """Helper to construct opening and closing tag expressions for XML, given a tag name"""
-    return _makeTags( tagStr, True )

-def withAttribute(*args,**attrDict):
+
+def withAttribute(*args, **attrDict):
     """Helper to create a validating parse action to be used with start tags created
-       with makeXMLTags or makeHTMLTags. Use withAttribute to qualify a starting tag
-       with a required attribute value, to avoid false matches on common tags such as
-       <TD> or <DIV>.
-
-       Call withAttribute with a series of attribute names and values. Specify the list
-       of filter attributes names and values as:
-        - keyword arguments, as in (class="Customer",align="right"), or
-        - a list of name-value tuples, as in ( ("ns1:class", "Customer"), ("ns2:align","right") )
-       For attribute names with a namespace prefix, you must use the second form. Attribute
-       names are matched insensitive to upper/lower case.
-
-       To verify that the attribute exists, but without specifying a value, pass
-       withAttribute.ANY_VALUE as the value.
-    """
+    with makeXMLTags or makeHTMLTags. Use withAttribute to qualify a starting tag
+    with a required attribute value, to avoid false matches on common tags such as
+    <TD> or <DIV>.
+
+    Call withAttribute with a series of attribute names and values. Specify the list
+    of filter attributes names and values as:
+    - keyword arguments, as in (class="Customer",align="right"), or
+    - a list of name-value tuples, as in ( ("ns1:class", "Customer"), ("ns2:align","right") )
+    For attribute names with a namespace prefix, you must use the second form. Attribute
+    names are matched insensitive to upper/lower case.
+
+    To verify that the attribute exists, but without specifying a value, pass
+    withAttribute.ANY_VALUE as the value.
+    """
     if args:
         attrs = args[:]
     else:
         attrs = attrDict.items()
-    attrs = [(k,v) for k,v in attrs]
-    def pa(s,l,tokens):
-        for attrName,attrValue in attrs:
+    attrs = [(k, v) for k, v in attrs]
+
+    def pa(s, l, tokens):
+        for attrName, attrValue in attrs:
             if attrName not in tokens:
-                raise ParseException(s,l,"no matching attribute " + attrName)
+                raise ParseException(s, l, "no matching attribute " + attrName)
             if attrValue != withAttribute.ANY_VALUE and tokens[attrName] != attrValue:
-                raise ParseException(s,l,"attribute '%s' has value '%s', must be '%s'" %
-                                            (attrName, tokens[attrName], attrValue))
+                raise ParseException(
+                    s,
+                    l,
+                    "attribute '%s' has value '%s', must be '%s'"
+                    % (attrName, tokens[attrName], attrValue),
+                )
+
     return pa
+
+
 withAttribute.ANY_VALUE = object()

 opAssoc = _Constants()
 opAssoc.LEFT = object()
 opAssoc.RIGHT = object()

-def operatorPrecedence( baseExpr, opList ):
+
+def operatorPrecedence(baseExpr, opList):
     """Helper method for constructing grammars of expressions made up of
-       operators working in a precedence hierarchy. Operators may be unary or
-       binary, left- or right-associative. Parse actions can also be attached
-       to operator expressions.
-
-       Parameters:
-        - baseExpr - expression representing the most basic element for the nested
-        - opList - list of tuples, one for each operator precedence level in the
-          expression grammar; each tuple is of the form
-          (opExpr, numTerms, rightLeftAssoc, parseAction), where:
-           - opExpr is the pyparsing expression for the operator;
-              may also be a string, which will be converted to a Literal;
-              if numTerms is 3, opExpr is a tuple of two expressions, for the
-              two operators separating the 3 terms
-           - numTerms is the number of terms for this operator (must
-              be 1, 2, or 3)
-           - rightLeftAssoc is the indicator whether the operator is
-              right or left associative, using the pyparsing-defined
-              constants opAssoc.RIGHT and opAssoc.LEFT.
-           - parseAction is the parse action to be associated with
-              expressions matching this operator expression (the
-              parse action tuple member may be omitted)
+    operators working in a precedence hierarchy. Operators may be unary or
+    binary, left- or right-associative. Parse actions can also be attached
+    to operator expressions.
+
+    Parameters:
+    - baseExpr - expression representing the most basic element for the nested
+    - opList - list of tuples, one for each operator precedence level in the
+      expression grammar; each tuple is of the form
+      (opExpr, numTerms, rightLeftAssoc, parseAction), where:
+      - opExpr is the pyparsing expression for the operator;
+        may also be a string, which will be converted to a Literal;
+        if numTerms is 3, opExpr is a tuple of two expressions, for the
+        two operators separating the 3 terms
+      - numTerms is the number of terms for this operator (must
+        be 1, 2, or 3)
+      - rightLeftAssoc is the indicator whether the operator is
+        right or left associative, using the pyparsing-defined
+        constants opAssoc.RIGHT and opAssoc.LEFT.
+ - parseAction is the parse action to be associated with + expressions matching this operator expression (the + parse action tuple member may be omitted) """ ret = Forward() - lastExpr = baseExpr | ( Suppress('(') + ret + Suppress(')') ) - for i,operDef in enumerate(opList): - opExpr,arity,rightLeftAssoc,pa = (operDef + (None,))[:4] + lastExpr = baseExpr | (Suppress("(") + ret + Suppress(")")) + for i, operDef in enumerate(opList): + opExpr, arity, rightLeftAssoc, pa = (operDef + (None,))[:4] if arity == 3: if opExpr is None or len(opExpr) != 2: - raise ValueError("if numterms=3, opExpr must be a tuple or list of two expressions") + raise ValueError( + "if numterms=3, opExpr must be a tuple or list of two expressions" + ) opExpr1, opExpr2 = opExpr - thisExpr = Forward()#.setName("expr%d" % i) + thisExpr = Forward() # .setName("expr%d" % i) if rightLeftAssoc == opAssoc.LEFT: if arity == 1: - matchExpr = FollowedBy(lastExpr + opExpr) + Group( lastExpr + OneOrMore( opExpr ) ) + matchExpr = FollowedBy(lastExpr + opExpr) + Group( + lastExpr + OneOrMore(opExpr) + ) elif arity == 2: if opExpr is not None: - matchExpr = FollowedBy(lastExpr + opExpr + lastExpr) + Group( lastExpr + OneOrMore( opExpr + lastExpr ) ) + matchExpr = FollowedBy(lastExpr + opExpr + lastExpr) + Group( + lastExpr + OneOrMore(opExpr + lastExpr) + ) else: - matchExpr = FollowedBy(lastExpr+lastExpr) + Group( lastExpr + OneOrMore(lastExpr) ) + matchExpr = FollowedBy(lastExpr + lastExpr) + Group( + lastExpr + OneOrMore(lastExpr) + ) elif arity == 3: - matchExpr = FollowedBy(lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr) + \ - Group( lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr ) + matchExpr = FollowedBy( + lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr + ) + Group(lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr) else: - raise ValueError("operator must be unary (1), binary (2), or ternary (3)") + raise ValueError( + "operator must be unary (1), binary (2), or ternary (3)" + ) elif rightLeftAssoc == opAssoc.RIGHT: if arity == 1: # try to avoid LR with this extra test if not isinstance(opExpr, Optional): opExpr = Optional(opExpr) - matchExpr = FollowedBy(opExpr.expr + thisExpr) + Group( opExpr + thisExpr ) + matchExpr = FollowedBy(opExpr.expr + thisExpr) + Group( + opExpr + thisExpr + ) elif arity == 2: if opExpr is not None: - matchExpr = FollowedBy(lastExpr + opExpr + thisExpr) + Group( lastExpr + OneOrMore( opExpr + thisExpr ) ) + matchExpr = FollowedBy(lastExpr + opExpr + thisExpr) + Group( + lastExpr + OneOrMore(opExpr + thisExpr) + ) else: - matchExpr = FollowedBy(lastExpr + thisExpr) + Group( lastExpr + OneOrMore( thisExpr ) ) + matchExpr = FollowedBy(lastExpr + thisExpr) + Group( + lastExpr + OneOrMore(thisExpr) + ) elif arity == 3: - matchExpr = FollowedBy(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr) + \ - Group( lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr ) + matchExpr = FollowedBy( + lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr + ) + Group(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr) else: - raise ValueError("operator must be unary (1), binary (2), or ternary (3)") + raise ValueError( + "operator must be unary (1), binary (2), or ternary (3)" + ) else: raise ValueError("operator must indicate right or left associativity") if pa: - matchExpr.setParseAction( pa ) - thisExpr << ( matchExpr | lastExpr ) + matchExpr.setParseAction(pa) + thisExpr << (matchExpr | lastExpr) lastExpr = thisExpr ret << lastExpr return ret -dblQuotedString = 
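# operatorPrecedence sketch (hedged; grammar and output are illustrative):
# '*' binds tighter than '+' because it appears earlier in the level list.
from whoosh.support.pyparsing import Word, nums, opAssoc, operatorPrecedence

arith = operatorPrecedence(
    Word(nums),
    [("*", 2, opAssoc.LEFT), ("+", 2, opAssoc.LEFT)],
)
print(arith.parseString("1+2*3"))  # -> [['1', '+', ['2', '*', '3']]]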
Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\x[0-9a-fA-F]+)|(?:\\.))*"').setName("string enclosed in double quotes") -sglQuotedString = Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\x[0-9a-fA-F]+)|(?:\\.))*'").setName("string enclosed in single quotes") -quotedString = Regex(r'''(?:"(?:[^"\n\r\\]|(?:"")|(?:\\x[0-9a-fA-F]+)|(?:\\.))*")|(?:'(?:[^'\n\r\\]|(?:'')|(?:\\x[0-9a-fA-F]+)|(?:\\.))*')''').setName("quotedString using single or double quotes") -unicodeString = Combine(_L('u') + quotedString.copy()) + +dblQuotedString = Regex( + r'"(?:[^"\n\r\\]|(?:"")|(?:\\x[0-9a-fA-F]+)|(?:\\.))*"' +).setName("string enclosed in double quotes") +sglQuotedString = Regex( + r"'(?:[^'\n\r\\]|(?:'')|(?:\\x[0-9a-fA-F]+)|(?:\\.))*'" +).setName("string enclosed in single quotes") +quotedString = Regex( + r"""(?:"(?:[^"\n\r\\]|(?:"")|(?:\\x[0-9a-fA-F]+)|(?:\\.))*")|(?:'(?:[^'\n\r\\]|(?:'')|(?:\\x[0-9a-fA-F]+)|(?:\\.))*')""" +).setName("quotedString using single or double quotes") +unicodeString = Combine(_L("u") + quotedString.copy()) + def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString): """Helper method for defining nested lists enclosed in opening and closing - delimiters ("(" and ")" are the default). - - Parameters: - - opener - opening character for a nested list (default="("); can also be a pyparsing expression - - closer - closing character for a nested list (default=")"); can also be a pyparsing expression - - content - expression for items within the nested lists (default=None) - - ignoreExpr - expression for ignoring opening and closing delimiters (default=quotedString) - - If an expression is not provided for the content argument, the nested - expression will capture all whitespace-delimited content between delimiters - as a list of separate values. - - Use the ignoreExpr argument to define expressions that may contain - opening or closing characters that should not be treated as opening - or closing characters for nesting, such as quotedString or a comment - expression. Specify multiple expressions using an Or or MatchFirst. - The default is quotedString, but if no expressions are to be ignored, - then pass None for this argument. + delimiters ("(" and ")" are the default). + + Parameters: + - opener - opening character for a nested list (default="("); can also be a pyparsing expression + - closer - closing character for a nested list (default=")"); can also be a pyparsing expression + - content - expression for items within the nested lists (default=None) + - ignoreExpr - expression for ignoring opening and closing delimiters (default=quotedString) + + If an expression is not provided for the content argument, the nested + expression will capture all whitespace-delimited content between delimiters + as a list of separate values. + + Use the ignoreExpr argument to define expressions that may contain + opening or closing characters that should not be treated as opening + or closing characters for nesting, such as quotedString or a comment + expression. Specify multiple expressions using an Or or MatchFirst. + The default is quotedString, but if no expressions are to be ignored, + then pass None for this argument. 
""" if opener == closer: raise ValueError("opening and closing strings cannot be the same") if content is None: - if isinstance(opener,basestring) and isinstance(closer,basestring): - if len(opener) == 1 and len(closer)==1: + if isinstance(opener, basestring) and isinstance(closer, basestring): + if len(opener) == 1 and len(closer) == 1: if ignoreExpr is not None: - content = (Combine(OneOrMore(~ignoreExpr + - CharsNotIn(opener+closer+ParserElement.DEFAULT_WHITE_CHARS,exact=1)) - ).setParseAction(lambda t:t[0].strip())) + content = Combine( + OneOrMore( + ~ignoreExpr + + CharsNotIn( + opener + closer + ParserElement.DEFAULT_WHITE_CHARS, + exact=1, + ) + ) + ).setParseAction(lambda t: t[0].strip()) else: - content = (empty+CharsNotIn(opener+closer+ParserElement.DEFAULT_WHITE_CHARS - ).setParseAction(lambda t:t[0].strip())) + content = empty + CharsNotIn( + opener + closer + ParserElement.DEFAULT_WHITE_CHARS + ).setParseAction(lambda t: t[0].strip()) else: if ignoreExpr is not None: - content = (Combine(OneOrMore(~ignoreExpr + - ~Literal(opener) + ~Literal(closer) + - CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS,exact=1)) - ).setParseAction(lambda t:t[0].strip())) + content = Combine( + OneOrMore( + ~ignoreExpr + + ~Literal(opener) + + ~Literal(closer) + + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1) + ) + ).setParseAction(lambda t: t[0].strip()) else: - content = (Combine(OneOrMore(~Literal(opener) + ~Literal(closer) + - CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS,exact=1)) - ).setParseAction(lambda t:t[0].strip())) + content = Combine( + OneOrMore( + ~Literal(opener) + + ~Literal(closer) + + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1) + ) + ).setParseAction(lambda t: t[0].strip()) else: - raise ValueError("opening and closing arguments must be strings if no content expression is given") + raise ValueError( + "opening and closing arguments must be strings if no content expression is given" + ) ret = Forward() if ignoreExpr is not None: - ret << Group( Suppress(opener) + ZeroOrMore( ignoreExpr | ret | content ) + Suppress(closer) ) + ret << Group( + Suppress(opener) + ZeroOrMore(ignoreExpr | ret | content) + Suppress(closer) + ) else: - ret << Group( Suppress(opener) + ZeroOrMore( ret | content ) + Suppress(closer) ) + ret << Group(Suppress(opener) + ZeroOrMore(ret | content) + Suppress(closer)) return ret + def indentedBlock(blockStatementExpr, indentStack, indent=True): """Helper method for defining space-delimited indentation blocks, such as - those used to define block statements in Python source code. - - Parameters: - - blockStatementExpr - expression defining syntax of statement that - is repeated within the indented block - - indentStack - list created by caller to manage indentation stack - (multiple statementWithIndentedBlock expressions within a single grammar - should share a common indentStack) - - indent - boolean indicating whether block must be indented beyond the - the current level; set to False for block of left-most statements - (default=True) - - A valid block must contain at least one blockStatement. + those used to define block statements in Python source code. 
+ + Parameters: + - blockStatementExpr - expression defining syntax of statement that + is repeated within the indented block + - indentStack - list created by caller to manage indentation stack + (multiple statementWithIndentedBlock expressions within a single grammar + should share a common indentStack) + - indent - boolean indicating whether block must be indented beyond the + the current level; set to False for block of left-most statements + (default=True) + + A valid block must contain at least one blockStatement. """ - def checkPeerIndent(s,l,t): - if l >= len(s): return - curCol = col(l,s) + + def checkPeerIndent(s, l, t): + if l >= len(s): + return + curCol = col(l, s) if curCol != indentStack[-1]: if curCol > indentStack[-1]: - raise ParseFatalException(s,l,"illegal nesting") - raise ParseException(s,l,"not a peer entry") + raise ParseFatalException(s, l, "illegal nesting") + raise ParseException(s, l, "not a peer entry") - def checkSubIndent(s,l,t): - curCol = col(l,s) + def checkSubIndent(s, l, t): + curCol = col(l, s) if curCol > indentStack[-1]: - indentStack.append( curCol ) + indentStack.append(curCol) else: - raise ParseException(s,l,"not a subentry") - - def checkUnindent(s,l,t): - if l >= len(s): return - curCol = col(l,s) - if not(indentStack and curCol < indentStack[-1] and curCol <= indentStack[-2]): - raise ParseException(s,l,"not an unindent") + raise ParseException(s, l, "not a subentry") + + def checkUnindent(s, l, t): + if l >= len(s): + return + curCol = col(l, s) + if not (indentStack and curCol < indentStack[-1] and curCol <= indentStack[-2]): + raise ParseException(s, l, "not an unindent") indentStack.pop() NL = OneOrMore(LineEnd().setWhitespaceChars("\t ").suppress()) INDENT = Empty() + Empty().setParseAction(checkSubIndent) - PEER = Empty().setParseAction(checkPeerIndent) + PEER = Empty().setParseAction(checkPeerIndent) UNDENT = Empty().setParseAction(checkUnindent) if indent: - smExpr = Group( Optional(NL) + - FollowedBy(blockStatementExpr) + - INDENT + (OneOrMore( PEER + Group(blockStatementExpr) + Optional(NL) )) + UNDENT) + smExpr = Group( + Optional(NL) + + FollowedBy(blockStatementExpr) + + INDENT + + (OneOrMore(PEER + Group(blockStatementExpr) + Optional(NL))) + + UNDENT + ) else: - smExpr = Group( Optional(NL) + - (OneOrMore( PEER + Group(blockStatementExpr) + Optional(NL) )) ) + smExpr = Group( + Optional(NL) + (OneOrMore(PEER + Group(blockStatementExpr) + Optional(NL))) + ) blockStatementExpr.ignore(_bslash + LineEnd()) return smExpr + alphas8bit = srange(r"[\0xc0-\0xd6\0xd8-\0xf6\0xf8-\0xff]") punc8bit = srange(r"[\0xa1-\0xbf\0xd7\0xf7]") -anyOpenTag,anyCloseTag = makeHTMLTags(Word(alphas,alphanums+"_:")) -commonHTMLEntity = Combine(_L("&") + oneOf("gt lt amp nbsp quot").setResultsName("entity") +";").streamline() -_htmlEntityMap = dict(zip("gt lt amp nbsp quot".split(),'><& "')) -replaceHTMLEntity = lambda t : t.entity in _htmlEntityMap and _htmlEntityMap[t.entity] or None +anyOpenTag, anyCloseTag = makeHTMLTags(Word(alphas, alphanums + "_:")) +commonHTMLEntity = Combine( + _L("&") + oneOf("gt lt amp nbsp quot").setResultsName("entity") + ";" +).streamline() +_htmlEntityMap = dict(zip("gt lt amp nbsp quot".split(), '><& "')) +replaceHTMLEntity = ( + lambda t: t.entity in _htmlEntityMap and _htmlEntityMap[t.entity] or None +) # it's easy to get these comment structures wrong - they're very common, so may as well make them available cStyleComment = Regex(r"/\*(?:[^*]*\*+)+?/").setName("C style comment") @@ -3654,56 +4265,66 @@ def 
checkUnindent(s,l,t): htmlComment = Regex(r"") restOfLine = Regex(r".*").leaveWhitespace() dblSlashComment = Regex(r"\/\/(\\\n|.)*").setName("// comment") -cppStyleComment = Regex(r"/(?:\*(?:[^*]*\*+)+?/|/[^\n]*(?:\n[^\n]*)*?(?:(?" + str(tokenlist)) - print ("tokens = " + str(tokens)) - print ("tokens.columns = " + str(tokens.columns)) - print ("tokens.tables = " + str(tokens.tables)) - print (tokens.asXML("SQL",True)) + print(teststring + "->" + str(tokenlist)) + print("tokens = " + str(tokens)) + print("tokens.columns = " + str(tokens.columns)) + print("tokens.tables = " + str(tokens.tables)) + print(tokens.asXML("SQL", True)) except ParseBaseException as err: - print (teststring + "->") - print (err.line) - print (" "*(err.column-1) + "^") - print (err) + print(teststring + "->") + print(err.line) + print(" " * (err.column - 1) + "^") + print(err) print() - selectToken = CaselessLiteral( "select" ) - fromToken = CaselessLiteral( "from" ) - - ident = Word( alphas, alphanums + "_$" ) - columnName = delimitedList( ident, ".", combine=True ).setParseAction( upcaseTokens ) - columnNameList = Group( delimitedList( columnName ) )#.setName("columns") - tableName = delimitedList( ident, ".", combine=True ).setParseAction( upcaseTokens ) - tableNameList = Group( delimitedList( tableName ) )#.setName("tables") - simpleSQL = ( selectToken + \ - ( '*' | columnNameList ).setResultsName( "columns" ) + \ - fromToken + \ - tableNameList.setResultsName( "tables" ) ) - - test( "SELECT * from XYZZY, ABC" ) - test( "select * from SYS.XYZZY" ) - test( "Select A from Sys.dual" ) - test( "Select AA,BB,CC from Sys.dual" ) - test( "Select A, B, C from Sys.dual" ) - test( "Select A, B, C from Sys.dual" ) - test( "Xelect A, B, C from Sys.dual" ) - test( "Select A, B, C frox Sys.dual" ) - test( "Select" ) - test( "Select ^^^ frox Sys.dual" ) - test( "Select A, B, C from Sys.dual, Table2 " ) + selectToken = CaselessLiteral("select") + fromToken = CaselessLiteral("from") + + ident = Word(alphas, alphanums + "_$") + columnName = delimitedList(ident, ".", combine=True).setParseAction(upcaseTokens) + columnNameList = Group(delimitedList(columnName)) # .setName("columns") + tableName = delimitedList(ident, ".", combine=True).setParseAction(upcaseTokens) + tableNameList = Group(delimitedList(tableName)) # .setName("tables") + simpleSQL = ( + selectToken + + ("*" | columnNameList).setResultsName("columns") + + fromToken + + tableNameList.setResultsName("tables") + ) + + test("SELECT * from XYZZY, ABC") + test("select * from SYS.XYZZY") + test("Select A from Sys.dual") + test("Select AA,BB,CC from Sys.dual") + test("Select A, B, C from Sys.dual") + test("Select A, B, C from Sys.dual") + test("Xelect A, B, C from Sys.dual") + test("Select A, B, C frox Sys.dual") + test("Select") + test("Select ^^^ frox Sys.dual") + test("Select A, B, C from Sys.dual, Table2 ") diff --git a/src/whoosh/support/relativedelta.py b/src/whoosh/support/relativedelta.py index 9604907a..552f0e4e 100644 --- a/src/whoosh/support/relativedelta.py +++ b/src/whoosh/support/relativedelta.py @@ -7,8 +7,8 @@ __author__ = "Gustavo Niemeyer " __license__ = "PSF License" -import datetime import calendar +import datetime __all__ = ["relativedelta", "MO", "TU", "WE", "TH", "FR", "SA", "SU"] @@ -41,82 +41,100 @@ def __repr__(self): else: return "%s(%+d)" % (s, self.n) + MO, TU, WE, TH, FR, SA, SU = weekdays = tuple([weekday(x) for x in range(7)]) class relativedelta: """ -The relativedelta type is based on the specification of the excellent -work done by M.-A. 
Lemburg in his mx.DateTime extension. However, -notice that this type does *NOT* implement the same algorithm as -his work. Do *NOT* expect it to behave like mx.DateTime's counterpart. + The relativedelta type is based on the specification of the excellent + work done by M.-A. Lemburg in his mx.DateTime extension. However, + notice that this type does *NOT* implement the same algorithm as + his work. Do *NOT* expect it to behave like mx.DateTime's counterpart. -There's two different ways to build a relativedelta instance. The -first one is passing it two date/datetime classes: + There's two different ways to build a relativedelta instance. The + first one is passing it two date/datetime classes: - relativedelta(datetime1, datetime2) + relativedelta(datetime1, datetime2) -And the other way is to use the following keyword arguments: + And the other way is to use the following keyword arguments: - year, month, day, hour, minute, second, microsecond: - Absolute information. + year, month, day, hour, minute, second, microsecond: + Absolute information. - years, months, weeks, days, hours, minutes, seconds, microseconds: - Relative information, may be negative. + years, months, weeks, days, hours, minutes, seconds, microseconds: + Relative information, may be negative. - weekday: - One of the weekday instances (MO, TU, etc). These instances may - receive a parameter N, specifying the Nth weekday, which could - be positive or negative (like MO(+1) or MO(-2). Not specifying - it is the same as specifying +1. You can also use an integer, - where 0=MO. + weekday: + One of the weekday instances (MO, TU, etc). These instances may + receive a parameter N, specifying the Nth weekday, which could + be positive or negative (like MO(+1) or MO(-2). Not specifying + it is the same as specifying +1. You can also use an integer, + where 0=MO. - leapdays: - Will add given days to the date found, if year is a leap - year, and the date found is post 28 of february. + leapdays: + Will add given days to the date found, if year is a leap + year, and the date found is post 28 of february. - yearday, nlyearday: - Set the yearday or the non-leap year day (jump leap days). - These are converted to day/month/leapdays information. + yearday, nlyearday: + Set the yearday or the non-leap year day (jump leap days). + These are converted to day/month/leapdays information. -Here is the behavior of operations with relativedelta: + Here is the behavior of operations with relativedelta: -1) Calculate the absolute year, using the 'year' argument, or the - original datetime year, if the argument is not present. + 1) Calculate the absolute year, using the 'year' argument, or the + original datetime year, if the argument is not present. -2) Add the relative 'years' argument to the absolute year. + 2) Add the relative 'years' argument to the absolute year. -3) Do steps 1 and 2 for month/months. + 3) Do steps 1 and 2 for month/months. -4) Calculate the absolute day, using the 'day' argument, or the - original datetime day, if the argument is not present. Then, - subtract from the day until it fits in the year and month - found after their operations. + 4) Calculate the absolute day, using the 'day' argument, or the + original datetime day, if the argument is not present. Then, + subtract from the day until it fits in the year and month + found after their operations. -5) Add the relative 'days' argument to the absolute day. Notice - that the 'weeks' argument is multiplied by 7 and added to - 'days'. 
+ 5) Add the relative 'days' argument to the absolute day. Notice + that the 'weeks' argument is multiplied by 7 and added to + 'days'. -6) Do steps 1 and 2 for hour/hours, minute/minutes, second/seconds, - microsecond/microseconds. + 6) Do steps 1 and 2 for hour/hours, minute/minutes, second/seconds, + microsecond/microseconds. -7) If the 'weekday' argument is present, calculate the weekday, - with the given (wday, nth) tuple. wday is the index of the - weekday (0-6, 0=Mon), and nth is the number of weeks to add - forward or backward, depending on its signal. Notice that if - the calculated date is already Monday, for example, using - (0, 1) or (0, -1) won't change the day. + 7) If the 'weekday' argument is present, calculate the weekday, + with the given (wday, nth) tuple. wday is the index of the + weekday (0-6, 0=Mon), and nth is the number of weeks to add + forward or backward, depending on its signal. Notice that if + the calculated date is already Monday, for example, using + (0, 1) or (0, -1) won't change the day. """ - def __init__(self, dt1=None, dt2=None, - years=0, months=0, days=0, leapdays=0, weeks=0, - hours=0, minutes=0, seconds=0, microseconds=0, - year=None, month=None, day=None, weekday=None, - yearday=None, nlyearday=None, - hour=None, minute=None, second=None, microsecond=None): + def __init__( + self, + dt1=None, + dt2=None, + years=0, + months=0, + days=0, + leapdays=0, + weeks=0, + hours=0, + minutes=0, + seconds=0, + microseconds=0, + year=None, + month=None, + day=None, + weekday=None, + yearday=None, + nlyearday=None, + hour=None, + minute=None, + second=None, + microsecond=None, + ): if dt1 and dt2: - if not isinstance(dt1, datetime.date) or \ - not isinstance(dt2, datetime.date): + if not isinstance(dt1, datetime.date) or not isinstance(dt2, datetime.date): raise TypeError("relativedelta only diffs datetime/date") if type(dt1) is not type(dt2): if not isinstance(dt1, datetime.datetime): @@ -187,8 +205,7 @@ def __init__(self, dt1=None, dt2=None, if yearday > 59: self.leapdays = -1 if yday: - ydayidx = [31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, - 366] + ydayidx = [31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 366] for idx, ydays in enumerate(ydayidx): if yday <= ydays: self.month = idx + 1 @@ -228,9 +245,16 @@ def _fix(self): div, mod = divmod(self.months * s, 12) self.months = mod * s self.years += div * s - if (self.hours or self.minutes or self.seconds or self.microseconds or - self.hour is not None or self.minute is not None or - self.second is not None or self.microsecond is not None): + if ( + self.hours + or self.minutes + or self.seconds + or self.microseconds + or self.hour is not None + or self.minute is not None + or self.second is not None + or self.microsecond is not None + ): self._has_time = 1 else: self._has_time = 0 @@ -261,8 +285,7 @@ def __radd__(self, other): elif month < 1: year -= 1 month += 12 - day = min(calendar.monthrange(year, month)[1], - self.day or other.day) + day = min(calendar.monthrange(year, month)[1], self.day or other.day) repl = {"year": year, "month": month, "day": day} for attr in ["hour", "minute", "second", "microsecond"]: value = getattr(self, attr) @@ -271,12 +294,13 @@ def __radd__(self, other): days = self.days if self.leapdays and month > 2 and calendar.isleap(year): days += self.leapdays - ret = (other.replace(**repl) - + datetime.timedelta(days=days, - hours=self.hours, - minutes=self.minutes, - seconds=self.seconds, - microseconds=self.microseconds)) + ret = other.replace(**repl) + 
datetime.timedelta( + days=days, + hours=self.hours, + minutes=self.minutes, + seconds=self.seconds, + microseconds=self.microseconds, + ) if self.weekday: weekday, nth = self.weekday.weekday, self.weekday.n or 1 jumpdays = (abs(nth) - 1) * 7 @@ -294,99 +318,109 @@ def __rsub__(self, other): def __add__(self, other): if not isinstance(other, relativedelta): raise TypeError("unsupported type for add operation") - return relativedelta(years=other.years + self.years, - months=other.months + self.months, - days=other.days + self.days, - hours=other.hours + self.hours, - minutes=other.minutes + self.minutes, - seconds=other.seconds + self.seconds, - microseconds=other.microseconds + self.microseconds, - leapdays=other.leapdays or self.leapdays, - year=other.year or self.year, - month=other.month or self.month, - day=other.day or self.day, - weekday=other.weekday or self.weekday, - hour=other.hour or self.hour, - minute=other.minute or self.minute, - second=other.second or self.second, - microsecond=other.second or self.microsecond) + return relativedelta( + years=other.years + self.years, + months=other.months + self.months, + days=other.days + self.days, + hours=other.hours + self.hours, + minutes=other.minutes + self.minutes, + seconds=other.seconds + self.seconds, + microseconds=other.microseconds + self.microseconds, + leapdays=other.leapdays or self.leapdays, + year=other.year or self.year, + month=other.month or self.month, + day=other.day or self.day, + weekday=other.weekday or self.weekday, + hour=other.hour or self.hour, + minute=other.minute or self.minute, + second=other.second or self.second, + microsecond=other.second or self.microsecond, + ) def __sub__(self, other): if not isinstance(other, relativedelta): raise TypeError("unsupported type for sub operation") - return relativedelta(years=other.years - self.years, - months=other.months - self.months, - days=other.days - self.days, - hours=other.hours - self.hours, - minutes=other.minutes - self.minutes, - seconds=other.seconds - self.seconds, - microseconds=other.microseconds - self.microseconds, - leapdays=other.leapdays or self.leapdays, - year=other.year or self.year, - month=other.month or self.month, - day=other.day or self.day, - weekday=other.weekday or self.weekday, - hour=other.hour or self.hour, - minute=other.minute or self.minute, - second=other.second or self.second, - microsecond=other.second or self.microsecond) + return relativedelta( + years=other.years - self.years, + months=other.months - self.months, + days=other.days - self.days, + hours=other.hours - self.hours, + minutes=other.minutes - self.minutes, + seconds=other.seconds - self.seconds, + microseconds=other.microseconds - self.microseconds, + leapdays=other.leapdays or self.leapdays, + year=other.year or self.year, + month=other.month or self.month, + day=other.day or self.day, + weekday=other.weekday or self.weekday, + hour=other.hour or self.hour, + minute=other.minute or self.minute, + second=other.second or self.second, + microsecond=other.second or self.microsecond, + ) def __neg__(self): - return relativedelta(years= -self.years, - months= -self.months, - days= -self.days, - hours= -self.hours, - minutes= -self.minutes, - seconds= -self.seconds, - microseconds= -self.microseconds, - leapdays=self.leapdays, - year=self.year, - month=self.month, - day=self.day, - weekday=self.weekday, - hour=self.hour, - minute=self.minute, - second=self.second, - microsecond=self.microsecond) + return relativedelta( + years=-self.years, + months=-self.months, + 
days=-self.days, + hours=-self.hours, + minutes=-self.minutes, + seconds=-self.seconds, + microseconds=-self.microseconds, + leapdays=self.leapdays, + year=self.year, + month=self.month, + day=self.day, + weekday=self.weekday, + hour=self.hour, + minute=self.minute, + second=self.second, + microsecond=self.microsecond, + ) def __nonzero__(self): - return not (not self.years and - not self.months and - not self.days and - not self.hours and - not self.minutes and - not self.seconds and - not self.microseconds and - not self.leapdays and - self.year is None and - self.month is None and - self.day is None and - self.weekday is None and - self.hour is None and - self.minute is None and - self.second is None and - self.microsecond is None) + return not ( + not self.years + and not self.months + and not self.days + and not self.hours + and not self.minutes + and not self.seconds + and not self.microseconds + and not self.leapdays + and self.year is None + and self.month is None + and self.day is None + and self.weekday is None + and self.hour is None + and self.minute is None + and self.second is None + and self.microsecond is None + ) __bool__ = __nonzero__ def __mul__(self, other): f = float(other) - return relativedelta(years=self.years * f, - months=self.months * f, - days=self.days * f, - hours=self.hours * f, - minutes=self.minutes * f, - seconds=self.seconds * f, - microseconds=self.microseconds * f, - leapdays=self.leapdays, - year=self.year, - month=self.month, - day=self.day, - weekday=self.weekday, - hour=self.hour, - minute=self.minute, - second=self.second, - microsecond=self.microsecond) + return relativedelta( + years=self.years * f, + months=self.months * f, + days=self.days * f, + hours=self.hours * f, + minutes=self.minutes * f, + seconds=self.seconds * f, + microseconds=self.microseconds * f, + leapdays=self.leapdays, + year=self.year, + month=self.month, + day=self.day, + weekday=self.weekday, + hour=self.hour, + minute=self.minute, + second=self.second, + microsecond=self.microsecond, + ) def __eq__(self, other): if not isinstance(other, relativedelta): @@ -399,20 +433,22 @@ def __eq__(self, other): n1, n2 = self.weekday.n, other.weekday.n if n1 != n2 and not ((not n1 or n1 == 1) and (not n2 or n2 == 1)): return False - return (self.years == other.years and - self.months == other.months and - self.days == other.days and - self.hours == other.hours and - self.minutes == other.minutes and - self.seconds == other.seconds and - self.leapdays == other.leapdays and - self.year == other.year and - self.month == other.month and - self.day == other.day and - self.hour == other.hour and - self.minute == other.minute and - self.second == other.second and - self.microsecond == other.microsecond) + return ( + self.years == other.years + and self.months == other.months + and self.days == other.days + and self.hours == other.hours + and self.minutes == other.minutes + and self.seconds == other.seconds + and self.leapdays == other.leapdays + and self.year == other.year + and self.month == other.month + and self.day == other.day + and self.hour == other.hour + and self.minute == other.minute + and self.second == other.second + and self.microsecond == other.microsecond + ) def __ne__(self, other): return not self.__eq__(other) @@ -422,16 +458,33 @@ def __div__(self, other): def __repr__(self): l = [] - for attr in ["years", "months", "days", "leapdays", - "hours", "minutes", "seconds", "microseconds"]: + for attr in [ + "years", + "months", + "days", + "leapdays", + "hours", + "minutes", + 
"seconds", + "microseconds", + ]: value = getattr(self, attr) if value: l.append("%s=%+d" % (attr, value)) - for attr in ["year", "month", "day", "weekday", - "hour", "minute", "second", "microsecond"]: + for attr in [ + "year", + "month", + "day", + "weekday", + "hour", + "minute", + "second", + "microsecond", + ]: value = getattr(self, attr) if value is not None: l.append(f"{attr}={repr(value)}") return f"{self.__class__.__name__}({', '.join(l)})" + # vim:ts=4:sw=4:et diff --git a/src/whoosh/support/unicode.py b/src/whoosh/support/unicode.py index 351c7130..4c2248db 100644 --- a/src/whoosh/support/unicode.py +++ b/src/whoosh/support/unicode.py @@ -3,7 +3,6 @@ from whoosh.compat import text_type, u - # http://unicode.org/Public/UNIDATA/Blocks.txt _blockdata = """ # Blocks-5.1.0.txt diff --git a/src/whoosh/system.py b/src/whoosh/system.py index 2bdce1b1..36ad58eb 100644 --- a/src/whoosh/system.py +++ b/src/whoosh/system.py @@ -28,7 +28,6 @@ import sys from struct import Struct, calcsize - IS_LITTLE = sys.byteorder == "little" _INT_SIZE = calcsize("!i") diff --git a/src/whoosh/util/__init__.py b/src/whoosh/util/__init__.py index cc91d3d9..93868a32 100644 --- a/src/whoosh/util/__init__.py +++ b/src/whoosh/util/__init__.py @@ -26,13 +26,15 @@ # policies, either expressed or implied, of Matt Chaput. from __future__ import with_statement -import random, sys, time + +import random +import sys +import time from bisect import insort from functools import wraps from whoosh.compat import range - # These must be valid separate characters in CASE-INSENSTIVE filenames IDCHARS = "0123456789abcdefghijklmnopqrstuvwxyz" @@ -78,7 +80,7 @@ def make_binary_tree(fn, args, **kwargs): return fn( make_binary_tree(fn, args[:half], **kwargs), make_binary_tree(fn, args[half:], **kwargs), - **kwargs + **kwargs, ) diff --git a/src/whoosh/util/cache.py b/src/whoosh/util/cache.py index 00cb3f27..f2e8a414 100644 --- a/src/whoosh/util/cache.py +++ b/src/whoosh/util/cache.py @@ -26,13 +26,13 @@ # policies, either expressed or implied, of Matt Chaput. from __future__ import with_statement + import functools from heapq import nsmallest from operator import itemgetter from whoosh.compat import iteritems - try: from collections import Counter except ImportError: diff --git a/src/whoosh/util/filelock.py b/src/whoosh/util/filelock.py index 5534d123..ef1533bd 100644 --- a/src/whoosh/util/filelock.py +++ b/src/whoosh/util/filelock.py @@ -59,8 +59,7 @@ def try_for(fn, timeout=5.0, delay=0.1): class LockBase(object): - """Base class for file locks. - """ + """Base class for file locks.""" def __init__(self, filename): self.fd = None @@ -88,8 +87,7 @@ def release(self): class FcntlLock(LockBase): - """File lock based on UNIX-only fcntl module. - """ + """File lock based on UNIX-only fcntl module.""" def acquire(self, blocking=False): import fcntl # type: ignore @UnresolvedImport @@ -118,14 +116,14 @@ def release(self): raise Exception("Lock was not acquired") import fcntl # type: ignore @UnresolvedImport + fcntl.flock(self.fd, fcntl.LOCK_UN) os.close(self.fd) self.fd = None class MsvcrtLock(LockBase): - """File lock based on Windows-only msvcrt module. 
- """ + """File lock based on Windows-only msvcrt module.""" def acquire(self, blocking=False): import msvcrt # type: ignore @UnresolvedImport diff --git a/src/whoosh/util/numeric.py b/src/whoosh/util/numeric.py index 5b4670c8..af813f49 100644 --- a/src/whoosh/util/numeric.py +++ b/src/whoosh/util/numeric.py @@ -25,17 +25,31 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -import math, struct +import math +import struct from array import array from bisect import bisect_left from struct import pack, unpack from whoosh.compat import b, long_type -from whoosh.system import pack_byte, unpack_byte, pack_ushort, unpack_ushort -from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint -from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong -from whoosh.system import pack_float, unpack_float, pack_double, unpack_double - +from whoosh.system import ( + pack_byte, + pack_double, + pack_float, + pack_int, + pack_long, + pack_uint, + pack_ulong, + pack_ushort, + unpack_byte, + unpack_double, + unpack_float, + unpack_int, + unpack_long, + unpack_uint, + unpack_ulong, + unpack_ushort, +) NaN = struct.unpack(" fire is within 2 edits (transpose + delete) of first w.add_document(title=u("Fifth"), content=u("The fire is beautiful")) - from whoosh.qparser import QueryParser, FuzzyTermPlugin + from whoosh.qparser import FuzzyTermPlugin, QueryParser parser = QueryParser("content", ix.schema) parser.add_plugin(FuzzyTermPlugin()) diff --git a/tests/test_parsing.py b/tests/test_parsing.py index 78beb808..e1cb3836 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -1,5 +1,5 @@ from whoosh import analysis, fields, query -from whoosh.compat import u, text_type +from whoosh.compat import text_type, u from whoosh.qparser import default, plugins diff --git a/tests/test_postings.py b/tests/test_postings.py index f586e1d8..0260db65 100644 --- a/tests/test_postings.py +++ b/tests/test_postings.py @@ -1,11 +1,16 @@ from __future__ import with_statement from whoosh import analysis, fields -from whoosh.compat import u from whoosh.codec import default_codec -from whoosh.formats import Existence, Frequency -from whoosh.formats import Positions, PositionBoosts -from whoosh.formats import Characters, CharacterBoosts +from whoosh.compat import u +from whoosh.formats import ( + CharacterBoosts, + Characters, + Existence, + Frequency, + PositionBoosts, + Positions, +) from whoosh.util.testing import TempStorage diff --git a/tests/test_quality.py b/tests/test_quality.py index e051bd2f..1f6476e3 100644 --- a/tests/test_quality.py +++ b/tests/test_quality.py @@ -1,10 +1,11 @@ from __future__ import with_statement + import random from whoosh import fields, matching, scoring -from whoosh.compat import u, range +from whoosh.compat import range, u from whoosh.filedb.filestore import RamStorage -from whoosh.util.numeric import length_to_byte, byte_to_length +from whoosh.util.numeric import byte_to_length, length_to_byte def _discreet(length): diff --git a/tests/test_queries.py b/tests/test_queries.py index 323a34db..a8a7558f 100644 --- a/tests/test_queries.py +++ b/tests/test_queries.py @@ -1,7 +1,6 @@ import copy import pytest - from whoosh import fields, qparser, query from whoosh.compat import b, u from whoosh.filedb.filestore import RamStorage diff --git a/tests/test_reading.py b/tests/test_reading.py index 0177f4b5..a8f1427f 100644 --- a/tests/test_reading.py +++ b/tests/test_reading.py @@ -1,13 
+1,15 @@ # coding=utf-8 from __future__ import with_statement -import random, threading, time + +import random +import threading +import time import pytest from whoosh import fields, formats, reading - -from whoosh.compat import b, u, range -from whoosh.reading import SegmentReader +from whoosh.compat import b, range, u from whoosh.filedb.filestore import RamStorage +from whoosh.reading import SegmentReader from whoosh.util.testing import TempIndex diff --git a/tests/test_results.py b/tests/test_results.py index b98f39c1..c750363b 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -1,12 +1,11 @@ from __future__ import with_statement import pytest - from whoosh import analysis, fields, formats, highlight, qparser, query from whoosh.codec.whoosh3 import W3Codec -from whoosh.compat import u, range, text_type, permutations +from whoosh.compat import permutations, range, text_type, u from whoosh.filedb.filestore import RamStorage -from whoosh.util.testing import TempStorage, TempIndex +from whoosh.util.testing import TempIndex, TempStorage def test_score_retrieval(): diff --git a/tests/test_searching.py b/tests/test_searching.py index b140aa56..d40d3539 100644 --- a/tests/test_searching.py +++ b/tests/test_searching.py @@ -1,15 +1,14 @@ # encoding: utf-8 from __future__ import with_statement + import copy from datetime import datetime, timedelta import pytest - from whoosh import analysis, fields, index, qparser, query, scoring from whoosh.codec.whoosh3 import W3Codec -from whoosh.compat import b, u, text_type -from whoosh.compat import range, permutations, izip_longest +from whoosh.compat import b, izip_longest, permutations, range, text_type, u from whoosh.filedb.filestore import RamStorage from whoosh.util.testing import TempIndex @@ -747,7 +746,7 @@ def test_short_prefix(): def test_weighting(): - from whoosh.scoring import Weighting, BaseScorer + from whoosh.scoring import BaseScorer, Weighting schema = fields.Schema(id=fields.ID(stored=True), n_comments=fields.STORED) st = RamStorage() diff --git a/tests/test_sorting.py b/tests/test_sorting.py index 78a12335..b2744c24 100644 --- a/tests/test_sorting.py +++ b/tests/test_sorting.py @@ -1,14 +1,13 @@ from __future__ import with_statement -from datetime import datetime, timedelta + import random +from datetime import datetime, timedelta -from whoosh import fields, query, sorting, columns -from whoosh.compat import u -from whoosh.compat import permutations, range +from whoosh import columns, fields, query, sorting +from whoosh.compat import permutations, range, u from whoosh.filedb.filestore import RamStorage from whoosh.util.testing import TempIndex - try: import multiprocessing except ImportError: diff --git a/tests/test_spans.py b/tests/test_spans.py index 4587afe1..ee5c80a8 100644 --- a/tests/test_spans.py +++ b/tests/test_spans.py @@ -1,13 +1,11 @@ from __future__ import with_statement from whoosh import analysis, fields, formats -from whoosh.compat import u, range, permutations +from whoosh.compat import permutations, range, u from whoosh.filedb.filestore import RamStorage -from whoosh.query import spans -from whoosh.query import And, Or, Term, Phrase +from whoosh.query import And, Or, Phrase, Term, spans from whoosh.util.testing import TempIndex - domain = ("alfa", "bravo", "bravo", "charlie", "delta", "echo") _ix = None diff --git a/tests/test_spelling.py b/tests/test_spelling.py index 93878126..3a418f15 100644 --- a/tests/test_spelling.py +++ b/tests/test_spelling.py @@ -1,4 +1,5 @@ from __future__ import with_statement 
+ import gzip from whoosh import analysis, fields, highlight, query, spelling @@ -7,7 +8,6 @@ from whoosh.support.levenshtein import levenshtein from whoosh.util.testing import TempIndex - _wordlist = sorted( u( "render animation animate shader shading zebra koala" diff --git a/tests/test_stem.py b/tests/test_stem.py index cbfa274b..0912d8d5 100644 --- a/tests/test_stem.py +++ b/tests/test_stem.py @@ -1,6 +1,6 @@ from whoosh.lang.snowball.english import EnglishStemmer -from whoosh.lang.snowball.french import FrenchStemmer from whoosh.lang.snowball.finnish import FinnishStemmer +from whoosh.lang.snowball.french import FrenchStemmer from whoosh.lang.snowball.spanish import SpanishStemmer diff --git a/tests/test_weightings.py b/tests/test_weightings.py index f9d62705..09f07995 100644 --- a/tests/test_weightings.py +++ b/tests/test_weightings.py @@ -1,10 +1,11 @@ from __future__ import with_statement + import inspect -from random import choice, randint import sys +from random import choice, randint from whoosh import fields, query, scoring -from whoosh.compat import u, range, permutations +from whoosh.compat import permutations, range, u from whoosh.filedb.filestore import RamStorage @@ -24,9 +25,7 @@ def test_all(): ix = storage.create_index(schema) w = ix.writer() for _ in range(100): - w.add_document( - text=u(" ").join(choice(domain) for _ in range(randint(10, 20))) - ) + w.add_document(text=u(" ").join(choice(domain) for _ in range(randint(10, 20)))) w.commit() # List ABCs that should not be tested diff --git a/tests/test_writing.py b/tests/test_writing.py index de032b6e..7c979a2d 100644 --- a/tests/test_writing.py +++ b/tests/test_writing.py @@ -1,10 +1,12 @@ from __future__ import with_statement -import random, time, threading -import pytest +import random +import threading +import time +import pytest from whoosh import analysis, fields, query, writing -from whoosh.compat import b, u, range, text_type +from whoosh.compat import b, range, text_type, u from whoosh.filedb.filestore import RamStorage from whoosh.util.testing import TempIndex From 23a65f1f7d1f71f526a4dc887eb2fccf8440e780 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Wed, 7 Feb 2024 15:02:09 +0100 Subject: [PATCH 2/2] ruff format . && ruff --select=I --fix . 
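Both patches are purely mechanical; no behavior change is intended. For
reviewers, a minimal sketch of what the two commands do — `ruff --select=I
--fix` sorts and splits imports isort-style, and `ruff format` applies
Black-compatible layout. The file contents below are illustrative, not
taken from this repository:

    # Before (illustrative):
    #
    #     import sys, os.path
    #     from whoosh.compat import u, range
    #     x = { 'a':1 }
    #
    # After `ruff --select=I --fix . && ruff format .`:

    import os.path
    import sys

    from whoosh.compat import range, u

    x = {"a": 1}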
--- .github/ISSUE_TEMPLATE/sweep-template.yml | 2 +- .github/workflows/deploy-github-pages.yml | 2 +- .github/workflows/first-interaction.yml | 2 +- .github/workflows/python-publish.yml | 2 +- .pre-commit-config.yaml | 14 +- .readthedocs.yaml | 2 +- benchmark/enron.py | 2 - benchmark/marc21.py | 2 - docs/Makefile | 2 +- docs/make.bat | 2 +- docs/requirements.txt | 2 +- docs/source/analysis.rst | 3 - docs/source/api/analysis.rst | 1 - docs/source/api/codec/base.rst | 2 - docs/source/api/collectors.rst | 5 - docs/source/api/columns.rst | 1 - docs/source/api/formats.rst | 2 - docs/source/api/lang/wordnet.rst | 1 - docs/source/api/qparser.rst | 7 - docs/source/api/reading.rst | 1 - docs/source/api/scoring.rst | 4 - docs/source/api/searching.rst | 1 - docs/source/api/sorting.rst | 2 - docs/source/api/spelling.rst | 2 - docs/source/api/support/charset.rst | 1 - docs/source/api/support/levenshtein.rst | 1 - docs/source/api/util.rst | 1 - docs/source/api/writing.rst | 2 - docs/source/batch.rst | 6 - docs/source/dates.rst | 4 - docs/source/facets.rst | 2 - docs/source/fieldcaches.rst | 6 - docs/source/glossary.rst | 1 - docs/source/highlight.rst | 12 -- docs/source/index.rst | 1 - docs/source/keywords.rst | 1 - docs/source/nested.rst | 1 - docs/source/ngrams.rst | 3 - docs/source/parsing.rst | 7 +- docs/source/query.rst | 1 - docs/source/querylang.rst | 3 - docs/source/quickstart.rst | 1 - docs/source/recipes.rst | 1 - docs/source/releases/1_0.rst | 1 - docs/source/releases/2_0.rst | 3 - docs/source/releases/index.rst | 1 - docs/source/schema.rst | 4 - docs/source/searching.rst | 6 - docs/source/stemming.rst | 14 -- docs/source/tech/filedb.rst | 1 - docs/source/threads.rst | 3 - requirements-dev.txt | 2 +- requirements.txt | 2 +- scripts/make_checkpoint.py | 1 - scripts/pylint.ini | 22 +-- scripts/read_checkpoint.py | 1 - src/whoosh/analysis/acore.py | 4 +- src/whoosh/analysis/analyzers.py | 2 +- src/whoosh/analysis/filters.py | 2 - src/whoosh/analysis/morph.py | 4 +- src/whoosh/automata/fsa.py | 15 +-- src/whoosh/automata/fst.py | 35 +++-- src/whoosh/automata/lev.py | 2 - src/whoosh/automata/reg.py | 2 +- src/whoosh/classify.py | 5 +- src/whoosh/codec/base.py | 18 +-- src/whoosh/codec/memory.py | 1 - src/whoosh/codec/plaintext.py | 7 +- src/whoosh/codec/whoosh2.py | 33 ++--- src/whoosh/collectors.py | 2 +- src/whoosh/columns.py | 12 +- src/whoosh/compat.py | 5 +- src/whoosh/externalsort.py | 6 +- src/whoosh/fields.py | 13 +- src/whoosh/filedb/compound.py | 8 +- src/whoosh/filedb/fileindex.py | 8 +- src/whoosh/filedb/filepostings.py | 5 +- src/whoosh/filedb/filereading.py | 2 +- src/whoosh/filedb/filestore.py | 7 +- src/whoosh/filedb/filetables.py | 10 +- src/whoosh/filedb/gae.py | 4 +- src/whoosh/filedb/pools.py | 4 +- src/whoosh/filedb/structfile.py | 4 +- src/whoosh/formats.py | 4 +- src/whoosh/highlight.py | 13 +- src/whoosh/idsets.py | 6 +- src/whoosh/index.py | 13 +- src/whoosh/lang/__init__.py | 2 - src/whoosh/lang/dmetaphone.py | 2 - src/whoosh/lang/isri.py | 4 +- src/whoosh/lang/morph_en.py | 74 +++++----- src/whoosh/lang/paicehusk.py | 2 +- src/whoosh/lang/phonetic.py | 2 - src/whoosh/lang/snowball/bases.py | 4 +- src/whoosh/lang/snowball/hungarian.py | 2 +- src/whoosh/lang/snowball/russian.py | 2 +- src/whoosh/lang/stopwords.py | 4 - src/whoosh/lang/wordnet.py | 2 +- src/whoosh/matching/binary.py | 8 +- src/whoosh/matching/combo.py | 1 - src/whoosh/matching/mcore.py | 5 +- src/whoosh/matching/wrappers.py | 9 +- src/whoosh/multiproc.py | 1 - src/whoosh/qparser/common.py | 2 +- 
src/whoosh/qparser/dateparse.py | 14 +- src/whoosh/qparser/default.py | 2 +- src/whoosh/qparser/plugins.py | 6 +- src/whoosh/qparser/syntax.py | 4 +- src/whoosh/qparser/taggers.py | 2 +- src/whoosh/query/compound.py | 1 - src/whoosh/query/nested.py | 3 +- src/whoosh/query/positional.py | 3 +- src/whoosh/query/qcore.py | 21 ++- src/whoosh/query/spans.py | 20 +-- src/whoosh/query/terms.py | 1 - src/whoosh/query/wrappers.py | 1 - src/whoosh/reading.py | 9 +- src/whoosh/scoring.py | 5 +- src/whoosh/searching.py | 14 +- src/whoosh/sorting.py | 10 +- src/whoosh/spelling.py | 8 +- src/whoosh/support/bench.py | 7 +- src/whoosh/support/bitstream.py | 2 +- src/whoosh/support/bitvector.py | 4 +- src/whoosh/support/charset.py | 4 +- src/whoosh/support/pyparsing.py | 156 +++++++++++----------- src/whoosh/support/relativedelta.py | 2 +- src/whoosh/support/unicode.py | 2 +- src/whoosh/system.py | 2 +- src/whoosh/util/__init__.py | 1 - src/whoosh/util/cache.py | 1 - src/whoosh/util/filelock.py | 6 +- src/whoosh/util/loading.py | 2 +- src/whoosh/util/numlists.py | 4 +- src/whoosh/util/testing.py | 2 +- src/whoosh/util/times.py | 4 +- src/whoosh/util/versions.py | 4 +- src/whoosh/writing.py | 17 ++- stress/test_bigfacet.py | 2 - stress/test_bigindex.py | 2 - stress/test_bigtable.py | 4 +- stress/test_hugeindex.py | 2 - stress/test_threading.py | 2 - stress/test_update.py | 2 - tests/test_analysis.py | 4 - tests/test_automata.py | 2 +- tests/test_classify.py | 12 +- tests/test_codecs.py | 6 +- tests/test_collector.py | 2 - tests/test_columns.py | 6 +- tests/test_compound.py | 2 - tests/test_flexible.py | 2 - tests/test_highlighting.py | 4 - tests/test_indexing.py | 2 - tests/test_matching.py | 2 - tests/test_misc.py | 2 - tests/test_mpwriter.py | 2 - tests/test_nested.py | 2 - tests/test_parse_plugins.py | 2 - tests/test_postings.py | 2 - tests/test_quality.py | 2 - tests/test_reading.py | 59 ++++---- tests/test_results.py | 8 +- tests/test_searching.py | 10 +- tests/test_sorting.py | 4 +- tests/test_spans.py | 2 - tests/test_spelling.py | 2 - tests/test_tables.py | 12 +- tests/test_vectors.py | 3 - tests/test_weightings.py | 2 - tests/test_writing.py | 2 - 171 files changed, 406 insertions(+), 676 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/sweep-template.yml b/.github/ISSUE_TEMPLATE/sweep-template.yml index 44116f53..d46b4ff1 100644 --- a/.github/ISSUE_TEMPLATE/sweep-template.yml +++ b/.github/ISSUE_TEMPLATE/sweep-template.yml @@ -12,4 +12,4 @@ body: Unit Tests: Write unit tests for . Test each function in the file. Make sure to test edge cases. Bugs: The bug might be in . Here are the logs: ... Features: the new endpoint should use the ... class from because it contains ... logic. - Refactors: We are migrating this function to ... version because ... \ No newline at end of file + Refactors: We are migrating this function to ... version because ... diff --git a/.github/workflows/deploy-github-pages.yml b/.github/workflows/deploy-github-pages.yml index aac0fdf0..59055185 100644 --- a/.github/workflows/deploy-github-pages.yml +++ b/.github/workflows/deploy-github-pages.yml @@ -38,4 +38,4 @@ jobs: # The GH actions bot is used by default if you didn't specify the two fields. # You can swap them out with your own user credentials. 
user_name: github-actions[bot] - user_email: 41898282+github-actions[bot]@users.noreply.github.com \ No newline at end of file + user_email: 41898282+github-actions[bot]@users.noreply.github.com diff --git a/.github/workflows/first-interaction.yml b/.github/workflows/first-interaction.yml index e32f91a1..422476bc 100644 --- a/.github/workflows/first-interaction.yml +++ b/.github/workflows/first-interaction.yml @@ -21,7 +21,7 @@ jobs: If this is a bug report, please include relevant logs to help us debug the problem. pr-message: | Hello! Thank you for your contribution. - + If you are fixing a bug, please reference the issue number in the description. If you are implementing a feature request, please check with the maintainers that the feature will be accepted first. diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 62851151..15dcc20a 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -13,7 +13,7 @@ on: types: [published] workflow_dispatch: # This line allows manual triggering - + #push: # branches: # - master diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2a31c673..5a19435c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: 'v4.5.0' + rev: v4.5.0 hooks: - id: check-added-large-files - id: check-ast @@ -12,18 +12,20 @@ repos: - id: end-of-file-fixer - id: mixed-line-ending - id: trailing-whitespace - - repo: https://github.com/psf/black - rev: '24.1.1' + + - repo: https://github.com/charliermarsh/ruff-pre-commit + rev: v0.2.1 hooks: - - id: black - language_version: python3.11 - exclude: ^notebooks + - id: ruff + args: [ --select=I ] # isort + - id: ruff-format - repo: https://github.com/asottile/pyupgrade rev: 'v3.15.0' hooks: - id: pyupgrade args: [ --py38-plus ] + - repo: https://github.com/ikamensh/flynt/ rev: '1.0.1' hooks: diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 274eb141..5b8d7726 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -26,4 +26,4 @@ sphinx: python: install: # - requirements: requirements.txt - - requirements: docs/requirements.txt \ No newline at end of file + - requirements: docs/requirements.txt diff --git a/benchmark/enron.py b/benchmark/enron.py index f3447a4a..175c0ef6 100644 --- a/benchmark/enron.py +++ b/benchmark/enron.py @@ -1,5 +1,3 @@ -from __future__ import division - import os.path import tarfile from email import message_from_string diff --git a/benchmark/marc21.py b/benchmark/marc21.py index a2f89ab0..b4316c53 100644 --- a/benchmark/marc21.py +++ b/benchmark/marc21.py @@ -1,5 +1,3 @@ -from __future__ import print_function, with_statement - import fnmatch import logging import os.path diff --git a/docs/Makefile b/docs/Makefile index 09b24957..5c1c3530 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -174,4 +174,4 @@ xml: pseudoxml: $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo - @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." \ No newline at end of file + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
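The `from __future__` removals above (benchmark/enron.py,
benchmark/marc21.py) are safe on a Python 3 baseline, where those behaviors
are always enabled. A stdlib-only illustration, not code from the patch:

    import io

    assert 3 / 2 == 1.5      # PEP 238 true division is the default
    assert 3 // 2 == 1       # floor division remains spelled //
    print("print", "is", "a", "function")  # print_function is built in
    with io.StringIO() as buf:             # `with` needs no future import
        buf.write("ok")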
diff --git a/docs/make.bat b/docs/make.bat index 6b6fea17..502a76a6 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -239,4 +239,4 @@ if "%1" == "pseudoxml" ( goto end ) -:end \ No newline at end of file +:end diff --git a/docs/requirements.txt b/docs/requirements.txt index f063a0ad..0ccde19f 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,3 @@ sphinx sphinx_rtd_theme -sphinx-jsonschema \ No newline at end of file +sphinx-jsonschema diff --git a/docs/source/analysis.rst b/docs/source/analysis.rst index 27297f61..ebbb72a9 100644 --- a/docs/source/analysis.rst +++ b/docs/source/analysis.rst @@ -324,6 +324,3 @@ change it. ;) Nothing requires that an Analyzer be implemented by calling a tokenizer and filters. Tokenizers and filters are simply a convenient way to structure the code. You're free to write an analyzer any way you want, as long as it implements ``__call__``. - - - diff --git a/docs/source/api/analysis.rst b/docs/source/api/analysis.rst index bbb1b978..81805618 100644 --- a/docs/source/api/analysis.rst +++ b/docs/source/api/analysis.rst @@ -59,4 +59,3 @@ Token classes and functions .. autoclass:: Token .. autofunction:: unstopped - diff --git a/docs/source/api/codec/base.rst b/docs/source/api/codec/base.rst index 28f707c4..a1326a57 100644 --- a/docs/source/api/codec/base.rst +++ b/docs/source/api/codec/base.rst @@ -28,5 +28,3 @@ Classes .. autoclass:: Segment :members: - - diff --git a/docs/source/api/collectors.rst b/docs/source/api/collectors.rst index b27b8c1f..6a9a8c63 100644 --- a/docs/source/api/collectors.rst +++ b/docs/source/api/collectors.rst @@ -40,8 +40,3 @@ Wrappers .. autoclass:: TimeLimitCollector .. autoclass:: TermsCollector - - - - - diff --git a/docs/source/api/columns.rst b/docs/source/api/columns.rst index 26fa7916..69b7bd9a 100644 --- a/docs/source/api/columns.rst +++ b/docs/source/api/columns.rst @@ -46,4 +46,3 @@ Experimental columns ==================== .. autoclass:: ClampedNumericColumn - diff --git a/docs/source/api/formats.rst b/docs/source/api/formats.rst index 9cd9dd19..9f184db9 100644 --- a/docs/source/api/formats.rst +++ b/docs/source/api/formats.rst @@ -20,5 +20,3 @@ Formats .. autoclass:: Characters .. autoclass:: PositionBoosts .. autoclass:: CharacterBoosts - - diff --git a/docs/source/api/lang/wordnet.rst b/docs/source/api/lang/wordnet.rst index 8adcdb0b..d1422525 100644 --- a/docs/source/api/lang/wordnet.rst +++ b/docs/source/api/lang/wordnet.rst @@ -17,4 +17,3 @@ Low-level functions .. autofunction:: parse_file .. autofunction:: synonyms .. autofunction:: make_index - diff --git a/docs/source/api/qparser.rst b/docs/source/api/qparser.rst index d3c5ecda..a0b10cc2 100644 --- a/docs/source/api/qparser.rst +++ b/docs/source/api/qparser.rst @@ -88,10 +88,3 @@ Operators .. autoclass:: PrefixOperator .. autoclass:: PostfixOperator .. autoclass:: InfixOperator - - - - - - - diff --git a/docs/source/api/reading.rst b/docs/source/api/reading.rst index e0fd2a12..b923ac05 100644 --- a/docs/source/api/reading.rst +++ b/docs/source/api/reading.rst @@ -19,4 +19,3 @@ Exceptions ========== .. autoexception:: TermNotFound - diff --git a/docs/source/api/scoring.rst b/docs/source/api/scoring.rst index 73ea1e76..46fa6ab1 100644 --- a/docs/source/api/scoring.rst +++ b/docs/source/api/scoring.rst @@ -36,7 +36,3 @@ Scoring utility classes .. autoclass:: MultiWeighting .. 
autoclass:: ReverseWeighting - - - - diff --git a/docs/source/api/searching.rst b/docs/source/api/searching.rst index 8acfe492..c717ff00 100644 --- a/docs/source/api/searching.rst +++ b/docs/source/api/searching.rst @@ -30,4 +30,3 @@ Exceptions .. autoexception:: NoTermsException .. autoexception:: TimeLimit - diff --git a/docs/source/api/sorting.rst b/docs/source/api/sorting.rst index faf78d0f..d7f4955a 100644 --- a/docs/source/api/sorting.rst +++ b/docs/source/api/sorting.rst @@ -44,5 +44,3 @@ FacetType objects .. autoclass:: UnorderedList .. autoclass:: Count .. autoclass:: Best - - diff --git a/docs/source/api/spelling.rst b/docs/source/api/spelling.rst index 79d5961e..34db9dc5 100644 --- a/docs/source/api/spelling.rst +++ b/docs/source/api/spelling.rst @@ -27,5 +27,3 @@ QueryCorrector objects .. autoclass:: SimpleQueryCorrector .. autoclass:: Correction - - diff --git a/docs/source/api/support/charset.rst b/docs/source/api/support/charset.rst index b0a687e9..fabd03ac 100644 --- a/docs/source/api/support/charset.rst +++ b/docs/source/api/support/charset.rst @@ -10,4 +10,3 @@ Taken from http://speeple.com/unicode-maps.txt .. autofunction:: charset_table_to_dict - diff --git a/docs/source/api/support/levenshtein.rst b/docs/source/api/support/levenshtein.rst index cb64027e..e36870bb 100644 --- a/docs/source/api/support/levenshtein.rst +++ b/docs/source/api/support/levenshtein.rst @@ -7,4 +7,3 @@ .. autofunction:: relative .. autofunction:: distance - diff --git a/docs/source/api/util.rst b/docs/source/api/util.rst index 9359f742..8380a413 100644 --- a/docs/source/api/util.rst +++ b/docs/source/api/util.rst @@ -4,4 +4,3 @@ .. automodule:: whoosh.util :members: - diff --git a/docs/source/api/writing.rst b/docs/source/api/writing.rst index 0bebc86f..5361cc02 100644 --- a/docs/source/api/writing.rst +++ b/docs/source/api/writing.rst @@ -26,5 +26,3 @@ Exceptions ========== .. autoexception:: IndexingError - - diff --git a/docs/source/batch.rst b/docs/source/batch.rst index 5caf256e..b8a741f0 100644 --- a/docs/source/batch.rst +++ b/docs/source/batch.rst @@ -106,9 +106,3 @@ So, while ``multisegment=True`` is much faster than a normal writer, you should only use it for large batch indexing jobs (or perhaps only for indexing from scratch). It should not be the only method you use for indexing, because otherwise the number of segments will tend to increase forever! - - - - - - diff --git a/docs/source/dates.rst b/docs/source/dates.rst index ab1aadd6..ac5cd2bf 100644 --- a/docs/source/dates.rst +++ b/docs/source/dates.rst @@ -196,7 +196,3 @@ Limitations * ``DATETIME`` fields do not currently support open-ended ranges. You can simulate an open ended range by using an endpoint far in the past or future. - - - - diff --git a/docs/source/facets.rst b/docs/source/facets.rst index b8c16936..4717a250 100644 --- a/docs/source/facets.rst +++ b/docs/source/facets.rst @@ -767,5 +767,3 @@ Expert: writing your own facet ============================== TBD. 
- - diff --git a/docs/source/fieldcaches.rst b/docs/source/fieldcaches.rst index 49091dc7..2e399ed5 100644 --- a/docs/source/fieldcaches.rst +++ b/docs/source/fieldcaches.rst @@ -44,9 +44,3 @@ Then you can pass an instance of your policy object to the ``set_caching_policy` method:: searcher.set_caching_policy(MyPolicy()) - - - - - - diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst index e9dd52d7..c62516b4 100644 --- a/docs/source/glossary.rst +++ b/docs/source/glossary.rst @@ -62,4 +62,3 @@ Glossary Term vector A *forward index* for a certain field in a certain document. You can specify in the Schema that a given field should store term vectors. - diff --git a/docs/source/highlight.rst b/docs/source/highlight.rst index 79c76ae9..bc266c8c 100644 --- a/docs/source/highlight.rst +++ b/docs/source/highlight.rst @@ -405,15 +405,3 @@ an analyzer:: ``order`` An ordering function that determines the order of the "top" fragments in the output text. - - - - - - - - - - - - diff --git a/docs/source/index.rst b/docs/source/index.rst index 236372f7..ca3f0062 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -47,4 +47,3 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` * :ref:`search` - diff --git a/docs/source/keywords.rst b/docs/source/keywords.rst index fe0e91f2..ed1440ee 100644 --- a/docs/source/keywords.rst +++ b/docs/source/keywords.rst @@ -91,4 +91,3 @@ Expansion models The ``ExpansionModel`` subclasses in the :mod:` whoosh.classify` module implement different weighting functions for key words. These models are translated into Python from original Java implementations in Terrier. - diff --git a/docs/source/nested.rst b/docs/source/nested.rst index da43d282..465b8af7 100644 --- a/docs/source/nested.rst +++ b/docs/source/nested.rst @@ -235,4 +235,3 @@ additional searches for each found document. Future versions of Whoosh may include "join" queries to make this process more efficient (or at least more automatic). - diff --git a/docs/source/ngrams.rst b/docs/source/ngrams.rst index 558f4e34..56bfe22f 100644 --- a/docs/source/ngrams.rst +++ b/docs/source/ngrams.rst @@ -46,6 +46,3 @@ whitespace and punctuation, while ``NGRAMWORDS`` extracts words from the text using a tokenizer, then runs each word through the N-gram filter. TBD. - - - diff --git a/docs/source/parsing.rst b/docs/source/parsing.rst index c4acc746..8eec5aec 100644 --- a/docs/source/parsing.rst +++ b/docs/source/parsing.rst @@ -185,7 +185,7 @@ replace the default English tokens with your own regular expressions. The :class:` whoosh.qparser.OperatorsPlugin` implements the ability to use AND, OR, NOT, ANDNOT, and ANDMAYBE clauses in queries. You can instantiate a new -``OperatorsPlugin`` and use the ``And``, ``Or``, ``Not``, ``AndNot``, and +``OperatorsPlugin`` and use the ``And``, ``Or``, ``Not``, ``AndNot``, and ``AndMaybe`` keyword arguments to change the token patterns:: # Use Spanish equivalents instead of AND and OR @@ -430,8 +430,3 @@ use the ``clean`` keyword argument:: Operators earlier in the list bind more closely than operators later in the list. - - - - - diff --git a/docs/source/query.rst b/docs/source/query.rst index f56b26b6..c7aec022 100644 --- a/docs/source/query.rst +++ b/docs/source/query.rst @@ -7,4 +7,3 @@ The classes in the :mod:` whoosh.query` module implement *queries* you can run a TBD. See :doc:`searching` for how to search the index using query objects. 
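The parsing.rst hunk above only trims trailing whitespace, but since its
surrounding example is cut off in this context: per the text, OperatorsPlugin
accepts ``And``, ``Or``, ``Not``, ``AndNot``, and ``AndMaybe`` keyword
arguments, each a regex for the operator token. A hedged sketch of the
Spanish-equivalents example the comment refers to (tokens are illustrative):

    from whoosh import fields
    from whoosh.qparser import OperatorsPlugin, QueryParser

    schema = fields.Schema(content=fields.TEXT)
    parser = QueryParser("content", schema)

    # Use Spanish equivalents instead of AND and OR
    parser.replace_plugin(
        OperatorsPlugin(
            And=" Y ", Or=" O ", Not=" NO ", AndNot=" YNO ", AndMaybe=" YQUIZA "
        )
    )
    print(parser.parse("uno Y dos O tres"))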
- diff --git a/docs/source/querylang.rst b/docs/source/querylang.rst index 085363da..7c436c12 100644 --- a/docs/source/querylang.rst +++ b/docs/source/querylang.rst @@ -186,6 +186,3 @@ in single quotes:: path:'MacHD:My Documents' 'term with spaces' title:'function()' - - - diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index b169fb7c..380252c7 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -241,4 +241,3 @@ Whoosh includes extra features for dealing with search results, such as * Paginating the results (e.g. "Showing results 1-20, page 1 of 4"). See :doc:`searching` for more information. - diff --git a/docs/source/recipes.rst b/docs/source/recipes.rst index 19150153..29c9571b 100644 --- a/docs/source/recipes.rst +++ b/docs/source/recipes.rst @@ -226,4 +226,3 @@ Is term X in document Y? # ...or the slower but easier way wordset = set(searcher.vector(500, "content").all_ids()) return "wobble" in wordset - diff --git a/docs/source/releases/1_0.rst b/docs/source/releases/1_0.rst index 7312123b..08887f53 100644 --- a/docs/source/releases/1_0.rst +++ b/docs/source/releases/1_0.rst @@ -479,4 +479,3 @@ Misc previous versions. * Unit tests should no longer leave directories and files behind. - diff --git a/docs/source/releases/2_0.rst b/docs/source/releases/2_0.rst index 053966fe..20569387 100644 --- a/docs/source/releases/2_0.rst +++ b/docs/source/releases/2_0.rst @@ -328,6 +328,3 @@ Compatibility now yield :class:` whoosh.reading.TermInfo` objects. * The arguments to :class:` whoosh.query.FuzzyTerm` changed. - - - diff --git a/docs/source/releases/index.rst b/docs/source/releases/index.rst index cf63ae83..def33734 100644 --- a/docs/source/releases/index.rst +++ b/docs/source/releases/index.rst @@ -8,4 +8,3 @@ Release notes 2_0 1_0 0_3 - diff --git a/docs/source/schema.rst b/docs/source/schema.rst index 043facb5..58da2fc7 100644 --- a/docs/source/schema.rst +++ b/docs/source/schema.rst @@ -371,7 +371,3 @@ If you set ``FieldType.vector`` to a ``Format`` object, the indexing code will u ``Format`` object to store information about the terms in each document. Currently by default Whoosh does not make use of term vectors at all, but they are available to expert users who want to implement their own field types. - - - - diff --git a/docs/source/searching.rst b/docs/source/searching.rst index 603244a4..ac640424 100644 --- a/docs/source/searching.rst +++ b/docs/source/searching.rst @@ -392,9 +392,3 @@ The ``Results`` object supports the following methods: Any result documents that also appear in 'results' are moved to the top of the list of result documents. Then any other documents in 'results' are added on to the list of result documents. - - - - - - diff --git a/docs/source/stemming.rst b/docs/source/stemming.rst index e88c66b7..0d30b569 100644 --- a/docs/source/stemming.rst +++ b/docs/source/stemming.rst @@ -201,17 +201,3 @@ required by ``CharsetTokenizer`` and ``CharsetFilter``:: (The Sphinx charset table format is described at http://www.sphinxsearch.com/docs/current.html#conf-charset-table ) - - - - - - - - - - - - - - diff --git a/docs/source/tech/filedb.rst b/docs/source/tech/filedb.rst index 0fe22be7..3d96b504 100644 --- a/docs/source/tech/filedb.rst +++ b/docs/source/tech/filedb.rst @@ -26,4 +26,3 @@ The index directory will contain a set of files for each segment. A segment is l .fvz contains term vectors (forward indexes) for each document. This file is only created if at least one field in the schema stores term vectors. 
The size will vary based on the number of documents, field length, the formats used for each vector (e.g. storing term positions takes more space than storing frequency only), etc. - diff --git a/docs/source/threads.rst b/docs/source/threads.rst index 0b45a643..54ecba8a 100644 --- a/docs/source/threads.rst +++ b/docs/source/threads.rst @@ -69,6 +69,3 @@ returns it.) Calling ``Searcher.refresh()`` is more efficient that closing the searcher and opening a new one, since it will re-use any underlying readers and caches that haven't changed. - - - diff --git a/requirements-dev.txt b/requirements-dev.txt index 8046a275..403fbade 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,4 @@ pytest pythomata versioneer --e . \ No newline at end of file +-e . diff --git a/requirements.txt b/requirements.txt index 945c9b46..9c558e35 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -. \ No newline at end of file +. diff --git a/scripts/make_checkpoint.py b/scripts/make_checkpoint.py index 2553bbac..6da29392 100644 --- a/scripts/make_checkpoint.py +++ b/scripts/make_checkpoint.py @@ -3,7 +3,6 @@ # Make a "checkpoint" index, capturing the index format created by a certain # version of Whoosh -from __future__ import print_function, with_statement import os.path import random diff --git a/scripts/pylint.ini b/scripts/pylint.ini index cda16306..630e452e 100644 --- a/scripts/pylint.ini +++ b/scripts/pylint.ini @@ -1,11 +1,11 @@ # lint Python modules using external checkers. -# +# # This is the main checker controling the other ones and the reports # generation. It is itself both a raw checker and an astng checker in order # to: # * handle message activation / deactivation at the module level # * handle some basic but necessary stats'data (number of classes, methods...) -# +# [MASTER] # Specify a configuration file. @@ -92,7 +92,7 @@ comment=no # * undefined variables # * redefinition of variable from builtins or from an outer scope # * use of variable before assigment -# +# [VARIABLES] # Tells wether we should check for unused import in __init__ files. @@ -107,7 +107,7 @@ additional-builtins= # try to find bugs in the code using type inference -# +# [TYPECHECK] # Tells wether missing members accessed in mixin class should be ignored. A @@ -132,7 +132,7 @@ acquired-members=REQUEST,acl_users,aq_parent # * dangerous default values as arguments # * redefinition of function / method / class # * uses of the global statement -# +# [BASIC] # Required attributes for module, separated by a comma @@ -183,7 +183,7 @@ bad-functions=apply,input # checks for sign of poor/misdesign: # * number of methods, attributes, local variables... # * size, complexity of functions, methods -# +# [DESIGN] # Maximum number of arguments for function / method @@ -219,7 +219,7 @@ max-public-methods=20 # * relative / wildcard imports # * cyclic imports # * uses of deprecated modules -# +# [IMPORTS] # Deprecated modules which should not be used, separated by a comma @@ -245,7 +245,7 @@ int-import-graph= # * attributes not defined in the __init__ method # * supported interfaces implementation # * unreachable code -# +# [CLASSES] # List of interface methods to ignore, separated by a comma. This is used for @@ -259,7 +259,7 @@ defining-attr-methods=__init__,__new__,setUp # checks for similarities and duplicated code. This computation may be # memory / CPU intensive, so you should disable it if you experiments some # problems. -# +# [SIMILARITIES] # Minimum lines number of a similarity. 
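Further down, the string-formatting hunks in src/whoosh/analysis/analyzers.py
and src/whoosh/automata/fst.py replace %-interpolation with str.format() and
f-strings. All three spellings render identically; a one-line check with
illustrative values:

    name, inner = "CompositeAnalyzer", "RegexTokenizer()"
    assert "%s(%s)" % (name, inner) == "{}({})".format(name, inner) == f"{name}({inner})"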
@@ -275,7 +275,7 @@ ignore-docstrings=yes # checks for: # * warning notes in the code like FIXME, XXX # * PEP 263: source code with non ascii character but no encoding declaration -# +# [MISCELLANEOUS] # List of note tags to take in consideration, separated by a comma. @@ -287,7 +287,7 @@ notes=FIXME,XXX,TODO # * strict indentation # * line length # * use of <> instead of != -# +# [FORMAT] # Maximum number of characters on a single line. diff --git a/scripts/read_checkpoint.py b/scripts/read_checkpoint.py index 2f75df53..c6947f0c 100644 --- a/scripts/read_checkpoint.py +++ b/scripts/read_checkpoint.py @@ -2,7 +2,6 @@ # Read a "checkpoint" index, to check backwards compatibility -from __future__ import print_function, with_statement import sys diff --git a/src/whoosh/analysis/acore.py b/src/whoosh/analysis/acore.py index f6ccee3d..74bf926a 100644 --- a/src/whoosh/analysis/acore.py +++ b/src/whoosh/analysis/acore.py @@ -74,7 +74,7 @@ def entoken( # Token object -class Token(object): +class Token: """ Represents a "token" (usually a word) extracted from the source text being indexed. @@ -139,7 +139,7 @@ def copy(self): # Composition support -class Composable(object): +class Composable: is_morph = False def __or__(self, other): diff --git a/src/whoosh/analysis/analyzers.py b/src/whoosh/analysis/analyzers.py index 1738be18..236733d9 100644 --- a/src/whoosh/analysis/analyzers.py +++ b/src/whoosh/analysis/analyzers.py @@ -83,7 +83,7 @@ def __init__(self, *composables): ) def __repr__(self): - return "%s(%s)" % ( + return "{}({})".format( self.__class__.__name__, ", ".join(repr(item) for item in self.items), ) diff --git a/src/whoosh/analysis/filters.py b/src/whoosh/analysis/filters.py index 6a134549..ee64bf9b 100644 --- a/src/whoosh/analysis/filters.py +++ b/src/whoosh/analysis/filters.py @@ -1,5 +1,3 @@ -# coding=utf-8 - # Copyright 2007 Matt Chaput. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/src/whoosh/analysis/morph.py b/src/whoosh/analysis/morph.py index 50d5a631..7b1944c1 100644 --- a/src/whoosh/analysis/morph.py +++ b/src/whoosh/analysis/morph.py @@ -92,7 +92,7 @@ def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000): def __getstate__(self): # Can't pickle a dynamic function, so we have to remove the _stem # attribute from the state - return dict([(k, self.__dict__[k]) for k in self.__dict__ if k != "_stem"]) + return {k: self.__dict__[k] for k in self.__dict__ if k != "_stem"} def __setstate__(self, state): # Check for old instances of StemFilter class, which didn't have a @@ -197,7 +197,7 @@ def _get_stemmer_fn(self): def __getstate__(self): # Can't pickle a dynamic function, so we have to remove the _stem # attribute from the state - return dict([(k, self.__dict__[k]) for k in self.__dict__ if k != "_stem"]) + return {k: self.__dict__[k] for k in self.__dict__ if k != "_stem"} def __setstate__(self, state): # Check for old instances of StemFilter class, which didn't have a diff --git a/src/whoosh/automata/fsa.py b/src/whoosh/automata/fsa.py index c621c1fc..05a1cfbd 100644 --- a/src/whoosh/automata/fsa.py +++ b/src/whoosh/automata/fsa.py @@ -1,5 +1,3 @@ -from __future__ import print_function - import itertools import operator import sys @@ -13,7 +11,7 @@ # Marker constants -class Marker(object): +class Marker: def __init__(self, name): self.name = name @@ -28,7 +26,7 @@ def __repr__(self): # Base class -class FSA(object): +class FSA: def __init__(self, initial): self.initial = initial self.transitions = {} @@ -67,8 +65,7 @@ def generate_all(self, state=None, sofar=""): yield sofar for label in sorted(self.get_labels(state)): newstate = self.next_state(state, label) - for string in self.generate_all(newstate, sofar + label): - yield string + yield from self.generate_all(newstate, sofar + label) def start(self): return self.initial @@ -128,7 +125,7 @@ def dump(self, stream=sys.stdout): end = "||" if self.is_final(dests) else "" def start(self): - return frozenset(self._expand(set([self.initial]))) + return frozenset(self._expand({self.initial})) def add_transition(self, src, label, dest): self.transitions.setdefault(src, {}).setdefault(label, set()).add(dest) @@ -383,7 +380,7 @@ def minimize(self): assert new_initial is not None # Apply mapping to existing transitions - new_finals = set(mapping[s] for s in final_states) + new_finals = {mapping[s] for s in final_states} for state, d in iteritems(new_trans): trans = transitions[state] for label, dest in iteritems(trans): @@ -628,7 +625,7 @@ def optional_nfa(n): # Daciuk Mihov DFA construction algorithm -class DMNode(object): +class DMNode: def __init__(self, n): self.n = n self.arcs = {} diff --git a/src/whoosh/automata/fst.py b/src/whoosh/automata/fst.py index 36f1d1b7..64451efa 100644 --- a/src/whoosh/automata/fst.py +++ b/src/whoosh/automata/fst.py @@ -88,7 +88,7 @@ class InactiveCursor(Exception): # FST Value types -class Values(object): +class Values: """Base for classes the describe how to encode and decode FST values.""" @staticmethod @@ -364,7 +364,7 @@ def to_bytes(v): # Node-like interface wrappers -class Node(object): +class Node: """A slow but easier-to-use wrapper for FSA/DAWGs. Translates the low-level arc-based interface of GraphReader into Node objects with methods to follow edges. 
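The automata/fsa.py and automata/fst.py hunks above apply two recurring,
behavior-preserving rewrites: dropping the redundant ``(object)`` base and
collapsing pass-through loops into ``yield from``. A stdlib-only sketch,
not code from the patch:

    class Old(object):  # pre-cleanup spelling
        pass

    class New:          # post-cleanup spelling; identical on Python 3
        pass

    assert Old.__bases__ == New.__bases__ == (object,)

    def flat_old(seqs):
        for seq in seqs:
            for item in seq:
                yield item

    def flat_new(seqs):
        for seq in seqs:
            yield from seq  # same iteration order, one level less nesting

    assert list(flat_old([[1, 2], [3]])) == list(flat_new([[1, 2], [3]])) == [1, 2, 3]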
@@ -391,10 +391,10 @@ def _load(self): if self.address is None: d = {} else: - d = dict( - (arc.label, Node(owner, arc.target, arc.accept)) + d = { + arc.label: Node(owner, arc.target, arc.accept) for arc in self.owner.iter_arcs(self.address) - ) + } self._edges = d def keys(self): @@ -417,8 +417,7 @@ def flatten(self, sofar=emptybytes): yield sofar for key in sorted(self): node = self.edge(key) - for result in node.flatten(sofar + key): - yield result + yield from node.flatten(sofar + key) def flatten_strings(self): return (utf8decode(k)[0] for k in self.flatten()) @@ -476,7 +475,7 @@ def edge(self, key): # Cursor -class BaseCursor(object): +class BaseCursor: """Base class for a cursor-type object for navigating an FST/word graph, represented by a :class:`GraphReader` object. @@ -525,8 +524,7 @@ def peek_key(self): key in the graph. """ - for label in self.prefix(): - yield label + yield from self.prefix() c = self.copy() while not c.stopped(): c.follow() @@ -714,8 +712,7 @@ def peek_key(self): if not self.stack: raise InactiveCursor - for label in self.prefix(): - yield label + yield from self.prefix() arc = copy.copy(self.stack[-1]) graph = self.graph while not arc.accept and arc.target is not None: @@ -821,7 +818,7 @@ def _pop_to_prefix(self, key): return i -class UncompiledNode(object): +class UncompiledNode: # Represents an "in-memory" node used by the GraphWriter before it is # written to disk. @@ -893,7 +890,7 @@ def prepend_value(self, prefix): self.value = add(prefix, self.value) -class Arc(object): +class Arc: """ Represents a directed arc between two nodes in an FSA/FST graph. @@ -933,11 +930,11 @@ def __init__( self.endpos = endpos def __repr__(self): - return "<%r-%s %s%s>" % ( + return "<{!r}-{} {}{}>".format( self.label, self.target, "." if self.accept else "", - (" %r" % self.value) if self.value else "", + f" {self.value!r}" if self.value else "", ) def __eq__(self, other): @@ -968,7 +965,7 @@ def copy(self): # Graph writer -class GraphWriter(object): +class GraphWriter: """Writes an FSA/FST graph to disk. Call ``insert(key)`` to insert keys into the graph. You must @@ -1247,7 +1244,7 @@ def _write_node(self, uncnode): # Graph reader -class BaseGraphReader(object): +class BaseGraphReader: def cursor(self, rootname=None): return Cursor(self, self.root(rootname)) @@ -1280,7 +1277,7 @@ def list_arcs(self, address): return list(arc.copy() for arc in self.iter_arcs(address)) def arc_dict(self, address): - return dict((arc.label, arc.copy()) for arc in self.iter_arcs(address)) + return {arc.label: arc.copy() for arc in self.iter_arcs(address)} def find_path(self, path, arc=None, address=None): path = to_labels(path) diff --git a/src/whoosh/automata/lev.py b/src/whoosh/automata/lev.py index 24e829c0..109fb2b0 100644 --- a/src/whoosh/automata/lev.py +++ b/src/whoosh/automata/lev.py @@ -1,5 +1,3 @@ -from __future__ import print_function - from whoosh.automata.fsa import ANY, EPSILON, NFA from whoosh.compat import range diff --git a/src/whoosh/automata/reg.py b/src/whoosh/automata/reg.py index e60ab1c0..54a5ecf6 100644 --- a/src/whoosh/automata/reg.py +++ b/src/whoosh/automata/reg.py @@ -37,7 +37,7 @@ def parse(pattern): ops = [] -class RegexBuilder(object): +class RegexBuilder: def __init__(self): self.statenum = 1 diff --git a/src/whoosh/classify.py b/src/whoosh/classify.py index beab3462..34ec7e98 100644 --- a/src/whoosh/classify.py +++ b/src/whoosh/classify.py @@ -29,7 +29,6 @@ documents. 
""" -from __future__ import division import random from collections import defaultdict @@ -40,7 +39,7 @@ # Expansion models -class ExpansionModel(object): +class ExpansionModel: def __init__(self, doc_count, field_length): self.N = doc_count self.collection_total = field_length @@ -99,7 +98,7 @@ def score(self, weight_in_top, weight_in_collection, top_total): ) -class Expander(object): +class Expander: """Uses an ExpansionModel to expand the set of query terms based on the top N result documents. """ diff --git a/src/whoosh/codec/base.py b/src/whoosh/codec/base.py index ba1da0e5..1ea2d94f 100644 --- a/src/whoosh/codec/base.py +++ b/src/whoosh/codec/base.py @@ -48,7 +48,7 @@ class OutOfOrderError(Exception): # Base classes -class Codec(object): +class Codec: length_stats = True # Per document value writer @@ -128,7 +128,7 @@ def new_segment(self, storage, indexname): # Writer classes -class PerDocumentWriter(object): +class PerDocumentWriter: @abstractmethod def start_doc(self, docnum): raise NotImplementedError @@ -165,7 +165,7 @@ def close(self): pass -class FieldWriter(object): +class FieldWriter: def add_postings(self, schema, lengths, items): # This method translates a generator of (fieldname, btext, docnum, w, v) # postings into calls to start_field(), start_term(), add(), @@ -273,7 +273,7 @@ def close(self): # Postings -class PostingsWriter(object): +class PostingsWriter: @abstractmethod def start_postings(self, format_, terminfo): raise NotImplementedError @@ -296,7 +296,7 @@ def written(self): # Reader classes -class FieldCursor(object): +class FieldCursor: def first(self): raise NotImplementedError @@ -310,7 +310,7 @@ def term(self): raise NotImplementedError -class TermsReader(object): +class TermsReader: @abstractmethod def __contains__(self, term): raise NotImplementedError @@ -360,7 +360,7 @@ def close(self): pass -class Automata(object): +class Automata: @staticmethod def levenshtein_dfa(uterm, maxdist, prefix=0): return lev.levenshtein_automaton(uterm, maxdist, prefix).to_dfa() @@ -392,7 +392,7 @@ def terms_within(self, fieldcur, uterm, maxdist, prefix=0): # Per-doc value reader -class PerDocumentReader(object): +class PerDocumentReader: def close(self): # This method is intentionally left empty. pass @@ -497,7 +497,7 @@ def all_stored_fields(self): # Segment base class -class Segment(object): +class Segment: """Do not instantiate this object directly. It is used by the Index object to hold information about a segment. A list of objects of this class are pickled as part of the TOC file. diff --git a/src/whoosh/codec/memory.py b/src/whoosh/codec/memory.py index 088a7913..5d3955cc 100644 --- a/src/whoosh/codec/memory.py +++ b/src/whoosh/codec/memory.py @@ -25,7 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
-from __future__ import with_statement from bisect import bisect_left from threading import Lock diff --git a/src/whoosh/codec/plaintext.py b/src/whoosh/codec/plaintext.py index e6b024d9..95b54b5d 100644 --- a/src/whoosh/codec/plaintext.py +++ b/src/whoosh/codec/plaintext.py @@ -54,7 +54,7 @@ class memoryview: # Mixin classes for producing and consuming the simple text format -class LineWriter(object): +class LineWriter: def _print_line(self, indent, command, **kwargs): self._dbfile.write(b(" ") * indent) self._dbfile.write(command.encode("latin1")) @@ -67,7 +67,7 @@ def _print_line(self, indent, command, **kwargs): self._dbfile.write(b("\n")) -class LineReader(object): +class LineReader: def __init__(self, dbfile): self._dbfile = dbfile @@ -222,8 +222,7 @@ def _iter_docs(self): def _iter_docfields(self, fieldname): for _ in self._iter_docs(): - for c in self._find_lines(2, "DOCFIELD", fn=fieldname): - yield c + yield from self._find_lines(2, "DOCFIELD", fn=fieldname) def _iter_lengths(self, fieldname): return (c.get("len", 0) for c in self._iter_docfields(fieldname)) diff --git a/src/whoosh/codec/whoosh2.py b/src/whoosh/codec/whoosh2.py index 0b5e4860..c146aff2 100644 --- a/src/whoosh/codec/whoosh2.py +++ b/src/whoosh/codec/whoosh2.py @@ -122,7 +122,7 @@ def crc_hash(key): # Table classes -class HashWriter(object): +class HashWriter: def __init__(self, dbfile, hashtype=2): self.dbfile = dbfile self.hashtype = hashtype @@ -217,7 +217,7 @@ def close(self): self.dbfile.close() -class HashReader(object): +class HashReader: def __init__(self, dbfile, startoffset=0): self.dbfile = dbfile self.startoffset = startoffset @@ -448,8 +448,7 @@ def _ranges_from(self, key): if pos is None: return - for x in self._ranges(pos=pos): - yield x + yield from self._ranges(pos=pos) def items_from(self, key): read = self.read @@ -813,8 +812,7 @@ def all_ids(self): block = self._read_block(nextoffset) nextoffset = block.nextoffset ids = block.read_ids() - for id in ids: - yield id + yield from ids def next(self): if self.i == self.block.count - 1: @@ -1023,8 +1021,7 @@ def _ranges_from(self, key): if pos is None: return - for x in self._ranges(pos=pos): - yield x + yield from self._ranges(pos=pos) def __getitem__(self, key): k = self.keycoder(key) @@ -1224,7 +1221,7 @@ def has_vector(self, docnum, fieldname): if self._vectors is None: try: self._prep_vectors() - except (NameError, IOError): + except (NameError, OSError): return False return (docnum, fieldname) in self._vectors @@ -1242,7 +1239,7 @@ def stored_fields(self, docnum): # Single-byte field lengths implementations -class ByteLengthsBase(object): +class ByteLengthsBase: magic = b("~LN1") def __init__(self): @@ -1435,7 +1432,7 @@ def close(self): unpack_stored_pointer = _stored_pointer_struct.unpack -class StoredFieldWriter(object): +class StoredFieldWriter: def __init__(self, dbfile): self.dbfile = dbfile self.length = 0 @@ -1484,7 +1481,7 @@ def close(self): f.close() -class StoredFieldReader(object): +class StoredFieldReader: def __init__(self, dbfile): self.dbfile = dbfile @@ -1522,9 +1519,9 @@ def __iter__(self): dbfile.seek(self.basepos) for length in lengths: vlist = loads(dbfile.read(length) + b(".")) - vdict = dict( - (names[i], vlist[i]) for i in range(len(vlist)) if vlist[i] is not None - ) + vdict = { + names[i]: vlist[i] for i in range(len(vlist)) if vlist[i] is not None + } yield vdict def __getitem__(self, num): @@ -1547,9 +1544,7 @@ def __getitem__(self, num): # Recreate a dictionary by putting the field names and values back # together 
by position. We can't just use dict(zip(...)) because we # want to filter out the None values. - vdict = dict( - (names[i], vlist[i]) for i in range(len(vlist)) if vlist[i] is not None - ) + vdict = {names[i]: vlist[i] for i in range(len(vlist)) if vlist[i] is not None} return vdict @@ -1618,7 +1613,7 @@ def deleted_docs(self): # Posting blocks -class W2Block(object): +class W2Block: magic = b("Blk3") infokeys = ( diff --git a/src/whoosh/collectors.py b/src/whoosh/collectors.py index 939b3319..a2ccb737 100644 --- a/src/whoosh/collectors.py +++ b/src/whoosh/collectors.py @@ -100,7 +100,7 @@ def ilen(iterator): # Base class -class Collector(object): +class Collector: """Base class for collectors.""" def prepare(self, top_searcher, q, context): diff --git a/src/whoosh/columns.py b/src/whoosh/columns.py index d15ec6c6..429e1ade 100644 --- a/src/whoosh/columns.py +++ b/src/whoosh/columns.py @@ -46,7 +46,6 @@ and ``reader()`` to return a ``ColumnReader`` object. """ -from __future__ import division, with_statement import struct import warnings @@ -69,7 +68,7 @@ # Base classes -class Column(object): +class Column: """Represents a "column" of rows mapping docnums to document values. The interface requires that you store the start offset of the column, the @@ -116,7 +115,7 @@ def stores_lists(self): return False -class ColumnWriter(object): +class ColumnWriter: def __init__(self, dbfile): self._dbfile = dbfile self._count = 0 @@ -136,7 +135,7 @@ def finish(self, docnum): pass -class ColumnReader(object): +class ColumnReader: def __init__(self, dbfile, basepos, length, doccount): self._dbfile = dbfile self._basepos = basepos @@ -242,7 +241,7 @@ def finish(self, doccount): # ...but if we wrote offsets, make the last byte "X" so we know if write_offsets: dbfile.write(offsets.typecode.encode("ascii")) - dbfile.write("X".encode("ascii")) + dbfile.write(b"X") class Reader(ColumnReader): def __init__(self, dbfile, basepos, length, doccount): @@ -1071,8 +1070,7 @@ def __getitem__(self, docnum): def __iter__(self): for r in self._readers: - for v in r: - yield v + yield from r class TranslatingColumnReader(ColumnReader): diff --git a/src/whoosh/compat.py b/src/whoosh/compat.py index c2b602eb..9bd790c7 100644 --- a/src/whoosh/compat.py +++ b/src/whoosh/compat.py @@ -76,7 +76,7 @@ def b(s): import io BytesIO = io.BytesIO - callable = lambda o: isinstance(o, collections.Callable) + callable = lambda o: isinstance(o, collections.abc.Callable) exec_ = eval("exec") integer_types = (int,) iteritems = lambda o: o.items() @@ -197,8 +197,7 @@ def sentinel(counter=([fillvalue] * (len(args) - 1)).pop): fillers = repeat(fillvalue) iters = [chain(it, sentinel(), fillers) for it in args] try: - for tup in izip(*iters): - yield tup + yield from izip(*iters) except IndexError: pass diff --git a/src/whoosh/externalsort.py b/src/whoosh/externalsort.py index 510441a8..46fd39b8 100644 --- a/src/whoosh/externalsort.py +++ b/src/whoosh/externalsort.py @@ -29,7 +29,6 @@ This module implements a general external merge sort for Python objects. """ -from __future__ import with_statement import os import tempfile @@ -84,7 +83,7 @@ def imerge(iterables): return -class SortingPool(object): +class SortingPool: """This object implements a general K-way external merge sort for Python objects. 
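For context on the ``SortingPool`` docstring above: a K-way merge lazily interleaves several already-sorted runs into one sorted stream. whoosh's ``imerge()`` plays that role internally, and ``_merge_runs()`` just below now delegates to it with ``yield from``; the stdlib ``heapq.merge`` is a rough equivalent. A small sketch with made-up runs, not from the patch:

    import heapq

    runs = [[1, 4, 9], [2, 3, 8], [5, 6, 7]]   # sorted runs, e.g. read back from temp files
    merged = list(heapq.merge(*runs))           # lazy K-way merge
    assert merged == sorted(x for run in runs for x in run)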
@@ -148,8 +147,7 @@ def _read_run(self, path): def _merge_runs(self, paths): iters = [self._read_run(path) for path in paths] - for item in imerge(iters): - yield item + yield from imerge(iters) def add(self, item): """Adds `item` to the pool to be sorted.""" diff --git a/src/whoosh/fields.py b/src/whoosh/fields.py index 763a5a02..ef585192 100644 --- a/src/whoosh/fields.py +++ b/src/whoosh/fields.py @@ -58,7 +58,7 @@ class UnknownFieldError(Exception): # Field Types -class FieldType(object): +class FieldType: """ Represents a field configuration. @@ -133,7 +133,7 @@ def __init__( self.vector = None def __repr__(self): - return "%s(format=%r, scorable=%s, stored=%s, unique=%s)" % ( + return "{}(format={!r}, scorable={}, stored={}, unique={})".format( self.__class__.__name__, self.format, self.scorable, @@ -689,8 +689,7 @@ def index(self, num, **kwargs): # If the user gave us a list of numbers, recurse on the list if isinstance(num, (list, tuple)): for n in num: - for item in self.index(n): - yield item + yield from self.index(n) return # word, freq, weight, valuestring @@ -837,7 +836,7 @@ def __init__(self, stored=False, unique=False, sortable=False): :param unique: Whether the value of this field is unique per-document. """ - super(DATETIME, self).__init__( + super().__init__( int, 64, stored=stored, unique=unique, shift_step=8, sortable=sortable ) @@ -1356,7 +1355,7 @@ def subfields(self): class MetaSchema(type): def __new__(cls, name, bases, attrs): - super_new = super(MetaSchema, cls).__new__ + super_new = super().__new__ if not any(b for b in bases if isinstance(b, MetaSchema)): # If this isn't a subclass of MetaSchema, don't do anything special return super_new(cls, name, bases, attrs) @@ -1380,7 +1379,7 @@ def schema(self): return Schema(**self._clsfields) -class Schema(object): +class Schema: """ Represents the collection of fields in an index. Maps field names to FieldType objects which define the behavior of each field. diff --git a/src/whoosh/filedb/compound.py b/src/whoosh/filedb/compound.py index 3840ddab..7f1e2793 100644 --- a/src/whoosh/filedb/compound.py +++ b/src/whoosh/filedb/compound.py @@ -73,7 +73,7 @@ def __init__(self, dbfile, use_mmap=True, basepos=0): try: fileno = self._file.fileno() self._source = mmap.mmap(fileno, 0, access=mmap.ACCESS_READ) - except (mmap.error, OSError): + except OSError: e = sys.exc_info()[1] # If we got an error because there wasn't enough memory to # open the map, ignore it and fall through, we'll just use the @@ -185,7 +185,7 @@ def write_dir(dbfile, basepos, directory, options=None): dbfile.close() -class SubFile(object): +class SubFile: def __init__(self, parentfile, offset, length, name=None): self._file = parentfile self._offset = offset @@ -247,7 +247,7 @@ def tell(self): return self._pos -class CompoundWriter(object): +class CompoundWriter: def __init__(self, tempstorage, buffersize=32 * 1024): assert isinstance(buffersize, int) self._tempstorage = tempstorage @@ -298,7 +298,7 @@ def save_as_files(self, storage, name_fn): f.write(block) f.close() - class SubStream(object): + class SubStream: def __init__(self, dbfile, buffersize): self._dbfile = dbfile self._buffersize = buffersize diff --git a/src/whoosh/filedb/fileindex.py b/src/whoosh/filedb/fileindex.py index cd1e9160..9b3e9985 100644 --- a/src/whoosh/filedb/fileindex.py +++ b/src/whoosh/filedb/fileindex.py @@ -42,7 +42,7 @@ # well as Index for convenience, so they're broken out here. 
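The ``super()`` conversions throughout this patch (``DATETIME`` and ``MetaSchema`` in fields.py above, and many more below) rely on Python 3's zero-argument form, which the compiler resolves from the enclosing class cell, so it binds exactly the same method as the explicit two-argument call. A minimal sketch, not from whoosh:

    class Base:
        def __init__(self, x):
            self.x = x

    class Child(Base):
        def __init__(self, x):
            # Zero-argument form; the compiler supplies __class__ and self,
            # so this resolves exactly like super(Child, self).__init__(x).
            super().__init__(x)

    assert Child(3).x == 3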
-class SegmentDeletionMixin(object): +class SegmentDeletionMixin: """Mix-in for classes that support deleting documents from self.segments.""" def delete_document(self, docnum, delete=True): @@ -272,7 +272,7 @@ def _clean_files(self): # probably be deleted eventually by a later call to clean_files. storage = self.storage - current_segment_names = set(s.name for s in self.segments) + current_segment_names = {s.name for s in self.segments} tocpattern = _toc_pattern(self.indexname) segpattern = _segment_pattern(self.indexname) @@ -317,7 +317,7 @@ def writer(self, **kwargs): # SegmentSet object -class SegmentSet(object): +class SegmentSet: """This class is never instantiated by the user. It is used by the Index object to keep track of the segments in the index. """ @@ -450,7 +450,7 @@ def reader(self, storage, schema): return MultiReader(readers, schema) -class Segment(object): +class Segment: """Do not instantiate this object directly. It is used by the Index object to hold information about a segment. A list of objects of this class are pickled as part of the TOC file. diff --git a/src/whoosh/filedb/filepostings.py b/src/whoosh/filedb/filepostings.py index fbd0c0a0..abbae1b5 100644 --- a/src/whoosh/filedb/filepostings.py +++ b/src/whoosh/filedb/filepostings.py @@ -25,7 +25,7 @@ from whoosh.writing import PostingWriter -class BlockInfo(object): +class BlockInfo: __slots__ = ( "nextoffset", "postcount", @@ -318,8 +318,7 @@ def all_ids(self): blockinfo = self._read_blockinfo(nextoffset) nextoffset = blockinfo.nextoffset ids, __ = self._read_ids(blockinfo.dataoffset, blockinfo.postcount) - for id in ids: - yield id + yield from ids def next(self): if self.i == self.blockinfo.postcount - 1: diff --git a/src/whoosh/filedb/filereading.py b/src/whoosh/filedb/filereading.py index f7e8bf43..8b3ea6e4 100644 --- a/src/whoosh/filedb/filereading.py +++ b/src/whoosh/filedb/filereading.py @@ -66,7 +66,7 @@ def decode_storedfields(value): # Field length file scorables = schema.scorable_fields() if scorables: - self.indices = dict((fieldnum, i) for i, fieldnum in enumerate(scorables)) + self.indices = {fieldnum: i for i, fieldnum in enumerate(scorables)} lengthcount = segment.doc_count_all() * len(self.indices) flf = storage.open_file(segment.fieldlengths_filename) self.fieldlengths = flf.read_array("B", lengthcount) diff --git a/src/whoosh/filedb/filestore.py b/src/whoosh/filedb/filestore.py index 4a793e35..d142e898 100644 --- a/src/whoosh/filedb/filestore.py +++ b/src/whoosh/filedb/filestore.py @@ -25,7 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import with_statement import errno import os @@ -53,7 +52,7 @@ class ReadOnlyError(StorageError): # Base class -class Storage(object): +class Storage: """Abstract base class for storage objects. 
A storage object is a virtual flat filesystem, allowing the creation and @@ -466,7 +465,7 @@ def destroy(self): try: # Try to remove the directory os.rmdir(self.folder) - except IOError: + except OSError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: pass @@ -530,7 +529,7 @@ def clean(self, ignore=False): def list(self): try: files = os.listdir(self.folder) - except IOError: + except OSError: files = [] return files diff --git a/src/whoosh/filedb/filetables.py b/src/whoosh/filedb/filetables.py index 53c4d4d6..4299cc0c 100644 --- a/src/whoosh/filedb/filetables.py +++ b/src/whoosh/filedb/filetables.py @@ -85,7 +85,7 @@ def crc_hash(key): # Basic hash file -class HashWriter(object): +class HashWriter: """Implements a fast on-disk key-value store. This hash uses a two-level hashing scheme, where a key is hashed, the low eight bits of the hash value are used to index into one of 256 hash tables. This is basically the CDB @@ -219,7 +219,7 @@ def close(self): return endpos -class HashReader(object): +class HashReader: """Reader for the fast on-disk key-value files created by :class:`HashWriter`. """ @@ -496,8 +496,7 @@ def ranges_from(self, key): if pos is None: return - for item in self._ranges(pos=pos): - yield item + yield from self._ranges(pos=pos) def keys_from(self, key): """Yields an ordered series of keys equal to or greater than the given @@ -724,8 +723,7 @@ def term_ranges_from(self, fieldname, btext): return startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname] - for item in self._ranges(pos, ixpos): - yield item + yield from self._ranges(pos, ixpos) def terms_from(self, fieldname, btext): dbfile = self.dbfile diff --git a/src/whoosh/filedb/gae.py b/src/whoosh/filedb/gae.py index c79b400c..7ffca5ea 100644 --- a/src/whoosh/filedb/gae.py +++ b/src/whoosh/filedb/gae.py @@ -37,7 +37,7 @@ class DatastoreFile(db.Model): mtime = db.IntegerProperty(default=0) def __init__(self, *args, **kwargs): - super(DatastoreFile, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self.data = BytesIO() @classmethod @@ -78,7 +78,7 @@ def getvalue(self): return self.data.getvalue() -class MemcacheLock(object): +class MemcacheLock: def __init__(self, name): self.name = name diff --git a/src/whoosh/filedb/pools.py b/src/whoosh/filedb/pools.py index 3ca83768..fee6f3d0 100644 --- a/src/whoosh/filedb/pools.py +++ b/src/whoosh/filedb/pools.py @@ -198,7 +198,7 @@ def write_postings(schema, termtable, postwriter, postiter): ) -class LengthSpool(object): +class LengthSpool: def __init__(self, filename): self.filename = filename self.file = None @@ -224,7 +224,7 @@ def readback(self): f.close() -class PoolBase(object): +class PoolBase: def __init__(self, dir): self._dir = dir self._fieldlength_totals = defaultdict(int) diff --git a/src/whoosh/filedb/structfile.py b/src/whoosh/filedb/structfile.py index 84ffa39d..9db58ffd 100644 --- a/src/whoosh/filedb/structfile.py +++ b/src/whoosh/filedb/structfile.py @@ -61,7 +61,7 @@ ) from whoosh.util.varints import decode_signed_varint, read_varint, signed_varint, varint -_SIZEMAP = dict((typecode, calcsize(typecode)) for typecode in "bBiIhHqQf") +_SIZEMAP = {typecode: calcsize(typecode) for typecode in "bBiIhHqQf"} _ORDERMAP = {"little": "<", "big": ">"} _types = (("sbyte", "b"), ("ushort", "H"), ("int", "i"), ("long", "q"), ("float", "f")) @@ -70,7 +70,7 @@ # Main function -class StructFile(object): +class StructFile: """Returns a "structured file" object that wraps the given file object and provides numerous additional methods for writing structured 
data, such as "write_varint" and "write_long". diff --git a/src/whoosh/formats.py b/src/whoosh/formats.py index 23162aed..ef36f195 100644 --- a/src/whoosh/formats.py +++ b/src/whoosh/formats.py @@ -48,7 +48,7 @@ # Format base class -class Format(object): +class Format: """Abstract base class representing a storage format for a field or vector. Format objects are responsible for writing and reading the low-level representation of a field. It controls what kind/level of information to @@ -152,7 +152,7 @@ def __init__(self, field_boost=1.0, **options): def word_values(self, value, analyzer, **kwargs): fb = self.field_boost - wordset = set(t.text for t in tokens(value, analyzer, kwargs)) + wordset = {t.text for t in tokens(value, analyzer, kwargs)} return ((w, 1, fb, emptybytes) for w in wordset) def encode(self, value): diff --git a/src/whoosh/highlight.py b/src/whoosh/highlight.py index 254ae3d2..562c68c1 100644 --- a/src/whoosh/highlight.py +++ b/src/whoosh/highlight.py @@ -48,7 +48,6 @@ See :doc:`/highlight` for more information. """ -from __future__ import division from collections import deque from heapq import nlargest @@ -80,7 +79,7 @@ def mkfrag(text, tokens, startchar=None, endchar=None, charsbefore=0, charsafter return Fragment(text, tokens, startchar, endchar) -class Fragment(object): +class Fragment: """Represents a fragment (extract) from a hit document. This object is mainly used to keep track of the start and end points of the fragment and the "matched" character ranges inside; it does not contain the text of the @@ -194,7 +193,7 @@ def set_matched_filter_phrases(tokens, text, terms, phrases): """ Implementation note: Because the Token object follows a Singleton pattern, we can only read each one once. Because phrase matching requires rescanning, - we require a rendered token list (the text parameter) instead. The function must + we require a rendered token list (the text parameter) instead. The function must still yield Token objects at the end, so the text list is used as a way to build a list of Token indices (the matches set). The yield loop at the end uses this to properly set .matched on the yielded Token objects. @@ -274,7 +273,7 @@ def set_matched_filter_phrases(tokens, text, terms, phrases): # Fragmenters -class Fragmenter(object): +class Fragmenter: def must_retokenize(self): """Returns True if this fragmenter requires retokenized text. @@ -605,7 +604,7 @@ def fragment_matches(self, text, tokens): # Fragment scorers -class FragmentScorer(object): +class FragmentScorer: pass @@ -662,7 +661,7 @@ def get_text(original, token, replace): return original[token.startchar : token.endchar] -class Formatter(object): +class Formatter: """Base class for formatters. For highlighters that return strings, it is usually only necessary to @@ -959,7 +958,7 @@ def highlight( return formatter(text, fragments) -class Highlighter(object): +class Highlighter: def __init__( self, fragmenter=None, diff --git a/src/whoosh/idsets.py b/src/whoosh/idsets.py index d6c2c1bd..36f4bfc8 100644 --- a/src/whoosh/idsets.py +++ b/src/whoosh/idsets.py @@ -273,7 +273,7 @@ ) -class DocIdSet(object): +class DocIdSet: """Base class for a set of positive integers, implementing a subset of the built-in ``set`` type's interface with extra docid-related methods. 
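The comprehension rewrites above (``wordset`` in formats.py, ``_SIZEMAP`` in structfile.py) are pure notation changes: a ``dict()`` or ``set()`` call over a generator expression builds the same object as the literal comprehension, just with an extra name lookup and call. A quick check, not from the patch:

    pairs = [("a", 1), ("b", 2)]
    d_old = dict((k, v) for k, v in pairs)   # generator-expression idiom on the removed lines
    d_new = {k: v for k, v in pairs}         # comprehension on the added lines
    assert d_old == d_new

    s_old = set(ch for ch in "hello")
    s_new = {ch for ch in "hello"}
    assert s_old == s_new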
@@ -765,10 +765,10 @@ def difference_update(self, other): self.data = array(self.typecode, (num for num in self if num not in other)) def intersection(self, other): - return SortedIntSet((num for num in self if num in other)) + return SortedIntSet(num for num in self if num in other) def difference(self, other): - return SortedIntSet((num for num in self if num not in other)) + return SortedIntSet(num for num in self if num not in other) def first(self): return self.data[0] diff --git a/src/whoosh/index.py b/src/whoosh/index.py index c2966d4b..eac6c603 100644 --- a/src/whoosh/index.py +++ b/src/whoosh/index.py @@ -29,7 +29,6 @@ an index. """ -from __future__ import division import os.path import re @@ -217,7 +216,7 @@ def version(storage, indexname=None): # Index base class -class Index(object): +class Index: """Represents an indexed collection of documents.""" def close(self): @@ -377,7 +376,7 @@ def clean_files(storage, indexname, gen, segments): # open, they may not be deleted immediately (i.e. on Windows) but will # probably be deleted eventually by a later call to clean_files. - current_segment_names = set(s.segment_id() for s in segments) + current_segment_names = {s.segment_id() for s in segments} tocpattern = TOC._pattern(indexname) segpattern = TOC._segment_pattern(indexname) @@ -516,9 +515,7 @@ def _reader(cls, storage, schema, segments, generation, reuse=None): if reuse: # Put all atomic readers in a dictionary readers = [r for r, _ in reuse.leaf_readers()] - reusable = dict( - (r.segment(), r) for r in readers if r.segment() is not None - ) + reusable = {r.segment(): r for r in readers if r.segment() is not None} # Make a function to open readers, which reuses reusable readers. # It removes any readers it reuses from the "reusable" dictionary, @@ -561,7 +558,7 @@ def reader(self, reuse=None): info.generation, reuse=reuse, ) - except IOError: + except OSError: # Presume that we got a "file not found error" because a writer # deleted one of the files just as we were trying to open it, # and so retry a few times before actually raising the @@ -576,7 +573,7 @@ def reader(self, reuse=None): # TOC class -class TOC(object): +class TOC: """Object representing the state of the index after a commit. Essentially a container for the index's schema and the list of segment objects. """ diff --git a/src/whoosh/lang/__init__.py b/src/whoosh/lang/__init__.py index b4cff58c..1d4924cd 100644 --- a/src/whoosh/lang/__init__.py +++ b/src/whoosh/lang/__init__.py @@ -1,5 +1,3 @@ -# coding=utf-8 - # Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/src/whoosh/lang/dmetaphone.py b/src/whoosh/lang/dmetaphone.py index b7bb23ce..d6b02b1d 100644 --- a/src/whoosh/lang/dmetaphone.py +++ b/src/whoosh/lang/dmetaphone.py @@ -1,5 +1,3 @@ -# coding= utf-8 - # This script implements the Double Metaphone algorythm (c) 1998, 1999 by # Lawrence Philips. It was translated to Python from the C source written by # Kevin Atkinson (http://aspell.net/metaphone/) By Andrew Collins - January 12, diff --git a/src/whoosh/lang/isri.py b/src/whoosh/lang/isri.py index 0e79d15c..97d32328 100644 --- a/src/whoosh/lang/isri.py +++ b/src/whoosh/lang/isri.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Natural Language Toolkit: The ISRI Arabic Stemmer # @@ -34,12 +33,11 @@ root. 
""" -from __future__ import unicode_literals import re -class ISRIStemmer(object): +class ISRIStemmer: """ ISRI Arabic stemmer based on algorithm: Arabic Stemming without a root dictionary. Information Science Research Institute. University of Nevada, Las Vegas, USA. diff --git a/src/whoosh/lang/morph_en.py b/src/whoosh/lang/morph_en.py index 29e9b58c..ff8221f5 100644 --- a/src/whoosh/lang/morph_en.py +++ b/src/whoosh/lang/morph_en.py @@ -611,7 +611,7 @@ class of Sun's `Minion search engine `_. # Words ending in S # (e.g., happiness, business) ( - r"[%s].*[%s](iness)" % (vowels, cons), + rf"[{vowels}].*[{cons}](iness)", "y,ies,ier,iers,iest,ied,ying,yings,ily,inesses,iment,iments,iless,iful", ), # (e.g., baseless, shoeless) @@ -621,7 +621,7 @@ class of Sun's `Minion search engine `_. ), # (e.g., gutless, hatless, spotless) ( - r"[%s][%s][bdgklmnprt]?(less)" % (cons, vowels), + rf"[{cons}][{vowels}][bdgklmnprt]?(less)", ",s,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,ful", ), # (e.g., thoughtless, worthless) @@ -661,12 +661,12 @@ class of Sun's `Minion search engine `_. ), # (e.g., tokenizes) // adds British variations ( - r"[%s].*[%s](izes)" % (vowels, cons), + rf"[{vowels}].*[{cons}](izes)", "ize,izes,izer,izers,ized,izing,izings,ization,izations,ise,iser,isers,ised,ising,isings,isation,isations", ), # (e.g., tokenises) // British variant // ~expertise ( - r"[%s].*[%s](ises)" % (vowels, cons), + rf"[{vowels}].*[{cons}](ises)", "ize,izes,izer,izers,ized,izing,izings,ization,izations,ise,iser,isers,ised,ising,isings,isation,isations", ), # (e.g., aches, arches) @@ -688,15 +688,15 @@ class of Sun's `Minion search engine `_. # (e.g., judgments, abridgments) (r"[%s].*dg(ments)" % vowels, "ment,*ments"), # (e.g., merriments, embodiments) -iment in turn will generate y and *y (redo y) - (r"[%s].*[%s]iment(s)" % (vowels, cons), ",*"), + (rf"[{vowels}].*[{cons}]iment(s)", ",*"), # (e.g., atonements, entrapments) (r"[%s].*ment(s)" % vowels, ",*"), # (e.g., viewers, meters, traders, transfers) (r"[%s].*er(s)" % vowels, ",*"), # (e.g., unflags) polysyllables - (r"[%s].*[%s][%s][bdglmnprt](s)" % (vowels, cons, vowels), ",*"), + (rf"[{vowels}].*[{cons}][{vowels}][bdglmnprt](s)", ",*"), # (e.g., frogs) monosyllables - (r"[%s][%s][bdglmnprt](s)" % (vowels, cons), ",*"), + (rf"[{vowels}][{cons}][bdglmnprt](s)", ",*"), # (e.g., killings, muggings) (r"[%s].*ing(s)" % vowels, ",*"), # (e.g., hulls, tolls) @@ -704,16 +704,16 @@ class of Sun's `Minion search engine `_. # e.g., boas, polkas, spas) don't generate latin endings (r"a(s)", ",er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), # (e.g., beads, toads) - (r"[%s].*[%s].*(s)" % (vowels, cons), ",*"), + (rf"[{vowels}].*[{cons}].*(s)", ",*"), # (e.g., boas, zoos) ( - r"[%s].*[%s](s)" % (cons, vowels), + rf"[{cons}].*[{vowels}](s)", ",er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful", ), # (e.g., ss, sss, ssss) no vowel (vowel case is already handled above) (r"ss()", ""), # (e.g., cds, lcds, m-16s) no vowel (can be a plural noun, but not verb) - (r"[%s].*[%s1234567890](s)" % (cons, cons), ""), + (rf"[{cons}].*[{cons}1234567890](s)", ""), # Words ending in E # (e.g., apple, so it doesn't include apply) (r"appl(e)", "es,er,ers,est,ed,ing,ings,ely,eness,enesses,ement,ements,eless,eful"), @@ -724,7 +724,7 @@ class of Sun's `Minion search engine `_. 
), # (e.g., able, abominable, fungible, table, enable, idle, subtle) ( - r"[%s].*[%s]l(e)" % (vowels, cons), + rf"[{vowels}].*[{cons}]l(e)", "es,er,ers,est,ed,ing,ings,y,ely,eness,enesses,ement,ements,eless,eful", ), # (e.g., bookie, magpie, vie) @@ -746,17 +746,17 @@ class of Sun's `Minion search engine `_. ), # (e.g., tokenize) // adds British variations ( - r"[%s].*[%s](ize)" % (vowels, cons), + rf"[{vowels}].*[{cons}](ize)", "izes,izer,izers,ized,izing,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations", ), # (e.g., tokenise) // British variant // ~expertise ( - r"[%s].*[%s](ise)" % (vowels, cons), + rf"[{vowels}].*[{cons}](ise)", "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,iser,isers,ised,ising,isings,isation,isations", ), # (e.g., tree, agree, rage, horse, hoarse) ( - r"[%s].*[%s](e)" % (vowels, cons), + rf"[{vowels}].*[{cons}](e)", "es,er,ers,est,ed,ing,ings,eing,eings,ely,eness,enesses,ement,ements,eless,eful", ), # Words ending in -ED @@ -774,7 +774,7 @@ class of Sun's `Minion search engine `_. ), # (e.g., controlled, fulfilled, rebelled) ( - r"[%s].*[%s].*l(led)" % (vowels, cons), + rf"[{vowels}].*[{cons}].*l(led)", ",s,er,ers,est,ing,ings,ly,ness,nesses,ment,ments,less,ful,&,&s,&er,&ers,&est,&ing,&ings,&y,&ness,&nesses,&ment,&ments,&ful", ), # (e.g., pulled, filled, fulled) @@ -794,12 +794,12 @@ class of Sun's `Minion search engine `_. ), # (e.g., tokenize) // adds British variations ( - r"[%s].*[%s](ized)" % (vowels, cons), + rf"[{vowels}].*[{cons}](ized)", "izes,izer,izers,ize,izing,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations", ), # (e.g., tokenise) // British variant // ~expertise ( - r"[%s].*[%s](ized)" % (vowels, cons), + rf"[{vowels}].*[{cons}](ized)", "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,iser,isers,ise,ising,isings,isation,isations", ), # (e.g., spoiled, tooled, tracked, roasted, atoned, abridged) @@ -819,7 +819,7 @@ class of Sun's `Minion search engine `_. ), # (e.g., acidifier, saltier) ( - r"[%s].*[%s](ier)" % (vowels, cons), + rf"[{vowels}].*[{cons}](ier)", "y,ie,ies,iest,ied,ying,yings,ily,yly,iness,yness,inesses,ynesses,yment,yments,yless,yful,iment,iments,iless,iful,iers,iered,iering,ierings,ierly,ierness,iernesses,ierment,ierments,ierless,ierful,ierer,ierers,ierest", ), # (e.g., puller, filler, fuller) @@ -834,17 +834,17 @@ class of Sun's `Minion search engine `_. ), # (e.g., bigger, trekker, hitter) ( - r"[%s][%s](?P<er1>[bdgkmnprt])((?P=er1)er)" % (cons, vowels), + rf"[{cons}][{vowels}](?P<er1>[bdgkmnprt])((?P=er1)er)", "s,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful,&ers,&ered,&ering,&erings,&erly,&erness,&ernesses,&erments,&erless,&erful", ), # (e.g., tokenize) // adds British variations ( - r"[%s].*[%s](izer)" % (vowels, cons), + rf"[{vowels}].*[{cons}](izer)", "izes,ize,izers,ized,izing,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations", ), # (e.g., tokenise) // British variant // ~expertise ( - r"[%s].*[%s](iser)" % (vowels, cons), + rf"[{vowels}].*[{cons}](iser)", "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,ise,isers,ised,ising,isings,isation,isations", ), # (e.g., actioner, atoner, icer, trader, accruer, churchgoer, prefer) @@ -870,7 +870,7 @@ class of Sun's `Minion search engine `_.
), # (e.g., biggest) ( - r"[%s][%s](?P<est1>[bdglmnprst])((?P=est1)est)" % (cons, vowels), + rf"[{cons}][{vowels}](?P<est1>[bdglmnprst])((?P=est1)est)", ",s,&er,&ers,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful", ), # (e.g., basest, archest, rashest) @@ -892,7 +892,7 @@ class of Sun's `Minion search engine `_. (r"est", "s,er,ers,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), # Words ending in -FUL # (e.g., beautiful, plentiful) - (r"[%s].*[%s](iful)" % (vowels, cons), "ifully,ifulness,*y"), + (rf"[{vowels}].*[{cons}](iful)", "ifully,ifulness,*y"), # (e.g., hopeful, sorrowful) (r"[%s].*(ful)" % vowels, "fully,fulness,,*"), # Words ending in -ICAL @@ -917,7 +917,7 @@ class of Sun's `Minion search engine `_. ), # (e.g., hugging, trekking) ( - r"[%s][%s](?P<ing1>[bdgklmnprt])((?P=ing1)ing)" % (cons, vowels), + rf"[{cons}][{vowels}](?P<ing1>[bdgklmnprt])((?P=ing1)ing)", ",s,&er,&ers,&est,&ed,&ings,ly,ness,nesses,ment,ments,less,ful", ), # (e.g., freeing, agreeing) @@ -937,22 +937,22 @@ class of Sun's `Minion search engine `_. ), # (e.g., editing, crediting, expediting, siting, exciting) ( - r"[%s].*[%s][eio]t(ing)" % (vowels, cons), + rf"[{vowels}].*[{cons}][eio]t(ing)", ",*,*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful", ), # (e.g., robing, siding, doling, translating, flaking) ( - r"[%s][%s][bdgklmt](ing)" % (cons, vowels), + rf"[{cons}][{vowels}][bdgklmt](ing)", "*e,ings,inger,ingers,ingest,inged,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful", ), # (e.g., tokenize) // adds British variations ( - r"[%s].*[%s](izing)" % (vowels, cons), + rf"[{vowels}].*[{cons}](izing)", "izes,izer,izers,ized,ize,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations", ), # (e.g., tokenise) // British variant // ~expertise ( - r"[%s].*[%s](ising)" % (vowels, cons), + rf"[{vowels}].*[{cons}](ising)", "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,iser,isers,ised,ise,isings,isation,isations", ), # (e.g., icing, aging, achieving, amazing, housing) @@ -972,7 +972,7 @@ class of Sun's `Minion search engine `_. ), # (e.g., farming, harping, interesting, bedspring, redwing) ( - r"[%s].*[%s][bdfjkmnpqrtwxz](ing)" % (vowels, cons), + rf"[{vowels}].*[{cons}][bdfjkmnpqrtwxz](ing)", ",*,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful", ), # (e.g., spoiling, reviling, autoing, egging, hanging, hingeing) @@ -998,9 +998,9 @@ class of Sun's `Minion search engine `_. # (e.g., judgment, abridgment) (r"[%s].*dg(ment)" % vowels, "*e"), # (e.g., merriment, embodiment) - (r"[%s].*[%s](iment)" % (vowels, cons), "*y"), + (rf"[{vowels}].*[{cons}](iment)", "*y"), # (e.g., atonement, entrapment) - (r"[%s].*[%s](ment)" % (vowels, cons), ",*"), + (rf"[{vowels}].*[{cons}](ment)", ",*"), # Words ending in -O # (e.g., taboo, rodeo) ( @@ -1026,7 +1026,7 @@ class of Sun's `Minion search engine `_. ), # (e.g., happily, dizzily) ( - r"[%s].*[%s](ily)" % (vowels, cons), + rf"[{vowels}].*[{cons}](ily)", "y,ies,ier,iers,iest,ied,ying,yings,yness,iness,ynesses,inesses,iment,iments,iless,iful", ), # (e.g., peaceful+ly) @@ -1064,24 +1064,24 @@ class of Sun's `Minion search engine `_.
), # (e.g., unflag, open, besot) ( - r"[%s].*[%s][%s][bdglmnprt]()" % (vowels, cons, vowels), + rf"[{vowels}].*[{cons}][{vowels}][bdglmnprt]()", "s,er,ers,est,ed,ing,ings,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful", ), # (e.g., bed, cop) ( - r"[%s][%s][bdglmnprt]()" % (cons, vowels), + rf"[{cons}][{vowels}][bdglmnprt]()", "s,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful", ), # (e.g., schemata, automata) ( - r"[%s].*[%s][%s]ma(ta)" % (vowels, cons, vowels), + rf"[{vowels}].*[{cons}][{vowels}]ma(ta)", ",s,tas,tum,tums,ton,tons,tic,tical", ), # (e.g., chordata, data, errata, sonata, toccata) (r"[%s].*t(a)" % vowels, "as,ae,um,ums,on,ons,ic,ical"), # (e.g., polka, spa, schema, ova, polyhedra) ( - r"[%s].*[%s](a)" % (vowels, cons), + rf"[{vowels}].*[{cons}](a)", "as,aed,aing,ae,ata,um,ums,on,ons,al,atic,atical", ), # (e.g., full) @@ -1135,7 +1135,7 @@ def variations(word): ending = groups[-1] root = word[: 0 - len(ending)] if ending else word - out = set((word,)) + out = {word} results = rules[i * _partition_size + num][1] for result in results.split(","): if result.startswith("&"): diff --git a/src/whoosh/lang/paicehusk.py b/src/whoosh/lang/paicehusk.py index d3631cff..1540318f 100644 --- a/src/whoosh/lang/paicehusk.py +++ b/src/whoosh/lang/paicehusk.py @@ -18,7 +18,7 @@ from collections import defaultdict -class PaiceHuskStemmer(object): +class PaiceHuskStemmer: """Implements the Paice-Husk stemming algorithm.""" rule_expr = re.compile( diff --git a/src/whoosh/lang/phonetic.py b/src/whoosh/lang/phonetic.py index 4a760ec7..fcec3c7c 100644 --- a/src/whoosh/lang/phonetic.py +++ b/src/whoosh/lang/phonetic.py @@ -1,5 +1,3 @@ -# encoding: utf-8 - """ This module contains quasi-phonetic encoders for words in different languages. """ diff --git a/src/whoosh/lang/snowball/bases.py b/src/whoosh/lang/snowball/bases.py index 776e94aa..864d8a07 100644 --- a/src/whoosh/lang/snowball/bases.py +++ b/src/whoosh/lang/snowball/bases.py @@ -1,7 +1,7 @@ # Base classes -class _ScandinavianStemmer(object): +class _ScandinavianStemmer: """ This subclass encapsulates a method for defining the string region R1. @@ -44,7 +44,7 @@ def _r1_scandinavian(self, word, vowels): return r1 -class _StandardStemmer(object): +class _StandardStemmer: """ This subclass encapsulates two methods for defining the standard versions of the string regions R1, R2, and RV. diff --git a/src/whoosh/lang/snowball/hungarian.py b/src/whoosh/lang/snowball/hungarian.py index 05597c5d..b3050721 100644 --- a/src/whoosh/lang/snowball/hungarian.py +++ b/src/whoosh/lang/snowball/hungarian.py @@ -1,7 +1,7 @@ from whoosh.compat import u -class HungarianStemmer(object): +class HungarianStemmer: """ The Hungarian Snowball stemmer. diff --git a/src/whoosh/lang/snowball/russian.py b/src/whoosh/lang/snowball/russian.py index dc4a825e..76e0ccb7 100644 --- a/src/whoosh/lang/snowball/russian.py +++ b/src/whoosh/lang/snowball/russian.py @@ -1,7 +1,7 @@ from whoosh.compat import u -class RussianStemmer(object): +class RussianStemmer: """ The Russian Snowball stemmer. diff --git a/src/whoosh/lang/stopwords.py b/src/whoosh/lang/stopwords.py index fab7b61c..1bb67370 100644 --- a/src/whoosh/lang/stopwords.py +++ b/src/whoosh/lang/stopwords.py @@ -1,7 +1,3 @@ -# coding=utf-8 - -from __future__ import unicode_literals - # Stopwords Corpus # # This module contains lists of stop words for several languages. 
These diff --git a/src/whoosh/lang/wordnet.py b/src/whoosh/lang/wordnet.py index bf859cc1..69fbffb3 100644 --- a/src/whoosh/lang/wordnet.py +++ b/src/whoosh/lang/wordnet.py @@ -97,7 +97,7 @@ def synonyms(word2nums, num2words, word): return sorted(syns) -class Thesaurus(object): +class Thesaurus: """Represents the WordNet synonym database, either loaded into memory from the wn_s.pl Prolog file, or stored on disk in a Whoosh index. diff --git a/src/whoosh/matching/binary.py b/src/whoosh/matching/binary.py index f15cabab..43eee663 100644 --- a/src/whoosh/matching/binary.py +++ b/src/whoosh/matching/binary.py @@ -34,7 +34,7 @@ class BiMatcher(mcore.Matcher): """ def __init__(self, a, b): - super(BiMatcher, self).__init__() + super().__init__() self.a = a self.b = b @@ -307,7 +307,7 @@ class DisjunctionMaxMatcher(UnionMatcher): # inheritance. def __init__(self, a, b, tiebreak=0.0): - super(DisjunctionMaxMatcher, self).__init__(a, b) + super().__init__(a, b) self.tiebreak = tiebreak def copy(self): @@ -406,7 +406,7 @@ class IntersectionMatcher(AdditiveBiMatcher): """Matches the intersection (AND) of the postings in the two sub-matchers.""" def __init__(self, a, b): - super(IntersectionMatcher, self).__init__(a, b) + super().__init__(a, b) self._find_first() def reset(self): @@ -562,7 +562,7 @@ class AndNotMatcher(BiMatcher): """ def __init__(self, a, b): - super(AndNotMatcher, self).__init__(a, b) + super().__init__(a, b) self._find_first() def reset(self): diff --git a/src/whoosh/matching/combo.py b/src/whoosh/matching/combo.py index 64cdb43d..63adcc37 100644 --- a/src/whoosh/matching/combo.py +++ b/src/whoosh/matching/combo.py @@ -25,7 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import division from array import array diff --git a/src/whoosh/matching/mcore.py b/src/whoosh/matching/mcore.py index a13a0d6e..6e8112b3 100644 --- a/src/whoosh/matching/mcore.py +++ b/src/whoosh/matching/mcore.py @@ -72,7 +72,7 @@ class NoQualityAvailable(Exception): # Classes -class Matcher(object): +class Matcher: """Base class for all matchers.""" @abstractmethod @@ -108,8 +108,7 @@ def term_matchers(self): yield self else: for cm in self.children(): - for m in cm.term_matchers(): - yield m + yield from cm.term_matchers() def matching_terms(self, id=None): """Returns an iterator of ``("fieldname", "termtext")`` tuples for the diff --git a/src/whoosh/matching/wrappers.py b/src/whoosh/matching/wrappers.py index 1515475a..ee7ab92e 100644 --- a/src/whoosh/matching/wrappers.py +++ b/src/whoosh/matching/wrappers.py @@ -25,7 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import division from whoosh.matching import mcore @@ -130,7 +129,7 @@ def __init__(self, matchers, idoffsets, scorer=None, current=0): self._next_matcher() def __repr__(self): - return "%s(%r, %r, current=%s)" % ( + return "{}({!r}, {!r}, current={})".format( self.__class__.__name__, self.matchers, self.offsets, @@ -268,14 +267,14 @@ def __init__(self, child, ids, exclude=False, boost=1.0): the wrapped matcher that are **not in** the set are used. 
""" - super(FilterMatcher, self).__init__(child) + super().__init__(child) self._ids = ids self._exclude = exclude self.boost = boost self._find_next() def __repr__(self): - return "%s(%r, %r, %r, boost=%s)" % ( + return "{}({!r}, {!r}, {!r}, boost={})".format( self.__class__.__name__, self.child, self._ids, @@ -339,7 +338,7 @@ class InverseMatcher(WrappingMatcher): """ def __init__(self, child, limit, missing=None, weight=1.0, id=0): - super(InverseMatcher, self).__init__(child) + super().__init__(child) self.limit = limit self._weight = weight self.missing = missing or (lambda id: False) diff --git a/src/whoosh/multiproc.py b/src/whoosh/multiproc.py index 6d7bcc4a..8ddf5f49 100644 --- a/src/whoosh/multiproc.py +++ b/src/whoosh/multiproc.py @@ -25,7 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import with_statement from multiprocessing import Process, Queue, cpu_count diff --git a/src/whoosh/qparser/common.py b/src/whoosh/qparser/common.py index a10d0d68..9195241b 100644 --- a/src/whoosh/qparser/common.py +++ b/src/whoosh/qparser/common.py @@ -35,7 +35,7 @@ class QueryParserError(Exception): def __init__(self, cause, msg=None): - super(QueryParserError, self).__init__(str(cause)) + super().__init__(str(cause)) self.cause = cause diff --git a/src/whoosh/qparser/dateparse.py b/src/whoosh/qparser/dateparse.py index 2ee463f0..6a29b252 100644 --- a/src/whoosh/qparser/dateparse.py +++ b/src/whoosh/qparser/dateparse.py @@ -59,7 +59,7 @@ def print_debug(level, msg, *args): # Parser element objects -class Props(object): +class Props: """A dumb little object that just puts copies a dictionary into attibutes so I can use dot syntax instead of square bracket string item lookup and save a little bit of typing. Used by :class:`Regex`. @@ -75,7 +75,7 @@ def get(self, key, default=None): return self.__dict__.get(key, default) -class ParserBase(object): +class ParserBase: """Base class for date parser elements.""" def to_parser(self, e): @@ -127,7 +127,7 @@ def __init__(self, elements, sep="(\\s+|\\s*,\\s*)", name=None, progressive=Fals sequence matches like ``a[b[c]]``. """ - super(Sequence, self).__init__(elements, name) + super().__init__(elements, name) self.sep_pattern = sep if sep: self.sep_expr = rcompile(sep, re.IGNORECASE) @@ -210,7 +210,7 @@ def __init__( :param name: a name for this element (for debugging purposes only). """ - super(Combo, self).__init__(elements, sep=sep, name=name) + super().__init__(elements, sep=sep, name=name) self.fn = fn self.min = min self.max = max @@ -316,7 +316,7 @@ def __init__( :param name: a name for this element (for debugging purposes only). """ - super(Bag, self).__init__(elements, name) + super().__init__(elements, name) self.sep_expr = rcompile(sep, re.IGNORECASE) self.onceper = onceper self.requireall = requireall @@ -518,7 +518,7 @@ def __init__(self, years, months, weeks, days, hours, minutes, seconds): rel_mins = f"((?P[0-9]+) *({minutes}))?" rel_secs = f"((?P[0-9]+) *({seconds}))?" 
- self.pattern = "(?P<dir>[+-]) *%s *%s *%s *%s *%s *%s *%s(?=(\\W|$))" % ( + self.pattern = "(?P<dir>[+-]) *{} *{} *{} *{} *{} *{} *{}(?=(\\W|$))".format( rel_years, rel_months, rel_weeks, @@ -601,7 +601,7 @@ def props_to_date(self, p, dt): # Top-level parser classes -class DateParser(object): +class DateParser: """Base class for locale-specific parser classes.""" day = Regex( diff --git a/src/whoosh/qparser/default.py b/src/whoosh/qparser/default.py index e783147f..2b028e53 100644 --- a/src/whoosh/qparser/default.py +++ b/src/whoosh/qparser/default.py @@ -35,7 +35,7 @@ # Query parser object -class QueryParser(object): +class QueryParser: """A hand-written query parser built on modular plug-ins. The default configuration implements a powerful fielded query language similar to Lucene's. diff --git a/src/whoosh/qparser/plugins.py b/src/whoosh/qparser/plugins.py index c382ca43..d803ed97 100644 --- a/src/whoosh/qparser/plugins.py +++ b/src/whoosh/qparser/plugins.py @@ -35,7 +35,7 @@ from whoosh.util.text import rcompile -class Plugin(object): +class Plugin: """Base class for parser plugins.""" def taggers(self, parser): @@ -80,7 +80,7 @@ def filters(self, parser): def create(self, parser, match): # Groupdict keys can be unicode sometimes apparently? Convert them to # str for use as keyword arguments. This should be Py3-safe. - kwargs = dict((str(k), v) for k, v in iteritems(match.groupdict())) + kwargs = {str(k): v for k, v in iteritems(match.groupdict())} return self.nodetype(**kwargs) @@ -1282,7 +1282,7 @@ def __init__(self, map, group=syntax.OrGroup, mirror=False): self.group = group if mirror: # Add in reversed mappings - map.update(dict((v, k) for k, v in iteritems(map))) + map.update({v: k for k, v in iteritems(map)}) def filters(self, parser): # Run after the fieldname filter (100) but before multifield (110) diff --git a/src/whoosh/qparser/syntax.py b/src/whoosh/qparser/syntax.py index 0a51f9d8..99a1c6a6 100644 --- a/src/whoosh/qparser/syntax.py +++ b/src/whoosh/qparser/syntax.py @@ -32,7 +32,7 @@ from whoosh.qparser.common import QueryParserError, attach, get_single_text -class SyntaxNode(object): +class SyntaxNode: """Base class for nodes that make up the abstract syntax tree (AST) of a parsed user query string. The AST is an intermediate step, generated from the query string, then converted into a :class:`whoosh.query.Query` @@ -413,7 +413,7 @@ class ScaledOrGroup(OrGroup): def __init__(self, nodes=None, **kwargs): if "scale" in kwargs: del kwargs["scale"] - super(ScaledOrGroup, self).__init__(nodes=nodes, scale=scale, **kwargs) + super().__init__(nodes=nodes, scale=scale, **kwargs) return ScaledOrGroup diff --git a/src/whoosh/qparser/taggers.py b/src/whoosh/qparser/taggers.py index 2c7d46c1..46900203 100644 --- a/src/whoosh/qparser/taggers.py +++ b/src/whoosh/qparser/taggers.py @@ -30,7 +30,7 @@ # Tagger objects -class Tagger(object): +class Tagger: """Base class for taggers, objects which match syntax in the query string and translate it into a :class:`whoosh.qparser.syntax.SyntaxNode` object. """ diff --git a/src/whoosh/query/compound.py b/src/whoosh/query/compound.py index afa3e056..fc2787aa 100644 --- a/src/whoosh/query/compound.py +++ b/src/whoosh/query/compound.py @@ -25,7 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput.
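The relative-date pattern assembled above depends on regex named groups: ``(?P<name>...)`` captures under a name that ``(?P=name)`` can backreference and ``m.group(name)`` retrieves, which is why the stripped group names were restored above. A minimal illustration with a made-up mini-pattern, not whoosh's real one:

    import re

    m = re.match(r"(?P<dir>[+-]) *(?P<mins>[0-9]+) *mins?", "+ 30 mins")
    assert m.group("dir") == "+"      # sign of the delta
    assert m.group("mins") == "30"    # minute count, still a string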
-from __future__ import division from whoosh import matching from whoosh.compat import text_type, u diff --git a/src/whoosh/query/nested.py b/src/whoosh/query/nested.py index e7e67d8c..8620d81d 100644 --- a/src/whoosh/query/nested.py +++ b/src/whoosh/query/nested.py @@ -128,8 +128,7 @@ def deletion_docs(self, searcher): docnum = m.id() parentdoc = bits.before(docnum + 1) nextparent = bits.after(docnum) or maxdoc - for i in range(parentdoc, nextparent): - yield i + yield from range(parentdoc, nextparent) m.skip_to(nextparent) class NestedParentMatcher(matching.Matcher): diff --git a/src/whoosh/query/positional.py b/src/whoosh/query/positional.py index f21076cd..bb6b381a 100644 --- a/src/whoosh/query/positional.py +++ b/src/whoosh/query/positional.py @@ -25,7 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import division import copy @@ -166,7 +165,7 @@ def __eq__(self, other): ) def __repr__(self): - return "%s(%r, %r, slop=%s, boost=%f)" % ( + return "{}({!r}, {!r}, slop={}, boost={:f})".format( self.__class__.__name__, self.fieldname, self.words, diff --git a/src/whoosh/query/qcore.py b/src/whoosh/query/qcore.py index 73c52080..a827a9bc 100644 --- a/src/whoosh/query/qcore.py +++ b/src/whoosh/query/qcore.py @@ -25,7 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import division import copy from array import array @@ -82,7 +81,7 @@ def token_lists(q, phrases=True): # Utility classes -class Lowest(object): +class Lowest: """A value that is always compares lower than any other object except itself. """ @@ -111,7 +110,7 @@ def __ge__(self, other): return self.__eq__(other) or self.__gt__(other) -class Highest(object): +class Highest: """A value that is always compares higher than any other object except itself. """ @@ -147,7 +146,7 @@ def __ge__(self, other): # Base classes -class Query(object): +class Query: """Abstract base class for all queries. 
Note that this base class implements __or__, __and__, and __sub__ to allow @@ -409,8 +408,7 @@ def leaves(self): yield self else: for q in self.children(): - for qq in q.leaves(): - yield qq + yield from q.leaves() def iter_all_terms(self, phrases=True): """Returns an iterator of (fieldname, text) pairs for all terms in @@ -435,8 +433,7 @@ def iter_all_terms(self, phrases=True): for q in self.leaves(): if q.has_terms(): - for t in q.terms(phrases=phrases): - yield t + yield from q.terms(phrases=phrases) def all_tokens(self, boost=1.0): """Returns an iterator of :class:`analysis.Token` objects corresponding @@ -448,13 +445,11 @@ def all_tokens(self, boost=1.0): """ if self.is_leaf(): - for token in self.tokens(boost): - yield token + yield from self.tokens(boost) else: boost *= self.boost if hasattr(self, "boost") else 1.0 for child in self.children(): - for token in child.all_tokens(boost): - yield token + yield from child.all_tokens(boost) def tokens(self, boost=1.0, exreader=None): """Yields zero or more :class:`analysis.Token` objects corresponding to @@ -495,7 +490,7 @@ def requires(self): # Subclasses should implement the _add_required_to(qset) method - return set([self]) + return {self} def field(self): """Returns the field this query matches in, or None if this query does diff --git a/src/whoosh/query/spans.py b/src/whoosh/query/spans.py index 5e89db75..a22c07fa 100644 --- a/src/whoosh/query/spans.py +++ b/src/whoosh/query/spans.py @@ -50,7 +50,7 @@ # Span class -class Span(object): +class Span: __slots__ = ("start", "end", "startchar", "endchar", "boost") def __init__(self, start, end=None, startchar=None, endchar=None, boost=1.0): @@ -198,7 +198,7 @@ class SpanWrappingMatcher(wrappers.WrappingMatcher): """ def __init__(self, child): - super(SpanWrappingMatcher, self).__init__(child) + super().__init__(child) self._spans = None if self.is_active(): self._find_next() @@ -338,7 +338,7 @@ def matcher(self, searcher, context=None): class SpanFirstMatcher(SpanWrappingMatcher): def __init__(self, child, limit=0): self.limit = limit - super(SpanFirst.SpanFirstMatcher, self).__init__(child) + super().__init__(child) def copy(self): return self.__class__(self.child.copy(), limit=self.limit) @@ -479,7 +479,7 @@ def __init__(self, a, b, slop=1, ordered=True, mindist=1): self.ordered = ordered self.mindist = mindist isect = binary.IntersectionMatcher(a, b) - super(SpanNear.SpanNearMatcher, self).__init__(isect) + super().__init__(isect) def copy(self): return self.__class__( @@ -632,7 +632,7 @@ def __init__(self, ms, slop=1, ordered=True, mindist=1): self.ordered = ordered self.mindist = mindist isect = make_binary_tree(binary.IntersectionMatcher, ms) - super(SpanNear2.SpanNear2Matcher, self).__init__(isect) + super().__init__(isect) def copy(self): return self.__class__( @@ -726,7 +726,7 @@ def __init__(self, a, b): self.a = a self.b = b um = binary.UnionMatcher(a, b) - super(SpanOr.SpanOrMatcher, self).__init__(um) + super().__init__(um) def _get_spans(self): a_active = self.a.is_active() @@ -797,7 +797,7 @@ def __init__(self, a, b): self.a = a self.b = b amm = binary.AndMaybeMatcher(a, b) - super(SpanNot._Matcher, self).__init__(amm) + super().__init__(amm) def _get_spans(self): if self.a.id() == self.b.id(): @@ -846,7 +846,7 @@ def __init__(self, a, b): self.a = a self.b = b im = binary.IntersectionMatcher(a, b) - super(SpanContains._Matcher, self).__init__(im) + super().__init__(im) def _get_spans(self): spans = [] @@ -892,7 +892,7 @@ def __init__(self, a, b): self.a = a self.b = b im = 
binary.IntersectionMatcher(a, b) - super(SpanBefore._Matcher, self).__init__(im) + super().__init__(im) def _get_spans(self): bminstart = min(bspan.start for bspan in self.b.spans()) @@ -922,7 +922,7 @@ class _Matcher(SpanBiMatcher): def __init__(self, a, b): self.a = a im = binary.IntersectionMatcher(a, b) - super(SpanCondition._Matcher, self).__init__(im) + super().__init__(im) def _get_spans(self): return self.a.spans() diff --git a/src/whoosh/query/terms.py b/src/whoosh/query/terms.py index 67b00a19..cffd471b 100644 --- a/src/whoosh/query/terms.py +++ b/src/whoosh/query/terms.py @@ -25,7 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import division import copy import fnmatch diff --git a/src/whoosh/query/wrappers.py b/src/whoosh/query/wrappers.py index 58d8ecb7..a3fc62eb 100644 --- a/src/whoosh/query/wrappers.py +++ b/src/whoosh/query/wrappers.py @@ -25,7 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import division from array import array diff --git a/src/whoosh/reading.py b/src/whoosh/reading.py index 9b41c8c8..484fff6c 100644 --- a/src/whoosh/reading.py +++ b/src/whoosh/reading.py @@ -59,7 +59,7 @@ class TermNotFound(Exception): # Term Info base class -class TermInfo(object): +class TermInfo: """Represents a set of statistics about a term. This object is returned by :meth:`IndexReader.term_info`. These statistics may be useful for optimizations and scoring algorithms. @@ -143,7 +143,7 @@ def max_id(self): # Reader base class -class IndexReader(object): +class IndexReader: """Do not instantiate this object directly. Instead use Index.reader().""" def __enter__(self): @@ -1222,8 +1222,7 @@ def column_reader(self, fieldname, column=None, reverse=False, translate=True): def all_stored_fields(self): for reader in self.readers: - for result in reader.all_stored_fields(): - yield result + yield from reader.all_stored_fields() def doc_count_all(self): return sum(dr.doc_count_all() for dr in self.readers) @@ -1279,7 +1278,7 @@ def combine_terminfos(tis): return TermInfo(w, df, ml, xl, xw, mid, xid) -class MultiCursor(object): +class MultiCursor: def __init__(self, cursors): self._cursors = [c for c in cursors if c.is_valid()] self._low = [] diff --git a/src/whoosh/scoring.py b/src/whoosh/scoring.py index 1d92e061..ad515b43 100644 --- a/src/whoosh/scoring.py +++ b/src/whoosh/scoring.py @@ -29,7 +29,6 @@ This module contains classes for scoring (and sorting) search results. """ -from __future__ import division from math import log, pi @@ -38,7 +37,7 @@ # Base classes -class WeightingModel(object): +class WeightingModel: """Abstract base class for scoring models. A WeightingModel object provides a method, ``scorer``, which returns an instance of :class:`whoosh.scoring.Scorer`. @@ -85,7 +84,7 @@ def final(self, searcher, docnum, score): return score -class BaseScorer(object): +class BaseScorer: """Base class for "scorer" implementations. 
A scorer provides a method for scoring a document, and sometimes methods for rating the "quality" of a document and a matcher's current "block", to implement quality-based diff --git a/src/whoosh/searching.py b/src/whoosh/searching.py index 805e99d2..d9e81f20 100644 --- a/src/whoosh/searching.py +++ b/src/whoosh/searching.py @@ -29,8 +29,6 @@ """ -from __future__ import division - import copy import weakref from math import ceil @@ -63,7 +61,7 @@ class TimeLimit(Exception): # Context class -class SearchContext(object): +class SearchContext: """A container for information about the current search that may be used by the collector or the query objects to change how they operate. """ @@ -99,7 +97,7 @@ def set(self, **kwargs): # Searcher class -class Searcher(object): +class Searcher: """Wraps an :class:`~whoosh.reading.IndexReader` object and provides methods for searching the index. """ @@ -615,7 +613,7 @@ def more_like( [query.Term(fieldname, word, boost=weight) for word, weight in kts] ) - return self.search(q, limit=top, filter=filter, mask=set([docnum])) + return self.search(q, limit=top, filter=filter, mask={docnum}) def search_page(self, query, pagenum, pagelen=10, **kwargs): """This method is Like the :meth:`Searcher.search` method, but returns @@ -980,7 +978,7 @@ def correct_query( return sqc.correct_query(q, qstring) -class Results(object): +class Results: """This object is returned by a Searcher. This object represents the results of a search query. You can mostly use it as if it was a list of dictionaries, where each dictionary is the stored fields of the document at @@ -1399,7 +1397,7 @@ def upgrade_and_extend(self, results): self.top_n = arein + notin + other -class Hit(object): +class Hit: """Represents a single search result ("hit") in a Results object. This object acts like a dictionary of the matching document's stored @@ -1612,7 +1610,7 @@ def __delitem__(self, key): raise NotImplementedError("You cannot modify a search result") -class ResultsPage(object): +class ResultsPage: """Represents a single page out of a longer list of results, as returned by :func:`whoosh.searching.Searcher.search_page`. Supports a subset of the interface of the :class:`~whoosh.searching.Results` object, namely getting diff --git a/src/whoosh/sorting.py b/src/whoosh/sorting.py index 79671331..ec12ebae 100644 --- a/src/whoosh/sorting.py +++ b/src/whoosh/sorting.py @@ -33,7 +33,7 @@ # Faceting objects -class FacetType(object): +class FacetType: """Base class for "facets", aspects that can be sorted/faceted.""" maptype = None @@ -63,7 +63,7 @@ def default_name(self): return "facet" -class Categorizer(object): +class Categorizer: """Base class for categorizer objects which compute a key value for a document based on certain criteria, for use in sorting/faceting. @@ -222,7 +222,7 @@ def __init__(self, global_searcher, fieldname, reverse=False): self._creader = None def __repr__(self): - return "%s(%r, %r, reverse=%r)" % ( + return "{}({!r}, {!r}, reverse={!r})".format( self.__class__.__name__, self._fieldobj, self._fieldname, @@ -855,7 +855,7 @@ def key_to_name(self, key): ) -class Facets(object): +class Facets: """Maps facet names to :class:`FacetType` objects, for creating multiple groupings of documents. @@ -949,7 +949,7 @@ def add_facets(self, facets, replace=True): # Objects for holding facet groups -class FacetMap(object): +class FacetMap: """Base class for objects holding the results of grouping search results by a Facet. Use an object's ``as_dict()`` method to access the results. 
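The "for x in gen: yield x" loops replaced throughout the hunks above (Query.leaves, Query.iter_all_terms, Query.all_tokens, MultiReader.all_stored_fields) are the pre-PEP 380 spelling of generator delegation. A minimal standalone sketch of the equivalence, using a hypothetical Node tree rather than Whoosh's query classes:

    class Node:
        def __init__(self, *children):
            self.children = children

    def leaves(node):
        if not node.children:
            yield node
        else:
            for child in node.children:
                # PEP 380: behaves like "for leaf in leaves(child): yield leaf",
                # and additionally forwards send()/throw()/close() to the
                # delegated generator.
                yield from leaves(child)

    tree = Node(Node(), Node(Node(), Node()))
    assert len(list(leaves(tree))) == 3

For plain re-yield loops like the ones in this patch the two forms are interchangeable; yield from only differs in behavior when the caller drives the generator with coroutine methods.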
diff --git a/src/whoosh/spelling.py b/src/whoosh/spelling.py index 79019402..38305c9a 100644 --- a/src/whoosh/spelling.py +++ b/src/whoosh/spelling.py @@ -38,7 +38,7 @@ # Corrector objects -class Corrector(object): +class Corrector: """ Base class for spelling correction objects. Concrete sub-classes should implement the ``_suggestions`` method. @@ -136,7 +136,7 @@ def _suggestions(self, text, maxdist, prefix): seen.add(sug) yield (0 - mxd), sug - class Skipper(object): + class Skipper: def __init__(self, data): self.data = data self.i = 0 @@ -176,7 +176,7 @@ def _suggestions(self, text, maxdist, prefix): # Query correction -class Correction(object): +class Correction: """ Represents the corrected version of a user query string. Has the following attributes: @@ -241,7 +241,7 @@ def format_string(self, formatter): # QueryCorrector objects -class QueryCorrector(object): +class QueryCorrector: """ Base class for objects that correct words in a user query. """ diff --git a/src/whoosh/support/bench.py b/src/whoosh/support/bench.py index 66b20988..d4495745 100644 --- a/src/whoosh/support/bench.py +++ b/src/whoosh/support/bench.py @@ -25,7 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import division import os.path from optparse import OptionParser @@ -58,7 +57,7 @@ def __init__(self, d): pass -class Module(object): +class Module: def __init__(self, bench, options, args): self.bench = bench self.options = options @@ -103,7 +102,7 @@ def results(self, r): yield self._process_result(hit) -class Spec(object): +class Spec: headline_field = "title" main_field = "body" @@ -448,7 +447,7 @@ def findterms(self, terms): yield q.resultDictionaries() -class Bench(object): +class Bench: libs = { "whoosh": WhooshModule, "xappy": XappyModule, diff --git a/src/whoosh/support/bitstream.py b/src/whoosh/support/bitstream.py index d32ccd6d..50984639 100644 --- a/src/whoosh/support/bitstream.py +++ b/src/whoosh/support/bitstream.py @@ -11,7 +11,7 @@ _bitsperlong = _LONG_SIZE * 8 -class BitStreamReader(object): +class BitStreamReader: def __init__(self, source): self._totalbits = len(source) * _bitsperlong self._position = 0 diff --git a/src/whoosh/support/bitvector.py b/src/whoosh/support/bitvector.py index 45f491ec..d7ef507d 100644 --- a/src/whoosh/support/bitvector.py +++ b/src/whoosh/support/bitvector.py @@ -269,7 +269,7 @@ ) -class BitVector(object): +class BitVector: """ Implements a memory-efficient array of bits. @@ -432,7 +432,7 @@ def copy(self): return BitVector(self.size, bits=self.bits) -class BitSet(object): +class BitSet: """A set-like object for holding positive integers. It is dynamically backed by either a set or BitVector depending on how many numbers are in the set. diff --git a/src/whoosh/support/charset.py b/src/whoosh/support/charset.py index a15f6e51..4acee88f 100644 --- a/src/whoosh/support/charset.py +++ b/src/whoosh/support/charset.py @@ -1,5 +1,3 @@ -# coding=utf-8 - """This module contains tools for working with Sphinx charset table files. These files are useful for doing case and accent folding. See :class:`whoosh.analysis.CharsetTokenizer` and :class:`whoosh.analysis.CharsetFilter`. @@ -732,7 +730,7 @@ # The unicode.translate() method actually requires a dictionary mapping # character *numbers* to characters, for some reason. 
-accent_map = dict((ord(k), v) for k, v in iteritems(accent_map)) +accent_map = {ord(k): v for k, v in iteritems(accent_map)} # This Sphinx charset table taken from http://speeple.com/unicode-maps.txt diff --git a/src/whoosh/support/pyparsing.py b/src/whoosh/support/pyparsing.py index 9841d875..8db43458 100644 --- a/src/whoosh/support/pyparsing.py +++ b/src/whoosh/support/pyparsing.py @@ -225,7 +225,7 @@ def _ustr(obj): if not _PY3K: def _str2dict(strg): - return dict([(c, 0) for c in strg]) + return {c: 0 for c in strg} else: _str2dict = set @@ -241,7 +241,7 @@ def _xml_escape(data): return data -class _Constants(object): +class _Constants: pass @@ -340,9 +340,7 @@ class ParseSyntaxException(ParseFatalException): an unbacktrackable syntax error has been found""" def __init__(self, pe): - super(ParseSyntaxException, self).__init__( - pe.pstr, pe.loc, pe.msg, pe.parserElement - ) + super().__init__(pe.pstr, pe.loc, pe.msg, pe.parserElement) # ~ class ReparseException(ParseBaseException): @@ -369,7 +367,7 @@ def __str__(self): return f"RecursiveGrammarException: {self.parseElementTrace}" -class _ParseResultsWithOffset(object): +class _ParseResultsWithOffset: def __init__(self, p1, p2): self.tup = (p1, p2) @@ -383,7 +381,7 @@ def setOffset(self, i): self.tup = (self.tup[0], i) -class ParseResults(object): +class ParseResults: """Structured parse results, to provide multiple means of access to the parsed data: - as a list (len(results)) - by list index (results[0], results[1], etc.) @@ -419,7 +417,7 @@ def __init__(self, toklist, name=None, asList=True, modal=True): self.__toklist = toklist[:] else: self.__toklist = [toklist] - self.__tokdict = dict() + self.__tokdict = {} if name: if not modal: @@ -638,9 +636,7 @@ def asXML(self, doctag=None, namedItemsOnly=False, indent="", formatted=True): """Returns the parse results as XML. 
Tags are created for tokens and lists that have defined results names.""" nl = "\n" out = [] - namedItems = dict( - [(v[1], k) for (k, vlist) in self.__tokdict.items() for v in vlist] - ) + namedItems = {v[1]: k for (k, vlist) in self.__tokdict.items() for v in vlist} nextLevelIndent = indent + " " # collapse out indents if formatting is not desired @@ -784,7 +780,7 @@ def __setstate__(self, state): self.__parent = None def __dir__(self): - return dir(super(ParseResults, self)) + self.keys() + return dir(super()) + self.keys() def col(loc, strg): @@ -848,7 +844,7 @@ def nullDebugAction(*args): pass -class ParserElement(object): +class ParserElement: """Abstract base level parser element class.""" DEFAULT_WHITE_CHARS = " \n\t\r" @@ -860,7 +856,7 @@ def setDefaultWhitespaceChars(chars): setDefaultWhitespaceChars = staticmethod(setDefaultWhitespaceChars) def __init__(self, savelist=False): - self.parseAction = list() + self.parseAction = [] self.failAction = None # ~ self.name = "" # don't define self.name, let subclasses try/except upcall self.strRepr = None @@ -871,7 +867,7 @@ def __init__(self, savelist=False): self.copyDefaultWhiteChars = True self.mayReturnEmpty = False # used when checking for left-recursion self.keepTabs = False - self.ignoreExprs = list() + self.ignoreExprs = [] self.debug = False self.streamlined = False self.mayIndexError = True # used to optimize exception handling for subclasses that don't advance parse index @@ -1683,7 +1679,7 @@ def __eq__(self, other): except ParseBaseException: return False else: - return super(ParserElement, self) == other + return super() == other def __ne__(self, other): return not (self == other) @@ -1702,11 +1698,11 @@ class Token(ParserElement): """Abstract ParserElement subclass, for defining atomic matching patterns.""" def __init__(self): - super(Token, self).__init__(savelist=False) + super().__init__(savelist=False) # self.myException = ParseException("",0,"",self) def setName(self, name): - s = super(Token, self).setName(name) + s = super().setName(name) self.errmsg = "Expected " + self.name # s.myException.msg = self.errmsg return s @@ -1716,7 +1712,7 @@ class Empty(Token): """An empty token, will always match.""" def __init__(self): - super(Empty, self).__init__() + super().__init__() self.name = "Empty" self.mayReturnEmpty = True self.mayIndexError = False @@ -1726,7 +1722,7 @@ class NoMatch(Token): """A token that will never match.""" def __init__(self): - super(NoMatch, self).__init__() + super().__init__() self.name = "NoMatch" self.mayReturnEmpty = True self.mayIndexError = False @@ -1744,7 +1740,7 @@ class Literal(Token): """Token to exactly match a specified string.""" def __init__(self, matchString): - super(Literal, self).__init__() + super().__init__() self.match = matchString self.matchLen = len(matchString) try: @@ -1795,7 +1791,7 @@ class Keyword(Token): DEFAULT_KEYWORD_CHARS = alphanums + "_$" def __init__(self, matchString, identChars=DEFAULT_KEYWORD_CHARS, caseless=False): - super(Keyword, self).__init__() + super().__init__() self.match = matchString self.matchLen = len(matchString) try: @@ -1846,7 +1842,7 @@ def parseImpl(self, instring, loc, doActions=True): raise exc def copy(self): - c = super(Keyword, self).copy() + c = super().copy() c.identChars = Keyword.DEFAULT_KEYWORD_CHARS return c @@ -1864,7 +1860,7 @@ class CaselessLiteral(Literal): """ def __init__(self, matchString): - super(CaselessLiteral, self).__init__(matchString.upper()) + super().__init__(matchString.upper()) # Preserve the defining literal. 
self.returnString = matchString self.name = f"'{self.returnString}'" @@ -1883,7 +1879,7 @@ def parseImpl(self, instring, loc, doActions=True): class CaselessKeyword(Keyword): def __init__(self, matchString, identChars=Keyword.DEFAULT_KEYWORD_CHARS): - super(CaselessKeyword, self).__init__(matchString, identChars, caseless=True) + super().__init__(matchString, identChars, caseless=True) def parseImpl(self, instring, loc, doActions=True): if (instring[loc : loc + self.matchLen].upper() == self.caselessmatch) and ( @@ -1911,7 +1907,7 @@ class Word(Token): def __init__( self, initChars, bodyChars=None, min=1, max=0, exact=0, asKeyword=False ): - super(Word, self).__init__() + super().__init__() self.initCharsOrig = initChars self.initChars = _str2dict(initChars) if bodyChars: @@ -1951,12 +1947,12 @@ def __init__( if self.bodyCharsOrig == self.initCharsOrig: self.reString = f"[{_escapeRegexRangeChars(self.initCharsOrig)}]+" elif len(self.bodyCharsOrig) == 1: - self.reString = "%s[%s]*" % ( + self.reString = "{}[{}]*".format( re.escape(self.initCharsOrig), _escapeRegexRangeChars(self.bodyCharsOrig), ) else: - self.reString = "[%s][%s]*" % ( + self.reString = "[{}][{}]*".format( _escapeRegexRangeChars(self.initCharsOrig), _escapeRegexRangeChars(self.bodyCharsOrig), ) @@ -2016,7 +2012,7 @@ def parseImpl(self, instring, loc, doActions=True): def __str__(self): try: - return super(Word, self).__str__() + return super().__str__() except: pass @@ -2043,7 +2039,7 @@ class Regex(Token): def __init__(self, pattern, flags=0): """The parameters pattern and flags are passed to the re.compile() function as-is. See the Python re module for an explanation of the acceptable patterns and flags.""" - super(Regex, self).__init__() + super().__init__() if len(pattern) == 0: warnings.warn( @@ -2090,7 +2086,7 @@ def parseImpl(self, instring, loc, doActions=True): def __str__(self): try: - return super(Regex, self).__str__() + return super().__str__() except: pass @@ -2121,7 +2117,7 @@ def __init__( - unquoteResults - boolean indicating whether the matched text should be unquoted (default=True) - endQuoteChar - string of one or more characters defining the end of the quote delimited string (default=None => same as quoteChar) """ - super(QuotedString, self).__init__() + super().__init__() # remove white space from quote chars - wont work anyway quoteChar = quoteChar.strip() @@ -2154,14 +2150,14 @@ def __init__( if multiline: self.flags = re.MULTILINE | re.DOTALL - self.pattern = r"%s(?:[^%s%s]" % ( + self.pattern = r"{}(?:[^{}{}]".format( re.escape(self.quoteChar), _escapeRegexRangeChars(self.endQuoteChar[0]), (escChar is not None and _escapeRegexRangeChars(escChar) or ""), ) else: self.flags = 0 - self.pattern = r"%s(?:[^%s\n\r%s]" % ( + self.pattern = r"{}(?:[^{}\n\r{}]".format( re.escape(self.quoteChar), _escapeRegexRangeChars(self.endQuoteChar[0]), (escChar is not None and _escapeRegexRangeChars(escChar) or ""), @@ -2227,7 +2223,7 @@ def parseImpl(self, instring, loc, doActions=True): if isinstance(ret, basestring): # replace escaped characters if self.escChar: - ret = re.sub(self.escCharReplacePattern, "\g<1>", ret) + ret = re.sub(self.escCharReplacePattern, r"\g<1>", ret) # replace escaped quotes if self.escQuote: @@ -2237,7 +2233,7 @@ def parseImpl(self, instring, loc, doActions=True): def __str__(self): try: - return super(QuotedString, self).__str__() + return super().__str__() except: pass @@ -2256,7 +2252,7 @@ class CharsNotIn(Token): """ def __init__(self, notChars, min=1, max=0, exact=0): - 
super(CharsNotIn, self).__init__() + super().__init__() self.skipWhitespace = False self.notChars = notChars @@ -2308,7 +2304,7 @@ def parseImpl(self, instring, loc, doActions=True): def __str__(self): try: - return super(CharsNotIn, self).__str__() + return super().__str__() except: pass @@ -2337,7 +2333,7 @@ class White(Token): } def __init__(self, ws=" \t\r\n", min=1, max=0, exact=0): - super(White, self).__init__() + super().__init__() self.matchWhite = ws self.setWhitespaceChars( "".join([c for c in self.whiteChars if c not in self.matchWhite]) @@ -2385,7 +2381,7 @@ def parseImpl(self, instring, loc, doActions=True): class _PositionToken(Token): def __init__(self): - super(_PositionToken, self).__init__() + super().__init__() self.name = self.__class__.__name__ self.mayReturnEmpty = True self.mayIndexError = False @@ -2395,7 +2391,7 @@ class GoToColumn(_PositionToken): """Token to advance to a specific column of input text; useful for tabular report scraping.""" def __init__(self, colno): - super(GoToColumn, self).__init__() + super().__init__() self.col = colno def preParse(self, instring, loc): @@ -2424,13 +2420,13 @@ class LineStart(_PositionToken): """Matches if current position is at the beginning of a line within the parse string""" def __init__(self): - super(LineStart, self).__init__() + super().__init__() self.setWhitespaceChars(ParserElement.DEFAULT_WHITE_CHARS.replace("\n", "")) self.errmsg = "Expected start of line" # self.myException.msg = self.errmsg def preParse(self, instring, loc): - preloc = super(LineStart, self).preParse(instring, loc) + preloc = super().preParse(instring, loc) if instring[preloc] == "\n": loc += 1 return loc @@ -2453,7 +2449,7 @@ class LineEnd(_PositionToken): """Matches if current position is at the end of a line within the parse string""" def __init__(self): - super(LineEnd, self).__init__() + super().__init__() self.setWhitespaceChars(ParserElement.DEFAULT_WHITE_CHARS.replace("\n", "")) self.errmsg = "Expected end of line" # self.myException.msg = self.errmsg @@ -2481,7 +2477,7 @@ class StringStart(_PositionToken): """Matches if current position is at the beginning of the parse string""" def __init__(self): - super(StringStart, self).__init__() + super().__init__() self.errmsg = "Expected start of text" # self.myException.msg = self.errmsg @@ -2501,7 +2497,7 @@ class StringEnd(_PositionToken): """Matches if current position is at the end of the parse string""" def __init__(self): - super(StringEnd, self).__init__() + super().__init__() self.errmsg = "Expected end of text" # self.myException.msg = self.errmsg @@ -2532,7 +2528,7 @@ class WordStart(_PositionToken): """ def __init__(self, wordChars=printables): - super(WordStart, self).__init__() + super().__init__() self.wordChars = _str2dict(wordChars) self.errmsg = "Not at the start of a word" @@ -2558,7 +2554,7 @@ class WordEnd(_PositionToken): """ def __init__(self, wordChars=printables): - super(WordEnd, self).__init__() + super().__init__() self.wordChars = _str2dict(wordChars) self.skipWhitespace = False self.errmsg = "Not at the end of a word" @@ -2582,7 +2578,7 @@ class ParseExpression(ParserElement): """Abstract subclass of ParserElement, for combining and post-processing parsed tokens.""" def __init__(self, exprs, savelist=False): - super(ParseExpression, self).__init__(savelist) + super().__init__(savelist) if isinstance(exprs, list): self.exprs = exprs elif isinstance(exprs, basestring): @@ -2614,18 +2610,18 @@ def leaveWhitespace(self): def ignore(self, other): if isinstance(other, 
Suppress): if other not in self.ignoreExprs: - super(ParseExpression, self).ignore(other) + super().ignore(other) for e in self.exprs: e.ignore(self.ignoreExprs[-1]) else: - super(ParseExpression, self).ignore(other) + super().ignore(other) for e in self.exprs: e.ignore(self.ignoreExprs[-1]) return self def __str__(self): try: - return super(ParseExpression, self).__str__() + return super().__str__() except: pass @@ -2634,7 +2630,7 @@ def __str__(self): return self.strRepr def streamline(self): - super(ParseExpression, self).streamline() + super().streamline() for e in self.exprs: e.streamline() @@ -2670,7 +2666,7 @@ def streamline(self): return self def setResultsName(self, name, listAllMatches=False): - ret = super(ParseExpression, self).setResultsName(name, listAllMatches) + ret = super().setResultsName(name, listAllMatches) return ret def validate(self, validateTrace=[]): @@ -2692,7 +2688,7 @@ def __init__(self, *args, **kwargs): self.leaveWhitespace() def __init__(self, exprs, savelist=True): - super(And, self).__init__(exprs, savelist) + super().__init__(exprs, savelist) self.mayReturnEmpty = True for e in self.exprs: if not e.mayReturnEmpty: @@ -2759,7 +2755,7 @@ class Or(ParseExpression): """ def __init__(self, exprs, savelist=False): - super(Or, self).__init__(exprs, savelist) + super().__init__(exprs, savelist) self.mayReturnEmpty = False for e in self.exprs: if e.mayReturnEmpty: @@ -2825,7 +2821,7 @@ class MatchFirst(ParseExpression): """ def __init__(self, exprs, savelist=False): - super(MatchFirst, self).__init__(exprs, savelist) + super().__init__(exprs, savelist) if exprs: self.mayReturnEmpty = False for e in self.exprs: @@ -2889,7 +2885,7 @@ class Each(ParseExpression): """ def __init__(self, exprs, savelist=True): - super(Each, self).__init__(exprs, savelist) + super().__init__(exprs, savelist) self.mayReturnEmpty = True for e in self.exprs: if not e.mayReturnEmpty: @@ -2985,7 +2981,7 @@ class ParseElementEnhance(ParserElement): """Abstract subclass of ParserElement, for combining and post-processing parsed tokens.""" def __init__(self, expr, savelist=False): - super(ParseElementEnhance, self).__init__(savelist) + super().__init__(savelist) if isinstance(expr, basestring): expr = Literal(expr) self.expr = expr @@ -3015,17 +3011,17 @@ def leaveWhitespace(self): def ignore(self, other): if isinstance(other, Suppress): if other not in self.ignoreExprs: - super(ParseElementEnhance, self).ignore(other) + super().ignore(other) if self.expr is not None: self.expr.ignore(self.ignoreExprs[-1]) else: - super(ParseElementEnhance, self).ignore(other) + super().ignore(other) if self.expr is not None: self.expr.ignore(self.ignoreExprs[-1]) return self def streamline(self): - super(ParseElementEnhance, self).streamline() + super().streamline() if self.expr is not None: self.expr.streamline() return self @@ -3045,7 +3041,7 @@ def validate(self, validateTrace=[]): def __str__(self): try: - return super(ParseElementEnhance, self).__str__() + return super().__str__() except: pass @@ -3061,7 +3057,7 @@ class FollowedBy(ParseElementEnhance): position. FollowedBy always returns a null token list.""" def __init__(self, expr): - super(FollowedBy, self).__init__(expr) + super().__init__(expr) self.mayReturnEmpty = True def parseImpl(self, instring, loc, doActions=True): @@ -3077,7 +3073,7 @@ class NotAny(ParseElementEnhance): always returns a null token list. 
May be constructed using the '~' operator.""" def __init__(self, expr): - super(NotAny, self).__init__(expr) + super().__init__(expr) # ~ self.leaveWhitespace() self.skipWhitespace = ( False # do NOT use self.leaveWhitespace(), don't want to propagate to exprs @@ -3113,7 +3109,7 @@ class ZeroOrMore(ParseElementEnhance): """Optional repetition of zero or more of the given expression.""" def __init__(self, expr): - super(ZeroOrMore, self).__init__(expr) + super().__init__(expr) self.mayReturnEmpty = True def parseImpl(self, instring, loc, doActions=True): @@ -3144,7 +3140,7 @@ def __str__(self): return self.strRepr def setResultsName(self, name, listAllMatches=False): - ret = super(ZeroOrMore, self).setResultsName(name, listAllMatches) + ret = super().setResultsName(name, listAllMatches) ret.saveAsList = True return ret @@ -3180,12 +3176,12 @@ def __str__(self): return self.strRepr def setResultsName(self, name, listAllMatches=False): - ret = super(OneOrMore, self).setResultsName(name, listAllMatches) + ret = super().setResultsName(name, listAllMatches) ret.saveAsList = True return ret -class _NullToken(object): +class _NullToken: def __bool__(self): return False @@ -3205,7 +3201,7 @@ class Optional(ParseElementEnhance): """ def __init__(self, exprs, default=_optionalNotMatched): - super(Optional, self).__init__(exprs, savelist=False) + super().__init__(exprs, savelist=False) self.defaultValue = default self.mayReturnEmpty = True @@ -3242,7 +3238,7 @@ class SkipTo(ParseElementEnhance): """ def __init__(self, other, include=False, ignore=None, failOn=None): - super(SkipTo, self).__init__(other) + super().__init__(other) self.ignoreExpr = ignore self.mayReturnEmpty = True self.mayIndexError = False @@ -3319,7 +3315,7 @@ class Forward(ParseElementEnhance): """ def __init__(self, other=None): - super(Forward, self).__init__(other, savelist=False) + super().__init__(other, savelist=False) def __lshift__(self, other): if isinstance(other, basestring): @@ -3370,7 +3366,7 @@ def __str__(self): def copy(self): if self.expr is not None: - return super(Forward, self).copy() + return super().copy() else: ret = Forward() ret << self @@ -3386,7 +3382,7 @@ class TokenConverter(ParseElementEnhance): """Abstract subclass of ParseExpression, for converting parsed results.""" def __init__(self, expr, savelist=False): - super(TokenConverter, self).__init__(expr) # , savelist ) + super().__init__(expr) # , savelist ) self.saveAsList = False @@ -3394,7 +3390,7 @@ class Upcase(TokenConverter): """Converter to upper case all matching tokens.""" def __init__(self, *args): - super(Upcase, self).__init__(*args) + super().__init__(*args) warnings.warn( "Upcase class is deprecated, use upcaseTokens parse action instead", DeprecationWarning, @@ -3412,7 +3408,7 @@ class Combine(TokenConverter): """ def __init__(self, expr, joinString="", adjacent=True): - super(Combine, self).__init__(expr) + super().__init__(expr) # suppress whitespace-stripping in contained parse expressions, but re-enable it on the Combine itself if adjacent: self.leaveWhitespace() @@ -3424,7 +3420,7 @@ def ignore(self, other): if self.adjacent: ParserElement.ignore(self, other) else: - super(Combine, self).ignore(other) + super().ignore(other) return self def postParse(self, instring, loc, tokenlist): @@ -3444,7 +3440,7 @@ class Group(TokenConverter): """Converter to return the matched tokens as a list - useful for returning tokens of ZeroOrMore and OneOrMore expressions.""" def __init__(self, expr): - super(Group, self).__init__(expr) + 
super().__init__(expr) self.saveAsList = True def postParse(self, instring, loc, tokenlist): @@ -3458,7 +3454,7 @@ class Dict(TokenConverter): """ def __init__(self, exprs): - super(Dict, self).__init__(exprs) + super().__init__(exprs) self.saveAsList = True def postParse(self, instring, loc, tokenlist): @@ -3498,7 +3494,7 @@ def suppress(self): return self -class OnlyOnce(object): +class OnlyOnce: """Wrapper for parse actions, to ensure they are only called once.""" def __init__(self, methodCall): diff --git a/src/whoosh/support/relativedelta.py b/src/whoosh/support/relativedelta.py index 552f0e4e..5dfa8f03 100644 --- a/src/whoosh/support/relativedelta.py +++ b/src/whoosh/support/relativedelta.py @@ -13,7 +13,7 @@ __all__ = ["relativedelta", "MO", "TU", "WE", "TH", "FR", "SA", "SU"] -class weekday(object): +class weekday: __slots__ = ["weekday", "n"] def __init__(self, weekday, n=None): diff --git a/src/whoosh/support/unicode.py b/src/whoosh/support/unicode.py index 4c2248db..4010744b 100644 --- a/src/whoosh/support/unicode.py +++ b/src/whoosh/support/unicode.py @@ -216,7 +216,7 @@ _names = [] -class blocks(object): +class blocks: pass diff --git a/src/whoosh/system.py b/src/whoosh/system.py index 36ad58eb..13c3da66 100644 --- a/src/whoosh/system.py +++ b/src/whoosh/system.py @@ -75,4 +75,4 @@ if sys.version_info[0] < 3: emptybytes = "" else: - emptybytes = "".encode("latin-1") + emptybytes = b"" diff --git a/src/whoosh/util/__init__.py b/src/whoosh/util/__init__.py index 93868a32..2a1c9880 100644 --- a/src/whoosh/util/__init__.py +++ b/src/whoosh/util/__init__.py @@ -25,7 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import with_statement import random import sys diff --git a/src/whoosh/util/cache.py b/src/whoosh/util/cache.py index f2e8a414..6e8b7a09 100644 --- a/src/whoosh/util/cache.py +++ b/src/whoosh/util/cache.py @@ -25,7 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
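Two of the one-line changes in this region look like behavior changes but are not: on Python 3 an encoded empty string is exactly the empty bytes literal, and every class is new-style whether or not it names object as a base. A quick self-check of both claims:

    # system.py: the old and new spellings produce the same value.
    assert "".encode("latin-1") == b"" == bytes()

    # relativedelta.py, unicode.py, etc.: dropping the explicit object base
    # changes nothing on Python 3, where object is always in the MRO.
    class Plain:
        pass

    assert Plain.__mro__ == (Plain, object)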
-from __future__ import with_statement import functools from heapq import nsmallest diff --git a/src/whoosh/util/filelock.py b/src/whoosh/util/filelock.py index ef1533bd..e8beae57 100644 --- a/src/whoosh/util/filelock.py +++ b/src/whoosh/util/filelock.py @@ -58,7 +58,7 @@ def try_for(fn, timeout=5.0, delay=0.1): return v -class LockBase(object): +class LockBase: """Base class for file locks.""" def __init__(self, filename): @@ -103,7 +103,7 @@ def acquire(self, blocking=False): fcntl.flock(self.fd, mode) self.locked = True return True - except IOError: + except OSError: e = sys.exc_info()[1] if e.errno not in (errno.EAGAIN, errno.EACCES): raise @@ -137,7 +137,7 @@ def acquire(self, blocking=False): try: msvcrt.locking(self.fd, mode, 1) return True - except IOError: + except OSError: e = sys.exc_info()[1] if e.errno not in (errno.EAGAIN, errno.EACCES, errno.EDEADLK): raise diff --git a/src/whoosh/util/loading.py b/src/whoosh/util/loading.py index cbaa847e..0daf281b 100644 --- a/src/whoosh/util/loading.py +++ b/src/whoosh/util/loading.py @@ -37,7 +37,7 @@ def __init__(self, f, objmap, shortcuts=None): pickle.Unpickler.__init__(self, f) if shortcuts: - objmap = dict((k % shortcuts, v % shortcuts) for k, v in objmap.items()) + objmap = {k % shortcuts: v % shortcuts for k, v in objmap.items()} self._objmap = objmap def find_class(self, modulename, objname): diff --git a/src/whoosh/util/numlists.py b/src/whoosh/util/numlists.py index 8807570d..7472ea49 100644 --- a/src/whoosh/util/numlists.py +++ b/src/whoosh/util/numlists.py @@ -26,7 +26,7 @@ def delta_decode(nums): yield base -class GrowableArray(object): +class GrowableArray: def __init__(self, inittype="B", allow_longs=True): self.array = array(inittype) self._allow_longs = allow_longs @@ -88,7 +88,7 @@ def to_file(self, dbfile): # Number list encoding base class -class NumberEncoding(object): +class NumberEncoding: maxint = None def write_nums(self, f, numbers): diff --git a/src/whoosh/util/testing.py b/src/whoosh/util/testing.py index 5910341c..a1ceac6c 100644 --- a/src/whoosh/util/testing.py +++ b/src/whoosh/util/testing.py @@ -35,7 +35,7 @@ from whoosh.util import now, random_name -class TempDir(object): +class TempDir: def __init__( self, basename="", diff --git a/src/whoosh/util/times.py b/src/whoosh/util/times.py index 2b7ad04c..55df3e1d 100644 --- a/src/whoosh/util/times.py +++ b/src/whoosh/util/times.py @@ -92,7 +92,7 @@ def long_to_datetime(x): # Ambiguous datetime object -class adatetime(object): +class adatetime: """An "ambiguous" datetime object. This object acts like a ``datetime.datetime`` object but can have any of its attributes set to None, meaning unspecified. 
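Since the adatetime docstring above describes the class only in prose, here is a minimal usage sketch. The keyword constructor is assumed to mirror datetime.datetime with omitted fields left as None; treat the signature as illustrative, not guaranteed:

    from whoosh.util.times import adatetime

    # "March 2012", with day and time left unspecified.
    march = adatetime(year=2012, month=3)
    assert march.year == 2012
    assert march.day is None  # unspecified, unlike a real datetime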
@@ -301,7 +301,7 @@ def disambiguated(self, basedate): # Time span class -class timespan(object): +class timespan: """A span of time between two ``datetime`` or ``adatetime`` objects.""" def __init__(self, start, end): diff --git a/src/whoosh/util/versions.py b/src/whoosh/util/versions.py index df84192c..2f056e2f 100644 --- a/src/whoosh/util/versions.py +++ b/src/whoosh/util/versions.py @@ -28,7 +28,7 @@ from whoosh.util.text import rcompile -class BaseVersion(object): +class BaseVersion: @classmethod def parse(cls, text): obj = cls() @@ -128,7 +128,7 @@ class SimpleVersion(BaseVersion): ] _ex_bits = {"a": 0, "b": 1, "c": 2, "rc": 10, "z": 15} - _bits_ex = dict((v, k) for k, v in _ex_bits.items()) + _bits_ex = {v: k for k, v in _ex_bits.items()} __slots__ = ("major", "minor", "release", "ex", "exnum") diff --git a/src/whoosh/writing.py b/src/whoosh/writing.py index b71dcf8d..5d14d98b 100644 --- a/src/whoosh/writing.py +++ b/src/whoosh/writing.py @@ -25,7 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import with_statement import threading import time @@ -194,7 +193,7 @@ def save(self): # Writer base class -class IndexWriter(object): +class IndexWriter: """High-level object for writing to an index. To get a writer for a particular index, call @@ -625,13 +624,13 @@ def add_field(self, fieldname, fieldspec, **kwargs): self._check_state() if self._added: raise Exception("Can't modify schema after adding data to writer") - super(SegmentWriter, self).add_field(fieldname, fieldspec, **kwargs) + super().add_field(fieldname, fieldspec, **kwargs) def remove_field(self, fieldname): self._check_state() if self._added: raise Exception("Can't modify schema after adding data to writer") - super(SegmentWriter, self).remove_field(fieldname) + super().remove_field(fieldname) def has_deletions(self): """ @@ -710,7 +709,7 @@ def write_per_doc(self, fieldnames, reader): pdw.start_doc(self.docnum) # Set disjunction includes dynamic fields (can be different for each document) - for fieldname in fieldnames | set(s for s in stored if s in self.schema): + for fieldname in fieldnames | {s for s in stored if s in self.schema}: fieldobj = schema[fieldname] length = reader.doc_field_length(docnum, fieldname) pdw.add_field(fieldname, fieldobj, stored.get(fieldname), length) @@ -731,9 +730,9 @@ def write_per_doc(self, fieldnames, reader): def add_reader(self, reader): self._check_state() basedoc = self.docnum - ndxnames = set( + ndxnames = { fname for fname in reader.indexed_field_names() if fname in self.schema - ) + } fieldnames = set(self.schema.names()) | ndxnames docmap = self.write_per_doc(fieldnames, reader) @@ -840,10 +839,10 @@ def searcher(self, **kwargs): # We have a write lock, nothing is changing. Only cache if kwargs is emtpy # and the SegmentWriter is still open. 
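The SimpleVersion change in the versions.py hunk above is the common dict-inversion idiom migrated from the dict() constructor to a comprehension. Both build the same mapping; the comprehension skips the intermediate generator-plus-constructor call and reads as a literal:

    _ex_bits = {"a": 0, "b": 1, "c": 2, "rc": 10, "z": 15}

    # Old: _bits_ex = dict((v, k) for k, v in _ex_bits.items())
    _bits_ex = {v: k for k, v in _ex_bits.items()}
    assert _bits_ex[10] == "rc"

Note that inversion assumes the values are unique (they are here); if two keys shared a value, the inverted dict would keep whichever pair items() yields last.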
if kwargs or self.is_closed: - return super(SegmentWriter, self).searcher(**kwargs) + return super().searcher(**kwargs) if self._searcher is None: - s = super(SegmentWriter, self).searcher() + s = super().searcher() self._searcher = s s._orig_close = s.close # called in _finish() s.close = lambda: None diff --git a/stress/test_bigfacet.py b/stress/test_bigfacet.py index a806f621..e41dbe7d 100644 --- a/stress/test_bigfacet.py +++ b/stress/test_bigfacet.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import os.path import random import string diff --git a/stress/test_bigindex.py b/stress/test_bigindex.py index d863d326..f4ae8bfd 100644 --- a/stress/test_bigindex.py +++ b/stress/test_bigindex.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import random from whoosh import fields diff --git a/stress/test_bigtable.py b/stress/test_bigtable.py index 1d4fc8ff..7a595061 100644 --- a/stress/test_bigtable.py +++ b/stress/test_bigtable.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - from random import randint, shuffle from nose.tools import assert_equal # type: ignore @UnresolvedImport @@ -15,7 +13,7 @@ def randstring(min, max): return "".join(chr(randint(1, 255)) for _ in range(randint(min, max))) count = 100000 - samp = dict((randstring(1, 50), randstring(1, 50)) for _ in range(count)) + samp = {randstring(1, 50): randstring(1, 50) for _ in range(count)} fhw = HashWriter(st.create_file("big.hsh")) fhw.add_all(iteritems(samp)) diff --git a/stress/test_hugeindex.py b/stress/test_hugeindex.py index e5f1dec8..b6fec182 100644 --- a/stress/test_hugeindex.py +++ b/stress/test_hugeindex.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import struct from nose.tools import assert_equal # type: ignore @UnresolvedImport diff --git a/stress/test_threading.py b/stress/test_threading.py index dbd2c27e..f92a081d 100644 --- a/stress/test_threading.py +++ b/stress/test_threading.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import random import threading import time diff --git a/stress/test_update.py b/stress/test_update.py index d0be86d5..9e3f6dde 100644 --- a/stress/test_update.py +++ b/stress/test_update.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import random from nose.tools import assert_equal diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 28c3dbde..394ce032 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -1,7 +1,3 @@ -# coding=utf-8 - -from __future__ import with_statement - import pytest from whoosh import analysis, fields, qparser from whoosh.compat import b, dumps, u, unichr diff --git a/tests/test_automata.py b/tests/test_automata.py index a1cbfd75..1edf8702 100644 --- a/tests/test_automata.py +++ b/tests/test_automata.py @@ -155,7 +155,7 @@ def test_glob_range(): assert not nfa.accept("acc") -class Skipper(object): +class Skipper: def __init__(self, data): self.data = data self.i = 0 diff --git a/tests/test_classify.py b/tests/test_classify.py index c36c0e4a..89d008d3 100644 --- a/tests/test_classify.py +++ b/tests/test_classify.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - from whoosh import analysis, classify, fields, formats, query, reading from whoosh.compat import text_type, u from whoosh.filedb.filestore import RamStorage @@ -58,9 +56,11 @@ def test_add_text(model=classify.Bo1Model): with ix.reader() as r: exp = classify.Expander(r, "content", model=model) exp.add_text(text) - assert set([t[0] for t in exp.expanded_terms(3)]) == set( - ["particles", "velocity", "field"] 
- ) + assert {t[0] for t in exp.expanded_terms(3)} == { + "particles", + "velocity", + "field", + } exp = classify.Expander(r, "extra", model=model) exp.add_text(text) assert exp.expanded_terms(3) == [] @@ -80,7 +80,7 @@ def test_keyterms_from_text(model=classify.Bo2Model): ix = create_index() with ix.searcher() as s: keys = list(s.key_terms_from_text("content", text, model=model)) - assert set([t[0] for t in keys]) == set(["particles", "velocity", "field"]) + assert {t[0] for t in keys} == {"particles", "velocity", "field"} keys = list(s.key_terms_from_text("extra", text, model=model)) assert keys == [] diff --git a/tests/test_codecs.py b/tests/test_codecs.py index 9a727bd5..7d37c125 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import random from array import array @@ -18,7 +16,7 @@ def _make_codec(**kwargs): return st, codec, seg -class FakeLengths(object): +class FakeLengths: def __init__(self, **lens): self.lens = lens @@ -63,7 +61,7 @@ def random_btext(): return array_tobytes(a).decode("utf-16") domain = sorted( - set([(random_fieldname(), random_btext().encode("utf-8")) for _ in range(1000)]) + {(random_fieldname(), random_btext().encode("utf-8")) for _ in range(1000)} ) st, codec, seg = _make_codec() diff --git a/tests/test_collector.py b/tests/test_collector.py index ea775b2d..16260a06 100644 --- a/tests/test_collector.py +++ b/tests/test_collector.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import pytest from whoosh import collectors, fields, query, searching from whoosh.compat import u diff --git a/tests/test_columns.py b/tests/test_columns.py index 4b4a2b92..d21a1165 100644 --- a/tests/test_columns.py +++ b/tests/test_columns.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import inspect import random import sys @@ -66,7 +64,7 @@ def test_multistream(): st = RamStorage() msw = compound.CompoundWriter(st) - files = dict((name, msw.create_file(name)) for name in "abc") + files = {name: msw.create_file(name) for name in "abc"} for name, data in domain: files[name].write(b(data)) f = st.create_file("test") @@ -92,7 +90,7 @@ def randstring(n): value = randstring(2500) domain[name] = value - outfiles = dict((name, BytesIO(value)) for name, value in domain.items()) + outfiles = {name: BytesIO(value) for name, value in domain.items()} with TempStorage() as st: msw = compound.CompoundWriter(st, buffersize=1024) diff --git a/tests/test_compound.py b/tests/test_compound.py index 515b966b..38de3f22 100644 --- a/tests/test_compound.py +++ b/tests/test_compound.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - from whoosh.compat import b from whoosh.filedb.compound import CompoundStorage from whoosh.filedb.filestore import RamStorage diff --git a/tests/test_flexible.py b/tests/test_flexible.py index 3d594f0d..3eb6c257 100644 --- a/tests/test_flexible.py +++ b/tests/test_flexible.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - from whoosh import fields from whoosh.compat import b, u from whoosh.util.testing import TempIndex diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index 121d134e..e47b0d9b 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from __future__ import with_statement - import pytest # from jieba.analyse import ChineseAnalyzer diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 45262133..29631438 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ 
-1,5 +1,3 @@ -from __future__ import with_statement - import random from collections import defaultdict from datetime import datetime diff --git a/tests/test_matching.py b/tests/test_matching.py index 2d31d9bf..4d21cf5a 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - from random import choice, randint, sample from whoosh import fields, matching, qparser, query diff --git a/tests/test_misc.py b/tests/test_misc.py index ddfb2d50..5a4f2224 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import os import threading import time diff --git a/tests/test_mpwriter.py b/tests/test_mpwriter.py index 211abc72..acf16488 100644 --- a/tests/test_mpwriter.py +++ b/tests/test_mpwriter.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import random from collections import deque diff --git a/tests/test_nested.py b/tests/test_nested.py index 4da5274d..41dc704c 100644 --- a/tests/test_nested.py +++ b/tests/test_nested.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - from whoosh import fields, query, sorting from whoosh.compat import u from whoosh.filedb.filestore import RamStorage diff --git a/tests/test_parse_plugins.py b/tests/test_parse_plugins.py index 2cd6d3a4..ae512588 100644 --- a/tests/test_parse_plugins.py +++ b/tests/test_parse_plugins.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import inspect from datetime import datetime diff --git a/tests/test_postings.py b/tests/test_postings.py index 0260db65..6d836a19 100644 --- a/tests/test_postings.py +++ b/tests/test_postings.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - from whoosh import analysis, fields from whoosh.codec import default_codec from whoosh.compat import u diff --git a/tests/test_quality.py b/tests/test_quality.py index 1f6476e3..2e00d58b 100644 --- a/tests/test_quality.py +++ b/tests/test_quality.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import random from whoosh import fields, matching, scoring diff --git a/tests/test_reading.py b/tests/test_reading.py index a8f1427f..56cc9cc6 100644 --- a/tests/test_reading.py +++ b/tests/test_reading.py @@ -1,6 +1,3 @@ -# coding=utf-8 -from __future__ import with_statement - import random import threading import time @@ -126,20 +123,18 @@ def test_term_inspection(): a_exp = list(r.expand_prefix("content", "a")) assert a_exp == [b("aa"), b("ab"), b("ax")] - assert set(r.all_terms()) == set( - [ - ("content", b("aa")), - ("content", b("ab")), - ("content", b("ax")), - ("content", b("bb")), - ("content", b("cc")), - ("content", b("dd")), - ("content", b("ee")), - ("title", b("document")), - ("title", b("my")), - ("title", b("other")), - ] - ) + assert set(r.all_terms()) == { + ("content", b("aa")), + ("content", b("ab")), + ("content", b("ax")), + ("content", b("bb")), + ("content", b("cc")), + ("content", b("dd")), + ("content", b("ee")), + ("title", b("document")), + ("title", b("my")), + ("title", b("other")), + } # (text, doc_freq, index_freq) cstats = _fstats(r.iter_field("content")) @@ -497,8 +492,8 @@ def test_cursor(): def _check_inspection_results(ix): - AE = "aé".encode("utf-8") - AU = "aú".encode("utf-8") + AE = "aé".encode() + AU = "aú".encode() with ix.reader() as r: cterms = " ".join(r.field_terms("content")) @@ -508,20 +503,18 @@ def _check_inspection_results(ix): assert a_exp == [b("aa"), AE, AU] tset = set(r.all_terms()) - assert tset == set( - [ - ("content", b("aa")), - ("content", 
AE), - ("content", AU), - ("content", b("bb")), - ("content", b("cc")), - ("content", b("dd")), - ("content", b("ee")), - ("title", b("document")), - ("title", b("my")), - ("title", b("other")), - ] - ) + assert tset == { + ("content", b("aa")), + ("content", AE), + ("content", AU), + ("content", b("bb")), + ("content", b("cc")), + ("content", b("dd")), + ("content", b("ee")), + ("title", b("document")), + ("title", b("my")), + ("title", b("other")), + } # (text, doc_freq, index_freq) assert _fstats(r.iter_field("content")) == [ diff --git a/tests/test_results.py b/tests/test_results.py index c750363b..6df221dc 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import pytest from whoosh import analysis, fields, formats, highlight, qparser, query from whoosh.codec.whoosh3 import W3Codec @@ -214,8 +212,8 @@ def test_extend_filtered(): hits = lambda result: [hit["id"] for hit in result] with ix.searcher() as s: - r1 = s.search(query.Term("text", u("alfa")), filter=set([1, 4])) - assert r1.allowed == set([1, 4]) + r1 = s.search(query.Term("text", u("alfa")), filter={1, 4}) + assert r1.allowed == {1, 4} assert len(r1.top_n) == 0 r2 = s.search(query.Term("text", u("bravo"))) @@ -223,7 +221,7 @@ def test_extend_filtered(): assert hits(r2) == [1, 2, 4] r3 = r1.copy() - assert r3.allowed == set([1, 4]) + assert r3.allowed == {1, 4} assert len(r3.top_n) == 0 r3.extend(r2) assert len(r3.top_n) == 3 diff --git a/tests/test_searching.py b/tests/test_searching.py index d40d3539..88bbefa5 100644 --- a/tests/test_searching.py +++ b/tests/test_searching.py @@ -1,7 +1,3 @@ -# encoding: utf-8 - -from __future__ import with_statement - import copy from datetime import datetime, timedelta @@ -1641,7 +1637,7 @@ def test_groupedby_with_terms(): assert len(r) == 2 assert r.groups("organism") == {"mus": [1, 0]} assert r.has_matched_terms() - assert r.matched_terms() == set([("content", b("ipfstd1"))]) + assert r.matched_terms() == {("content", b("ipfstd1"))} def test_buffered_refresh(): @@ -1702,7 +1698,7 @@ def test_terms_with_filter(): w.add_document(text=u("hotel alfa bravo charlie")) with ix.searcher() as s: - workingset = set([1, 2, 3]) + workingset = {1, 2, 3} q = query.Term("text", u("foxtrot")) r = s.search_page(q, pagenum=1, pagelen=5, terms=True, filter=workingset) @@ -1867,7 +1863,7 @@ def pos_score_fn(searcher, fieldname, text, matcher): assert not m.supports_block_quality() r = s.search(q, limit=5) - ids = "".join(([hit["id"] for hit in r])) + ids = "".join([hit["id"] for hit in r]) assert ids == "agmsb" q = query.Or( diff --git a/tests/test_sorting.py b/tests/test_sorting.py index b2744c24..18697d6a 100644 --- a/tests/test_sorting.py +++ b/tests/test_sorting.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import random from datetime import datetime, timedelta @@ -562,7 +560,7 @@ def test_multifacet(): with ix.searcher() as s: facet = sorting.MultiFacet(["tag", "size"]) r = s.search(query.Every(), groupedby={"tag/size": facet}) - cats = r.groups(("tag/size")) + cats = r.groups("tag/size") assert cats == correct diff --git a/tests/test_spans.py b/tests/test_spans.py index ee5c80a8..162fcbdc 100644 --- a/tests/test_spans.py +++ b/tests/test_spans.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - from whoosh import analysis, fields, formats from whoosh.compat import permutations, range, u from whoosh.filedb.filestore import RamStorage diff --git a/tests/test_spelling.py b/tests/test_spelling.py index 
3a418f15..8a313eb1 100644 --- a/tests/test_spelling.py +++ b/tests/test_spelling.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import gzip from whoosh import analysis, fields, highlight, query, spelling diff --git a/tests/test_tables.py b/tests/test_tables.py index cdbdae0c..c276d77d 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -1,7 +1,3 @@ -# encoding: utf-8 - -from __future__ import with_statement - import random from whoosh.compat import b, iteritems, range @@ -71,7 +67,7 @@ def test_hash_contents(): ("whiskey", "xray"), ] # Convert to bytes - samp = set((b(k), b(v)) for k, v in samp) + samp = {(b(k), b(v)) for k, v in samp} with TempStorage("hashcontents") as st: hw = HashWriter(st.create_file("test.hsh")) @@ -85,8 +81,8 @@ def test_hash_contents(): for key, value in probes: assert hr[key] == value - assert set(hr.keys()) == set([k for k, v in samp]) - assert set(hr.values()) == set([v for k, v in samp]) + assert set(hr.keys()) == {k for k, v in samp} + assert set(hr.values()) == {v for k, v in samp} assert set(hr.items()) == samp hr.close() @@ -104,7 +100,7 @@ def randstring(): return b(s) with TempStorage("randomhash") as st: - samp = dict((randstring(), randstring()) for _ in range(times)) + samp = {randstring(): randstring() for _ in range(times)} hw = HashWriter(st.create_file("test.hsh")) for k, v in iteritems(samp): diff --git a/tests/test_vectors.py b/tests/test_vectors.py index cdd13148..339859f3 100644 --- a/tests/test_vectors.py +++ b/tests/test_vectors.py @@ -1,6 +1,3 @@ -# encoding: utf-8 -from __future__ import with_statement - from whoosh import fields, formats from whoosh.compat import u from whoosh.filedb.filestore import RamStorage diff --git a/tests/test_weightings.py b/tests/test_weightings.py index 09f07995..01f1b8ec 100644 --- a/tests/test_weightings.py +++ b/tests/test_weightings.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import inspect import sys from random import choice, randint diff --git a/tests/test_writing.py b/tests/test_writing.py index 7c979a2d..8faa48a7 100644 --- a/tests/test_writing.py +++ b/tests/test_writing.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import random import threading import time
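A closing note on the set([...]) to {...} rewrites that dominate the test-suite hunks above: the literal form is equivalent but avoids building a throwaway list, and the set/dict comprehensions likewise replace the generator-feeding-constructor pattern. The one case the literal syntax cannot express is the empty set:

    assert set([1, 4]) == {1, 4}
    assert {k for k in "abc"} == set("abc")

    # {} is an empty dict, not an empty set, so empty sets must stay set().
    assert isinstance({}, dict)
    assert set() != {}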