From 07db88d659b5de964ab7c9ffa9ab98bdb7f90c20 Mon Sep 17 00:00:00 2001 From: Steven Nicolaou Date: Mon, 15 Jul 2019 04:24:12 -0400 Subject: [PATCH 01/14] #532: Highlight longest overlapping token --- setup.py | 26 ++++++++++++++------------ src/whoosh/highlight.py | 11 ++++++++--- tests/test_highlighting.py | 24 ++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 15 deletions(-) diff --git a/setup.py b/setup.py index c4997749..d5accade 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,8 @@ #!python -import os.path, sys +import os.path +import sys + from setuptools import setup, find_packages from setuptools.command.test import test as TestCommand @@ -20,7 +22,7 @@ def finalize_options(self): self.test_suite = True def run_tests(self): - #import here, cause outside the eggs aren't loaded + # import here, cause outside the eggs aren't loaded import pytest pytest.main(self.test_args) @@ -44,18 +46,18 @@ def run_tests(self): zip_safe=True, install_requires=['cached-property'], - tests_require=['pytest'], + tests_require=['pytest', 'jieba'], cmdclass={'test': PyTest}, classifiers=[ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "License :: OSI Approved :: BSD License", - "Natural Language :: English", - "Operating System :: OS Independent", - "Programming Language :: Python :: 2.5", - "Programming Language :: Python :: 3", - "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Text Processing :: Indexing", + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 2.5", + "Programming Language :: Python :: 3", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Text Processing :: Indexing", ], ) diff --git a/src/whoosh/highlight.py b/src/whoosh/highlight.py index 6fbbe6d0..212a7157 100644 --- 
a/src/whoosh/highlight.py +++ b/src/whoosh/highlight.py @@ -131,8 +131,8 @@ def __init__(self, text, matches, startchar=0, endchar=-1): self.matched_terms.add(t.text) def __repr__(self): - return "" % (self.startchar, self.endchar, - len(self.matches)) + return "" % (self.startchar, self.endchar, + len(self.matches)) def __len__(self): return self.endchar - self.startchar @@ -695,7 +695,12 @@ def format_fragment(self, fragment, replace=False): index = fragment.startchar text = fragment.text - for t in fragment.matches: + # For overlapping tokens (such as in Chinese), sort by position, + # then by inverse of length. + # Because the formatter is sequential, it will only pick the first + # token for a given position to highlight. This makes sure it picks + # the longest overlapping token. + for t in sorted(fragment.matches, key=lambda token: (token.startchar, -(token.endchar - token.startchar))): if t.startchar is None: continue if t.startchar < index: diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index 523dff6b..c2cb94ff 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -2,6 +2,7 @@ from __future__ import with_statement +from jieba.analyse import ChineseAnalyzer import pytest from whoosh import analysis, highlight, fields, qparser, query @@ -330,3 +331,26 @@ def test_whole_noterms(): hi = r[0].highlights("text", minscore=0) assert hi == u("alfa bravo charlie delta echo foxtrot golf") + + +def test_overlapping_tokens(): + query_string = "马克思" + text = "两次历史性飞跃与马克思主义中国化" + analyzer = ChineseAnalyzer() + formatter = highlight.HtmlFormatter() + + terms = [token.text for token in analyzer(query_string)] + + assert terms == ['马克', '马克思'] + + output = highlight.highlight( + text, + terms, + analyzer, + highlight.WholeFragmenter(), + formatter + ) + + assert output == '两次历史性飞跃与马克思主义中国化', \ + 'The longest overlapping token 马克思 was not selected by the highlighter' + # as opposed to '两次历史性飞跃与马克思主义中国化' From 
4b02612b4fbfee2c67bc2e5c43d055e2c76c92c7 Mon Sep 17 00:00:00 2001 From: Steven Nicolaou Date: Wed, 17 Jul 2019 02:02:58 -0400 Subject: [PATCH 02/14] #532: Add jieba to Travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a75414cf..6f458cbf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ python: - "3.7" install: - - pip install pytest nose codecov coverage cached-property + - pip install pytest nose codecov coverage cached-property jieba script: - nosetests --with-coverage From 4737bb31f303fb91ce73d2e1ef3468efea499748 Mon Sep 17 00:00:00 2001 From: Steven Nicolaou Date: Wed, 17 Jul 2019 02:16:57 -0400 Subject: [PATCH 03/14] Unicode input --- tests/test_highlighting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index c2cb94ff..714a395f 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -334,8 +334,8 @@ def test_whole_noterms(): def test_overlapping_tokens(): - query_string = "马克思" - text = "两次历史性飞跃与马克思主义中国化" + query_string = u("马克思") + text = u("两次历史性飞跃与马克思主义中国化") analyzer = ChineseAnalyzer() formatter = highlight.HtmlFormatter() From e96c6cc38e1a47adbc6fb0446470e79650836194 Mon Sep 17 00:00:00 2001 From: Steven Nicolaou Date: Wed, 17 Jul 2019 02:19:55 -0400 Subject: [PATCH 04/14] Unicode input --- tests/test_highlighting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index 714a395f..7416a516 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -341,7 +341,7 @@ def test_overlapping_tokens(): terms = [token.text for token in analyzer(query_string)] - assert terms == ['马克', '马克思'] + assert terms == [u('马克'), u('马克思')] output = highlight.highlight( text, From 407b6c012a9c8a1cfe60e303eb3ca484f80f59e2 Mon Sep 17 00:00:00 2001 From: Steven Nicolaou Date: Wed, 17 Jul 2019 02:58:44 -0400 
Subject: [PATCH 05/14] Remove assert --- tests/test_highlighting.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index 7416a516..17069078 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -341,8 +341,6 @@ def test_overlapping_tokens(): terms = [token.text for token in analyzer(query_string)] - assert terms == [u('马克'), u('马克思')] - output = highlight.highlight( text, terms, From 3c9379f06022cded307047b90a2bbf7edd455500 Mon Sep 17 00:00:00 2001 From: Steven Nicolaou Date: Wed, 17 Jul 2019 03:04:02 -0400 Subject: [PATCH 06/14] Unicode --- tests/test_highlighting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index 17069078..c29f0f67 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -349,6 +349,6 @@ def test_overlapping_tokens(): formatter ) - assert output == '两次历史性飞跃与马克思主义中国化', \ + assert output == u('两次历史性飞跃与马克思主义中国化'), \ 'The longest overlapping token 马克思 was not selected by the highlighter' # as opposed to '两次历史性飞跃与马克思主义中国化' From a7f8243acb1ea659d2d628b9cbbb2bf40a380b17 Mon Sep 17 00:00:00 2001 From: Steven Nicolaou Date: Wed, 17 Jul 2019 03:16:11 -0400 Subject: [PATCH 07/14] Unicode --- tests/test_highlighting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index c29f0f67..b22db8da 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -2,8 +2,8 @@ from __future__ import with_statement -from jieba.analyse import ChineseAnalyzer import pytest +from jieba.analyse import ChineseAnalyzer from whoosh import analysis, highlight, fields, qparser, query from whoosh.compat import u @@ -350,5 +350,5 @@ def test_overlapping_tokens(): ) assert output == u('两次历史性飞跃与马克思主义中国化'), \ - 'The longest overlapping token 马克思 was not selected by the highlighter' + 'The longest overlapping token 马克思 
was not selected by the highlighter' + ' : ' + output # as opposed to '两次历史性飞跃与马克思主义中国化' From cbb9f77cad9ae29e0c59f33b1c09ee08b782789d Mon Sep 17 00:00:00 2001 From: Steven Nicolaou Date: Thu, 18 Jul 2019 02:20:32 -0400 Subject: [PATCH 08/14] Fix Unicode in test --- tests/test_highlighting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index b22db8da..1647d1bf 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -334,8 +334,8 @@ def test_whole_noterms(): def test_overlapping_tokens(): - query_string = u("马克思") - text = u("两次历史性飞跃与马克思主义中国化") + query_string = u'马克思' + text = u'两次历史性飞跃与马克思主义中国化' analyzer = ChineseAnalyzer() formatter = highlight.HtmlFormatter() @@ -349,6 +349,6 @@ def test_overlapping_tokens(): formatter ) - assert output == u('两次历史性飞跃与马克思主义中国化'), \ - 'The longest overlapping token 马克思 was not selected by the highlighter' + ' : ' + output + assert output == u'两次历史性飞跃与马克思主义中国化', \ + u'The longest overlapping token 马克思 was not selected by the highlighter' # as opposed to '两次历史性飞跃与马克思主义中国化' From a0439acf83089cabfe0731155b87a0fca7b3b099 Mon Sep 17 00:00:00 2001 From: "sweep-ai[bot]" <128439645+sweep-ai[bot]@users.noreply.github.com> Date: Sat, 30 Dec 2023 06:55:26 +0000 Subject: [PATCH 09/14] Create sweep.yaml --- sweep.yaml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 sweep.yaml diff --git a/sweep.yaml b/sweep.yaml new file mode 100644 index 00000000..89e1d027 --- /dev/null +++ b/sweep.yaml @@ -0,0 +1,27 @@ +# Sweep AI turns bugs & feature requests into code changes (https://sweep.dev) +# For details on our config file, check out our docs at https://docs.sweep.dev/usage/config + +# This setting contains a list of rules that Sweep will check for. If any of these rules are broken in a new commit, Sweep will create a pull request to fix the broken rule. 
+rules: + - "All new business logic should have corresponding unit tests." + - "Refactor large functions to be more modular." + - "Add docstrings to all functions and file headers." + +# This is the branch that Sweep will develop from and make pull requests to. Most people use 'main' or 'master' but some users also use 'dev' or 'staging'. +branch: 'main' + +# By default Sweep will read the logs and outputs from your existing Github Actions. To disable this, set this to false. +gha_enabled: True + +# This is the description of your project. It will be used by sweep when creating PRs. You can tell Sweep what's unique about your project, what frameworks you use, or anything else you want. +# +# Example: +# +# description: sweepai/sweep is a python project. The main api endpoints are in sweepai/api.py. Write code that adheres to PEP8. +description: '' + +# This sets whether to create pull requests as drafts. If this is set to True, then all pull requests will be created as drafts and GitHub Actions will not be triggered. +draft: False + +# This is a list of directories that Sweep will not be able to edit. +blocked_dirs: [] From fe50e0cd1c890154ce8944b522d5de4304651e32 Mon Sep 17 00:00:00 2001 From: "sweep-ai[bot]" <128439645+sweep-ai[bot]@users.noreply.github.com> Date: Sat, 30 Dec 2023 06:55:26 +0000 Subject: [PATCH 10/14] Create sweep template --- .github/ISSUE_TEMPLATE/sweep-template.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/sweep-template.yml diff --git a/.github/ISSUE_TEMPLATE/sweep-template.yml b/.github/ISSUE_TEMPLATE/sweep-template.yml new file mode 100644 index 00000000..44116f53 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/sweep-template.yml @@ -0,0 +1,15 @@ +name: Sweep Issue +title: 'Sweep: ' +description: For small bugs, features, refactors, and tests to be handled by Sweep, an AI-powered junior developer. 
+labels: sweep +body: + - type: textarea + id: description + attributes: + label: Details + description: Tell Sweep where and what to edit and provide enough context for a new developer to the codebase + placeholder: | + Unit Tests: Write unit tests for <FILE>. Test each function in the file. Make sure to test edge cases. + Bugs: The bug might be in <FILE>. Here are the logs: ... + Features: the new endpoint should use the ... class from <FILE> because it contains ... logic. + Refactors: We are migrating this function to ... version because ... \ No newline at end of file From fd1a83afa066ac699c538812a223286eb926e659 Mon Sep 17 00:00:00 2001 From: Alejandro Gil Date: Wed, 3 Jan 2024 06:54:57 -0700 Subject: [PATCH 11/14] Fixing test_automata.py DFA error in Pypy3.9 and 3.10 --- tests/test_automata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_automata.py b/tests/test_automata.py index 1bceb733..9b57c45b 100644 --- a/tests/test_automata.py +++ b/tests/test_automata.py @@ -360,7 +360,7 @@ def test_strings_dfa(): domain = "abcd" words = set() - for i in xrange(1, len(domain) + 1): + for i in range(1, len(domain) + 1): # Replace xrange with range since xrange is failing in Pypy3.9 and 3.10 words.update("".join(p) for p in permutations(domain[:i])) words = sorted(words) dfa = fsa.strings_dfa(words) From b18b9e5513af25d522c991b10c7921dee5c13b57 Mon Sep 17 00:00:00 2001 From: Alejandro Gil Date: Wed, 3 Jan 2024 07:05:38 -0700 Subject: [PATCH 12/14] Fix test_analysis.py Deprecation warning in Pypy3.10 --- tests/test_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_analysis.py b/tests/test_analysis.py index c46a70db..425415f4 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -520,7 +520,7 @@ def test_stop_lang(): def test_issue358(): - t = analysis.RegexTokenizer("\w+") + t = analysis.RegexTokenizer(r"\w+") with pytest.raises(analysis.CompositionError): _ = t | analysis.StandardAnalyzer() From 
b479b22cc942d938ce2b699522b9b660dd30970a Mon Sep 17 00:00:00 2001 From: Alejandro Gil Date: Wed, 3 Jan 2024 08:00:18 -0700 Subject: [PATCH 13/14] Update Readme.md with a working documentation link and some other extra information and fixes. --- README.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index e11d0352..45faaa69 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,7 @@ Whoosh might be useful in the following circumstances: * When an easy-to-use Pythonic interface is more important to you than raw speed. -Whoosh was created and is maintained by Matt Chaput. It was originally created -for use in the online help system of Side Effects Software's 3D animation +Whoosh was created by Matt Chaput and is maintained currently by the Sygil-Dev Organization. It was originally created for use in the online help system of Side Effects Software's 3D animation software Houdini. Side Effects Software Inc. graciously agreed to open-source the code. @@ -43,28 +42,29 @@ Installing Whoosh If you have ``setuptools`` or ``pip`` installed, you can use ``easy_install`` or ``pip`` to download and install Whoosh automatically:: + # install the old version from Pypi $ easy_install Whoosh - + or - + $ pip install Whoosh + + + # Install the development version from Github. + $ pip install git+https://github.com/Sygil-Dev/whoosh.git Learning more ============= -* Read the online documentation at https://whoosh.readthedocs.org/en/latest/ +* Read the online documentation at https://docs.red-dove.com/whoosh/ (Search DOES work). -* Join the Whoosh mailing list at http://groups.google.com/group/whoosh +* Read the old online documentation at https://whoosh.readthedocs.org/en/latest/ (Search DOES NOT work). -* File bug reports and view the Whoosh wiki at - http://bitbucket.org/mchaput/whoosh/ +* File bug reports and issues at https://github.com/Sygil-Dev/whoosh/issues -Getting the source +Getting the source. 
================== -Download source releases from PyPI at http://pypi.python.org/pypi/Whoosh/ - -You can check out the latest version of the source code using Mercurial:: - - hg clone http://bitbucket.org/mchaput/whoosh +You can check out the latest version of the source code on GitHub using git: + $ git clone https://github.com/Sygil-Dev/whoosh.git From 844d840d7a5aa829d3ac5886b2cae09d74270fa4 Mon Sep 17 00:00:00 2001 From: "sweep-ai[bot]" <128439645+sweep-ai[bot]@users.noreply.github.com> Date: Wed, 3 Jan 2024 16:58:41 +0000 Subject: [PATCH 14/14] fix: install jieba module before running tests --- .github/workflows/python-package.yml | 37 ++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 .github/workflows/python-package.yml diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 00000000..0707587f --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,37 @@ +name: Python package + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.6, 3.7, 3.8, 3.9] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pip install jieba + pytest