diff --git a/.github/ISSUE_TEMPLATE/sweep-template.yml b/.github/ISSUE_TEMPLATE/sweep-template.yml new file mode 100644 index 00000000..44116f53 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/sweep-template.yml @@ -0,0 +1,15 @@ +name: Sweep Issue +title: 'Sweep: ' +description: For small bugs, features, refactors, and tests to be handled by Sweep, an AI-powered junior developer. +labels: sweep +body: + - type: textarea + id: description + attributes: + label: Details + description: Tell Sweep where and what to edit and provide enough context for a new developer to the codebase + placeholder: | + Unit Tests: Write unit tests for <FILE>. Test each function in the file. Make sure to test edge cases. + Bugs: The bug might be in <FILE>. Here are the logs: ... + Features: the new endpoint should use the ... class from <FILE> because it contains ... logic. + Refactors: We are migrating this function to ... version because ... \ No newline at end of file diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 00000000..0707587f --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,37 @@ +name: Python package + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.6, 3.7, 3.8, 3.9] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pip install jieba + pytest diff --git a/setup.py b/setup.py index c4997749..d5accade 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,8 @@ #!python -import os.path, sys +import os.path +import sys + from setuptools import setup, find_packages from setuptools.command.test import test as TestCommand @@ -20,7 +22,7 @@ def finalize_options(self): self.test_suite = True def run_tests(self): - #import here, cause outside the eggs aren't loaded + # import here, cause outside the eggs aren't loaded import pytest pytest.main(self.test_args) @@ -44,18 +46,18 @@ def run_tests(self): zip_safe=True, install_requires=['cached-property'], - tests_require=['pytest'], + tests_require=['pytest', 'jieba'], cmdclass={'test': PyTest}, classifiers=[ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "License :: OSI Approved :: BSD License", - "Natural Language :: English", - "Operating System :: OS Independent", - "Programming Language :: Python :: 2.5", - "Programming Language :: Python :: 3", - "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Text Processing :: Indexing", + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 2.5", + "Programming Language :: Python :: 3", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Text Processing :: Indexing", ], ) diff --git a/src/whoosh/highlight.py b/src/whoosh/highlight.py index 6fbbe6d0..212a7157 100644 --- a/src/whoosh/highlight.py +++ b/src/whoosh/highlight.py @@ -131,8 +131,8 @@ def __init__(self, text, matches, startchar=0, endchar=-1): self.matched_terms.add(t.text) def __repr__(self): - return "<Fragment %d:%d %d>" % (self.startchar, self.endchar, - len(self.matches)) + 
return "<Fragment %d:%d %d>" % (self.startchar, self.endchar, + len(self.matches)) def __len__(self): return self.endchar - self.startchar @@ -695,7 +695,12 @@ def format_fragment(self, fragment, replace=False): index = fragment.startchar text = fragment.text - for t in fragment.matches: + # For overlapping tokens (such as in Chinese), sort by position, + # then by inverse of length. + # Because the formatter is sequential, it will only pick the first + # token for a given position to highlight. This makes sure it picks + # the longest overlapping token. + for t in sorted(fragment.matches, key=lambda token: (token.startchar, -(token.endchar - token.startchar))): if t.startchar is None: continue if t.startchar < index: diff --git a/sweep.yaml b/sweep.yaml new file mode 100644 index 00000000..89e1d027 --- /dev/null +++ b/sweep.yaml @@ -0,0 +1,27 @@ +# Sweep AI turns bugs & feature requests into code changes (https://sweep.dev) +# For details on our config file, check out our docs at https://docs.sweep.dev/usage/config + +# This setting contains a list of rules that Sweep will check for. If any of these rules are broken in a new commit, Sweep will create a pull request to fix the broken rule. +rules: + - "All new business logic should have corresponding unit tests." + - "Refactor large functions to be more modular." + - "Add docstrings to all functions and file headers." + +# This is the branch that Sweep will develop from and make pull requests to. Most people use 'main' or 'master' but some users also use 'dev' or 'staging'. +branch: 'main' + +# By default Sweep will read the logs and outputs from your existing GitHub Actions. To disable this, set this to false. +gha_enabled: True + +# This is the description of your project. It will be used by sweep when creating PRs. You can tell Sweep what's unique about your project, what frameworks you use, or anything else you want. +# +# Example: +# +# description: sweepai/sweep is a python project. 
The main api endpoints are in sweepai/api.py. Write code that adheres to PEP8. +description: '' + +# This sets whether to create pull requests as drafts. If this is set to True, then all pull requests will be created as drafts and GitHub Actions will not be triggered. +draft: False + +# This is a list of directories that Sweep will not be able to edit. +blocked_dirs: [] diff --git a/tests/test_automata.py b/tests/test_automata.py index 1bceb733..9b57c45b 100644 --- a/tests/test_automata.py +++ b/tests/test_automata.py @@ -360,7 +360,7 @@ def test_strings_dfa(): domain = "abcd" words = set() - for i in xrange(1, len(domain) + 1): + for i in range(1, len(domain) + 1): # Replace xrange with range since xrange is failing in Pypy3.9 and 3.10 words.update("".join(p) for p in permutations(domain[:i])) words = sorted(words) dfa = fsa.strings_dfa(words) diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index 523dff6b..1647d1bf 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -3,6 +3,7 @@ from __future__ import with_statement import pytest +from jieba.analyse import ChineseAnalyzer from whoosh import analysis, highlight, fields, qparser, query from whoosh.compat import u @@ -330,3 +331,24 @@ def test_whole_noterms(): hi = r[0].highlights("text", minscore=0) assert hi == u("alfa bravo charlie delta echo foxtrot golf") + + +def test_overlapping_tokens(): + query_string = u'马克思' + text = u'两次历史性飞跃与马克思主义中国化' + analyzer = ChineseAnalyzer() + formatter = highlight.HtmlFormatter() + + terms = [token.text for token in analyzer(query_string)] + + output = highlight.highlight( + text, + terms, + analyzer, + highlight.WholeFragmenter(), + formatter + ) + + assert output == u'两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化', \ + u'The longest overlapping token 马克思 was not selected by the highlighter' + # as opposed to '两次历史性飞跃与<strong class="match term0">马克</strong>思主义中国化'