Merge pull request mchaput#17 from ZeroCool940711/main

Highlight longest overlapping token
cclauss · Jan 3, 2024 · 67bcb6c · 67bcb6c
2 parents 0330b24 + f8beeeb
commit 67bcb6c
Show file tree

Hide file tree

Showing 7 changed files with 124 additions and 16 deletions.
diff --git a/.github/ISSUE_TEMPLATE/sweep-template.yml b/.github/ISSUE_TEMPLATE/sweep-template.yml
@@ -0,0 +1,15 @@
+name: Sweep Issue
+title: 'Sweep: '
+description: For small bugs, features, refactors, and tests to be handled by Sweep, an AI-powered junior developer.
+labels: sweep
+body:
+  - type: textarea
+    id: description
+    attributes:
+      label: Details
+      description: Tell Sweep where and what to edit and provide enough context for a new developer to the codebase
+      placeholder: |
+        Unit Tests: Write unit tests for <FILE>. Test each function in the file. Make sure to test edge cases.
+        Bugs: The bug might be in <FILE>. Here are the logs: ...
+        Features: the new endpoint should use the ... class from <FILE> because it contains ... logic.
+        Refactors: We are migrating this function to ... version because ...
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -0,0 +1,37 @@
+name: Python package
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [3.6, 3.7, 3.8, 3.9]
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install flake8 pytest
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        pip install jieba
+        pytest
diff --git a/setup.py b/setup.py
@@ -1,6 +1,8 @@
 #!python
 
-import os.path, sys
+import os.path
+import sys
+
 from setuptools import setup, find_packages
 from setuptools.command.test import test as TestCommand
 
@@ -20,7 +22,7 @@ def finalize_options(self):
         self.test_suite = True
 
     def run_tests(self):
-        #import here, cause outside the eggs aren't loaded
+        # import here, cause outside the eggs aren't loaded
         import pytest
         pytest.main(self.test_args)
 
@@ -44,18 +46,18 @@ def run_tests(self):
 
         zip_safe=True,
         install_requires=['cached-property'],
-        tests_require=['pytest'],
+        tests_require=['pytest', 'jieba'],
         cmdclass={'test': PyTest},
 
         classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "Intended Audience :: Developers",
-        "License :: OSI Approved :: BSD License",
-        "Natural Language :: English",
-        "Operating System :: OS Independent",
-        "Programming Language :: Python :: 2.5",
-        "Programming Language :: Python :: 3",
-        "Topic :: Software Development :: Libraries :: Python Modules",
-        "Topic :: Text Processing :: Indexing",
+            "Development Status :: 5 - Production/Stable",
+            "Intended Audience :: Developers",
+            "License :: OSI Approved :: BSD License",
+            "Natural Language :: English",
+            "Operating System :: OS Independent",
+            "Programming Language :: Python :: 2.5",
+            "Programming Language :: Python :: 3",
+            "Topic :: Software Development :: Libraries :: Python Modules",
+            "Topic :: Text Processing :: Indexing",
         ],
     )
diff --git a/src/whoosh/highlight.py b/src/whoosh/highlight.py
@@ -131,8 +131,8 @@ def __init__(self, text, matches, startchar=0, endchar=-1):
                 self.matched_terms.add(t.text)
 
     def __repr__(self):
-        return "<Fragment %d:%d %d>" % (self.startchar, self.endchar,
-                                        len(self.matches))
+        return "<Fragment %d:%d has %d matches>" % (self.startchar, self.endchar,
+                                                    len(self.matches))
 
     def __len__(self):
         return self.endchar - self.startchar
@@ -695,7 +695,12 @@ def format_fragment(self, fragment, replace=False):
         index = fragment.startchar
         text = fragment.text
 
-        for t in fragment.matches:
+        # For overlapping tokens (such as in Chinese), sort by start position,
+        # then by descending length.
+        # Because the formatter is sequential, it only highlights the first
+        # token it sees at a given position. Sorting this way makes sure that
+        # token is the longest of the overlapping ones.
+        for t in sorted(fragment.matches, key=lambda token: (token.startchar, -(token.endchar - token.startchar))):
             if t.startchar is None:
                 continue
             if t.startchar < index:

diff --git a/sweep.yaml b/sweep.yaml
@@ -0,0 +1,27 @@
+# Sweep AI turns bugs & feature requests into code changes (https://sweep.dev)
+# For details on our config file, check out our docs at https://docs.sweep.dev/usage/config
+
+# This setting contains a list of rules that Sweep will check for. If any of these rules are broken in a new commit, Sweep will create a pull request to fix the broken rule.
+rules:
+  - "All new business logic should have corresponding unit tests."
+  - "Refactor large functions to be more modular."
+  - "Add docstrings to all functions and file headers."
+
+# This is the branch that Sweep will develop from and make pull requests to. Most people use 'main' or 'master' but some users also use 'dev' or 'staging'.
+branch: 'main'
+
+# By default Sweep will read the logs and outputs from your existing GitHub Actions. To disable this, set this to false.
+gha_enabled: True
+
+# This is the description of your project. It will be used by Sweep when creating PRs. You can tell Sweep what's unique about your project, what frameworks you use, or anything else you want.
+#
+# Example:
+#
+# description: sweepai/sweep is a python project. The main api endpoints are in sweepai/api.py. Write code that adheres to PEP8.
+description: ''
+
+# This sets whether to create pull requests as drafts. If this is set to True, then all pull requests will be created as drafts and GitHub Actions will not be triggered.
+draft: False
+
+# This is a list of directories that Sweep will not be able to edit.
+blocked_dirs: []
diff --git a/tests/test_automata.py b/tests/test_automata.py
@@ -360,7 +360,7 @@ def test_strings_dfa():
 
     domain = "abcd"
     words = set()
-    for i in xrange(1, len(domain) + 1):
+    for i in range(1, len(domain) + 1):  # Replace xrange with range since xrange is failing in PyPy 3.9 and 3.10
         words.update("".join(p) for p in permutations(domain[:i]))
     words = sorted(words)
     dfa = fsa.strings_dfa(words)

diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py
@@ -3,6 +3,7 @@
 from __future__ import with_statement
 
 import pytest
+from jieba.analyse import ChineseAnalyzer
 
 from whoosh import analysis, highlight, fields, qparser, query
 from whoosh.compat import u
@@ -330,3 +331,24 @@ def test_whole_noterms():
 
         hi = r[0].highlights("text", minscore=0)
         assert hi == u("alfa bravo charlie delta echo foxtrot golf")
+
+
+def test_overlapping_tokens():
+    query_string = u'马克思'
+    text = u'两次历史性飞跃与马克思主义中国化'
+    analyzer = ChineseAnalyzer()
+    formatter = highlight.HtmlFormatter()
+
+    terms = [token.text for token in analyzer(query_string)]
+
+    output = highlight.highlight(
+        text,
+        terms,
+        analyzer,
+        highlight.WholeFragmenter(),
+        formatter
+    )
+
+    assert output == u'两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化', \
+        u'The longest overlapping token 马克思 was not selected by the highlighter'
+    # as opposed to '两次历史性飞跃与<strong class="match term0">马克</strong>思主义中国化'