Merge branch '532'

cclauss · Jan 3, 2024 · 0b0e86f · 0b0e86f
2 parents 0330b24 + cbb9f77
commit 0b0e86f
Show file tree

Hide file tree

Showing 3 changed files with 44 additions and 15 deletions.
diff --git a/setup.py b/setup.py
@@ -1,6 +1,8 @@
 #!python
 
-import os.path, sys
+import os.path
+import sys
+
 from setuptools import setup, find_packages
 from setuptools.command.test import test as TestCommand
 
@@ -20,7 +22,7 @@ def finalize_options(self):
         self.test_suite = True
 
     def run_tests(self):
-        #import here, cause outside the eggs aren't loaded
+        # Import here because outside this method the eggs aren't loaded.
         import pytest
         pytest.main(self.test_args)
 
@@ -44,18 +46,18 @@ def run_tests(self):
 
         zip_safe=True,
         install_requires=['cached-property'],
-        tests_require=['pytest'],
+        tests_require=['pytest', 'jieba'],
         cmdclass={'test': PyTest},
 
         classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "Intended Audience :: Developers",
-        "License :: OSI Approved :: BSD License",
-        "Natural Language :: English",
-        "Operating System :: OS Independent",
-        "Programming Language :: Python :: 2.5",
-        "Programming Language :: Python :: 3",
-        "Topic :: Software Development :: Libraries :: Python Modules",
-        "Topic :: Text Processing :: Indexing",
+            "Development Status :: 5 - Production/Stable",
+            "Intended Audience :: Developers",
+            "License :: OSI Approved :: BSD License",
+            "Natural Language :: English",
+            "Operating System :: OS Independent",
+            "Programming Language :: Python :: 2.5",
+            "Programming Language :: Python :: 3",
+            "Topic :: Software Development :: Libraries :: Python Modules",
+            "Topic :: Text Processing :: Indexing",
         ],
     )
diff --git a/src/whoosh/highlight.py b/src/whoosh/highlight.py
@@ -131,8 +131,8 @@ def __init__(self, text, matches, startchar=0, endchar=-1):
                 self.matched_terms.add(t.text)
 
     def __repr__(self):
-        return "<Fragment %d:%d %d>" % (self.startchar, self.endchar,
-                                        len(self.matches))
+        return "<Fragment %d:%d has %d matches>" % (
+            self.startchar, self.endchar, len(self.matches))
 
     def __len__(self):
         return self.endchar - self.startchar
@@ -695,7 +695,12 @@ def format_fragment(self, fragment, replace=False):
         index = fragment.startchar
         text = fragment.text
 
-        for t in fragment.matches:
+        # Overlapping tokens (such as in Chinese): visit by start position,
+        # longest first, because the sequential formatter only highlights the
+        # first token it meets at a position. Tokens without a start offset
+        # are dropped first so the sort key never does arithmetic on None.
+        located = (m for m in fragment.matches if m.startchar is not None)
+        for t in sorted(located, key=lambda m: (m.startchar, -m.endchar)):
             if t.startchar is None:
                 continue
             if t.startchar < index:

diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py
@@ -3,6 +3,7 @@
 from __future__ import with_statement
 
 import pytest
+from jieba.analyse import ChineseAnalyzer
 
 from whoosh import analysis, highlight, fields, qparser, query
 from whoosh.compat import u
@@ -330,3 +331,24 @@ def test_whole_noterms():
 
         hi = r[0].highlights("text", minscore=0)
         assert hi == u("alfa bravo charlie delta echo foxtrot golf")
+
+
+def test_overlapping_tokens():
+    """Highlighting picks the longest of several overlapping tokens."""
+    needle = u'马克思'
+    haystack = u'两次历史性飞跃与马克思主义中国化'
+
+    # Picking a shorter overlapping token would instead give
+    # '两次历史性飞跃与<strong class="match term0">马克</strong>思主义中国化'
+    expected = u'两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化'
+    failure = u'The longest overlapping token 马克思 was not selected by the highlighter'
+
+    analyzer = ChineseAnalyzer()
+    html = highlight.HtmlFormatter()
+    terms = [tok.text for tok in analyzer(needle)]
+
+    output = highlight.highlight(
+        haystack, terms, analyzer, highlight.WholeFragmenter(), html
+    )
+
+    assert output == expected, failure