Skip to content

Commit

Permalink
Merge branch '532'
Browse files Browse the repository at this point in the history
  • Loading branch information
ZeroCool940711 committed Jan 3, 2024
2 parents 0330b24 + cbb9f77 commit 0b0e86f
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 15 deletions.
26 changes: 14 additions & 12 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!python

import os.path, sys
import os.path
import sys

from setuptools import setup, find_packages
from setuptools.command.test import test as TestCommand

Expand All @@ -20,7 +22,7 @@ def finalize_options(self):
self.test_suite = True

def run_tests(self):
    """Run the test suite with pytest and exit with pytest's status code.

    The arguments handed to pytest are taken from ``self.test_args``.

    Raises:
        SystemExit: always; the exit code is whatever ``pytest.main``
            returned, so a failing test run produces a non-zero process
            status instead of being silently reported as success.
    """
    # Import here rather than at module level, because outside of this
    # command the eggs aren't loaded.
    import pytest

    # pytest.main() returns its exit status instead of raising, so it has
    # to be forwarded explicitly or the result of the run is lost.
    sys.exit(pytest.main(self.test_args))

Expand All @@ -44,18 +46,18 @@ def run_tests(self):

zip_safe=True,
install_requires=['cached-property'],
tests_require=['pytest'],
tests_require=['pytest', 'jieba'],
cmdclass={'test': PyTest},

classifiers=[
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"License :: OSI Approved :: BSD License",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python :: 2.5",
"Programming Language :: Python :: 3",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Text Processing :: Indexing",
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"License :: OSI Approved :: BSD License",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python :: 2.5",
"Programming Language :: Python :: 3",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Text Processing :: Indexing",
],
)
11 changes: 8 additions & 3 deletions src/whoosh/highlight.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,8 @@ def __init__(self, text, matches, startchar=0, endchar=-1):
self.matched_terms.add(t.text)

def __repr__(self):
# Debugging representation: the fragment's start and end character offsets
# followed by the number of matches it holds.
# NOTE(review): two return statements appear back to back here; as written
# only the first one can execute. This looks like the before/after pair of
# a diff -- confirm which format string is the intended one.
return "<Fragment %d:%d %d>" % (self.startchar, self.endchar,
len(self.matches))
return "<Fragment %d:%d has %d matches>" % (self.startchar, self.endchar,
len(self.matches))

def __len__(self):
    """Return the size of the fragment's span: ``endchar - startchar``."""
    span_end = self.endchar
    span_start = self.startchar
    return span_end - span_start
Expand Down Expand Up @@ -695,7 +695,12 @@ def format_fragment(self, fragment, replace=False):
index = fragment.startchar
text = fragment.text

for t in fragment.matches:
# For overlapping tokens (such as in Chinese), sort by position,
# then by inverse of length.
# Because the formatter is sequential, it will only pick the first
# token for a given position to highlight. This makes sure it picks
# the longest overlapping token.
for t in sorted(fragment.matches, key=lambda token: (token.startchar, -(token.endchar - token.startchar))):
if t.startchar is None:
continue
if t.startchar < index:
Expand Down
22 changes: 22 additions & 0 deletions tests/test_highlighting.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import with_statement

import pytest
from jieba.analyse import ChineseAnalyzer

from whoosh import analysis, highlight, fields, qparser, query
from whoosh.compat import u
Expand Down Expand Up @@ -330,3 +331,24 @@ def test_whole_noterms():

hi = r[0].highlights("text", minscore=0)
assert hi == u("alfa bravo charlie delta echo foxtrot golf")


def test_overlapping_tokens():
    """Check that the longest of several overlapping tokens is highlighted.

    The query is analyzed with ``ChineseAnalyzer`` to obtain the terms, the
    text is highlighted as one whole fragment with the HTML formatter, and
    the output must wrap the full three-character term rather than only its
    two-character prefix.
    """
    needle = u'马克思'
    haystack = u'两次历史性飞跃与马克思主义中国化'
    expected = u'两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化'
    failure_note = u'The longest overlapping token 马克思 was not selected by the highlighter'

    chinese = ChineseAnalyzer()
    html = highlight.HtmlFormatter()

    # Collect the text of every token the analyzer produces for the query.
    query_terms = []
    for tok in chinese(needle):
        query_terms.append(tok.text)

    highlighted = highlight.highlight(
        haystack, query_terms, chinese, highlight.WholeFragmenter(), html
    )

    # The undesired result would instead be
    # '两次历史性飞跃与<strong class="match term0">马克</strong>思主义中国化'
    assert highlighted == expected, failure_note

0 comments on commit 0b0e86f

Please sign in to comment.