Skip to content

Commit

Permalink
Merge pull request mchaput#17 from ZeroCool940711/main
Browse files Browse the repository at this point in the history
Highlight longest overlapping token
  • Loading branch information
ZeroCool940711 authored Jan 3, 2024
2 parents 0330b24 + f8beeeb commit 67bcb6c
Show file tree
Hide file tree
Showing 7 changed files with 124 additions and 16 deletions.
15 changes: 15 additions & 0 deletions .github/ISSUE_TEMPLATE/sweep-template.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
name: Sweep Issue
title: 'Sweep: '
description: For small bugs, features, refactors, and tests to be handled by Sweep, an AI-powered junior developer.
labels: sweep
body:
- type: textarea
id: description
attributes:
label: Details
description: Tell Sweep where and what to edit and provide enough context for a new developer to the codebase
placeholder: |
Unit Tests: Write unit tests for <FILE>. Test each function in the file. Make sure to test edge cases.
Bugs: The bug might be in <FILE>. Here are the logs: ...
Features: the new endpoint should use the ... class from <FILE> because it contains ... logic.
Refactors: We are migrating this function to ... version because ...
37 changes: 37 additions & 0 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
name: Python package

on:
push:
branches: [ master ]
pull_request:
branches: [ master ]

jobs:
build:

runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8, 3.9]

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pip install jieba
pytest
26 changes: 14 additions & 12 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!python

import os.path, sys
import os.path
import sys

from setuptools import setup, find_packages
from setuptools.command.test import test as TestCommand

Expand All @@ -20,7 +22,7 @@ def finalize_options(self):
self.test_suite = True

def run_tests(self):
#import here, cause outside the eggs aren't loaded
# import here, cause outside the eggs aren't loaded
import pytest
pytest.main(self.test_args)

Expand All @@ -44,18 +46,18 @@ def run_tests(self):

zip_safe=True,
install_requires=['cached-property'],
tests_require=['pytest'],
tests_require=['pytest', 'jieba'],
cmdclass={'test': PyTest},

classifiers=[
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"License :: OSI Approved :: BSD License",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python :: 2.5",
"Programming Language :: Python :: 3",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Text Processing :: Indexing",
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"License :: OSI Approved :: BSD License",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python :: 2.5",
"Programming Language :: Python :: 3",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Text Processing :: Indexing",
],
)
11 changes: 8 additions & 3 deletions src/whoosh/highlight.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,8 @@ def __init__(self, text, matches, startchar=0, endchar=-1):
self.matched_terms.add(t.text)

def __repr__(self):
return "<Fragment %d:%d %d>" % (self.startchar, self.endchar,
len(self.matches))
return "<Fragment %d:%d has %d matches>" % (self.startchar, self.endchar,
len(self.matches))

def __len__(self):
return self.endchar - self.startchar
Expand Down Expand Up @@ -695,7 +695,12 @@ def format_fragment(self, fragment, replace=False):
index = fragment.startchar
text = fragment.text

for t in fragment.matches:
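# For overlapping tokens (such as in Chinese), sort by start position,
# then by descending length.
# Because the formatter is sequential, it only highlights the first
# token it meets at a given position, so this ordering makes it pick
# the longest overlapping token.
# NOTE(review): the sort key does arithmetic on startchar/endchar before
# the "startchar is None" check in the loop body runs -- confirm tokens
# without character positions cannot reach this point.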
for t in sorted(fragment.matches, key=lambda token: (token.startchar, -(token.endchar - token.startchar))):
if t.startchar is None:
continue
if t.startchar < index:
Expand Down
27 changes: 27 additions & 0 deletions sweep.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Sweep AI turns bugs & feature requests into code changes (https://sweep.dev)
# For details on our config file, check out our docs at https://docs.sweep.dev/usage/config

# This setting contains a list of rules that Sweep will check for. If any of these rules are broken in a new commit, Sweep will create a pull request to fix the broken rule.
rules:
- "All new business logic should have corresponding unit tests."
- "Refactor large functions to be more modular."
- "Add docstrings to all functions and file headers."

# This is the branch that Sweep will develop from and make pull requests to. Most people use 'main' or 'master' but some users also use 'dev' or 'staging'.
branch: 'main'

# By default Sweep will read the logs and outputs from your existing GitHub Actions. To disable this, set this to false.
gha_enabled: True

# This is the description of your project. It will be used by Sweep when creating PRs. You can tell Sweep what's unique about your project, what frameworks you use, or anything else you want.
#
# Example:
#
# description: sweepai/sweep is a python project. The main api endpoints are in sweepai/api.py. Write code that adheres to PEP8.
description: ''

# This sets whether to create pull requests as drafts. If this is set to True, then all pull requests will be created as drafts and GitHub Actions will not be triggered.
draft: False

# This is a list of directories that Sweep will not be able to edit.
blocked_dirs: []
2 changes: 1 addition & 1 deletion tests/test_automata.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,7 @@ def test_strings_dfa():

domain = "abcd"
words = set()
for i in xrange(1, len(domain) + 1):
for i in range(1, len(domain) + 1): # Replace xrange with range since xrange is failing in Pypy3.9 and 3.10
words.update("".join(p) for p in permutations(domain[:i]))
words = sorted(words)
dfa = fsa.strings_dfa(words)
Expand Down
22 changes: 22 additions & 0 deletions tests/test_highlighting.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import with_statement

import pytest
from jieba.analyse import ChineseAnalyzer

from whoosh import analysis, highlight, fields, qparser, query
from whoosh.compat import u
Expand Down Expand Up @@ -330,3 +331,24 @@ def test_whole_noterms():

hi = r[0].highlights("text", minscore=0)
assert hi == u("alfa bravo charlie delta echo foxtrot golf")


def test_overlapping_tokens():
    """Check that the longest of several overlapping tokens is highlighted.

    The query is tokenized with ``ChineseAnalyzer`` and the resulting terms
    are highlighted, as a single whole-text fragment rendered to HTML, in a
    longer sentence that contains the query text.
    """
    chinese = ChineseAnalyzer()
    html = highlight.HtmlFormatter()

    needle = u'马克思'
    haystack = u'两次历史性飞跃与马克思主义中国化'

    # The full three-character term must be wrapped, as opposed to
    # '两次历史性飞跃与<strong class="match term0">马克</strong>思主义中国化'
    expected = u'两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化'

    # Collect the text of every token the analyzer produces for the query.
    wanted = []
    for tok in chinese(needle):
        wanted.append(tok.text)

    whole = highlight.WholeFragmenter()
    highlighted = highlight.highlight(haystack, wanted, chinese, whole, html)

    assert highlighted == expected, \
        u'The longest overlapping token 马克思 was not selected by the highlighter'

0 comments on commit 67bcb6c

Please sign in to comment.