Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

replace python-Levenshtein with rapidfuzz #10

Merged
merged 10 commits into from
Aug 18, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 2 additions & 10 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@ Requirements
============

- Python 3.7 or higher
- difflib
- `python-Levenshtein <https://github.com/ztane/python-Levenshtein/>`_ (optional, provides a 4-10x speedup in String
Matching, though may result in `differing results for certain cases <https://github.com/seatgeek/fuzzywuzzy/issues/128>`_)
- `rapidfuzz <https://github.com/maxbachmann/RapidFuzz/>`_

For testing
~~~~~~~~~~~
Expand All @@ -29,12 +27,6 @@ Using PIP via PyPI

pip install thefuzz

or the following to install `python-Levenshtein` too

.. code:: bash

pip install thefuzz[speedup]


Using PIP via Github

Expand Down Expand Up @@ -110,7 +102,7 @@ Partial Token Sort Ratio
84
>>> fuzz.partial_token_sort_ratio("fuzzy was a bear", "wuzzy fuzzy was a bear")
100

Process
~~~~~~~

Expand Down
23 changes: 7 additions & 16 deletions benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
]

common_setup = "from thefuzz import fuzz, utils; "
basic_setup = "from thefuzz.string_processing import StringProcessor;"


def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
Expand All @@ -55,48 +54,42 @@ def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
duration, avg_duration * (1000 ** -thousands), units[-thousands]))


for s in choices:
print('Test validate_string for: "%s"' % s)
print_result_from_timeit('utils.validate_string(\'%s\')' % s, common_setup, number=iterations)

print('')

for s in mixed_strings + cirque_strings + choices:
print('Test full_process for: "%s"' % s)
print_result_from_timeit('utils.full_process(u\'%s\')' % s,
common_setup + basic_setup, number=iterations)
common_setup, number=iterations)

# benchmarking the core matching methods...

for s in cirque_strings:
print('Test fuzz.ratio for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('fuzz.ratio(u\'cirque du soleil\', u\'%s\')' % s,
common_setup + basic_setup, number=iterations / 100)
common_setup, number=iterations / 100)

for s in cirque_strings:
print('Test fuzz.partial_ratio for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('fuzz.partial_ratio(u\'cirque du soleil\', u\'%s\')'
% s, common_setup + basic_setup, number=iterations / 100)
% s, common_setup, number=iterations / 100)

for s in cirque_strings:
print('Test fuzz.WRatio for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('fuzz.WRatio(u\'cirque du soleil\', u\'%s\')' % s,
common_setup + basic_setup, number=iterations / 100)
common_setup, number=iterations / 100)

print('Test process.extract(scorer = fuzz.QRatio) for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('process.extract(u\'cirque du soleil\', choices, scorer = fuzz.QRatio)',
common_setup + basic_setup + " from thefuzz import process; import string,random; random.seed(18);"
common_setup + " from thefuzz import process; import string,random; random.seed(18);"
" choices = [\'\'.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(30)) for s in range(5000)]",
number=10)

print('Test process.extract(scorer = fuzz.WRatio) for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('process.extract(u\'cirque du soleil\', choices, scorer = fuzz.WRatio)',
common_setup + basic_setup + " from thefuzz import process; import string,random; random.seed(18);"
common_setup + " from thefuzz import process; import string,random; random.seed(18);"
" choices = [\'\'.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(30)) for s in range(5000)]",
number=10)

Expand All @@ -114,6 +107,4 @@ def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
print('-------------------------------')
test += 'prepared_ratio = functools.partial(fuzz.ratio, "%s")\n' % s
test += 'titles.sort(key=prepared_ratio)\n'
print_result_from_timeit(test,
common_setup + basic_setup,
number=100)
print_result_from_timeit(test, common_setup, number=100)
5 changes: 4 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ def open_file(fname):
author='Adam Cohen',
author_email='[email protected]',
packages=['thefuzz'],
extras_require={'speedup': ['python-levenshtein>=0.12']},
# keep for backwards compatibility of projects depending on `thefuzz[speedup]`
extras_require={'speedup': []},
install_requires= ['rapidfuzz>=2.1.2'],
url='https://github.com/seatgeek/thefuzz',
license="GPLv2",
classifiers=[
Expand All @@ -41,4 +43,5 @@ def open_file(fname):
description='Fuzzy string matching in python',
long_description=open_file('README.rst').read(),
zip_safe=True,
python_requires='>=3.7'
)
82 changes: 42 additions & 40 deletions test_thefuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,40 @@
from thefuzz import fuzz
from thefuzz import process
from thefuzz import utils
from thefuzz.string_processing import StringProcessor

scorers = [
fuzz.ratio,
fuzz.partial_ratio,
fuzz.token_sort_ratio,
fuzz.token_set_ratio,
fuzz.partial_token_sort_ratio,
fuzz.partial_token_set_ratio,
fuzz.QRatio,
fuzz.UQRatio,
fuzz.WRatio,
fuzz.UWRatio,
]

class StringProcessingTest(unittest.TestCase):
def test_replace_non_letters_non_numbers_with_whitespace(self):
strings = ["new york mets - atlanta braves", "Cães danados",
"New York //// Mets $$$", "Ça va?"]
for string in strings:
proc_string = StringProcessor.replace_non_letters_non_numbers_with_whitespace(string)
proc_string = utils.full_process(string)
regex = re.compile(r"(?ui)[\W]")
for expr in regex.finditer(proc_string):
self.assertEqual(expr.group(), " ")

def test_dont_condense_whitespace(self):
s1 = "new york mets - atlanta braves"
s2 = "new york mets atlanta braves"
p1 = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s1)
p2 = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s2)
self.assertNotEqual(p1, p2)
s3 = "new york mets atlanta braves"
p1 = utils.full_process(s1)
p2 = utils.full_process(s2)
p3 = utils.full_process(s3)
self.assertEqual(p1, s3)
self.assertEqual(p2, s2)
self.assertEqual(p3, s3)


class UtilsTest(unittest.TestCase):
Expand Down Expand Up @@ -120,7 +135,8 @@ def testPartialTokenSortRatio(self):
self.assertEqual(fuzz.partial_token_sort_ratio(self.s8, self.s8a, full_process=False), 100)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s9, self.s9a, full_process=True), 100)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s9, self.s9a, full_process=False), 100)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s10, self.s10a, full_process=False), 50)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s10, self.s10a, full_process=False), 67)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s10a, self.s10, full_process=False), 67)
Comment on lines +138 to +139
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

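The original implementation produced different results for these two tests (50 vs. 67), which compare the same pair of strings with the arguments swapped, because it only considered alignments that extend past the end of the string, not ones that begin before its start. This is fixed here, so both argument orders now return 67.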


def testTokenSetRatio(self):
self.assertEqual(fuzz.token_set_ratio(self.s4, self.s5), 100)
Expand Down Expand Up @@ -243,58 +259,44 @@ def testQRatioForceAscii(self):
score = fuzz.WRatio(s1, s2, force_ascii=False)
self.assertLess(score, 100)

def testTokenSetForceAscii(self):
def testPartialTokenSetRatioForceAscii(self):
s1 = "ABCD\u00C1 HELP\u00C1"
s2 = "ABCD HELP"

score = fuzz._token_set(s1, s2, force_ascii=True)
score = fuzz.partial_token_set_ratio(s1, s2, force_ascii=True)
self.assertEqual(score, 100)

score = fuzz._token_set(s1, s2, force_ascii=False)
score = fuzz.partial_token_set_ratio(s1, s2, force_ascii=False)
self.assertLess(score, 100)

def testTokenSortForceAscii(self):
def testPartialTokenSortRatioForceAscii(self):
s1 = "ABCD\u00C1 HELP\u00C1"
s2 = "ABCD HELP"

score = fuzz._token_sort(s1, s2, force_ascii=True)
score = fuzz.partial_token_sort_ratio(s1, s2, force_ascii=True)
self.assertEqual(score, 100)

score = fuzz._token_sort(s1, s2, force_ascii=False)
score = fuzz.partial_token_sort_ratio(s1, s2, force_ascii=False)
self.assertLess(score, 100)


class ValidatorTest(unittest.TestCase):
def setUp(self):
self.testFunc = lambda *args, **kwargs: (args, kwargs)

def testCheckForNone(self):
invalid_input = [
(None, None),
('Some', None),
(None, 'Some')
]
decorated_func = utils.check_for_none(self.testFunc)
for i in invalid_input:
self.assertEqual(decorated_func(*i), 0)
for scorer in scorers:
self.assertEqual(scorer(None, None), 0)
self.assertEqual(scorer('Some', None), 0)
self.assertEqual(scorer(None, 'Some'), 0)

valid_input = ('Some', 'Some')
actual = decorated_func(*valid_input)
self.assertNotEqual(actual, 0)
self.assertNotEqual(scorer('Some', 'Some'), 0)

def testCheckEmptyString(self):
invalid_input = [
('', ''),
('Some', ''),
('', 'Some')
]
decorated_func = utils.check_empty_string(self.testFunc)
for i in invalid_input:
self.assertEqual(decorated_func(*i), 0)

valid_input = ('Some', 'Some')
actual = decorated_func(*valid_input)
self.assertNotEqual(actual, 0)
for scorer in scorers:
if scorer in {fuzz.token_set_ratio, fuzz.partial_token_set_ratio, fuzz.WRatio, fuzz.UWRatio}:
self.assertEqual(scorer('', ''), 0)
else:
self.assertEqual(scorer('', ''), 100)

self.assertEqual(scorer('Some', ''), 0)
self.assertEqual(scorer('', 'Some'), 0)
self.assertNotEqual(scorer('Some', 'Some'), 0)


class ProcessTest(unittest.TestCase):
Expand Down
4 changes: 2 additions & 2 deletions test_thefuzz_hypothesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from functools import partial
from string import ascii_letters, digits, punctuation

from hypothesis import given, assume, settings
from hypothesis import given, assume, settings, HealthCheck
import hypothesis.strategies as st
import pytest

Expand Down Expand Up @@ -62,7 +62,7 @@ def full_scorers_processors():
@pytest.mark.parametrize('scorer,processor',
scorers_processors())
@given(data=st.data())
@settings(max_examples=20, deadline=5000)
@settings(max_examples=20, deadline=5000, suppress_health_check=[HealthCheck.data_too_large])
def test_identical_strings_extracted(scorer, processor, data):
"""
Test that identical strings will always return a perfect match.
Expand Down
79 changes: 0 additions & 79 deletions thefuzz/StringMatcher.py

This file was deleted.

26 changes: 0 additions & 26 deletions thefuzz/StringMatcher.pyi

This file was deleted.

Loading