From 73c0c2f9e0cdce20938bbaa110d34c1d7d7a0b19 Mon Sep 17 00:00:00 2001 From: Karl Voit Date: Sun, 6 Mar 2016 12:28:55 +0100 Subject: [PATCH] fuzzy_contains_one_of --- .gitignore | 1 + guessfilename.py | 23 +++++++++++++++++++++-- guessfilename_test.py | 37 ++++++++++++++++++++++++++++++++++--- 3 files changed, 56 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 4d1c7c9..a76423f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.pyc *.pdf /flycheck_guessfilename.py +/flycheck_guessfilename_test.py diff --git a/guessfilename.py b/guessfilename.py index a617d72..912bcb7 100755 --- a/guessfilename.py +++ b/guessfilename.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Time-stamp: <2016-03-05 23:43:01 vk> +# Time-stamp: <2016-03-06 12:12:53 vk> ## TODO: ## * fix parts marked with «FIXXME» @@ -18,6 +18,7 @@ import os.path import time import logging from optparse import OptionParser +from fuzzywuzzy import fuzz # for fuzzy comparison of strings PROG_VERSION_NUMBER = u"0.1" PROG_VERSION_DATE = u"2016-03-04" @@ -199,7 +200,7 @@ class GuessFilename(object): return self.oldfilename = oldfilename - + def split_filename_entities(self, filename): """ @@ -241,6 +242,24 @@ class GuessFilename(object): return False + def fuzzy_contains_one_of(self, string, entries): + """ + Returns true, if the string contains a similar one of the strings within entries array + """ + + assert(type(string) == unicode or type(string) == str) + assert(type(entries) == list) + assert(len(string)>0) + assert(len(entries)>0) + + for entry in entries: + similarity = fuzz.partial_ratio(string, entry) + if similarity > 65: + logging.debug("fuzzy_contains_one_of(%s, %s) == %i" % (string, str(entries), similarity)) + return True + + return False + def has_euro_charge(self, string): """ Returns true, if the string contains a number with a €-currency diff --git a/guessfilename_test.py b/guessfilename_test.py index 0c55ebe..5a4e761 100644 --- a/guessfilename_test.py +++ b/guessfilename_test.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8; mode: python; -*- -# Time-stamp: <2016-03-05 23:38:58 vk> +# Time-stamp: <2016-03-06 12:28:12 vk> import unittest from guessfilename import GuessFilename @@ -8,10 +8,13 @@ from guessfilename import GuessFilename class TestGuessFilename(unittest.TestCase): logging = None - guess_filename = GuessFilename() + guess_filename = None def setUp(self): - pass + verbose = True + quiet = False + self.guess_filename = GuessFilename() + self.guess_filename.verbose = verbose def tearDown(self): pass @@ -40,6 +43,34 @@ class TestGuessFilename(unittest.TestCase): self.assertFalse(self.guess_filename.contains_one_of(u"foo bar baz", [u'xba'])) self.assertFalse(self.guess_filename.contains_one_of(u"foo bar baz", [u'x', u'xba', u'yuio'])) + def test_fuzzy_contains_one_of(self): + + ## comparing exact strings: + self.assertTrue(self.guess_filename.fuzzy_contains_one_of(u"foo bar baz", ['foo'])) + self.assertTrue(self.guess_filename.fuzzy_contains_one_of(u"foo bar baz", [u'foo'])) + self.assertTrue(self.guess_filename.fuzzy_contains_one_of(u"foo bar baz", [u'bar'])) + self.assertTrue(self.guess_filename.fuzzy_contains_one_of(u"foo bar baz", [u'ba'])) + self.assertTrue(self.guess_filename.fuzzy_contains_one_of(u"foo bar baz", [u'x', u'ba', u'yuio'])) + self.assertTrue(self.guess_filename.fuzzy_contains_one_of(u"Kundennummer 1234567890", [u'12345'])) + + ## fuzzy similarities: + self.assertTrue(self.guess_filename.fuzzy_contains_one_of(u"foo bar baz", ['xfoo'])) + self.assertTrue(self.guess_filename.fuzzy_contains_one_of(u"foo bar baz", [u'xfoo'])) + self.assertTrue(self.guess_filename.fuzzy_contains_one_of(u"foo bar baz", [u'xbar'])) + self.assertTrue(self.guess_filename.fuzzy_contains_one_of(u"foo bar baz", [u'xba'])) + self.assertTrue(self.guess_filename.fuzzy_contains_one_of(u"foo bar baz", [u'x', u'xba', u'yuio'])) + #self.assertTrue(self.guess_filename.fuzzy_contains_one_of(u"Kundennummer 1234567890", [u'1234581388'])) + #self.assertTrue(self.guess_filename.fuzzy_contains_one_of(u"Rundemummer 1234567890", [u'Rundemummer 1234581388'])) + #self.assertTrue(self.guess_filename.fuzzy_contains_one_of(u"Rundemummer 1234567890", [u'Rumdemummer 1234581388'])) + + ## fuzzy non-matches: + self.assertFalse(self.guess_filename.fuzzy_contains_one_of(u"foo bar baz", [u'xyz'])) + self.assertFalse(self.guess_filename.fuzzy_contains_one_of(u"foo bar baz", [u'111'])) + self.assertFalse(self.guess_filename.fuzzy_contains_one_of(u"foo bar baz", [u'xby'])) + self.assertFalse(self.guess_filename.fuzzy_contains_one_of(u"foo bar baz", [u'x', u'yyy', u'yuio'])) + #self.assertFalse(self.guess_filename.fuzzy_contains_one_of(u"Kundennummer 1234567890", [u'12345', u' 345 ', u'0987654321'])) + #self.assertFalse(self.guess_filename.fuzzy_contains_one_of(u"Kundennummer 1234567890", [u'12345'])) + def test_has_euro_charge(self): self.assertTrue(self.guess_filename.has_euro_charge(u"12,34EUR"))