fuzzy_contains_all_of

This commit is contained in:
Karl Voit 2016-03-07 13:52:47 +01:00
parent d75416db96
commit cc615cff3a
2 changed files with 47 additions and 2 deletions

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Time-stamp: <2016-03-06 19:41:42 vk>
# Time-stamp: <2016-03-07 13:47:31 vk>
# TODO:
# * fix parts marked with «FIXXME»
@ -206,6 +206,27 @@ class GuessFilename(object):
return False
def fuzzy_contains_all_of(self, string, entries):
    """
    Returns true, if every string within the entries list has a similar
    (fuzzy) match within the given string.

    Each entry is scored against the string with fuzz.partial_ratio(); an
    entry counts as contained when its score is greater than 64. Scoring
    stops at the first entry that does not exceed that score.

    string  -- non-empty text (str or unicode) that is searched
    entries -- non-empty list of strings which all have to be found

    Raises AssertionError (unless assertions are disabled) when an argument
    has the wrong type or is empty.
    """

    # isinstance() instead of an exact type() comparison, so that subclasses
    # of the string and list types are accepted as well
    assert isinstance(string, (unicode, str))
    assert isinstance(entries, list)
    assert len(string) > 0
    assert len(entries) > 0

    # 64 is the hard-coded similarity cut-off
    # NOTE(review): presumably partial_ratio scores range from 0 to 100
    # (fuzzywuzzy) -- confirm against the import at the top of the file
    return all(fuzz.partial_ratio(string, entry) > 64 for entry in entries)
def has_euro_charge(self, string):
"""
Returns true, if the string contains a number with a -currency
@ -331,6 +352,16 @@ class GuessFilename(object):
' '.join(self.adding_tags(tags, ['scan', 'finance', 'private'])) + \
u".pdf"
# <datetimestr> easybank - neue TAN-Liste -- scan finance private.pdf
if self.fuzzy_contains_one_of(content, ["4294-0208"]) and \
self.fuzzy_contains_one_of(content, ["AT086000000007042401"]) and \
self.fuzzy_contains_one_of(content, ["Kontonachricht"]) and \
datetimestr:
return datetimestr + \
u" easybank - neue TAN-Liste -- " + \
' '.join(self.adding_tags(tags, ['scan', 'finance', 'private'])) + \
u".pdf"
# FIXXME: more file documents
import pdb; pdb.set_trace()

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8; mode: python; -*-
# Time-stamp: <2016-03-06 18:58:52 vk>
# Time-stamp: <2016-03-07 13:52:10 vk>
import unittest
import logging
@ -117,6 +117,20 @@ class TestGuessFilename(unittest.TestCase):
self.assertFalse(self.guess_filename.contains_one_of(u"foo bar baz", [u'xba']))
self.assertFalse(self.guess_filename.contains_one_of(u"foo bar baz", [u'x', u'xba', u'yuio']))
def test_fuzzy_contains_all_of(self):
    """
    Runs fuzzy_contains_all_of() on the text u"foo bar baz" with several
    entry lists and checks each result against the expected boolean.
    """

    haystack = u"foo bar baz"

    # (entries, expected result) -- checked in exactly this order
    cases = [
        (['foo'], True),
        ([u'foo'], True),
        ([u'bar'], True),
        ([u'ba'], True),
        ([u'foo', u"bar", u"baz"], True),
        ([u'x', u'ba', u'yuio'], False),
        (['xfoo'], True),
        ([u'xfoo'], True),
        ([u'xbar'], True),
        ([u'xba', u"12345"], False),
        ([u'x', u'xba', u'yuio'], False),
    ]

    for needles, expected in cases:
        # pick the assertion that corresponds to the expected outcome
        verify = self.assertTrue if expected else self.assertFalse
        verify(self.guess_filename.fuzzy_contains_all_of(haystack, needles))
def test_fuzzy_contains_one_of(self):
# comparing exact strings: