forked from Github/guess-filename.py
updated lib, working for me
This commit is contained in:
parent
bfe44f056e
commit
9ee670b686
1 changed file with 19 additions and 10 deletions
|
|
@@ -762,11 +762,11 @@ class GuessFilename(object):
|
|||
return False
|
||||
|
||||
try:
|
||||
pdffile = PyPDF2.PdfFileReader(open(filename, "rb"))
|
||||
pdffile = PyPDF2.PdfReader(open(filename, "rb"))
|
||||
|
||||
# if PDF is encrypted, try password stored in config file
|
||||
# or quit this function if decryption is not successful
|
||||
if pdffile.isEncrypted:
|
||||
if pdffile.is_encrypted:
|
||||
returncode = pdffile.decrypt(self.config.SALARY_PDF_PASSWORD)
|
||||
if returncode < 1:
|
||||
logging.error('PDF file is encrypted and could NOT be decrypted using ' +
|
||||
|
|
@@ -777,10 +777,10 @@ class GuessFilename(object):
|
|||
'config.SALARY_PDF_PASSWORD. Return code = ' + str(returncode))
|
||||
|
||||
# use first and second page of content only:
|
||||
if pdffile.getNumPages() > 1:
|
||||
content = pdffile.pages[0].extractText() + pdffile.pages[1].extractText()
|
||||
elif pdffile.getNumPages() == 1:
|
||||
content = pdffile.pages[0].extractText()
|
||||
if len(pdffile.pages) > 1:
|
||||
content = pdffile.pages[0].extract_text() + pdffile.pages[1].extract_text()
|
||||
elif len(pdffile.pages) == 1:
|
||||
content = pdffile.pages[0].extract_text()
|
||||
else:
|
||||
logging.error('Could not determine number of pages of PDF content! (skipping content analysis)')
|
||||
return False
|
||||
|
|
@@ -814,15 +814,24 @@ class GuessFilename(object):
|
|||
# should parse starting sequence of
|
||||
# "^.LOHN/GEHALTSABRECHNUNG JÄNNER 2018Klien..." and
|
||||
# return "Jaenner"
|
||||
month_of_salary = re.match(r'.LOHN.*/.*GEHALTSABRECHNUNG (.+) .+', content).group(1).capitalize().replace('ä', 'ae')
|
||||
month_of_salary = re.search(r'.*Semimonthly ?\d?\d-(\w\w\w)', content).group(1).capitalize()
|
||||
except:
|
||||
logging.error('derive_new_filename_from_content(' + filename + '): I recognized pattern ' +
|
||||
'for salary file but content format for extracting month must have changed.')
|
||||
month_of_salary = 'FIXXME'
|
||||
|
||||
if datetimestr is None:
|
||||
try:
|
||||
date = re.search(r'.*Semimonthly ?\d?\d-\w\w\w-\d\d\d\d ?(\d\d-\w\w\w-\d\d\d\d)', content).group(1)
|
||||
datetimestr = datetime.datetime.strptime(date, "%d-%b-%Y").strftime("%Y-%m-%d")
|
||||
except:
|
||||
logging.error('derive_new_filename_from_content(' + filename + '): I recognized pattern ' +
|
||||
'for salary file but content format for date/time must have changed.')
|
||||
|
||||
try:
|
||||
# should extract "2.345,67" from following sequence
|
||||
# ".+SZAbzüge1.234,56Netto2.345,67IBAN:.+"
|
||||
net_salary = re.match(r'.+Netto(\d\.\d{3},\d{2})IBAN.+', content).group(1)
|
||||
net_salary = re.search(r'.+Ally Bank CHECKING XXXXXX9933 ([0-9,]+)', content).group(1)
|
||||
except:
|
||||
logging.error('derive_new_filename_from_content(' + filename + '): I recognized pattern ' +
|
||||
'for salary file but content format for extracting net salary must have changed.')
|
||||
|
|
@@ -831,8 +840,8 @@ class GuessFilename(object):
|
|||
# hint when he wants to open the PDF in a PDF viewer
|
||||
print(' ' * 7 + colorama.Style.DIM + '→ PDF file password: ' + self.config.SALARY_PDF_PASSWORD +
|
||||
colorama.Style.RESET_ALL)
|
||||
return datetimestr + ' ' + self.config.SALARY_DESCRIPTION + ' ' + month_of_salary + ' - ' + \
|
||||
net_salary + '€ -- detego private.pdf'
|
||||
return datetimestr + ' ' + self.config.SALARY_DESCRIPTION + ' ' + month_of_salary + ' - $' + \
|
||||
net_salary + ' -- private.pdf'
|
||||
|
||||
# 2010-06-08 easybank - neue TAN-Liste -- scan private.pdf
|
||||
if self.fuzzy_contains_all_of(content, ["Transaktionsnummern (TANs)", "Ihre TAN-Liste in Verlust geraten"]) and \
|
||||
|
|
|
|||
Loading…
Reference in a new issue