From 42c04230fb53d563a85c79e32f6df20516f1864a Mon Sep 17 00:00:00 2001 From: Karl Voit Date: Tue, 16 May 2023 16:32:21 +0200 Subject: [PATCH] PyPDF2 -> pypdf + new salary file format --- guessfilename/__init__.py | 93 ++++++++++++++------------------------- 1 file changed, 32 insertions(+), 61 deletions(-) diff --git a/guessfilename/__init__.py b/guessfilename/__init__.py index 0acfe73..4d685e7 100755 --- a/guessfilename/__init__.py +++ b/guessfilename/__init__.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -PROG_VERSION = u"Time-stamp: <2023-03-15 10:52:13 vk>" +PROG_VERSION = u"Time-stamp: <2023-05-16 16:25:50 vk>" # TODO: @@ -695,15 +695,6 @@ class GuessFilename(object): if regex_match: return self.get_datetime_description_extension_filename(regex_match, replace_description_underscores=True) - # 2019-05-24: this is a workaround until PDF file decryption in PyPDF2 is fixed for parsing the content id:2019-05-24-guessfilename-salary - if extension.upper() == "PDF" and self.config.SALARY_STARTSTRING in oldfilename and datetimestr: - # print out password to stdout in order to give the user a - # hint when he wants to open the PDF in a PDF viewer - print(' ' * 7 + colorama.Style.DIM + '→ PDF file password: ' + self.config.SALARY_PDF_PASSWORD + - colorama.Style.RESET_ALL) - return datetimestr + ' ' + self.config.SALARY_DESCRIPTION + ' MONTH - SALARY' + \ - '€ -- detego private.pdf' - # 2019-10-10: '2019-10-10 a file exported by Boox Max 2-Exported.pdf' or # '2019-10-10 a file exported by Boox Max 2 -- notes-Exported.pdf' become # -> '2019-10-10 a file exported by Boox Max 2 -- notes.pdf' @@ -802,32 +793,34 @@ class GuessFilename(object): logging.debug("File is not a PDF file and thus can't be parsed by this script: %s" % filename) return False - try: - pdffile = PyPDF2.PdfFileReader(open(filename, "rb")) +# try: + pdffile = pypdf.PdfReader(open(filename, "rb")) + #pdffile = PyPDF2.PdfFileReader(open(filename, "rb")) - # if PDF is encryped, try password stored in config file - # or quit this function if decryption is not successful - if pdffile.isEncrypted: - returncode = pdffile.decrypt(self.config.SALARY_PDF_PASSWORD) - if returncode < 1: - logging.error('PDF file is encrypted and could NOT be decrypted using ' + - 'config.SALARY_PDF_PASSWORD. Skipping content analysis.') - return False - else: - logging.debug('PDF file is encrypted and could be decrypted using ' + - 'config.SALARY_PDF_PASSWORD. Return code = ' + str(returncode)) + if pdffile.is_encrypted: + logging.debug("derive_new_filename_from_content: if PDF is encryped, try password stored in config file or quit this function if decryption is not successful") + returncode = pdffile.decrypt(self.config.DEFAULT_PDF_PASSWORD) + if returncode < 1: + logging.error('PDF file is encrypted and could NOT be decrypted using ' + + 'config.DEFAULT_PDF_PASSWORD. Skipping content analysis.') + return False + else: + logging.debug('PDF file is encrypted and could be decrypted using ' + + 'config.DEFAULT_PDF_PASSWORD. Return code = ' + str(returncode)) + else: + logging.debug("derive_new_filename_from_content: PDF is not encryped") # use first and second page of content only: - if pdffile.getNumPages() > 1: - content = pdffile.pages[0].extractText() + pdffile.pages[1].extractText() - elif pdffile.getNumPages() == 1: - content = pdffile.pages[0].extractText() + if len(pdffile.pages) > 1: + content = pdffile.pages[0].extract_text() + pdffile.pages[1].extract_text() + elif len(pdffile.pages) == 1: + content = pdffile.pages[0].extract_text() else: logging.error('Could not determine number of pages of PDF content! (skipping content analysis)') return False - except: - logging.error('Could not read PDF file content. Skipping its content.') - return False +# except: +# logging.error('Could not read PDF file content. Skipping its content.') +# return False if len(content) == 0: logging.info('Could read PDF file content but it is empty (skipping content analysis)') @@ -837,43 +830,21 @@ class GuessFilename(object): # structure of the author's salary processing software. # Therefore, this most likely does not work for your salary # PDF file. - if extension.upper() == "PDF" and self.config.SALARY_STARTSTRING in filename: - #content = content.replace('\n', '') # there is a '\n' after each character - # 2019-05-24: new file format for salary PDF can not be parsed by PyPDF2: id:2019-05-24-guessfilename-salary - ## File "/home/vk/bin/guessfilename", line 1055, in derive_new_filename_from_content - ## returncode = pdffile.decrypt(self.config.SALARY_PDF_PASSWORD) - ## File "/usr/lib/python3/dist-packages/PyPDF2/pdf.py", line 1987, in decrypt - ## return self._decrypt(password) - ## File "/usr/lib/python3/dist-packages/PyPDF2/pdf.py", line 1996, in _decrypt - ## raise NotImplementedError("only algorithm code 1 and 2 are supported") - ## NotImplementedError: only algorithm code 1 and 2 are supported - ## - ## producer of PDF file: "wPDF4 by WPCubed GmbH" "PDF v. 1.7" - ## might relate to: https://github.com/mstamy2/PyPDF2/issues/378 + if extension == "PDF" and \ + self.config.SALARY_IDSTRING in filename: + #logging.debug('PARSING SALARY FILE ...') + content = content.replace('\n', '•') # to simplify regex match below try: - # should parse starting sequence of - # "^.LOHN/GEHALTSABRECHNUNG JÄNNER 2018Klien..." and - # return "Jaenner" - month_of_salary = re.match(r'.LOHN.*/.*GEHALTSABRECHNUNG (.+) .+', content).group(1).capitalize().replace('ä', 'ae') - except: - logging.error('derive_new_filename_from_content(' + filename + '): I recognized pattern ' + - 'for salary file but content format for extracting month must have changed.') - month_of_salary = 'FIXXME' - try: - # should extract "2.345,67" from following sequence - # ".+SZAbzüge1.234,56Netto2.345,67IBAN:.+" - net_salary = re.match(r'.+Netto(\d\.\d{3},\d{2})IBAN.+', content).group(1) + net_salary = re.match(r'.+•Netto (?P\d\.\d{3},\d{2})•=+.+', content).group('salary') + logging.debug('found salary: ' + str(net_salary)) except: logging.error('derive_new_filename_from_content(' + filename + '): I recognized pattern ' + 'for salary file but content format for extracting net salary must have changed.') net_salary = 'FIXXME' - # print out password to stdout in order to give the user a - # hint when he wants to open the PDF in a PDF viewer - print(' ' * 7 + colorama.Style.DIM + '→ PDF file password: ' + self.config.SALARY_PDF_PASSWORD + - colorama.Style.RESET_ALL) - return datetimestr + ' ' + self.config.SALARY_DESCRIPTION + ' ' + month_of_salary + ' - ' + \ - net_salary + '€ -- detego private.pdf' + + return filename[:-4] + ' - ' + \ + net_salary + '€ -- ' + self.config.SALARY_COMPANY_NAME + ' private.pdf' # 2010-06-08 easybank - neue TAN-Liste -- scan private.pdf if self.fuzzy_contains_all_of(content, ["Transaktionsnummern (TANs)", "Ihre TAN-Liste in Verlust geraten"]) and \