Compare commits

..

1 commit

Author SHA1 Message Date
Benson Chu
9ee670b686 updated lib, working for me 2024-01-01 21:54:52 -06:00
2 changed files with 44 additions and 99 deletions

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
PROG_VERSION = u"Time-stamp: <2023-03-15 10:52:13 vk>"
PROG_VERSION = u"Time-stamp: <2021-08-27 15:10:05 vk>"
# TODO:
@ -31,9 +31,9 @@ except ImportError:
sys.exit(1)
try:
import pypdf
import PyPDF2
except ImportError:
print("Could not find Python module \"pypdf\".\nPlease install it, e.g., with \"sudo pip install pypdf\".")
print("Could not find Python module \"PyPDF2\".\nPlease install it, e.g., with \"sudo pip install PyPDF2\".")
sys.exit(1)
PROG_VERSION_DATE = PROG_VERSION[13:23]
@ -298,9 +298,6 @@ class GuessFilename(object):
# 20200224-0914_Foo_bar.wav
SMARTREC_REGEX = re.compile(r'(?P<DAY>' + DATESTAMP_REGEX + ')-' + TIMESTAMP_REGEX + r'(_(?P<description>.+))?.(?P<extension>wav|mp3)')
# KVR-2022-08-09-14-00-16.txt -> 2022-08-09T14.00.16.mp4
KVR_REGEX = re.compile(r'KVR-' + DATESTAMP_REGEX + '-' + TIMESTAMP_REGEX + r'(?P<description>.+?)?\.(?P<extension>wav|mp3|mp4|txt)')
logger = None
config = None
@ -309,20 +306,6 @@ class GuessFilename(object):
self.logger = logger
self.config = config
def get_unique_show_and_title(self, show, title):
    """
    Combine show name and title while avoiding a duplicated prefix.

    When one of the two strings begins with the other one, only the
    longer string is returned (for identical strings, the title).
    Otherwise both are joined as "<show> - <title>".
    """

    # show begins with the (strictly shorter) title, so show alone is enough:
    if show.startswith(title) and len(title) < len(show):
        logging.debug('get_unique_show_and_title: reduced show/title to show')
        return show

    # title begins with the show name (this also covers show == title):
    if title.startswith(show) and len(title) >= len(show):
        logging.debug('get_unique_show_and_title: reduced show/title to title')
        return title

    # neither string is a prefix of the other one, so keep both parts:
    return ' - '.join((show, title))
def derive_new_filename_from_old_filename(self, oldfilename):
"""
Analyses the old filename and returns a new one if feasible.
@ -386,29 +369,14 @@ class GuessFilename(object):
if regex_match.group('sexpression'):
# the file name contained the optional chunk time-stamp(s)
## Extra handling of this case:
## 20230303T232946 ORF - Gute Nacht Österreich mit Peter Klien - Wirtschaftliche Probleme in Großbritannien -ORIGINALlow- 2023-03-03_2329_tl_01_Gute-Nacht-Oest_Wirtschaftliche__14170146__o__3365936366__s15349885_5__ORF1HD_00005621P_00105414P_Q4A.mp4
## 2023-03-04T00.00.56 ORF - Gute Nacht Österreich mit Peter Klien - Wirtschaftliche Probleme in Großbritannien -- lowquality.mp4
## ... the day should be incremented because this did start shortly before midnight but this part was started after midnight
## -> When the actual start time (2nd timestamp in filename) is older than 10 hours compared to the file name start time, assume it is actually started after midnight.
## exception: first time-stamp is "00:00:00" which stands for "unknown".
if (regex_match.group('hour') != '00' and regex_match.group('minute') != '00') and \
int(regex_match.group('hour')) > int(regex_match.group('hour2')) and \
int(regex_match.group('hour')) > int(regex_match.group('hour2')) + 10:
logging.debug('Correcting day of MediathekView file: file started after midnight, so I increment the day here.')
new_datestamp = self.get_incremented_date_string_from_named_groups(regex_match)
else:
new_datestamp = self.get_date_string_from_named_groups(regex_match)
newname = new_datestamp + 'T' + \
newname = self.get_date_string_from_named_groups(regex_match) + 'T' + \
regex_match.group('hour2') + '.' + regex_match.group('minute2') + '.' + regex_match.group('second2') + ' ' + \
regex_match.group('channel') + ' - ' + self.get_unique_show_and_title(regex_match.group('show'), regex_match.group('title')) + ' -- ' + \
regex_match.group('channel') + ' - ' + regex_match.group('show') + ' - ' + regex_match.group('title') + ' -- ' + \
qualitytag + '.mp4'
else:
# the file name did NOT contain the optional chunk time-stamp(s), so we have to use the main time-stamp
newname = self.get_datetime_string_from_named_groups(regex_match) + \
regex_match.group('channel') + ' - ' + self.get_unique_show_and_title(regex_match.group('show'), regex_match.group('title')) + ' -- ' + \
regex_match.group('channel') + ' - ' + regex_match.group('show') + ' - ' + regex_match.group('title') + ' -- ' + \
qualitytag + '.mp4'
return newname.replace('_', ' ')
@ -457,7 +425,7 @@ class GuessFilename(object):
qualitytag = self.translate_ORF_quality_string_to_tag(regex_match.group('qualityindicator'))
newname = self.get_datetime_string_from_named_groups(regex_match) + ' ' + \
regex_match.group('channel') + ' - ' + self.get_unique_show_and_title(regex_match.group('show'), regex_match.group('title')) + ' -- ' + \
regex_match.group('channel') + ' - ' + regex_match.group('show') + ' - ' + regex_match.group('title') + ' -- ' + \
qualitytag + '.mp4'
return newname.replace('_', ' ')
@ -494,7 +462,7 @@ class GuessFilename(object):
qualitytag = self.translate_ORF_quality_string_to_tag(regex_match.group('qualityshort').upper())
return self.get_datetime_string_from_named_groups(regex_match) + ' ' + regex_match.group('channel') + \
' - ' + self.get_unique_show_and_title(regex_match.group('show'), regex_match.group('title')) + ' -- ' + qualitytag + '.mp4'
' - ' + regex_match.group('show') + ' - ' + regex_match.group('title') + ' -- ' + qualitytag + '.mp4'
else:
# we got the ability to derive starting time from "original filename"
@ -768,15 +736,6 @@ class GuessFilename(object):
" Voltino Vorschreibung Teilbetrag " + self.config.VOLTINO_Teilbetrag + " -- " + ' '.join(self.adding_tags(tags, ['bill'])) + \
".pdf"
# 2022-06-17 Rechtschutzversicherung
if self.config.RECHTSCHUTZVERSICHERUNG in oldfilename and 'Wertanpassung' in oldfilename and datetimestr and self.has_euro_charge(oldfilename):
return datetimestr + ' ' + self.config.RECHTSCHUTZVERSICHERUNG + ' ' + self.config.RECHTSCHUTZPOLIZZE + \
' - Wertanpassung monatliche Versicherungspraemie auf ' + self.get_euro_charge(oldfilename) + '€ -- scan.pdf'
# KVR-2022-08-09-14-00-16.txt -> 2022-08-09T14.00.16.mp4
regex_match = re.match(self.KVR_REGEX, oldfilename)
if regex_match:
return self.get_datetime_description_extension_filename(regex_match, replace_description_underscores=True)
# FIXXME: more cases!
@ -803,11 +762,11 @@ class GuessFilename(object):
return False
try:
pdffile = PyPDF2.PdfFileReader(open(filename, "rb"))
pdffile = PyPDF2.PdfReader(open(filename, "rb"))
# if PDF is encrypted, try password stored in config file
# or quit this function if decryption is not successful
if pdffile.isEncrypted:
if pdffile.is_encrypted:
returncode = pdffile.decrypt(self.config.SALARY_PDF_PASSWORD)
if returncode < 1:
logging.error('PDF file is encrypted and could NOT be decrypted using ' +
@ -818,10 +777,10 @@ class GuessFilename(object):
'config.SALARY_PDF_PASSWORD. Return code = ' + str(returncode))
# use first and second page of content only:
if pdffile.getNumPages() > 1:
content = pdffile.pages[0].extractText() + pdffile.pages[1].extractText()
elif pdffile.getNumPages() == 1:
content = pdffile.pages[0].extractText()
if len(pdffile.pages) > 1:
content = pdffile.pages[0].extract_text() + pdffile.pages[1].extract_text()
elif len(pdffile.pages) == 1:
content = pdffile.pages[0].extract_text()
else:
logging.error('Could not determine number of pages of PDF content! (skipping content analysis)')
return False
@ -855,15 +814,24 @@ class GuessFilename(object):
# should parse starting sequence of
# "^.LOHN/GEHALTSABRECHNUNG JÄNNER 2018Klien..." and
# return "Jaenner"
month_of_salary = re.match(r'.LOHN.*/.*GEHALTSABRECHNUNG (.+) .+', content).group(1).capitalize().replace('ä', 'ae')
month_of_salary = re.search(r'.*Semimonthly ?\d?\d-(\w\w\w)', content).group(1).capitalize()
except:
logging.error('derive_new_filename_from_content(' + filename + '): I recognized pattern ' +
'for salary file but content format for extracting month must have changed.')
month_of_salary = 'FIXXME'
if datetimestr is None:
try:
date = re.search(r'.*Semimonthly ?\d?\d-\w\w\w-\d\d\d\d ?(\d\d-\w\w\w-\d\d\d\d)', content).group(1)
datetimestr = datetime.datetime.strptime(date, "%d-%b-%Y").strftime("%Y-%m-%d")
except:
logging.error('derive_new_filename_from_content(' + filename + '): I recognized pattern ' +
'for salary file but content format for date/time must have changed.')
try:
# should extract "2.345,67" from following sequence
# ".+SZAbzüge1.234,56Netto2.345,67IBAN:.+"
net_salary = re.match(r'.+Netto(\d\.\d{3},\d{2})IBAN.+', content).group(1)
net_salary = re.search(r'.+Ally Bank CHECKING XXXXXX9933 ([0-9,]+)', content).group(1)
except:
logging.error('derive_new_filename_from_content(' + filename + '): I recognized pattern ' +
'for salary file but content format for extracting net salary must have changed.')
@ -872,8 +840,8 @@ class GuessFilename(object):
# hint when he wants to open the PDF in a PDF viewer
print(' ' * 7 + colorama.Style.DIM + '→ PDF file password: ' + self.config.SALARY_PDF_PASSWORD +
colorama.Style.RESET_ALL)
return datetimestr + ' ' + self.config.SALARY_DESCRIPTION + ' ' + month_of_salary + ' - ' + \
net_salary + ' -- detego private.pdf'
return datetimestr + ' ' + self.config.SALARY_DESCRIPTION + ' ' + month_of_salary + ' - $' + \
net_salary + ' -- private.pdf'
# 2010-06-08 easybank - neue TAN-Liste -- scan private.pdf
if self.fuzzy_contains_all_of(content, ["Transaktionsnummern (TANs)", "Ihre TAN-Liste in Verlust geraten"]) and \
@ -964,7 +932,6 @@ class GuessFilename(object):
"extractor" in data.keys() and \
"display_id" in data.keys() and \
"ext" in data.keys() and \
"duration_string" in data.keys() and \
"fulltitle" in data.keys():
if data['upload_date'] and len(data['upload_date']) == 8 and \
@ -972,7 +939,7 @@ class GuessFilename(object):
logging.debug('derive_new_filename_from_json_metadata: found all ' +
'required meta data for YouTube download file style')
# example from unit tests: "2007-09-13 youtube - The Star7 PDA Prototype - Ahg8OBYixL0.mp4"
return data['upload_date'][:4] + '-' + data['upload_date'][4:6] + '-' + data['upload_date'][6:] + ' ' + data["extractor"] + ' - ' + data["fulltitle"] + ' - ' + data["display_id"] + ' ' + data["duration_string"].replace(':', ';') + '.' + data["ext"]
return data['upload_date'][:4] + '-' + data['upload_date'][4:6] + '-' + data['upload_date'][6:] + ' ' + data["extractor"] + ' - ' + data["fulltitle"] + ' - ' + data["display_id"] + '.' + data["ext"]
else:
logging.debug('derive_new_filename_from_json_metadata: found all required meta data ' +
'for YouTube download file style but upload_date or extractor_key do ' +
@ -1036,8 +1003,10 @@ class GuessFilename(object):
print("Could not find Python module \"exiftool\".\nPlease install it, e.g., with \"sudo pip install pyexiftool\".")
sys.exit(1)
myexiftool = exiftool.ExifToolHelper()
metadata = myexiftool.get_metadata(files = [os.path.join(dirname, basename)])[0]
myexiftool = exiftool.ExifTool()
myexiftool.start()
metadata = myexiftool.get_metadata(filename = os.path.join(dirname, basename))
myexiftool.terminate()
extension = os.path.splitext(basename)[1]
@ -1525,17 +1494,6 @@ class GuessFilename(object):
assert(regex_match.group('year'))
return regex_match.group('year') + '-' + regex_match.group('month') + '-' + regex_match.group('day')
def get_incremented_date_string_from_named_groups(self, regex_match):
    """
    Returns the day following the matched date as a 'YYYY-MM-DD' string.

    Only the named groups 'year', 'month' and 'day' of regex_match are
    read; any time-related groups are ignored. Month and year boundaries
    are handled by the date arithmetic, e.g., 2023-12-31 results in
    '2024-01-01'.

    regex_match: match object providing the named groups year, month, day
    return value: date string of the following day in the form YYYY-MM-DD
    raises: AssertionError if the match or one of the three groups is empty
    """
    assert(regex_match)
    assert(regex_match.group('day'))
    assert(regex_match.group('month'))
    assert(regex_match.group('year'))

    # A plain date is sufficient here: only the calendar day gets incremented.
    matched_day = datetime.date(int(regex_match.group('year')),
                                int(regex_match.group('month')),
                                int(regex_match.group('day')))
    the_next_day = matched_day + datetime.timedelta(days=1)
    return the_next_day.strftime('%Y-%m-%d')
def get_datetime_description_extension_filename(self, regex_match, replace_description_underscores=False):
"""
When a regex_match has matching groups for datetime elements, an optional description

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python; -*-
# Time-stamp: <2023-03-15 10:52:15 vk>
# Time-stamp: <2020-06-17 18:13:56 vk>
import unittest
import logging
@ -442,7 +442,6 @@ class TestGuessFilename(unittest.TestCase):
"acodec": "mp4a.40.2",
"dislike_count": 3,
"abr": 96,
"duration_string": "12:34:56",
"creator": null,
"filesize": 26294671,
"id": "Ahg8OBYixL0",
@ -459,10 +458,8 @@ class TestGuessFilename(unittest.TestCase):
"tbr": 355.714
}""")
new_mediafilename = self.guess_filename.handle_file(mediafile, False)
assert(type(new_mediafilename) == str)
new_mediafilename_generated = os.path.join(tmpdir, new_mediafilename)
new_mediafilename_comparison = os.path.join(tmpdir, "2007-09-13 youtube - The Star7 PDA Prototype - Ahg8OBYixL0 12;34;56.mp4")
new_mediafilename_generated = os.path.join(tmpdir, self.guess_filename.handle_file(mediafile, False))
new_mediafilename_comparison = os.path.join(tmpdir, "2007-09-13 youtube - The Star7 PDA Prototype - Ahg8OBYixL0.mp4")
self.assertEqual(new_mediafilename_generated, new_mediafilename_comparison)
os.remove(new_mediafilename_generated)
@ -863,7 +860,7 @@ class TestGuessFilename(unittest.TestCase):
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename("20180510T090000 ORF - ZIB - Weitere Signale der Entspannung -ORIGINAL- 2018-05-10_0900_tl_02_ZIB-9-00_Weitere-Signale__13976423__o__5968792755__s14297694_4__WEB03HD_09011813P_09020710P_Q4A.mp4"),
"2018-05-10T09.01.18 ORF - ZIB - Weitere Signale der Entspannung -- lowquality.mp4")
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename("20180520T201500 ORF - Tatort - Tatort_ Aus der Tiefe der Zeit -ORIGINAL- 2018-05-20_2015_in_02_Tatort--Aus-der_____13977411__o__1151703583__s14303062_Q8C.mp4"),
"2018-05-20T20.15.00 ORF - Tatort Aus der Tiefe der Zeit -- highquality.mp4")
"2018-05-20T20.15.00 ORF - Tatort - Tatort Aus der Tiefe der Zeit -- highquality.mp4")
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename("20180521T193000 ORF - ZIB 1 - Parlament bereitet sich auf EU-Vorsitz vor -ORIGINAL- 2018-05-21_1930_tl_02_ZIB-1_Parlament-berei__13977453__o__277886215b__s14303762_2__WEB03HD_19350304P_19371319P_Q4A.mp4"),
"2018-05-21T19.35.03 ORF - ZIB 1 - Parlament bereitet sich auf EU-Vorsitz vor -- lowquality.mp4")
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('20190902T220000 ORF - ZIB 2 - Bericht über versteckte ÖVP-Wahlkampfkosten -ORIGINALlow- 2019-09-02_2200_tl_02_ZIB-2_Bericht-ueber-v__14024705__o__71528285d6__s14552793_3__ORF2HD_22033714P_22074303P_Q4A.mp4'),
@ -912,29 +909,19 @@ class TestGuessFilename(unittest.TestCase):
# ORF TV Mediathek as of 2018-11-01: when there is no original filename with %N, I have to use the data I've got
# see https://github.com/mediathekview/MServer/issues/436
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('20181028T201400 ORF - Tatort - Tatort Blut -ORIGINALhd- playlist.m3u8.mp4'),
'2018-10-28T20.14.00 ORF - Tatort Blut -- highquality.mp4')
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('20181028T201400 ORF - Tatort - Tatort Blut -ORIGINALlow- playlist.m3u8.mp4'),
'2018-10-28T20.14.00 ORF - Tatort Blut -- lowquality.mp4')
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('20181028T201400 ORF - Tatort - Tatort_ Blut -ORIGINALhd- playlist.m3u8.mp4'),
'2018-10-28T20.14.00 ORF - Tatort - Tatort_ Blut -- highquality.mp4')
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('20181028T201400 ORF - Tatort - Tatort_ Blut -ORIGINALlow- playlist.m3u8.mp4'),
'2018-10-28T20.14.00 ORF - Tatort - Tatort_ Blut -- lowquality.mp4')
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('20181022T211100 ORF - Thema - Das Essen der Zukunft -ORIGINALhd- playlist.m3u8.mp4'),
'2018-10-22T21.11.00 ORF - Thema - Das Essen der Zukunft -- highquality.mp4')
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('20181022T211100 ORF - Thema - Das Essen der Zukunft -ORIGINALlow- playlist.m3u8.mp4'),
'2018-10-22T21.11.00 ORF - Thema - Das Essen der Zukunft -- lowquality.mp4')
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('20181025T210500 ORF - Am Schauplatz - Am Schauplatz Wenn alles zusammenbricht -ORIGINALhd- playlist.m3u8.mp4'),
'2018-10-25T21.05.00 ORF - Am Schauplatz Wenn alles zusammenbricht -- highquality.mp4')
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('20181025T210500 ORF - Am Schauplatz - Am Schauplatz Wenn alles zusammenbricht -ORIGINALlow- playlist.m3u8.mp4'),
'2018-10-25T21.05.00 ORF - Am Schauplatz Wenn alles zusammenbricht -- lowquality.mp4')
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('20181025T210500 ORF - Am Schauplatz Wenn alles zusammenbricht - Am Schauplatz -ORIGINALlow- playlist.m3u8.mp4'),
'2018-10-25T21.05.00 ORF - Am Schauplatz Wenn alles zusammenbricht -- lowquality.mp4')
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('20181025T210500 ORF - Am Schauplatz - Am Schauplatz_ Wenn alles zusammenbricht -ORIGINALhd- playlist.m3u8.mp4'),
'2018-10-25T21.05.00 ORF - Am Schauplatz - Am Schauplatz_ Wenn alles zusammenbricht -- highquality.mp4')
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('20181025T210500 ORF - Am Schauplatz - Am Schauplatz_ Wenn alles zusammenbricht -ORIGINALlow- playlist.m3u8.mp4'),
'2018-10-25T21.05.00 ORF - Am Schauplatz - Am Schauplatz_ Wenn alles zusammenbricht -- lowquality.mp4')
## ORF TV Mediathek as of 2023-03-05:
## 20230303T232946 ORF - Gute Nacht Österreich mit Peter Klien - Wirtschaftliche Probleme in Großbritannien -ORIGINALlow- 2023-03-03_2329_tl_01_Gute-Nacht-Oest_Wirtschaftliche__14170146__o__3365936366__s15349885_5__ORF1HD_00005621P_00105414P_Q4A.mp4
## 2023-03-04T00.00.56 ORF - Gute Nacht Österreich mit Peter Klien - Wirtschaftliche Probleme in Großbritannien -- lowquality.mp4
## ... the day should be incremented because this did start shortly before midnight but this part was started after midnight
## When the actual start time (2nd timestamp in filename) is older than 10 hours compared to the file name start time, assume it is actually started after midnight.
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('20230303T232946 ORF - Gute Nacht Österreich mit Peter Klien - Wirtschaftliche Probleme in Großbritannien -ORIGINALlow- 2023-03-03_2329_tl_01_Gute-Nacht-Oest_Wirtschaftliche__14170146__o__3365936366__s15349885_5__ORF1HD_00005621P_00105414P_Q4A.mp4'),
'2023-03-04T00.00.56 ORF - Gute Nacht Österreich mit Peter Klien - Wirtschaftliche Probleme in Großbritannien -- lowquality.mp4')
# Digital camera from Android
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('IMG_20190118_133928.jpg'),
'2019-01-18T13.39.28.jpg')