mirror of
https://github.com/novoid/guess-filename.py.git
synced 2026-02-16 05:14:16 +00:00
re-ordered function definitions
This commit is contained in:
parent
207728809d
commit
0dbdc168ca
1 changed files with 410 additions and 414 deletions
|
|
@ -1,6 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
PROG_VERSION = u"Time-stamp: <2019-10-19 12:13:52 vk>"
|
||||
PROG_VERSION = u"Time-stamp: <2019-10-19 12:52:48 vk>"
|
||||
|
||||
|
||||
# TODO:
|
||||
|
|
@ -275,419 +275,6 @@ class GuessFilename(object):
|
|||
self.logger = logger
|
||||
self.config = config
|
||||
|
||||
def adding_tags(self, tagarray, newtags):
    """
    Returns a new list holding the tags of tagarray followed by every tag
    of newtags that is not contained yet.

    Neither of the two input lists is modified.

    @param tagarray: a list of strings containing the existing tags
    @param newtags: a list of strings containing the tags to add
    @param return: a new list of strings containing the merged tags
    """

    assert isinstance(tagarray, list)
    assert isinstance(newtags, list)

    # Copy first: a plain assignment would alias the caller's list and
    # every append below would silently modify the argument as well.
    resulting_tags = list(tagarray)

    for tag in newtags:
        if tag not in resulting_tags:
            resulting_tags.append(tag)

    return resulting_tags
|
||||
|
||||
def split_filename_entities(self, filename):
    """
    Splits a file name of the format
    ( (date(time)?)?(--date(time)?)? )? filename (tags)? (extension)?
    by matching it against self.ISO_NAME_TAGS_EXTENSION_REGEX.

    @param filename: non-empty string containing the file name
    @param return: tuple of (date/time/duration, filename, list of tags, extension)
    """

    # FIXXME: return directory as well!

    assert type(filename) is str
    assert len(filename) > 0

    match = re.match(self.ISO_NAME_TAGS_EXTENSION_REGEX, filename)
    assert match

    # the tags group is one blank-separated string (or empty/None if absent)
    raw_tags = match.group(self.TAGS_INDEX)
    tags = raw_tags.split(' ') if raw_tags else []

    return (match.group(self.DAYTIME_DURATION_INDEX),
            match.group(self.NAME_INDEX),
            tags,
            match.group(self.EXTENSION_INDEX))
|
||||
|
||||
def contains_one_of(self, string, entries):
    """
    Returns True as soon as one string of the entries list is a substring
    of string, False if none of them is.

    @param string: non-empty string to search in
    @param entries: non-empty list of strings to look for
    @param return: boolean
    """

    assert type(string) is str
    assert type(entries) is list
    assert len(string) > 0
    assert len(entries) > 0

    return any(candidate in string for candidate in entries)
|
||||
|
||||
def contains_all_of(self, string, entries):
    """
    Returns True only if every string of the entries list is a substring
    of string, False as soon as one of them is missing.

    @param string: non-empty string to search in
    @param entries: non-empty list of strings to look for
    @param return: boolean
    """

    assert type(string) is str
    assert type(entries) is list
    assert len(string) > 0
    assert len(entries) > 0

    return all(candidate in string for candidate in entries)
|
||||
|
||||
def fuzzy_contains_one_of(self, string, entries):
    """
    Returns True as soon as one string of the entries list is similar to
    a part of string, False if none of them is.

    An entry counts as similar when fuzz.partial_ratio(string, entry)
    is greater than 64.

    @param string: non-empty string to search in
    @param entries: non-empty list of strings to look for
    @param return: boolean
    """

    assert type(string) is str
    assert type(entries) is list
    assert len(string) > 0
    assert len(entries) > 0

    return any(fuzz.partial_ratio(string, candidate) > 64 for candidate in entries)
|
||||
|
||||
def fuzzy_contains_all_of(self, string, entries):
    """
    Returns True only if every string of the entries list is contained in
    string, either exactly or in a similar form.

    An entry that is no exact substring counts as similar when
    fuzz.partial_ratio(string, entry) is greater than 64.

    @param string: non-empty string to search in
    @param entries: non-empty list of strings to look for
    @param return: boolean
    """

    assert type(string) is str
    assert type(entries) is list
    assert len(string) > 0
    assert len(entries) > 0

    for candidate in entries:
        assert type(candidate) is str

        if candidate in string:
            # exact hit: no fuzzy comparison necessary
            continue

        # entry is NOT found in string (exactly): try with fuzzy search
        similarity = fuzz.partial_ratio(string, candidate)
        if similarity <= 64:
            return False

    return True
|
||||
|
||||
def has_euro_charge(self, string):
    """
    Returns True if the single-line string is matched by
    self.EURO_CHARGE_REGEX (a number with a €-currency), False otherwise.

    @param string: non-empty string to check
    @param return: boolean
    """

    assert type(string) is str
    assert len(string) > 0

    return re.match(self.EURO_CHARGE_REGEX, string) is not None
|
||||
|
||||
def get_euro_charge(self, string):
    """
    Returns the €-currency of the single-line string, which is the match
    group self.EURO_CHARGE_INDEX of self.EURO_CHARGE_REGEX, or False if
    the regular expression does not match.

    @param string: non-empty string to extract the charge from
    @param return: string containing the charge or False
    """

    assert type(string) is str
    assert len(string) > 0

    match = re.match(self.EURO_CHARGE_REGEX, string)
    return match.group(self.EURO_CHARGE_INDEX) if match else False
|
||||
|
||||
def get_euro_charge_from_context_or_basename(self, string, before, after, basename):
    """
    Returns the €-currency which is found between the before and after
    strings within string. If there is none, the charge found within the
    basename is returned. If this fails as well, 'FIXXME' is returned.

    @param string: string to search for the charge in its context
    @param before: string which is expected ahead of the charge
    @param after: string which is expected behind the charge
    @param basename: file name used as fallback source for the charge
    @param return: string containing the charge or 'FIXXME'
    """

    # "or" evaluates lazily: the basename is only consulted if the
    # context lookup did not deliver anything
    charge = (self.get_euro_charge_from_context(string, before, after)
              or self.get_euro_charge(basename))

    return charge or 'FIXXME'
|
||||
|
||||
def get_euro_charge_from_context(self, string, before, after):
    """
    Returns the included €-currency which is between before and after strings or False.

    The charge has to consist of one to six integer digits, a "," or "."
    and exactly two decimals. At most context_range non-digit characters
    may be located between before and the charge as well as between the
    charge and after.

    before and after are inserted into the search pattern as they are,
    so regular expression syntax within them is interpreted.

    @param string: non-empty string to search in
    @param before: string (pattern) which is expected ahead of the charge
    @param after: string (pattern) which is expected behind the charge
    @param return: string "<integer digits>,<decimals>" or False
    """

    assert isinstance(string, str)
    assert isinstance(before, str)
    assert isinstance(after, str)
    assert len(string) > 0

    context_range = '5'  # range of characters where before/after is valid

    # for testing: re.search(".*" + before + r"\D{0,6}(\d{1,6}[,.]\d{2})\D{0,6}" + after + ".*", string).groups()
    # All fragments holding "\d" or "\D" are raw strings: within normal
    # string literals these are invalid escape sequences.
    # The leading ".*" is greedy, so the right-most candidate of a line wins.
    pattern = (".*" + before +
               r"\D{0," + context_range + r"}((\d{1,6})[,.](\d{2}))\D{0," + context_range + "}" +
               after + ".*")
    components = re.search(pattern, string)

    if components:
        # normalize the decimal separator to a comma
        floatstring = components.group(2) + ',' + components.group(3)
        # logging.debug("get_euro_charge_from_context extracted float: [%s]" % floatstring)
        return floatstring

    logging.warning("Sorry, I was not able to extract a charge for this file, please fix manually")
    logging.debug("get_euro_charge_from_context was not able to extract a float: between [%s] and [%s] within [%s]" % (before, after, string[:30] + "..."))
    return False
|
||||
|
||||
def rename_file(self, dirname, oldbasename, newbasename, dryrun=False, quiet=False):
    """
    Renames the file oldbasename within dirname to newbasename.

    Nothing is renamed if both names are identical, if the old file does
    not exist or if a file with the new name exists already.

    Only simulates result if dryrun is True.

    @param dirname: string containing the directory of the file
    @param oldbasename: string containing the old file name (basename)
    @param newbasename: string containing the new file name (basename)
    @param dryrun: boolean which defines if files should be changed (False) or not (True)
    @param quiet: boolean which defines if the new name is printed (False) or not (True)
    @param return: True if the file was renamed (or would have been), False if it was skipped
    """

    if oldbasename == newbasename:
        logging.info("Old filename is same as new filename: skipping file")
        return False

    source_path = os.path.join(dirname, oldbasename)
    target_path = os.path.join(dirname, newbasename)

    if not os.path.isfile(source_path):
        logging.error("file to rename does not exist: [%s]" % source_path)
        return False

    # never overwrite an already existing file
    if os.path.isfile(target_path):
        logging.error("file can't be renamed since new file name already exists: [%s]" % target_path)
        return False

    if not quiet:
        highlighted_name = colorama.Style.BRIGHT + colorama.Fore.GREEN + newbasename + colorama.Style.RESET_ALL
        print(' → ' + highlighted_name)

    logging.debug(' renaming "%s"' % source_path)
    logging.debug(' ⤷ "%s"' % target_path)

    if not dryrun:
        os.rename(source_path, target_path)

    return True
|
||||
|
||||
def build_string_via_indexgroups(self, regex_match, indexgroups):
    """Concatenates groups of a regex match object and literal strings
    as described by the list of indexgroups.

    The items of indexgroups are processed in their order:

    - a string is appended as it is
    - an integer appends the corresponding regex_match.group(); groups
      that are None or empty are ignored
    - a list is processed the same way (recursively), but only if all
      of its integer items refer to groups that are neither None nor
      empty; otherwise the whole list is ignored

    @param regex_match: a regex match object from re.match(REGEX, STRING)
    @param indexgroups: list of strings and integers like [1, '-', 2, '-', 3, 'T', 4, '.', 5, ' foo .png']
    @param return: string containing the concatenated string ("ERROR" if regex_match is empty)

    """

    if not regex_match:
        logging.error('no re.match object found; please check before calling build_string_via_indexgroups()')
        return "ERROR"

    def group_is_filled(index):
        # a group is usable if it took part in the match and is not empty
        content = regex_match.group(index)
        return content is not None and len(content) >= 1

    def render(items):
        pieces = []
        for item in items:
            kind = type(item)
            if kind is str:
                pieces.append(item)
            elif kind is int:
                content = regex_match.group(item)
                # ignore None matches
                if content:
                    pieces.append(content)
            elif kind is list:
                # sub-list: all-or-nothing, depending on its direct integer items
                # (list instead of generator: every group gets looked up)
                if all([group_is_filled(sub) for sub in item if type(sub) is int]):
                    pieces.append(render(item))
        return ''.join(pieces)

    logging.debug('build_string_via_indexgroups: FILENAME: ' + str(regex_match.group(0)))
    logging.debug('build_string_via_indexgroups: GROUPS: ' + str(regex_match.groups()))
    result = render(indexgroups)
    logging.debug('build_string_via_indexgroups: RESULT: ' + result)
    return result
|
||||
|
||||
|
||||
def NumToMonth(self, month):
    """
    Returns the German name of the month with the given number.

    1 to 12 are mapped to 'Jaenner' to 'Dezember'. 0 is mapped to
    'Dezember' as well (presumably for "month before January"
    calculations; verify against callers).

    @param month: integer used as index into the list of month names
    @param return: string containing the name of the month
    """

    month_names = [
        'Dezember',
        'Jaenner', 'Februar', 'Maerz', 'April', 'Mai', 'Juni',
        'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember',
    ]
    return month_names[month]
|
||||
|
||||
|
||||
def translate_ORF_quality_string_to_tag(self, quality_string):
    """
    Returns the filetag which corresponds to the quality key string that
    is part of the download file names of the ORF company.

    @param quality_string: quality key such as 'Q4A', 'LOW', 'Q6A', 'Q8C' or 'HD'
    @param return: 'lowquality', 'highquality' or 'UNKNOWNQUALITY' for any other key
    """

    if quality_string in ('Q4A', 'LOW'):
        return 'lowquality'

    if quality_string in ('Q6A', 'Q8C', 'HD'):
        return 'highquality'

    return 'UNKNOWNQUALITY'
|
||||
|
||||
|
||||
def get_file_size(self, filename):
    """
    A simple wrapper to determine file sizes.

    For some hard-coded file names, a hard-coded file size is returned. This enables
    unit-testing for file sizes that do not exist in the file system.

    For any other file name, the size reported by os.stat() is returned.
    If os.stat() raises an OSError, error_exit() is called.

    @param filename: string containing the name of the file
    @param return: integer containing the file size in bytes
    """

    # these are the hard-coded sizes for unit test cases:

    # don't care about file sizes, return a high number that is above the expected minimum in any case:
    size_does_not_matter = [
        '20180510T090000 ORF - ZIB - Signation -ORIGINAL- 2018-05-10_0900_tl_02_ZIB-9-00_Signation__13976423__o__1368225677__s14297692_2__WEB03HD_09000305P_09001400P_Q4A.mp4',
        '20180510T090000 ORF - ZIB - Weitere Signale der Entspannung -ORIGINAL- 2018-05-10_0900_tl_02_ZIB-9-00_Weitere-Signale__13976423__o__5968792755__s14297694_4__WEB03HD_09011813P_09020710P_Q4A.mp4',
        '20180520T201500 ORF - Tatort - Tatort_ Aus der Tiefe der Zeit -ORIGINAL- 2018-05-20_2015_in_02_Tatort--Aus-der_____13977411__o__1151703583__s14303062_Q8C.mp4',
        '20180521T193000 ORF - ZIB 1 - Parlament bereitet sich auf EU-Vorsitz vor -ORIGINAL- 2018-05-21_1930_tl_02_ZIB-1_Parlament-berei__13977453__o__277886215b__s14303762_2__WEB03HD_19350304P_19371319P_Q4A.mp4',
        '20180608T193000 ORF - Österreich Heute - Das Magazin - Österreich Heute - Das Magazin -ORIGINAL- 13979231_0007_Q8C.mp4',
        '20190902T220000 ORF - ZIB 2 - Bericht über versteckte ÖVP-Wahlkampfkosten -ORIGINALlow- 2019-09-02_2200_tl_02_ZIB-2_Bericht-ueber-v__14024705__o__71528285d6__s14552793_3__ORF2HD_22033714P_22074303P_Q4A.mp4',
        '20190902T220000 ORF - ZIB 2 - Hinweis _ Verabschiedung -ORIGINALlow- 2019-09-02_2200_tl_02_ZIB-2_Hinweis---Verab__14024705__o__857007705d__s14552799_9__ORF2HD_22285706P_22300818P_Q4A.mp4',
    ]
    if filename in size_does_not_matter:
        return 99999999

    # pairs of (file name, size in bytes) where the exact size is relevant for the test:
    fixture_sizes = [
        # from an actual downloaded file
        ('20180608T170000 ORF - ZIB 17_00 - size okay -ORIGINAL- 2018-06-08_1700_tl__13979222__o__1892278656__s14313181_1__WEB03HD_17020613P_17024324P_Q4A.mp4',
         5017289),
        # manually reduced size from the value of an actual downloaded file
        ('20180608T170000 ORF - ZIB 17_00 - size not okay -ORIGINAL- 2018-06-08_1700_tl__13979222__o__1892278656__s14313181_1__WEB03HD_17020613P_17024324P_Q4A.mp4',
         4217289),
        # from an actual downloaded file
        ('20180608T170000 ORF - ZIB 17_00 - size okay -ORIGINAL- 2018-06-08_1700_tl__13979222__o__1892278656__s14313181_1__WEB03HD_17020613P_17024324P_Q8C.mp4',
         15847932),
        # manually reduced size from the value of an actual downloaded file
        ('20180608T170000 ORF - ZIB 17_00 - size not okay -ORIGINAL- 2018-06-08_1700_tl__13979222__o__1892278656__s14313181_1__WEB03HD_17020613P_17024324P_Q8C.mp4',
         14050000),
        # from actual file
        ('20180610T000000 ORF - Kleinkunst - Kleinkunst_ Cordoba - Das Rückspiel (2_2) -ORIGINAL- 2018-06-10_0000_sd_06_Kleinkunst--Cor_____13979381__o__1483927235__s14313621_1__ORF3HD_23592020P_00593103P_Q8C.mp4',
         1506829698),
        # from actual file
        ('2018-06-14_2105_sd_02_Am-Schauplatz_-_Alles für die Katz-_____13979879__o__1907287074__s14316407_7__WEB03HD_21050604P_21533212P_Q8C.mp4',
         1214980782),
        # from actual file
        ('2018-06-14_2155_sd_06_Kottan-ermittelt - Wien Mitte_____13979903__o__1460660672__s14316392_2__ORF3HD_21570716P_23260915P_Q8C.mp4',
         2231522252),
        # from actual file
        ('2018-06-14_2330_sd_06_Sommerkabarett - Lukas Resetarits: Schmäh (1 von 2)_____13979992__o__1310584704__s14316464_4__ORF3HD_23301620P_00302415P_Q8C.mp4',
         1506983474),
    ]
    for known_name, known_size in fixture_sizes:
        if filename == known_name:
            return known_size

    # no unit test fixture: ask the file system
    try:
        return os.stat(filename).st_size
    except OSError:
        error_exit(10, 'get_file_size(): Could not get file size of: ' + filename)
|
||||
|
||||
|
||||
def warn_if_ORF_file_seems_to_small_according_to_duration_and_quality_indicator(self, oldfilename, qualityindicator,
                                                                                start_hrs, start_min, start_sec,
                                                                                end_hrs, end_min, end_sec):
    """
    Launches a warning if the expected size differs from the actual file size.

    NOTE: this check is currently disabled. The function returns None
    right at its beginning; all code after that "return" is not executed.

    Expected size is derived from the detailed time-stamp information
    and tests with a ten minute file:

    | Quality Indicator       | file size | bytes per second |
    |-------------------------+-----------+------------------|
    | Q8C = HD                | 240429907 |           400717 |
    | Q6A = high quality      | 150198346 |           250331 |
    | Q4A = low quality       |  74992178 |           124987 |

    @param oldfilename: string containing the file name whose size is checked
    @param qualityindicator: string containing the quality key ('Q8C', 'Q6A' or 'Q4A')
    @param start_hrs: hours of the start time (converted via int())
    @param start_min: minutes of the start time (converted via int())
    @param start_sec: seconds of the start time (converted via int())
    @param end_hrs: hours of the end time (converted via int())
    @param end_min: minutes of the end time (converted via int())
    @param end_sec: seconds of the end time (converted via int())
    """

    # FIXXME: 2019-08-26: disabled: correct from exception to warning
    # FIXXME: 2019-09-03: assigned tests also disabled because this function never raises the expected exception
    return

    # ---- everything below is unreachable as long as the "return" above is in place ----

    TOLERANCE_FACTOR = 0.95  # To cover edge cases where a reduced file size is feasible

    file_size = self.get_file_size(oldfilename)

    # start and end only consist of a time of day: an end hour which is
    # smaller than the start hour is interpreted as "end is on the next day"
    day_of_end = 1
    if int(end_hrs) < int(start_hrs):
        logging.debug('end hours is less than begin hours, adding a day-change for calculating duration')
        day_of_end = 2

    # the date itself (May 1980) is arbitrary, only the difference is of interest
    end = datetime.datetime(1980, 5, day_of_end, int(end_hrs), int(end_min), int(end_sec))
    start = datetime.datetime(1980, 5, 1, int(start_hrs), int(start_min), int(start_sec))
    duration = end - start
    duration_in_seconds = duration.seconds
    assert(duration_in_seconds > 0)

    # rounded bytes-per-second values of the table within the docstring
    if qualityindicator == 'Q8C':
        minimum_expected_file_size = 400000 * duration_in_seconds * TOLERANCE_FACTOR
    elif qualityindicator == 'Q6A':
        minimum_expected_file_size = 250000 * duration_in_seconds * TOLERANCE_FACTOR
    elif qualityindicator == 'Q4A':
        minimum_expected_file_size = 125000 * duration_in_seconds * TOLERANCE_FACTOR
    else:
        # NOTE(review): logging.warn is a deprecated alias of logging.warning
        logging.warn('Unknown quality indicator prevents file size check: ' + qualityindicator)
        return

    # additional check for minimum duration because small videos often produced wrong error messages:
    if duration_in_seconds > 120 and file_size < minimum_expected_file_size:
        print('\n → ' + colorama.Style.BRIGHT + colorama.Fore.RED +
              'ERROR: file size seems to be too small for the given duration ' +
              'and quality indicator found (download aborted?): \n' +
              ' ' * 10 + 'file size: ' + "{:,}".format(file_size) + ' Bytes\n' +
              ' ' * 10 + 'expected minimum size: ' + "{:,}".format(minimum_expected_file_size) + ' Bytes\n' +
              ' ' * 10 + 'duration: ' + str('%.1f' % (duration_in_seconds/60)) + ' minutes\n' +
              ' ' * 10 + 'quality: ' + qualityindicator + '\n' +
              ' ' * 10 + 'file name: ' + oldfilename + colorama.Style.RESET_ALL + '\n')
        raise(FileSizePlausibilityException('file size is not plausible (too small)'))
    else:
        logging.debug('warn_if_ORF_file_seems_to_small_according_to_duration_and_quality_indicator: ' +
                      'file size (' + "{:,}".format(file_size) +
                      ') is plausible compared to expected minimum (' +
                      "{:,}".format(minimum_expected_file_size) +
                      ')')
|
||||
|
||||
|
||||
def derive_new_filename_from_old_filename(self, oldfilename):
|
||||
"""
|
||||
|
|
@ -1286,6 +873,415 @@ class GuessFilename(object):
|
|||
move_to_error_dir(dirname, basename)
|
||||
return False
|
||||
|
||||
def adding_tags(self, tagarray, newtags):
    """
    Returns a new list holding the tags of tagarray followed by every tag
    of newtags that is not contained yet.

    Neither of the two input lists is modified.

    @param tagarray: a list of strings containing the existing tags
    @param newtags: a list of strings containing the tags to add
    @param return: a new list of strings containing the merged tags
    """

    assert isinstance(tagarray, list)
    assert isinstance(newtags, list)

    # Copy first: a plain assignment would alias the caller's list and
    # every append below would silently modify the argument as well.
    resulting_tags = list(tagarray)

    for tag in newtags:
        if tag not in resulting_tags:
            resulting_tags.append(tag)

    return resulting_tags
|
||||
|
||||
def split_filename_entities(self, filename):
    """
    Splits a file name of the format
    ( (date(time)?)?(--date(time)?)? )? filename (tags)? (extension)?
    by matching it against self.ISO_NAME_TAGS_EXTENSION_REGEX.

    @param filename: non-empty string containing the file name
    @param return: tuple of (date/time/duration, filename, list of tags, extension)
    """

    # FIXXME: return directory as well!

    assert type(filename) is str
    assert len(filename) > 0

    match = re.match(self.ISO_NAME_TAGS_EXTENSION_REGEX, filename)
    assert match

    # the tags group is one blank-separated string (or empty/None if absent)
    raw_tags = match.group(self.TAGS_INDEX)
    tags = raw_tags.split(' ') if raw_tags else []

    return (match.group(self.DAYTIME_DURATION_INDEX),
            match.group(self.NAME_INDEX),
            tags,
            match.group(self.EXTENSION_INDEX))
|
||||
|
||||
def contains_one_of(self, string, entries):
    """
    Returns True as soon as one string of the entries list is a substring
    of string, False if none of them is.

    @param string: non-empty string to search in
    @param entries: non-empty list of strings to look for
    @param return: boolean
    """

    assert type(string) is str
    assert type(entries) is list
    assert len(string) > 0
    assert len(entries) > 0

    return any(candidate in string for candidate in entries)
|
||||
|
||||
def contains_all_of(self, string, entries):
    """
    Returns True only if every string of the entries list is a substring
    of string, False as soon as one of them is missing.

    @param string: non-empty string to search in
    @param entries: non-empty list of strings to look for
    @param return: boolean
    """

    assert type(string) is str
    assert type(entries) is list
    assert len(string) > 0
    assert len(entries) > 0

    return all(candidate in string for candidate in entries)
|
||||
|
||||
def fuzzy_contains_one_of(self, string, entries):
    """
    Returns True as soon as one string of the entries list is similar to
    a part of string, False if none of them is.

    An entry counts as similar when fuzz.partial_ratio(string, entry)
    is greater than 64.

    @param string: non-empty string to search in
    @param entries: non-empty list of strings to look for
    @param return: boolean
    """

    assert type(string) is str
    assert type(entries) is list
    assert len(string) > 0
    assert len(entries) > 0

    return any(fuzz.partial_ratio(string, candidate) > 64 for candidate in entries)
|
||||
|
||||
def fuzzy_contains_all_of(self, string, entries):
    """
    Returns True only if every string of the entries list is contained in
    string, either exactly or in a similar form.

    An entry that is no exact substring counts as similar when
    fuzz.partial_ratio(string, entry) is greater than 64.

    @param string: non-empty string to search in
    @param entries: non-empty list of strings to look for
    @param return: boolean
    """

    assert type(string) is str
    assert type(entries) is list
    assert len(string) > 0
    assert len(entries) > 0

    for candidate in entries:
        assert type(candidate) is str

        if candidate in string:
            # exact hit: no fuzzy comparison necessary
            continue

        # entry is NOT found in string (exactly): try with fuzzy search
        similarity = fuzz.partial_ratio(string, candidate)
        if similarity <= 64:
            return False

    return True
|
||||
|
||||
def has_euro_charge(self, string):
    """
    Returns True if the single-line string is matched by
    self.EURO_CHARGE_REGEX (a number with a €-currency), False otherwise.

    @param string: non-empty string to check
    @param return: boolean
    """

    assert type(string) is str
    assert len(string) > 0

    return re.match(self.EURO_CHARGE_REGEX, string) is not None
|
||||
|
||||
def get_euro_charge(self, string):
    """
    Returns the €-currency of the single-line string, which is the match
    group self.EURO_CHARGE_INDEX of self.EURO_CHARGE_REGEX, or False if
    the regular expression does not match.

    @param string: non-empty string to extract the charge from
    @param return: string containing the charge or False
    """

    assert type(string) is str
    assert len(string) > 0

    match = re.match(self.EURO_CHARGE_REGEX, string)
    return match.group(self.EURO_CHARGE_INDEX) if match else False
|
||||
|
||||
def get_euro_charge_from_context_or_basename(self, string, before, after, basename):
    """
    Returns the €-currency which is found between the before and after
    strings within string. If there is none, the charge found within the
    basename is returned. If this fails as well, 'FIXXME' is returned.

    @param string: string to search for the charge in its context
    @param before: string which is expected ahead of the charge
    @param after: string which is expected behind the charge
    @param basename: file name used as fallback source for the charge
    @param return: string containing the charge or 'FIXXME'
    """

    # "or" evaluates lazily: the basename is only consulted if the
    # context lookup did not deliver anything
    charge = (self.get_euro_charge_from_context(string, before, after)
              or self.get_euro_charge(basename))

    return charge or 'FIXXME'
|
||||
|
||||
def get_euro_charge_from_context(self, string, before, after):
    """
    Returns the included €-currency which is between before and after strings or False.

    The charge has to consist of one to six integer digits, a "," or "."
    and exactly two decimals. At most context_range non-digit characters
    may be located between before and the charge as well as between the
    charge and after.

    before and after are inserted into the search pattern as they are,
    so regular expression syntax within them is interpreted.

    @param string: non-empty string to search in
    @param before: string (pattern) which is expected ahead of the charge
    @param after: string (pattern) which is expected behind the charge
    @param return: string "<integer digits>,<decimals>" or False
    """

    assert isinstance(string, str)
    assert isinstance(before, str)
    assert isinstance(after, str)
    assert len(string) > 0

    context_range = '5'  # range of characters where before/after is valid

    # for testing: re.search(".*" + before + r"\D{0,6}(\d{1,6}[,.]\d{2})\D{0,6}" + after + ".*", string).groups()
    # All fragments holding "\d" or "\D" are raw strings: within normal
    # string literals these are invalid escape sequences.
    # The leading ".*" is greedy, so the right-most candidate of a line wins.
    pattern = (".*" + before +
               r"\D{0," + context_range + r"}((\d{1,6})[,.](\d{2}))\D{0," + context_range + "}" +
               after + ".*")
    components = re.search(pattern, string)

    if components:
        # normalize the decimal separator to a comma
        floatstring = components.group(2) + ',' + components.group(3)
        # logging.debug("get_euro_charge_from_context extracted float: [%s]" % floatstring)
        return floatstring

    logging.warning("Sorry, I was not able to extract a charge for this file, please fix manually")
    logging.debug("get_euro_charge_from_context was not able to extract a float: between [%s] and [%s] within [%s]" % (before, after, string[:30] + "..."))
    return False
|
||||
|
||||
def rename_file(self, dirname, oldbasename, newbasename, dryrun=False, quiet=False):
    """
    Renames the file oldbasename within dirname to newbasename.

    Nothing is renamed if both names are identical, if the old file does
    not exist or if a file with the new name exists already.

    Only simulates result if dryrun is True.

    @param dirname: string containing the directory of the file
    @param oldbasename: string containing the old file name (basename)
    @param newbasename: string containing the new file name (basename)
    @param dryrun: boolean which defines if files should be changed (False) or not (True)
    @param quiet: boolean which defines if the new name is printed (False) or not (True)
    @param return: True if the file was renamed (or would have been), False if it was skipped
    """

    if oldbasename == newbasename:
        logging.info("Old filename is same as new filename: skipping file")
        return False

    source_path = os.path.join(dirname, oldbasename)
    target_path = os.path.join(dirname, newbasename)

    if not os.path.isfile(source_path):
        logging.error("file to rename does not exist: [%s]" % source_path)
        return False

    # never overwrite an already existing file
    if os.path.isfile(target_path):
        logging.error("file can't be renamed since new file name already exists: [%s]" % target_path)
        return False

    if not quiet:
        highlighted_name = colorama.Style.BRIGHT + colorama.Fore.GREEN + newbasename + colorama.Style.RESET_ALL
        print(' → ' + highlighted_name)

    logging.debug(' renaming "%s"' % source_path)
    logging.debug(' ⤷ "%s"' % target_path)

    if not dryrun:
        os.rename(source_path, target_path)

    return True
|
||||
|
||||
def build_string_via_indexgroups(self, regex_match, indexgroups):
    """Concatenates groups of a regex match object and literal strings
    as described by the list of indexgroups.

    The items of indexgroups are processed in their order:

    - a string is appended as it is
    - an integer appends the corresponding regex_match.group(); groups
      that are None or empty are ignored
    - a list is processed the same way (recursively), but only if all
      of its integer items refer to groups that are neither None nor
      empty; otherwise the whole list is ignored

    @param regex_match: a regex match object from re.match(REGEX, STRING)
    @param indexgroups: list of strings and integers like [1, '-', 2, '-', 3, 'T', 4, '.', 5, ' foo .png']
    @param return: string containing the concatenated string ("ERROR" if regex_match is empty)

    """

    if not regex_match:
        logging.error('no re.match object found; please check before calling build_string_via_indexgroups()')
        return "ERROR"

    def group_is_filled(index):
        # a group is usable if it took part in the match and is not empty
        content = regex_match.group(index)
        return content is not None and len(content) >= 1

    def render(items):
        pieces = []
        for item in items:
            kind = type(item)
            if kind is str:
                pieces.append(item)
            elif kind is int:
                content = regex_match.group(item)
                # ignore None matches
                if content:
                    pieces.append(content)
            elif kind is list:
                # sub-list: all-or-nothing, depending on its direct integer items
                # (list instead of generator: every group gets looked up)
                if all([group_is_filled(sub) for sub in item if type(sub) is int]):
                    pieces.append(render(item))
        return ''.join(pieces)

    logging.debug('build_string_via_indexgroups: FILENAME: ' + str(regex_match.group(0)))
    logging.debug('build_string_via_indexgroups: GROUPS: ' + str(regex_match.groups()))
    result = render(indexgroups)
    logging.debug('build_string_via_indexgroups: RESULT: ' + result)
    return result
|
||||
|
||||
def NumToMonth(self, month):
    """
    Returns the German name of the month with the given number.

    1 to 12 are mapped to 'Jaenner' to 'Dezember'. 0 is mapped to
    'Dezember' as well (presumably for "month before January"
    calculations; verify against callers).

    @param month: integer used as index into the list of month names
    @param return: string containing the name of the month
    """

    month_names = [
        'Dezember',
        'Jaenner', 'Februar', 'Maerz', 'April', 'Mai', 'Juni',
        'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember',
    ]
    return month_names[month]
|
||||
|
||||
def translate_ORF_quality_string_to_tag(self, quality_string):
    """
    Returns the filetag which corresponds to the quality key string that
    is part of the download file names of the ORF company.

    @param quality_string: quality key such as 'Q4A', 'LOW', 'Q6A', 'Q8C' or 'HD'
    @param return: 'lowquality', 'highquality' or 'UNKNOWNQUALITY' for any other key
    """

    if quality_string in ('Q4A', 'LOW'):
        return 'lowquality'

    if quality_string in ('Q6A', 'Q8C', 'HD'):
        return 'highquality'

    return 'UNKNOWNQUALITY'
|
||||
|
||||
def get_file_size(self, filename):
    """
    Returns the size of a file in bytes.

    A number of file names used by the unit tests are answered with
    hard-coded sizes, so that file sizes can be tested without the files
    existing in the file system. Every other name is looked up via os.stat().

    @param filename: string containing the file name (or path) to look up
    @param return: file size in bytes
    """

    # unit test file names whose size does not matter:
    names_with_irrelevant_size = [
        '20180510T090000 ORF - ZIB - Signation -ORIGINAL- 2018-05-10_0900_tl_02_ZIB-9-00_Signation__13976423__o__1368225677__s14297692_2__WEB03HD_09000305P_09001400P_Q4A.mp4',
        '20180510T090000 ORF - ZIB - Weitere Signale der Entspannung -ORIGINAL- 2018-05-10_0900_tl_02_ZIB-9-00_Weitere-Signale__13976423__o__5968792755__s14297694_4__WEB03HD_09011813P_09020710P_Q4A.mp4',
        '20180520T201500 ORF - Tatort - Tatort_ Aus der Tiefe der Zeit -ORIGINAL- 2018-05-20_2015_in_02_Tatort--Aus-der_____13977411__o__1151703583__s14303062_Q8C.mp4',
        '20180521T193000 ORF - ZIB 1 - Parlament bereitet sich auf EU-Vorsitz vor -ORIGINAL- 2018-05-21_1930_tl_02_ZIB-1_Parlament-berei__13977453__o__277886215b__s14303762_2__WEB03HD_19350304P_19371319P_Q4A.mp4',
        '20180608T193000 ORF - Österreich Heute - Das Magazin - Österreich Heute - Das Magazin -ORIGINAL- 13979231_0007_Q8C.mp4',
        '20190902T220000 ORF - ZIB 2 - Bericht über versteckte ÖVP-Wahlkampfkosten -ORIGINALlow- 2019-09-02_2200_tl_02_ZIB-2_Bericht-ueber-v__14024705__o__71528285d6__s14552793_3__ORF2HD_22033714P_22074303P_Q4A.mp4',
        '20190902T220000 ORF - ZIB 2 - Hinweis _ Verabschiedung -ORIGINALlow- 2019-09-02_2200_tl_02_ZIB-2_Hinweis---Verab__14024705__o__857007705d__s14552799_9__ORF2HD_22285706P_22300818P_Q4A.mp4',
    ]
    if filename in names_with_irrelevant_size:
        # any large number that is above the expected minimum in any case
        return 99999999

    # unit test file names with a specific size (name, size in bytes):
    hardcoded_sizes = [
        # from an actual downloaded file
        ('20180608T170000 ORF - ZIB 17_00 - size okay -ORIGINAL- 2018-06-08_1700_tl__13979222__o__1892278656__s14313181_1__WEB03HD_17020613P_17024324P_Q4A.mp4',
         5017289),
        # manually reduced size from the value of an actual downloaded file
        ('20180608T170000 ORF - ZIB 17_00 - size not okay -ORIGINAL- 2018-06-08_1700_tl__13979222__o__1892278656__s14313181_1__WEB03HD_17020613P_17024324P_Q4A.mp4',
         4217289),
        # from an actual downloaded file
        ('20180608T170000 ORF - ZIB 17_00 - size okay -ORIGINAL- 2018-06-08_1700_tl__13979222__o__1892278656__s14313181_1__WEB03HD_17020613P_17024324P_Q8C.mp4',
         15847932),
        # manually reduced size from the value of an actual downloaded file
        ('20180608T170000 ORF - ZIB 17_00 - size not okay -ORIGINAL- 2018-06-08_1700_tl__13979222__o__1892278656__s14313181_1__WEB03HD_17020613P_17024324P_Q8C.mp4',
         14050000),
        # the remaining sizes are taken from actual files
        ('20180610T000000 ORF - Kleinkunst - Kleinkunst_ Cordoba - Das Rückspiel (2_2) -ORIGINAL- 2018-06-10_0000_sd_06_Kleinkunst--Cor_____13979381__o__1483927235__s14313621_1__ORF3HD_23592020P_00593103P_Q8C.mp4',
         1506829698),
        ('2018-06-14_2105_sd_02_Am-Schauplatz_-_Alles für die Katz-_____13979879__o__1907287074__s14316407_7__WEB03HD_21050604P_21533212P_Q8C.mp4',
         1214980782),
        ('2018-06-14_2155_sd_06_Kottan-ermittelt - Wien Mitte_____13979903__o__1460660672__s14316392_2__ORF3HD_21570716P_23260915P_Q8C.mp4',
         2231522252),
        ('2018-06-14_2330_sd_06_Sommerkabarett - Lukas Resetarits: Schmäh (1 von 2)_____13979992__o__1310584704__s14316464_4__ORF3HD_23301620P_00302415P_Q8C.mp4',
         1506983474),
    ]
    for known_name, known_size in hardcoded_sizes:
        if filename == known_name:
            return known_size

    # not a unit test fixture: ask the file system
    try:
        actual_size = os.stat(filename).st_size
    except OSError:
        error_exit(10, 'get_file_size(): Could not get file size of: ' + filename)
    else:
        return actual_size
|
||||
|
||||
def warn_if_ORF_file_seems_to_small_according_to_duration_and_quality_indicator(self, oldfilename, qualityindicator,
                                                                                start_hrs, start_min, start_sec,
                                                                                end_hrs, end_min, end_sec):
    """
    Plausibility check of the file size against duration and quality indicator.

    NOTE: this check is currently disabled. The function returns immediately
    (see the FIXXME notes below); everything after the early return is kept
    for the time the check gets re-enabled.

    When enabled, the expected minimum size is derived from the detailed
    time-stamp information and tests with a ten minute file:

    | Quality Indicator  | file size | bytes per second |
    |--------------------+-----------+------------------|
    | Q8C = HD           | 240429907 |           400717 |
    | Q6A = high quality | 150198346 |           250331 |
    | Q4A = low quality  |  74992178 |           124987 |

    @param oldfilename: string containing the file name whose size is checked
    @param qualityindicator: string containing the quality key ('Q8C', 'Q6A' or 'Q4A')
    @param start_hrs, start_min, start_sec: start time components (converted via int())
    @param end_hrs, end_min, end_sec: end time components (converted via int())
    @param return: None
    @raises FileSizePlausibilityException: (disabled part only) when the file is
            longer than 120 seconds and smaller than the expected minimum size
    """

    #FIXXME: 2019-08-26: disabled: correct from exception to warning #
    #FIXXME: 2019-09-03: assigned tests also disabled because this function never raises the expected exception
    return

    TOLERANCE_FACTOR = 0.95  # To cover edge cases where a reduced file size is feasible

    file_size = self.get_file_size(oldfilename)

    # both time-stamps are placed on an arbitrary date; only their difference matters
    day_of_end = 1
    if int(end_hrs) < int(start_hrs):
        logging.debug('end hours is less than begin hours, adding a day-change for calculating duration')
        day_of_end = 2

    end = datetime.datetime(1980, 5, day_of_end, int(end_hrs), int(end_min), int(end_sec))
    start = datetime.datetime(1980, 5, 1, int(start_hrs), int(start_min), int(start_sec))
    duration = end - start
    duration_in_seconds = duration.seconds
    assert(duration_in_seconds > 0)

    # bytes per second are the rounded-down values of the table in the docstring
    if qualityindicator == 'Q8C':
        minimum_expected_file_size = 400000 * duration_in_seconds * TOLERANCE_FACTOR
    elif qualityindicator == 'Q6A':
        minimum_expected_file_size = 250000 * duration_in_seconds * TOLERANCE_FACTOR
    elif qualityindicator == 'Q4A':
        minimum_expected_file_size = 125000 * duration_in_seconds * TOLERANCE_FACTOR
    else:
        # logging.warn() is a deprecated alias that was removed in Python 3.13
        logging.warning('Unknown quality indicator prevents file size check: ' + qualityindicator)
        return

    ## additional check for minimum duration because small videos often produced wrong error messages:
    if duration_in_seconds > 120 and file_size < minimum_expected_file_size:
        print('\n → ' + colorama.Style.BRIGHT + colorama.Fore.RED +
              'ERROR: file size seems to be too small for the given duration ' +
              'and quality indicator found (download aborted?): \n' +
              ' ' * 10 + 'file size: ' + "{:,}".format(file_size) + ' Bytes\n' +
              ' ' * 10 + 'expected minimum size: ' + "{:,}".format(minimum_expected_file_size) + ' Bytes\n' +
              ' ' * 10 + 'duration: ' + str('%.1f' % (duration_in_seconds/60)) + ' minutes\n' +
              ' ' * 10 + 'quality: ' + qualityindicator + '\n' +
              ' ' * 10 + 'file name: ' + oldfilename + colorama.Style.RESET_ALL + '\n')
        raise(FileSizePlausibilityException('file size is not plausible (too small)'))
    else:
        logging.debug('warn_if_ORF_file_seems_to_small_according_to_duration_and_quality_indicator: ' +
                      'file size (' + "{:,}".format(file_size) +
                      ') is plausible compared to expected minimum (' +
                      "{:,}".format(minimum_expected_file_size) +
                      ')')
|
||||
|
||||
|
||||
def move_to_success_dir(dirname, newfilename):
|
||||
"""
|
||||
|
|
|
|||
Loading…
Reference in a new issue