forked from Github/guess-filename.py
added plausibility size checks for ORF
This commit is contained in:
parent
f079077dc7
commit
890e70785f
2 changed files with 134 additions and 6 deletions
|
|
@ -1,6 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
PROG_VERSION = u"Time-stamp: <2018-06-09 16:00:48 vk>"
|
||||
PROG_VERSION = u"Time-stamp: <2018-06-09 18:07:40 vk>"
|
||||
|
||||
|
||||
# TODO:
|
||||
|
|
@ -21,6 +21,7 @@ import time
|
|||
import logging
|
||||
from optparse import OptionParser
|
||||
import colorama
|
||||
import datetime # for calculating duration of chunks
|
||||
|
||||
try:
|
||||
from fuzzywuzzy import fuzz # for fuzzy comparison of strings
|
||||
|
|
@ -100,6 +101,18 @@ def error_exit(errorcode, text):
|
|||
sys.exit(errorcode)
|
||||
|
||||
|
||||
class FileSizePlausibilityException(Exception):
|
||||
"""
|
||||
Exception for file sizes being to small according to their duration and quality indicator
|
||||
"""
|
||||
|
||||
def __init__(self, message):
|
||||
self.value = message
|
||||
|
||||
def __str__(self):
|
||||
return repr(self.value)
|
||||
|
||||
|
||||
class GuessFilename(object):
|
||||
"""
|
||||
Contains methods of the guess filename domain
|
||||
|
|
@ -201,7 +214,7 @@ class GuessFilename(object):
|
|||
# with the quality indicator Q4A or Q8C when used with the ORF sender file format.
|
||||
MEDIATHEKVIEW_LONG_WITH_DETAILED_TIMESTAMPS_REGEX = re.compile(MEDIATHEKVIEW_SHORT_REGEX_STRING +
|
||||
'.+__o__(\d+b?)__s(\d+)_' + # e.g., "2018-05-10_0900_tl_02_ZIB-9-00_Signation__13976423__o__1368225677__s14297692"
|
||||
'(.+_(' + TIMESTAMP_REGEX + ').+P_(' + TIMESTAMP_REGEX + ').+P_)?' + # OPTIONAL: time-stamps of chunks: "_2__WEB03HD_09000305P_09001400P"
|
||||
'(.+_(' + TIMESTAMP_REGEX + ').+P_(' + TIMESTAMP_REGEX + ').+P_)' + # OPTIONAL: time-stamps of chunks: "_2__WEB03HD_09000305P_09001400P"
|
||||
'(Q4A|Q8C).mp4', re.UNICODE) # "Q4A.mp4" for lowquality or "Q8C.mp4" for highquality
|
||||
|
||||
# C112345678901EUR20150930001.pdf -> 2015-09-30 Bank Austria Kontoauszug 2017-001 12345678901.pdf
|
||||
|
|
@ -541,6 +554,92 @@ class GuessFilename(object):
|
|||
return 'UNKNOWNQUALITY'
|
||||
|
||||
|
||||
def get_file_size(self, filename):
|
||||
"""
|
||||
A simple wrapper to determine file sizes.
|
||||
|
||||
For some hard-coded file names, a hard-coded file size is returned. This enables
|
||||
unit-testing for file sizes that do not exist in the file system.
|
||||
"""
|
||||
|
||||
# these are the hard-coded sizes for unit test cases:
|
||||
if filename in ['20180510T090000 ORF - ZIB - Signation -ORIGINAL- 2018-05-10_0900_tl_02_ZIB-9-00_Signation__13976423__o__1368225677__s14297692_2__WEB03HD_09000305P_09001400P_Q4A.mp4',
|
||||
'20180510T090000 ORF - ZIB - Weitere Signale der Entspannung -ORIGINAL- 2018-05-10_0900_tl_02_ZIB-9-00_Weitere-Signale__13976423__o__5968792755__s14297694_4__WEB03HD_09011813P_09020710P_Q4A.mp4',
|
||||
'20180520T201500 ORF - Tatort - Tatort_ Aus der Tiefe der Zeit -ORIGINAL- 2018-05-20_2015_in_02_Tatort--Aus-der_____13977411__o__1151703583__s14303062_Q8C.mp4',
|
||||
'20180521T193000 ORF - ZIB 1 - Parlament bereitet sich auf EU-Vorsitz vor -ORIGINAL- 2018-05-21_1930_tl_02_ZIB-1_Parlament-berei__13977453__o__277886215b__s14303762_2__WEB03HD_19350304P_19371319P_Q4A.mp4',
|
||||
'20180608T193000 ORF - Österreich Heute - Das Magazin - Österreich Heute - Das Magazin -ORIGINAL- 13979231_0007_Q8C.mp4']:
|
||||
# don't care about file sizes, return a high number that is abote the expected minimum in any case:
|
||||
return 99999999
|
||||
elif filename == '20180608T170000 ORF - ZIB 17_00 - size okay -ORIGINAL- 2018-06-08_1700_tl__13979222__o__1892278656__s14313181_1__WEB03HD_17020613P_17024324P_Q4A.mp4':
|
||||
return 5017289 # from an actual downloaded file
|
||||
elif filename == '20180608T170000 ORF - ZIB 17_00 - size not okay -ORIGINAL- 2018-06-08_1700_tl__13979222__o__1892278656__s14313181_1__WEB03HD_17020613P_17024324P_Q4A.mp4':
|
||||
return 4217289 # manually reduced size from the value of an actual downloaded file
|
||||
elif filename == '20180608T170000 ORF - ZIB 17_00 - size okay -ORIGINAL- 2018-06-08_1700_tl__13979222__o__1892278656__s14313181_1__WEB03HD_17020613P_17024324P_Q8C.mp4':
|
||||
return 15847932 # from an actual downloaded file
|
||||
elif filename == '20180608T170000 ORF - ZIB 17_00 - size not okay -ORIGINAL- 2018-06-08_1700_tl__13979222__o__1892278656__s14313181_1__WEB03HD_17020613P_17024324P_Q8C.mp4':
|
||||
return 14050000 # manually reduced size from the value of an actual downloaded file
|
||||
|
||||
try:
|
||||
return os.stat(filename).st_size
|
||||
except OSError:
|
||||
error_exit(10, 'get_file_size(): Could not get file size of: ' + filename)
|
||||
|
||||
|
||||
def warn_if_ORF_file_seems_to_small_according_to_duration_and_quality_indicator(self, oldfilename, qualityindicator,
|
||||
start_hrs, start_min, start_sec,
|
||||
end_hrs, end_min, end_sec):
|
||||
"""
|
||||
Launches a warning if the expected size differs from the actual file size.
|
||||
|
||||
Expected size is derived from the detailed time-stamp information
|
||||
and tests with a ten minute file:
|
||||
|
||||
| Quality Indicator | file size | bytes per second |
|
||||
|-------------------------+-----------+------------------|
|
||||
| Q8C = HD | 240429907 | 400717 |
|
||||
| Q6A = high quality | 150198346 | 250331 |
|
||||
| Q4A = low quality | 74992178 | 124987 |
|
||||
"""
|
||||
|
||||
TOLERANCE_FACTOR = 0.95 # To cover edge cases where a reduced file size is feasible
|
||||
|
||||
file_size = self.get_file_size(oldfilename)
|
||||
|
||||
start = datetime.datetime(1980, 5, 1, int(start_hrs), int(start_min), int(start_sec))
|
||||
end = datetime.datetime(1980, 5, 1, int(end_hrs), int(end_min), int(end_sec))
|
||||
duration = end - start
|
||||
duration_in_seconds = duration.seconds
|
||||
assert(duration_in_seconds > 0)
|
||||
|
||||
if qualityindicator == 'Q8C':
|
||||
minimum_expected_file_size = 400000 * duration_in_seconds * TOLERANCE_FACTOR
|
||||
elif qualityindicator == 'Q6A':
|
||||
minimum_expected_file_size = 250000 * duration_in_seconds * TOLERANCE_FACTOR
|
||||
elif qualityindicator == 'Q4A':
|
||||
minimum_expected_file_size = 125000 * duration_in_seconds * TOLERANCE_FACTOR
|
||||
else:
|
||||
logging.warn('Unknown quality indicator prevents file size check: ' + qualityindicator)
|
||||
return
|
||||
|
||||
# import pdb; pdb.set_trace()
|
||||
if file_size < minimum_expected_file_size:
|
||||
print('\n → ' + colorama.Style.BRIGHT + colorama.Fore.RED +
|
||||
'ERROR: file size seems to be too small for the given duration ' +
|
||||
'and quality indicator found (download aborted?): \n' +
|
||||
' ' * 10 + 'file size: ' + "{:,}".format(file_size) + ' Bytes\n' +
|
||||
' ' * 10 + 'expected minimum size: ' + "{:,}".format(minimum_expected_file_size) + ' Bytes\n' +
|
||||
' ' * 10 + 'duration: ' + str('%.1f'%(duration_in_seconds/60)) + ' minutes\n' +
|
||||
' ' * 10 + 'quality: ' + qualityindicator + '\n' +
|
||||
' ' * 10 + 'file name: ' + oldfilename + colorama.Style.RESET_ALL + '\n')
|
||||
raise(FileSizePlausibilityException('file size is not plausible (too small)'))
|
||||
else:
|
||||
logging.debug('warn_if_ORF_file_seems_to_small_according_to_duration_and_quality_indicator: ' +
|
||||
'file size (' + "{:,}".format(file_size) +
|
||||
') is plausible compared to expected minimum (' +
|
||||
"{:,}".format(minimum_expected_file_size) +
|
||||
')')
|
||||
|
||||
|
||||
def derive_new_filename_from_old_filename(self, oldfilename):
|
||||
"""
|
||||
Analyses the old filename and returns a new one if feasible.
|
||||
|
|
@ -601,7 +700,20 @@ class GuessFilename(object):
|
|||
# where the files do not exist
|
||||
pass
|
||||
|
||||
qualitytag = self.translate_ORF_quality_string_to_tag(regex_match.group(len(regex_match.groups())).upper())
|
||||
qualityindicator = regex_match.group(len(regex_match.groups())).upper()
|
||||
qualitytag = self.translate_ORF_quality_string_to_tag(qualityindicator)
|
||||
start_hrs = regex_match.group(15)
|
||||
start_min = regex_match.group(16)
|
||||
start_sec = regex_match.group(17)
|
||||
end_hrs = regex_match.group(20)
|
||||
if end_hrs < start_hrs:
|
||||
# hack to overcome the midnight issue where end hours is less than begin hours:
|
||||
end_hrs = 24 + end_hrs
|
||||
end_min = regex_match.group(21)
|
||||
end_sec = regex_match.group(22)
|
||||
self.warn_if_ORF_file_seems_to_small_according_to_duration_and_quality_indicator(oldfilename, qualityindicator,
|
||||
start_hrs, start_min, start_sec,
|
||||
end_hrs, end_min, end_sec)
|
||||
|
||||
if regex_match.group(13):
|
||||
# the file name contained the optional chunk time-stamp(s)
|
||||
|
|
@ -1118,8 +1230,11 @@ def main():
|
|||
for filename in files:
|
||||
if filename.__class__ == str:
|
||||
filename = str(filename)
|
||||
if not guess_filename.handle_file(filename, options.dryrun):
|
||||
filenames_could_not_be_found += 1
|
||||
try:
|
||||
if not guess_filename.handle_file(filename, options.dryrun):
|
||||
filenames_could_not_be_found += 1
|
||||
except:
|
||||
error_exit(99, 'An exception occurred. Aborting further file processing.')
|
||||
|
||||
if not options.quiet:
|
||||
# add empty line for better screen output readability
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8; mode: python; -*-
|
||||
# Time-stamp: <2018-06-09 15:41:49 vk>
|
||||
# Time-stamp: <2018-06-09 17:46:00 vk>
|
||||
|
||||
import unittest
|
||||
import logging
|
||||
|
|
@ -9,6 +9,7 @@ import os
|
|||
import os.path
|
||||
import sys
|
||||
from guessfilename import GuessFilename
|
||||
from guessfilename import FileSizePlausibilityException
|
||||
|
||||
|
||||
class TestGuessFilename(unittest.TestCase):
|
||||
|
|
@ -158,6 +159,18 @@ class TestGuessFilename(unittest.TestCase):
|
|||
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename("20180608T193000 ORF - Österreich Heute - Das Magazin - Österreich Heute - Das Magazin -ORIGINAL- 13979231_0007_Q8C.mp4"),
|
||||
"2018-06-08T19.30.00 ORF - Österreich Heute - Das Magazin - Österreich Heute - Das Magazin -- highquality.mp4")
|
||||
|
||||
# plausibility checks of file sizes: report plausible sizes
|
||||
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename("20180608T170000 ORF - ZIB 17_00 - size okay -ORIGINAL- 2018-06-08_1700_tl__13979222__o__1892278656__s14313181_1__WEB03HD_17020613P_17024324P_Q4A.mp4"),
|
||||
"2018-06-08T17.02.06 ORF - ZIB 17 00 - size okay -- lowquality.mp4")
|
||||
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename("20180608T170000 ORF - ZIB 17_00 - size okay -ORIGINAL- 2018-06-08_1700_tl__13979222__o__1892278656__s14313181_1__WEB03HD_17020613P_17024324P_Q8C.mp4"),
|
||||
"2018-06-08T17.02.06 ORF - ZIB 17 00 - size okay -- highquality.mp4")
|
||||
|
||||
# plausibility checks of file sizes: report non-plausible sizes
|
||||
with self.assertRaises(FileSizePlausibilityException, message='file size is not plausible (too small)'):
|
||||
self.guess_filename.derive_new_filename_from_old_filename("20180608T170000 ORF - ZIB 17_00 - size not okay -ORIGINAL- 2018-06-08_1700_tl__13979222__o__1892278656__s14313181_1__WEB03HD_17020613P_17024324P_Q4A.mp4")
|
||||
with self.assertRaises(FileSizePlausibilityException, message='file size is not plausible (too small)'):
|
||||
self.guess_filename.derive_new_filename_from_old_filename("20180608T170000 ORF - ZIB 17_00 - size not okay -ORIGINAL- 2018-06-08_1700_tl__13979222__o__1892278656__s14313181_1__WEB03HD_17020613P_17024324P_Q8C.mp4")
|
||||
|
||||
# self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename(""),
|
||||
# "")
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue