moving from hard coded RegEx index to named groups (ongoing)

This commit is contained in:
Karl Voit 2020-02-29 22:56:50 +01:00
parent 6d043c8d2e
commit 9b88d4852b
2 changed files with 97 additions and 35 deletions

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
PROG_VERSION = u"Time-stamp: <2020-02-29 19:14:00 vk>"
PROG_VERSION = u"Time-stamp: <2020-02-29 22:56:20 vk>"
# TODO:
@ -139,30 +139,37 @@ class GuessFilename(object):
DATETIME_DURATION_REGEX = DATETIMESTAMP_REGEX + '(--?' + DATETIMESTAMP2_REGEX + ')?'
ISO_NAME_TAGS_EXTENSION_REGEX = re.compile('((?P<daytimeduration>' + DATETIME_DURATION_REGEX + ')[ -_])?(?P<description>.+?)(' + FILENAME_TAG_SEPARATOR + '(?P<tags>(\w+[' + BETWEEN_TAG_SEPARATOR + ']?)+))?(\.(?P<extension>\w+))?$', re.UNICODE)
ISO_NAME_TAGS_EXTENSION_REGEX = re.compile('((?P<daytimeduration>' + DATETIME_DURATION_REGEX + \
')[ -_])?(?P<description>.+?)(' + FILENAME_TAG_SEPARATOR + \
'(?P<tags>(\w+[' + BETWEEN_TAG_SEPARATOR + \
']?)+))?(\.(?P<extension>\w+))?$', re.UNICODE)
RAW_EURO_CHARGE_REGEX = '(?P<charge>\d+([,.]\d+)?)[-_ ]?(EUR|€)'
EURO_CHARGE_REGEX = re.compile('^(.+[-_ ])?' + RAW_EURO_CHARGE_REGEX + '([-_ .].+)?$', re.UNICODE)
# Screenshot_2017-11-29_10-32-12.png
# Screenshot_2017-11-07_07-52-59 my description.png
MISC_SCREENSHOT_REGEX = re.compile('Screenshot_' + DATESTAMP_REGEX + '[-_T]' + TIMESTAMP_REGEX + '(?P<description>.*)?.(?P<extension>png|jpg)', re.UNICODE)
MISC_SCREENSHOT_REGEX = re.compile('Screenshot_' + DATESTAMP_REGEX + '[-_T]' + TIMESTAMP_REGEX + \
'(?P<description>.*)?\.(?P<extension>png|jpg)', re.UNICODE)
# Firefox_Screenshot_2018-05-03T20-07-14.972Z.png
EASY_SCREENSHOT_REGEX = re.compile('Firefox_Screenshot_' + DATESTAMP_REGEX + '[-_T]' + TIMESTAMP_REGEX + '\.\d{3}Z(.*).(?P<extension>png|jpg)', re.UNICODE)
EASY_SCREENSHOT_REGEX = re.compile('Firefox_Screenshot_' + DATESTAMP_REGEX + '[-_T]' + \
TIMESTAMP_REGEX + '\.\d{3}Z(.*)\.(?P<extension>png|jpg)', re.UNICODE)
# 2017-12-07_09-23_Thu Went for a walk .gpx
OSMTRACK_REGEX = re.compile(DATESTAMP_REGEX + '[T_]?' + TIMESTAMP_REGEX + '(_' + WEEKDAYS_TLA_REGEX + ')?([ _](?P<description>.*))?\.(?P<extension>.+)', re.UNICODE)
OSMTRACK_REGEX = re.compile(DATESTAMP_REGEX + '[T_]?' + TIMESTAMP_REGEX + '(_' + \
WEEKDAYS_TLA_REGEX + ')?([ _](?P<description>.*))?\.(?P<extension>.+)', re.UNICODE)
SIGNAL_REGEX = re.compile('signal-(attachment-)?' + DATESTAMP_REGEX + '-' + TIMESTAMP_REGEX + '(.+)?(\..+)', re.UNICODE)
SIGNAL_REGEX = re.compile('signal-(attachment-)?' + DATESTAMP_REGEX + '-' + \
TIMESTAMP_REGEX + '(?P<description>.+)?(\.(?P<extension>.+))', re.UNICODE)
IMG_REGEX = re.compile('IMG_' + DATESTAMP_REGEX + '_' + TIMESTAMP_REGEX + '(_Bokeh)?(.+)?.jpg', re.UNICODE)
IMG_INDEXGROUPS_NORMAL = [1, '-', 2, '-', 3, 'T', 4, '.', 5, ['.', 7], 9, '.jpg']
IMG_INDEXGROUPS_BOKEH = [1, '-', 2, '-', 3, 'T', 4, '.', 5, ['.', 7], ' Bokeh', 9, '.jpg']
VID_REGEX = re.compile('VID_' + DATESTAMP_REGEX + '_' + TIMESTAMP_REGEX + '(.+)?.mp4', re.UNICODE)
VID_INDEXGROUPS = [1, '-', 2, '-', 3, 'T', 4, '.', 5, ['.', 7], 8, '.mp4']
IMG_REGEX = re.compile('IMG_' + DATESTAMP_REGEX + '_' + TIMESTAMP_REGEX + \
'(?P<bokeh>_Bokeh)?(?P<description>.+)?\.jpg', re.UNICODE)
VID_REGEX = re.compile('VID_' + DATESTAMP_REGEX + '_' + TIMESTAMP_REGEX + \
'(?P<description>.+)?\.(?P<extension>mp4)', re.UNICODE)
NEWSPAPER1_REGEX = re.compile('(.+) \((\d{2})\.(\d{2})\.(\d{4})\)(.*)(.pdf)', re.UNICODE)
# 2019-12-04: "Die Presse (31.10.2019) - Unknown.pdf" -> "2019-10-31 Die Presse.pdf"
NEWSPAPER1_REGEX = re.compile('(?P<description>.+) \((?P<day>\d{2})\.(?P<month>\d{2})\.(?P<year>\d{4})\)(?P<misc>.*)\.(?P<extension>pdf)', re.UNICODE)
# OLD # # MediathekView: Settings > modify Set > Targetfilename: "%DT%d h%i %s %t - %T - %N.mp4" (limited to 120 characters)
# OLD # # results in files like:
@ -256,11 +263,11 @@ class GuessFilename(object):
BANKAUSTRIA_BANK_TRANSACTIONS_REGEX = re.compile('^' + DATETIMESTAMP_REGEX + '_IKS-(\d{29}).csv$', re.UNICODE)
BANKAUSTRIA_BANK_TRANSACTIONS_INDEXGROUPS = [1, ' Bank Austria Umsatzliste IKS-', 4, '.csv']
RECORDER_REGEX = re.compile('rec_([12]\d{3})([01]\d)([0123]\d)-([012]\d)([012345]\d)(.+)?.(wav|mp3)')
RECORDER_REGEX = re.compile('rec_' + DATESTAMP_REGEX + '-' + TIMESTAMP_REGEX + '(?P<description>.+?)?\.(?P<extension>wav|mp3)')
# modet_2018-03-27_16-10.mkv
# modet_2018-03-27_17-44-1.mkv
MODET_REGEX = re.compile('modet_(' + DATESTAMP_REGEX + ')_' + TIMESTAMP_REGEX + '(.*).mkv')
MODET_REGEX = re.compile('modet_(' + DATESTAMP_REGEX + ')_' + TIMESTAMP_REGEX + '(?P<description>.*).mkv')
# 20200224-0914_Foo_bar.wav
SMARTREC_REGEX = re.compile('(?P<DAY>' + DATESTAMP_REGEX + ')-' + TIMESTAMP_REGEX + '(_(?P<description>.+))?.(?P<extension>wav|mp3)')
@ -476,14 +483,18 @@ class GuessFilename(object):
# digital camera images: IMG_20161014_214404 foo bar.jpg -> 2016-10-14T21.44.04 foo bar.jpg OR
regex_match = re.match(self.IMG_REGEX, oldfilename)
if regex_match:
if regex_match.group(8) == '_Bokeh':
return self.build_string_via_indexgroups(regex_match, self.IMG_INDEXGROUPS_BOKEH)
if regex_match.group('bokeh') and regex_match.group('description'):
return self.get_datetime_string_from_named_groups(regex_match) + ' Bokeh' + regex_match.group('description') + '.jpg'
elif not regex_match.group('bokeh') and regex_match.group('description'):
return self.get_datetime_string_from_named_groups(regex_match) + regex_match.group('description') + '.jpg'
elif regex_match.group('bokeh') and not regex_match.group('description'):
return self.get_datetime_string_from_named_groups(regex_match) + ' Bokeh' + '.jpg'
else:
return self.build_string_via_indexgroups(regex_match, self.IMG_INDEXGROUPS_NORMAL)
# VID_20170105_173104.mp4 -> 2017-01-05T17.31.04.mp4
return self.get_datetime_string_from_named_groups(regex_match) + '.jpg'
# VID_20170105_173104.mp4 -> 2017-01-05T17.31.04.mp4
regex_match = re.match(self.VID_REGEX, oldfilename)
if regex_match:
return self.build_string_via_indexgroups(regex_match, self.VID_INDEXGROUPS)
return self.get_datetime_description_extension_filename(regex_match, replace_description_underscores=True)
# 2018-04-01:
# signal-2018-03-08-102332.jpg → 2018-03-08T10.23.32.jpg
@ -491,20 +502,17 @@ class GuessFilename(object):
# signal-attachment-2019-11-23-090716_001.jpeg -> 2019-11-23T09.07.16 001.jpeg
regex_match = re.match(self.SIGNAL_REGEX, oldfilename)
if regex_match:
if regex_match.group(9):
result = self.build_string_via_indexgroups(regex_match, [2, '-', 3, '-', 4, 'T', 5, '.', 6, '.', 7, 9, 10])
else:
result = self.build_string_via_indexgroups(regex_match, [2, '-', 3, '-', 4, 'T', 5, '.', 6, '.', 7, '.jpg'])
return result
return self.get_datetime_description_extension_filename(regex_match, replace_description_underscores=True)
# 2018-03-27:
# modet_2018-03-27_16-10.mkv
# modet_2018-03-27_17-44-1.mkv
regex_match = re.match(self.MODET_REGEX, oldfilename)
if regex_match:
result = self.build_string_via_indexgroups(regex_match, [1, 'T', 2, '.', 3, ' modet ', 6, '.mkv'])
return result
if regex_match.group('description'):
return self.get_datetime_string_from_named_groups(regex_match) + ' modet ' + regex_match.group('description') + '.mkv'
else:
return self.get_datetime_string_from_named_groups(regex_match) + ' modet' + '.mkv'
# 2017-11-30:
# rec_20171129-0902 A nice recording .wav -> 2017-11-29T09.02 A nice recording.wav
@ -513,10 +521,7 @@ class GuessFilename(object):
# rec_20171129-0902.mp3 -> 2017-11-29T09.02.mp3
regex_match = re.match(self.RECORDER_REGEX, oldfilename)
if regex_match:
result = self.build_string_via_indexgroups(regex_match, [1, '-', 2, '-', 3, 'T', 4, '.', 5])
if regex_match.group(6):
result += ' ' + regex_match.group(6).strip()
return result + '.' + regex_match.group(7)
return self.get_datetime_description_extension_filename(regex_match, replace_description_underscores=True)
# 2019-04-01 oekostrom AG - Teilbetragsrechnung Stromverbrauch 54 EUR -- scan bill.pdf
if 'teilbetragsrechnung' in oldfilename.lower() and \
@ -639,7 +644,7 @@ class GuessFilename(object):
# 2019-12-04: NEWSPAPER1_REGEX such as : "Die Presse (31.10.2019) - Unknown.pdf" -> "2019-10-31 Die Presse.pdf"
regex_match = re.match(self.NEWSPAPER1_REGEX, oldfilename)
if regex_match:
return self.build_string_via_indexgroups(regex_match, [4, '-', 3, '-', 2, ' ', 1, 6])
return self.get_date_description_extension_filename(regex_match, replace_description_underscores=True)
# 20200224-0914_Foo_bar.wav
@ -1185,6 +1190,15 @@ class GuessFilename(object):
return regex_match.group('year') + '-' + regex_match.group('month') + '-' + regex_match.group('day') + 'T' + \
regex_match.group('hour') + '.' + regex_match.group('minute') + second
def get_date_string_from_named_groups(self, regex_match):
    """Compose an ISO date string "YYYY-MM-DD" from a regex match.

    Only the named groups 'year', 'month' and 'day' of regex_match are
    read; no time-of-day groups are used by this function.

    Returns the three group values joined by '-'.

    Raises AssertionError when regex_match is falsy or when one of the
    three groups did not take part in the match (or matched an empty
    string). These checks are plain assert statements, so they are
    skipped when Python runs with -O.
    """
    assert regex_match, 'regex_match must be a successful match object'
    # Check the groups in the same order as before: day, month, year.
    for group_name in ('day', 'month', 'year'):
        assert regex_match.group(group_name), \
            'named group "' + group_name + '" is missing from the match'
    return '-'.join([regex_match.group('year'),
                     regex_match.group('month'),
                     regex_match.group('day')])
def get_datetime_description_extension_filename(self, regex_match, replace_description_underscores=False):
"""
When a regex_match has matching groups for datetime elements, an optional description
@ -1193,7 +1207,7 @@ class GuessFilename(object):
if regex_match.group('description'):
if replace_description_underscores:
return self.get_datetime_string_from_named_groups(regex_match) + ' ' + \
regex_match.group('description').strip().replace('_', ' ') + '.' + \
regex_match.group('description').strip().replace('_', ' ').strip() + '.' + \
regex_match.group('extension')
else:
return self.get_datetime_string_from_named_groups(regex_match) + ' ' + \
@ -1201,6 +1215,22 @@ class GuessFilename(object):
else:
return self.get_datetime_string_from_named_groups(regex_match) + '.' + regex_match.group('extension')
def get_date_description_extension_filename(self, regex_match, replace_description_underscores=False):
    """Compose a file name of pattern "YYYY-MM-DD( description).extension".

    regex_match must provide the named groups needed by
    get_date_string_from_named_groups() plus the named groups
    'description' (may be unmatched or empty) and 'extension'.

    The description is stripped of surrounding whitespace. When
    replace_description_underscores is True, underscores within the
    description are turned into spaces and the result is stripped again.

    If nothing is left of the description after this clean-up, the
    description part (including its separating space) is omitted, so
    the result never contains a dangling space before the extension.
    """
    date_string = self.get_date_string_from_named_groups(regex_match)
    extension = regex_match.group('extension')

    # An unmatched optional group yields None; treat it like an empty description.
    description = (regex_match.group('description') or '').strip()
    if replace_description_underscores:
        # Leading/trailing underscores become spaces, hence the second strip().
        description = description.replace('_', ' ').strip()

    if description:
        return date_string + ' ' + description + '.' + extension
    return date_string + '.' + extension
def build_string_via_indexgroups(self, regex_match, indexgroups):
"""This function takes a regex_match object and concatenates its
groups. It does this by traversing the list of indexgroups. If

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python; -*-
# Time-stamp: <2020-02-29 18:31:58 vk>
# Time-stamp: <2020-02-29 22:48:30 vk>
import unittest
import logging
@ -927,6 +927,10 @@ class TestGuessFilename(unittest.TestCase):
'2019-01-18T13.39.28 Bokeh.jpg')
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('IMG_20190118_133928_Bokeh This is a note.jpg'),
'2019-01-18T13.39.28 Bokeh This is a note.jpg')
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('VID_20170105_173104.mp4'),
'2017-01-05T17.31.04.mp4')
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('VID_20170105_173104 foo bar.mp4'),
'2017-01-05T17.31.04 foo bar.mp4')
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('2019-10-10 a file exported by Boox Max 2-Exported.pdf'),
'2019-10-10 a file exported by Boox Max 2 -- notes.pdf')
@ -935,7 +939,6 @@ class TestGuessFilename(unittest.TestCase):
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('2019-10-10 a file exported by Boox Max 2 -- draft-Exported.pdf'),
'2019-10-10 a file exported by Boox Max 2 -- draft notes.pdf')
# Smartrecorder
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('20190512-1125_Recording_1.wav'),
'2019-05-12T11.25 Recording 1.wav')
@ -946,6 +949,25 @@ class TestGuessFilename(unittest.TestCase):
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('20190512-1125.mp3'),
'2019-05-12T11.25.mp3')
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('Die Presse (31.10.2019) - Unknown.pdf'),
'2019-10-31 Die Presse.pdf')
# signal-2018-03-08-102332.jpg → 2018-03-08T10.23.32.jpg
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('signal-2018-03-08-102332.jpg'),
'2018-03-08T10.23.32.jpg')
# signal-2018-03-08-102332 foo bar.jpg → 2018-03-08T10.23.32 foo bar.jpg
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('signal-2018-03-08-102332 foo bar.jpg'),
'2018-03-08T10.23.32 foo bar.jpg')
# signal-attachment-2019-11-23-090716_001.jpeg -> 2019-11-23T09.07.16 001.jpeg
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('signal-attachment-2019-11-23-090716_001.jpeg'),
'2019-11-23T09.07.16 001.jpeg')
# modet_2018-03-27_16-10.mkv
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('modet_2018-03-27_16-10.mkv'),
'2018-03-27T16.10 modet.mkv')
# modet_2018-03-27_17-44-1.mkv
self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename('modet_2018-03-27_17-44-1.mkv'),
'2018-03-27T17.44 modet -1.mkv')
# self.assertEqual(self.guess_filename.derive_new_filename_from_old_filename(''),
# '')
@ -980,6 +1002,16 @@ class TestGuessFilename(unittest.TestCase):
replace_description_underscores=True),
'2020-02-29T15.07.52 with seconds.png')
def test_get_date_description_extension_filename(self):
    """Check the composed "YYYY-MM-DD description.extension" file names."""
    iso_regex = self.guess_filename.ISO_NAME_TAGS_EXTENSION_REGEX
    compose = self.guess_filename.get_date_description_extension_filename

    # assertEqual is required here: assertTrue(value, expected) would use
    # the expected string merely as failure message and pass for any
    # non-empty result.
    regex_match = re.match(iso_regex, '2020-02-29 with seconds.png')
    self.assertEqual(compose(regex_match), '2020-02-29 with seconds.png')

    regex_match = re.match(iso_regex, '2020-02-29_with_seconds.png')
    self.assertEqual(compose(regex_match, replace_description_underscores=True),
                     '2020-02-29 with seconds.png')
def test_contains_one_of(self):
self.assertTrue(self.guess_filename.contains_one_of("foo bar baz", ['foo']))