From 8f0a456222fffefb2fed2eb2c0cabfac84430308 Mon Sep 17 00:00:00 2001 From: Karl Voit Date: Sun, 18 Mar 2018 15:03:37 +0100 Subject: [PATCH] populating cache_of_files_with_metadata + tag group statistics when gardening --- filetags/__init__.py | 238 ++++++++++++++++++++++++++++++++++++++++--- tests/unit_tests.py | 18 +++- 2 files changed, 243 insertions(+), 13 deletions(-) diff --git a/filetags/__init__.py b/filetags/__init__.py index d7470d2..98a4948 100755 --- a/filetags/__init__.py +++ b/filetags/__init__.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -PROG_VERSION = "Time-stamp: <2018-02-25 15:24:04 vk>" +PROG_VERSION = "Time-stamp: <2018-03-18 14:59:48 vk>" # TODO: # - fix parts marked with «FIXXME» @@ -88,7 +88,8 @@ else: max_file_length = 0 # will be set after iterating over source files182 -unique_tags = [['teststring1', 'teststring2']] # list of list which contains tags that are mutually exclusive +UNIQUE_TAG_TESTSTRINGS = ['teststring1', 'teststring2'] +unique_tags = [UNIQUE_TAG_TESTSTRINGS] # list of list which contains tags that are mutually exclusive # Note: u'teststring1' and u'teststring2' are hard-coded for testing purposes. # You might delete them if you don't use my unit test suite. @@ -139,7 +140,10 @@ FILE_WITH_EXTENSION_REGEX = re.compile("(.*)\.(.*)$") FILE_WITH_EXTENSION_REGEX_FILENAME_INDEX = 1 FILE_WITH_EXTENSION_REGEX_EXTENSION_INDEX = 2 +YYYY_MM_DD_PATTERN = re.compile('^(\d{4,4})-([01]\d)-([0123]\d)[- _T]') + cache_of_tags_by_folder = {} +cache_of_files_with_metadata = {} # dict of big list of dicts: 'filename', 'path' and other metadata controlled_vocabulary_filename = '' list_of_symlink_directories = [] @@ -350,6 +354,65 @@ def extract_tags_from_filename(filename): return components.group(FILE_WITH_TAGS_REGEX_TAGLIST_INDEX).split(BETWEEN_TAG_SEPARATOR) +def extract_tags_from_path(path): + """ + Returns list of all tags contained within the absolute path that may contain + directories and an optional file. If no tag is found, return empty list. + + @param path: an unicode string containing a path + @param return: list of tags + """ + + def splitall(path): + """ + Snippet from https://www.safaribooksonline.com/library/view/python-cookbook/0596001673/ch04s16.html + + >>> splitall('a/b/c') + ['a', 'b', 'c'] + >>> splitall('/a/b/c/') + ['/', 'a', 'b', 'c', ''] + >>> splitall('/') + ['/'] + >>> splitall('C:') + ['C:'] + >>> splitall('C:\\') + ['C:\\'] + >>> splitall('C:\\a') + ['C:\\', 'a'] + >>> splitall('C:\\a\\') + ['C:\\', 'a', ''] + >>> splitall('C:\\a\\b') + ['C:\\', 'a', 'b'] + >>> splitall('a\\b') + ['a', 'b'] + """ + + allparts = [] + while 1: + parts = os.path.split(path) + if parts[0] == path: # sentinel for absolute paths + allparts.insert(0, parts[0]) + break + elif parts[1] == path: # sentinel for relative paths + allparts.insert(0, parts[1]) + break + else: + path = parts[0] + allparts.insert(0, parts[1]) + return allparts + + assert(path.__class__ == str) + + tags = [] + abspath = os.path.abspath(path) + for item in splitall(abspath): + itemtags = extract_tags_from_filename(item) + for currentitemtag in itemtags: + if currentitemtag not in tags: + tags.append(currentitemtag) + return tags + + def adding_tag_to_filename(filename, tagname): """ Returns string of file name with tagname as additional tag. @@ -452,7 +515,7 @@ def extract_filenames_from_argument(argument): @param return: a list of unicode file names """ - # FIXXME: works at my computer without need to convertion but add check later on + # FIXXME: currently works without need to convertion but add check later on return argument @@ -849,10 +912,86 @@ def add_tag_to_countdict(tag, tags): return tags +def extract_iso_datestamp_from_filename(filename): + """ + Returns array of year, month, day if filename starts with + YYYY-MM-DD datestamp. Returns empty array else. + """ + + components = re.match(YYYY_MM_DD_PATTERN, filename) + if components: + return [components.group(1), components.group(2), components.group(3)] + else: + return [] + + +def get_files_with_metadata(startdir=os.getcwd(), use_cache=True): + """ + Traverses the file system starting with given directory, + returns list: filename and metadata-dict: + + The result is stored in the global dict as + cache_of_files_with_metadata[startdir] with dict elements like: + 'filename': '2018-03-18 this is a file name -- tag1 tag2.txt', + 'filetags': ['tag1', 'tag2'], + 'path': '/this/is -- tag1/the -- tag3/path', + 'alltags': ['tag1', 'tag2', 'tag3'], + 'ctime': time.struct_time, + 'datestamp': ['2018', '03', '18'], + + @param use_cache: FOR FUTURE USE; default = True + @param return: list of filenames and metadata-dict + """ + + global cache_of_files_with_metadata + + assert(os.path.isdir(startdir)) + + logging.debug('get_files_with_metadata called with startdir [%s], cached startdirs [%s]' % (startdir, str(len(list(cache_of_files_with_metadata.keys()))))) + + if use_cache and len(cache_of_files_with_metadata) > 0: + logging.debug("found " + str(len(cache_of_files_with_metadata)) + " files in cache for files") + return cache_of_files_with_metadata + + else: + + cache = [] + for root, dirs, files in os.walk(startdir): + + # logging.debug('get_files_with_metadata: root [%s]' % root) # LOTS of debug output + for filename in files: + + absfilename = os.path.abspath(os.path.join(root, filename)) + # logging.debug('get_files_with_metadata: file [%s]' % absfilename) # LOTS of debug output + path, basename = os.path.split(absfilename) + cache.append({ + 'filename': basename, + 'filetags': extract_tags_from_filename(basename), + 'path': path, + 'alltags': extract_tags_from_path(absfilename), + 'ctime': time.localtime(os.path.getctime(absfilename)), + 'datestamp': extract_iso_datestamp_from_filename(basename) + }) + + # Enable recursive directory traversal for specific options: + if not (options.recursive and (options.list_tags_by_alphabet or + options.list_tags_by_number or + options.list_unknown_tags or + options.tag_gardening)): + break # do not loop + + logging.debug("Writing " + str(len(cache)) + " files in cache for directory: " + startdir) + if use_cache: + cache_of_files_with_metadata[startdir] = cache + return cache + + def get_tags_from_files_and_subfolders(startdir=os.getcwd(), use_cache=True): """ Traverses the file system starting with given directory, - returns dict of all tags (including starttags) of all file + returns dict of all tags (including starttags) of all file. + Uses cache_of_files_with_metadata of use_cache is true and + cache is populated with same startdir. @param use_cache: FOR FUTURE USE @param return: dict of tags and their number of occurrence @@ -877,6 +1016,16 @@ def get_tags_from_files_and_subfolders(startdir=os.getcwd(), use_cache=True): logging.debug("found " + str(len(cache_of_tags_by_folder[startdir])) + " tags in cache for directory: " + startdir) return cache_of_tags_by_folder[startdir] + elif use_cache and startdir in cache_of_files_with_metadata.keys(): + logging.debug('using cache_of_files_with_metadata instead of traversing file system again') + cachedata = cache_of_files_with_metadata[startdir] + + # FIXXME: check if tags are extracted from dirnames as in traversal algorithm below + + for entry in cachedata: + for tag in entry['alltags']: + tags = add_tag_to_countdict(tag, tags) + else: for root, dirs, files in os.walk(startdir): @@ -898,10 +1047,10 @@ def get_tags_from_files_and_subfolders(startdir=os.getcwd(), use_cache=True): options.tag_gardening)): break # do not loop - logging.debug("Writing " + str(len(list(tags.keys()))) + " tags in cache for directory: " + startdir) - if use_cache: - cache_of_tags_by_folder[startdir] = tags - return tags + logging.debug("Writing " + str(len(list(tags.keys()))) + " tags in cache for directory: " + startdir) + if use_cache: + cache_of_tags_by_folder[startdir] = tags + return tags def find_similar_tags(tag, tags): @@ -1054,6 +1203,7 @@ def handle_tag_gardening(vocabulary): @param return: - """ + files_with_metadata = get_files_with_metadata(startdir=os.getcwd()) # = cache_of_files_with_metadata of current dir tag_dict = get_tags_from_files_and_subfolders(startdir=os.getcwd()) if not tag_dict: print("\nNo file containing tags found in this folder hierarchy.\n") @@ -1061,9 +1211,38 @@ def handle_tag_gardening(vocabulary): print("\nYou have used " + str(len(tag_dict)) + " tags in total.\n") - if vocabulary: + number_of_files = len(files_with_metadata) + print("\nNumber of total files: " + str(number_of_files)) - print('\nYour controlled vocabulary is defined in ' + controlled_vocabulary_filename + ' and contains ' + str(len(vocabulary)) + ' tags.\n') + def str_percentage(fraction, total): + "returns a string containing the percentage of the fraction wrt the total" + assert(type(fraction) == int) + assert(type(total) == int) + if total == 0: + return "0%" # avoid division by zero + else: + return str(round(100*fraction/total, 1)) + '%' + + files_without_alltags = [x for x in files_with_metadata if not x['alltags']] + num_files_without_alltags = len(files_without_alltags) + print("Number of files without tags including pathtags: " + str(num_files_without_alltags) + + " (" + str_percentage(num_files_without_alltags, number_of_files) + " of total files)") + + files_without_filetags = [x for x in files_with_metadata if not x['filetags']] + num_files_without_filetags = len(files_without_filetags) + print("Number of files without filetags: " + str(num_files_without_filetags) + + " (" + str_percentage(num_files_without_filetags, number_of_files) + " of total files)") + + num_files_with_alltags = number_of_files - len(files_without_alltags) + + files_with_filetags = [x for x in files_with_metadata if x['filetags']] + num_files_with_filetags = len(files_with_filetags) + print("Number of files with filetags: " + str(num_files_with_filetags) + + " (" + str_percentage(num_files_with_filetags, number_of_files) + " of total files)") + + if vocabulary: + print('\nYour controlled vocabulary is defined in ' + controlled_vocabulary_filename + + ' and contains ' + str(len(vocabulary)) + ' tags.\n') vocabulary_tags_not_used = set(vocabulary) - set(tag_dict.keys()) if vocabulary_tags_not_used: @@ -1075,13 +1254,47 @@ def handle_tag_gardening(vocabulary): print("\nTags you used that are not in the vocabulary:\n") print_tag_set(tags_not_in_vocabulary) + if unique_tags and len(unique_tags) > 0: + # There are mutually exclusive tags defined in the controlled vocabulary + for taggroup in unique_tags: + # iterate over mutually exclusive tag groups one by one + + if taggroup == UNIQUE_TAG_TESTSTRINGS: + continue + if len(set(tag_dict.keys()).intersection(set(taggroup))) > 0: + files_with_any_tag_from_taggroup = [x for x in + files_with_metadata if + len(set(x['alltags']).intersection(set(taggroup))) > 0] + num_files_with_any_tag_from_taggroup = len(files_with_any_tag_from_taggroup) + print('\nTag group ' + str(taggroup) + ":\n Number of files with tag from tag group: " + + str(num_files_with_any_tag_from_taggroup) + + " (" + str_percentage(num_files_with_any_tag_from_taggroup, num_files_with_alltags) + + " of tagged files)") + + longest_tagname = max(taggroup, key=len) + for tag in taggroup: + files_with_tag_from_taggroup = [x for x in files_with_metadata if tag in x['alltags']] + num_files_with_tag_from_taggroup = len(files_with_tag_from_taggroup) + if num_files_with_tag_from_taggroup > 0: + print(' {:<{}} • {:>{}} tagged file(s) = {:>5} of tag group'.format( + tag, + len(longest_tagname), + str(num_files_with_tag_from_taggroup), + len(str(num_files_with_any_tag_from_taggroup)), + str_percentage(num_files_with_tag_from_taggroup, num_files_with_any_tag_from_taggroup))) + else: + print(' "' + tag + '": Not used') + else: + print('Tag group ' + str(taggroup) + ': Not used') + print("\nTags that appear only once are most probably typos or you have forgotten them:") tags_only_used_once_dict = {key: value for key, value in list(tag_dict.items()) if value < 2} print_tag_dict(tags_only_used_once_dict, vocabulary, sort_index=0, print_only_tags_with_similar_tags=False) print("\nTags which have similar other tags are probably typos or plural/singular forms of others:") tags_for_comparing = list(set(tag_dict.keys()).union(set(vocabulary))) # unified elements of both lists - only_similar_tags_by_alphabet_dict = {key: value for key, value in list(tag_dict.items()) if find_similar_tags(key, tags_for_comparing)} + only_similar_tags_by_alphabet_dict = {key: value for key, value in list(tag_dict.items()) + if find_similar_tags(key, tags_for_comparing)} print_tag_dict(only_similar_tags_by_alphabet_dict, vocabulary, sort_index=0, print_similar_vocabulary_tags=True) tags_only_used_once_set = set(tags_only_used_once_dict.keys()) @@ -1102,7 +1315,8 @@ def locate_file_in_cwd_and_parent_directories(startfile, filename): @param return: file name found """ - if startfile and os.path.isfile(startfile) and os.path.isfile(os.path.join(os.path.dirname(os.path.abspath(startfile)), filename)): + if startfile and os.path.isfile(startfile) and os.path.isfile( + os.path.join(os.path.dirname(os.path.abspath(startfile)), filename)): logging.debug('found \"%s\" in directory of \"%s\" ..' % (filename, startfile)) return filename elif startfile and os.path.isdir(startfile) and os.path.isfile(os.path.join(startfile, filename)): diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 3b97313..b5526ce 100755 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Time-stamp: <2017-11-11 17:54:40 vk> +# Time-stamp: <2018-03-18 11:00:42 vk> # invoke tests using following command line: # ~/src/vktag % PYTHONPATH="~/src/filetags:" tests/unit_tests.py --verbose @@ -147,6 +147,22 @@ class TestMethods(unittest.TestCase): 'file3 -- common foo bar baz.txt' 'file4 -- common foo bar jodel.txt'])), set(['common', 'foo'])) + def test_extract_tags_from_path(self): + self.assertEqual(set(filetags.extract_tags_from_path('/a/path/without/tags')), set([])) + self.assertEqual(set(filetags.extract_tags_from_path('/path -- ptag1/with -- ptag1 ptag2/tags')), + set(['ptag1', 'ptag2'])) + self.assertEqual(set(filetags.extract_tags_from_path('/path -- ptag1/with -- ptag1 ptag2/tags -- ftag1')), + set(['ptag1', 'ptag2', 'ftag1'])) + + def test_extract_iso_datestamp_from_filename(self): + self.assertEqual(filetags.extract_iso_datestamp_from_filename(''), []) + self.assertEqual(filetags.extract_iso_datestamp_from_filename('foo'), []) + self.assertEqual(filetags.extract_iso_datestamp_from_filename('9999-99-99 foo bar'), []) + self.assertEqual(filetags.extract_iso_datestamp_from_filename('2018-03-18 foo bar'), ['2018', '03', '18']) + self.assertEqual(filetags.extract_iso_datestamp_from_filename('2018-03-18_foo bar'), ['2018', '03', '18']) + self.assertEqual(filetags.extract_iso_datestamp_from_filename('2018-03-18-foo bar'), ['2018', '03', '18']) + self.assertEqual(filetags.extract_iso_datestamp_from_filename('2018-03-18T23.59 foo bar'), ['2018', '03', '18']) + def tearDown(self): pass