populating cache_of_files_with_metadata + tag group statistics when gardening

2026-02-16 22:04:15 +00:00 · 2018-03-18 15:03:37 +01:00 · 2018-03-18 15:03:37 +01:00 · 8f0a456222
commit 8f0a456222
parent b8eb9ed641
2 changed files with 243 additions and 13 deletions
--- a/filetags/init.py
+++ b/filetags/init.py
@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-PROG_VERSION = "Time-stamp: <2018-02-25 15:24:04 vk>"
+PROG_VERSION = "Time-stamp: <2018-03-18 14:59:48 vk>"

 # TODO:
 # - fix parts marked with «FIXXME»
@ -88,7 +88,8 @@ else:

 max_file_length = 0  # will be set after iterating over source files182

-unique_tags = [['teststring1', 'teststring2']]  # list of list which contains tags that are mutually exclusive
+UNIQUE_TAG_TESTSTRINGS = ['teststring1', 'teststring2']
+unique_tags = [UNIQUE_TAG_TESTSTRINGS]  # list of list which contains tags that are mutually exclusive
 # Note: u'teststring1' and u'teststring2' are hard-coded for testing purposes.
 #       You might delete them if you don't use my unit test suite.

@ -139,7 +140,10 @@ FILE_WITH_EXTENSION_REGEX = re.compile("(.*)\.(.*)$")
 FILE_WITH_EXTENSION_REGEX_FILENAME_INDEX = 1
 FILE_WITH_EXTENSION_REGEX_EXTENSION_INDEX = 2

+YYYY_MM_DD_PATTERN = re.compile('^(\d{4,4})-([01]\d)-([0123]\d)[- _T]')
+
 cache_of_tags_by_folder = {}
+cache_of_files_with_metadata = {}  # dict of big list of dicts: 'filename', 'path' and other metadata
 controlled_vocabulary_filename = ''
 list_of_symlink_directories = []

@ -350,6 +354,65 @@ def extract_tags_from_filename(filename):
        return components.group(FILE_WITH_TAGS_REGEX_TAGLIST_INDEX).split(BETWEEN_TAG_SEPARATOR)


+def extract_tags_from_path(path):
+    """
+    Returns list of all tags contained within the absolute path that may contain
+    directories and an optional file. If no tag is found, return empty list.
+
+    @param path: an unicode string containing a path
+    @param return: list of tags
+    """
+
+    def splitall(path):
+        """
+        Snippet from https://www.safaribooksonline.com/library/view/python-cookbook/0596001673/ch04s16.html
+
+        >>> splitall('a/b/c')
+        ['a', 'b', 'c']
+        >>> splitall('/a/b/c/')
+        ['/', 'a', 'b', 'c', '']
+        >>> splitall('/')
+        ['/']
+        >>> splitall('C:')
+        ['C:']
+        >>> splitall('C:\\')
+        ['C:\\']
+        >>> splitall('C:\\a')
+        ['C:\\', 'a']
+        >>> splitall('C:\\a\\')
+        ['C:\\', 'a', '']
+        >>> splitall('C:\\a\\b')
+        ['C:\\', 'a', 'b']
+        >>> splitall('a\\b')
+        ['a', 'b']
+        """
+
+        allparts = []
+        while 1:
+            parts = os.path.split(path)
+            if parts[0] == path:  # sentinel for absolute paths
+                allparts.insert(0, parts[0])
+                break
+            elif parts[1] == path:  # sentinel for relative paths
+                allparts.insert(0, parts[1])
+                break
+            else:
+                path = parts[0]
+                allparts.insert(0, parts[1])
+        return allparts
+
+    assert(path.__class__ == str)
+
+    tags = []
+    abspath = os.path.abspath(path)
+    for item in splitall(abspath):
+        itemtags = extract_tags_from_filename(item)
+        for currentitemtag in itemtags:
+            if currentitemtag not in tags:
+                tags.append(currentitemtag)
+    return tags
+
+
 def adding_tag_to_filename(filename, tagname):
    """
    Returns string of file name with tagname as additional tag.
@ -452,7 +515,7 @@ def extract_filenames_from_argument(argument):
    @param return: a list of unicode file names
    """

-    # FIXXME: works at my computer without need to convertion but add check later on
+    # FIXXME: currently works without need to convertion but add check later on
    return argument


@ -849,10 +912,86 @@ def add_tag_to_countdict(tag, tags):
    return tags


+def extract_iso_datestamp_from_filename(filename):
+    """
+    Returns array of year, month, day if filename starts with
+    YYYY-MM-DD datestamp. Returns empty array else.
+    """
+
+    components = re.match(YYYY_MM_DD_PATTERN, filename)
+    if components:
+        return [components.group(1), components.group(2), components.group(3)]
+    else:
+        return []
+
+
+def get_files_with_metadata(startdir=os.getcwd(), use_cache=True):
+    """
+    Traverses the file system starting with given directory,
+    returns list: filename and metadata-dict:
+
+    The result is stored in the global dict as
+    cache_of_files_with_metadata[startdir] with dict elements like:
+      'filename': '2018-03-18 this is a file name -- tag1 tag2.txt',
+      'filetags': ['tag1', 'tag2'],
+      'path': '/this/is -- tag1/the -- tag3/path',
+      'alltags': ['tag1', 'tag2', 'tag3'],
+      'ctime': time.struct_time,
+      'datestamp': ['2018', '03', '18'],
+
+    @param use_cache: FOR FUTURE USE; default = True
+    @param return: list of filenames and metadata-dict
+    """
+
+    global cache_of_files_with_metadata
+
+    assert(os.path.isdir(startdir))
+
+    logging.debug('get_files_with_metadata called with startdir [%s], cached startdirs [%s]' % (startdir, str(len(list(cache_of_files_with_metadata.keys())))))
+
+    if use_cache and len(cache_of_files_with_metadata) > 0:
+        logging.debug("found " + str(len(cache_of_files_with_metadata)) + " files in cache for files")
+        return cache_of_files_with_metadata
+
+    else:
+
+        cache = []
+        for root, dirs, files in os.walk(startdir):
+
+            # logging.debug('get_files_with_metadata: root [%s]' % root)  # LOTS of debug output
+            for filename in files:
+
+                absfilename = os.path.abspath(os.path.join(root, filename))
+                # logging.debug('get_files_with_metadata: file [%s]' % absfilename)  # LOTS of debug output
+                path, basename = os.path.split(absfilename)
+                cache.append({
+                    'filename': basename,
+                    'filetags': extract_tags_from_filename(basename),
+                    'path': path,
+                    'alltags': extract_tags_from_path(absfilename),
+                    'ctime': time.localtime(os.path.getctime(absfilename)),
+                    'datestamp': extract_iso_datestamp_from_filename(basename)
+                })
+
+            # Enable recursive directory traversal for specific options:
+            if not (options.recursive and (options.list_tags_by_alphabet or
+                                           options.list_tags_by_number or
+                                           options.list_unknown_tags or
+                                           options.tag_gardening)):
+                break  # do not loop
+
+        logging.debug("Writing " + str(len(cache)) + " files in cache for directory: " + startdir)
+        if use_cache:
+            cache_of_files_with_metadata[startdir] = cache
+        return cache
+
+
 def get_tags_from_files_and_subfolders(startdir=os.getcwd(), use_cache=True):
    """
    Traverses the file system starting with given directory,
-    returns dict of all tags (including starttags) of all file
+    returns dict of all tags (including starttags) of all file.
+    Uses cache_of_files_with_metadata of use_cache is true and
+    cache is populated with same startdir.

    @param use_cache: FOR FUTURE USE
    @param return: dict of tags and their number of occurrence
@ -877,6 +1016,16 @@ def get_tags_from_files_and_subfolders(startdir=os.getcwd(), use_cache=True):
        logging.debug("found " + str(len(cache_of_tags_by_folder[startdir])) + " tags in cache for directory: " + startdir)
        return cache_of_tags_by_folder[startdir]

+    elif use_cache and startdir in cache_of_files_with_metadata.keys():
+        logging.debug('using cache_of_files_with_metadata instead of traversing file system again')
+        cachedata = cache_of_files_with_metadata[startdir]
+
+        # FIXXME: check if tags are extracted from dirnames as in traversal algorithm below
+
+        for entry in cachedata:
+            for tag in entry['alltags']:
+                tags = add_tag_to_countdict(tag, tags)
+
    else:

        for root, dirs, files in os.walk(startdir):
@ -898,10 +1047,10 @@ def get_tags_from_files_and_subfolders(startdir=os.getcwd(), use_cache=True):
                                           options.tag_gardening)):
                break  # do not loop

-        logging.debug("Writing " + str(len(list(tags.keys()))) + " tags in cache for directory: " + startdir)
-        if use_cache:
-            cache_of_tags_by_folder[startdir] = tags
-        return tags
+    logging.debug("Writing " + str(len(list(tags.keys()))) + " tags in cache for directory: " + startdir)
+    if use_cache:
+        cache_of_tags_by_folder[startdir] = tags
+    return tags


 def find_similar_tags(tag, tags):
@ -1054,6 +1203,7 @@ def handle_tag_gardening(vocabulary):
    @param return: -
    """

+    files_with_metadata = get_files_with_metadata(startdir=os.getcwd())  # = cache_of_files_with_metadata of current dir
    tag_dict = get_tags_from_files_and_subfolders(startdir=os.getcwd())
    if not tag_dict:
        print("\nNo file containing tags found in this folder hierarchy.\n")
@ -1061,9 +1211,38 @@ def handle_tag_gardening(vocabulary):

    print("\nYou have used " + str(len(tag_dict)) + " tags in total.\n")

-    if vocabulary:
+    number_of_files = len(files_with_metadata)
+    print("\nNumber of total files:                           " + str(number_of_files))

-        print('\nYour controlled vocabulary is defined in ' + controlled_vocabulary_filename + ' and contains ' + str(len(vocabulary)) + ' tags.\n')
+    def str_percentage(fraction, total):
+        "returns a string containing the percentage of the fraction wrt the total"
+        assert(type(fraction) == int)
+        assert(type(total) == int)
+        if total == 0:
+            return "0%"  # avoid division by zero
+        else:
+            return str(round(100*fraction/total, 1)) + '%'
+
+    files_without_alltags = [x for x in files_with_metadata if not x['alltags']]
+    num_files_without_alltags = len(files_without_alltags)
+    print("Number of files without tags including pathtags: " + str(num_files_without_alltags) +
+          "   (" + str_percentage(num_files_without_alltags, number_of_files) + " of total files)")
+
+    files_without_filetags = [x for x in files_with_metadata if not x['filetags']]
+    num_files_without_filetags = len(files_without_filetags)
+    print("Number of files without filetags:                " + str(num_files_without_filetags) +
+          "   (" + str_percentage(num_files_without_filetags, number_of_files) + " of total files)")
+
+    num_files_with_alltags = number_of_files - len(files_without_alltags)
+
+    files_with_filetags = [x for x in files_with_metadata if x['filetags']]
+    num_files_with_filetags = len(files_with_filetags)
+    print("Number of files with filetags:                   " + str(num_files_with_filetags) +
+          "   (" + str_percentage(num_files_with_filetags, number_of_files) + " of total files)")
+
+    if vocabulary:
+        print('\nYour controlled vocabulary is defined in ' + controlled_vocabulary_filename +
+              ' and contains ' + str(len(vocabulary)) + ' tags.\n')

        vocabulary_tags_not_used = set(vocabulary) - set(tag_dict.keys())
        if vocabulary_tags_not_used:
@ -1075,13 +1254,47 @@ def handle_tag_gardening(vocabulary):
            print("\nTags you used that are not in the vocabulary:\n")
            print_tag_set(tags_not_in_vocabulary)

+        if unique_tags and len(unique_tags) > 0:
+            # There are mutually exclusive tags defined in the controlled vocabulary
+            for taggroup in unique_tags:
+                # iterate over mutually exclusive tag groups one by one
+
+                if taggroup == UNIQUE_TAG_TESTSTRINGS:
+                    continue
+                if len(set(tag_dict.keys()).intersection(set(taggroup))) > 0:
+                    files_with_any_tag_from_taggroup = [x for x in
+                                                        files_with_metadata if
+                                                        len(set(x['alltags']).intersection(set(taggroup))) > 0]
+                    num_files_with_any_tag_from_taggroup = len(files_with_any_tag_from_taggroup)
+                    print('\nTag group ' + str(taggroup) + ":\n   Number of files with tag from tag group: " +
+                          str(num_files_with_any_tag_from_taggroup) +
+                          "   (" + str_percentage(num_files_with_any_tag_from_taggroup, num_files_with_alltags) +
+                          " of tagged files)")
+
+                    longest_tagname = max(taggroup, key=len)
+                    for tag in taggroup:
+                        files_with_tag_from_taggroup = [x for x in files_with_metadata if tag in x['alltags']]
+                        num_files_with_tag_from_taggroup = len(files_with_tag_from_taggroup)
+                        if num_files_with_tag_from_taggroup > 0:
+                            print('   {:<{}}  •  {:>{}} tagged file(s)   = {:>5} of tag group'.format(
+                                tag,
+                                len(longest_tagname),
+                                str(num_files_with_tag_from_taggroup),
+                                len(str(num_files_with_any_tag_from_taggroup)),
+                                str_percentage(num_files_with_tag_from_taggroup, num_files_with_any_tag_from_taggroup)))
+                        else:
+                            print('   "' + tag + '": Not used')
+                else:
+                    print('Tag group ' + str(taggroup) + ': Not used')
+
    print("\nTags that appear only once are most probably typos or you have forgotten them:")
    tags_only_used_once_dict = {key: value for key, value in list(tag_dict.items()) if value < 2}
    print_tag_dict(tags_only_used_once_dict, vocabulary, sort_index=0, print_only_tags_with_similar_tags=False)

    print("\nTags which have similar other tags are probably typos or plural/singular forms of others:")
    tags_for_comparing = list(set(tag_dict.keys()).union(set(vocabulary)))  # unified elements of both lists
-    only_similar_tags_by_alphabet_dict = {key: value for key, value in list(tag_dict.items()) if find_similar_tags(key, tags_for_comparing)}
+    only_similar_tags_by_alphabet_dict = {key: value for key, value in list(tag_dict.items())
+                                          if find_similar_tags(key, tags_for_comparing)}
    print_tag_dict(only_similar_tags_by_alphabet_dict, vocabulary, sort_index=0, print_similar_vocabulary_tags=True)

    tags_only_used_once_set = set(tags_only_used_once_dict.keys())
@ -1102,7 +1315,8 @@ def locate_file_in_cwd_and_parent_directories(startfile, filename):
    @param return: file name found
    """

-    if startfile and os.path.isfile(startfile) and os.path.isfile(os.path.join(os.path.dirname(os.path.abspath(startfile)), filename)):
+    if startfile and os.path.isfile(startfile) and os.path.isfile(
+            os.path.join(os.path.dirname(os.path.abspath(startfile)), filename)):
        logging.debug('found \"%s\" in directory of \"%s\" ..' % (filename, startfile))
        return filename
    elif startfile and os.path.isdir(startfile) and os.path.isfile(os.path.join(startfile, filename)):
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-# Time-stamp: <2017-11-11 17:54:40 vk>
+# Time-stamp: <2018-03-18 11:00:42 vk>

 # invoke tests using following command line:
 # ~/src/vktag % PYTHONPATH="~/src/filetags:" tests/unit_tests.py --verbose
@ -147,6 +147,22 @@ class TestMethods(unittest.TestCase):
                                                                     'file3 -- common foo bar baz.txt'
                                                                     'file4 -- common foo bar jodel.txt'])), set(['common', 'foo']))

+    def test_extract_tags_from_path(self):
+        self.assertEqual(set(filetags.extract_tags_from_path('/a/path/without/tags')), set([]))
+        self.assertEqual(set(filetags.extract_tags_from_path('/path -- ptag1/with -- ptag1 ptag2/tags')),
+                         set(['ptag1', 'ptag2']))
+        self.assertEqual(set(filetags.extract_tags_from_path('/path -- ptag1/with -- ptag1 ptag2/tags -- ftag1')),
+                         set(['ptag1', 'ptag2', 'ftag1']))
+
+    def test_extract_iso_datestamp_from_filename(self):
+        self.assertEqual(filetags.extract_iso_datestamp_from_filename(''), [])
+        self.assertEqual(filetags.extract_iso_datestamp_from_filename('foo'), [])
+        self.assertEqual(filetags.extract_iso_datestamp_from_filename('9999-99-99 foo bar'), [])
+        self.assertEqual(filetags.extract_iso_datestamp_from_filename('2018-03-18 foo bar'), ['2018', '03', '18'])
+        self.assertEqual(filetags.extract_iso_datestamp_from_filename('2018-03-18_foo bar'), ['2018', '03', '18'])
+        self.assertEqual(filetags.extract_iso_datestamp_from_filename('2018-03-18-foo bar'), ['2018', '03', '18'])
+        self.assertEqual(filetags.extract_iso_datestamp_from_filename('2018-03-18T23.59 foo bar'), ['2018', '03', '18'])
+
    def tearDown(self):

        pass