initial commit with stub

2026-02-16 05:14:16 +00:00 · 2016-03-05 11:56:29 +01:00 · 2016-03-05 11:56:29 +01:00 · fc8fae6cfd
commit fc8fae6cfd
6 changed files with 561 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+*.pyc
--- a/README.org
+++ b/README.org
@ -0,0 +1,86 @@
+## Time-stamp: <2016-03-05 11:51:45 vk>
+## -*- coding: utf-8 -*-
+## This file is best viewed with GNU Emacs Org-mode: http://orgmode.org/
+
+* guessfilename.py
+
+This Python script tries to come up with a new file name for each
+file passed as a command line argument.
+
+It does this with several methods: first, the current file name is
+analyzed and any [[https://en.wikipedia.org/wiki/Iso_date][ISO date/timestamp]] and [[https://github.com/novoid/filetags/][filetags]] are re-used.
+Secondly, if the parsing of the file name did not lead to any new file
+name, the content of the file is analyzed. The following file types
+are supported so far:
+- PDF files
+
+The script accepts an arbitrary number of files (see your shell for
+possible length limitations).
+
+- *Target group*: users who are able to use command line tools and who
+  are using tags in file names.
+- Hosted on github: https://github.com/novoid/guessfilename
+
+** Why
+
+I do scan almost all paper mail. Many of those documents are sent to
+me regularly. Such documents are bills or insurance information, for
+example.
+
+Being too lazy to name those files manually with high chances of
+getting many variants for the same document type, I came up with a
+method to derive file names from either the old file name (cues I
+enter without knowing the exact target file name) or the file content.
+
+Analyzing the content enables this script to recognize bills via
+customer numbers or phone numbers, amounts to pay, and so on.
+
+** Usage
+
+: guessfilename.py a_file_name.txt
+... FIXXME
+
+
+For a complete list of parameters, please try:
+: guessfilename.py --help
+
+* Related tools and workflows
+
+This tool is part of a tool-set which I use to manage my digital files
+such as photographs. My work-flows are described in [[http://karl-voit.at/managing-digital-photographs/][this blog posting]],
+which you might like to read.
+
+In short:
+
+For *tagging*, please refer to [[https://github.com/novoid/filetags][filetags]] and its documentation.
+
+See [[https://github.com/novoid/date2name][date2name]] for easily adding ISO *time-stamps or date-stamps* to
+files.
+
+For *easily naming and tagging* files within file browsers that allow
+integration of external tools, see [[https://github.com/novoid/appendfilename][appendfilename]] (once more) and
+[[https://github.com/novoid/filetags][filetags]].
+
+Moving to the archive folders is done using [[https://github.com/novoid/move2archive][move2archive]].
+
+Having tagged photographs gives you many advantages. For example, I
+automatically [[https://github.com/novoid/set_desktop_background_according_to_season][choose my *desktop background image* according to the
+current season]].
+
+Files containing an ISO time/date-stamp get indexed by the
+filename-module of [[https://github.com/novoid/Memacs][Memacs]].
+
+* Contribute!
+
+I am looking for your ideas!
+
+If you want to contribute to this cool project, please fork and
+contribute!
+
+
+* Local Variables                                                  :noexport:
+# Local Variables:
+# mode: auto-fill
+# mode: flyspell
+# eval: (ispell-change-dictionary "en_US")
+# End:
--- a/init.py
+++ b/init.py
--- a/guessfilename.py
+++ b/guessfilename.py
@ -0,0 +1,415 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Time-stamp: <2016-03-05 11:48:05 vk>
+
+## TODO:
+## * fix parts marked with «FIXXME»
+
+
+## ===================================================================== ##
+##  You might not want to modify anything below this line if you do not  ##
+##  know, what you are doing :-)                                         ##
+## ===================================================================== ##
+
+import re
+import sys
+import os
+import os.path   # for directory traversal to look for .tagfiles
+import time
+import logging
+#import operator  # for sorting dicts
+#import difflib   # for good enough matching words
+#from sets import Set  # to find out union/intersection of tag sets
+#import readline  # for raw_input() reading from stdin
+import codecs    # for handling Unicode content in .tagfiles
+from optparse import OptionParser
+
+## version metadata; embedded in USAGE below and printed by the version option in main()
+PROG_VERSION_NUMBER = u"0.1"
+PROG_VERSION_DATE = u"2016-03-04"
+## local time at import of this module, formatted like an ISO time-stamp
+INVOCATION_TIME = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
+## string between the file name and its tags
+FILENAME_TAG_SEPARATOR = u' -- '
+## string between two tags
+BETWEEN_TAG_SEPARATOR = u' '
+
+
+## usage text handed to OptionParser; sys.argv[0] is inserted as program name.
+## The text still contains FIXXME placeholders.
+USAGE = u"\n\
+    " + sys.argv[0] + u" [<options>] <list of files>\n\
+\n\
+FIXXME\n\
+\n\
+\n\
+Example usages:\n\
+  " + sys.argv[0] + u" --tags=\"presentation projectA\" *.pptx\n\
+      ... FIXXME\n\
+\n\
+\n\
+\n\
+Verbose description: FIXXME: http://Karl-Voit.at/managing-digital-photographs/\n\
+\n\
+:copyright: (c) by Karl Voit <tools@Karl-Voit.at>\n\
+:license: GPL v3 or any later version\n\
+:URL: https://github.com/novoid/guess-filename.py\n\
+:bugreports: via github or <tools@Karl-Voit.at>\n\
+:version: " + PROG_VERSION_NUMBER + " from " + PROG_VERSION_DATE + "\n"
+
+
+
+
+## Command line interface.
+## NOTE: parse_args() is executed at import time, so importing this module
+## (e.g. from the unit tests) parses the arguments of the importing process.
+parser = OptionParser(usage=USAGE)
+
+parser.add_option("-s", "--dryrun", dest="dryrun", action="store_true",
+                  help="enable dryrun mode: just simulate what would happen, do not modify files")
+
+parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
+                  help="enable verbose mode")
+
+parser.add_option("-q", "--quiet", dest="quiet", action="store_true",
+                  help="enable quiet mode")
+
+parser.add_option("--version", dest="version", action="store_true",
+                  help="display version and exit")
+
+## options: parsed option values; args: remaining positional arguments
+(options, args) = parser.parse_args()
+
+
+def handle_logging():
+    """
+    Configures the root logger according to the command line options.
+
+    --verbose: level DEBUG, messages carry a time-stamp;
+    --quiet: level ERROR; otherwise: level INFO.
+    Verbose wins if both options are set.
+    """
+
+    if options.verbose:
+        log_level = logging.DEBUG
+        log_format = "%(levelname)-8s %(asctime)-15s %(message)s"
+    else:
+        log_level = logging.ERROR if options.quiet else logging.INFO
+        log_format = "%(levelname)-8s %(message)s"
+
+    logging.basicConfig(level=log_level, format=log_format)
+
+
+def error_exit(errorcode, text):
+    """
+    Logs text with level ERROR and terminates the program.
+
+    @param errorcode: exit status of the program
+    @param text: message to be logged before exiting
+    """
+
+    ## write pending regular output before the error message is logged
+    sys.stdout.flush()
+    logging.error(text)
+
+    raise SystemExit(errorcode)
+
+
+class GuessFilename(object):
+    """
+    Contains methods of the guess filename domain
+
+    NOTE(review): adding_tag_to_filename() and handle_file() use names which
+    are not defined in this module as of this commit (contains_tag,
+    FILE_WITH_EXTENSION_REGEX, extract_tags_from_filename,
+    removing_tag_from_filename, UNIQUE_LABELS). Until those are provided,
+    both methods fail with a NameError; only split_filename_entities() is
+    self-contained.
+    """
+
+    ## file names containing tags matches following regular expression
+    ## ( (date(time)?)?(--date(time)?)? )? filename (tags)? (extension)?
+    DAY_REGEX = r"[12]\d{3}-[01]\d-[0123]\d"
+    ## the unescaped dots accept any single character between hours, minutes, and seconds
+    TIME_REGEX = r"T[012]\d.[012345]\d(.[012345]\d)?"
+    DAYTIME_REGEX = "(" + DAY_REGEX + "(" + TIME_REGEX + ")?)"
+    DAYTIME_DURATION_REGEX = DAYTIME_REGEX + "(--" + DAYTIME_REGEX + ")?"
+
+    ## The time-stamp is separated from the name by a space, an underscore, or a hyphen.
+    ## The hyphen has to be the last character of the class: "[ -_]" would be
+    ## the range from space to underscore which includes digits and capital letters.
+    ISO_NAME_TAGS_EXTENSION_REGEX = re.compile("((" + DAYTIME_DURATION_REGEX + r")[ _-])?(.+?)( -- (\w+[ ]?)+)?(\.(\w+))?$")
+    ## numbers of the groups within ISO_NAME_TAGS_EXTENSION_REGEX:
+    DAYTIME_DURATION_INDEX = 2
+    NAME_INDEX = 10
+    TAGS_INDEX = 11
+    EXTENSION_INDEX = 14
+
+    def adding_tag_to_filename(self, filename, tagname):
+        """
+        Returns string of file name with tagname as additional tag.
+
+        If the file name has no tag yet, the tag is appended after
+        FILENAME_TAG_SEPARATOR; if it has other tags, the tag is appended
+        after BETWEEN_TAG_SEPARATOR; if it already contains the tag, the
+        file name is returned unchanged.
+
+        @param filename: an unicode string containing a file name
+        @param tagname: an unicode string containing a tag name
+        @param return: an unicode string of filename containing tagname
+        """
+
+        assert filename.__class__ == str or \
+            filename.__class__ == unicode
+        assert tagname.__class__ == str or \
+            tagname.__class__ == unicode
+
+        if contains_tag(filename) is False:
+            logging.debug(u"adding_tag_to_filename(%s, %s): no tag found so far" % (filename, tagname))
+            separator = FILENAME_TAG_SEPARATOR
+
+        elif contains_tag(filename, tagname):
+            logging.debug("adding_tag_to_filename(%s, %s): tag already found in filename" % (filename, tagname))
+
+            return filename
+
+        else:
+            logging.debug("adding_tag_to_filename(%s, %s): add as additional tag to existing list of tags" %
+                          (filename, tagname))
+            separator = BETWEEN_TAG_SEPARATOR
+
+        ## the tag is inserted in front of the extension (if any); only the
+        ## base name is modified so that the directory part is not duplicated
+        directory = os.path.dirname(filename)
+        basename = os.path.basename(filename)
+        components = re.match(FILE_WITH_EXTENSION_REGEX, basename)
+        if components:
+            old_filename = components.group(1)
+            extension = components.group(2)
+            return os.path.join(directory, old_filename + separator + tagname + u'.' + extension)
+        return os.path.join(directory, basename + separator + tagname)
+
+    def handle_file(self, filename, tags, do_remove, dryrun):
+        """
+        Adds tags to or removes tags from one file name and renames the file.
+
+        @param filename: string containing one file name
+        @param tags: list containing one or more tags
+        @param do_remove: boolean which defines if tags should be added (False) or removed (True)
+        @param dryrun: boolean which defines if files should be changed (False) or not (True)
+        @param return: None if filename is skipped (directory or not an existing file), new filename otherwise
+        """
+
+        assert filename.__class__ == str or \
+            filename.__class__ == unicode
+        assert tags.__class__ == list
+        if do_remove:
+            assert do_remove.__class__ == bool
+        if dryrun:
+            assert dryrun.__class__ == bool
+
+        if os.path.isdir(filename):
+            logging.warning("Skipping directory \"%s\" because this tool only renames file names." % filename)
+            return
+        elif not os.path.isfile(filename):
+            logging.debug("file type error in folder [%s]: file type: is file? %s  -  is dir? %s  -  is link? %s" % (os.getcwdu(), str(os.path.isfile(filename)), str(os.path.isdir(filename)), str(os.path.islink(filename))))
+            logging.error("Skipping \"%s\" because this tool only renames existing file names." % filename)
+            return
+
+        new_filename = filename
+
+        ## if tag within UNIQUE_LABELS found, and new UNIQUE_LABEL is given, remove old label:
+        ## e.g.: UNIQUE_LABELS = (u'yes', u'no') -> if 'no' should be added, remove existing label 'yes' (and vice versa)
+        ## FIXXME: this is an undocumented feature -> please add proper documentation
+        if not do_remove:
+            unique_labels_in_old_filename = set(extract_tags_from_filename(filename)).intersection(UNIQUE_LABELS)
+            unique_label_to_add = set(tags).intersection(UNIQUE_LABELS)
+            if unique_label_to_add and unique_labels_in_old_filename:
+                logging.debug("found unique label %s which require old unique label to be removed: %s" % (str(unique_label_to_add), str(unique_labels_in_old_filename)))
+                for tagname in unique_labels_in_old_filename:
+                    new_filename = removing_tag_from_filename(new_filename, tagname)
+
+        for tagname in tags:
+            if do_remove:
+                new_filename = removing_tag_from_filename(new_filename, tagname)
+            else:
+                new_filename = self.adding_tag_to_filename(new_filename, tagname)
+
+        if dryrun:
+            ## only report what would be done
+            logging.info(u" ")
+            logging.info(u" renaming \"%s\"" % filename)
+            logging.info(u"      ⤷   \"%s\"" % (new_filename))
+        else:
+            if filename != new_filename:
+                if not options.quiet:
+                    print u"   %s  ⤷  %s" % (filename, new_filename)
+                logging.debug(u" renaming \"%s\"" % filename)
+                logging.debug(u"      ⤷   \"%s\"" % (new_filename))
+                os.rename(filename, new_filename)
+
+        return new_filename
+
+    def split_filename_entities(self, filename):
+        """
+        Takes a filename of format ( (date(time)?)?(--date(time)?)? )? filename (tags)? (extension)?
+        and returns a tuple of (date/time/duration, filename, list of tags, extension).
+
+        Entities which are not found are returned as None; missing tags
+        result in an empty list.
+
+        @param filename: a non-empty (unicode) string containing a file name
+        @param return: tuple of (date/time/duration, name, list of tags, extension)
+        """
+
+        assert(type(filename) == str or type(filename) == unicode)
+        assert(len(filename) > 0)
+
+        components = re.match(self.ISO_NAME_TAGS_EXTENSION_REGEX, filename)
+
+        assert(components)
+
+        tags = []
+        tagstring = components.group(self.TAGS_INDEX)
+        if tagstring:
+            ## strip the leading " -- " (4 characters); split() without
+            ## argument does not return an empty tag for a trailing space
+            tags = tagstring[4:].split()
+
+        return components.group(self.DAYTIME_DURATION_INDEX), \
+            components.group(self.NAME_INDEX), \
+            tags, \
+            components.group(self.EXTENSION_INDEX)
+
+
+def main():
+    """
+    Main function
+
+    In this initial stub, the function exits with status 0 right after
+    instantiating GuessFilename; nothing below that exit is executed.
+
+    NOTE(review): the code after the exit uses options (e.g. options.tags,
+    options.interactive, options.remove) which are not added to the option
+    parser of this file, and helper functions which are not defined in this
+    file. It seems to be taken over from the filetags tool -- TODO confirm
+    and replace with the guess filename logic.
+    """
+
+    guess_filename = GuessFilename()
+
+    ## stub: terminate here; everything below is not reached yet
+    sys.exit(0)
+
+    if options.version:
+        print os.path.basename(sys.argv[0]) + " version " + PROG_VERSION_NUMBER + \
+            " from " + PROG_VERSION_DATE
+        sys.exit(0)
+
+    handle_logging()
+
+    if options.verbose and options.quiet:
+        error_exit(1, "Options \"--verbose\" and \"--quiet\" found. " +
+                   "This does not make any sense, you silly fool :-)")
+
+    ## interactive mode and tags are given
+    if options.interactive and options.tags:
+        error_exit(3, "I found option \"--tag\" and option \"--interactive\". \n" +
+                   "Please choose either tag option OR interactive mode.")
+
+    if options.list_tags_by_number and options.list_tags_by_alphabet:
+        error_exit(6, "Please use only one list-by-option at once.")
+
+    if options.tag_gardening and (options.list_tags_by_number or options.list_tags_by_alphabet or options.tags or options.remove):
+        error_exit(7, "Please don't use that gardening option together with any other option.")
+
+    if (options.list_tags_by_alphabet or options.list_tags_by_number) and (options.tags or options.interactive or options.remove):
+        error_exit(8, "Please don't use list any option together with add/remove tag options.")
+
+    logging.debug("extracting list of files ...")
+    logging.debug("len(args) [%s]" % str(len(args)))
+
+    files = extract_filenames_from_argument(args)
+
+    logging.debug("%s filenames found: [%s]" % (str(len(files)), '], ['.join(files)))
+
+    tags_from_userinput = []
+    vocabulary = locate_and_parse_controlled_vocabulary(os.getcwdu())
+
+    if len(args) < 1 and not (options.list_tags_by_alphabet or options.list_tags_by_number or options.list_unknown_tags or options.tag_gardening):
+        error_exit(5, "Please add at least one file name as argument")
+
+    if options.list_tags_by_alphabet:
+        logging.debug("handling option list_tags_by_alphabet")
+        list_tags_by_alphabet()
+
+    elif options.list_tags_by_number:
+        logging.debug("handling option list_tags_by_number")
+        list_tags_by_number()
+
+    elif options.list_unknown_tags:
+        logging.debug("handling option list_unknown_tags")
+        list_unknown_tags()
+
+    elif options.tag_gardening:
+        logging.debug("handling option for tag gardening")
+        handle_tag_gardening(vocabulary)
+
+    elif options.interactive or not options.tags:
+
+        completionhint = u''
+
+        if len(args) < 1:
+            error_exit(5, "Please add at least one file name as argument")
+
+        tags_from_filenames_of_arguments_dict = {}
+        upto9_tags_from_filenames_of_same_dir_list = []
+
+        ## look out for .filetags file and add readline support for tag completion if found with content
+        if options.remove:
+            ## vocabulary for completing tags is current tags of files
+            for file in files:
+                ## add tags so that list contains all unique tags:
+                for newtag in extract_tags_from_filename(file):
+                    add_tag_to_countdict(newtag, tags_from_filenames_of_arguments_dict)
+
+            vocabulary = sorted(tags_from_filenames_of_arguments_dict.keys())
+            upto9_tags_from_filenames_of_arguments_list = sorted(get_upto_nine_keys_of_dict_with_highest_value(tags_from_filenames_of_arguments_dict))
+        else:
+            if files:
+
+                upto9_tags_from_filenames_of_same_dir_list = sorted(get_upto_nine_keys_of_dict_with_highest_value(get_tags_from_files_and_subfolders(startdir=os.path.dirname(os.path.abspath(files[0])))))
+            vocabulary = sorted(locate_and_parse_controlled_vocabulary(args[0]))
+
+        if vocabulary:
+
+            assert(vocabulary.__class__ == list)
+
+            ## NOTE(review): the import of readline is commented out at the top of this file
+            # Register our completer function
+            readline.set_completer(SimpleCompleter(vocabulary).complete)
+
+            # Use the tab key for completion
+            readline.parse_and_bind('tab: complete')
+
+            completionhint = u'; complete %s tags with TAB' % str(len(vocabulary))
+
+        logging.debug("len(args) [%s]" % str(len(args)))
+        logging.debug("args %s" % str(args))
+
+        print "                 "
+        print "Please enter tags, separated by \"" + BETWEEN_TAG_SEPARATOR + "\"; abort with Ctrl-C" + \
+            completionhint
+        print "                     "
+        print "        ,---------.  "
+        print "        |  ?     o | "
+        print "        `---------'  "
+        print "                     "
+
+        if options.remove:
+            logging.info("Interactive mode: tags get REMOVED from file names ...")
+            if len(upto9_tags_from_filenames_of_arguments_list) > 0:
+                print_tag_shortcut_with_numbers(upto9_tags_from_filenames_of_arguments_list, tags_get_added=False)
+        else:
+            logging.debug("Interactive mode: tags get ADDED to file names ...")
+            if upto9_tags_from_filenames_of_same_dir_list:
+                print_tag_shortcut_with_numbers(upto9_tags_from_filenames_of_same_dir_list, tags_get_added=True)
+
+
+        ## interactive: ask for list of tags
+        logging.debug("interactive mode: asking for tags ...")
+
+        entered_tags = raw_input('Tags: ').strip()
+
+        tags_from_userinput = extract_tags_from_argument(entered_tags)
+
+        if not tags_from_userinput:
+            logging.info("no tags given, exiting.")
+            sys.stdout.flush()
+            sys.exit(0)
+
+        if options.remove:
+            if len(tags_from_userinput) == 1 and len(upto9_tags_from_filenames_of_arguments_list) > 0:
+                ## check if user entered number shortcuts for tags to be removed:
+                tags_from_userinput = check_for_possible_shortcuts_in_entered_tags(tags_from_userinput, upto9_tags_from_filenames_of_arguments_list)
+
+            logging.info("removing tags \"%s\" ..." % str(BETWEEN_TAG_SEPARATOR.join(tags_from_userinput)))
+        else:
+            if len(tags_from_userinput) == 1 and upto9_tags_from_filenames_of_same_dir_list:
+                ## check if user entered number shortcuts for tags to be added:
+                tags_from_userinput = check_for_possible_shortcuts_in_entered_tags(tags_from_userinput, upto9_tags_from_filenames_of_same_dir_list)
+            logging.info("adding tags \"%s\" ..." % str(BETWEEN_TAG_SEPARATOR.join(tags_from_userinput)))
+
+    else:
+        ## non-interactive: extract list of tags
+        logging.debug("non-interactive mode: extracting tags from argument ...")
+
+        tags_from_userinput = extract_tags_from_argument(options.tags)
+
+        if not tags_from_userinput:
+            ## FIXXME: check: can this be the case?
+            logging.info("no tags given, exiting.")
+            sys.stdout.flush()
+            sys.exit(0)
+
+    logging.debug("tags found: [%s]" % '], ['.join(tags_from_userinput))
+
+    logging.debug("iterate over files ...")
+    for filename in files:
+        ## byte strings are decoded as UTF-8 before they are handled
+        if filename.__class__ == str:
+            filename = unicode(filename, "UTF-8")
+        handle_file(filename, tags_from_userinput, options.remove, options.dryrun)
+
+    logging.debug("successfully finished.")
+
+
+## run main() only if this file is executed as a script;
+## an abort via Ctrl-C is logged instead of ending with a traceback
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+
+        logging.info("Received KeyboardInterrupt")
+
+## END OF FILE #################################################################
+
+#end
--- a/guessfilename_test.py
+++ b/guessfilename_test.py
@ -0,0 +1,58 @@
+#!/usr/bin/env python
+# -*- coding: utf-8; mode: python; -*-
+# Time-stamp: <2016-03-05 11:54:37 vk>
+
+import unittest
+from guessfilename import GuessFilename
+
+class TestGuessFilename(unittest.TestCase):
+    """Unit tests for the methods of the class GuessFilename."""
+
+    logging = None
+    guess_filename = GuessFilename()
+
+    def setUp(self):
+        verbose = False
+        quiet = False
+
+
+    def tearDown(self):
+        pass
+
+
+    def test_split_filename_entities(self):
+
+        ## pairs of (file name, expected (date/time/duration, name, tags, extension))
+        samples = (
+            (u"2016-03-05T23.59.42--2017-04-06T12.13.14 foo bar -- eins zwei.extension",
+             (u"2016-03-05T23.59.42--2017-04-06T12.13.14", u"foo bar", [u"eins", u"zwei"], u"extension")),
+            (u"2016-03-05T23.59--2017-04-06T12.13 foo - bar.baz - zum -- eins zwei.extension",
+             (u"2016-03-05T23.59--2017-04-06T12.13", u"foo - bar.baz - zum", [u"eins", u"zwei"], u"extension")),
+            (u"2016-03-05T23.59.42 foo - bar.baz - zum -- eins zwei.extension",
+             (u"2016-03-05T23.59.42", u"foo - bar.baz - zum", [u"eins", u"zwei"], u"extension")),
+            (u"2016-03-05T23.59.42 foo bar -- eins zwei.extension",
+             (u"2016-03-05T23.59.42", u"foo bar", [u"eins", u"zwei"], u"extension")),
+            (u"2016-03-05T23.59--2017-04-06T12.13 foo bar -- eins zwei.extension",
+             (u"2016-03-05T23.59--2017-04-06T12.13", u"foo bar", [u"eins", u"zwei"], u"extension")),
+            (u"2016-03-05T23.59--2017-04-06T12.13 foo bar.extension",
+             (u"2016-03-05T23.59--2017-04-06T12.13", u"foo bar", [], u"extension")),
+            (u"2016-03-05T23.59 foo bar.extension",
+             (u"2016-03-05T23.59", u"foo bar", [], u"extension")),
+            (u"foo bar.extension",
+             (None, u"foo bar", [], u"extension")),
+            (u"foo bar",
+             (None, u"foo bar", [], None)),
+            (u"foo.bar",
+             (None, u"foo", [], u"bar")),
+            (u"foo -- bar",
+             (None, u"foo", [u"bar"], None)),
+            (u"foo -- bar.baz",
+             (None, u"foo", [u"bar"], u"baz")),
+            (u" -- ",
+             (None, u' -- ', [], None)),
+            (u".",
+             (None, u'.', [], None)),
+        )
+
+        for given_filename, expected_entities in samples:
+            self.assertEqual(self.guess_filename.split_filename_entities(given_filename),
+                             expected_entities)
+
+
+
+# Local Variables:
+# mode: flyspell
+# eval: (ispell-change-dictionary "en_US")
+# End:
--- a/guessfilename_test.sh
+++ b/guessfilename_test.sh
@ -0,0 +1 @@
+pytest guessfilename_test.py