Merge pull request #663 from rhertzog/fact-parser-changes

Simplify parsing of facts with a clear tag barrier
projecthamster · Nov 17, 2023 · fe26c5a · fe26c5a
2 parents 8af3927 + 2b5c059
commit fe26c5a
Show file tree

Hide file tree

Showing 4 changed files with 146 additions and 93 deletions.
diff --git a/help/C/input.page b/help/C/input.page
@@ -11,20 +11,25 @@
     To start tracking, press the <gui style="button">+</gui> button,
     type in the activity name in the entry,
     and hit the <key>Enter</key> key.
-    To specify more detail on the fly, use this syntax: 
-    `time_info activity name @category,, some description #tag #other tag with spaces`.
+    To specify more detail on the fly, use this syntax:
+    `time_info activity name@category, some description, #tag #other tag with spaces`.
 </p>
 
 <steps>
     <item><p>Specify specific times as `13:10-13:45`, and "started 5 minutes ago" as `-5`.</p></item>
     <item><p>Next comes the activity name</p></item>
     <item><p>Place the category after the activity name, and start it with an at sign `@`, e.g. `@garden`</p></item>
-    <item><p>If you want to add a description, add a double comma `,,`.</p></item>
-    <item><p>The description is just freeform text immediately after the double comma, and runs until the end of the string or until the beginning of tags.</p></item>
-    <item><p>Place tags at the end, and start each tag with a hash mark `#`.</p></item>
-    <item><p>A double comma `,,` can also be placed to indicate the beginning of tags. Otherwise any `#` in the activity, category or description would be interpreted as a starting a tag.</p></item>
+    <item><p>If you want to add a description and/or tags, add a comma `,`.</p></item>
+    <item><p>The description is just freeform text immediately after the comma, and runs until the end of the string or until the beginning of tags.</p></item>
+    <item><p>Place tags at the end right after a comma, and start each tag with a hash mark `#`.</p></item>
 </steps>
 
+<p>Note that you can embed single-word tags in the description just by
+prepending a hash to any word. Sequences of alphanumeric characters that
+start with a digit are not considered words, so if you use
+<code>Fix bug #123</code> as your description, the hash will be kept
+and no additional tag named <code>123</code> will be created.</p>
+
 <p>
     A few examples:
 </p>
@@ -33,33 +38,41 @@
     <p>Forgot to note the important act of watering flowers over lunch.</p>
 </example>
 <example>
-    <code>tomatoes@garden,, digging holes</code>
+    <code>tomatoes@garden, digging holes</code>
     <p>
         Need more tomatoes in the garden. Digging holes is purely informational,
         so added it as a description.
     </p>
 </example>
 <example>
-    <code>-7 existentialism,, thinking about the vastness of the universe</code>
+    <code>-7 existentialism, thinking about the vastness of the universe</code>
     <p>
         Corrected information by informing application that I've been
         doing something else for the last seven minutes.
     </p>
 </example>
+<example>
+    <code>Hamster@Software, doing some #reviews of pull requests</code>
+    <code>Hamster@Software, doing some reviews of pull requests, #reviews</code>
+    <p>
+        Both syntaxes attach the <code>reviews</code> tag. Single-word tags can
+        be embedded in the description, except as its first word: a hash placed
+        right after the comma starts the tag list instead.
+    </p>
+</example>
 
 <note style="info">
     <title>Time input</title>
     <list>
         <item>
             <p>Relative times work both for <var>start</var> and <var>end</var>,
-            provided they are preceded by an explicit sign, 
+            provided they are preceded by an explicit sign,
             and <em>separated by a space</em>.</p>
             <p><code>-30 -10</code> means started 30 minutes ago and stopped 10 minutes ago.</p>
             <p><code>-5 +30</code> means started 5 minutes ago and will stop in 30 minutes
             (duration of 35 minutes).</p>
         </item>
         <item>
-            <p>Duration can be given instead of <var>end</var>, 
+            <p>Duration can be given instead of <var>end</var>,
             as 1, 2 or 3 digits without any sign.</p>
             <p><code>-50 30</code> means started 50 minutes ago and lasted 30 minutes
             (so it ended 20 minutes ago).</p>

diff --git a/src/hamster/lib/fact.py b/src/hamster/lib/fact.py
@@ -14,7 +14,7 @@
 from copy import deepcopy
 
 from hamster.lib import datetime as dt
-from hamster.lib.parsing import parse_fact
+from hamster.lib.parsing import parse_fact, get_tags_from_description
 
 
 class FactError(Exception):
@@ -186,20 +186,17 @@ def serialized_name(self):
             res += "@%s" % self.category
 
         if self.description:
-            res += ',, '
+            res += ', '
             res += self.description
 
-        if ('#' in self.activity
-            or '#' in self.category
-            or '#' in self.description
-           ):
-            # need a tag barrier
-            res += ",, "
-
         if self.tags:
-            # double comma is a left barrier for tags,
-            # which is useful only if previous fields contain a hash
-            res += " %s" % " ".join("#%s" % tag for tag in self.tags)
+            # Don't duplicate tags that are already in the description
+            seen_tags = get_tags_from_description(self.description)
+            remaining_tags = [
+                tag for tag in self.tags if tag not in seen_tags
+            ]
+            if remaining_tags:
+                res += ", %s" % " ".join("#%s" % tag for tag in remaining_tags)
         return res
 
     def serialized(self, range_pos="head", default_day=None):

diff --git a/src/hamster/lib/parsing.py b/src/hamster/lib/parsing.py
@@ -7,8 +7,7 @@
 
 
 # separator between times and activity
-ACTIVITY_SEPARATOR = "\s+"
-
+activity_separator = r"\s+"
 
 # match #tag followed by any space or # that will be ignored
 # tag must not contain '#' or ','
@@ -17,27 +16,39 @@
     (?P<tag>
         [^#,]+  # (anything but hash or comma)
     )
-    \s*         # maybe spaces
-                # forbid double comma (tag can not be before the tags barrier):
-    ,?          # single comma (or none)
-    \s*         # maybe space
-    $           # end of text
+""", flags=re.VERBOSE)
+
+tags_in_description = re.compile(r"""
+    \#
+    (?P<tag>
+        [a-zA-Z] # Starts with an alphabetic character (digits excluded)
+        [^\s]+   # followed by anything except spaces
+    )
 """, flags=re.VERBOSE)
 
 tags_separator = re.compile(r"""
-    (,{0,2})    # 0, 1 or 2 commas
+    ,{1,2}      # 1 or 2 commas
+    \s*         # maybe spaces
+    (?=\#)      # hash character (start of first tag, doesn't consume it)
+""", flags=re.VERBOSE)
+
+description_separator = re.compile(r"""
+    ,+          # 1 or more commas
     \s*         # maybe spaces
-    $           # end of text
 """, flags=re.VERBOSE)
 
 
+def get_tags_from_description(description):
+    return list(re.findall(tags_in_description, description))
+
+
 def parse_fact(text, range_pos="head", default_day=None, ref="now"):
     """Extract fact fields from the string.
 
     Returns found fields as a dict.
 
     Tentative syntax (not accurate):
-    start [- end_time] activity[@category][,, description][,,]{ #tag}
+    start [- end_time] activity[@category][, description][,]{ #tag}
     According to the legacy tests, # were allowed in the description
     """
 
@@ -50,44 +61,32 @@ def parse_fact(text, range_pos="head", default_day=None, ref="now"):
     # datetimes
     # force at least a space to avoid matching 10.00@cat
     (start, end), remaining_text = dt.Range.parse(text, position=range_pos,
-                                                   separator=ACTIVITY_SEPARATOR,
-                                                   default_day=default_day)
+                                                  separator=activity_separator,
+                                                  default_day=default_day)
     res["start_time"] = start
     res["end_time"] = end
 
     # tags
-    # Need to start from the end, because
-    # the description can hold some '#' characters
-    tags = []
-    while True:
-        # look for tags separators
-        # especially the tags barrier
-        m = re.search(tags_separator, remaining_text)
-        remaining_text = remaining_text[:m.start()]
-        if m.group(1) == ",,":
-            # tags  barrier found
-            break
-
-        # look for tag
-        m = re.search(tag_re, remaining_text)
-        if m:
-            tag = m.group('tag').strip()
-            # strip the matched string (including #)
-            remaining_text = remaining_text[:m.start()]
-            tags.append(tag)
-        else:
-            # no tag
-            break
-
-    # put tags back in input order
-    res["tags"] = list(reversed(tags))
+    split = re.split(tags_separator, remaining_text, 1)
+    remaining_text = split[0]
+    tags_part = split[1] if len(split) > 1 else None
+    if tags_part:
+        tags = list(map(lambda x: x.strip(), re.findall(tag_re, tags_part)))
+    else:
+        tags = []
 
     # description
-    # first look for double comma (description hard left boundary)
-    head, sep, description = remaining_text.partition(",,")
+    # first look for comma (description hard left boundary)
+    split = re.split(description_separator, remaining_text, 1)
+    head = split[0]
+    description = split[1] if len(split) > 1 else ""
+    # Extract tags from description, put them before other tags
+    tags = get_tags_from_description(description) + tags
     res["description"] = description.strip()
     remaining_text = head.strip()
 
+    res["tags"] = tags
+
     # activity
     split = remaining_text.rsplit('@', maxsplit=1)
     activity = split[0]