SequenceSearchDef: report file search progress on debug log
Shorten the searched file name to just parent_folder/filename and
also print the search def tags before the search, so we can tell
which search is currently being run.

Signed-off-by: Mustafa Kemal Gilor <[email protected]>
xmkg committed May 9, 2024
1 parent f2c5882 commit 0337068
Showing 1 changed file with 47 additions and 3 deletions.
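For reference, the parent_folder/filename shortening the commit message describes boils down to a two-component pathlib slice. A minimal standalone sketch (the sample path is invented for illustration, not taken from the commit):

from pathlib import Path

# Keep only the last two path components, mirroring the
# filename_short logic added to _run_search() below.
name = "/var/log/nova/nova-compute.log"  # hypothetical input path
print(Path(*Path(name).parts[-2:]))      # -> nova/nova-compute.log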
searchkit/search.py
@@ -11,6 +11,7 @@
 import threading
 import time
 import uuid
+from pathlib import Path
 from functools import cached_property
 from collections import namedtuple, UserDict, UserList

@@ -970,6 +971,38 @@ def _process_sequence_results(self, sequence_results, current_ln):

                 self.put_result(r)
 
+    @classmethod
+    def _remove_common_string_parts(cls, orig_list):
+        """Remove the common prefix and suffix from all
+        elements in orig_list.
+
+        Example:
+          Input:  ["openstack.a.b.c.test", "openstack.d.e.test"]
+          Output: {"a.b.c", "d.e"}
+        """
+        new_list = []
+
+        for elem in orig_list:
+            lcp = lcs = ""
+            # Determine the longest common prefix/suffix against
+            # every other element.
+            for elem2 in orig_list:
+                if elem == elem2:
+                    # Skip self
+                    continue
+                common_prefix = os.path.commonprefix([elem, elem2])
+                common_suffix = os.path.commonprefix(
+                    [elem[::-1], elem2[::-1]])
+                if len(common_prefix) > len(lcp):
+                    lcp = common_prefix
+                if len(common_suffix) > len(lcs):
+                    lcs = common_suffix[::-1]  # reverse it back
+            # Strip the longest common prefix/suffix. The emptiness
+            # checks matter: elem[:-0] would yield "" and drop the tag.
+            if lcp and elem.startswith(lcp):
+                elem = elem[len(lcp):]
+            if lcs and elem.endswith(lcs):
+                elem = elem[:-len(lcs)]
+            new_list.append(elem)
+        return set(new_list)
+
     def _run_search(self, fd):
         """
         @param fd: open file descriptor
@@ -978,16 +1011,27 @@ def _run_search(self, fd):
         sequence_results = SequenceSearchResults()
         search_ids = set([s.id for s in self.search_defs])  # noqa, pylint: disable=R1718
         offset = self.constraints_manager.apply_global(search_ids, fd)
-        log.debug("starting search of %s (offset=%s, pos=%s)", fd.name, offset,
-                  fd.tell())
+        log.debug("starting search of %s (offset=%s, pos=%s) for tags [%s]",
+                  fd.name, offset, fd.tell(),
+                  self._remove_common_string_parts(
+                      [s.tag for s in self.search_defs]))
+
+        # Get the total file size (needed for calculating the search progress)
+        before_pos = fd.tell()
+        fd.seek(0, 2)  # 2 == os.SEEK_END
+        eof_offset = fd.tell()
+        fd.seek(before_pos)
+
         runnable = {s.id: _runnable
                     for s, _runnable in self.search_defs.items()}
         ln = 0
+        filename_short = Path(*Path(fd.name).parts[-2:])
         # NOTE: line numbers start at 1 hence offset + 1
         for ln, line in enumerate(fd, start=offset + 1):
             # This could be helpful to show progress for large files
             if ln % 100000 == 0:
-                log.debug("%s lines searched in %s", ln, fd.name)
+                log.debug("%s lines searched in %s (%.3f%%)", ln,
+                          filename_short,
+                          (fd.tell() / eof_offset) * 100.0)
 
             self.stats['lines_searched'] += 1
             line = line.decode("utf-8", **self.decode_kwargs)
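For context on the diff above: _remove_common_string_parts() builds on the fact that os.path.commonprefix() is a plain character-wise prefix function (despite living in os.path), so applying it to reversed strings yields the longest common suffix. A standalone sketch with made-up tags:

import os.path

a, b = "openstack.a.b.c.test", "openstack.d.e.test"  # hypothetical tags

lcp = os.path.commonprefix([a, b])                    # 'openstack.'
lcs = os.path.commonprefix([a[::-1], b[::-1]])[::-1]  # '.test'

# Strip both ends, as the helper does for each element.
print(a[len(lcp):-len(lcs)])  # -> a.b.c
print(b[len(lcp):-len(lcs)])  # -> d.e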
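Similarly, the progress percentage comes from a seek-to-end size measurement taken once before the scan starts. A runnable sketch of that idiom, with io.BytesIO standing in for the open binary-mode log file:

import io

fd = io.BytesIO(b"some line\n" * 4)  # stand-in for an open log file

# Measure the total size without disturbing the current read position.
before_pos = fd.tell()
fd.seek(0, 2)           # 2 == io.SEEK_END
eof_offset = fd.tell()  # total size in bytes
fd.seek(before_pos)

for ln, _line in enumerate(fd, start=1):
    pct = (fd.tell() / eof_offset) * 100.0
    print(f"{ln} lines searched ({pct:.3f}%)")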
