SequenceSearchDef: report file search progress on debug log
Shorten the searched file name to just parent_folder/filename and
also print the search def tags before the search, so we can tell
which search is currently being run.

Signed-off-by: Mustafa Kemal Gilor <[email protected]>
xmkg committed May 9, 2024
1 parent f2c5882 commit 0337068
Showing 1 changed file with 47 additions and 3 deletions.
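For reference, the parent_folder/filename shortening the commit message describes boils down to a two-component pathlib slice. A minimal standalone sketch (the sample path is invented for illustration, not taken from the commit):

from pathlib import Path

# Keep only the last two path components, mirroring the
# filename_short logic added to _run_search() below.
name = "/var/log/nova/nova-compute.log"  # hypothetical input path
print(Path(*Path(name).parts[-2:]))      # -> nova/nova-compute.log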
searchkit/search.py
@@ -11,6 +11,7 @@
 import threading
 import time
 import uuid
+from pathlib import Path
 from functools import cached_property
 from collections import namedtuple, UserDict, UserList

@@ -970,6 +971,38 @@ def _process_sequence_results(self, sequence_results, current_ln):

                 self.put_result(r)
 
+    @classmethod
+    def _remove_common_string_parts(cls, orig_list):
+        """Remove the common prefix and suffix from all
+        elements in orig_list.
+
+        Example:
+          Input:  ["openstack.a.b.c.test", "openstack.d.e.test"]
+          Output: {"a.b.c", "d.e"}
+        """
+        new_list = []
+
+        for elem in orig_list:
+            lcp = lcs = ""
+            # Determine the longest common prefix/suffix against
+            # every other element.
+            for elem2 in orig_list:
+                if elem == elem2:
+                    # Skip self
+                    continue
+                common_prefix = os.path.commonprefix([elem, elem2])
+                common_suffix = os.path.commonprefix(
+                    [elem[::-1], elem2[::-1]])
+                if len(common_prefix) > len(lcp):
+                    lcp = common_prefix
+                if len(common_suffix) > len(lcs):
+                    lcs = common_suffix[::-1]  # reverse it back
+            # Strip the longest common prefix/suffix. The emptiness
+            # checks matter: elem[:-0] would yield "" and drop the tag.
+            if lcp and elem.startswith(lcp):
+                elem = elem[len(lcp):]
+            if lcs and elem.endswith(lcs):
+                elem = elem[:-len(lcs)]
+            new_list.append(elem)
+        return set(new_list)
+
     def _run_search(self, fd):
         """
         @param fd: open file descriptor
@@ -978,16 +1011,27 @@ def _run_search(self, fd):
         sequence_results = SequenceSearchResults()
         search_ids = set([s.id for s in self.search_defs])  # noqa, pylint: disable=R1718
         offset = self.constraints_manager.apply_global(search_ids, fd)
-        log.debug("starting search of %s (offset=%s, pos=%s)", fd.name, offset,
-                  fd.tell())
+        log.debug("starting search of %s (offset=%s, pos=%s) for tags [%s]",
+                  fd.name, offset, fd.tell(),
+                  self._remove_common_string_parts(
+                      [s.tag for s in self.search_defs]))
+
+        # Get the total file size (needed for calculating the search progress)
+        before_pos = fd.tell()
+        fd.seek(0, 2)  # 2 == os.SEEK_END
+        eof_offset = fd.tell()
+        fd.seek(before_pos)
+
         runnable = {s.id: _runnable
                     for s, _runnable in self.search_defs.items()}
         ln = 0
+        filename_short = Path(*Path(fd.name).parts[-2:])
         # NOTE: line numbers start at 1 hence offset + 1
         for ln, line in enumerate(fd, start=offset + 1):
             # This could be helpful to show progress for large files
             if ln % 100000 == 0:
-                log.debug("%s lines searched in %s", ln, fd.name)
+                log.debug("%s lines searched in %s (%.3f%%)", ln,
+                          filename_short,
+                          (fd.tell() / eof_offset) * 100.0)
 
             self.stats['lines_searched'] += 1
             line = line.decode("utf-8", **self.decode_kwargs)
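For context on the diff above: _remove_common_string_parts() builds on the fact that os.path.commonprefix() is a plain character-wise prefix function (despite living in os.path), so applying it to reversed strings yields the longest common suffix. A standalone sketch with made-up tags:

import os.path

a, b = "openstack.a.b.c.test", "openstack.d.e.test"  # hypothetical tags

lcp = os.path.commonprefix([a, b])                    # 'openstack.'
lcs = os.path.commonprefix([a[::-1], b[::-1]])[::-1]  # '.test'

# Strip both ends, as the helper does for each element.
print(a[len(lcp):-len(lcs)])  # -> a.b.c
print(b[len(lcp):-len(lcs)])  # -> d.e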
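Similarly, the progress percentage comes from a seek-to-end size measurement taken once before the scan starts. A runnable sketch of that idiom, with io.BytesIO standing in for the open binary-mode log file:

import io

fd = io.BytesIO(b"some line\n" * 4)  # stand-in for an open log file

# Measure the total size without disturbing the current read position.
before_pos = fd.tell()
fd.seek(0, 2)           # 2 == io.SEEK_END
eof_offset = fd.tell()  # total size in bytes
fd.seek(before_pos)

for ln, _line in enumerate(fd, start=1):
    pct = (fd.tell() / eof_offset) * 100.0
    print(f"{ln} lines searched ({pct:.3f}%)")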
