Toufool · Avasam · Jun 16, 2024 · Jan 17, 2024 · Jan 17, 2024 · Jan 17, 2024
diff --git a/README.md b/README.md
@@ -51,6 +51,7 @@ This program can be used to automatically start, split, and reset your preferred
   - Wayland is not currently supported
   - WSL2/WSLg requires an additional Desktop Environment, external X11 server, and/or systemd
 - Python 3.10+ (Not required for normal use. Refer to the [build instructions](/docs/build%20instructions.md) if you'd like run the application directly in Python).
+- Tesseract-OCR (optional; required for text recognition as an alternative comparison method). See https://github.com/UB-Mannheim/tesseract/wiki for installation instructions.
 
 ## OPTIONS
 
@@ -226,6 +227,38 @@ You can have one (and only one) image with the keyword `reset` in its name. Auto
 
 The Start Image is similar to the Reset Image. You can only have one Start Image with the keyword `start_auto_splitter`.You can reload the image using the "`Reload Start Image`" button. The pause time is the amount of seconds AutoSplit will wait before starting comparisons of the first split image. Delay times will be used to delay starting your timer after the threshold is met.
 
+### Text Recognition (OCR)
+
+You can use text recognition as an alternative comparison method.
+To use this feature you need to place a text file (.txt) in your splits folder instead of an image file.
+The text file uses TOML syntax and lists the expected text to look for along with the area of the capture to search.
+
+An example file name and content could look like this:
+
+Filename: `001_start_auto_splitter.txt`
+
+Content:
+
+```
+texts = ["complete any 2 encounters"]
+top_left = 275
+top_right = 540
+bottom_left = 70
+bottom_right = 95
+fps_limit = 1
+```
+
+The `texts` field is an array and can take more than one text to look for:
+
+```
+texts = ["look for me", "or this text"]
+```
+
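+The `top_left`, `top_right`, `bottom_left` and `bottom_right` options define the rectangle (in pixels of the captured image) where the text you are looking for is expected to appear: `top_left` and `top_right` are its left and right X coordinates, `bottom_left` and `bottom_right` are its top and bottom Y coordinates.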
+
+Note: This method can cause high CPU usage at the standard comparison FPS. You should therefore limit the comparison FPS to 1 or 2 when using this method, via the `fps_limit` option (it defaults to 1 for text files when omitted).
+The size of the selected rectangle can also impact the CPU load (bigger = more CPU load).
+
 ### Profiles
 
 <!-- TODO: Profiles are saved under `%appdata%\AutoSplit\profiles` and -->

diff --git a/scripts/requirements.txt b/scripts/requirements.txt
@@ -4,6 +4,7 @@
 #
 # Dependencies:
 git+https://github.com/boppreh/keyboard.git#egg=keyboard  # Fix install on macos and linux-ci https://github.com/boppreh/keyboard/pull/568
+Levenshtein
 numpy>=1.26  # Python 3.12 support
 opencv-python-headless>=4.9.0.80  # Typing fixes
 packaging

diff --git a/src/AutoSplit.py b/src/AutoSplit.py
@@ -307,7 +307,10 @@ def __reload_start_image(self, started_by_button: bool = False, wait_for_delay:
         self.highest_similarity = 0.0
         self.reset_highest_similarity = 0.0
         self.split_below_threshold = False
-        self.timer_start_image.start(int(ONE_SECOND / self.settings_dict["fps_limit"]))
+        start_image_fps = self.settings_dict["fps_limit"]
+        if self.start_image.fps != 0:
+            start_image_fps = self.start_image.fps
+        self.timer_start_image.start(int(ONE_SECOND / start_image_fps))
 
         QApplication.processEvents()
 
@@ -682,8 +685,12 @@ def __similarity_threshold_loop(self, number_of_split_images: int, dummy_splits_
                 self.undo_split_button.setEnabled(self.split_image_number != 0)
             QApplication.processEvents()
 
+            fps = self.settings_dict["fps_limit"]
+            if self.split_image.fps != 0:
+                fps = self.split_image.fps
+
             # Limit the number of time the comparison runs to reduce cpu usage
-            frame_interval = 1 / self.settings_dict["fps_limit"]
+            frame_interval = 1 / fps
             # Use a time delta to have a consistant check interval
             wait_delta_ms = int((frame_interval - (time() - start) % frame_interval) * ONE_SECOND)
 
@@ -867,7 +874,10 @@ def __update_split_image(self, specific_image: AutoSplitImage | None = None):
 
         # Get split image
         self.split_image = specific_image or self.split_images_and_loop_number[0 + self.split_image_number][0]
-        if is_valid_image(self.split_image.byte_array):
+        if self.split_image.ocr:
+            text = "\nor\n".join(self.split_image.texts)
+            self.current_split_image.setText(f"Looking for OCR text:\n{text}")
+        elif is_valid_image(self.split_image.byte_array):
             set_preview_image(self.current_split_image, self.split_image.byte_array)
 
         self.current_image_file_label.setText(self.split_image.filename)

diff --git a/src/AutoSplitImage.py b/src/AutoSplitImage.py
@@ -5,6 +5,7 @@
 
 import cv2
 import numpy as np
+import tomllib
 from cv2.typing import MatLike
 
 import error_messages
@@ -37,16 +38,23 @@
     filename: str
     flags: int
     loops: int
+    fps: int
     image_type: ImageType
     byte_array: MatLike | None = None
     mask: MatLike | None = None
+    texts: list[str]
+    ocr: bool
     # This value is internal, check for mask instead
     _has_transparency = False
     # These values should be overriden by some Defaults if None. Use getters instead
     __delay_time: float | None = None
     __comparison_method: int | None = None
     __pause_time: float | None = None
     __similarity_threshold: float | None = None
+    __x: int
+    __xx: int
+    __y: int
+    __yy: int
 
     def get_delay_time(self, default: "AutoSplit | int"):
         """Get image's delay time or fallback to the default value from spinbox."""
@@ -89,7 +97,18 @@
         self.__comparison_method = comparison_method_from_filename(self.filename)
         self.__pause_time = pause_from_filename(self.filename)
         self.__similarity_threshold = threshold_from_filename(self.filename)
-        self.__read_image_bytes(path)
+        self.__x = 0
+        self.__xx = 0
+        self.__y = 0
+        self.__yy = 0
+        self.texts = list[str]()
+        self.fps = 0
+        self.ocr = False
+        if path.endswith("txt"):
+            self.ocr = True
+            self.__parse_text_file(path)
+        else:
+            self.__read_image_bytes(path)
 
         if START_KEYWORD in self.filename:
             self.image_type = ImageType.START
@@ -98,6 +117,18 @@
         else:
             self.image_type = ImageType.SPLIT
 
+    def __parse_text_file(self, path: str):
+        """Load the expected texts, the crop rectangle and the optional FPS limit from a TOML file."""
+        with open(path, "rb") as toml_file:
+            config = tomllib.load(toml_file)
+        # Despite their names, the first two keys bound the X axis and the last two bound the Y axis.
+        self.texts = config["texts"]
+        self.__x = config["top_left"]
+        self.__xx = config["top_right"]
+        self.__y = config["bottom_left"]
+        self.__yy = config["bottom_right"]
+        self.fps = config.get("fps_limit", 1)
+
     def __read_image_bytes(self, path: str):
         image = cv2.imread(path, cv2.IMREAD_UNCHANGED)
         if not is_valid_image(image):
@@ -140,7 +171,13 @@
         default: "AutoSplit | int",
         capture: MatLike | None,
     ):
-        """Compare image with capture using image's comparison method. Falls back to combobox."""
+        """
+        Compare image with capture using image's comparison method. Falls back to combobox.
+        For OCR text files: extract image text from rectangle position and compare it with the expected string.
+        """
+        if self.ocr:
+            return extract_and_compare_text(capture[self.__y:self.__yy, self.__x:self.__xx], self.texts)
+
         if not is_valid_image(self.byte_array) or not is_valid_image(capture):
             return 0.0
         resized_capture = cv2.resize(capture, self.byte_array.shape[1::-1])
@@ -155,6 +192,7 @@
 
 
 if True:
+    from compare import extract_and_compare_text
     from split_parser import (
         comparison_method_from_filename,
         delay_time_from_filename,

diff --git a/src/compare.py b/src/compare.py
@@ -1,6 +1,9 @@
+import subprocess
 from math import sqrt
+from os import environ
 
 import cv2
+import Levenshtein
 import numpy as np
 from cv2.typing import MatLike
 from scipy import fft
@@ -13,6 +16,10 @@
 RANGES = [0, MAXRANGE, 0, MAXRANGE, 0, MAXRANGE]
 MASK_SIZE_MULTIPLIER = ColorChannel.Alpha * MAXBYTE * MAXBYTE
 
+# TODO: use PATH variable
+TESSERACT_CMD = [r"C:\Program Files\Tesseract-OCR\tesseract", "-", "-", "--oem", "1", "--psm", "6"]
+DEFAULT_ENCODING = "utf-8"
+
 
 def compare_histograms(source: MatLike, capture: MatLike, mask: MatLike | None = None):
     """
@@ -126,6 +133,59 @@
     return 1 - (hash_diff / 64.0)
 
 
+# copied from https://github.com/madmaze/pytesseract
+def subprocess_args():
+    """Build the `subprocess.Popen` keyword arguments used to run Tesseract with piped stdin/stdout."""
+    # See https://github.com/pyinstaller/pyinstaller/wiki/Recipe-subprocess
+    # for reference and comments.
+    startupinfo = None
+    if hasattr(subprocess, "STARTUPINFO"):
+        # STARTUPINFO is only available on Windows; it is used here to hide the console window.
+        startupinfo = subprocess.STARTUPINFO()
+        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+        startupinfo.wShowWindow = subprocess.SW_HIDE
+
+    return {
+        "stdin": subprocess.PIPE,
+        "stdout": subprocess.PIPE,
+        "stderr": subprocess.DEVNULL,
+        "startupinfo": startupinfo,
+        "env": environ,
+    }
+
+
+def run_tesseract(capture: MatLike):
+    """Pipe the capture to Tesseract as PNG bytes and return the decoded text it writes to stdout."""
+    encoded_png = np.array(cv2.imencode(".png", capture)[1]).tobytes()
+    stdout_bytes = subprocess.Popen(TESSERACT_CMD, **subprocess_args()).communicate(input=encoded_png)[0]
+    return stdout_bytes.decode(DEFAULT_ENCODING)
+
+
+def extract_and_compare_text(capture: MatLike, texts: list[str]):
+    """
+    Compares the extracted text of the given image and returns the similarity between the two texts.
+    The best match of all texts is returned. Matching is case-insensitive.
+
+    @param capture: Image of any given shape as a numpy array
+    @param texts: a list of strings to match for
+    @return: The similarity between the text in the image and the text supplied as a number 0 to 1.
+    """
+    # if the string is found 1:1 in the string extracted from the image a 1 is returned.
+    # otherwise the levenshtein ratio is calculated between the two strings and gets returned.
+    image_string = run_tesseract(capture).lower().strip()
+
+    ratio = 0.0
+    for text in texts:
+        # The OCR output is lowercased and stripped above, so normalize the expected text the same way.
+        expected = text.lower().strip()
+        if not expected:
+            continue  # "" is a substring of every string and would always report a perfect match
+        if expected in image_string:
+            return 1.0
+        ratio = max(ratio, Levenshtein.ratio(expected, image_string))
+    return ratio
+
+
 def __compare_dummy(*_: object):
     return 0.0
 

diff --git a/src/split_parser.py b/src/split_parser.py
@@ -208,7 +208,7 @@ def parse_and_validate_images(autosplit: "AutoSplit"):
     else:
         for image in split_images:
             # Test for image without transparency
-            if not is_valid_image(image.byte_array):
+            if not image.ocr and not is_valid_image(image.byte_array):
 
                 def image_validity(filename: str):
                     return lambda: error_messages.image_validity(filename)