
implemented text recognition (ocr) #272

Merged: 29 commits, Jun 16, 2024
Commits
efe7f73
implemented text recognition (ocr)
ston1th Jan 17, 2024
a00dd41
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 17, 2024
ead3463
fixed typo in requirements.txt and make linter happy
ston1th Jan 17, 2024
d36e2d4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 17, 2024
c9766bf
keep pillow on windows and more linter fixes
ston1th Jan 17, 2024
8998360
use pathlib to read file
ston1th Jan 17, 2024
f25fc7e
fix str None comparison
ston1th Jan 17, 2024
2f8d298
rewrite some stuff
ston1th Feb 3, 2024
fddd0ae
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 3, 2024
cd2c212
switch back to tesseract
ston1th Feb 3, 2024
80140cb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 3, 2024
450b983
Merge branch 'dev' into ocr
Avasam Feb 3, 2024
feeb58e
internal logic changes
ston1th Feb 4, 2024
fb8ed6f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 4, 2024
da830c6
import subprocess globally
ston1th Feb 4, 2024
5674088
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 4, 2024
b5f6639
make linter happy
ston1th Feb 4, 2024
f1ba410
fixed typo in docstring
ston1th Feb 4, 2024
d1dfff0
input validation and comparison methods
ston1th Feb 10, 2024
f3c0e3e
fix linter
ston1th Feb 10, 2024
3a3015f
Merge branch 'dev' of https://github.com/Toufool/AutoSplit into ocr
ston1th Mar 21, 2024
75cd0e9
improvements to coordinates and matching methods
ston1th Mar 29, 2024
7be9a0e
fix ruff linter
ston1th Mar 29, 2024
043b6b5
Address my own PR comments and fix linting
Avasam Jun 15, 2024
c0b2920
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 15, 2024
f2b407a
Merge branch 'dev' into ocr
Avasam Jun 15, 2024
9cd0c2b
STARTUPINFO doesn't exist on Linux
Avasam Jun 16, 2024
2f03a90
More explicit platform check
Avasam Jun 16, 2024
797492f
Fix circular imports, mak OCR as clearly experimental
Avasam Jun 16, 2024
64 changes: 64 additions & 0 deletions docs/tutorial.md
@@ -174,6 +174,70 @@ You can have one (and only one) image with the keyword `reset` in its name. Auto

The Start Image is similar to the Reset Image. You can only have one Start Image with the keyword `start_auto_splitter`. You can reload the image using the "`Reload Start Image`" button. The pause time is the number of seconds AutoSplit will wait before starting comparisons of the first split image. Delay times will be used to delay starting your timer after the threshold is met.

### Text Recognition (OCR)

You can use text recognition as an alternative comparison method.

#### Tesseract install

First, you need to install Tesseract and add it to your system or user `PATH` environment variable.
- See <https://tesseract-ocr.github.io/tessdoc/Installation.html> for installation instructions on all platforms.
- For Windows:
1. You can go directly to <https://github.com/UB-Mannheim/tesseract/wiki> to find the installer.
2. If you change the "Destination Folder" during install, then you'll also need to add it to your `PATH` environment variable.

#### Usage

To use this feature, place a text file (`.txt`) in your splits folder instead of an image file.

An example file name and content could look like this:

Filename: `001_start_auto_splitter.txt`

Content:

```toml
texts = ["complete any 2 encounters"]
top_left = [275, 70]
bottom_right = [540, 95]
methods = [0]
fps_limit = 1
```

The `texts` field is an array and can take more than one text to look for:

```toml
texts = ["look for me", "or this text"]
```

Note: comparisons are currently case-insensitive; both the OCR output and your search strings are converted to lowercase before comparing.

The coordinates of the rectangle in which the text is expected to appear are configured as follows:

```toml
top_left = [X, Y]
bottom_right = [X, Y]
```

`top_left` is the top left and `bottom_right` is the bottom right corner of the rectangle.
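Internally, these two corners select a sub-rectangle of the captured frame before it is handed to Tesseract. A minimal sketch with NumPy slicing, using the coordinates from the example above (note the row/column order: Y before X):

```python
import numpy as np

top_left = (275, 70)      # (X, Y)
bottom_right = (540, 95)  # (X, Y)

# A dummy 720p BGR frame standing in for the capture.
frame = np.zeros((720, 1280, 3), dtype=np.uint8)

# NumPy indexes rows (Y) first, then columns (X).
region = frame[top_left[1]:bottom_right[1], top_left[0]:bottom_right[0]]
print(region.shape)  # (25, 265, 3)
```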

Currently there are three comparison methods:

* `0` - uses the Levenshtein distance (the default)
* `1` - checks if the OCR text contains the searched text
* `2` - looks for a perfect 1:1 match

You can also chain multiple comparison methods using the array notation:

```toml
methods = [1, 0]
```

The methods are then checked in the order you defined, and the best match among them wins.
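The selection logic described above can be sketched as follows. This is an illustration rather than the PR's exact code: `difflib.SequenceMatcher` from the standard library stands in for the `Levenshtein.ratio` function the PR actually uses (both return a 0-to-1 similarity, though the exact values differ):

```python
from difflib import SequenceMatcher


def ratio(a: str, b: str) -> float:
    """Method 0 stand-in: fuzzy similarity (the PR uses Levenshtein.ratio)."""
    return SequenceMatcher(None, a, b).ratio()


def submatch(a: str, b: str) -> float:
    """Method 1: 1.0 if the OCR text contains the searched text."""
    return float(a in b)


def one_to_one(a: str, b: str) -> float:
    """Method 2: perfect 1:1 match only."""
    return 1.0 if a == b else 0.0


METHODS = {0: ratio, 1: submatch, 2: one_to_one}


def best_match(texts: list[str], ocr_text: str, methods: list[int]) -> float:
    """Check every text against every method; the best score wins."""
    best = 0.0
    for text in texts:
        for index in methods:
            best = max(best, METHODS[index](text, ocr_text))
            if best == 1.0:
                return best  # can't do better; return early
    return best


print(best_match(["look for me"], "now look for me here", [1, 0]))  # 1.0
```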

Note: This method can cause high CPU usage at the standard comparison FPS. You should therefore limit the comparison FPS to 1 or 2 using the `fps_limit` option when using this method.
The size of the selected rectangle also impacts CPU load (a bigger rectangle means more CPU load).
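The `fps_limit` option translates directly into a per-comparison wait interval in the main loop. A simplified sketch of the timing arithmetic (variable names are illustrative, not the PR's exact code):

```python
from time import monotonic, sleep

fps_limit = 1  # from the split file; 1-2 FPS is recommended for OCR
frame_interval = 1 / fps_limit  # seconds between comparisons

start = monotonic()
sleep(0.05)  # stand-in for one OCR comparison taking ~50 ms
elapsed = monotonic() - start
# Wait only for the remainder of the interval, so the check cadence
# stays consistent regardless of how long the comparison itself took.
wait = frame_interval - elapsed % frame_interval
print(round(frame_interval, 2), 0 < wait <= frame_interval)  # 1.0 True
```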

### Profiles

<!-- TODO: Profiles are saved under `%appdata%\AutoSplit\profiles` and -->
1 change: 1 addition & 0 deletions scripts/requirements.txt
@@ -4,6 +4,7 @@
#
# Dependencies:
git+https://github.com/boppreh/keyboard.git#egg=keyboard # Fix install on macos and linux-ci https://github.com/boppreh/keyboard/pull/568
Levenshtein>=0.25
numpy>=1.26 # Python 3.12 support
opencv-python-headless>=4.9.0.80 # Typing fixes
packaging
11 changes: 8 additions & 3 deletions src/AutoSplit.py
@@ -307,7 +307,8 @@ def __reload_start_image(self, *, started_by_button: bool = False, wait_for_dela
self.highest_similarity = 0.0
self.reset_highest_similarity = 0.0
self.split_below_threshold = False
self.timer_start_image.start(int(ONE_SECOND / self.settings_dict["fps_limit"]))

self.timer_start_image.start(int(ONE_SECOND / self.start_image.get_fps_limit(self)))

QApplication.processEvents()

@@ -689,7 +690,7 @@ def __similarity_threshold_loop(self, number_of_split_images: int, dummy_splits_
QApplication.processEvents()

# Limit the number of time the comparison runs to reduce cpu usage
frame_interval = 1 / self.settings_dict["fps_limit"]
frame_interval = 1 / self.split_image.get_fps_limit(self)
# Use a time delta to have a consistant check interval
wait_delta_ms = int((frame_interval - (time() - start) % frame_interval) * ONE_SECOND)

@@ -873,7 +874,11 @@ def __update_split_image(self, specific_image: AutoSplitImage | None = None):

# Get split image
self.split_image = specific_image or self.split_images_and_loop_number[0 + self.split_image_number][0]
if is_valid_image(self.split_image.byte_array):
if self.split_image.is_ocr:
# TODO: test if setText clears a set image
text = "\nor\n".join(self.split_image.texts)
self.current_split_image.setText(f"Looking for OCR text:\n{text}")
Collaborator:
I don't remember from last time I tested, will this override any previous image being rendered on the QLabel? (let's say someone has a regular split image followed by an OCR split)

ston1th (Author):
Good question, I did not test this. But just to be sure we could just None it and then set the text.

Avasam (Collaborator), Feb 3, 2024:

Yeah, set_preview_image avoids clearing text if you send None by design.
Or you could call self.current_split_image.clear() yourself before setting the text.

If setText already clears the image by itself, then you don't need to do anything.

Collaborator:

I won't block for this with the TODO comment. Just bumping as a reminder we should test it to confirm

elif is_valid_image(self.split_image.byte_array):
set_preview_image(self.current_split_image, self.split_image.byte_array)

self.current_image_file_label.setText(self.split_image.filename)
82 changes: 77 additions & 5 deletions src/AutoSplitImage.py
@@ -5,11 +5,12 @@

import cv2
import numpy as np
import toml
from cv2.typing import MatLike

import error_messages
from compare import check_if_image_has_transparency, get_comparison_method_by_index
from utils import BGR_CHANNEL_COUNT, MAXBYTE, ColorChannel, ImageShape, is_valid_image
from compare import check_if_image_has_transparency, extract_and_compare_text, get_comparison_method_by_index
from utils import BGR_CHANNEL_COUNT, MAXBYTE, TESSERACT_PATH, ColorChannel, ImageShape, is_valid_image

if TYPE_CHECKING:
from AutoSplit import AutoSplit
@@ -40,13 +41,25 @@ class AutoSplitImage:
image_type: ImageType
byte_array: MatLike | None = None
mask: MatLike | None = None
texts: list[str]
# This value is internal, check for mask instead
_has_transparency = False
# These values should be overriden by some Defaults if None. Use getters instead
__delay_time: float | None = None
__comparison_method: int | None = None
__pause_time: float | None = None
__similarity_threshold: float | None = None
__rect: list[int]
__ocr_comparison_methods: list[int]
__fps_limit: int = 0

@property
def is_ocr(self):
"""
Whether a "split image" is actually for Optical Character Recognition,
based on whether there are any text strings to search for.
"""
return bool(self.texts)

def get_delay_time(self, default: "AutoSplit | int"):
"""Get image's delay time or fallback to the default value from spinbox."""
@@ -80,6 +93,12 @@ def get_similarity_threshold(self, default: "AutoSplit | float"):
return default
return default.settings_dict["default_similarity_threshold"]

def get_fps_limit(self, default: "AutoSplit"):
"""Get image's fps limit or fallback to the default value from spinbox."""
if self.__fps_limit != 0:
return self.__fps_limit
return default.settings_dict["fps_limit"]

def __init__(self, path: str):
self.path = path
self.filename = os.path.split(path)[-1].lower()
@@ -89,7 +108,11 @@ def __init__(self, path: str):
self.__comparison_method = comparison_method_from_filename(self.filename)
self.__pause_time = pause_from_filename(self.filename)
self.__similarity_threshold = threshold_from_filename(self.filename)
self.__read_image_bytes(path)
self.texts = []
if path.endswith("txt"):
self.__parse_text_file(path)
else:
self.__read_image_bytes(path)

if START_KEYWORD in self.filename:
self.image_type = ImageType.START
@@ -98,6 +121,39 @@ else:
else:
self.image_type = ImageType.SPLIT

def __parse_text_file(self, path: str):
if not TESSERACT_PATH:
error_messages.tesseract_missing(path)
return

with open(path, encoding="utf-8") as f:
data = toml.load(f)

self.texts = [text.lower().strip() for text in data["texts"]]
self.__rect = [
data["top_left"][0],
data["bottom_right"][0],
data["top_left"][1],
data["bottom_right"][1],
]
self.__ocr_comparison_methods = data.get("methods", [0])
self.__fps_limit = data.get("fps_limit", 0)

if self.__validate_ocr():
error_messages.wrong_ocr_values(path)
return

def __validate_ocr(self):
values = self.__rect + self.__ocr_comparison_methods
values.append(self.__fps_limit)
return (
any( # Check for invalid negative values
value < 0 for value in values
)
or self.__rect[1] <= self.__rect[0]
or self.__rect[3] <= self.__rect[2]
)

def __read_image_bytes(self, path: str):
image = cv2.imread(path, cv2.IMREAD_UNCHANGED)
if not is_valid_image(image):
@@ -140,8 +196,24 @@ def compare_with_capture(
default: "AutoSplit | int",
capture: MatLike | None,
):
"""Compare image with capture using image's comparison method. Falls back to combobox."""
if not is_valid_image(self.byte_array) or not is_valid_image(capture):
"""
Compare image with capture using image's comparison method. Falls back to combobox.
For OCR text files, extract the text from the configured rectangle region and compare it with the expected strings.
"""
if not is_valid_image(capture):
return 0.0

if self.is_ocr:
return extract_and_compare_text(
capture[
self.__rect[2]:self.__rect[3],
self.__rect[0]:self.__rect[1],
],
self.texts,
self.__ocr_comparison_methods,
)

if not is_valid_image(self.byte_array):
return 0.0
resized_capture = cv2.resize(capture, self.byte_array.shape[1::-1])

51 changes: 50 additions & 1 deletion src/compare.py
@@ -1,17 +1,19 @@
from math import sqrt

import cv2
import Levenshtein
import numpy as np
from cv2.typing import MatLike
from scipy import fft

from utils import BGRA_CHANNEL_COUNT, MAXBYTE, ColorChannel, ImageShape, is_valid_image
from utils import BGRA_CHANNEL_COUNT, MAXBYTE, ColorChannel, ImageShape, is_valid_image, run_tesseract

MAXRANGE = MAXBYTE + 1
CHANNELS = (ColorChannel.Red.value, ColorChannel.Green.value, ColorChannel.Blue.value)
HISTOGRAM_SIZE = (8, 8, 8)
RANGES = (0, MAXRANGE, 0, MAXRANGE, 0, MAXRANGE)
MASK_SIZE_MULTIPLIER = ColorChannel.Alpha * MAXBYTE * MAXBYTE
MAX_VALUE = 1.0


def compare_histograms(source: MatLike, capture: MatLike, mask: MatLike | None = None):
@@ -126,10 +128,57 @@ def compare_phash(source: MatLike, capture: MatLike, mask: MatLike | None = None
return 1 - (hash_diff / 64.0)


def extract_and_compare_text(capture: MatLike, texts: list[str], methods_index: list[int]):
"""
Compares the extracted text of the given image and returns the similarity between the two texts.
The best match of all texts and methods is returned.

@param capture: Image of any given shape as a numpy array
@param texts: a list of strings to match for
@param methods_index: a list of comparison methods to use in order
@return: The similarity between the text in the image and the text supplied as a number 0 to 1.
"""
methods = [get_ocr_comparison_method_by_index(i) for i in methods_index]
png = np.array(cv2.imencode(".png", capture)[1]).tobytes()
# Especially with stylised characters, OCR could conceivably get the right
# letter, but mix up the casing (m/M, o/O, t/T, etc.)
image_string = run_tesseract(png).lower().strip()

ratio = 0.0
for text in texts:
for method in methods:
ratio = max(ratio, method(text, image_string))
if ratio == MAX_VALUE:
return ratio # we found the best match; try to return early
return ratio


def compare_submatch(a: str, b: str):
return float(a in b)


def compare_one_to_one(a: str, b: str):
if a == b:
return MAX_VALUE
return 0.0


def __compare_dummy(*_: object):
return 0.0


def get_ocr_comparison_method_by_index(comparison_method_index: int):
match comparison_method_index:
case 0:
return Levenshtein.ratio
case 1:
return compare_submatch
case 2:
return compare_one_to_one
case _:
return __compare_dummy


def get_comparison_method_by_index(comparison_method_index: int):
match comparison_method_index:
case 0:
16 changes: 16 additions & 0 deletions src/error_messages.py
@@ -228,3 +228,19 @@ def handle_top_level_exceptions(exception: Exception) -> NoReturn:
else:
traceback.print_exception(type(exception), exception, exception.__traceback__)
sys.exit(1)


def tesseract_missing(ocr_split_file_path: str):
set_text_message(
f"{ocr_split_file_path!r} is an Optical Character Recognition split file but tesseract couldn't be found."
+ f'\nPlease read <a href="https://github.com/{GITHUB_REPOSITORY}#install-tesseract">'
+ f"github.com/{GITHUB_REPOSITORY}#install-tesseract</a> for installation instructions.",
)


def wrong_ocr_values(ocr_split_file_path: str):
set_text_message(
f"{ocr_split_file_path!r} has invalid values."
+ "\nPlease make sure that the X and Y coordinates of 'bottom_right' are not equal to or lower than the "
+ "X and Y coordinates of 'top_left'. Also check for negative values in the 'methods' or 'fps_limit' settings.",
)
2 changes: 1 addition & 1 deletion src/split_parser.py
@@ -225,7 +225,7 @@ def parse_and_validate_images(autosplit: "AutoSplit"):
else:
for image in split_images:
# Test for image without transparency
if not is_valid_image(image.byte_array):
if not image.is_ocr and not is_valid_image(image.byte_array):
error_message = partial(error_messages.image_validity, image.filename)
break
