diff --git a/DocTest/CompareImage.py b/DocTest/CompareImage.py index be7ce5f..0964276 100644 --- a/DocTest/CompareImage.py +++ b/DocTest/CompareImage.py @@ -99,8 +99,9 @@ def get_text_content(self): self.text_content.append(text) return self.text_content - def get_ocr_text_data(self, ocr_config: str='--psm 11', ocr_lang: str='eng'): - self.increase_resolution_for_ocr() + def get_ocr_text_data(self, ocr_config: str='--psm 11', ocr_lang: str='eng', increase_resolution: bool=True): + if increase_resolution: + self.increase_resolution_for_ocr() for i in range(len(self.opencv_images)): text_list = [] left_list = [] @@ -153,8 +154,9 @@ def increase_resolution_for_ocr(self): - def get_text_content_with_east(self): - self.increase_resolution_for_ocr() + def get_text_content_with_east(self, increase_resolution: bool=True): + if increase_resolution: + self.increase_resolution_for_ocr() self.east_text_extractor = EastTextExtractor() for frame in self.opencv_images: text = self.east_text_extractor.get_image_text(frame) diff --git a/DocTest/VisualTest.py b/DocTest/VisualTest.py index 86592f9..521d846 100644 --- a/DocTest/VisualTest.py +++ b/DocTest/VisualTest.py @@ -834,7 +834,7 @@ def check_for_differences(self, reference, candidate, i, detected_differences, c detected_differences.append(True) @keyword - def get_text_from_document(self, image: str, ocr_engine: str="tesseract"): + def get_text_from_document(self, image: str, ocr_engine: str="tesseract", ocr_config: str='--psm 11', ocr_lang: str='eng', increase_resolution: bool=True): """Gets Text Content from documents/images ``image``. Text content is returned as a list of strings. None if no text is identified. @@ -842,6 +842,9 @@ def get_text_from_document(self, image: str, ocr_engine: str="tesseract"): | =Arguments= | =Description= | | ``image`` | Path of the Image/Document from which the text content shall be retrieved | | ``ocr_engine`` | OCR Engine to be used. Options are ``tesseract`` and ``east``. Default is ``tesseract``. | + | ``ocr_config`` | OCR Config to be used for tesseract. Default is ``--psm 11``. | + | ``ocr_lang`` | OCR Language to be used for tesseract. Default is ``eng``. | + | ``increase_resolution`` | Increase resolution of image to 300 DPI before OCR. Default is ``True``. | Examples: | ${text} | `Get Text From Document` | reference.pdf | #Gets Text Content from .pdf | @@ -863,14 +866,14 @@ def get_text_from_document(self, image: str, ocr_engine: str="tesseract"): else: if ocr_engine == "tesseract": try: - img.get_ocr_text_data() + img.get_ocr_text_data(ocr_config, ocr_lang, increase_resolution) # if confidence is higher than 20, add to text list text = [x for x in img.text_content[0]['text'] if x] except: text = None elif ocr_engine == "east": try: - img.get_text_content_with_east() + img.get_text_content_with_east(increase_resolution) text = [x for x in img.text_content[0]['text'] if x] except: text = None diff --git a/docs/PdfTest.html b/docs/PdfTest.html index aec8001..6ecb7f9 100644 --- a/docs/PdfTest.html +++ b/docs/PdfTest.html @@ -1190,7 +1190,7 @@ jQuery.extend({highlight:function(e,t,n,r){if(e.nodeType===3){var i=e.data.match(t);if(i){var s=document.createElement(n||"span");s.className=r||"highlight";var o=e.splitText(i.index);o.splitText(i[0].length);var u=o.cloneNode(true);s.appendChild(u);o.parentNode.replaceChild(s,o);return 1}}else if(e.nodeType===1&&e.childNodes&&!/(script|style)/i.test(e.tagName)&&!(e.tagName===n.toUpperCase()&&e.className===r)){for(var a=0;a diff --git a/docs/PrintJobTest.html b/docs/PrintJobTest.html index a202877..4e56389 100644 --- a/docs/PrintJobTest.html +++ b/docs/PrintJobTest.html @@ -1190,7 +1190,7 @@ jQuery.extend({highlight:function(e,t,n,r){if(e.nodeType===3){var i=e.data.match(t);if(i){var s=document.createElement(n||"span");s.className=r||"highlight";var o=e.splitText(i.index);o.splitText(i[0].length);var u=o.cloneNode(true);s.appendChild(u);o.parentNode.replaceChild(s,o);return 1}}else if(e.nodeType===1&&e.childNodes&&!/(script|style)/i.test(e.tagName)&&!(e.tagName===n.toUpperCase()&&e.className===r)){for(var a=0;a diff --git a/docs/VisualTest.html b/docs/VisualTest.html index 02738fe..8fee7ff 100644 --- a/docs/VisualTest.html +++ b/docs/VisualTest.html @@ -1190,7 +1190,7 @@ jQuery.extend({highlight:function(e,t,n,r){if(e.nodeType===3){var i=e.data.match(t);if(i){var s=document.createElement(n||"span");s.className=r||"highlight";var o=e.splitText(i.index);o.splitText(i[0].length);var u=o.cloneNode(true);s.appendChild(u);o.parentNode.replaceChild(s,o);return 1}}else if(e.nodeType===1&&e.childNodes&&!/(script|style)/i.test(e.tagName)&&!(e.tagName===n.toUpperCase()&&e.className===r)){for(var a=0;a diff --git a/pyproject.toml b/pyproject.toml index 339255d..759bde2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "robotframework-doctestlibrary" -version = "0.14.0" +version = "0.15.0" description = "A library for Visual Document Testing" authors = ["Many Kasiriha "] maintainers = ["Many Kasiriha "] diff --git a/utest/test_ocr.py b/utest/test_ocr.py index 9bff272..71a61a1 100644 --- a/utest/test_ocr.py +++ b/utest/test_ocr.py @@ -41,6 +41,36 @@ def test_image_text_content_with_pytesseract(testdata_dir): assert "01-Jan-2021" in img.text_content[0]['text'] assert "ABCDEFGHI" in img.text_content[0]['text'] +def test_image_text_content_with_pytesseract_custom_options_01(testdata_dir): + img = CompareImage(testdata_dir / 'text_small.png') + img.get_ocr_text_data(ocr_config='--psm 11', increase_resolution=True) + assert any('ABCDEFGHI' in s for s in img.text_content[0]['text']) + assert any('abcdefghi' in s for s in img.text_content[0]['text']) + assert any('1234567890' in s for s in img.text_content[0]['text']) + +def test_image_text_content_with_pytesseract_custom_options_02(testdata_dir): + img = CompareImage(testdata_dir / 'text_small.png') + img.get_ocr_text_data(ocr_config='--psm 6', increase_resolution=True) + assert any('ABCDEFGHI' in s for s in img.text_content[0]['text']) + assert any('abcdefghi' in s for s in img.text_content[0]['text']) + assert any('1234567890' in s for s in img.text_content[0]['text']) + +def test_image_text_content_with_pytesseract_custom_options_03(testdata_dir): + img = CompareImage(testdata_dir / 'text_big.png') + img.get_ocr_text_data(increase_resolution=False) + assert any('abcdefghi' in s for s in img.text_content[0]['text']) + assert any('1234567890' in s for s in img.text_content[0]['text']) + assert any('ABCDEFGHI' in s for s in img.text_content[0]['text']) + +def test_image_text_content_with_pytesseract_custom_options_04(testdata_dir): + img = CompareImage(testdata_dir / 'text_small.png') + img.get_ocr_text_data(increase_resolution=False) + assert len(img.text_content[0]['text']) > 0 + with pytest.raises(AssertionError): + assert any('ABCDEFGHI' in s for s in img.text_content[0]['text']) + + + @pytest.mark.skip(reason="Currently, tesseract is not so good at recognizing bright text") def test_white_text_on_dark_background(testdata_dir): img = CompareImage(testdata_dir / 'whitetext_blackbackground.png')