From dbdfe32b247a4f7ef0cfd0e9a5eaf6e1191197c6 Mon Sep 17 00:00:00 2001 From: Many Kasiriha Date: Tue, 4 Apr 2023 12:37:27 +0200 Subject: [PATCH 1/2] Update docs for 0.11.0 --- README.md | 21 ++++++++++++--------- docs/PdfTest.html | 2 +- docs/PrintJobTest.html | 2 +- docs/VisualTest.html | 2 +- pyproject.toml | 2 +- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index de1d0f4..f76fe15 100644 --- a/README.md +++ b/README.md @@ -4,22 +4,25 @@ [Robot Framework](https://robotframework.org) DocTest library. Simple Automated Visual Document Testing. -See **keyword documentation** for +See keyword documentation for - [Visual Document Tests](https://manykarim.github.io/robotframework-doctestlibrary/VisualTest.html) - [Print Job Tests](https://manykarim.github.io/robotframework-doctestlibrary/PrintJobTest.html) - [Pdf Tests (very basic)](https://manykarim.github.io/robotframework-doctestlibrary/PdfTest.html) -[![DocTest Library presentation at robocon.io 2021](https://img.youtube.com/vi/qmpwlQoJ-nE/0.jpg)](https://youtu.be/qmpwlQoJ-nE "DocTest Library presentation at robocon.io 2021") -```RobotFramework -*** Settings *** -Library DocTest.VisualTest +See [Talk from RoboCon2021](https://www.youtube.com/watch?v=qmpwlQoJ-nE) for a short demo and some background. 
-*** Test Cases *** -Compare two Images and highlight differences - Compare Images Reference.jpg Candidate.jpg -``` +Powered by +- Open CV +- scikit-image +- ImageMagick (only needed for rendering .ps and .pcl files) +- Ghostscript (only needed for rendering .ps and .pcl files) +- PyWand (only needed for rendering .ps and .pcl files) +- Tesseract OCR +- parsimonious (only needed for parsing .pcl and .ps files) +- pymupdf +- The knowledge of stackoverflow.com # Installation instructions diff --git a/docs/PdfTest.html b/docs/PdfTest.html index abf44b6..c96780d 100644 --- a/docs/PdfTest.html +++ b/docs/PdfTest.html @@ -1190,7 +1190,7 @@ jQuery.extend({highlight:function(e,t,n,r){if(e.nodeType===3){var i=e.data.match(t);if(i){var s=document.createElement(n||"span");s.className=r||"highlight";var o=e.splitText(i.index);o.splitText(i[0].length);var u=o.cloneNode(true);s.appendChild(u);o.parentNode.replaceChild(s,o);return 1}}else if(e.nodeType===1&&e.childNodes&&!/(script|style)/i.test(e.tagName)&&!(e.tagName===n.toUpperCase()&&e.className===r)){for(var a=0;a diff --git a/docs/PrintJobTest.html b/docs/PrintJobTest.html index aa1877a..6a9335c 100644 --- a/docs/PrintJobTest.html +++ b/docs/PrintJobTest.html @@ -1190,7 +1190,7 @@ jQuery.extend({highlight:function(e,t,n,r){if(e.nodeType===3){var i=e.data.match(t);if(i){var s=document.createElement(n||"span");s.className=r||"highlight";var o=e.splitText(i.index);o.splitText(i[0].length);var u=o.cloneNode(true);s.appendChild(u);o.parentNode.replaceChild(s,o);return 1}}else if(e.nodeType===1&&e.childNodes&&!/(script|style)/i.test(e.tagName)&&!(e.tagName===n.toUpperCase()&&e.className===r)){for(var a=0;a diff --git a/docs/VisualTest.html b/docs/VisualTest.html index d334895..2d1677e 100644 --- a/docs/VisualTest.html +++ b/docs/VisualTest.html @@ -1190,7 +1190,7 @@ jQuery.extend({highlight:function(e,t,n,r){if(e.nodeType===3){var i=e.data.match(t);if(i){var s=document.createElement(n||"span");s.className=r||"highlight";var 
o=e.splitText(i.index);o.splitText(i[0].length);var u=o.cloneNode(true);s.appendChild(u);o.parentNode.replaceChild(s,o);return 1}}else if(e.nodeType===1&&e.childNodes&&!/(script|style)/i.test(e.tagName)&&!(e.tagName===n.toUpperCase()&&e.className===r)){for(var a=0;a diff --git a/pyproject.toml b/pyproject.toml index 7838187..562dd83 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "robotframework-doctestlibrary" -version = "0.10.1" +version = "0.11.0" description = "A library for Visual Document Testing" authors = ["Many Kasiriha "] maintainers = ["Many Kasiriha "] From 8479dc3bf1e48c76b73749a6562eb62bd5480131 Mon Sep 17 00:00:00 2001 From: Many Kasiriha Date: Tue, 4 Apr 2023 12:37:57 +0200 Subject: [PATCH 2/2] Prevent upscaling for OCR if resolution too high --- DocTest/CompareImage.py | 11 +++++++++-- utest/test_ocr.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/DocTest/CompareImage.py b/DocTest/CompareImage.py index 1bad6e2..be7ce5f 100644 --- a/DocTest/CompareImage.py +++ b/DocTest/CompareImage.py @@ -131,8 +131,7 @@ def get_ocr_text_data(self, ocr_config: str='--psm 11', ocr_lang: str='eng'): def increase_resolution_for_ocr(self): # experimental: IF OCR is used and DPI is lower than self.MINIMUM_OCR_RESOLUTION DPI, re-render with self.MINIMUM_OCR_RESOLUTION DPI - if (self.DPI < self.MINIMUM_OCR_RESOLUTION): - self.rerendered_for_ocr = True + if (self.DPI < self.MINIMUM_OCR_RESOLUTION): print("Re-Render document for OCR at {} DPI as current resolution is only {} DPI".format(self.MINIMUM_OCR_RESOLUTION, self.DPI)) if self.extension == '.pdf': self.convert_mupdf_to_opencv_image(resolution=self.MINIMUM_OCR_RESOLUTION) @@ -142,9 +141,17 @@ def increase_resolution_for_ocr(self): scale = self.MINIMUM_OCR_RESOLUTION / self.DPI # percent of original size width = int(self.opencv_images[0].shape[1] * scale) height = int(self.opencv_images[0].shape[0] * scale) + # Check if any page has a width or 
height higher than 32767 pixels + # If so, do not re-render as this will cause an error + if (width > 32767) or (height > 32767): + print("Re-rendering of image for OCR not possible as one of the pages has a width or height higher than 32767 pixels") + return dim = (width, height) # resize image self.opencv_images[0] = cv2.resize(self.opencv_images[0], dim, interpolation = cv2.INTER_CUBIC) + self.rerendered_for_ocr = True + + def get_text_content_with_east(self): self.increase_resolution_for_ocr() diff --git a/utest/test_ocr.py b/utest/test_ocr.py index a865db4..a9a6e79 100644 --- a/utest/test_ocr.py +++ b/utest/test_ocr.py @@ -61,3 +61,14 @@ def test_text_on_colored_background_with_east(testdata_dir): assert any('01-Jan-2021' in s for s in img.text_content[0]['text']) assert any('123456789' in s for s in img.text_content[0]['text']) assert any('SOUVENIR' in s for s in img.text_content[0]['text']) + +def test_ocr_in_hires_without_rerender(testdata_dir): + import cv2 + low_res_image = cv2.imread(str(testdata_dir / 'birthday_1080_date_id.png')) + # resize image to 10x bigger + low_res_image = cv2.resize(low_res_image, None, fx=10, fy=10, interpolation=cv2.INTER_CUBIC) + cv2.imwrite(str(testdata_dir / 'birthday_1080_date_id_10x.png'), low_res_image) + img = CompareImage(testdata_dir / 'birthday_1080_date_id_10x.png') + img.get_ocr_text_data() + assert "01-Jan-2021" in img.text_content[0]['text'] + assert "ABCDEFGHI" in img.text_content[0]['text']