Merge pull request #71 from manykarim/fix_tesseract_max_resolution_error

Fix tesseract max resolution error
manykarim · Apr 4, 2023 · 2648fd2 · 2648fd2
2 parents 071428f + 8479dc3
commit 2648fd2
Show file tree

Hide file tree

Showing 7 changed files with 36 additions and 15 deletions.
diff --git a/DocTest/CompareImage.py b/DocTest/CompareImage.py
@@ -131,8 +131,7 @@ def get_ocr_text_data(self, ocr_config: str='--psm 11', ocr_lang: str='eng'):
 
     def increase_resolution_for_ocr(self):
         # experimental: IF OCR is used and DPI is lower than self.MINIMUM_OCR_RESOLUTION DPI, re-render with self.MINIMUM_OCR_RESOLUTION DPI
-        if (self.DPI < self.MINIMUM_OCR_RESOLUTION):
-            self.rerendered_for_ocr = True
+        if (self.DPI < self.MINIMUM_OCR_RESOLUTION):            
             print("Re-Render document for OCR at {} DPI as current resolution is only {} DPI".format(self.MINIMUM_OCR_RESOLUTION, self.DPI))
             if self.extension == '.pdf':
                 self.convert_mupdf_to_opencv_image(resolution=self.MINIMUM_OCR_RESOLUTION)
@@ -142,9 +141,17 @@ def increase_resolution_for_ocr(self):
                 scale = self.MINIMUM_OCR_RESOLUTION / self.DPI # percent of original size
                 width = int(self.opencv_images[0].shape[1] * scale)
                 height = int(self.opencv_images[0].shape[0] * scale)
+                # Check if the upscaled image would have a width or height higher than 32767 pixels
+                # If so, do not re-render as this will cause an error
+                if (width > 32767) or (height > 32767):
+                    print("Re-rendering of image for OCR not possible as one of the pages has a width or height higher than 32767 pixels")
+                    return
                 dim = (width, height)
                 # resize image
                 self.opencv_images[0] = cv2.resize(self.opencv_images[0], dim, interpolation = cv2.INTER_CUBIC)
+            self.rerendered_for_ocr = True
+
+
 
     def get_text_content_with_east(self):
         self.increase_resolution_for_ocr()

diff --git a/README.md b/README.md
@@ -4,22 +4,25 @@
 [Robot Framework](https://robotframework.org) DocTest library.
 Simple Automated Visual Document Testing.
 
-See **keyword documentation** for
+See keyword documentation for
 
 - [Visual Document Tests](https://manykarim.github.io/robotframework-doctestlibrary/VisualTest.html)
 - [Print Job Tests](https://manykarim.github.io/robotframework-doctestlibrary/PrintJobTest.html)
 - [Pdf Tests (very basic)](https://manykarim.github.io/robotframework-doctestlibrary/PdfTest.html)
 
-[![DocTest Library presentation at robocon.io 2021](https://img.youtube.com/vi/qmpwlQoJ-nE/0.jpg)](https://youtu.be/qmpwlQoJ-nE "DocTest Library presentation at robocon.io 2021")
 
-```RobotFramework
-*** Settings ***
-Library    DocTest.VisualTest
+See [Talk from RoboCon2021](https://www.youtube.com/watch?v=qmpwlQoJ-nE) for a short demo and some background.
 
-*** Test Cases ***
-Compare two Images and highlight differences
-    Compare Images    Reference.jpg    Candidate.jpg
-```
+Powered by
+- OpenCV
+- scikit-image
+- ImageMagick (only needed for rendering .ps and .pcl files)
+- Ghostscript (only needed for rendering .ps and .pcl files)
+- PyWand (only needed for rendering .ps and .pcl files)
+- Tesseract OCR
+- parsimonious (only needed for parsing .pcl and .ps files)
+- pymupdf
+- The knowledge of stackoverflow.com
 
 # Installation instructions
 

diff --git a/docs/PdfTest.html b/docs/PdfTest.html
@@ -1190,7 +1190,7 @@
 jQuery.extend({highlight:function(e,t,n,r){if(e.nodeType===3){var i=e.data.match(t);if(i){var s=document.createElement(n||"span");s.className=r||"highlight";var o=e.splitText(i.index);o.splitText(i[0].length);var u=o.cloneNode(true);s.appendChild(u);o.parentNode.replaceChild(s,o);return 1}}else if(e.nodeType===1&&e.childNodes&&!/(script|style)/i.test(e.tagName)&&!(e.tagName===n.toUpperCase()&&e.className===r)){for(var a=0;a<e.childNodes.length;a++){a+=jQuery.highlight(e.childNodes[a],t,n,r)}}return 0}});jQuery.fn.unhighlight=function(e){var t={className:"highlight",element:"span"};jQuery.extend(t,e);return this.find(t.element+"."+t.className).each(function(){var e=this.parentNode;e.replaceChild(this.firstChild,this);e.normalize()}).end()};jQuery.fn.highlight=function(e,t){var n={className:"highlight",element:"span",caseSensitive:false,wordsOnly:false};jQuery.extend(n,t);if(e.constructor===String){e=[e]}e=jQuery.grep(e,function(e,t){return e!=""});e=jQuery.map(e,function(e,t){return e.replace(/[-[\]{}()*+?.,\\^$|#\s]/g,"\\$&")});if(e.length==0){return this}var r=n.caseSensitive?"":"i";var i="("+e.join("|")+")";if(n.wordsOnly){i="\\b"+i+"\\b"}var s=new RegExp(i,r);return this.each(function(){jQuery.highlight(this,s,n.element,n.className)})}
 </script>
 <script type="text/javascript">
-libdoc = {"specversion": 1, "name": "PdfTest", "doc": "<p>Documentation for library <code>PdfTest</code>.</p>", "version": "0.10.0", "generated": "2023-03-28T08:16:12+00:00", "type": "LIBRARY", "scope": "TEST", "docFormat": "HTML", "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 10, "tags": [], "inits": [], "keywords": [{"name": "Check Text Content", "args": [{"name": "expected_text_list", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "expected_text_list"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}], "doc": "<p><b>DEPRECATED!!</b> Use keyword <a href=\"#PDF%20Should%20Contain%20Strings\" class=\"name\">PDF Should Contain Strings</a> instead.</p>\n<p>Checks if each item provided in the list <code>expected_text_list</code> appears in the PDF File <code>candidate_document</code>.</p>\n<p><code>expected_text_list</code> is a list of strings, <code>candidate_document</code> is the path to a PDF File.</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<td>@{strings}=</td>\n<td>Create List</td>\n<td>One String</td>\n<td>Another String</td>\n</tr>\n<tr>\n<td>Check Text Content</td>\n<td>${strings}</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n</table>", "shortdoc": "*DEPRECATED!!* Use keyword `PDF Should Contain Strings` instead.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 147, "deprecated": true}, {"name": "Compare Pdf Documents", "args": [{"name": "reference_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "reference_document"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}, {"name": "kwargs", "types": [], 
"typedocs": {}, "defaultValue": null, "kind": "VAR_NAMED", "required": false, "repr": "**kwargs"}], "doc": "<p>Compares some PDF metadata/properties of <code>reference_document</code> and <code>candidate_document</code>.</p>\n<p><code>reference_document</code> and <code>candidate_document</code> shall both be path to <code>PDF</code> files. <code>compare</code> can be passed as an optional argument with following values:</p>\n<ul>\n<li>all</li>\n<li>metadata</li>\n<li>text</li>\n<li>fonts</li>\n<li>images</li>\n<li>signatures</li>\n</ul>\n<p>Multiple values shall be separated by <code>|</code> symbol e.g. <code>compare=text|metadata</code></p>\n<p>The compared properties are are:</p>\n<ul>\n<li>metadata</li>\n<li>page_count</li>\n<li>sigflags</li>\n<li>text</li>\n</ul>\n<p>Result is passed if all properties are equal.</p>\n<p><code>reference_document</code> and <code>candidate_document</code> are both .pdf files</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<th>Keyword</th>\n<th>reference_document</th>\n<th>candidate_document</th>\n<th>comment</th>\n<td></td>\n</tr>\n<tr>\n<td>Compare Pdf Documents</td>\n<td>reference.pdf</td>\n<td>candidate.pdf</td>\n<td>#Performs a property comparison of both files</td>\n<td></td>\n</tr>\n<tr>\n<td>Compare Pdf Documents</td>\n<td>reference.pdf</td>\n<td>candidate.pdf</td>\n<td>compare=text</td>\n<td>#Performs a property comparison of both files. 
Only text content will be compared</td>\n</tr>\n</table>\n<p>compare=text</p>", "shortdoc": "Compares some PDF metadata/properties of ``reference_document`` and ``candidate_document``.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 17}, {"name": "PDF Should Contain Strings", "args": [{"name": "expected_text_list", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "expected_text_list"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}], "doc": "<p>Checks if each item provided in the list <code>expected_text_list</code> appears in the PDF File <code>candidate_document</code>.</p>\n<p><code>expected_text_list</code> is a list of strings or a single string, <code>candidate_document</code> is the path to a PDF File.</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<td>@{strings}=</td>\n<td>Create List</td>\n<td>One String</td>\n<td>Another String</td>\n</tr>\n<tr>\n<td>PDF Should Contain Strings</td>\n<td>${strings}</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n<tr>\n<td>PDF Should Contain Strings</td>\n<td>One String</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n</table>", "shortdoc": "Checks if each item provided in the list ``expected_text_list`` appears in the PDF File ``candidate_document``.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 178}], "dataTypes": {"enums": [], "typedDicts": []}, "typedocs": []}
+libdoc = {"specversion": 1, "name": "PdfTest", "doc": "<p>Documentation for library <code>PdfTest</code>.</p>", "version": "0.11.0", "generated": "2023-04-04T10:34:40+00:00", "type": "LIBRARY", "scope": "TEST", "docFormat": "HTML", "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 10, "tags": [], "inits": [], "keywords": [{"name": "Check Text Content", "args": [{"name": "expected_text_list", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "expected_text_list"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}], "doc": "<p><b>DEPRECATED!!</b> Use keyword <a href=\"#PDF%20Should%20Contain%20Strings\" class=\"name\">PDF Should Contain Strings</a> instead.</p>\n<p>Checks if each item provided in the list <code>expected_text_list</code> appears in the PDF File <code>candidate_document</code>.</p>\n<p><code>expected_text_list</code> is a list of strings, <code>candidate_document</code> is the path to a PDF File.</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<td>@{strings}=</td>\n<td>Create List</td>\n<td>One String</td>\n<td>Another String</td>\n</tr>\n<tr>\n<td>Check Text Content</td>\n<td>${strings}</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n</table>", "shortdoc": "*DEPRECATED!!* Use keyword `PDF Should Contain Strings` instead.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 147, "deprecated": true}, {"name": "Compare Pdf Documents", "args": [{"name": "reference_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "reference_document"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}, {"name": "kwargs", "types": [], 
"typedocs": {}, "defaultValue": null, "kind": "VAR_NAMED", "required": false, "repr": "**kwargs"}], "doc": "<p>Compares some PDF metadata/properties of <code>reference_document</code> and <code>candidate_document</code>.</p>\n<p><code>reference_document</code> and <code>candidate_document</code> shall both be path to <code>PDF</code> files. <code>compare</code> can be passed as an optional argument with following values:</p>\n<ul>\n<li>all</li>\n<li>metadata</li>\n<li>text</li>\n<li>fonts</li>\n<li>images</li>\n<li>signatures</li>\n</ul>\n<p>Multiple values shall be separated by <code>|</code> symbol e.g. <code>compare=text|metadata</code></p>\n<p>The compared properties are are:</p>\n<ul>\n<li>metadata</li>\n<li>page_count</li>\n<li>sigflags</li>\n<li>text</li>\n</ul>\n<p>Result is passed if all properties are equal.</p>\n<p><code>reference_document</code> and <code>candidate_document</code> are both .pdf files</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<th>Keyword</th>\n<th>reference_document</th>\n<th>candidate_document</th>\n<th>comment</th>\n<td></td>\n</tr>\n<tr>\n<td>Compare Pdf Documents</td>\n<td>reference.pdf</td>\n<td>candidate.pdf</td>\n<td>#Performs a property comparison of both files</td>\n<td></td>\n</tr>\n<tr>\n<td>Compare Pdf Documents</td>\n<td>reference.pdf</td>\n<td>candidate.pdf</td>\n<td>compare=text</td>\n<td>#Performs a property comparison of both files. 
Only text content will be compared</td>\n</tr>\n</table>\n<p>compare=text</p>", "shortdoc": "Compares some PDF metadata/properties of ``reference_document`` and ``candidate_document``.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 17}, {"name": "PDF Should Contain Strings", "args": [{"name": "expected_text_list", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "expected_text_list"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}], "doc": "<p>Checks if each item provided in the list <code>expected_text_list</code> appears in the PDF File <code>candidate_document</code>.</p>\n<p><code>expected_text_list</code> is a list of strings or a single string, <code>candidate_document</code> is the path to a PDF File.</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<td>@{strings}=</td>\n<td>Create List</td>\n<td>One String</td>\n<td>Another String</td>\n</tr>\n<tr>\n<td>PDF Should Contain Strings</td>\n<td>${strings}</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n<tr>\n<td>PDF Should Contain Strings</td>\n<td>One String</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n</table>", "shortdoc": "Checks if each item provided in the list ``expected_text_list`` appears in the PDF File ``candidate_document``.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 178}], "dataTypes": {"enums": [], "typedDicts": []}, "typedocs": []}
 </script>
 <title></title>
 </head>