Skip to content

Commit

Permalink
Merge pull request #71 from manykarim/fix_tesseract_max_resolution_error
Browse files Browse the repository at this point in the history
Fix tesseract max resolution error
  • Loading branch information
manykarim authored Apr 4, 2023
2 parents 071428f + 8479dc3 commit 2648fd2
Show file tree
Hide file tree
Showing 7 changed files with 36 additions and 15 deletions.
11 changes: 9 additions & 2 deletions DocTest/CompareImage.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,7 @@ def get_ocr_text_data(self, ocr_config: str='--psm 11', ocr_lang: str='eng'):

def increase_resolution_for_ocr(self):
# experimental: IF OCR is used and DPI is lower than self.MINIMUM_OCR_RESOLUTION DPI, re-render with self.MINIMUM_OCR_RESOLUTION DPI
if (self.DPI < self.MINIMUM_OCR_RESOLUTION):
self.rerendered_for_ocr = True
if (self.DPI < self.MINIMUM_OCR_RESOLUTION):
print("Re-Render document for OCR at {} DPI as current resolution is only {} DPI".format(self.MINIMUM_OCR_RESOLUTION, self.DPI))
if self.extension == '.pdf':
self.convert_mupdf_to_opencv_image(resolution=self.MINIMUM_OCR_RESOLUTION)
Expand All @@ -142,9 +141,17 @@ def increase_resolution_for_ocr(self):
scale = self.MINIMUM_OCR_RESOLUTION / self.DPI # percent of original size
width = int(self.opencv_images[0].shape[1] * scale)
height = int(self.opencv_images[0].shape[0] * scale)
# Check if any page has a width or height higher than 32767 pixels
# If so, do not re-render as this will cause an error
if (width > 32767) or (height > 32767):
print("Re-rendering of image for OCR not possible as one of the pages has a width or height higher than 32767 pixels")
return
dim = (width, height)
# resize image
self.opencv_images[0] = cv2.resize(self.opencv_images[0], dim, interpolation = cv2.INTER_CUBIC)
self.rerendered_for_ocr = True



def get_text_content_with_east(self):
self.increase_resolution_for_ocr()
Expand Down
21 changes: 12 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,25 @@
[Robot Framework](https://robotframework.org) DocTest library.
Simple Automated Visual Document Testing.

See **keyword documentation** for
See keyword documentation for

- [Visual Document Tests](https://manykarim.github.io/robotframework-doctestlibrary/VisualTest.html)
- [Print Job Tests](https://manykarim.github.io/robotframework-doctestlibrary/PrintJobTest.html)
- [Pdf Tests (very basic)](https://manykarim.github.io/robotframework-doctestlibrary/PdfTest.html)

[![DocTest Library presentation at robocon.io 2021](https://img.youtube.com/vi/qmpwlQoJ-nE/0.jpg)](https://youtu.be/qmpwlQoJ-nE "DocTest Library presentation at robocon.io 2021")

```RobotFramework
*** Settings ***
Library DocTest.VisualTest
See [Talk from RoboCon2021](https://www.youtube.com/watch?v=qmpwlQoJ-nE) for a short demo and some background.

*** Test Cases ***
Compare two Images and highlight differences
Compare Images Reference.jpg Candidate.jpg
```
Powered by
- Open CV
- scikit-image
- ImageMagick (only needed for rendering .ps and .pcl files)
- Ghostscript (only needed for rendering .ps and .pcl files)
- PyWand (only needed for rendering .ps and .pcl files)
- Tesseract OCR
- parsimonious (only needed for parsing .pcl and .ps files)
- pymupdf
- The knowledge of stackoverflow.com

# Installation instructions

Expand Down
2 changes: 1 addition & 1 deletion docs/PdfTest.html
Original file line number Diff line number Diff line change
Expand Up @@ -1190,7 +1190,7 @@
jQuery.extend({highlight:function(e,t,n,r){if(e.nodeType===3){var i=e.data.match(t);if(i){var s=document.createElement(n||"span");s.className=r||"highlight";var o=e.splitText(i.index);o.splitText(i[0].length);var u=o.cloneNode(true);s.appendChild(u);o.parentNode.replaceChild(s,o);return 1}}else if(e.nodeType===1&&e.childNodes&&!/(script|style)/i.test(e.tagName)&&!(e.tagName===n.toUpperCase()&&e.className===r)){for(var a=0;a<e.childNodes.length;a++){a+=jQuery.highlight(e.childNodes[a],t,n,r)}}return 0}});jQuery.fn.unhighlight=function(e){var t={className:"highlight",element:"span"};jQuery.extend(t,e);return this.find(t.element+"."+t.className).each(function(){var e=this.parentNode;e.replaceChild(this.firstChild,this);e.normalize()}).end()};jQuery.fn.highlight=function(e,t){var n={className:"highlight",element:"span",caseSensitive:false,wordsOnly:false};jQuery.extend(n,t);if(e.constructor===String){e=[e]}e=jQuery.grep(e,function(e,t){return e!=""});e=jQuery.map(e,function(e,t){return e.replace(/[-[\]{}()*+?.,\\^$|#\s]/g,"\\$&")});if(e.length==0){return this}var r=n.caseSensitive?"":"i";var i="("+e.join("|")+")";if(n.wordsOnly){i="\\b"+i+"\\b"}var s=new RegExp(i,r);return this.each(function(){jQuery.highlight(this,s,n.element,n.className)})}
</script>
<script type="text/javascript">
libdoc = {"specversion": 1, "name": "PdfTest", "doc": "<p>Documentation for library <code>PdfTest</code>.</p>", "version": "0.10.0", "generated": "2023-03-28T08:16:12+00:00", "type": "LIBRARY", "scope": "TEST", "docFormat": "HTML", "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 10, "tags": [], "inits": [], "keywords": [{"name": "Check Text Content", "args": [{"name": "expected_text_list", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "expected_text_list"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}], "doc": "<p><b>DEPRECATED!!</b> Use keyword <a href=\"#PDF%20Should%20Contain%20Strings\" class=\"name\">PDF Should Contain Strings</a> instead.</p>\n<p>Checks if each item provided in the list <code>expected_text_list</code> appears in the PDF File <code>candidate_document</code>.</p>\n<p><code>expected_text_list</code> is a list of strings, <code>candidate_document</code> is the path to a PDF File.</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<td>@{strings}=</td>\n<td>Create List</td>\n<td>One String</td>\n<td>Another String</td>\n</tr>\n<tr>\n<td>Check Text Content</td>\n<td>${strings}</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n</table>", "shortdoc": "*DEPRECATED!!* Use keyword `PDF Should Contain Strings` instead.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 147, "deprecated": true}, {"name": "Compare Pdf Documents", "args": [{"name": "reference_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "reference_document"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}, {"name": "kwargs", "types": [], 
"typedocs": {}, "defaultValue": null, "kind": "VAR_NAMED", "required": false, "repr": "**kwargs"}], "doc": "<p>Compares some PDF metadata/properties of <code>reference_document</code> and <code>candidate_document</code>.</p>\n<p><code>reference_document</code> and <code>candidate_document</code> shall both be path to <code>PDF</code> files. <code>compare</code> can be passed as an optional argument with following values:</p>\n<ul>\n<li>all</li>\n<li>metadata</li>\n<li>text</li>\n<li>fonts</li>\n<li>images</li>\n<li>signatures</li>\n</ul>\n<p>Multiple values shall be separated by <code>|</code> symbol e.g. <code>compare=text|metadata</code></p>\n<p>The compared properties are are:</p>\n<ul>\n<li>metadata</li>\n<li>page_count</li>\n<li>sigflags</li>\n<li>text</li>\n</ul>\n<p>Result is passed if all properties are equal.</p>\n<p><code>reference_document</code> and <code>candidate_document</code> are both .pdf files</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<th>Keyword</th>\n<th>reference_document</th>\n<th>candidate_document</th>\n<th>comment</th>\n<td></td>\n</tr>\n<tr>\n<td>Compare Pdf Documents</td>\n<td>reference.pdf</td>\n<td>candidate.pdf</td>\n<td>#Performs a property comparison of both files</td>\n<td></td>\n</tr>\n<tr>\n<td>Compare Pdf Documents</td>\n<td>reference.pdf</td>\n<td>candidate.pdf</td>\n<td>compare=text</td>\n<td>#Performs a property comparison of both files. 
Only text content will be compared</td>\n</tr>\n</table>\n<p>compare=text</p>", "shortdoc": "Compares some PDF metadata/properties of ``reference_document`` and ``candidate_document``.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 17}, {"name": "PDF Should Contain Strings", "args": [{"name": "expected_text_list", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "expected_text_list"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}], "doc": "<p>Checks if each item provided in the list <code>expected_text_list</code> appears in the PDF File <code>candidate_document</code>.</p>\n<p><code>expected_text_list</code> is a list of strings or a single string, <code>candidate_document</code> is the path to a PDF File.</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<td>@{strings}=</td>\n<td>Create List</td>\n<td>One String</td>\n<td>Another String</td>\n</tr>\n<tr>\n<td>PDF Should Contain Strings</td>\n<td>${strings}</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n<tr>\n<td>PDF Should Contain Strings</td>\n<td>One String</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n</table>", "shortdoc": "Checks if each item provided in the list ``expected_text_list`` appears in the PDF File ``candidate_document``.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 178}], "dataTypes": {"enums": [], "typedDicts": []}, "typedocs": []}
libdoc = {"specversion": 1, "name": "PdfTest", "doc": "<p>Documentation for library <code>PdfTest</code>.</p>", "version": "0.11.0", "generated": "2023-04-04T10:34:40+00:00", "type": "LIBRARY", "scope": "TEST", "docFormat": "HTML", "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 10, "tags": [], "inits": [], "keywords": [{"name": "Check Text Content", "args": [{"name": "expected_text_list", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "expected_text_list"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}], "doc": "<p><b>DEPRECATED!!</b> Use keyword <a href=\"#PDF%20Should%20Contain%20Strings\" class=\"name\">PDF Should Contain Strings</a> instead.</p>\n<p>Checks if each item provided in the list <code>expected_text_list</code> appears in the PDF File <code>candidate_document</code>.</p>\n<p><code>expected_text_list</code> is a list of strings, <code>candidate_document</code> is the path to a PDF File.</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<td>@{strings}=</td>\n<td>Create List</td>\n<td>One String</td>\n<td>Another String</td>\n</tr>\n<tr>\n<td>Check Text Content</td>\n<td>${strings}</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n</table>", "shortdoc": "*DEPRECATED!!* Use keyword `PDF Should Contain Strings` instead.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 147, "deprecated": true}, {"name": "Compare Pdf Documents", "args": [{"name": "reference_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "reference_document"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}, {"name": "kwargs", "types": [], 
"typedocs": {}, "defaultValue": null, "kind": "VAR_NAMED", "required": false, "repr": "**kwargs"}], "doc": "<p>Compares some PDF metadata/properties of <code>reference_document</code> and <code>candidate_document</code>.</p>\n<p><code>reference_document</code> and <code>candidate_document</code> shall both be path to <code>PDF</code> files. <code>compare</code> can be passed as an optional argument with following values:</p>\n<ul>\n<li>all</li>\n<li>metadata</li>\n<li>text</li>\n<li>fonts</li>\n<li>images</li>\n<li>signatures</li>\n</ul>\n<p>Multiple values shall be separated by <code>|</code> symbol e.g. <code>compare=text|metadata</code></p>\n<p>The compared properties are are:</p>\n<ul>\n<li>metadata</li>\n<li>page_count</li>\n<li>sigflags</li>\n<li>text</li>\n</ul>\n<p>Result is passed if all properties are equal.</p>\n<p><code>reference_document</code> and <code>candidate_document</code> are both .pdf files</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<th>Keyword</th>\n<th>reference_document</th>\n<th>candidate_document</th>\n<th>comment</th>\n<td></td>\n</tr>\n<tr>\n<td>Compare Pdf Documents</td>\n<td>reference.pdf</td>\n<td>candidate.pdf</td>\n<td>#Performs a property comparison of both files</td>\n<td></td>\n</tr>\n<tr>\n<td>Compare Pdf Documents</td>\n<td>reference.pdf</td>\n<td>candidate.pdf</td>\n<td>compare=text</td>\n<td>#Performs a property comparison of both files. 
Only text content will be compared</td>\n</tr>\n</table>\n<p>compare=text</p>", "shortdoc": "Compares some PDF metadata/properties of ``reference_document`` and ``candidate_document``.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 17}, {"name": "PDF Should Contain Strings", "args": [{"name": "expected_text_list", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "expected_text_list"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}], "doc": "<p>Checks if each item provided in the list <code>expected_text_list</code> appears in the PDF File <code>candidate_document</code>.</p>\n<p><code>expected_text_list</code> is a list of strings or a single string, <code>candidate_document</code> is the path to a PDF File.</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<td>@{strings}=</td>\n<td>Create List</td>\n<td>One String</td>\n<td>Another String</td>\n</tr>\n<tr>\n<td>PDF Should Contain Strings</td>\n<td>${strings}</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n<tr>\n<td>PDF Should Contain Strings</td>\n<td>One String</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n</table>", "shortdoc": "Checks if each item provided in the list ``expected_text_list`` appears in the PDF File ``candidate_document``.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 178}], "dataTypes": {"enums": [], "typedDicts": []}, "typedocs": []}
</script>
<title></title>
</head>
Expand Down
Loading

0 comments on commit 2648fd2

Please sign in to comment.