Merge pull request #80 from manykarim/add_tesseract_config_arg

v0.15.0 add tesseract_options as args
manykarim · May 24, 2023 · commit 22dc2ed
2 parents: 68d92c5 + 11beb29
Show file tree

Hide file tree

Showing 7 changed files with 46 additions and 11 deletions.
diff --git a/DocTest/CompareImage.py b/DocTest/CompareImage.py
@@ -99,8 +99,9 @@ def get_text_content(self):
             self.text_content.append(text)
         return self.text_content
 
-    def get_ocr_text_data(self, ocr_config: str='--psm 11', ocr_lang: str='eng'):
-        self.increase_resolution_for_ocr()
+    def get_ocr_text_data(self, ocr_config: str='--psm 11', ocr_lang: str='eng', increase_resolution: bool=True):
+        if increase_resolution:
+            self.increase_resolution_for_ocr()
         for i in range(len(self.opencv_images)):
             text_list = []
             left_list = []
@@ -153,8 +154,9 @@ def increase_resolution_for_ocr(self):
 
 
 
-    def get_text_content_with_east(self):
-        self.increase_resolution_for_ocr()
+    def get_text_content_with_east(self, increase_resolution: bool=True):
+        if increase_resolution:
+            self.increase_resolution_for_ocr()
         self.east_text_extractor = EastTextExtractor()
         for frame in self.opencv_images:
             text = self.east_text_extractor.get_image_text(frame)

diff --git a/DocTest/VisualTest.py b/DocTest/VisualTest.py
@@ -834,14 +834,17 @@ def check_for_differences(self, reference, candidate, i, detected_differences, c
                 detected_differences.append(True)
 
     @keyword
-    def get_text_from_document(self, image: str, ocr_engine: str="tesseract"):
+    def get_text_from_document(self, image: str, ocr_engine: str="tesseract", ocr_config: str='--psm 11', ocr_lang: str='eng', increase_resolution: bool=True):
         """Gets Text Content from documents/images ``image``.
 
         Text content is returned as a list of strings. None if no text is identified.
 
         | =Arguments= | =Description= |
         | ``image`` | Path of the Image/Document from which the text content shall be retrieved |
         | ``ocr_engine`` | OCR Engine to be used. Options are ``tesseract`` and ``east``.  Default is ``tesseract``. |
+        | ``ocr_config`` | OCR Config to be used for tesseract. Default is ``--psm 11``. |
+        | ``ocr_lang`` | OCR Language to be used for tesseract. Default is ``eng``. |
+        | ``increase_resolution`` | Increase resolution of image to 300 DPI before OCR. Default is ``True``. |
 
         Examples:
         | ${text} | `Get Text From Document` | reference.pdf | #Gets Text Content from .pdf |
@@ -863,14 +866,14 @@ def get_text_from_document(self, image: str, ocr_engine: str="tesseract"):
         else:
             if ocr_engine == "tesseract":
                 try:
-                    img.get_ocr_text_data()
+                    img.get_ocr_text_data(ocr_config, ocr_lang, increase_resolution)
                     # if confidence is higher than 20, add to text list
                     text = [x for x in img.text_content[0]['text'] if x]
                 except:
                     text = None
             elif ocr_engine == "east":
                 try:
-                    img.get_text_content_with_east()
+                    img.get_text_content_with_east(increase_resolution)
                     text = [x for x in img.text_content[0]['text'] if x]
                 except:
                     text = None

diff --git a/docs/PdfTest.html b/docs/PdfTest.html
@@ -1190,7 +1190,7 @@
 jQuery.extend({highlight:function(e,t,n,r){if(e.nodeType===3){var i=e.data.match(t);if(i){var s=document.createElement(n||"span");s.className=r||"highlight";var o=e.splitText(i.index);o.splitText(i[0].length);var u=o.cloneNode(true);s.appendChild(u);o.parentNode.replaceChild(s,o);return 1}}else if(e.nodeType===1&&e.childNodes&&!/(script|style)/i.test(e.tagName)&&!(e.tagName===n.toUpperCase()&&e.className===r)){for(var a=0;a<e.childNodes.length;a++){a+=jQuery.highlight(e.childNodes[a],t,n,r)}}return 0}});jQuery.fn.unhighlight=function(e){var t={className:"highlight",element:"span"};jQuery.extend(t,e);return this.find(t.element+"."+t.className).each(function(){var e=this.parentNode;e.replaceChild(this.firstChild,this);e.normalize()}).end()};jQuery.fn.highlight=function(e,t){var n={className:"highlight",element:"span",caseSensitive:false,wordsOnly:false};jQuery.extend(n,t);if(e.constructor===String){e=[e]}e=jQuery.grep(e,function(e,t){return e!=""});e=jQuery.map(e,function(e,t){return e.replace(/[-[\]{}()*+?.,\\^$|#\s]/g,"\\$&")});if(e.length==0){return this}var r=n.caseSensitive?"":"i";var i="("+e.join("|")+")";if(n.wordsOnly){i="\\b"+i+"\\b"}var s=new RegExp(i,r);return this.each(function(){jQuery.highlight(this,s,n.element,n.className)})}
 </script>
 <script type="text/javascript">
-libdoc = {"specversion": 1, "name": "PdfTest", "doc": "<p>Documentation for library <code>PdfTest</code>.</p>", "version": "0.14.0", "generated": "2023-05-24T09:22:06+00:00", "type": "LIBRARY", "scope": "TEST", "docFormat": "HTML", "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 10, "tags": [], "inits": [], "keywords": [{"name": "Check Text Content", "args": [{"name": "expected_text_list", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "expected_text_list"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}], "doc": "<p><b>DEPRECATED!!</b> Use keyword <a href=\"#PDF%20Should%20Contain%20Strings\" class=\"name\">PDF Should Contain Strings</a> instead.</p>\n<p>Checks if each item provided in the list <code>expected_text_list</code> appears in the PDF File <code>candidate_document</code>.</p>\n<p><code>expected_text_list</code> is a list of strings, <code>candidate_document</code> is the path to a PDF File.</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<td>@{strings}=</td>\n<td>Create List</td>\n<td>One String</td>\n<td>Another String</td>\n</tr>\n<tr>\n<td>Check Text Content</td>\n<td>${strings}</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n</table>", "shortdoc": "*DEPRECATED!!* Use keyword `PDF Should Contain Strings` instead.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 149, "deprecated": true}, {"name": "Compare Pdf Documents", "args": [{"name": "reference_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "reference_document"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}, {"name": "kwargs", "types": [], 
"typedocs": {}, "defaultValue": null, "kind": "VAR_NAMED", "required": false, "repr": "**kwargs"}], "doc": "<p>Compares some PDF metadata/properties of <code>reference_document</code> and <code>candidate_document</code>.</p>\n<p><code>reference_document</code> and <code>candidate_document</code> shall both be path to <code>PDF</code> files. <code>compare</code> can be passed as an optional argument with following values:</p>\n<ul>\n<li>all</li>\n<li>metadata</li>\n<li>text</li>\n<li>fonts</li>\n<li>images</li>\n<li>signatures</li>\n</ul>\n<p>Multiple values shall be separated by <code>|</code> symbol e.g. <code>compare=text|metadata</code></p>\n<p>The compared properties are are:</p>\n<ul>\n<li>metadata</li>\n<li>page_count</li>\n<li>sigflags</li>\n<li>text</li>\n</ul>\n<p>Result is passed if all properties are equal.</p>\n<p><code>reference_document</code> and <code>candidate_document</code> are both .pdf files</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<th>Keyword</th>\n<th>reference_document</th>\n<th>candidate_document</th>\n<th>comment</th>\n<td></td>\n</tr>\n<tr>\n<td>Compare Pdf Documents</td>\n<td>reference.pdf</td>\n<td>candidate.pdf</td>\n<td>#Performs a property comparison of both files</td>\n<td></td>\n</tr>\n<tr>\n<td>Compare Pdf Documents</td>\n<td>reference.pdf</td>\n<td>candidate.pdf</td>\n<td>compare=text</td>\n<td>#Performs a property comparison of both files. 
Only text content will be compared</td>\n</tr>\n</table>\n<p>compare=text</p>", "shortdoc": "Compares some PDF metadata/properties of ``reference_document`` and ``candidate_document``.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 17}, {"name": "PDF Should Contain Strings", "args": [{"name": "expected_text_list", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "expected_text_list"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}], "doc": "<p>Checks if each item provided in the list <code>expected_text_list</code> appears in the PDF File <code>candidate_document</code>.</p>\n<p><code>expected_text_list</code> is a list of strings or a single string, <code>candidate_document</code> is the path to a PDF File.</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<td>@{strings}=</td>\n<td>Create List</td>\n<td>One String</td>\n<td>Another String</td>\n</tr>\n<tr>\n<td>PDF Should Contain Strings</td>\n<td>${strings}</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n<tr>\n<td>PDF Should Contain Strings</td>\n<td>One String</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n</table>", "shortdoc": "Checks if each item provided in the list ``expected_text_list`` appears in the PDF File ``candidate_document``.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 181}, {"name": "PDF Should Not Contain Strings", "args": [{"name": "expected_text_list", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "expected_text_list"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}], "doc": "<p>Checks if each item provided in the list 
<code>expected_text_list</code> does NOT appear in the PDF File <code>candidate_document</code>.</p>\n<p><code>expected_text_list</code> is a list of strings or a single string, <code>candidate_document</code> is the path to a PDF File.</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<td>@{strings}=</td>\n<td>Create List</td>\n<td>One String</td>\n<td>Another String</td>\n</tr>\n<tr>\n<td>PDF Should Not Contain Strings</td>\n<td>${strings}</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n<tr>\n<td>PDF Should Not Contain Strings</td>\n<td>One String</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n</table>", "shortdoc": "Checks if each item provided in the list ``expected_text_list`` does NOT appear in the PDF File ``candidate_document``.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 221}], "dataTypes": {"enums": [], "typedDicts": []}, "typedocs": []}
+libdoc = {"specversion": 1, "name": "PdfTest", "doc": "<p>Documentation for library <code>PdfTest</code>.</p>", "version": "0.15.0", "generated": "2023-05-24T10:33:48+00:00", "type": "LIBRARY", "scope": "TEST", "docFormat": "HTML", "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 10, "tags": [], "inits": [], "keywords": [{"name": "Check Text Content", "args": [{"name": "expected_text_list", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "expected_text_list"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}], "doc": "<p><b>DEPRECATED!!</b> Use keyword <a href=\"#PDF%20Should%20Contain%20Strings\" class=\"name\">PDF Should Contain Strings</a> instead.</p>\n<p>Checks if each item provided in the list <code>expected_text_list</code> appears in the PDF File <code>candidate_document</code>.</p>\n<p><code>expected_text_list</code> is a list of strings, <code>candidate_document</code> is the path to a PDF File.</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<td>@{strings}=</td>\n<td>Create List</td>\n<td>One String</td>\n<td>Another String</td>\n</tr>\n<tr>\n<td>Check Text Content</td>\n<td>${strings}</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n</table>", "shortdoc": "*DEPRECATED!!* Use keyword `PDF Should Contain Strings` instead.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 149, "deprecated": true}, {"name": "Compare Pdf Documents", "args": [{"name": "reference_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "reference_document"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}, {"name": "kwargs", "types": [], 
"typedocs": {}, "defaultValue": null, "kind": "VAR_NAMED", "required": false, "repr": "**kwargs"}], "doc": "<p>Compares some PDF metadata/properties of <code>reference_document</code> and <code>candidate_document</code>.</p>\n<p><code>reference_document</code> and <code>candidate_document</code> shall both be path to <code>PDF</code> files. <code>compare</code> can be passed as an optional argument with following values:</p>\n<ul>\n<li>all</li>\n<li>metadata</li>\n<li>text</li>\n<li>fonts</li>\n<li>images</li>\n<li>signatures</li>\n</ul>\n<p>Multiple values shall be separated by <code>|</code> symbol e.g. <code>compare=text|metadata</code></p>\n<p>The compared properties are are:</p>\n<ul>\n<li>metadata</li>\n<li>page_count</li>\n<li>sigflags</li>\n<li>text</li>\n</ul>\n<p>Result is passed if all properties are equal.</p>\n<p><code>reference_document</code> and <code>candidate_document</code> are both .pdf files</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<th>Keyword</th>\n<th>reference_document</th>\n<th>candidate_document</th>\n<th>comment</th>\n<td></td>\n</tr>\n<tr>\n<td>Compare Pdf Documents</td>\n<td>reference.pdf</td>\n<td>candidate.pdf</td>\n<td>#Performs a property comparison of both files</td>\n<td></td>\n</tr>\n<tr>\n<td>Compare Pdf Documents</td>\n<td>reference.pdf</td>\n<td>candidate.pdf</td>\n<td>compare=text</td>\n<td>#Performs a property comparison of both files. 
Only text content will be compared</td>\n</tr>\n</table>\n<p>compare=text</p>", "shortdoc": "Compares some PDF metadata/properties of ``reference_document`` and ``candidate_document``.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 17}, {"name": "PDF Should Contain Strings", "args": [{"name": "expected_text_list", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "expected_text_list"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}], "doc": "<p>Checks if each item provided in the list <code>expected_text_list</code> appears in the PDF File <code>candidate_document</code>.</p>\n<p><code>expected_text_list</code> is a list of strings or a single string, <code>candidate_document</code> is the path to a PDF File.</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<td>@{strings}=</td>\n<td>Create List</td>\n<td>One String</td>\n<td>Another String</td>\n</tr>\n<tr>\n<td>PDF Should Contain Strings</td>\n<td>${strings}</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n<tr>\n<td>PDF Should Contain Strings</td>\n<td>One String</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n</table>", "shortdoc": "Checks if each item provided in the list ``expected_text_list`` appears in the PDF File ``candidate_document``.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 181}, {"name": "PDF Should Not Contain Strings", "args": [{"name": "expected_text_list", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "expected_text_list"}, {"name": "candidate_document", "types": [], "typedocs": {}, "defaultValue": null, "kind": "POSITIONAL_OR_NAMED", "required": true, "repr": "candidate_document"}], "doc": "<p>Checks if each item provided in the list 
<code>expected_text_list</code> does NOT appear in the PDF File <code>candidate_document</code>.</p>\n<p><code>expected_text_list</code> is a list of strings or a single string, <code>candidate_document</code> is the path to a PDF File.</p>\n<p>Examples:</p>\n<table border=\"1\">\n<tr>\n<td>@{strings}=</td>\n<td>Create List</td>\n<td>One String</td>\n<td>Another String</td>\n</tr>\n<tr>\n<td>PDF Should Not Contain Strings</td>\n<td>${strings}</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n<tr>\n<td>PDF Should Not Contain Strings</td>\n<td>One String</td>\n<td>candidate.pdf</td>\n<td></td>\n</tr>\n</table>", "shortdoc": "Checks if each item provided in the list ``expected_text_list`` does NOT appear in the PDF File ``candidate_document``.", "tags": [], "source": "C:\\workspace\\robotframework-doctestlibrary\\DocTest\\PdfTest.py", "lineno": 221}], "dataTypes": {"enums": [], "typedDicts": []}, "typedocs": []}
 </script>
 <title></title>
 </head>