diff --git a/src/QueueProcessor.py b/src/QueueProcessor.py index 6703749..c11be22 100644 --- a/src/QueueProcessor.py +++ b/src/QueueProcessor.py @@ -17,7 +17,6 @@ TASK_QUEUE_NAME, SERVICE_HOST, SERVICE_PORT, - DOCUMENT_LAYOUT_ANALYSIS_PORT, ENVIRONMENT, SENTRY_DSN, service_logger, diff --git a/src/extract_segments.py b/src/extract_segments.py index fcb1068..efffe52 100644 --- a/src/extract_segments.py +++ b/src/extract_segments.py @@ -1,5 +1,3 @@ -from time import sleep - from configuration import DOCUMENT_LAYOUT_ANALYSIS_URL, service_logger from data_model.SegmentBox import SegmentBox from PdfFile import PdfFile @@ -14,11 +12,12 @@ def get_xml_name(task: Task) -> str: def extract_segments(task: Task, xml_file_name: str = "") -> ExtractionData: pdf_file = PdfFile(task.tenant) + data = {"fast": "True"} + url = DOCUMENT_LAYOUT_ANALYSIS_URL + (f"/save_xml/{xml_file_name}" if xml_file_name else "") with open(pdf_file.get_path(task.params.filename), "rb") as stream: files = {"file": stream} - url = DOCUMENT_LAYOUT_ANALYSIS_URL + (f"/save_xml/{xml_file_name}" if xml_file_name else "") - results = requests.post(url, files=files) + results = requests.post(url, files=files, data=data) if results.status_code != 200: service_logger.error(f"Response error: {results.status_code} - {results.text}") diff --git a/src/test_end_to_end.py b/src/test_end_to_end.py index a701d49..c4b8365 100644 --- a/src/test_end_to_end.py +++ b/src/test_end_to_end.py @@ -67,7 +67,7 @@ def test_async_extraction(self): self.assertLess(15, len(extraction_data.paragraphs)) self.assertEqual(612, extraction_data.page_width) self.assertEqual(792, extraction_data.page_height) - self.assertEqual("A /INF/76/1", extraction_data.paragraphs[0].text) + self.assertTrue(extraction_data.paragraphs[0].text in ["A /INF/76/1", "United Nations"]) self.assertEqual({1, 2}, {x.page_number for x in extraction_data.paragraphs}) response = requests.get(extraction_message.file_url)