Skip to content

Commit

Permalink
Use fast document layout analysis
Browse files (browse the repository at this point in the history)
  • Loading branch information
gabriel-piles committed Sep 5, 2024
1 parent a645bd4 commit c59a941
Show file tree
Hide file tree
Showing 3 changed files with 4 additions and 6 deletions.
1 change: 0 additions & 1 deletion src/QueueProcessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
TASK_QUEUE_NAME,
SERVICE_HOST,
SERVICE_PORT,
- DOCUMENT_LAYOUT_ANALYSIS_PORT,
ENVIRONMENT,
SENTRY_DSN,
service_logger,
Expand Down
7 changes: 3 additions & 4 deletions src/extract_segments.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
- from time import sleep

from configuration import DOCUMENT_LAYOUT_ANALYSIS_URL, service_logger
from data_model.SegmentBox import SegmentBox
from PdfFile import PdfFile
Expand All @@ -14,11 +12,12 @@ def get_xml_name(task: Task) -> str:

def extract_segments(task: Task, xml_file_name: str = "") -> ExtractionData:
pdf_file = PdfFile(task.tenant)
+ data = {"fast": "True"}
+ url = DOCUMENT_LAYOUT_ANALYSIS_URL + (f"/save_xml/{xml_file_name}" if xml_file_name else "")

with open(pdf_file.get_path(task.params.filename), "rb") as stream:
files = {"file": stream}
- url = DOCUMENT_LAYOUT_ANALYSIS_URL + (f"/save_xml/{xml_file_name}" if xml_file_name else "")
- results = requests.post(url, files=files)
+ results = requests.post(url, files=files, data=data)

if results.status_code != 200:
service_logger.error(f"Response error: {results.status_code} - {results.text}")
Expand Down
2 changes: 1 addition & 1 deletion src/test_end_to_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def test_async_extraction(self):
self.assertLess(15, len(extraction_data.paragraphs))
self.assertEqual(612, extraction_data.page_width)
self.assertEqual(792, extraction_data.page_height)
- self.assertEqual("A /INF/76/1", extraction_data.paragraphs[0].text)
+ self.assertTrue(extraction_data.paragraphs[0].text in ["A /INF/76/1", "United Nations"])
self.assertEqual({1, 2}, {x.page_number for x in extraction_data.paragraphs})

response = requests.get(extraction_message.file_url)
Expand Down

0 comments on commit c59a941

Please sign in to comment.