From 44b36a2f210c651f1491d14a6eb0fadc32a37136 Mon Sep 17 00:00:00 2001
From: Gabo
Date: Thu, 11 Jul 2024 14:24:00 +0200
Subject: [PATCH] Fix race condition

---
Review notes (this section is not part of the commit message):
* Line structure was restored from a whitespace-collapsed copy. The position
  of the blank context lines in the src/QueueProcessor.py hunk is inferred;
  verify it against the tree before applying.
* exists_file() logged "File ... exists" on the branch where the file is
  missing; the message now says the file does not exist yet.
* exists_file() gains a one-line docstring, so the second and third hunk
  headers of src/extract_segments.py and the diffstat are adjusted by one
  added line. The `index` hashes of that file predate this adjustment.

 Makefile                |  2 +-
 src/QueueProcessor.py   |  1 -
 src/extract_segments.py | 37 +++++++++++++++++++------------------
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/Makefile b/Makefile
index 7ae904b..18903fb 100644
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,7 @@ start:
 	docker compose up --build
 
 stop:
-	docker compose stop ; docker compose stop -f docker-compose-test.yml
+	docker compose stop ; docker compose -f docker-compose-test.yml stop
 
 start-test:
 	docker compose -f docker-compose-test.yml up --attach api-pdf-layout --attach queue-processor-pdf-layout --attach worker-pdf-layout --build
diff --git a/src/QueueProcessor.py b/src/QueueProcessor.py
index 795dc2e..5178526 100644
--- a/src/QueueProcessor.py
+++ b/src/QueueProcessor.py
@@ -9,7 +9,6 @@ from sentry_sdk.integrations.redis import RedisIntegration
 
 import sentry_sdk
 
-from PdfFile import PdfFile
 from configuration import (
     MONGO_HOST,
     MONGO_PORT,
diff --git a/src/extract_segments.py b/src/extract_segments.py
index 1c46246..1eaccaa 100644
--- a/src/extract_segments.py
+++ b/src/extract_segments.py
@@ -1,4 +1,6 @@
-from configuration import DOCUMENT_LAYOUT_ANALYSIS_URL
+from time import sleep
+
+from configuration import DOCUMENT_LAYOUT_ANALYSIS_URL, service_logger
 from data_model.SegmentBox import SegmentBox
 from PdfFile import PdfFile
 from data_model.ExtractionData import ExtractionData
@@ -10,11 +12,25 @@ def get_xml_name(task: Task) -> str:
     return f"{task.tenant}__{task.params.filename.lower().replace('.pdf', '.xml')}"
 
 
+def exists_file(tenant: str, file_name: str) -> bool:
+    """Return True once PdfFile(tenant).get_path(file_name) exists, checking up to 5 times with 1 s sleeps."""
+    for i in range(5):
+        pdf_file = PdfFile(tenant)
+        if pdf_file.get_path(file_name).exists():
+            return True
+
+        service_logger.info(f"File {pdf_file.get_path(file_name)} does not exist yet, retrying")
+        sleep(1)
+
+    return False
+
+
 def extract_segments(task: Task, xml_file_name: str = "") -> ExtractionData:
-    pdf_file = PdfFile(task.tenant)
-    if not pdf_file.get_path(task.params.filename).exists():
+    if not exists_file(task.tenant, task.params.filename):
         raise FileNotFoundError
 
+    pdf_file = PdfFile(task.tenant)
+
     with open(pdf_file.get_path(task.params.filename), "rb") as stream:
         files = {"file": stream}
 
@@ -34,18 +50,3 @@ def extract_segments(task: Task, xml_file_name: str = "") -> ExtractionData:
         page_height=0 if not segments else segments[0].page_height,
         page_width=0 if not segments else segments[0].page_width,
     )
-
-
-if __name__ == "__main__":
-    a = {
-        "left": 1,
-        "top": 1,
-        "width": 1,
-        "height": 1,
-        "page_number": 1,
-        "page_width": 1,
-        "page_height": 1,
-        "text": "",
-        "type": "Section_Header",
-    }
-    print(SegmentBox(**a))