From 636f58687e7bfb6d7a221ac777b82a9f0228fbeb Mon Sep 17 00:00:00 2001
From: Gabo
Date: Mon, 12 Feb 2024 14:34:11 +0100
Subject: [PATCH] Update token type model

---
 requirements.txt        | 18 ++++----
 setup.py                |  2 +-
 src/benchmark.py        | 43 +++++++++++++++----
 src/benchmark_table.txt |  8 ++--
 .../download_models.py  |  6 +--
 5 files changed, 50 insertions(+), 27 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index c020107..c6448b6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,14 +1,14 @@
-fastapi==0.103.1
+fastapi==0.109.2
 graypy==2.1.0
-python-multipart==0.0.6
-uvicorn==0.23.2
+python-multipart==0.0.9
+uvicorn==0.27.1
 gunicorn==21.2.0
 nltk==3.8.1
 PyYAML==6.0.1
 requests==2.31.0
-pymongo==4.5.0
-PyRSMQ==0.4.5
-redis==5.0.0
-httpx==0.25.0
-sentry-sdk==1.30.0
-git+https://github.com/huridocs/pdf-tokens-type-labeler@5b38fdb764e628e108426a7c6a560f97a283f19c
+pymongo==4.6.1
+PyRSMQ==0.5.0
+redis==5.0.1
+httpx==0.26.0
+sentry-sdk==1.40.3
+git+https://github.com/huridocs/pdf-tokens-type-labeler@7f46632cff996baaf42b8c8ac96bab20ab59dd32
diff --git a/setup.py b/setup.py
index 8301238..813aa4f 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
     name=PROJECT_NAME,
     packages=["paragraph_extraction_trainer"],
     package_dir={"": "src"},
-    version="0.26",
+    version="0.27",
     url="https://github.com/huridocs/pdf_paragraphs_extraction",
     author="HURIDOCS",
     description="Service for extracting paragraphs from PDFs.",
diff --git a/src/benchmark.py b/src/benchmark.py
index 1a09439..d7cce39 100644
--- a/src/benchmark.py
+++ b/src/benchmark.py
@@ -36,7 +36,20 @@ def train_for_benchmark():
     trainer.train(str(BENCHMARK_MODEL_PATH), labels)
 
 
-def predict_for_benchmark(pdf_paragraph_tokens_list: list[PdfParagraphTokens], get_granular_scores: bool):
+def train():
+    pdf_paragraph_tokens_list = load_labeled_data(PDF_LABELED_DATA_ROOT_PATH)
+    print("length of pdf paragraphs for training", len(pdf_paragraph_tokens_list))
+    pdf_features_list = [pdf_paragraph_tokens.pdf_features for pdf_paragraph_tokens in pdf_paragraph_tokens_list]
+    trainer = ParagraphExtractorTrainer(pdfs_features=pdf_features_list, model_configuration=MODEL_CONFIGURATION)
+
+    labels = []
+    for pdf_paragraph_tokens, token, next_token in loop_pdf_paragraph_tokens(pdf_paragraph_tokens_list):
+        labels.append(pdf_paragraph_tokens.check_same_paragraph(token, next_token))
+    model_path = Path(join(ROOT_PATH, "model", "all_data.model"))
+    trainer.train(str(model_path), labels)
+
+
+def predict_for_benchmark(pdf_paragraph_tokens_list: list[PdfParagraphTokens], model_path: str = ""):
     pdf_features_list = [pdf_paragraph_tokens.pdf_features for pdf_paragraph_tokens in pdf_paragraph_tokens_list]
     trainer = ParagraphExtractorTrainer(pdfs_features=pdf_features_list, model_configuration=MODEL_CONFIGURATION)
     truths = []
@@ -45,20 +58,34 @@ def predict_for_benchmark(pdf_paragraph_tokens_list: list[PdfParagraphTokens], g
 
     print("predicting")
     start_time = time()
-    trainer.predict(BENCHMARK_MODEL_PATH)
+    if model_path:
+        trainer.predict(model_path)
+    else:
+        trainer.predict(BENCHMARK_MODEL_PATH)
     predictions = [token.prediction for token in trainer.loop_tokens()]
     total_time = time() - start_time
-    if get_granular_scores:
-        benchmark_table = BenchmarkTable(pdf_paragraph_tokens_list, total_time)
-        benchmark_table.prepare_benchmark_table()
+    benchmark_table = BenchmarkTable(pdf_paragraph_tokens_list, total_time)
+    benchmark_table.prepare_benchmark_table()
 
     return truths, predictions
 
 
-def benchmark(get_granular_scores: bool):
+def benchmark():
+    train_for_benchmark()
+    pdf_paragraph_tokens_list = load_labeled_data(PDF_LABELED_DATA_ROOT_PATH, filter_in="test")
+    truths, predictions = predict_for_benchmark(pdf_paragraph_tokens_list)
+
+    f1 = round(f1_score(truths, predictions, average="macro") * 100, 2)
+    accuracy = round(accuracy_score(truths, predictions) * 100, 2)
+    print(f"F1 score {f1}%")
+    print(f"Accuracy score {accuracy}%")
+
+
+def benchmark_all():
     train_for_benchmark()
     pdf_paragraph_tokens_list = load_labeled_data(PDF_LABELED_DATA_ROOT_PATH, filter_in="test")
-    truths, predictions = predict_for_benchmark(pdf_paragraph_tokens_list, get_granular_scores)
+    model_path = str(Path(join(ROOT_PATH, "model", "all_data.model")))
+    truths, predictions = predict_for_benchmark(pdf_paragraph_tokens_list, model_path)
 
     f1 = round(f1_score(truths, predictions, average="macro") * 100, 2)
     accuracy = round(accuracy_score(truths, predictions) * 100, 2)
@@ -69,5 +96,5 @@ def benchmark(get_granular_scores: bool):
 if __name__ == "__main__":
     print("start")
     start = time()
-    benchmark(get_granular_scores=True)
+    benchmark_all()
     print("finished in", int(time() - start), "seconds")
diff --git a/src/benchmark_table.txt b/src/benchmark_table.txt
index c870f98..a9dfb37 100644
--- a/src/benchmark_table.txt
+++ b/src/benchmark_table.txt
@@ -1,7 +1,7 @@
 File                 Type Mistakes
 -----------------    ---------------
-multi_column_test    3/410 (99.27%)
-one_column_test      72/2181 (96.7%)
+multi_column_test    0/410 (100.0%)
+one_column_test      0/2181 (100.0%)
 
-Average Accuracy: 75/2591 (97.11%)
-Total Time: 0.82
\ No newline at end of file
+Average Accuracy: 0/2591 (100.0%)
+Total Time: 0.84
\ No newline at end of file
diff --git a/src/paragraph_extraction_trainer/download_models.py b/src/paragraph_extraction_trainer/download_models.py
index cd5fcf6..14835c4 100644
--- a/src/paragraph_extraction_trainer/download_models.py
+++ b/src/paragraph_extraction_trainer/download_models.py
@@ -3,11 +3,7 @@
 paragraph_extraction_model_path = hf_hub_download(
     repo_id="HURIDOCS/pdf-segmentation",
     filename="paragraph_extraction_model.model",
-    revision="ffc4f1753ac1ca55981be06c29f1e73787e611b5",
-)
-
-letter_corpus_path = hf_hub_download(
-    repo_id="HURIDOCS/pdf-segmentation", filename="letter_corpus.txt", revision="da00a69c8d6a84493712e819580c0148757f466c"
+    revision="3dc98aa51e073066a78edde745d8882121c4891f",
 )
 
 toc_model_path = hf_hub_download(
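
A quick sanity check for the new model pin, as a minimal sketch: it assumes huggingface_hub is installed and simply repeats the download call by hand; repo_id, filename and revision are copied verbatim from the updated download_models.py, while the variable name and print line are only illustrative.

    from huggingface_hub import hf_hub_download

    # Fetch the newly pinned paragraph extraction model; hf_hub_download
    # returns the path of the file inside the local Hugging Face cache.
    model_path = hf_hub_download(
        repo_id="HURIDOCS/pdf-segmentation",
        filename="paragraph_extraction_model.model",
        revision="3dc98aa51e073066a78edde745d8882121c4891f",
    )
    print("paragraph extraction model cached at", model_path)

With the model downloaded, running src/benchmark.py (its __main__ block now calls benchmark_all()) should reproduce the scores recorded in src/benchmark_table.txt.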