Skip to content

Commit

Permalink
Update token type model
Browse files Browse the repository at this point in the history
  • Loading branch information
gabriel-piles committed Feb 12, 2024
1 parent 4da8e2c commit 636f586
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 27 deletions.
18 changes: 9 additions & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
fastapi==0.103.1
fastapi==0.109.2
graypy==2.1.0
python-multipart==0.0.6
uvicorn==0.23.2
python-multipart==0.0.9
uvicorn==0.27.1
gunicorn==21.2.0
nltk==3.8.1
PyYAML==6.0.1
requests==2.31.0
pymongo==4.5.0
PyRSMQ==0.4.5
redis==5.0.0
httpx==0.25.0
sentry-sdk==1.30.0
git+https://github.com/huridocs/pdf-tokens-type-labeler@5b38fdb764e628e108426a7c6a560f97a283f19c
pymongo==4.6.1
PyRSMQ==0.5.0
redis==5.0.1
httpx==0.26.0
sentry-sdk==1.40.3
git+https://github.com/huridocs/pdf-tokens-type-labeler@7f46632cff996baaf42b8c8ac96bab20ab59dd32
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
name=PROJECT_NAME,
packages=["paragraph_extraction_trainer"],
package_dir={"": "src"},
version="0.26",
version="0.27",
url="https://github.com/huridocs/pdf_paragraphs_extraction",
author="HURIDOCS",
description="Service for extracting paragraphs from PDFs.",
Expand Down
43 changes: 35 additions & 8 deletions src/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,20 @@ def train_for_benchmark():
trainer.train(str(BENCHMARK_MODEL_PATH), labels)


def predict_for_benchmark(pdf_paragraph_tokens_list: list[PdfParagraphTokens], get_granular_scores: bool):
def train():
    """Train a paragraph-extraction model on ALL labeled PDFs and save it to model/all_data.model."""
    labeled_pdfs = load_labeled_data(PDF_LABELED_DATA_ROOT_PATH)
    print("length of pdf paragraphs for training", len(labeled_pdfs))
    features = [labeled_pdf.pdf_features for labeled_pdf in labeled_pdfs]
    trainer = ParagraphExtractorTrainer(pdfs_features=features, model_configuration=MODEL_CONFIGURATION)

    # One binary label per consecutive token pair: do both tokens belong to the same paragraph?
    same_paragraph_labels = [
        paragraph_tokens.check_same_paragraph(current_token, following_token)
        for paragraph_tokens, current_token, following_token in loop_pdf_paragraph_tokens(labeled_pdfs)
    ]
    destination = Path(join(ROOT_PATH, "model", "all_data.model"))
    trainer.train(str(destination), same_paragraph_labels)


def predict_for_benchmark(pdf_paragraph_tokens_list: list[PdfParagraphTokens], model_path: str = ""):
pdf_features_list = [pdf_paragraph_tokens.pdf_features for pdf_paragraph_tokens in pdf_paragraph_tokens_list]
trainer = ParagraphExtractorTrainer(pdfs_features=pdf_features_list, model_configuration=MODEL_CONFIGURATION)
truths = []
Expand All @@ -45,20 +58,34 @@ def predict_for_benchmark(pdf_paragraph_tokens_list: list[PdfParagraphTokens], g

print("predicting")
start_time = time()
trainer.predict(BENCHMARK_MODEL_PATH)
if model_path:
trainer.predict(model_path)
else:
trainer.predict(BENCHMARK_MODEL_PATH)
predictions = [token.prediction for token in trainer.loop_tokens()]
total_time = time() - start_time
if get_granular_scores:
benchmark_table = BenchmarkTable(pdf_paragraph_tokens_list, total_time)
benchmark_table.prepare_benchmark_table()
benchmark_table = BenchmarkTable(pdf_paragraph_tokens_list, total_time)
benchmark_table.prepare_benchmark_table()

return truths, predictions


def benchmark(get_granular_scores: bool):
def benchmark():
    """Train on the benchmark split, predict on the test PDFs, and print macro F1 and accuracy."""
    train_for_benchmark()
    test_pdfs = load_labeled_data(PDF_LABELED_DATA_ROOT_PATH, filter_in="test")
    truths, predictions = predict_for_benchmark(test_pdfs)

    # Scores are reported as percentages rounded to two decimals.
    f1 = round(f1_score(truths, predictions, average="macro") * 100, 2)
    accuracy = round(accuracy_score(truths, predictions) * 100, 2)
    print(f"F1 score {f1}%")
    print(f"Accuracy score {accuracy}%")


def benchmark_all():
train_for_benchmark()
pdf_paragraph_tokens_list = load_labeled_data(PDF_LABELED_DATA_ROOT_PATH, filter_in="test")
truths, predictions = predict_for_benchmark(pdf_paragraph_tokens_list, get_granular_scores)
model_path = str(Path(join(ROOT_PATH, "model", "all_data.model")))
truths, predictions = predict_for_benchmark(pdf_paragraph_tokens_list, model_path)

f1 = round(f1_score(truths, predictions, average="macro") * 100, 2)
accuracy = round(accuracy_score(truths, predictions) * 100, 2)
Expand All @@ -69,5 +96,5 @@ def benchmark(get_granular_scores: bool):
if __name__ == "__main__":
print("start")
start = time()
benchmark(get_granular_scores=True)
benchmark_all()
print("finished in", int(time() - start), "seconds")
8 changes: 4 additions & 4 deletions src/benchmark_table.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
File Type Mistakes
----------------- ---------------
multi_column_test 3/410 (99.27%)
one_column_test 72/2181 (96.7%)
multi_column_test 0/410 (100.0%)
one_column_test 0/2181 (100.0%)

Average Accuracy: 75/2591 (97.11%)
Total Time: 0.82
Average Accuracy: 0/2591 (100.0%)
Total Time: 0.84
6 changes: 1 addition & 5 deletions src/paragraph_extraction_trainer/download_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,7 @@
paragraph_extraction_model_path = hf_hub_download(
repo_id="HURIDOCS/pdf-segmentation",
filename="paragraph_extraction_model.model",
revision="ffc4f1753ac1ca55981be06c29f1e73787e611b5",
)

letter_corpus_path = hf_hub_download(
repo_id="HURIDOCS/pdf-segmentation", filename="letter_corpus.txt", revision="da00a69c8d6a84493712e819580c0148757f466c"
revision="3dc98aa51e073066a78edde745d8882121c4891f",
)

toc_model_path = hf_hub_download(
Expand Down

0 comments on commit 636f586

Please sign in to comment.