Skip to content

Commit

Permalink
Fix load labeled data
Browse files Browse the repository at this point in the history
  • Loading branch information
gabriel-piles committed Feb 9, 2024
1 parent 9765c33 commit bab0ec6
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 18 deletions.
62 changes: 62 additions & 0 deletions results/results.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
### Letter corpus

F1 score 99.25%
Accuracy score 99.52%
finished in 54 seconds

File Type Mistakes
----------------- ----------------
one_column_test 162/213 (23.94%)
multi_column_test 56/81 (30.86%)

Average Accuracy: 218/294 (25.85%)
Total Time: 0.53



### Unicode: first 2 characters + last 2 characters

F1 score 99.19%
Accuracy score 99.48%
finished in 66 seconds

Process finished with exit code 0

File Type Mistakes
----------------- ----------------
one_column_test 162/213 (23.94%)
multi_column_test 55/81 (32.1%)

Average Accuracy: 217/294 (26.19%)
Total Time: 0.81



### No Unicode features

F1 score 99.17%
Accuracy score 99.46%
finished in 55 seconds

File Type Mistakes
----------------- ----------------
one_column_test 162/213 (23.94%)
multi_column_test 55/81 (32.1%)

Average Accuracy: 217/294 (26.19%)
Total Time: 0.87


### Unicode: only the first two letters

F1 score 99.21%
Accuracy score 99.49%
finished in 62 seconds

File Type Mistakes
----------------- ----------------
one_column_test 162/213 (23.94%)
multi_column_test 55/81 (32.1%)

Average Accuracy: 217/294 (26.19%)
Total Time: 0.8
4 changes: 2 additions & 2 deletions src/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from paragraph_extraction_trainer.load_labeled_data import load_labeled_data
from paragraph_extraction_trainer.model_configuration import MODEL_CONFIGURATION

BENCHMARK_MODEL_PATH = Path(join(ROOT_PATH, "model", "paragraph_extraction_benchmark.model"))
BENCHMARK_MODEL_PATH = Path(join(ROOT_PATH, "model", "benchmark.model"))


def loop_pdf_paragraph_tokens(pdf_paragraph_tokens_list: list[PdfParagraphTokens]):
Expand All @@ -25,7 +25,7 @@ def loop_pdf_paragraph_tokens(pdf_paragraph_tokens_list: list[PdfParagraphTokens

def train_for_benchmark():
pdf_paragraph_tokens_list = load_labeled_data(PDF_LABELED_DATA_ROOT_PATH, filter_in="train")

print('length of pdf paragraphs for training', len(pdf_paragraph_tokens_list))
pdf_features_list = [pdf_paragraph_tokens.pdf_features for pdf_paragraph_tokens in pdf_paragraph_tokens_list]
trainer = ParagraphExtractorTrainer(pdfs_features=pdf_features_list, model_configuration=MODEL_CONFIGURATION)

Expand Down
7 changes: 7 additions & 0 deletions src/benchmark_table.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
File Type Mistakes
----------------- ----------------
multi_column_test 55/81 (32.1%)
one_column_test 162/213 (23.94%)

Average Accuracy: 217/294 (26.19%)
Total Time: 0.8
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pdf_token_type_labels.TokenType import TokenType
from pdf_tokens_type_trainer.TokenFeatures import TokenFeatures
from pdf_tokens_type_trainer.TokenTypeTrainer import TokenTypeTrainer
from pdf_tokens_type_trainer.config import CHARACTER_TYPE


class ParagraphExtractorTrainer(TokenTypeTrainer):
Expand Down
33 changes: 17 additions & 16 deletions src/paragraph_extraction_trainer/PdfParagraphTokens.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from os.path import join
from tkinter import Label

from pdf_features.PdfToken import PdfToken
from pdf_token_type_labels.PdfLabels import PdfLabels

Expand All @@ -14,17 +16,13 @@ def __init__(self, pdf_features: PdfFeatures, paragraphs: list[Paragraph]):
self.paragraphs = paragraphs

@staticmethod
def get_page_labels(paragraphs_extractions_labels: PdfLabels, page_number: int):
page_labels = []
def get_page_number_labels(paragraphs_extractions_labels: PdfLabels):
page_number_labels = {}

for page in paragraphs_extractions_labels.pages:
if page.number != page_number:
continue

for label_index, label in enumerate(sorted(page.labels, key=lambda _label: _label.area())):
page_labels.append((label_index, label, page.number))
page_number_labels[page.number] = list(sorted(page.labels, key=lambda _label: (_label.area(), _label.top)))

return page_labels
return page_number_labels

@staticmethod
def from_labeled_data(pdf_labeled_data_root_path, dataset, pdf_name):
Expand All @@ -36,29 +34,32 @@ def from_labeled_data(pdf_labeled_data_root_path, dataset, pdf_name):

@staticmethod
def set_paragraphs(pdf_features: PdfFeatures, paragraphs_extractions_labels: PdfLabels):
tokens_by_labels: dict[int, Paragraph] = dict()
tokens_by_labels: dict[tuple[int, int], Paragraph] = dict()

page_number_labels = PdfParagraphTokens.get_page_number_labels(paragraphs_extractions_labels)

for token_index, (page, token) in enumerate(pdf_features.loop_tokens()):
page_labels = PdfParagraphTokens.get_page_labels(paragraphs_extractions_labels, page.page_number)
page_labels = page_number_labels[page.page_number]
intersection, best_label = PdfParagraphTokens.get_intersected_label(page_labels, token)

if intersection:
tokens_by_labels.setdefault(best_label, Paragraph([])).add_token(token)
label_index = page_labels.index(best_label)
tokens_by_labels.setdefault((page.page_number, label_index), Paragraph([])).add_token(token)
else:
tokens_by_labels[-token_index - 1] = Paragraph(tokens=[token])
tokens_by_labels[(page.page_number, -token_index - 1)] = Paragraph(tokens=[token])

return PdfParagraphTokens(pdf_features, list(tokens_by_labels.values()))

@staticmethod
def get_intersected_label(page_labels, token):
def get_intersected_label(page_labels: list[Label], token: PdfToken):
max_intersection = 0
best_label = -1
for label_index, label, label_page_number in page_labels:
best_label = None
for label in page_labels:
intersection = token.get_label_intersection_percentage(label)

if intersection > max_intersection:
max_intersection = intersection
best_label = label_index
best_label = label

if max_intersection > 99:
break
Expand Down

0 comments on commit bab0ec6

Please sign in to comment.