Skip to content

Commit

Permalink
Eval app improvements
Browse files (browse the repository at this point in the history)
  • Loading branch information
Guillaume Millot authored and committed on Apr 30, 2024
1 parent ef31063 commit 29b141a
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 157 deletions.
1 change: 0 additions & 1 deletion collecte/pdf_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ def download_pdf(
company_folder = download_folder / website_name

Path.mkdir(company_folder, parents=True, exist_ok=True)
#local_filename = Path(company_folder) / url.split("/")[-1]
local_filename = Path(company_folder) / (website_name + "___" + url.split("/")[-1])

exception_status = None
Expand Down
27 changes: 11 additions & 16 deletions configs/eval_table_extraction.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,21 @@ pagefilter:
type: FromFilename

table_extraction:
# - type: Camelot
# params:
# flavor: stream
# - type: Camelot
# params:
# flavor: lattice
- type: Unstructured
- type: FromCSV
params:
hi_res_model_name: "yolox"
- type: Unstructured
csv_directory: "data/extractions/extracttable"
- type: LLamaParse
- type: UnstructuredAPI
params:
hi_res_model_name: "yolox"
pdf_image_dpi: 300
- type: UnstructuredAPI
params:
hi_res_model_name: "detectron2_onnx"
- type: Unstructured
params:
hi_res_model_name: "yolox"
pdf_image_dpi: 500
- type: UnstructuredAPI
pdf_image_dpi: 400
- type: Unstructured
params:
hi_res_model_name: "yolox"
- type: LLamaParse

# table_cleaning:
hi_res_model_name: "detectron2_onnx"
pdf_image_dpi: 400
20 changes: 15 additions & 5 deletions country_by_country/table_extraction/unstructured_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,21 @@ def __call__(self, pdf_filepath: str) -> dict:
except Exception as e:
print(e)
else:
tables_list = [
pd.read_html(StringIO(el["metadata"]["text_as_html"]))[0]
for el in resp.elements
if el["type"] == "Table"
]
tables_list = []
for el in resp.elements:
if el["type"] == "Table":
# Enclose in try block to ignore case when pandas can't read the table
# Happens when the html is incorrectly formatted
try:
table = pd.read_html(StringIO(el["metadata"]["text_as_html"]))[
0
]
except Exception:
logging.info(
"Html table discarded. Pandas couldn't read the table.",
)
else:
tables_list.append(table)

# Create asset
new_asset = {
Expand Down
Loading

0 comments on commit 29b141a

Please sign in to comment.