Skip to content

Commit

Permalink
Merge pull request #79 from CambioML/csv_feature
Browse files Browse the repository at this point in the history
feat: add csv feature to extract_tables
  • Loading branch information
lingjiekong authored Jan 14, 2025
2 parents 891e9e7 + 3972e7a commit fc996f0
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 17 deletions.
55 changes: 53 additions & 2 deletions any_parser/any_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import json
import time
import uuid
from collections.abc import Iterable
from io import StringIO
from pathlib import Path

import requests
Expand Down Expand Up @@ -184,26 +186,75 @@ def extract_pii(
file_type=file_type,
)

@staticmethod
def flatten_to_string(item):
"""
Flatten any iterable object to a string.
"""

if isinstance(item, str):
return item

# if item is a dict, flatten all keys and values
if isinstance(item, dict):
parts = []
for k, v in item.items():
parts.append(AnyParser.flatten_to_string(k))
parts.append(AnyParser.flatten_to_string(v))
return "".join(parts)

# item is other iterable objects
if isinstance(item, Iterable):
parts = []
for sub_item in item:
parts.append(AnyParser.flatten_to_string(sub_item))
return "".join(parts)

# item is not iterable objects
return str(item)

@handle_file_processing
def extract_tables(
    self,
    file_path=None,
    file_content=None,
    file_type=None,
    return_type="html",
):
    """Extract tables from a file in real-time.

    Args:
        file_path (str): The path to the file to be parsed.
        file_content: Forwarded unchanged to the synchronous table
            extractor. NOTE(review): presumably populated by the
            ``handle_file_processing`` decorator -- confirm.
        file_type: Forwarded unchanged to the synchronous table
            extractor. NOTE(review): presumably populated by the
            ``handle_file_processing`` decorator -- confirm.
        return_type (str): 'html' or 'csv' (compared case-insensitively).
            Any value other than 'csv' returns the HTML result.

    Returns:
        tuple(str, str): The extracted data and the time taken. The data
        is the HTML returned by the extractor (a list result is flattened
        into one string), or, for 'csv', all tables found in that HTML
        concatenated row-wise and serialized as CSV without the index.

    Raises:
        ImportError: If ``return_type`` is 'csv' and pandas is not
            installed.
        ValueError: Propagated from ``pandas.read_html`` when the
            extracted HTML contains no table (only for 'csv').
    """
    extracted_html, time_elapsed = self._sync_extract_tables.extract(
        file_path=file_path,
        file_content=file_content,
        file_type=file_type,
    )

    # The extractor may hand back a (possibly nested) list of fragments;
    # collapse it to a single HTML string once, up front.
    if isinstance(extracted_html, list):
        extracted_html = AnyParser.flatten_to_string(extracted_html)

    if return_type.lower() != "csv":
        return extracted_html, time_elapsed

    # pandas is an optional dependency needed only for CSV output, so it
    # is imported lazily here rather than at module import time.
    try:
        import pandas as pd
    except ImportError as err:
        raise ImportError(
            "Please install pandas to use CSV return_type"
        ) from err

    # read_html expects a file-like object for literal HTML input.
    df_list = pd.read_html(StringIO(extracted_html))
    combined_df = pd.concat(df_list, ignore_index=True)
    csv_output = combined_df.to_csv(index=False)

    return csv_output, time_elapsed

@handle_file_processing
def extract_key_value(
self,
Expand Down
69 changes: 54 additions & 15 deletions examples/extract_tables.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -15,15 +15,23 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/ubuntu/any-parser/any_parser/__init__.py\n"
]
}
],
"source": [
"from IPython.display import display, Markdown\n",
"from any_parser import AnyParser"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -36,8 +44,13 @@
"metadata": {},
"outputs": [],
"source": [
"file_path = \"./sample_data/test_1figure_1table.png\"\n",
"html_output, time = ap.extract_tables(file_path)"
"csv_output, time_info = ap.extract_tables(\n",
" file_path=\"./sample_data/test_1figure_1table.png\", return_type=\"csv\"\n",
")\n",
"\n",
"html_output, time_info = ap.extract_tables(\n",
" file_path=\"./sample_data/test_1figure_1table.png\", return_type=\"html\"\n",
")"
]
},
{
Expand All @@ -46,14 +59,12 @@
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Time Elapsed: 3.97 seconds'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2 μs, sys: 0 ns, total: 2 μs\n",
"Wall time: 5.25 μs\n"
]
}
],
"source": [
Expand All @@ -62,9 +73,31 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"0,1,2\n",
",latency,(ms)\n",
"participants,mean,99th percentile\n",
"1,17.0 +1.4,75.0 34.9\n",
"2,24.5 +2.5,87.6 35.9\n",
"5,31.5 +6.2,104.5 52.2\n",
"10,30.0 +3.7,95.6 25.4\n",
"25,35.5 +5.6,100.4 42.7\n",
"50,42.7 +4.1,93.7 22.9\n",
"100,71.4 +7.6,131.2 +17.6\n",
"200,150.5 +11.0,320.3 35.1\n"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
Expand Down Expand Up @@ -93,6 +126,12 @@
}
],
"source": [
"if isinstance(csv_output, list):\n",
" csv_output_str = \"\\n\".join(csv_output)\n",
"else:\n",
" csv_output_str = csv_output\n",
"\n",
"display(Markdown(csv_output_str))\n",
"display(Markdown(html_output))"
]
}
Expand All @@ -113,7 +152,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "-1.-1.-1"
"version": "3.12.2"
}
},
"nbformat": 4,
Expand Down

0 comments on commit fc996f0

Please sign in to comment.