Skip to content

Commit

Permalink
add e2e demo for KM
Browse files Browse the repository at this point in the history
  • Loading branch information
goldmermaid committed May 7, 2024
1 parent dbd4f36 commit 3337fee
Show file tree
Hide file tree
Showing 2 changed files with 149 additions and 62 deletions.
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -163,4 +163,8 @@ cython_debug/
.DS_Store

# vscode
.vscode/
.vscode/

# data/
*.xlsx
*.csv
205 changes: 144 additions & 61 deletions demo/demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,29 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 21,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting openpyxl\n",
" Downloading openpyxl-3.1.2-py2.py3-none-any.whl.metadata (2.5 kB)\n",
"Collecting et-xmlfile (from openpyxl)\n",
" Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)\n",
"Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m250.0/250.0 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hDownloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)\n",
"Installing collected packages: et-xmlfile, openpyxl\n",
"Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2\n"
]
}
],
"source": [
"# !pip3 install python-dotenv\n",
"# !pip3 install --upgrade any-parser"
"# !pip3 install --upgrade any-parser\n",
"# !pip3 install openpyxl"
]
},
{
Expand All @@ -26,79 +43,101 @@
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"sys.path.append(\".\")\n",
"sys.path.append(\"..\")\n",
"sys.path.append(\"../..\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/mb/7wp0k3g17jd11kk9xlv5mh3m0000gn/T/ipykernel_76979/2399796935.py:2: DeprecationWarning: \n",
"Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n",
"(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n",
"but was not found to be installed on your system.\n",
"If this would cause problems for you,\n",
"please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n",
" \n",
" import pandas as pd\n"
]
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"\n",
"from dotenv import load_dotenv\n",
"from any_parser import AnyParser\n",
"\n",
"from IPython.display import HTML, display\n"
"from IPython.display import HTML, display"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load api key"
"## Load API key and initialize AnyParser"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"load_dotenv(override=True)\n",
"example_apikey = os.getenv(\"CAMBIO_API_KEY\")"
"example_apikey = os.getenv(\"CAMBIO_API_KEY\")\n",
"\n",
"op = AnyParser(example_apikey)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## parse into html: March_2024 page 8"
"## Helper function: Convert HTML to Excel"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"example_local_file = \"./Nevada_Gaming_Revenue_Report_(March_2024)_page8.pdf\"\n",
"import os\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"from io import StringIO\n",
"\n",
"\n",
"op = AnyParser(example_apikey)\n",
"# mode can be \"basic\" or \"advanced\"\n",
"qa_result = op.parse(example_local_file)"
"def html_to_excel(html_string, output_folder, output_filename):\n",
"    \"\"\"Write every <table> found in an HTML string to one Excel workbook.\n",
"\n",
"    Each table is parsed into a DataFrame and stored on its own sheet,\n",
"    named Table_1, Table_2, ... in document order.\n",
"\n",
"    Args:\n",
"        html_string: HTML markup to scan for <table> elements.\n",
"        output_folder: Directory to write into; created if it is missing.\n",
"        output_filename: Workbook file name, joined onto output_folder.\n",
"\n",
"    Raises:\n",
"        ValueError: If html_string contains no <table> element.\n",
"    \"\"\"\n",
"    tables = BeautifulSoup(html_string, 'html.parser').find_all('table')\n",
"    if not tables:\n",
"        # Fail early with a clear message instead of opening a writer that\n",
"        # would have no sheet to save.\n",
"        raise ValueError(\"No <table> elements found in html_string\")\n",
"\n",
"    # Parse everything before touching the filesystem, so a parse failure\n",
"    # never leaves a partial workbook behind.\n",
"    # read_html returns a list; keep the first frame parsed from each table.\n",
"    frames = [pd.read_html(StringIO(str(table)))[0] for table in tables]\n",
"\n",
"    output_file = os.path.join(output_folder, output_filename)\n",
"    # Create the directory the workbook actually lands in; this also covers\n",
"    # an output_filename that carries its own sub-directory.\n",
"    target_dir = os.path.dirname(output_file)\n",
"    if target_dir:\n",
"        os.makedirs(target_dir, exist_ok=True)\n",
"\n",
"    with pd.ExcelWriter(output_file) as writer:\n",
"        for sheet_number, frame in enumerate(frames, start=1):\n",
"            frame.to_excel(writer, sheet_name=f\"Table_{sheet_number}\", index=False)\n",
"\n",
"    print(f\"Excel file saved to {output_file}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Parse into HTML and export to Excel: March 2024, page 8"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -155,27 +194,35 @@
}
],
"source": [
"display(HTML(qa_result[0]))"
"sample_page8 = \"./Nevada_Gaming_Revenue_Report_(March_2024)_page8.pdf\"\n",
"sample_page8_result = op.parse(sample_page8)\n",
"display(HTML(sample_page8_result[0]))"
]
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Excel file saved to output/./Nevada_Gaming_Revenue_Report_(March_2024)_page8.pdf.xlsx\n"
]
}
],
"source": [
"## parse into html: March_2024 page 35"
"# Export the tables parsed from page 8 to an Excel workbook in ./output.\n",
"output_folder = \"output\"\n",
"sample_page8_output = f\"{sample_page8}.xlsx\"\n",
"html_to_excel(\n",
"    html_string=sample_page8_result[0],\n",
"    output_folder=output_folder,\n",
"    output_filename=sample_page8_output,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"example_local_file = \"./Nevada_Gaming_Revenue_Report_(March_2024)_page35.pdf\"\n",
"\n",
"op = AnyParser(example_apikey)\n",
"# mode can be \"basic\" or \"advanced\"\n",
"qa_result = op.parse(example_local_file)"
"## Parse into HTML and export to Excel: March 2024, page 35"
]
},
{
Expand Down Expand Up @@ -203,32 +250,40 @@
}
],
"source": [
"display(HTML(qa_result[0]))"
"sample_page35 = \"./Nevada_Gaming_Revenue_Report_(March_2024)_page35.pdf\"\n",
"sample_page35_result = op.parse(sample_page35)\n",
"display(HTML(sample_page35_result[0]))"
]
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Excel file saved to output/./Nevada_Gaming_Revenue_Report_(March_2024)_page35.pdf.xlsx\n"
]
}
],
"source": [
"## parse into html: March_2024 page 43"
"# Export the tables parsed from page 35 to an Excel workbook in ./output.\n",
"output_folder = \"output\"\n",
"sample_page35_output = f\"{sample_page35}.xlsx\"\n",
"html_to_excel(\n",
"    html_string=sample_page35_result[0],\n",
"    output_folder=output_folder,\n",
"    output_filename=sample_page35_output,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"example_local_file = \"./Nevada_Gaming_Revenue_Report_(March_2024)_page43.pdf\"\n",
"\n",
"op = AnyParser(example_apikey)\n",
"# mode can be \"basic\" or \"advanced\"\n",
"qa_result = op.parse(example_local_file)"
"## Parse into HTML and export to Excel: March 2024, page 43"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -261,8 +316,36 @@
}
],
"source": [
"display(HTML(qa_result[0]))"
"sample_page43 = \"Nevada_Gaming_Revenue_Report_(March_2024)_page43.pdf\"\n",
"sample_page43_result = op.parse(\"./\"+ sample_page43)\n",
"display(HTML(sample_page43_result[0]))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Excel file saved to output/Nevada_Gaming_Revenue_Report_(March_2024)_page43.pdf.xlsx\n"
]
}
],
"source": [
"# Export the tables parsed from page 43 to an Excel workbook in ./output.\n",
"output_folder = \"output\"\n",
"sample_page43_output = f\"{sample_page43}.xlsx\"\n",
"html_to_excel(\n",
"    html_string=sample_page43_result[0],\n",
"    output_folder=output_folder,\n",
"    output_filename=sample_page43_output,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit 3337fee

Please sign in to comment.