From 3337feeeec66800b09e87db3d83892c8e8bd5868 Mon Sep 17 00:00:00 2001 From: Rachel Hu Date: Tue, 7 May 2024 09:54:00 -0700 Subject: [PATCH] add e2e demo for KM --- .gitignore | 6 +- demo/demo.ipynb | 205 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 149 insertions(+), 62 deletions(-) diff --git a/.gitignore b/.gitignore index 65a8a3d..3ced46f 100644 --- a/.gitignore +++ b/.gitignore @@ -163,4 +163,8 @@ cython_debug/ .DS_Store # vscode -.vscode/ \ No newline at end of file +.vscode/ + +# data/ +*.xlsx +*.csv \ No newline at end of file diff --git a/demo/demo.ipynb b/demo/demo.ipynb index 36938dc..a409758 100644 --- a/demo/demo.ipynb +++ b/demo/demo.ipynb @@ -9,12 +9,29 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting openpyxl\n", + " Downloading openpyxl-3.1.2-py2.py3-none-any.whl.metadata (2.5 kB)\n", + "Collecting et-xmlfile (from openpyxl)\n", + " Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)\n", + "Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m250.0/250.0 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hDownloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)\n", + "Installing collected packages: et-xmlfile, openpyxl\n", + "Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2\n" + ] + } + ], "source": [ "# !pip3 install python-dotenv\n", - "# !pip3 install --upgrade any-parser" + "# !pip3 install --upgrade any-parser\n", + "# !pip3 install openpyxl" ] }, { @@ -26,79 +43,101 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import sys" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "sys.path.append(\".\")\n", - "sys.path.append(\"..\")\n", - "sys.path.append(\"../..\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/mb/7wp0k3g17jd11kk9xlv5mh3m0000gn/T/ipykernel_76979/2399796935.py:2: DeprecationWarning: \n", + "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n", + "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n", + "but was not found to be installed on your system.\n", + "If this would cause problems for you,\n", + "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n", + " \n", + " import pandas as pd\n" + ] + } + ], "source": [ "import os\n", "import pandas as pd\n", "\n", "from dotenv import load_dotenv\n", "from any_parser import AnyParser\n", - "\n", - "from IPython.display import HTML, display\n" + "from IPython.display import HTML, display" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Load api key" + "## Load api key and Initialize AnyParser" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "load_dotenv(override=True)\n", - "example_apikey = os.getenv(\"CAMBIO_API_KEY\")" + "example_apikey = os.getenv(\"CAMBIO_API_KEY\")\n", + "\n", + "op = AnyParser(example_apikey)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## parse into html: March_2024 page 8" + "## Helper function: Convert HTML to Excel" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "example_local_file = \"./Nevada_Gaming_Revenue_Report_(March_2024)_page8.pdf\"\n", + "import os\n", + "from bs4 import BeautifulSoup\n", + "import pandas as pd\n", + "from io import StringIO\n", + "\n", "\n", - "op = AnyParser(example_apikey)\n", - "# mode can be \"basic\" or \"advanced\"\n", - "qa_result = op.parse(example_local_file)" + "def html_to_excel(html_string, output_folder, output_filename):\n", + " if not os.path.exists(output_folder):\n", + " os.makedirs(output_folder)\n", + "\n", + " soup = BeautifulSoup(html_string, 'html.parser')\n", + "\n", + " tables = soup.find_all('table')\n", + "\n", + " dfs = {}\n", + " for i, table in enumerate(tables):\n", + " dfs[f\"Table_{i+1}\"] = pd.read_html(StringIO(str(table)))[0]\n", + "\n", + " output_file = os.path.join(output_folder, output_filename)\n", + " with pd.ExcelWriter(output_file) as writer:\n", + " for name, df in dfs.items():\n", + " df.to_excel(writer, sheet_name=name, index=False)\n", + "\n", + " print(f\"Excel file saved to {output_file}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## parse into html: March_2024 page 8" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -155,27 +194,35 @@ } ], "source": [ - "display(HTML(qa_result[0]))" + "sample_page8 = \"./Nevada_Gaming_Revenue_Report_(March_2024)_page8.pdf\"\n", + "sample_page8_result = op.parse(sample_page8)\n", + "display(HTML(sample_page8_result[0]))" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 13, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Excel file saved to output/./Nevada_Gaming_Revenue_Report_(March_2024)_page8.pdf.xlsx\n" + ] + } + ], "source": [ - "## parse into html: March_2024 page 35" + "output_folder = 'output'\n", + "sample_page8_output = '{}.xlsx'.format(sample_page8)\n", + "html_to_excel(sample_page8_result[0], output_folder, sample_page8_output)" ] }, { - "cell_type": "code", - "execution_count": 8, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "example_local_file = \"./Nevada_Gaming_Revenue_Report_(March_2024)_page35.pdf\"\n", - "\n", - "op = AnyParser(example_apikey)\n", - "# mode can be \"basic\" or \"advanced\"\n", - "qa_result = op.parse(example_local_file)" + "## parse into html: March_2024 page 35" ] }, { @@ -203,32 +250,40 @@ } ], "source": [ - "display(HTML(qa_result[0]))" + "sample_page35 = \"./Nevada_Gaming_Revenue_Report_(March_2024)_page35.pdf\"\n", + "sample_page35_result = op.parse(sample_page35)\n", + "display(HTML(sample_page35_result[0]))" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 12, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Excel file saved to output/./Nevada_Gaming_Revenue_Report_(March_2024)_page35.pdf.xlsx\n" + ] + } + ], "source": [ - "## parse into html: March_2024 page 43" + "output_folder = 'output'\n", + "sample_page35_output = '{}.xlsx'.format(sample_page35)\n", + "html_to_excel(sample_page35_result[0], output_folder, sample_page35_output)" ] }, { - "cell_type": "code", - "execution_count": 10, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "example_local_file = \"./Nevada_Gaming_Revenue_Report_(March_2024)_page43.pdf\"\n", - "\n", - "op = AnyParser(example_apikey)\n", - "# mode can be \"basic\" or \"advanced\"\n", - "qa_result = op.parse(example_local_file)" + "## parse into html: March_2024 page 43" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -261,8 +316,36 @@ } ], "source": [ - "display(HTML(qa_result[0]))" + "sample_page43 = \"Nevada_Gaming_Revenue_Report_(March_2024)_page43.pdf\"\n", + "sample_page43_result = op.parse(\"./\"+ sample_page43)\n", + "display(HTML(sample_page43_result[0]))" ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Excel file saved to output/Nevada_Gaming_Revenue_Report_(March_2024)_page43.pdf.xlsx\n" + ] + } + ], + "source": [ + "output_folder = 'output'\n", + "sample_page43_output = '{}.xlsx'.format(sample_page43)\n", + "html_to_excel(sample_page43_result[0], output_folder, sample_page43_output)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {