diff --git a/demo/pdf_to_html_to_excel.ipynb b/demo/pdf_to_html_to_excel.ipynb index a409758..33f60cc 100644 --- a/demo/pdf_to_html_to_excel.ipynb +++ b/demo/pdf_to_html_to_excel.ipynb @@ -4,30 +4,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Install dependency" + "# Extract a Table from PDF into Excel\n", + "\n", + "Below is an example of using AnyParser to extract a complicated table from a financial report (PDF) into an Excel spreadsheet.\n", + "\n", + "\n", + "## 1. Load the libraries\n", + "\n", + "If you have not installed `any_parser`, uncomment the lines below." ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting openpyxl\n", - " Downloading openpyxl-3.1.2-py2.py3-none-any.whl.metadata (2.5 kB)\n", - "Collecting et-xmlfile (from openpyxl)\n", - " Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)\n", - "Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m250.0/250.0 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hDownloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)\n", - "Installing collected packages: et-xmlfile, openpyxl\n", - "Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2\n" - ] - } - ], + "outputs": [], "source": [ "# !pip3 install python-dotenv\n", "# !pip3 install --upgrade any-parser\n", @@ -38,19 +29,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Import and update path" + "## 2. 
Import and update path" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/mb/7wp0k3g17jd11kk9xlv5mh3m0000gn/T/ipykernel_76979/2399796935.py:2: DeprecationWarning: \n", + "/var/folders/mb/7wp0k3g17jd11kk9xlv5mh3m0000gn/T/ipykernel_79818/3792744553.py:2: DeprecationWarning: \n", "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n", "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n", "but was not found to be installed on your system.\n", @@ -74,12 +65,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Load api key and Initialize AnyParser" + "## 3. Load Your API Key and Initialize AnyParser" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -93,12 +84,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Helper function: Convert HTML to Excel" + "## 4. Helper function: Convert HTML to Excel" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -132,12 +123,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## parse into html: March_2024 page 8" + "## 5. 
Parse into HTML and Excel\n", + "\n", + "### 5.1 sample: March_2024 page 8" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -194,21 +187,22 @@ } ], "source": [ - "sample_page8 = \"./Nevada_Gaming_Revenue_Report_(March_2024)_page8.pdf\"\n", - "sample_page8_result = op.parse(sample_page8)\n", + "sample_page8 = \"Nevada_Gaming_Revenue_Report_(March_2024)_page8\"\n", + "sample_page8_result = op.parse(\"./{}.pdf\".format(sample_page8))\n", + "\n", "display(HTML(sample_page8_result[0]))" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Excel file saved to output/./Nevada_Gaming_Revenue_Report_(March_2024)_page8.pdf.xlsx\n" + "Excel file saved to output/Nevada_Gaming_Revenue_Report_(March_2024)_page8.xlsx\n" ] } ], @@ -222,12 +216,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## parse into html: March_2024 page 35" + "### 5.2 sample: March_2024 page 35" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -250,21 +244,22 @@ } ], "source": [ - "sample_page35 = \"./Nevada_Gaming_Revenue_Report_(March_2024)_page35.pdf\"\n", - "sample_page35_result = op.parse(sample_page35)\n", + "sample_page35 = \"Nevada_Gaming_Revenue_Report_(March_2024)_page35\"\n", + "sample_page35_result = op.parse(\"./{}.pdf\".format(sample_page35))\n", + "\n", "display(HTML(sample_page35_result[0]))" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Excel file saved to output/./Nevada_Gaming_Revenue_Report_(March_2024)_page35.pdf.xlsx\n" + "Excel file saved to output/Nevada_Gaming_Revenue_Report_(March_2024)_page35.xlsx\n" ] } ], @@ -278,7 +273,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## parse into html: March_2024 page 43" + 
"### 5.3 sample: March_2024 page 43" ] }, { @@ -316,8 +311,8 @@ } ], "source": [ - "sample_page43 = \"Nevada_Gaming_Revenue_Report_(March_2024)_page43.pdf\"\n", - "sample_page43_result = op.parse(\"./\"+ sample_page43)\n", + "sample_page43 = \"Nevada_Gaming_Revenue_Report_(March_2024)_page43\"\n", + "sample_page43_result = op.parse(\"./{}.pdf\".format(sample_page43))\n", "display(HTML(sample_page43_result[0]))" ] }, @@ -330,7 +325,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Excel file saved to output/Nevada_Gaming_Revenue_Report_(March_2024)_page43.pdf.xlsx\n" + "Excel file saved to output/Nevada_Gaming_Revenue_Report_(March_2024)_page43.xlsx\n" ] } ],