diff --git a/duckdb/Code Download.ipynb b/duckdb/Code Download.ipynb
new file mode 100644
index 0000000000..8e0face75b
--- /dev/null
+++ b/duckdb/Code Download.ipynb
@@ -0,0 +1,415 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9515c08-c3cc-4b67-8298-cb5e6da8fae8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install duckdb"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "de891a8f-ef46-418e-81f1-bb400a20ffe0",
+   "metadata": {},
+   "source": [
+    "# Test Code"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "542e9da3-c459-45de-84d6-7b6287422e4e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import duckdb\n",
+    "\n",
+    "duckdb.sql(\"SELECT 'whistling_duck' AS waterfowl, 'whistle' AS call\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3c9020b1-a8b7-4dde-a70d-7ddc35d23dac",
+   "metadata": {},
+   "source": [
+    "# Parquet Import Code"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2ed567c9-06e3-4793-a43e-a6a9a985b4a8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import duckdb\n",
+    "\n",
+    "conn = duckdb.connect(database=\"presidents.db\")\n",
+    "\n",
+    "presidents_relation = conn.read_parquet(\"presidents.parquet\")\n",
+    "\n",
+    "conn.sql(\n",
+    "    \"\"\"\n",
+    "    SELECT sequence, last_name, first_name\n",
+    "    FROM presidents_relation\n",
+    "    WHERE sequence <= 2\n",
+    "    \"\"\"\n",
+    ").show()\n",
+    "\n",
+    "# presidents.db persists on disk, so drop the table left by an earlier\n",
+    "# run; otherwise to_table() fails because the table already exists.\n",
+    "conn.execute(\"DROP TABLE IF EXISTS presidents\")\n",
+    "presidents_relation.to_table(\"presidents\")\n",
+    "\n",
+    "conn.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "88f82cd9-9056-4cdc-9ff1-9827d0900792",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with duckdb.connect(database=\"presidents.db\") as conn:\n",
+    "    conn.sql(\n",
+    "        \"\"\"\n",
+    "        SELECT last_name, first_name\n",
+    "        FROM presidents\n",
+    "        WHERE last_name = 'Washington' \n",
+    "        \"\"\"\n",
+    "    ).show()"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "id": "be1aeacd-cdb3-4641-b3cd-19173e2875ed",
+
"metadata": {}, + "source": [ + "# This code won't work.\n", + "\n", + "with duckdb.connect(database=\"presidents.db\") as conn:\n", + " conn.sql(\"SELECT * FROM presidents_relation\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "d87920b4-cf7b-462b-8f89-7ab40d6ba8fa", + "metadata": {}, + "source": [ + "# Data Interpretation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "491262f1-8841-459a-9fff-4d9b23caa4bb", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import duckdb\n", + "\n", + "with duckdb.connect(database=\"presidents.db\") as conn:\n", + " presidents_relation = conn.read_csv(\"presidents.csv\")\n", + " presidents_relation.limit(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1b48835-4f78-4be1-93b9-4b563a91238b", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "\n", + "with duckdb.connect(database=\"presidents.db\") as conn:\n", + " presidents_relation = conn.read_csv(\n", + " \"presidents.csv\", date_format=\"%B %d %Y\"\n", + " )\n", + " presidents_relation.dtypes" + ] + }, + { + "cell_type": "markdown", + "id": "281fc954-cbe8-4e13-bc35-13ead7d28fcb", + "metadata": {}, + "source": [ + "# Database Queries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8342da7-a20e-49f7-a156-18b875734f52", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "\n", + "with duckdb.connect(database=\"presidents.db\") as conn:\n", + " (conn.read_json(\"parties.json\").to_table(\"parties\"))\n", + "\n", + "with duckdb.connect(\"presidents.db\") as conn:\n", + " conn.sql(\n", + " \"\"\"\n", + " SELECT first_name, last_name, party_name\n", + " FROM parties\n", + " JOIN presidents\n", + " ON presidents.party_id = parties.party_id\n", + " ORDER BY last_name DESC\n", + " \"\"\"\n", + " ).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11437a3f-c64e-4ba1-ab14-6d98f808c5f1", + "metadata": {}, + 
"outputs": [], + "source": [ + "import duckdb\n", + "\n", + "presidents = duckdb.read_parquet(\"presidents.parquet\")\n", + "parties = duckdb.read_json(\"parties.json\")\n", + "\n", + "duckdb.sql(\n", + " \"\"\"\n", + " SELECT first_name, last_name, party_name\n", + " FROM parties\n", + " JOIN presidents\n", + " ON presidents.party_id = parties.party_id\n", + " ORDER BY last_name DESC\n", + " \"\"\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2223996-ed7f-49cb-8aa7-34c4e4cf45c6", + "metadata": {}, + "outputs": [], + "source": [ + "presidents = duckdb.read_parquet(\"presidents.parquet\").set_alias(\"presidents\")\n", + "parties = duckdb.read_json(\"parties.json\").set_alias(\"parties\")\n", + "\n", + "(\n", + " presidents.join(parties, \"presidents.party_id = parties.party_id\")\n", + " .select(\"first_name\", \"last_name\", \"party_name\")\n", + " .order(\"last_name DESC\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8c300c6e-1b5e-41e6-bbdf-04ec2ce04b8c", + "metadata": {}, + "source": [ + "# Concurrent Access" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03637669-43fb-4593-91e6-c44b425399b3", + "metadata": {}, + "outputs": [], + "source": [ + "import concurrent.futures\n", + "import duckdb\n", + "\n", + "\n", + "def read_data(thread_id):\n", + " print(f\"Thread {thread_id} starting its read.\")\n", + " with duckdb.connect(\"presidents.db\") as conn:\n", + " conn.sql(\n", + " \"\"\"\n", + " SELECT first_name, last_name\n", + " FROM presidents\n", + " WHERE sequence = 1\n", + " \"\"\"\n", + " ).show()\n", + " print(f\"Thread {thread_id} ending its read.\")\n", + "\n", + "\n", + "def concurrent_access():\n", + " with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:\n", + " executor.map(read_data, range(3))\n", + "\n", + "\n", + "concurrent_access()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65384b11-a04d-456a-956d-49ccc5057bf4", + "metadata": 
{},
+   "outputs": [],
+   "source": [
+    "import concurrent.futures\n",
+    "import duckdb\n",
+    "\n",
+    "\n",
+    "def update_data(thread_id):\n",
+    "    \"\"\"Set the first name of president number 1 to a per-thread value.\"\"\"\n",
+    "    new_name = f\"George ({thread_id})\"\n",
+    "    with duckdb.connect(\"presidents.db\") as conn:\n",
+    "        print(f\"Thread {thread_id} starting update.\")\n",
+    "        conn.sql(\n",
+    "            f\"\"\"\n",
+    "            UPDATE presidents\n",
+    "            SET first_name = '{new_name}'\n",
+    "            WHERE sequence = 1\n",
+    "            \"\"\"\n",
+    "        )\n",
+    "        print(f\"Thread {thread_id} ending update.\")\n",
+    "\n",
+    "\n",
+    "def concurrent_access():\n",
+    "    \"\"\"Run update_data() in three worker threads and report failures.\"\"\"\n",
+    "    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:\n",
+    "        futures = [executor.submit(update_data, n) for n in range(3)]\n",
+    "        for future in concurrent.futures.as_completed(futures):\n",
+    "            error = future.exception()\n",
+    "            if error is not None:\n",
+    "                # An exception raised in a worker stays hidden unless\n",
+    "                # the future is inspected, so report it here.\n",
+    "                print(f\"An update failed: {error!r}\")\n",
+    "\n",
+    "\n",
+    "concurrent_access()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "892a7a41-89b2-4ceb-8e6c-221d417520d0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with duckdb.connect(\"presidents.db\") as conn:\n",
+    "    # Shown explicitly: Jupyter doesn't echo an expression nested in a\n",
+    "    # with block.\n",
+    "    conn.sql(\n",
+    "        \"\"\"\n",
+    "        SELECT last_name, first_name\n",
+    "        FROM presidents\n",
+    "        WHERE sequence = 1\n",
+    "        \"\"\"\n",
+    "    ).show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bd5f57cf-b8a0-4833-9675-bcc7a6907124",
+   "metadata": {},
+   "source": [
+    "# Integrating With Python's Environment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ecd1c32-608a-43a1-a3fd-25519585a4e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import duckdb\n",
+    "\n",
+    "\n",
+    "def short_name(first_name: str, last_name: str) -> str:\n",
+    "    return f\"{first_name[0]}. 
{last_name}\"\n",
+    "\n",
+    "\n",
+    "# Unregister an earlier definition so re-running this cell doesn't raise\n",
+    "# NotImplementedException. On a fresh kernel nothing is registered yet,\n",
+    "# and remove_function() itself raises, so that case is ignored.\n",
+    "try:\n",
+    "    duckdb.remove_function(\"short_name\")\n",
+    "except duckdb.InvalidInputException:\n",
+    "    pass\n",
+    "duckdb.create_function(\"short_name\", short_name)\n",
+    "\n",
+    "\n",
+    "presidents = duckdb.read_parquet(\"presidents.parquet\")\n",
+    "\n",
+    "duckdb.sql(\n",
+    "    \"\"\" \n",
+    "    SELECT short_name(first_name, last_name) AS name,\n",
+    "    (term_end - term_start) AS \"days in office\"\n",
+    "    FROM presidents\n",
+    "    \"\"\"\n",
+    ").limit(3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3934e0e3-7875-40ad-9882-a06c87030c6f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install pandas polars"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "77a11ef6-308e-4279-94f1-a38b1b046874",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import duckdb\n",
+    "import pandas\n",
+    "\n",
+    "with duckdb.connect(\"presidents.db\") as conn:\n",
+    "    pandas_df = conn.sql(\n",
+    "        \"\"\"\n",
+    "        SELECT last_name, first_name\n",
+    "        FROM presidents\n",
+    "        WHERE sequence <= 4\n",
+    "        \"\"\"\n",
+    "    ).df()\n",
+    "\n",
+    "pandas_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d348d5dc-ff6a-4a61-9924-24ff345a63bd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import polars\n",
+    "\n",
+    "presidents = duckdb.read_parquet(\"presidents.parquet\").set_alias(\"presidents\")\n",
+    "parties = duckdb.read_json(\"parties.json\").set_alias(\"parties\")\n",
+    "\n",
+    "(\n",
+    "    presidents.join(parties, \"presidents.party_id = parties.party_id\")\n",
+    "    .select(\"first_name\", \"last_name\", \"party_name\")\n",
+    "    .order(\"last_name DESC\")\n",
+    ").pl().head(3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d8bed33b-966c-40c0-9d80-3332cf7e5a56",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/duckdb/README.md b/duckdb/README.md new file mode 100644 index 0000000000..6663e007c5 --- /dev/null +++ b/duckdb/README.md @@ -0,0 +1,9 @@ +These files will allow you to work along with the [Introducing DuckDB](https://realpython.com/introducing-duckdb/) tutorial. + +The files are: + +Code Download.ipynb - Contains the code you see in the tutorial. +parties.json - JSON file containing political party information. +presidents.csv - CSV file containing president information. +presidents.parquet - Parquet file containing president information. + \ No newline at end of file diff --git a/duckdb/parties.json b/duckdb/parties.json new file mode 100644 index 0000000000..6db857044a --- /dev/null +++ b/duckdb/parties.json @@ -0,0 +1,5 @@ +{"party_id":10,"party_name":"Unaffiliated"} +{"party_id":20,"party_name":"Federalist"} +{"party_id":30,"party_name":"Democratic Republican"} +{"party_id":40,"party_name":"Democratic"} +{"party_id":50,"party_name":"Whig"} \ No newline at end of file diff --git a/duckdb/presidents.csv b/duckdb/presidents.csv new file mode 100644 index 0000000000..1fd16ff8ff --- /dev/null +++ b/duckdb/presidents.csv @@ -0,0 +1,11 @@ +order,last_name,first_name,term_start,term_end,party +1,Washington,George,April 30 1789,March 04 1797,10 +2,Adams,John,March 4 1797,March 04 1801,20 +3,Jefferson,Thomas,March 4 1801,March 04 1809,30 +4,Madison,James,March 4 1809,March 04 1817,30 +5,Monroe,James,March 4 1817,March 04 1825,30 +6,Adams,John Quincy,March 4 1825,March 4 1829,30 +7,Jackson,Andrew,March 4 1829,March 4 1837,30 +8,Van Buren,Martin,March 4 1837,March 4 1841,40 +9,Harrison,William Henry,March 4 1841,April 4 1841,50 +10,Tyler,John,April 4 1841,March 04 1845,50 \ No newline 
at end of file diff --git a/duckdb/presidents.parquet b/duckdb/presidents.parquet new file mode 100644 index 0000000000..370b5a50fc Binary files /dev/null and b/duckdb/presidents.parquet differ