moved things around for tuesday

cab938 · Sep 29, 2019 · 34c0110 · 34c0110
1 parent c65562a
commit 34c0110
Show file tree

Hide file tree

Showing 5 changed files with 2,360 additions and 184 deletions.
diff --git a/190930_more_cleaning.ipynb b/190930_more_cleaning.ipynb
@@ -0,0 +1,51 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Real Data Cleaning\n",
+    "* I need to learn about US presidents! Let's hit wikipedia..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "df=pd.read_html(\"https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States\")[1]\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Let's try and build a dataframe that has the president number, start and end date of term, name and birth/death years. I think there are likely questions like this on the citizenship test."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/190930_more_cleaning_solutions.ipynb b/190930_more_cleaning_solutions.ipynb
@@ -25,6 +25,139 @@
    "source": [
     "* Let's try and build a dataframe that has the president number, start and end date of term, name and birth/death years. I think there are likely questions like this on the citizenship test."
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.columns.levels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.columns = df.columns.droplevel(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df=df.rename(columns={\"Presidency[a]\":\"Presidency\"})\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_start(values):\n",
+    "    splits=values.split(\"–\")\n",
+    "    return splits[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[\"Presidency[a].1\"].apply(get_start)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# When using apply we have two differnet options, we can modify in place, or return a result\n",
+    "# talk about apply on a series object\n",
+    "df[\"start\"]=df[\"Presidency[a].1\"].apply(get_start)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# talk about apply on a dataframe object and the importance of the axis\n",
+    "def get_end(whole_row):\n",
+    "    splits=whole_row[\"Presidency[a].1\"].split(\"–\")\n",
+    "    whole_row[\"end\"]=splits[-1]\n",
+    "    return whole_row\n",
+    "df.apply(get_end, axis='columns')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* apply is super powerful, super awesome, super useful, live and love apply. braodcast ftw!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# how about that president name?\n",
+    "# we can use regex in replace! and assign to new column!\n",
+    "df[\"name\"]=df[\"President.1\"].str.replace(\"\\d+.*\",\"\")\n",
+    "df[\"name\"].head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# extract will pull out multiple columns! this is amazing!\n",
+    "# and we can unpack columns into sub frames!\n",
+    "df[[\"born\",\"died\"]]=df[\"President.1\"].str.extract(\"(?P<born>\\d\\d\\d\\d)–(?P<died>\\d\\d\\d\\d)\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# jimmy carter is a zombie!\n",
+    "df.iloc[69]"
+   ]
   }
  ],
  "metadata": {

diff --git a/190930_more_pandas.ipynb b/190930_more_pandas.ipynb
diff --git a/191002_merging.ipynb → 191002_a_merging.ipynb b/191002_merging.ipynb → 191002_a_merging.ipynb