From d231143989c571f8ee6d3a46fd62cab5f4ae8d6b Mon Sep 17 00:00:00 2001 From: Christopher Brooks Date: Sun, 6 Oct 2019 20:57:50 +0000 Subject: [PATCH] scales --- 191009_scales.ipynb | 231 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 191009_scales.ipynb diff --git a/191009_scales.ipynb b/191009_scales.ipynb new file mode 100644 index 0000000..d5ccedd --- /dev/null +++ b/191009_scales.ipynb @@ -0,0 +1,231 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Scales\n", + "* We're going to talk about things you probably learned in grade school but also probably don't think about much\n", + "* And of course, we're going to talk about them in Pandas!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# Let's look at some letter grades...\n", + "import pandas as pd\n", + "df=pd.DataFrame(['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D'],\n", + " index=['excellent', 'excellent', 'excellent', 'good', 'good', 'good', \n", + " 'ok', 'ok', 'ok', 'poor', 'poor'],\n", + " columns=[\"Grades\"])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# What is our series datatype?\n", + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "* That seems pretty broad, eh? \"object\" pretty much means anything...\n", + "* We know more here. We have clear categories that have meaning to us as people. 
We can put this meaning into pandas `DataFrame` objects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# We can use the astype() function to tell pandas to mark this as a category\n", + "df[\"Grades\"].astype(\"category\").head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "* Notice that there are now 11 categories!\n", + "* But actually, our data isn't just categorical, is it? What else do we know about this data?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# We can tell pandas that the data is ordered by first creating our own data type\n", + "my_categories=pd.CategoricalDtype(categories=['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+'], \n", + " ordered=True)\n", + "# then we just pass this to the astype() function\n", + "grades=df[\"Grades\"].astype(my_categories)\n", + "grades.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# Now we can do ordinal comparisons! Look at the bad example first (the original dataframe, with no category dtype)\n", + "df[df[\"Grades\"]>\"C\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# Now how does that look in a category-aware sense?\n", + "grades[grades>\"C\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "* Great! 
So we can encapsulate a limited set of values (categories) and an ordering if appropriate (through our own dtype) in pandas and it allows us to do operations we otherwise couldn't do\n", + "* Now, it turns out we use this in machine learning and data mining a fair bit. Some techniques (regression) are used to predict continuous values, while others (classification) are used to predict categories\n", + "* So how do we change from continuous data to categorical data in pandas? I'm glad you asked!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# Let's look at that census data\n", + "import numpy as np\n", + "df=pd.read_csv(\"datasets/census.csv\")\n", + "df=df[df['SUMLEV']==50]\n", + "df=df.set_index('STNAME').groupby(level=0)['CENSUS2010POP'].agg(np.average)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# Now if we just want to make \"bins\" of each of these, we can use cut()\n", + "# this just takes the data (here, a Series), and the number of bins, and returns a new Series of bins\n", + "df=pd.cut(df,10)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "* Notice the notation is mathematical (open/closed intervals)\n", + "* See how Alabama and Alaska are now in the same category, but Arizona is in another category\n", + "* Notice that pandas ordered all of these now too\n", + "* What happens if we want to add a new value into the mix?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "df.loc[\"Canada\"]=50000\n", + "df.tail()" + ] + } + ], + "metadata": { + "celltoolbar": "Slideshow", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}