From aa9a60fd95755136e988566dbff90ffc740c3e93 Mon Sep 17 00:00:00 2001 From: George Ho <19851673+eigenfoo@users.noreply.github.com> Date: Wed, 26 Jun 2019 00:47:21 +0000 Subject: [PATCH] BLD: begin porting ANOVA chapter (#11) * MAINT: update toc function * BLD: building * BUG: add d as a good name to pylint * MAINT: remove patsy as req --- .pylintrc | 2 +- Makefile | 2 +- index.html | 346 ++++++++++++++++++++++++++++++++++-------- plots.py | 51 +++++++ requirements-dev.txt | 5 + requirements.txt | 5 - tests-as-linear.ipynb | 297 ++++++++++++++++++++++++++++++------ utils.py | 7 +- 8 files changed, 602 insertions(+), 113 deletions(-) create mode 100644 requirements-dev.txt diff --git a/.pylintrc b/.pylintrc index 399caca..7c70d31 100644 --- a/.pylintrc +++ b/.pylintrc @@ -94,7 +94,7 @@ evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / stateme bad-functions=map,filter,input # Good variable names which should always be accepted, separated by a comma -good-names=a,b,c,f,i,j,k,df,x,y,y2,_,fig,ax +good-names=a,b,c,d,f,i,j,k,df,x,y,y2,_,fig,ax # Bad variable names which should always be refused, separated by a comma bad-names=foo,bar,baz,toto,tutu,tata diff --git a/Makefile b/Makefile index 3f18912..7f342eb 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ venv: # Set up Python virtual environment. python -m venv ${VENV_PATH}; \ source ${VENV_PATH}/bin/activate; \ pip install -U pip; \ - pip install -r requirements.txt; \ + pip install -r requirements-dev.txt; \ deactivate; \ ) @printf "\n\nVirtual environment created! \033[1;34mRun \`source ${VENV_PATH}/bin/activate\` to activate it.\033[0m\n\n\n" diff --git a/index.html b/index.html index baae204..6e4e239 100644 --- a/index.html +++ b/index.html @@ -13187,7 +13187,7 @@
Model: One mean for each group predicts $y$.
$y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 +... \qquad \mathcal{H}_0: y = \beta_0$
where $x_i$ are indicators ($x=0$ or $x=1$) where at most one $x_i=1$ while all others are $x_i=0$.
-Notice how this is just "more of the same" of what we already did in other models above. When there are only two groups, this model is $y = \beta_0 + \beta_1*x$, i.e. the independent t-test. If there is only one group, it is $y = \beta_0$, i.e. the one-sample t-test. This is easy to see in the visualization below - just cover up a few groups and see that it matches the other visualizations above.
+Notice how this is just "more of the same" of what we already did in other models above. When there are only two groups, this model is $y = \beta_0 + \beta_1 x$, i.e. the independent t-test. If there is only one group, it is $y = \beta_0$, i.e. the one-sample t-test. This is easy to see in the visualization below - just cover up a few groups and see that it matches the other visualizations above.
@@ -14769,19 +14728,41 @@
+plots.one_way_anova_plot()
+plt.show()
A one-way ANOVA has a log-linear counterpart called goodness-of-fit test which we'll return to. By the way, since we now regress on more than one $x$, the one-way ANOVA is a multiple regression model.
+A one-way ANOVA has a log-linear counterpart called the goodness-of-fit test, which we'll return to. By the way, since we now regress on more than one $x$, the one-way ANOVA is a multiple regression model.
The Kruskal-Wallis test is simply a one-way ANOVA on the rank-transformed $y$ (value):
$rank(y) = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 +...$
+$\text{rank}(y) = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 +...$
This approximation is good enough for 12 or more data points. Again, if you do this for just one or two groups, we're already acquainted with those equations, i.e. the Wilcoxon signed-rank test or the Mann-Whitney U test respectively.
+num_points = 20
+df = pd.DataFrame()
+df["y"] = np.concatenate([
+ np.random.normal(0.0, 1, num_points),
+ np.random.normal(1.0, 1, num_points),
+ np.random.normal(0.5, 1, num_points),
+])
+
+df["group"] = list("".join([num_points * char for char in "abc"]))
+df = df.join(pd.get_dummies(df.group, prefix="group", drop_first=True).astype(np.float64))
df.head()
+
+F, p = scipy.stats.f_oneway(df[df["group"] == "a"].y,
+ df[df["group"] == "b"].y,
+ df[df["group"] == "c"].y)
+
+res = smf.ols("y ~ 1 + group_b + group_c", df).fit()
# FIXME what to tabulate here?
+utils.tabulate_results([None, p, None, None, None],
+ res,
+ ["scipy.stats.f_oneway", "smf.ols (y ~ 1 + group_b + group_c)"],
+ coeff="group_b")
+
+_, p = scipy.stats.kruskal(df[df["group"] == "a"].y,
+ df[df["group"] == "b"].y,
+ df[df["group"] == "c"].y)
+
+res = smf.ols("y ~ 1 + group_b + group_c", df).fit() # TODO rank
!cat requirements.txt
+
\n", + " | y | \n", + "group | \n", + "group_b | \n", + "group_c | \n", + "
---|---|---|---|---|
0 | \n", + "1.080830 | \n", + "a | \n", + "0.0 | \n", + "0.0 | \n", + "
1 | \n", + "-2.316629 | \n", + "a | \n", + "0.0 | \n", + "0.0 | \n", + "
2 | \n", + "-0.138365 | \n", + "a | \n", + "0.0 | \n", + "0.0 | \n", + "
3 | \n", + "2.003775 | \n", + "a | \n", + "0.0 | \n", + "0.0 | \n", + "
4 | \n", + "2.051200 | \n", + "a | \n", + "0.0 | \n", + "0.0 | \n", + "
\n", + " | value | \n", + "p-values | \n", + "t-values | \n", + "0.025 CI | \n", + "0.975 CI | \n", + "
---|---|---|---|---|---|
scipy.stats.f_oneway | \n", + "NaN | \n", + "0.015156 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
smf.ols (y ~ 1 + group_b + group_c) | \n", + "0.801863 | \n", + "0.012126 | \n", + "2.591178 | \n", + "0.182182 | \n", + "1.421545 | \n", + "