# include_pvalue.py
# Multiple linear regression with t-statistics and p-values for the coefficients.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn import linear_model
# Load the dataset, then split it into the target column (GPA) and the
# two predictor columns used for the regression.
data = pd.read_csv('data/1.02. Multiple linear regression.csv')
y = data['GPA']
x = data[['SAT', 'Rand 1,2,3']]
import scipy.stats as stat
class LinearRegression(linear_model.LinearRegression):
    """
    Least-squares regression that also reports t-statistics and p-values.

    Subclass of sklearn's ``linear_model.LinearRegression``. After ``fit()``
    two additional attributes are available:

    t : ndarray of shape (n_targets, n_features)
        t-statistic of every coefficient; for a 1-D ``y`` the shape is
        ``(1, n_features)``.
    p : ndarray
        Two-sided p-values matching ``t``, with length-1 axes squeezed out
        (so a 1-D ``y`` yields shape ``(n_features,)``).

    The intercept is fitted by default (``fit_intercept=True``); in that case
    it is accounted for when computing the standard errors and the degrees
    of freedom. Pass ``fit_intercept=False`` when ``X`` already contains a
    constant column.
    """

    def __init__(self, fit_intercept=True, normalize=False, copy_X=True,
                 n_jobs=1):
        # Let the parent initialise every parameter its own version defines,
        # so attributes this signature does not expose still get defaults.
        super(LinearRegression, self).__init__(
            fit_intercept=fit_intercept, copy_X=copy_X, n_jobs=n_jobs)
        # Stored here because it is part of this class's signature.
        self.normalize = normalize

    def fit(self, X, y, n_jobs=1):
        """
        Fit the model and compute ``self.t`` and ``self.p``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Design matrix (without a constant column when
            ``fit_intercept=True``).
        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values.
        n_jobs : int, default=1
            Unused; kept only so existing callers keep working. It is NOT
            forwarded to the parent ``fit``, whose third positional argument
            is ``sample_weight``.

        Returns
        -------
        self : LinearRegression
            The fitted estimator.

        Raises
        ------
        numpy.linalg.LinAlgError
            If the design matrix is singular.
        """
        super(LinearRegression, self).fit(X, y)

        design = np.asarray(X, dtype=float)
        n_samples = design.shape[0]
        if self.fit_intercept:
            # The fitted model has an intercept, so the covariance of the
            # estimates must come from the design matrix that includes the
            # constant column.
            design = np.column_stack([np.ones(n_samples), design])
        dof = n_samples - design.shape[1]

        # Residual variance estimate: one value per target.
        residuals = np.asarray(y, dtype=float) - self.predict(X)
        mse = np.atleast_1d(np.sum(residuals ** 2, axis=0) / float(dof))

        # Diagonal of (X'X)^-1, dropping the intercept entry so that it lines
        # up with coef_.
        unscaled_var = np.diagonal(np.linalg.inv(np.dot(design.T, design)))
        if self.fit_intercept:
            unscaled_var = unscaled_var[1:]

        # Standard error of every coefficient, shape (n_targets, n_features).
        se = np.sqrt(np.outer(mse, unscaled_var))

        # t-statistic for each coefficient
        self.t = self.coef_ / se
        # Two-sided p-value; sf() keeps precision for very small p-values
        # where 1 - cdf() would round to zero.
        self.p = np.squeeze(2 * stat.t.sf(np.abs(self.t), dof))
        return self
# Fit the p-value-aware regression on the predictors and show the p-values.
reg_with_pvalues = LinearRegression()
reg_with_pvalues.fit(x, y)
print('P-Values : ', reg_with_pvalues.p)

# Summary table: one row per feature with its coefficient and its p-value
# rounded to three decimals.
reg_summary = pd.DataFrame({
    'Features': ['SAT', 'Rand 1,2,3'],
    'Coefficients': reg_with_pvalues.coef_,
    'p-values': reg_with_pvalues.p.round(3),
})
print(reg_summary)