[WIP] Corrected the #275 pull request #277

Open · wants to merge 11 commits into master
98 changes: 98 additions & 0 deletions pygam/distributions.py
@@ -657,10 +657,108 @@ def sample(self, mu):
"""
return np.random.wald(mean=mu, scale=self.scale, size=None)

#class tDist(Distribution):
#    """
#    non-standardized Student's t distribution:
#    X = mu + sigma * T,
#    where T ~ standard t distribution with df degrees of freedom
#    """
#
#    def __init__(self, df=None, sigma=None):
#        """
#        creates an instance of the tDist class
#
#        Parameters
#        ----------
#        df : float or None, default: None
#            degrees of freedom of the distribution
#        sigma : float or None, default: None
#            scale parameter of the distribution
#
#        Returns
#        -------
#        self
#        """
#        super(tDist, self).__init__(name='tdist', df=df)
#        self.df = df
#        self.sigma = sigma
#
#    def log_pdf(self, y, mu, weights=None):
#        """
#        computes the log of the pdf or pmf of the values under the current distribution
#
#        Parameters
#        ----------
#        y : array-like of length n
#            target values
#        mu : array-like of length n
#            expected values
#        weights : array-like shape (n,) or None, default: None
#            sample weights
#            if None, defaults to array of ones
#
#        Returns
#        -------
#        pdf/pmf : np.array of length n
#        """
#        if weights is None:
#            weights = np.ones_like(mu)
#        # scale is sigma, not 1.0, since X = mu + sigma * T
#        return sp.stats.t.logpdf(y, df=self.df, loc=mu, scale=self.sigma)
#
#    @divide_weights
#    def V(self, mu):
#        """
#        have to set the definition for non-exponential families.
#        variance of a non-standardized t: sigma**2 * df / (df - 2), for df > 2
#        """
#        return np.ones_like(mu) * (self.sigma**2) * self.df / (self.df - 2.)
#
#    @multiply_weights
#    def deviance(self, y, mu, scaled=True):
#        """
#        model deviance
#
#        for a t-distribution, how do we calculate deviance??
#
#        Parameters
#        ----------
#        y : array-like of length n
#            target values
#        mu : array-like of length n
#            expected values
#        scaled : boolean, default: True
#            whether to divide the deviance by the distribution scale
#
#        Returns
#        -------
#        deviances : np.array of length n
#        """
#        dev = np.nan
#
#        return dev
#
#    def sample(self, mu):
#        """
#        Return random samples from this non-standardized t-distribution.
#
#        Samples are drawn independently from distributions
#        with means given by the values in `mu` and with scale
#        given by the `sigma` attribute.
#
#        Parameters
#        ----------
#        mu : array-like of shape n_samples or shape (n_simulations, n_samples)
#            expected values
#
#        Returns
#        -------
#        random_samples : np.array of same shape as mu
#        """
#        return np.random.standard_t(self.df, size=np.shape(mu)) * self.sigma + mu
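#
# A minimal sketch of how tDist could be exercised once it is uncommented
# and registered (hypothetical usage; nothing below ships with this PR):
#
#     dist = tDist(df=5, sigma=2.0)
#     mu = np.zeros(1000)
#     draws = dist.sample(mu)         # draws ~ mu + sigma * T_df, shape (1000,)
#     logp = dist.log_pdf(draws, mu)  # elementwise log-density at the draws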


DISTRIBUTIONS = {'normal': NormalDist,
'poisson': PoissonDist,
'binomial': BinomialDist,
'gamma': GammaDist,
'inv_gauss': InvGaussDist
# 't': tDist
}
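For reference, GAM resolves the distribution string through this dict at validation time; a quick sketch of the lookup, with the future 't' registration shown as an assumption mirroring the commented entry above:

    dist = DISTRIBUTIONS['normal'](scale=None)   # instantiate a distribution by name
    # DISTRIBUTIONS['t'] = tDist                 # one-line registration, once tDist works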
63 changes: 41 additions & 22 deletions pygam/pygam.py
@@ -117,7 +117,11 @@ class GAM(Core, MetaTermMixin):

verbose : bool, optional
whether to show pyGAM warnings.


gamma : float, default: 1.4
    serves as a weighting to increase the impact of the influence matrix
    on the GCV/UBRE score; must be >= 1

Attributes
----------
coef_ : array, shape (n_classes, m_features)
@@ -150,12 +154,14 @@ class GAM(Core, MetaTermMixin):
def __init__(self, terms='auto', max_iter=100, tol=1e-4,
distribution='normal', link='identity',
callbacks=['deviance', 'diffs'],
fit_intercept=True, verbose=False, **kwargs):
fit_intercept=True, gamma=1.4, verbose=False,
**kwargs):

self.max_iter = max_iter
self.tol = tol
self.distribution = distribution
self.link = link
self.gamma = gamma
self.callbacks = callbacks
self.verbose = verbose
self.terms = TermList(terms) if isinstance(terms, Term) else terms
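A quick sanity check of the new keyword (a sketch, assuming pyGAM's bundled mcycle dataset):

    from pygam import LinearGAM
    from pygam.datasets import mcycle

    X, y = mcycle(return_X_y=True)
    gam = LinearGAM(gamma=2.0).fit(X, y)  # gamma > 1.4 penalizes model complexity more
    # LinearGAM(gamma=0.5).fit(X, y) now raises ValueError in _validate_params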
@@ -243,7 +249,10 @@ def _validate_params(self):
raise ValueError('unsupported link {}'.format(self.link))
if self.link in LINKS:
self.link = LINKS[self.link]()


if self.gamma < 1:
    raise ValueError("gamma must be >= 1, "\
                     "but found gamma = {}".format(self.gamma))
# callbacks
if not isiterable(self.callbacks):
raise ValueError('Callbacks must be iterable, but found {}'\
@@ -394,7 +403,7 @@ def _linear_predictor(self, X=None, modelmat=None, b=None, term=-1):

def predict_mu(self, X):
"""
preduct expected value of target given model and input X
predict expected value of target given model and input X

Parameters
---------
@@ -416,21 +425,37 @@ def predict_mu(self, X):
lp = self._linear_predictor(X)
return self.link.mu(lp, self.distribution)

def predict(self, X):
def predict(self, X, output='response'):
"""
preduct expected value of target given model and input X
predict expected value of target given model and input X
often this is done via expected value of GAM given input X

Parameters
---------
X : array-like of shape (n_samples, m_features)
containing the input dataset

output : str in {'response', 'terms'}, default: 'response'
    'response' returns predicted values of the target.
    'terms' returns the partial contribution of each term to the linear predictor.

Returns
-------
y : np.array of shape (n_samples,)
    containing predicted values under the model,
    if output is set to 'response'.

or

list of np.arrays of shape (n_samples,)
    containing the partial contributions of each term,
    if output is set to 'terms'.
"""
if output not in ['response', 'terms']:
    raise ValueError("output must be 'response' or 'terms', "\
                     "but found output = {}".format(output))
if output == 'terms':
    term_values = [self.partial_dependence(term=i, X=X)
                   for i, term in enumerate(self.terms)
                   if not term.isintercept]
    return term_values
return self.predict_mu(X)

def _modelmat(self, X, term=-1):
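A short sketch of the two output modes, assuming a LinearGAM named gam already fitted on X with m non-intercept terms:

    response = gam.predict(X)                       # same as gam.predict_mu(X)
    contributions = gam.predict(X, output='terms')  # list of m arrays, shape (n_samples,)
    # each entry is one term's partial contribution on the linear-predictor scale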
@@ -1004,7 +1029,7 @@ def _estimate_model_statistics(self, y, modelmat, inner=None, BW=None,
- edof: estimated degrees freedom
- scale: distribution scale, if applicable
- cov: coefficient covariances
- se: standarrd errors
- se: standard errors
- AIC: Akaike Information Criterion
- AICc: corrected Akaike Information Criterion
- pseudo_r2: dict of Pseudo R-squared metrics
@@ -1137,7 +1162,7 @@ def _estimate_r2(self, X=None, y=None, mu=None, weights=None):

return r2

def _estimate_GCV_UBRE(self, X=None, y=None, modelmat=None, gamma=1.4,
def _estimate_GCV_UBRE(self, X=None, y=None, modelmat=None,
add_scale=True, weights=None):
"""
Generalized Cross Validation and Un-Biased Risk Estimator.
@@ -1151,9 +1176,6 @@ def _estimate_GCV_UBRE(self, X=None, y=None, modelmat=None, gamma=1.4,
output data vector
modelmat : array-like, default: None
contains the spline basis for each feature evaluated at the input
gamma : float, default: 1.4
serves as a weighting to increase the impact of the influence matrix
on the score
add_scale : boolean, default: True
UBRE score can be negative because the distribution scale
is subtracted. to keep things positive we can add the scale back.
@@ -1175,10 +1197,7 @@ def _estimate_GCV_UBRE(self, X=None, y=None, modelmat=None, gamma=1.4,

see Wood 2006 pg. 177-182, 220 for more details.
"""
if gamma < 1:
raise ValueError('gamma scaling should be greater than 1, '\
'but found gamma = {}',format(gamma))


if modelmat is None:
modelmat = self._modelmat(X)

@@ -1198,10 +1217,10 @@ def _estimate_GCV_UBRE(self, X=None, y=None, modelmat=None, gamma=1.4,
if self.distribution._known_scale:
# scale is known, use UBRE
scale = self.distribution.scale
UBRE = 1./n * dev - (~add_scale)*(scale) + 2.*gamma/n * edof * scale
UBRE = 1./n * dev - (~add_scale)*(scale) + 2.*self.gamma/n * edof * scale
else:
# scale unknown, use GCV
GCV = (n * dev) / (n - gamma * edof)**2
GCV = (n * dev) / (n - self.gamma * edof)**2
return (GCV, UBRE)

def _estimate_p_values(self):
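For intuition, a toy evaluation of the two criteria above (a sketch with made-up numbers, ignoring the optional add_scale shift; see Wood 2006, pp. 177-182):

    n, dev, edof, scale, gamma = 100, 80.0, 6.0, 1.0, 1.4
    UBRE = dev / n - scale + 2. * gamma / n * edof * scale  # scale known
    GCV = (n * dev) / (n - gamma * edof) ** 2               # scale unknown
    # raising gamma inflates the edof penalty, steering selection toward smoother fits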
@@ -1245,7 +1264,7 @@ def _compute_p_value(self, term_i):
based on equations from Wood 2006 section 4.8.5 page 191
and errata https://people.maths.bris.ac.uk/~sw15190/igam/iGAMerrata-12.pdf

the errata shows a correction for the f-statisitc.
the errata shows a correction for the f-statistic.
"""
if not self._is_fitted:
raise AttributeError('GAM has not been fitted. Call fit first.')
@@ -2469,7 +2488,7 @@ def score(self, X, y):

def predict(self, X):
"""
preduct binary targets given model and input X
predict binary targets given model and input X

Parameters
---------
@@ -2485,7 +2504,7 @@ def predict(self, X):

def predict_proba(self, X):
"""
preduct targets given model and input X
predict targets given model and input X

Parameters
---------
@@ -2727,7 +2746,7 @@ def fit(self, X, y, exposure=None, weights=None):

def predict(self, X, exposure=None):
"""
preduct expected value of target given model and input X
predict expected value of target given model and input X
often this is done via expected value of GAM given input X

Parameters
26 changes: 25 additions & 1 deletion pygam/tests/test_GAM_methods.py
@@ -17,6 +17,16 @@ def test_LinearGAM_prediction(mcycle_X_y, mcycle_gam):
preds = mcycle_gam.predict(X)
assert(preds.shape == y.shape)

def test_gamma(mcycle_X_y):
    """
    check that we can fit with a non-default gamma parameter,
    and that an invalid gamma raises a ValueError at fit time
    """
    X, y = mcycle_X_y
    gam = LinearGAM(gamma=3).fit(X, y)
    assert(gam._is_fitted)

    with pytest.raises(ValueError):
        LinearGAM(gamma=-1.0).fit(X, y)

def test_LogisticGAM_accuracy(default_X_y):
"""
check that we can compute accuracy correctly
@@ -250,7 +260,21 @@ def test_get_params():
params = gam.get_params()
assert(params['lam'] == 420)


def test_predict_terms_output(mcycle_X_y):
    """
    check that output='terms' in GAM.predict() returns one
    partial-contribution array per non-intercept term
    """
    X, y = mcycle_X_y
    gam = LinearGAM().fit(X, y)
    terms = gam.predict(X, output='terms')

    n, m = X.shape
    assert(len(terms) == m)
    assert(len(terms[0]) == n)

    with pytest.raises(ValueError):
        gam.predict(X, output='shyambhu')

class TestSamplingFromPosterior(object):

def test_drawing_samples_from_unfitted_model(self, mcycle_X_y, mcycle_gam):