Skip to content

Commit

Permalink
Merge pull request #55 from rmarkello/missingno
Browse files Browse the repository at this point in the history
[REF] Handle missing data rows in pls_regression
  • Loading branch information
rmarkello authored Nov 4, 2019
2 parents ae162f7 + 4fbd403 commit d8a19d5
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 21 deletions.
11 changes: 11 additions & 0 deletions pyls/tests/types/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,17 @@ def test_regression_3dbootstrap(aggfunc):
bootsamples=bootsamples, n_boot=10)


def test_regression_missingdata():
    # rows that are entirely NaN in X (and/or Y) should be masked out by
    # pls_regression rather than raising / propagating NaN
    X = rs.rand(subj, Xf)
    for row in (10, 20):
        X[row] = np.nan
        PLSRegressionTests(X=X, n_components=2)
    Y = rs.rand(subj, Yf)
    Y[11] = np.nan
    PLSRegressionTests(X=X, Y=Y, n_components=2)


def test_errors():
with pytest.raises(ValueError):
PLSRegressionTests(n_components=1000)
Expand Down
58 changes: 37 additions & 21 deletions pyls/types/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,14 @@ def resid_yscores(x_scores, y_scores, copy=True):
return y_scores


def get_mask(X, Y):
    """
    Returns boolean mask that is False for rows all-NaN in either `X` or `Y`

    Parameters
    ----------
    X, Y : array_like
        Two-dimensional input arrays with the same number of rows

    Returns
    -------
    mask : numpy.ndarray
        Boolean array of length ``len(X)`` where an entry is False if the
        corresponding row of `X` or of `Y` contains only NaN values
    """

    x_all_nan = np.isnan(X).all(axis=1)
    y_all_nan = np.isnan(Y).all(axis=1)
    return ~(x_all_nan | y_all_nan)


def simpls(X, Y, n_components=None, seed=1234):
"""
Performs partial least squares regression with the SIMPLS algorithm
Expand Down Expand Up @@ -226,10 +234,11 @@ def __init__(self, X, Y, *, n_components=None,
'or one of {}'.format(sorted(aggfuncs)))
self.aggfunc = aggfuncs.get(aggfunc, aggfunc)

# these need to be zero -- they're not implemented for PLSRegression
kwargs.update(n_split=0, test_split=0)
super().__init__(X=np.asarray(X), Y=np.asarray(Y),
n_components=n_components, n_perm=n_perm,
n_boot=n_boot, n_split=0, test_split=0,
rotate=rotate, ci=ci, aggfunc=aggfunc,
n_boot=n_boot, rotate=rotate, ci=ci, aggfunc=aggfunc,
permsamples=permsamples, bootsamples=bootsamples,
seed=seed, verbose=verbose, n_proc=n_proc, **kwargs)

Expand Down Expand Up @@ -258,7 +267,9 @@ def svd(self, X, Y, seed=None):
Variance explained by PLS-derived components; diagonal array
"""

out = simpls(X, Y, self.n_components, seed=seed)
# find nan rows and mask for the decomposition
mask = get_mask(X, Y)
out = simpls(X[mask], Y[mask], self.n_components, seed=seed)

# need to return len-3 for compatibility purposes
# use the variance explained in Y in lieu of the singular values since
Expand Down Expand Up @@ -301,18 +312,19 @@ def _single_boot(self, X, Y, inds, groups=None, original=None, seed=None):
else:
Xi, Yi = X[inds], Y[inds]

out = simpls(Xi, Yi, self.n_components, seed=seed)
x_weights = self.svd(Xi, Yi, seed=seed)[0]

if original is not None:
# flip signs of weights based on correlations with `original`
flip = np.sign(compute.efficient_corr(out['x_weights'], original))
out['x_weights'] = out['x_weights'] * flip
flip = np.sign(compute.efficient_corr(x_weights, original))
x_weights *= flip
# NOTE: should we be doing a procrustes here?

# recompute y_loadings based on new x_weight signs
out['y_loadings'] = Yi.T @ (Xi @ out['x_weights'])
# compute y_loadings
mask = get_mask(Xi, Yi)
y_loadings = Yi[mask].T @ (Xi @ x_weights)[mask]

return out['y_loadings'], out['x_weights']
return y_loadings, x_weights

def _single_perm(self, X, Y, inds, groups=None, original=None, seed=None):
"""
Expand Down Expand Up @@ -340,20 +352,22 @@ def _single_perm(self, X, Y, inds, groups=None, original=None, seed=None):
Variance explained by PLS decomposition of permuted data
"""

# should permute Y (but not X) by default
Xp, Yp = self.make_permutation(X, Y, inds)
out = simpls(Xp, Yp, self.n_components, seed=seed)
x_weights, varexp, _ = self.svd(Xp, Yp, seed=seed)

if self.inputs.rotate and original is not None:
# flip signs of weights based on correlations with `original`
flip = np.sign(compute.efficient_corr(out['x_weights'], original))
out['x_weights'] = out['x_weights'] * flip
flip = np.sign(compute.efficient_corr(x_weights, original))
x_weights *= flip
# NOTE: should we be doing a procrustes here?

# recompute pctvar based on new x_weight signs
y_loadings = Y[inds].T @ (X[inds] @ out['x_weights'])
varexp = np.sum(y_loadings ** 2, axis=0) / np.sum(Yp ** 2)
mask = get_mask(Xp, Yp)
y_loadings = Yp[mask].T @ (Xp @ x_weights)[mask]
varexp = np.sum(y_loadings ** 2, axis=0) / np.sum(Yp[mask] ** 2)
else:
varexp = out['pctvar'][1]
varexp = np.diag(varexp)

# need to return len-3 for compatibility purposes
return varexp, None, None
Expand All @@ -378,13 +392,15 @@ def run_pls(self, X, Y):
'the specified axis.')

# mean-center here so that our outputs are generated accordingly
X -= X.mean(axis=0, keepdims=True)
Y_agg -= Y_agg.mean(axis=0, keepdims=True)
X -= np.nanmean(X, axis=0, keepdims=True)
Y_agg -= np.nanmean(Y_agg, axis=0, keepdims=True)
mask = get_mask(X, Y_agg)

res = super().run_pls(X, Y_agg)
res['y_loadings'] = Y_agg.T @ res['x_scores']
res['y_scores'] = resid_yscores(res['x_scores'],
Y_agg @ res['y_loadings'])
res['y_loadings'] = Y_agg[mask].T @ res['x_scores'][mask]
res['y_scores'] = np.full((len(Y_agg), self.n_components), np.nan)
res['y_scores'][mask] = resid_yscores(res['x_scores'][mask],
Y_agg[mask] @ res['y_loadings'])

if self.inputs.n_boot > 0:
# compute bootstraps
Expand All @@ -400,7 +416,7 @@ def run_pls(self, X, Y):
corrci = np.stack(compute.boot_ci(distrib, ci=self.inputs.ci), -1)
res['bootres'].update(dict(x_weights_normed=bsrs,
x_weights_stderr=uboot_se,
y_loadings=res['y_loadings'].copy(),
y_loadings=res['y_loadings'],
y_loadings_boot=distrib,
y_loadings_ci=corrci,
bootsamples=self.bootsamp,))
Expand Down

0 comments on commit d8a19d5

Please sign in to comment.