---
output: github_document
---
```{r setup, include=FALSE}
# Global chunk defaults: keep package start-up messages and warnings out of
# the rendered README.
library(knitr)
opts_chunk$set(warning = FALSE, message = FALSE)
```
[![version](http://www.r-pkg.org/badges/version/fastglm)](https://cran.r-project.org/package=fastglm)
[![Build Status](https://travis-ci.org/jaredhuling/fastglm.svg?branch=master)](https://travis-ci.org/jaredhuling/fastglm)
# Overview of 'fastglm'
The 'fastglm' package is a re-write of `glm()` using `RcppEigen` designed to be computationally efficient and algorithmically stable.
# Installing the 'fastglm' package
Install the development version using the **devtools** package:
```{r, eval = FALSE}
# Install the development version from the jaredhuling/fastglm GitHub
# repository (needs the devtools package to be installed first).
devtools::install_github("jaredhuling/fastglm")
```
or by cloning the repository and building it with `R CMD INSTALL`.
# Quick Usage Overview
Load the package:
```{r, message = FALSE, warning = FALSE}
library(fastglm)
```
A (not comprehensive) comparison with `glm.fit()` and `speedglm.wfit()`:
```{r gen_data, echo = TRUE, out.width= "100%", fig.width = 9, fig.height = 4.5, fig.path="vignettes/"}
# Benchmark the fastglm decomposition methods against glm.fit() and
# speedglm.wfit() on a binomial regression with strongly correlated
# predictors, then compare the fitted coefficients.
library(speedglm)
library(microbenchmark)
library(ggplot2)

set.seed(123)
n.obs  <- 10000
n.vars <- 100

# Predictor covariance: entry (i, j) equals 0.99^|i - j|, so neighbouring
# columns are almost collinear.
Sigma <- 0.99 ^ abs(outer(1:n.vars, 1:n.vars, FUN = "-"))
x <- MASS::mvrnorm(n.obs, mu = runif(n.vars, min = -1), Sigma = Sigma)

# Binary response: 1 when a random linear combination of the first 25
# predictors exceeds standard normal noise, 0 otherwise.
y <- 1 * (drop(x[, 1:25] %*% runif(25, min = -0.1, max = 0.10)) > rnorm(n.obs))

# Each expression also stores its fit (gl1, sg1-sg3, gf1-gf5) so the
# estimates can be compared after timing.
ct <- microbenchmark(
  glm.fit         = {gl1 <- glm.fit(x, y, family = binomial())},
  speedglm.eigen  = {sg1 <- speedglm.wfit(y, x, intercept = FALSE,
                                          family = binomial())},
  speedglm.chol   = {sg2 <- speedglm.wfit(y, x, intercept = FALSE,
                                          family = binomial(), method = "Chol")},
  speedglm.qr     = {sg3 <- speedglm.wfit(y, x, intercept = FALSE,
                                          family = binomial(), method = "qr")},
  fastglm.qr.cpiv = {gf1 <- fastglm(x, y, family = binomial())},
  fastglm.qr      = {gf2 <- fastglm(x, y, family = binomial(), method = 1)},
  fastglm.LLT     = {gf3 <- fastglm(x, y, family = binomial(), method = 2)},
  fastglm.LDLT    = {gf4 <- fastglm(x, y, family = binomial(), method = 3)},
  fastglm.qr.fpiv = {gf5 <- fastglm(x, y, family = binomial(), method = 4)},
  times = 25L
)

# `fun` replaces the `fun.y` argument deprecated in ggplot2 3.3.0.
autoplot(ct, log = FALSE) +
  stat_summary(fun = median, geom = "point", size = 2)

# Largest absolute coefficient difference between a fit and the glm.fit()
# reference. `$coefficients` is spelled out to avoid partial matching.
glm_coef <- coef(gl1)
max_abs_diff <- function(fit) max(abs(glm_coef - fit$coefficients))

# comparison of estimates
c(glm_vs_fastglm_qrcpiv = max_abs_diff(gf1),
  glm_vs_fastglm_qr     = max_abs_diff(gf2),
  glm_vs_fastglm_qrfpiv = max_abs_diff(gf5),
  glm_vs_fastglm_LLT    = max_abs_diff(gf3),
  glm_vs_fastglm_LDLT   = max_abs_diff(gf4))

# now between glm and speedglm
c(glm_vs_speedglm_eigen = max_abs_diff(sg1),
  glm_vs_speedglm_Chol  = max_abs_diff(sg2),
  glm_vs_speedglm_qr    = max_abs_diff(sg3))
```
# Stability
The `fastglm` package does not compromise computational stability for speed. In fact, for many situations where `glm()` and even `glm2()` do not converge, `fastglm()` does converge.
As an example, consider the following data scenario, where the response distribution is (mildly) misspecified, but the link function is quite badly misspecified. In such scenarios, the standard IRLS algorithm tends to have convergence issues. The `glm2` package was designed to handle such cases; however, it can still have convergence issues. The `fastglm` package uses a step-halving technique similar to that of `glm2()`, but it starts from better initial values and thus tends to have better convergence properties in practice.
```{r, fig.show='hold'}
# Simulate a positive response that is log-linear in four of the predictors,
# then fit it with a Gamma family and a sqrt link using each package.
set.seed(1)
x <- matrix(rnorm(10000 * 100), ncol = 100)
y <- exp(0.25 * x[, 1] - 0.25 * x[, 3] + 0.5 * x[, 4] - 0.5 * x[, 5] +
           rnorm(10000)) + 0.1

# fastglm() takes a design matrix, so the intercept column is added by hand;
# the other fitters use the formula interface.
system.time(gfit1 <- fastglm(cbind(1, x), y, family = Gamma(link = "sqrt")))
system.time(gfit2 <- glm(y ~ x, family = Gamma(link = "sqrt")))
system.time(gfit3 <- glm2::glm2(y ~ x, family = Gamma(link = "sqrt")))
system.time(gfit4 <- speedglm(y ~ x, family = Gamma(link = "sqrt")))

## speedglm appears to diverge
system.time(gfit5 <- speedglm(y ~ x, family = Gamma(link = "sqrt"),
                              maxit = 500))

## Note that fastglm() returns estimates with the
## largest likelihood
c(fastglm     = logLik(gfit1),
  glm         = logLik(gfit2),
  glm2        = logLik(gfit3),
  speedglm    = logLik(gfit4),
  speedglm500 = logLik(gfit5))

# First five coefficients of each fit, one row per fitter.
first_five <- function(fit) coef(fit)[1:5]
rbind(fastglm     = first_five(gfit1),
      glm         = first_five(gfit2),
      glm2        = first_five(gfit3),
      speedglm    = first_five(gfit4),
      speedglm500 = first_five(gfit5))

## check convergence of fastglm and #iterations
# 1 means converged, 0 means not converged
c(gfit1$converged, gfit1$iter)

## now check convergence for glm()
c(gfit2$converged, gfit2$iter)

## check convergence for glm2()
c(gfit3$converged, gfit3$iter)

## check convergence for speedglm()
c(gfit4$convergence, gfit4$iter, gfit5$convergence, gfit5$iter)

## increasing number of IRLS iterations for glm() does not help that much
system.time(gfit2 <- glm(y ~ x, family = Gamma(link = "sqrt"),
                         control = list(maxit = 1000)))

gfit2$converged
gfit2$iter

logLik(gfit1)
logLik(gfit2)
```