Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for rank deficiency in bootstrap #755

Merged
merged 16 commits into from
Mar 19, 2024
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ docs/src/assets/*.svg
test/scratch.jl
tune.json
Manifest.toml
Manifest-v1.*.toml
settings.json
docs/jmd
LocalPreferences.toml
Expand Down
30 changes: 19 additions & 11 deletions src/bootstrap.jl
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ been constructed with the same structure as the source of the saved replicates.

The two-argument method constructs a [`MixedModelBootstrap`](@ref) with the same eltype as `m`.
If an eltype is specified as the third argument, then a `MixedModelBootstrap` is returned.
If a subtype of `MixedModelFitCollection` is specified as the third argument, then that
If a subtype of `MixedModelFitCollection` is specified as the third argument, then that
is the return type.

See also [`savereplicates`](@ref), [`restoreoptsum!`](@ref).
Expand All @@ -91,7 +91,7 @@ end

# why this weird second method? it allows us to define custom types and write methods
# to load into those types directly. For example, we could define a `PowerAnalysis <: MixedModelFitCollection`
# in MixedModelsSim and then overload this method to get a convenient object.
# in MixedModelsSim and then overload this method to get a convenient object.
# Also, this allows us to write `restorereplicateS(f, m, ::Type{<:MixedModelNonparametricBootstrap})` for
# entities in MixedModels bootstrap
function restorereplicates(
Expand All @@ -103,7 +103,7 @@ function restorereplicates(
rep = first(Tables.rows(tbl))
allgood =
length(rep.θ) == length(m.θ) &&
string.(propertynames(rep.β)) == Tuple(coefnames(m))
string.(propertynames(rep.β)) == Tuple(fixefnames(m))
palday marked this conversation as resolved.
Show resolved Hide resolved
allgood ||
throw(ArgumentError("Model is not compatible with saved replicates."))

Expand All @@ -120,14 +120,14 @@ end
"""
savereplicates(f, b::MixedModelFitCollection)

Save the replicates associated with a [`MixedModelFitCollection`](@ref),
e.g. [`MixedModelBootstrap`](@ref) as an Arrow file.
Save the replicates associated with a [`MixedModelFitCollection`](@ref),
e.g. [`MixedModelBootstrap`](@ref) as an Arrow file.

See also [`restorereplicates`](@ref), [`saveoptsum`](@ref)

!!! note
**Only** the replicates are saved, not the entire contents of the `MixedModelFitCollection`.
`restorereplicates` requires a model compatible with the bootstrap to restore the full object.
`restorereplicates` requires a model compatible with the bootstrap to restore the full object.
"""
savereplicates(f, b::MixedModelFitCollection) = Arrow.write(f, b.fits)

Expand Down Expand Up @@ -200,6 +200,14 @@ fit during the bootstrapping process. For example, `optsum_overrides=(;ftol_rel=
reduces the convergence criterion, which can greatly speed up the bootstrap fits.
Taking advantage of this speed up to increase `n` can often lead to better estimates
of coverage intervals.


!!! note
All coefficients are bootstrapped. In the rank deficient case, the inestimatable coefficients are
treated as -0.0 in the simulations underlying the bootstrap, which will generally result
in their estimate from the simulated data also being being inestimable and thus set to -0.0.
**However this behavior behavior may change in future releases to explicitly drop the
palday marked this conversation as resolved.
Show resolved Hide resolved
extraneous columns before simulation and thus not include their estimates in the bootstrap result.**
"""
function parametricbootstrap(
rng::AbstractRNG,
Expand Down Expand Up @@ -234,7 +242,7 @@ function parametricbootstrap(
# this seemed to slow things down?!
# _copy_away_from_lowerbd!(m.optsum.initial, morig.optsum.final, m.lowerbd; incr=0.05)

β_names = Tuple(Symbol.(fixefnames(morig)))
β_names = Tuple(Symbol.(coefnames(morig)))

use_threads && Base.depwarn(
"use_threads is deprecated and will be removed in a future release",
Expand All @@ -247,7 +255,7 @@ function parametricbootstrap(
(
objective=ftype.(m.objective),
σ=ismissing(m.σ) ? missing : ftype(m.σ),
β=NamedTuple{β_names}(fixef!(βsc, m)),
β=NamedTuple{β_names}(coef!(βsc, m)),
se=SVector{p,ftype}(stderror!(βsc, m)),
θ=SVector{k,ftype}(getθ!(θsc, m)),
)
Expand Down Expand Up @@ -331,8 +339,8 @@ end

Compute bootstrap confidence intervals for coefficients and variance components, with confidence level level (by default 95%).

The keyword argument `method` determines whether the `:shortest`, i.e. highest density, interval is used
or the `:equaltail`, i.e. quantile-based, interval is used. For historical reasons, the default is `:shortest`,
The keyword argument `method` determines whether the `:shortest`, i.e. highest density, interval is used
or the `:equaltail`, i.e. quantile-based, interval is used. For historical reasons, the default is `:shortest`,
but `:equaltail` gives the interval that is most comparable to the profile and Wald confidence intervals.

!!! note
Expand Down Expand Up @@ -621,7 +629,7 @@ end
function σρ!(v::AbstractVector{T}, t::LowerTriangular{T}, σ::T) where {T}
"""
σρ!(v, t, σ)
push! `σ` times the row lengths (σs) and the inner products of normalized rows (ρs) of `t` onto `v`
push! `σ` times the row lengths (σs) and the inner products of normalized rows (ρs) of `t` onto `v`
"""
dat = t.data
for i in axes(dat, 1)
Expand Down
4 changes: 2 additions & 2 deletions src/generalizedlinearmixedmodel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ StatsAPI.deviance(m::GeneralizedLinearMixedModel) = deviance(m, m.optsum.nAGQ)

fixef(m::GeneralizedLinearMixedModel) = m.β

function fixef!(v::AbstractVector{T}, m::GeneralizedLinearMixedModel{T}) where {T}
return copyto!(fill!(v, -zero(T)), m.β)
function fixef!(v::AbstractVector{Tv}, m::GeneralizedLinearMixedModel{T}) where {Tv, T}
palday marked this conversation as resolved.
Show resolved Hide resolved
return copyto!(fill!(v, -zero(Tv)), m.β)
end

objective(m::GeneralizedLinearMixedModel) = deviance(m)
Expand Down
6 changes: 4 additions & 2 deletions src/linearmixedmodel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -266,9 +266,11 @@ function StatsAPI.fit(
end
end

function StatsAPI.coef(m::LinearMixedModel{T}) where {T}
StatsAPI.coef(m::LinearMixedModel{T}) where {T} = coef!(Vector{T}(undef, length(m.feterm.piv)), m)
palday marked this conversation as resolved.
Show resolved Hide resolved

function coef!(v::AbstractVector{Tv}, m::MixedModel{T}) where {Tv,T}
piv = m.feterm.piv
return invpermute!(fixef!(similar(piv, T), m), piv)
return invpermute!(fixef!(v, m), piv)
end

βs(m::LinearMixedModel) = NamedTuple{(Symbol.(coefnames(m))...,)}(coef(m))
Expand Down
25 changes: 8 additions & 17 deletions src/simulate.jl
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,6 @@ scale.
The `wts` argument is currently ignored except for `GeneralizedLinearMixedModel`
models with a `Binomial` distribution.

!!! warning
Models are assumed to be full rank.

!!! note
Note that `simulate!` methods with a `y::AbstractVector` as the first argument
(besides the RNG) and `simulate` methods return the simulated response. This is
Expand Down Expand Up @@ -152,20 +149,17 @@ end
function simulate!(
rng::AbstractRNG, y::AbstractVector, m::LinearMixedModel{T}; β=m.β, σ=m.σ, θ=m.θ
) where {T}
length(β) == length(fixef(m)) ||
length(β) == length(coef(m)) ||
length(β) == length(m.feterm.piv) || length(β) == m.feterm.rank ||
ararslan marked this conversation as resolved.
Show resolved Hide resolved
throw(ArgumentError("You must specify all (non-singular) βs"))

β = convert(Vector{T}, β)
σ = T(σ)
θ = convert(Vector{T}, θ)
isempty(θ) || setθ!(m, θ)

if length(β) ≠ length(coef(m))
padding = length(coef(m)) - length(β)
for ii in 1:padding
push!(β, -0.0)
end
if length(β) ≠ length(m.feterm.piv)
padding = length(model.feterm.piv) - m.feterm.rank
append!(β, fill(-0.0, padding))
ararslan marked this conversation as resolved.
Show resolved Hide resolved
end

# initialize y to standard normal
Expand Down Expand Up @@ -234,8 +228,7 @@ function _simulate!(
θ,
resp=nothing,
) where {T}
length(β) == length(fixef(m)) ||
length(β) == length(coef(m)) ||
length(β) == length(m.feterm.piv) || length(β) == m.feterm.rank ||
throw(ArgumentError("You must specify all (non-singular) βs"))

dispersion_parameter(m) ||
Expand All @@ -254,11 +247,9 @@ function _simulate!(

d = m.resp.d

if length(β) ≠ length(coef(m))
padding = length(coef(m)) - length(β)
for ii in 1:padding
push!(β, -0.0)
end
if length(β) ≠ length(m.feterm.piv)
padding = length(model.feterm.piv) - m.feterm.rank
append!(β, fill(-0.0, padding))
end

fast = (length(m.θ) == length(m.optsum.final))
Expand Down
20 changes: 18 additions & 2 deletions test/bootstrap.jl
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ end
@test pb1 == restorereplicates(seekstart(io), MixedModels.unfit!(deepcopy(m1)))

@test pb1 ≈ restorereplicates(seekstart(io), m1, Float16) rtol=1
end
end

@testset "Bernoulli simulate! and GLMM bootstrap" begin
contra = dataset(:contra)
Expand Down Expand Up @@ -220,6 +220,22 @@ end
σ=2, progress=false)
@test sum(issingular(bs)) == 0
end

@testset "Rank deficient" begin
rng = MersenneTwister(0);
x = rand(rng, 100);
data = (x = x, x2 = 1.5 .* x, y = rand(rng, 100), z = repeat('A':'T', 5))
model = @suppress fit(MixedModel, @formula(y ~ x + x2 + (1|z)), data; progress=false)
boot = quickboot(model, 10)

dropped_idx = model.feterm.piv[end]
dropped_coef = coefnames(model)[dropped_idx]
@test all(boot.β) do nt
# if we're the dropped coef, then we must be -0.0
# need isequal because of -0.0
return nt.coefname != dropped_coef || isequal(nt.β, -0.0)
end
end
end

@testset "show and summary" begin
Expand All @@ -236,7 +252,7 @@ end
@test nrow(df) == 151
@test propertynames(df) == collect(propertynames(pr.tbl))

@testset "CI method comparison" begin
@testset "CI method comparison" begin
level = 0.68
ci_boot_equaltail = confint(pb; level, method=:equaltail)
ci_boot_shortest = confint(pb; level, method=:shortest)
Expand Down
Loading