# R demo for ADA SDNS webinar
# An Introduction to Model Uncertainty and Averaging for Categorical Data Analysis
# Chris Franck, June 21, 2022

# Import data (network read: Agresti's horseshoe crab data set;
# y = 1 if the female crab has at least one satellite male)
Crabs <- read.table("http://www.stat.ufl.edu/~aa/cat/data/Crabs.dat",
                    header = TRUE)

# Probability predictions for simple logistic regression (maximum likelihood).
# Jitter the 0/1 outcome vertically so overlapping points are visible.
grid <- seq(0, 35, .1)
plot(Crabs$width, jitter(Crabs$y, .1), pch = 16,
     ylab = 'Satellites?', xlab = "Shell width")
fit <- glm(y ~ width, family = binomial, data = Crabs)
grid.frame <- data.frame(width = grid)
# Use the predict() generic (dispatches to predict.glm);
# type = 'response' returns fitted probabilities rather than logits.
grid.pred <- predict(fit, newdata = grid.frame, type = 'response')
lines(grid, grid.pred, lwd = 2, col = 'red')

# Plot the data: outcome against each candidate predictor
par(mfrow = c(2, 2), las = 1)
plot(Crabs$width, jitter(Crabs$y, .1), pch = 16,
     ylab = 'Satellites?', xlab = "Shell width")
plot(jitter(Crabs$color, .3), jitter(Crabs$y, .1), pch = 16,
     ylab = 'Satellites?', xlab = "Shell color")
plot(jitter(Crabs$spine, .3), jitter(Crabs$y, .1), pch = 16,
     ylab = 'Satellites?', xlab = "Spine condition")
plot(Crabs$weight, jitter(Crabs$y, .1), pch = 16, ylab = "Satellites?")

# A look at potential multicollinearity.
# Add Gaussian jitter to the discrete variables so the pairs plot is readable.
# Use nrow(Crabs) rather than the hard-coded sample size 173 so this
# survives any change to the data set.
n <- nrow(Crabs)
y.j <- Crabs$y + rnorm(n, 0, .1)
color.j <- Crabs$color + rnorm(n, 0, .1)
spine.j <- Crabs$spine + rnorm(n, 0, .1)
pairs(data.frame(y = y.j, width = Crabs$width, color = color.j,
                 spine = spine.j, weight = Crabs$weight),
      pch = 16)

# BIC approximation approach to posterior model probabilities
library(bestglm)
?bestglm
# bestglm expects the design columns followed by the outcome column named y
Xy <- data.frame(Crabs[, 4:7], y = Crabs$y)
BIC.list <- bestglm(Xy, family = binomial, IC = "BIC", TopModels = 16,
                    RequireFullEnumerationQ = TRUE)
# bestglm will not output all 16 models, only 15.
# Manually add the 16th model (spine only); this row was determined from
# bestglm(Xy, family = binomial, IC = "BIC", TopModels = 5, nvmax = 1)$BestModels
Model.frame <- rbind.data.frame(BIC.list$BestModels,
                                c(FALSE, FALSE, FALSE, TRUE, 230.7776))
Model.frame
# Sanity check: how many models of each size (0-4 predictors) are present
table(apply(Model.frame[, 1:4], 1, sum))

# The next chunk of code implements the formulas for the BIC approximation
# to posterior model probabilities in the slides.
bic.vec <- Model.frame$Criterion
base.bic <- bic.vec[1]
# Approximate Bayes factor of each model vs the best-BIC (base) model
BFk1 <- exp(-.5 * (bic.vec - base.bic))
# uniform prior on model space
# Uniform prior over the 2^4 = 16 candidate models
mod.prior <- rep(1/16, 16)
base.prior <- mod.prior[1]
# Posterior probability of the base (best-BIC) model, then of every model,
# via the BIC-approximated Bayes factors BFk1 computed above
p.m1gy <- 1 / sum((mod.prior / base.prior) * BFk1)
post.mod.prob <- (mod.prior / base.prior) * BFk1 * p.m1gy
Model.frame$post.mod.prob <- round(post.mod.prob, 3)

# A look at the posterior model probabilities
Model.frame

# Compute and plot the posterior inclusion probabilities:
# for each predictor, sum the posterior probabilities of models containing it
width.incl <- sum(post.mod.prob[Model.frame$width == 1])
weight.incl <- sum(post.mod.prob[Model.frame$weight == 1])
color.incl <- sum(post.mod.prob[Model.frame$color == 1])
spine.incl <- sum(post.mod.prob[Model.frame$spine == 1])
par(mfrow = c(1, 1), las = 1)
plot(1:4, c(width.incl, weight.incl, color.incl, spine.incl),
     ylim = c(0, 1), type = 'h', axes = FALSE,
     ylab = 'Inclusion probability', xlab = "Candidate predictor")
axis(1, at = 1:4, labels = c('width', 'weight', 'color', 'spine'))
axis(2, las = 1)
abline(h = .5, lty = 3, col = 'lightgray')

# BAS offers a convenient fully Bayesian way to do all of this.
# Guard the install so rerunning the script does not reinstall the
# package (and does not fail on machines without write/network access).
if (!requireNamespace("BAS", quietly = TRUE)) {
  install.packages("BAS")
}
library(BAS)
Crabs.BAS <- bas.glm(y ~ ., n.models = 2^4, data = Crabs[, 3:7],
                     method = "MCMC", MCMC.iterations = 5000,
                     betaprior = bic.prior(), family = binomial(),
                     modelprior = uniform())
plot(Crabs.BAS)
Crabs.BAS$probne0  # marginal posterior inclusion probabilities from BAS

# Obtain model averaged predictions
BMA.pred <- predict(Crabs.BAS, type = "response")$fit
# Obtain highest posterior probability model (HPM) predictions
HPM.pred <- predict(Crabs.BAS, type = "response", estimator = "HPM")$fit
# Simple logistic fit from the beginning of the demo
SLR.pred <- predict(fit, type = "response")
plot(SLR.pred, BMA.pred)
abline(0, 1)
plot(HPM.pred, BMA.pred)
abline(0, 1)

# Point biserial correlations between outcome and predictions of each type
cor(BMA.pred, Crabs$y)  # Model averaged predictions
cor(HPM.pred, Crabs$y)  # Highest posterior model predictions
cor(SLR.pred, Crabs$y)  # Simple logistic regression predictions

# Examine model averaged posterior distributions of coefficients.
# BUG FIX: the fitted object is named Crabs.BAS; the original referenced a
# nonexistent Crabs.BMA, which errored on the final two statements.
coef.BMA <- coef(Crabs.BAS)
plot(coef.BMA)