# R demo for ADA SDNS webinar
# An Introduction to Model Uncertainty and Averaging for Categorical Data Analysis
# Chris Franck, June 21, 2022

# Import data (network read: Agresti's horseshoe crab data set;
# y = 1 if the female crab has at least one satellite male)
Crabs <- read.table("http://www.stat.ufl.edu/~aa/cat/data/Crabs.dat",
                    header = TRUE)

# Probability predictions for simple logistic regression (maximum likelihood).
# Jitter the 0/1 outcome vertically so overlapping points are visible.
grid <- seq(0, 35, .1)
plot(Crabs$width, jitter(Crabs$y, .1), pch = 16,
     ylab = 'Satellites?', xlab = "Shell width")
fit <- glm(y ~ width, family = binomial, data = Crabs)
grid.frame <- data.frame(width = grid)
# Use the predict() generic (dispatches to predict.glm);
# type = 'response' returns fitted probabilities rather than logits.
grid.pred <- predict(fit, newdata = grid.frame, type = 'response')
lines(grid, grid.pred, lwd = 2, col = 'red')

# Plot the data: outcome against each candidate predictor
par(mfrow = c(2, 2), las = 1)
plot(Crabs$width, jitter(Crabs$y, .1), pch = 16,
     ylab = 'Satellites?', xlab = "Shell width")
plot(jitter(Crabs$color, .3), jitter(Crabs$y, .1), pch = 16,
     ylab = 'Satellites?', xlab = "Shell color")
plot(jitter(Crabs$spine, .3), jitter(Crabs$y, .1), pch = 16,
     ylab = 'Satellites?', xlab = "Spine condition")
plot(Crabs$weight, jitter(Crabs$y, .1), pch = 16, ylab = "Satellites?")

# A look at potential multicollinearity.
# Add Gaussian jitter to the discrete variables so the pairs plot is readable.
# Use nrow(Crabs) rather than the hard-coded sample size 173 so this
# survives any change to the data set.
n <- nrow(Crabs)
y.j <- Crabs$y + rnorm(n, 0, .1)
color.j <- Crabs$color + rnorm(n, 0, .1)
spine.j <- Crabs$spine + rnorm(n, 0, .1)
pairs(data.frame(y = y.j, width = Crabs$width, color = color.j,
                 spine = spine.j, weight = Crabs$weight),
      pch = 16)

# BIC approximation approach to posterior model probabilities
library(bestglm)
?bestglm
# bestglm expects the design columns followed by the outcome column named y
Xy <- data.frame(Crabs[, 4:7], y = Crabs$y)
BIC.list <- bestglm(Xy, family = binomial, IC = "BIC", TopModels = 16,
                    RequireFullEnumerationQ = TRUE)
# bestglm will not output all 16 models, only 15.
# Manually add the 16th model (spine only); this row was determined from
# bestglm(Xy, family = binomial, IC = "BIC", TopModels = 5, nvmax = 1)$BestModels
Model.frame <- rbind.data.frame(BIC.list$BestModels,
                                c(FALSE, FALSE, FALSE, TRUE, 230.7776))
Model.frame
# Sanity check: how many models of each size (0-4 predictors) are present
table(apply(Model.frame[, 1:4], 1, sum))

# The next chunk of code implements the formulas for the BIC approximation
# to posterior model probabilities in the slides.
bic.vec <- Model.frame$Criterion
base.bic <- bic.vec[1]
# Approximate Bayes factor of each model vs the best-BIC (base) model
BFk1 <- exp(-.5 * (bic.vec - base.bic))
# uniform prior on model space
# Uniform prior over the 2^4 = 16 candidate models
mod.prior <- rep(1/16, 16)
base.prior <- mod.prior[1]
# Posterior probability of the base (best-BIC) model, then of every model,
# via the BIC-approximated Bayes factors BFk1 computed above
p.m1gy <- 1 / sum((mod.prior / base.prior) * BFk1)
post.mod.prob <- (mod.prior / base.prior) * BFk1 * p.m1gy
Model.frame$post.mod.prob <- round(post.mod.prob, 3)

# A look at the posterior model probabilities
Model.frame

# Compute and plot the posterior inclusion probabilities:
# for each predictor, sum the posterior probabilities of models containing it
width.incl <- sum(post.mod.prob[Model.frame$width == 1])
weight.incl <- sum(post.mod.prob[Model.frame$weight == 1])
color.incl <- sum(post.mod.prob[Model.frame$color == 1])
spine.incl <- sum(post.mod.prob[Model.frame$spine == 1])
par(mfrow = c(1, 1), las = 1)
plot(1:4, c(width.incl, weight.incl, color.incl, spine.incl),
     ylim = c(0, 1), type = 'h', axes = FALSE,
     ylab = 'Inclusion probability', xlab = "Candidate predictor")
axis(1, at = 1:4, labels = c('width', 'weight', 'color', 'spine'))
axis(2, las = 1)
abline(h = .5, lty = 3, col = 'lightgray')

# BAS offers a convenient fully Bayesian way to do all of this.
# Guard the install so rerunning the script does not reinstall the
# package (and does not fail on machines without write/network access).
if (!requireNamespace("BAS", quietly = TRUE)) {
  install.packages("BAS")
}
library(BAS)
Crabs.BAS <- bas.glm(y ~ ., n.models = 2^4, data = Crabs[, 3:7],
                     method = "MCMC", MCMC.iterations = 5000,
                     betaprior = bic.prior(), family = binomial(),
                     modelprior = uniform())
plot(Crabs.BAS)
Crabs.BAS$probne0  # marginal posterior inclusion probabilities from BAS

# Obtain model averaged predictions
BMA.pred <- predict(Crabs.BAS, type = "response")$fit
# Obtain highest posterior probability model (HPM) predictions
HPM.pred <- predict(Crabs.BAS, type = "response", estimator = "HPM")$fit
# Simple logistic fit from the beginning of the demo
SLR.pred <- predict(fit, type = "response")
plot(SLR.pred, BMA.pred)
abline(0, 1)
plot(HPM.pred, BMA.pred)
abline(0, 1)

# Point biserial correlations between outcome and predictions of each type
cor(BMA.pred, Crabs$y)  # Model averaged predictions
cor(HPM.pred, Crabs$y)  # Highest posterior model predictions
cor(SLR.pred, Crabs$y)  # Simple logistic regression predictions

# Examine model averaged posterior distributions of coefficients.
# BUG FIX: the fitted object is named Crabs.BAS; the original referenced a
# nonexistent Crabs.BMA, which errored on the final two statements.
coef.BMA <- coef(Crabs.BAS)
plot(coef.BMA)