bic.surv <- function(x, surv.t, cens, strict = F, OR = 20, prior.param = c(rep(0.5, ncol(x))), OR.fix=2, nbest = 150, factor.type=T, factor.prior.adjust=F) { ### S-PLUS function to apply Bayesian Model Averaging to variable ### selection for Cox proportional hazard models. Bayesian ### Model Averaging accounts for the model uncertainty inherent in the ### variable selection problem by averaging over the best models in the ### model class according to posterior model probability. For more details see ### references at the end of the documentation. ### Developer: Chris Volinsky (volinsky@stat.washington.edu) ### ### This software is not formally maintained, but I will be happy to ### hear from people who have problems with it, although I cannot guarantee to ### to solve them! ### ### Permission is hereby given to StatLib to redistribute this software. ### The software can be freely used for non-commercial purposes, and can ### be freely distributed for non-commercial purposes only. ### The copyright is retained by the developer. ### Copyright 1994, 1996, 1997 Chris T. Volinsky ### n = total number of data points, p = number of independent variables ### ### Inputs: ### x matrix of independent variables (n x p matrix) ### surv.t n-vector of survival times ### cens indicator of length n of censoring (0=censored 1=uncensored) ### strict = T will return a more parsimonious set of models (Occam's Window). ### Any modelwith a more likely submodel will be eliminated. ### = F (default). Returns all models within 1/OR in posterior model prob. ### OR maximum ratio for excluding models in Occam's window ### Default is 20. ### prior.param p-vector of prior probabilities that parameter is non-zero. Default ### puts a prior of .5 on all parameters. Setting to 1 forces the variable ### into the model. ### OR.fix Width of the window which keeps models after the leaps ### approximation is done. Because the leaps and bounds gives only an ### approximation to BIC, there is a need to increase the window at this ### first "cut" so as to assure that no good models are deleted. The level ### of the cut is at 1/(OR^OR.fix), and the default value for OR.fix is 2. ### nbest sets the nbest parameter for the S-PLUS leaps function. Default is 150, ### this is a conservative value which makes the probability of "losing" a ### good model slim. Reducing this value could increase run time, but there ### is a slight risk of losing valuable models. ### factor.type How should the function model categorical variables of class "factor"? ### A categorical variable with d levels is turned into (d-1) dummy variables ### using a treatment contrast. If factor.type = T, models will contain ### either all or none of these dummy variables. If factor.type = F, models ### are free to select the dummy variables independently. In this case, ### factor.prior.adjust determines the prior on these variables. ### factor.prior.adjust ### If factor.type=F, this parameter determines the prior on the d-1 dummy ### variables. When factor.prior.adjust=F, all dummy variables for ### variable i have prior equal to prior.param[i] (or pp[i]). Note that ### this makes the prior probability of the union of these variables much ### higher than pp[i]. Setting factor.prior.adjust=T corrects for this so ### that the union of the dummies equals pp[i] (and hence the deletion of ### the factor has a prior of 1-pp[i]). This adjustment ### changes the individual priors on each dummy variable to ### 1-(1-pp[i])^(1/(k+1)). ### Outputs: ### postprob posterior probabilities of the models selected ### label labels identifying the models selected ### size number of variables in each model (length M) ### bic BIC = - 2 * (loglik - loglik.null) + size * log(n) ### (Note: BIC returned does not take into account model priors, ### so rank order of BIC may not be the same as the rank ### order of postprob if non-uniform parameter priors are ### specified) ### which a logical matrix of size length(label) x size ### probne0 posterior probability that each variable is non-zero (in %) ### postmean model-averaged posterior mean of the parameters (includes averaged-in zeros). ### postsd model-averaged posterior sd of the parameters (includes averaged-in zeros). ### mle n x p matrix giving the maximum likelihood estimate of each coefficient for ### each model. ### se n x p matrix giving the standard errors of each coefficient for each model. ### prior.param the priors on the parameters. Will be different than input if factors ### are used and factor.type =F. ### prior.model.weights ### If priors on variables are not all .5, then prior weights gives the ratio ### of the model prior to a uniform model prior, based on prior.param. A ### prior.model.weight > 1 means the model is weighed more than uniform ### namesx string of variable names ### ### References: ### ### Volinsky, C.T., Madigan, D., Raftery, A.E. and Kronmal, R.A. (1997). ### "Bayesian Model Averaging in Proportional Hazard Models: Assessing the Risk ### of a Stroke." Applied Statistics (46). Available at: ### http://www.stat.washington.edu/volinsky/vmrk.ps ### ### Furnival, G.M. and R.W. Wilson "Regression by leaps and bounds" ### Technometrics 16, 499-511. ### ### Lawless, J. and K. Singhal "Efficient screening on nonnormal ### regression models. Biometrics 34 318-27. ########################################################################### ########################################################################### leaps.bs_function(info,coef, nbest = nbest, names.arg = names(x)) { ### Function to be used in conjunction with bic.surv. ### This is just a modification of the S-Plus "leaps" function, adjusted ### to return the best models for a proportional hazards model (using coxph) ### using the method of Lawless and Singhal (1978). names.arg <<- names.arg # evaluate names.arg before cbind if(length(names.arg) < ncol(info)) stop("Too few names") bIb <- coef %*% info %*% coef kx <- ncol(info) maxreg <- nbest * kx if(kx < 3) stop("Too few independent variables") imeth <- 1 # Corresponds to R^2 df <- kx + 1 # df not important for r2 Ib <- info %*% coef rr <- cbind(info, Ib) rr <- rbind(rr, c(Ib, bIb)) ans <- .Fortran("leaps", ### Fortran routine as.single(rr), as.integer(kx), as.integer(kx + 1), as.integer(df), as.integer(imeth), as.integer(nbest), as.single(0.0001), regid = integer(maxreg), Cp = single(maxreg), size = integer(maxreg), nreg = integer(1), single((nbest + 4) * (kx + 1) + (((kx + 1) * (kx + 2))/2) * ((2 * (kx + 3))/3 + 1)), integer(4 * (kx + 1)^2 + 8 * (kx + 1) + (nbest + 1) * (kx + 1)) )[c("Cp", "size", "nreg", "regid")] ### cleanup Fortran output; fix Cp, r2 and adjr2. nreg <- ans$nreg Cp <- ans$Cp length(Cp) <- nreg size <- ans$size length(size) <- nreg which <- matrix(T, nreg, kx) z <- ans$regid length(z) <- nreg ### Rick Becker's mask function to create which and then labels which <- matrix(as.logical((rep.int(z, kx) %/% rep.int(2^((kx - 1):0),rep.int(length(z), kx))) %% 2), byrow = F, ncol = kx) label <- character(nreg) sep <- if(all(nchar(names.arg) == 1)) "" else "," for(i in 1:nreg) label[i] <- paste(names.arg[which[i, ]], collapse = sep) ans <- list(Cp, size = size, label = label, which = which) names(ans)[1] <- "r2" ans } # end of leaps.bs ### Start of bic.surv function options(contrasts=c("contr.treatment","contr.treatment")) x <- data.frame(x) # will give it names as well output.names <- names(x) prior.weight.denom <- .5^ncol(x) ### If there are more than 30 columns in x, reduce to 30 using backwards ### elimination. if(ncol(x) > 30) { while(ncol(x)>30) { x.coxph <- data.frame(x, surv.t = surv.t, cens = cens) cox <- coxph(Surv(surv.t, cens) ~ ., data = x.coxph,method="breslow",iter.max=30) coef <- cox$coef se <- sqrt(diag(cox$var)) pvec <- signif(1 - pchisq((coef/se)^2, 1), 3) x <- x[, - rev(order(pvec))[1]] } } ### Remove NA's x2 <- na.omit(x) used <- match(row.names(x), row.names(x2)) omitted <- seq(nrow(x))[is.na(used)] if(length(omitted) > 0) { x <- x2 surv.t <- surv.t[ - omitted] cens <- cens[ - omitted] warning(paste("There were ",length(omitted),"records deleted due to NA's")) } leaps.x_x ### locate factors and count factor levels factors_!all(unlist(lapply(factor.names(x),is.null))) if(factors) { fn_factor.names(x) cdf <- cbind.data.frame(y=surv.t,x) mm <- model.matrix(formula(cdf),data=cdf)[,-1,drop=F] mmm <- data.frame(matrix(mm,nrow=nrow(mm),byrow=F)) names(mmm) <- dimnames(mm)[[2]] output.names <- names(mmm) x.coxph <- data.frame(surv.t=surv.t,cens=cens,x) cox.out <- coxph(Surv(surv.t,cens)~.,data=x.coxph,method="breslow") cox.assign_cox.out$assign fac.levels <- unlist(lapply(cox.assign,length)[-1]) if (factor.type) { for (i in 1:length(names(x))){ if (!is.null(fn[[i]])){ nx <- names(x)[i] coefs_cox.out$coef[cox.out$assign[[i+1]]] old.vals_x[,i] new.vals_c(0,coefs) new.vec_as.vector(new.vals[match(old.vals,fn[[i]])]) leaps.x[,nx]_new.vec } } } else ## factor.type = F { new.prior_NULL for (i in 1:length(names(x))){ addprior_prior.param[i] if (!is.null(fn[[i]])) { k_length(fn[[i]]) if (factor.prior.adjust) addprior_rep(1-(1-prior.param[i])^(1/(k-1)),k-1) else addprior_rep(prior.param[i],k-1) } new.prior_c(new.prior,addprior) } prior.param_new.prior x <- leaps.x <- mmm } } ### Execute leaps and bounds algorithm x.coxph <- data.frame(surv.t=surv.t,cens=cens,leaps.x) cox.out <- coxph(Surv(surv.t,cens)~.,data=x.coxph,method="breslow") n <- sum(cens) if (ncol(leaps.x)>=3) { coef <- cox.out$coef info <- solve(cox.out$var) a <- leaps.bs(info,coef, names.arg = names(x),nbest = nbest) a$r2 <- c(0, a$r2)/100 a$size <- c(0, a$size) a$label <- c("NULL", a$label) a$which <- rbind(rep(F, ncol(x)), a$which) nmod <- length(a$size) ### Calculate BIC and apply the first OW rule approximately, using the ### WLS approximation but using OR^OR.fix instead of OR. prior.mat <- matrix(rep(prior.param, nmod), nmod,ncol(x), byrow = T) prior <- apply(a$which * prior.mat + (!a$which) * (1 - prior.mat), 1, prod) bIb <- as.numeric(coef %*% info %*% coef) lrt <- bIb - (a$r2 * bIb) bic <- lrt + (a$size) * log(n) - 2 * log(prior) occam <- bic - min(bic) < 2*OR.fix * log(OR) # From MUS p.24 with OR=B_12 size <- a$size[occam] label <- a$label[occam] which <- a$which[occam, ,drop=F] bic <- bic[occam] prior <- prior[occam] } else # ncol(leaps.x) <= 2 { nmod <- switch(ncol(x),2,4) bic <- label <- rep(0, nmod) which <- matrix(c(F, T, F, T, F, F, T, T), nmod, nmod/2) size <- c(0, 1, 1, 2)[1:nmod] sep <- if(all(nchar(names.arg) == 1)) "" else "," prior.mat <- matrix(rep(prior.param, nmod), nmod, ncol(x), byrow = T) prior <- apply(which * prior.mat + (!which) * (1 - prior.mat), 1, prod) for(k in 1:nmod) { if (k==1) label[k] <- "NULL" else label[k] <- paste(names(leaps.x)[which[k, ]], collapse = sep) } } ### Calculate BIC exactly for the remaining models using "coxph" model.fits <- as.list(rep(0, length(label))) loglik <- rep(0, length(label)) size <- rep(0, length(label)) loglik.null <- cox.out$loglik[1] for(k in (1:length(label))) { if(sum(which[k, ]) != 0) { x.coxph_data.frame(x[, which[k, ],drop=F],surv.t=surv.t,cens=cens) cox.out <- coxph(Surv(surv.t,cens)~.,data=x.coxph, iter.max = 30, method="breslow") loglik[k] <- cox.out$loglik[2] size[k] <- length(cox.out$coef) model.fits[[k]] <- matrix(rep(0, 2 * length(cox.out$coef)), ncol = 2) model.fits[[k]][, 1] <- cox.out$coef model.fits[[k]][, 2] <- sqrt(diag(cox.out$var)) } else { loglik[k] <- loglik.null } } bic <- size * log(n) - 2 * (loglik - loglik.null) - 2 * log(prior) ### Now apply the first Occam's window rule using the exact BIC occam <- bic - min(bic) < 2 * log(OR) size <- size[occam] label <- label[occam] which <- which[occam, ,drop=F] if(length(size) == 1) which <- as.vector(which) bic <- bic[occam] prior <- prior[occam] model.fits <- model.fits[occam] postprob <- (exp(-0.5 * (bic - min(bic))))/sum(exp(-0.5 * (bic - min(bic)))) ### Order models in descending order of posterior probability order.bic <- order(bic, size, label) size <- size[order.bic] label <- label[order.bic] which <- which[order.bic, ,drop=F] bic <- bic[order.bic] prior <- prior[order.bic] postprob <- postprob[order.bic] model.fits <- model.fits[order.bic] ### Apply the second rule to eliminate models with better ones nested nmod_length(size) if(strict &(nmod != 1)) { occam <- rep(T, nmod) for(k in (2:nmod)) { for(j in (1:(k - 1))) { which.diff <- which[k, ] - which[j, ] if(all(which.diff >= 0)) occam[k] <- F } } size <- size[occam] label <- label[occam] which <- which[occam, ,drop=F] bic <- bic[occam] prior <- prior[occam] postprob <- postprob[occam]/(sum(postprob[occam])) model.fits <- model.fits[occam] } ### Re scale so bic for null model = 0 bic <- bic + 2* log(prior) ### Calculate Pr[beta_i != 0 | D] for each coefficient beta_i probne0 <- round(100 * t(which) %*% as.matrix(postprob), 1) nmod <- length(bic) nvar <- length(output.names) Ebi <- rep(0, nvar) SDbi <- rep(0, nvar) EbiMk <- matrix(rep(0, nmod * nvar), nrow = nmod) sebiMk <- matrix(rep(0, nmod * nvar), nrow = nmod) if (factors==F || factor.type==F) { for(i in (1:nvar)) if(sum(which[, i] != 0)) for(k in (1:nmod)) if(which[k, i] == T) { pos <- sum(which[k, (1:i)]) EbiMk[k, i] <- model.fits[[k]][(pos), 1] sebiMk[k, i] <- model.fits[[k]][(pos), 2] } } else # factor.type==T { for(i in (1:ncol(x))) { whereisit_cox.assign[[i+1]] if(any(which[, i])) for(k in (1:nmod)) if(which[k, i] == T) { spot_sum(which[k,(1:i)]) posMk <- (c(0,cumsum(fac.levels[which[k, ]]))+1)[spot] posMk <- posMk:(posMk+fac.levels[i]-1) EbiMk[k,whereisit]<- model.fits[[k]][posMk, 1] sebiMk[k, whereisit] <- model.fits[[k]][posMk, 2] } } } Ebi <- postprob %*% EbiMk Ebimat_matrix(rep(Ebi,nmod),nrow=nmod,byrow=T) SDbi <- sqrt(postprob%*%(sebiMk^2) + postprob%*%((EbiMk-Ebimat)^2)) dimnames(which) <- list(NULL,names(x)) dimnames(EbiMk)_dimnames(sebiMk)_list(NULL,output.names) ### Output list(postprob = postprob, label = label, size = size, bic = bic, prior.param=prior.param, prior.model.weights=prior/prior.weight.denom, which = which, probne0 = c(probne0), postmean = as.vector(Ebi), postsd = as.vector(SDbi), mle = EbiMk, se = sebiMk, namesx = output.names) }