#This function is taken from the GSEA software since it is very fast. #Because of this it cannot be distributed until it is rewritten from #scratch. mean.sd.ests <- function(geneset.data, c.lab=NULL, s0=0.01){ if(is.null(c.lab)){ c.lab <- geneset.data$class.labels } P <- c.lab P[c.lab == TRUE] <- 1 P[c.lab == FALSE] <- 0 A <- geneset.data$expr n1 <- sum(P) M1 <- A %*% P M1 <- M1/n1 A2 <- A*A S1 <- A2 %*% P S1 <- S1/n1 - M1*M1 P <- 1-P n2 <- sum(P) M2 <- A %*% P M2 <- M2/n2 gc() A2 <- A*A S2 <- A2 %*% P S2 <- S2/n2 - M2*M2 t.num1 <- M1 - M2 pooled.var <- s0+(S2*(n2-1) + S1*(n1-1))/ (n1 + n2 -2) t.denom1 <- sqrt( pooled.var * (n2^-1 + n1^-1) ) t.stat <- t.num1 / t.denom1 pt1 <- pt(-abs(t.stat), n1+n2-2)*2 return(list(m1=M1, m2=M2, s1=S1, s2=S2, t=t.stat, pt=pt1, n=c(n1, n2), pooled.var=pooled.var)) } p.val.mod.v1 <- function(x){ return( exp(28*(1-x) -25)/(1+exp(28*(1-x) -25) ) ) } #This function computes the score. there are several commented lines starting with #'T0'. These are variations on the statistic. whw.score.sub <- function( geneset.data, geneset.sorted.idx, geneset.transf.data, c.lab, p.val.mod=p.val.mod.v1, ratio.diff.mod=function(x,y){ return( sqrt(abs(x)) * y ) }, ratio.base=10, s0=0.01 ) { transf.ests <- mean.sd.ests(geneset.transf.data, c.lab, s0=s0) ests <- mean.sd.ests(geneset.data, c.lab, s0=s0) m1 <- ests$m1 m2 <- ests$m2 med1.idx = c(floor(median(1:sum(c.lab))), ceiling(median(1:sum(c.lab)))) med2.idx = c(floor(median(1:sum(!c.lab))), ceiling(median(1:sum(!c.lab)))) med1 = matrix(0, nrow(geneset.data$expr), 2) med2 = matrix(0, nrow(geneset.data$expr), 2) for(i in 1:nrow(geneset.data$expr)) { med1[i, ] = geneset.data$expr[i, geneset.sorted.idx[i, c.lab[geneset.sorted.idx[i, ]]][med1.idx]] med2[i, ] = geneset.data$expr[i, geneset.sorted.idx[i, !c.lab[geneset.sorted.idx[i, ]]][med2.idx]] } med1 = rowMeans(med1) med2 = rowMeans(med2) ratio.v <- pmax(0, (ratio.base+ pmax(med1, med2))/(ratio.base+ pmin(m1, m2))-1 ) T0 <- ratio.diff.mod(m1-m2, ratio.v) px <- transf.ests$pt px2 <- ests$pt comp.stats <- T0 * ( p.val.mod( px ) * p.val.mod( px2 ) ) return( list(base.stats=T0, px=px, px2=px2, comp.stats=comp.stats) ) } whw.score <- function( geneset.data, n.perms, p.val.mod=p.val.mod.v1, ratio.diff.mod=function(x,y){ return( sqrt(abs(x)) * y ) }, t.pretransf=function(x){ return(log(abs(x)) ) }, n.fdr.calc=Inf, debug.iter=100, ratio.base=10, n.boot.rep=0, n.gene.boot=100, s0=0.01 ) { geneset.sorted.idx = matrix(0, nrow(geneset.data$expr), ncol(geneset.data$expr)) for(i in 1:nrow(geneset.data$expr)) geneset.sorted.idx[i, ] = order(geneset.data$expr[i, ]) if(n.perms > 0){ base.stats <- array(0, c(nrow(geneset.data$expr), n.perms+1) ) comp.stats <- array(0, c(nrow(geneset.data$expr), n.perms+1) ) base.pvals <- array(0, c(nrow(geneset.data$expr), n.perms+1) ) base.pvals2 <- array(0, c(nrow(geneset.data$expr), n.perms+1) ) geneset.data$ests <- mean.sd.ests(geneset.data, geneset.data$class.labels) geneset.transf.data <- geneset.data geneset.transf.data$expr <- t.pretransf(geneset.transf.data$expr) print("fdr permutations") res1 <- whw.score.sub(geneset.data, geneset.sorted.idx, geneset.transf.data, geneset.data$class.labels, p.val.mod=p.val.mod, ratio.diff.mod=ratio.diff.mod, ratio.base=ratio.base) c.lab <- geneset.data$class.labels base.stats[,1] <- res1$base.stats comp.stats[,1] <- res1$comp.stats base.pvals[,1] <- res1$px base.pvals2[,1] <- res1$px2 for(i in 1:n.perms){ #print(paste("c", i) ) c.lab.p <- sample( c.lab, ncol(geneset.data$expr), replace=FALSE ) res1 <- whw.score.sub(geneset.data, geneset.sorted.idx, geneset.transf.data, c.lab.p, p.val.mod=p.val.mod, ratio.diff.mod=ratio.diff.mod, ratio.base=ratio.base) base.stats[,i+1] <- res1$base.stats comp.stats[,i+1] <- res1$comp.stats base.pvals[,i+1] <- res1$px base.pvals2[,i+1] <- res1$px2 if( i %% debug.iter == 0){ print( paste("permutation iteration ", i) ) } } i4a <- sort(abs(comp.stats[,1]), decreasing=FALSE, index.return=TRUE)$ix abs.obs.scores <- abs(comp.stats[,1]) abs.perm.scores <- abs(comp.stats) T.perm <- abs.perm.scores[, 2:ncol(abs.perm.scores)] }else{ T.perm <- geneset.data$score2$T.perm abs.obs.scores <- abs(geneset.data$score2$comp.stats) abs.perm.scores <- cbind(abs.obs.scores, abs(T.perm)) comp.stats <- abs.perm.scores base.stats = comp.stats base.pvals = comp.stats base.pvals2=comp.stats final.pvals = comp.stats } n.perms <- ncol(T.perm) fdr.ests <- NULL exp.T.list <- NULL h2l.ind <- sort(abs.obs.scores, decreasing=TRUE, index.return=TRUE)$ix last.tot.false <- 0 if(n.fdr.calc > 0){ print("calculating fdr") bin.inds <- rep(1, length(geneset.data$ests$m1)) n.fdr.calc <- min(n.fdr.calc, nrow(geneset.data$expr)-1) fdr.ests <- matrix(nrow=nrow(geneset.data$expr), ncol=4 ) for(i in 1:n.fdr.calc){ v1 <- 2:ncol(comp.stats) cur.stat <- abs.obs.scores[ h2l.ind[i] ] for(j in 2:ncol(comp.stats)){ v1[j-1] <- sum(abs.perm.scores[,j] > cur.stat)/i } fdr.ests[h2l.ind[i],] <- c(quantile(v1, probs = c(.1, .5, .9) ), mean(v1)) if( i %% debug.iter == 0){ print( paste("fdr iteration ", i) ) } } } n.gene.boot <- min(nrow(geneset.data$expr), n.gene.boot) T.star <- NULL if(n.boot.rep > 0){ print("bootstrap sampling") T.star <- matrix(nrow=nrow(comp.stats), ncol=n.boot.rep) sub.inds <- h2l.ind[1:n.gene.boot] geneset.data2 <- geneset.data geneset.data2$expr <- geneset.data2$expr[ sub.inds, ] i2a <- (1:ncol(geneset.data2$expr))[geneset.data2$class.labels == TRUE] i2b <- (1:ncol(geneset.data2$expr))[geneset.data2$class.labels == FALSE] geneset.data2$class.labels <- rep(FALSE, length(geneset.data2$class.labels)) geneset.data2$class.labels[1:length(i2a)] <- TRUE for(i in 1:n.boot.rep){ geneset.data2b <- geneset.data2 s1 <- sample(i2a, length(i2a), replace=TRUE) s2 <- sample(i2b, length(i2b), replace=TRUE) geneset.data2b$expr <- geneset.data2b$expr[, c(s1, s2)] geneset.transf.data2b <- geneset.data2b geneset.transf.data2b$expr <- t.pretransf(geneset.transf.data2b$expr) res1 <- whw.score.sub(geneset.data2b, geneset.transf.data2b, geneset.data2b$class.labels, p.val.mod=p.val.mod, ratio.diff.mod=ratio.diff.mod, ratio.base=ratio.base) T.star[ sub.inds,i] <- res1$comp.stats } } print("saving and exiting") foo1 <- function(x){ return( (sum(x >= x[1])-1)/n.perms ) } final.pvals <- apply( comp.stats, 1, foo1 ) geneset.data$score <- list(base.stats=base.stats[,1], t.pvals=base.pvals[,1], logt.pvals2=base.pvals2[,1], T=comp.stats[,1], perm.pvals=final.pvals, est.fdr=fdr.ests, T.star=T.star, T.perm=T.perm) return( geneset.data ) } #Returns a matrix which contains 10%, median, 90% quantiles, and mean FDR estimate #in the first through fourth columns respectively. T.fdr.curve <- function( geneset.data, list.size=Inf){ list.size <- min( nrow(geneset.data$expr), list.size, sum(!is.na(geneset.data$score$est.fdr[,1])) ) if(list.size == 0){ return(NULL) }else{ i3 <- sort( abs(geneset.data$score$T), decreasing=TRUE, index.return=TRUE)$ix[1:list.size] return( geneset.data$score$est.fdr[i3,2] ) } } T.bootstrap.ranks <- function( geneset.data, quant.seq=c(.7, .5, .1), small.list=50, large.list=min(200, nrow(geneset.data$expr)) ) { i3 <- sort( abs(geneset.data$score$T), index.return=TRUE)$ix[1:large.list] n.boot.rep <- ncol(geneset.data$score$T.star) r1.newstat.m <- matrix(nrow=large.list, ncol=n.boot.rep) for(i in 1:n.boot.rep){ r1.newstat.m[,i] <- large.list+1-rank(abs(geneset.data$score$T.star[i3,i])) } func2 <- function(x){ return(quantile(x, quant.seq)) } return( apply(r1.newstat.m, 1, func2) ) } plot.bootstrap.ranks <- function(geneset.data, num.genes=200){ top.list <- sort(geneset.data$score$T, decreasing=TRUE, index.return=TRUE)$ix[1:(num.genes*2)] res1 <- num.genes*2 + 1 - apply(geneset.data$score$T.star[top.list,], 2, rank) res1 <- res1[1:num.genes,] my.quant <- function(x){ return( quantile(x, c(.1, .2, .5, .8, .9) ) ) } res2 <- apply(res1, 1, my.quant) plot(c(1, num.genes), range(res2), col="white", xlab="gene index", ylab="rank", main="bootstrap rankings") lty.v <- c("dotted", "dashed", "solid", "dashed", "dotted") for(j in 1:nrow(res2)){ lines(1:num.genes, res2[j,], lty=lty.v[j]) } abline(a=0, b=1, col="red") } analyze.data <- function(data, clabel, file.str = "test6", n.perms = 300, n.fdr.calc = 3000, ratio.base = 30, save.plots=TRUE){ data.list <- list( expr=as.matrix(data[,-1]), class.labels=clabel, probesets=rownames(data) ) data.list$expr <- pmax(data.list$expr, 1) data.list$ests <- mean.sd.ests(data.list, data.list$class.labels) if(save.plots){ png(paste(file.str, "_modRatio.png", sep="") ) r0.seq <- as.integer( seq(from=10, to=40, length=4) ) par(mfrow=c(2,2)) max.mean <- pmax(data.list$ests$m1, data.list$ests$m2) min.mean <- pmin(data.list$ests$m1, data.list$ests$m2) for(i in 1:4){ r0 <- r0.seq[i] plot(log(min.mean), (max.mean + r0) / (min.mean + r0), xlab="log min mean", ylab="moderated ratio", main=paste("r0=", r0) ) } dev.off() } data.list <- whw.score(data.list, n.perms=n.perms, p.val.mod=p.val.mod.v1, ratio.diff.mod=function(x,y){ return( sqrt(abs(x)) * y ) }, n.fdr.calc=n.fdr.calc, debug.iter=1, ratio.base=ratio.base, n.boot.rep=0, n.gene.boot=400 ) if(save.plots){ png(paste(file.str, "_medFDR.png", sep=""), width=900 ) par(mfrow=c(1,3)) fdr.curve <- T.fdr.curve(data.list, 300) plot(fdr.curve, type="l", xlab="gene ranking", ylab="FDR", main=paste( file.str, "median FDR")) fdr.curve <- T.fdr.curve(data.list, 900) plot(fdr.curve, type="l", xlab="gene ranking", ylab="FDR", main=paste( file.str, "median FDR")) fdr.curve <- T.fdr.curve(data.list) plot(fdr.curve, type="l", xlab="gene ranking", ylab="FDR", main=paste( file.str, "median FDR")) dev.off() png(paste(file.str, "_bsRanks.png", sep=""), width=500 ) #plot.bootstrap.ranks(data.list, 100) dev.off() } #output the top genes top.list <- sort(data.list$score$T, decreasing=TRUE, index.return=TRUE)$ix data = cbind(data, rowMeans(data[-1][, clabel]) / rowMeans(data[-1][, !clabel])) data = cbind(rownames(data), data) names(data)[1] = "RowNum" names(data)[ncol(data)] = "foldchange" data$fdr <- data.list$score$est.fdr[,2] data$fdrmean <- data.list$score$est.fdr[,4] data$t.pvals <- data.list$score$t.pvals data$T <- data.list$score$T data = data[!is.na(data[["fdr"]]), ] #data = data[data[["fdr"]] < 0.5, ] data = data[order(data[["fdr"]]), ] write.table(data[data[["foldchange"]] > 1, ], file=paste(file.str, "_up.csv", sep=""), row.names = FALSE, sep="\t") write.table(data[data[["foldchange"]] < 1, ], file=paste(file.str, "_down.csv", sep=""), row.names = FALSE, sep="\t") # top.list <- top.list[1:500] # write.table(data[top.list,], file=paste(file.str, "_top.csv", sep=""), row.names = FALSE, sep=",") # save(data.list, file=paste(file.str, ".Rdata", sep="")) }