############################################################################### ## Author: Andrea Riebler [andrea *.* riebler *a*t* math *.* ntnu *.* no] ## Mark Robinson [mark *.* robinson *a*t* imls *.* uzh *.* ch] ## Time-stamp: <07/01/2014> ## ## Description: ## Helper functions ############################################################################### library(GenomicRanges) mybarplot <- function(values, pos, offset, xlim, ylim, col, ...){ plot(1:1, type="n", xlim=xlim, ylim=ylim, ...) for(i in 1:length(values)){ polygon(c(pos[i] - offset, pos[i] + offset, pos[i] + offset, pos[i]-offset), c(0,0,values[i], values[i]), col=col[i]) } } ## adapted smoothScatter function from package graphics (added range.x to .smoothScatterCalcDensity) ## changed image to image.plot to get color legend mysmoothScatter <- function (x, y = NULL, nbin = 128, bandwidth, colramp = colorRampPalette(c("white", blues9)), nrpoints = 100, pch = ".", cex = 1, col = "black", transformation = function(x) x^0.25, postPlotHook = box, xlab = NULL, ylab = NULL, xlim, ylim, xaxs = par("xaxs"), yaxs = par("yaxs"), range.x=list(c(0,1),c(0,1)),...) { require(fields) if (!is.numeric(nrpoints) | (nrpoints < 0) | (length(nrpoints) != 1)) stop("'nrpoints' should be numeric scalar with value >= 0.") xlabel <- if (!missing(x)) deparse(substitute(x)) ylabel <- if (!missing(y)) deparse(substitute(y)) xy <- xy.coords(x, y, xlabel, ylabel) xlab <- if (is.null(xlab)) xy$xlab else xlab ylab <- if (is.null(ylab)) xy$ylab else ylab x <- cbind(xy$x, xy$y)[is.finite(xy$x) & is.finite(xy$y), , drop = FALSE] if (!missing(xlim)) { stopifnot(is.numeric(xlim), length(xlim) == 2, is.finite(xlim)) x <- x[min(xlim) <= x[, 1] & x[, 1] <= max(xlim), ] } else { xlim <- range(x[, 1]) } if (!missing(ylim)) { stopifnot(is.numeric(ylim), length(ylim) == 2, is.finite(ylim)) x <- x[min(ylim) <= x[, 2] & x[, 2] <= max(ylim), ] } else { ylim <- range(x[, 2]) } map <- grDevices:::.smoothScatterCalcDensity(x, nbin, bandwidth, range.x=range.x) xm <- map$x1 ym <- map$x2 dens <- map$fhat dens[] <- transformation(dens) image.plot(xm, ym, z = dens, col = colramp(256), xlab = xlab, ylab = ylab, xlim = xlim, ylim = ylim, xaxs = xaxs, yaxs = yaxs, ...) if (!is.null(postPlotHook)) postPlotHook() if (nrpoints > 0) { nrpoints <- min(nrow(x), ceiling(nrpoints)) stopifnot((nx <- length(xm)) == nrow(dens), (ny <- length(ym)) == ncol(dens)) ixm <- 1L + as.integer((nx - 1) * (x[, 1] - xm[1])/(xm[nx] - xm[1])) iym <- 1L + as.integer((ny - 1) * (x[, 2] - ym[1])/(ym[ny] - ym[1])) sel <- order(dens[cbind(ixm, iym)])[seq_len(nrpoints)] points(x[sel, ], pch = pch, cex = cex, col = col) } } ## getTruth_450k(...) ## Function to get 450k methylation estimates for provided windows. To obtain bin-specific methylation profiles, ## we average the methylation estimates from all CpG sites within 100bp (upstream and downstream; total ## window of 200bp) from the center of the provide windows (here 100bp bins) ## ## Arguments: ############## ## windows - for which we would like to have estimates ## data450k - GRanges object containing CpG wise methylation estimates ## ## Value: ############## ## ## A vector with methylation estimates (one for each window). getTruth_450k <- function(windows, data_450k){ windows.extend <- resize(windows,200,fix="center") foL <- findOverlaps(data_450k, windows.extend) #inds <- split( foL@matchMatrix[,1], foL@matchMatrix[,2]) inds <- split(foL@queryHits, foL@subjectHits) # get the mean of the beta values lying in one bin of interest b <- values(data_450k)$beta beta <- sapply(inds, function(u){mean(b[u])}) beta_res <- rep(NA, length(windows.extend)) # set the values for the windows for which we have an estimate ind <- as.integer(names(inds)) beta_res[ind] <- beta return(beta_res) } ## getBatman(...) ## Function to get the Batman results from one chromsome for the same windows ## ## Arguments: ############## ## windows - for which we would like to have estimates ## batmanTable - results table from Batman (modified Java code!). ## ## Value: ############## ## ## A vector with methylation estimates (one for each window). getBatman <- function(windows, batmanTable){ tmp <- batmanTable # delete the ";" from the mean value (TODO: change output format of Batman) tmp[,13] <- as.vector(tmp[,13]) tmp[,13] <- as.numeric(substr(tmp[,13], 1, nchar(tmp[,13])-1)) # generate a GRranges object for the bins used by batman chrs <- tolower(as.vector(tmp[,10])) batman_bins <- GRanges(seqnames=chrs, ranges=IRanges(start=tmp[,4], end=tmp[,5]), strand="+") # find overlaps with the windows (in the chromosome) we used fo_batman <- findOverlaps(windows, batman_bins) #inds_batman <- split(fo_batman@matchMatrix[,2], fo_batman@matchMatrix[,1]) inds_batman <- split(fo_batman@subjectHits, fo_batman@queryHits) # generate a vector including information for all windows len <- length(windows) batman_res <- matrix(NA, ncol=2, nrow=len) colnames(batman_res) <- c("mean", "sd") # set the values for the windows for which we have an estimate ind_batman <- as.integer(names(inds_batman)) batman_res[ind_batman,1] <- tmp$V13[unlist(inds_batman)] batman_res[ind_batman,2] <- tmp$V15[unlist(inds_batman)] rownames(batman_res) <- as.vector(seqnames(windows)) return(batman_res) } ## getMedips(...) ## Function to get BALM estimates for provided windows. To obtain bin-specific methylation profiles, ## we average the methylation estimates from all CpG sites within 100bp (upstream and downstream; total ## window of 200bp) from the center of the provide windows (here 100bp bins) ## ## Arguments: ############## ## windows - for which we would like to have estimates ## medipsOutput - object returned by the function MEDIPS.methylProfiling ## ## Value: ############## ## ## A vector with methylation estimates (one for each window). getMedips <- function(windows, medipsOutput){ medips_ranges <- GRanges(seqnames=medipsOutput$chr, ranges=IRanges(start=medipsOutput$start, end=medipsOutput$stop), strand="+") # find out which CpG overlaps which bin medips_fo <- findOverlaps(windows, medips_ranges) #medips_inds <- split(medips_fo@matchMatrix[,2], medips_fo@matchMatrix[,1]) medips_inds <- split(medips_fo@subjectHits, medips_fo@queryHits) medips_inds2 <- unlist(medips_inds) medipsOutput_sel <- medipsOutput[medips_inds2,] len <- length(windows) medips_res <- matrix(NA, ncol=2, nrow=len) colnames(medips_res) <- c("ams_mean", "rms_mean") # set the values for the windows for which we have an estimate ind_medips<- as.integer(names(medips_inds)) tmp_ams <- (medipsOutput_sel$ams_A/1000) tmp_rms <- (medipsOutput_sel$rms_A/1000) tmp_ams[tmp_ams > 1] <- 1 tmp_rms[tmp_rms > 1] <- 1 medips_res[ind_medips,1] <- tmp_ams medips_res[ind_medips,2] <- tmp_rms rownames(medips_res) <- as.vector(seqnames(windows)) return(medips_res) } ## getBalm(...) ## Function to get BALM estimates for provided windows. To obtain bin-specific methylation profiles, ## we average the methylation estimates from all CpG sites within 100bp (upstream and downstream; total ## window of 200bp) from the center of the provide windows (here 100bp bins) ## ## Arguments:f ############## ## windows - for which we would like to have estimates ## balmRes - GRanges object containing CpG wise methylation estimates ## ## Value: ############## ## ## A vector with methylation estimates (one for each window). getBalm <- function(windows, balm_res){ windows.extend <- resize(windows,200,fix="center") foL <- findOverlaps(balm_res, windows.extend) #inds <- split( foL@matchMatrix[,1], foL@matchMatrix[,2]) inds <- split(foL@queryHits, foL@subjectHits) b <- values(balm_res)$balm balm_meth <- sapply(inds, function(u){mean(b[u])}) balm_res <- rep(NA, length(windows.extend)) ind <- as.integer(names(inds)) balm_res[ind] <- balm_meth return(balm_res) } get_meanDisp <- function(bml_object, idx=1){ pT <- priorTab(bml_object) lev <- levels(pT[[1]]) ngroups <- length(lev) lim <- unlist(strsplit(lev, ",")) lci <- lim[seq(1, 2*ngroups, 2)] uci <- lim[seq(2, 2*ngroups, 2)] l <- sapply(1:ngroups, function(u){return(as.numeric(substr(lci[u], 2, nchar(lci[u]))))}) u <- sapply(1:ngroups, function(u){return(as.numeric(substr(uci[u], 1, nchar(uci[u])-1)))}) cuts <- c(l, u[ngroups]) bins <- cuts[1:(length(cuts)-1)]+diff(cuts)/2 mu <- pT[[3+idx]][1,]/pT[[3+idx]][2,] disp <- 1/pT[[3+idx]][1,] return(cbind(bins=bins, mean=mu, dispersion=disp)) } # for uniform prior for methylation level my_dmarginal <- function(mu, y1, y2, al, bl, W, cons, log=F){ if(log){ if(mu < 0 || mu > 1){ return(-Inf) } return(log( mu^{y1}/W* (1- (cons*(1-mu))/(bl + 1 +cons))^{-(al + y1 + y2)})) } else { if(mu < 0 || mu > 1){ return(0) } return(mu^{y1}/W* (1- (cons*(1-mu))/(bl + 1 +cons))^{-(al + y1 + y2)}) } } ## Mean Bias mean_bias <- function(truth, est){ return(mean(est - truth, na.rm=TRUE)) } ## Mean Dawid-Sebastiani score, defined as ## 1/n \sum_{i=1}^n (x_i - \mu_i)^2/\sigma_i^2 + 2*log(\sigma_i) mean_dss <- function(truth, est, est_sd){ est_sd[est_sd < 1e-6] <- 1e-6 return(mean(((truth-est)/est_sd)^2 + 2*log(est_sd), na.rm=TRUE)) } ## Mean-squared error mse <- function(truth, est, na.rm=TRUE, ...){ sqd_diff <- (truth-est)^2 return(mean(sqd_diff, na.rm=na.rm, ...)) } ## Mean-squared error score mses <- function(truth, est, est_sd, na.rm=TRUE, ...){ mses <- mean((est-truth)^2 + est_sd^2, na.rm=na.rm, ...) return(mses) } ## Absolute error mabse <- function(x, y, na.rm=TRUE, ...){ abs_diff <- abs((x-y)) return(mean(abs_diff, na.rm=na.rm, ...)) } ## Spearman-correlation for pairwise complete observations corS <- function(truth,est){ return(cor(truth,est, use="complete.obs", method="spearman")) } ## Get the number of points that are unequal to NA in truth and est numPoints <- function(truth,est){ return(sum(!is.na(truth) & !is.na(est))) } check_coverage <- function(truth, lb, ub){ su <- sum(truth >= lb & truth <= ub) return(c(su=su, n=length(lb), avg= su/length(lb))) } ## compute: mean bias, Spearman correlation, mse, if possible dss getMetrics <- function(truth, est, est_sd=NULL, group=NULL, digits=2){ if(length(truth) != length(est)){ stop("Truth and est must be of same length") } if(is.null(group)){ res <- c(npoints=numPoints(truth,est), bias=mean_bias(truth,est), mse=mse(truth,est), corS=corS(truth,est)) if(!is.null(est_sd)){ if(length(est) != length(est_sd)){ stop("The vector of standard errors has not the same length as the estimates") } res <- c(res, dss=mean_dss(truth, est, est_sd), mses=mses(truth, est, est_sd)) } cnames <- names(res) res <- matrix(res, nrow=1) colnames(res) <- cnames } else { if(length(group) != length(est)){ stop("The grouping vector must be of same length as the other vectors") } truthg <- split(truth, group) estg <- split(est, group) ngroups <- length(estg) res <- sapply(1:ngroups, function(u){c(npoints=numPoints(truthg[[u]],estg[[u]]), bias=mean_bias(truthg[[u]],estg[[u]]), mse=mse(truthg[[u]],estg[[u]]), corS=corS(truthg[[u]],estg[[u]]))}) if(!is.null(est_sd)){ if(length(est) != length(est_sd)){ stop("The vector of standard errors has not the same length as the estimates") } est_sdg <- split(est_sd, group) res <- rbind(res, sapply(1:ngroups, function(u){c(dss=mean_dss(truthg[[u]],estg[[u]], est_sdg[[u]]), mses=mses(truthg[[u]],estg[[u]], est_sdg[[u]]))})) } colnames(res) <- names(estg) } return(apply(res, 1, format, digits=digits, nsmall=digits)) } # # ## EXAMPLE # truth <- rnorm(20) # est <- rnorm(20) # est_sd <- rnorm(20, sd=0.1) # group <- rep(1:4, each=5) # # # getMetrics(truth=truth, est=est, group=group, digits=3) # getMetrics(truth=truth, est=est, group=NULL, digits=3) # getMetrics(truth=truth, est=est, est_sd=est_sd, group=group, digits=3) # getMetrics(truth=truth, est=est, est_sd=est_sd,group=NULL, digits=3) # # # checks # estg <- split(est, group) # truthg <- split(truth, group) # est_sdg <- split(est_sd, group) # mean_bias(truthg[[4]], estg[[4]]) # mean_dss(truthg[[2]], estg[[2]], est_sdg[[2]])