## Get the number of data sets
nDatasets <- length(ddsList)
## Determine the number of rows and columns in facetted plots
## (first element = columns, second element = rows)
colRow <- grDevices::n2mfrow(length(ddsList))
## Set height/width of each panel in multi-panel plots
panelSize <- 4
## Initialize plot list
plots <- list()
## Define plot theme
thm <- theme_bw() +
theme(axis.text = element_text(size = 15),
axis.title = element_text(size = 14),
strip.text = element_text(size = 15))
## ----------------------- Calculate dispersions ----------------------- ##
if (!quiet)
message("Calculating dispersions. This can take some time, please be patient.")
if (is.finite(maxNForDisp)) {
msg <- paste0("If there are more than ", maxNForDisp,
" samples in a data set, the dispersions are calculated using ",
maxNForDisp, " randomly selected samples.")
} else {
msg <- ""
}
obj <- calculateDispersionsddsList(ddsList = ddsList,
maxNForDisp = maxNForDisp, seed = seed)
## --------------- Calculate between-sample correlations --------------- ##
if (!quiet)
message("Calculating sample correlations.")
sampleCorrDF <- calculateSampleCorrs(ddsList = obj,
maxNForCorr = maxNForCorr, seed = seed)
## -------------- Calculate between-feature correlations --------------- ##
if (!quiet)
message("Calculating feature correlations.")
featureCorrDF <- calculateFeatureCorrs(ddsList = obj,
maxNForCorr = maxNForCorr, seed = seed)
## ----------------- Summarize sample characteristics ------------------ ##
if (!quiet)
message("Summarizing calculated values.")
sampleDF <- lapply(obj, function(x) {
data.frame(
Libsize = colSums(x$dge$counts),
Fraczero = colMeans(x$dge$counts == 0),
TMM = x$dge$samples$norm.factors
) %>% dplyr::mutate(EffLibsize = Libsize * TMM)
})
ns <- sapply(sampleDF, nrow)
sampleDF <- do.call(rbind, sampleDF) %>%
dplyr::mutate(dataset = rep(names(sampleDF), ns))
## ---------------- Summarize feature characteristics ------------------ ##
featureDF <- lapply(obj, function(x) {
data.frame(
Tagwise = sqrt(x$dge$tagwise.dispersion),
Common = sqrt(x$dge$common.dispersion),
Trend = sqrt(x$dge$trended.dispersion),
AveLogCPM = x$dge$AveLogCPM,
AveLogCPMDisp = x$dge$AveLogCPMDisp,
average_log2_cpm = apply(edgeR::cpm(x$dge, prior.count = 2, log = TRUE), 1, mean),
variance_log2_cpm = apply(edgeR::cpm(x$dge, prior.count = 2, log = TRUE), 1, var),
Fraczero = rowMeans(x$dge$counts == 0),
dispGeneEst = rowData(x$dds)$dispGeneEst,
dispFit = rowData(x$dds)$dispFit,
dispFinal = rowData(x$dds)$dispersion,
baseMeanDisp = rowData(x$dds)$baseMeanDisp,
baseMean = rowData(x$dds)$baseMean
)
})
ns <- sapply(featureDF, nrow)
featureDF <- do.call(rbind, featureDF) %>%
dplyr::mutate(dataset = rep(names(featureDF), ns))
## ----------------- Summarize data set characteristics ---------------- ##
datasetDF <- do.call(rbind, lapply(obj, function(x) {
data.frame(
prior_df = paste0("prior.df = ", round(x$dge$prior.df, 2)),
nVars = nrow(x$dge$counts),
nSamples = ncol(x$dge$counts)
)
})) %>%
dplyr::mutate(AveLogCPMDisp = 0.8 * max(featureDF$AveLogCPMDisp)) %>%
dplyr::mutate(Tagwise = 0.9 * max(featureDF$Tagwise)) %>%
dplyr::mutate(dataset = names(obj))
This report compares the characteristics of four subsets of samples from different body subsites in the Human Microbiome Project 16S marker gene sequencing data.
The following model formulae were used in the dispersion calculations for the different data sets. Note that if a count matrix or data frame was provided in place of a DESeqDataSet for some data set, the corresponding design formula is set to ~1
, thus assuming that all samples are to be treated as replicates.
## Print design information
for (ds in names(ddsList)) {
cat(ds, ": ", as.character(DESeq2::design(ddsList[[ds]])), "\n")
}
Mid vagina : ~ group
Left Antecubital fossa : ~ group
Palatine Tonsils : ~ group
Saliva : ~ group
These bar plots show the number of samples (columns) and features (rows) in each data set.
plots[["nSamples"]] <-
ggplot(datasetDF, aes(x = dataset, y = nSamples, fill = dataset)) +
geom_bar(stat = "identity", alpha = 0.5) +
xlab("") + ylab("Number of samples (columns)") +
thm + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
plots[["nSamples"]]
plots[["nVariables"]] <-
ggplot(datasetDF, aes(x = dataset, y = nVars, fill = dataset)) +
geom_bar(stat = "identity", alpha = 0.5) +
xlab("") + ylab("Number of features (rows)") +
thm + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
plots[["nVariables"]]
Disperson/BCV plots show the association between the average abundance and the dispersion or “biological coefficient of variation” (sqrt(dispersion)), as calculated by edgeR
(Robinson, McCarthy, and Smyth 2010) and DESeq2
(Love, Huber, and Anders 2014). In the edgeR
plot, the estimate of the prior degrees of freedom is indicated.
The black dots represent the tagwise dispersion estimates, the red line the common dispersion and the blue curve represents the trended dispersion estimates. For further information about the dispersion estimation in edgeR
, see Chen, Lun, and Smyth (2014).
plots[["BCVedgeR"]] <-
ggplot(featureDF %>% dplyr::arrange(AveLogCPMDisp),
aes(x = AveLogCPMDisp, y = Tagwise)) +
geom_point(size = 0.25, alpha = 0.5) +
facet_wrap(~dataset, nrow = colRow[2]) +
geom_line(aes(y = Trend), color = "blue", size = 1.5) +
geom_line(aes(y = Common), color = "red", size = 1.5) +
geom_text(data = datasetDF, aes(label = prior_df)) +
xlab("Average log CPM") + ylab("Biological coefficient of variation") +
thm
plots[["BCVedgeR"]]
The table below contains a range of quantitative statistics and test results evaluating the degree of similarity between each pair of data sets based on the average log CPM and tagwise dispersion. The following statistics and test results are included:
if (calculateStatistics)
makeDF(df = featureDF, column = c("AveLogCPMDisp", "Tagwise"),
permutationPvalues = permutationPvalues, nPermutations = nPermutations,
subsampleSize = subsampleSize, seed = seed, kmin = kmin, kfrac = kfrac)
The black dots are the gene-wise dispersion estimates, the red curve the fitted mean-dispersion relationship and the blue circles represent the final dispersion estimates.For further information about the dispersion estimation in DESeq2
, see Love, Huber, and Anders (2014).
plots[["dispersionDESeq2"]] <-
ggplot(featureDF %>% dplyr::arrange(baseMeanDisp),
aes(x = baseMeanDisp, y = dispGeneEst)) +
geom_point(size = 0.25, alpha = 0.5) +
facet_wrap(~dataset, nrow = colRow[2]) + scale_x_log10() + scale_y_log10() +
geom_point(aes(y = dispFinal), color = "lightblue", shape = 21) +
geom_line(aes(y = dispFit), color = "red", size = 1.5) +
xlab("Base mean") + ylab("Dispersion") +
thm
plots[["dispersionDESeq2"]]
The table below contains a range of quantitative statistics and test results evaluating the degree of similarity between each pair of data sets based on the base mean and genewise dispersion. The following statistics and test results are included:
if (calculateStatistics)
makeDF(df = featureDF, column = c("baseMeanDisp", "dispGeneEst"),
permutationPvalues = permutationPvalues, nPermutations = nPermutations,
subsampleSize = subsampleSize, seed = seed, kmin = kmin, kfrac = kfrac)
These scatter plots show the relation between the empirical mean and variance of the features. The difference between these mean-variance plots and the mean-dispersion plots above is that the plots in this section do not take the information about the experimental design and sample grouping into account, but simply display the mean and variance of log2(CPM) estimates across all samples, calculated using the cpm
function from edgeR
(Robinson, McCarthy, and Smyth 2010), with a prior count of 2.
plots[["meanVarSepScatter"]] <-
ggplot(featureDF, aes(x = average_log2_cpm, y = variance_log2_cpm)) +
geom_point(size = 0.75, alpha = 0.5) +
facet_wrap(~dataset, nrow = colRow[2]) +
xlab("Mean of log2(CPM)") + ylab("Variance of log2(CPM)") +
thm
plots[["meanVarSepScatter"]]
plots[["meanVarOverlaidScatter"]] <-
ggplot(featureDF, aes(x = average_log2_cpm, y = variance_log2_cpm, color = dataset)) +
geom_point(size = 0.75, alpha = 0.5) +
xlab("Mean of log2(CPM)") + ylab("Variance of log2(CPM)") +
thm
plots[["meanVarOverlaidScatter"]]
The table below contains a range of quantitative statistics and test results evaluating the degree of similarity between each pair of data sets based on the average and variance of the log CPM. The following statistics and test results are included:
if (calculateStatistics)
makeDF(df = featureDF, column = c("average_log2_cpm", "variance_log2_cpm"),
permutationPvalues = permutationPvalues, nPermutations = nPermutations,
subsampleSize = subsampleSize, seed = seed, kmin = kmin, kfrac = kfrac)
These plots illustrate the distribution of the total read count per sample, i.e., the column sums of the respective data matrices.
plots[["libsizeSepHist"]] <-
ggplot(sampleDF, aes(x = Libsize)) + geom_histogram(bins = 30) +
facet_wrap(~dataset, nrow = colRow[2]) +
xlab("Library size") + thm
plots[["libsizeSepHist"]]
plots[["libsizeOverlaidHist"]] <-
ggplot(sampleDF, aes(x = Libsize, group = dataset, fill = dataset)) +
geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
xlab("Library size") + thm
plots[["libsizeOverlaidHist"]]
plots[["libsizeDensity"]] <-
ggplot(sampleDF, aes(x = Libsize, group = dataset, color = dataset)) +
geom_line(alpha = 0.5, size = 1.5, stat = "density") +
xlab("Library size") + thm
plots[["libsizeDensity"]]
plots[["libsizeDensityFilled"]] <-
ggplot(sampleDF, aes(x = Libsize, group = dataset, fill = dataset)) +
geom_density(alpha = 0.5) +
xlab("Library size") + thm
plots[["libsizeDensityFilled"]]
plots[["libsizeBox"]] <-
ggplot(sampleDF, aes(x = dataset, y = Libsize, color = dataset)) +
geom_boxplot(outlier.size = -1, alpha = 0.5) +
geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) +
ylab("Library size") + xlab("") + thm +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["libsizeBox"]]
plots[["libsizeViolin"]] <-
ggplot(sampleDF, aes(x = dataset, y = Libsize, fill = dataset)) +
geom_violin(alpha = 0.5) +
ylab("Library size") + xlab("") + thm +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["libsizeViolin"]]
plots[["libsizeEcdf"]] <-
ggplot(sampleDF, aes(x = Libsize, group = dataset, color = dataset)) +
stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") +
xlab("Library size") + thm
plots[["libsizeEcdf"]]
The table below contains a range of quantitative statistics and test results evaluating the degree of similarity between each pair of data sets based on the library size. The following statistics and test results are included:
It should be noted that the reported statistics in some cases are highly dependent on the number of underlying observations, and especially with a large number of observations, very small p-values can correspond to small effective differences. Thus, interpretation should ideally be guided by both quantitative and qualitative, visual information.
if (calculateStatistics)
makeDF(df = sampleDF, column = "Libsize",
permutationPvalues = permutationPvalues, nPermutations = nPermutations,
subsampleSize = subsampleSize, seed = seed, kmin = kmin, kfrac = kfrac)
The plots below show the distribution of the TMM normalization factors (Robinson and Oshlack 2010), intended to adjust for differences in RNA composition, as calculated by edgeR
(Robinson, McCarthy, and Smyth 2010).
plots[["tmmSepHist"]] <-
ggplot(sampleDF, aes(x = TMM)) + geom_histogram(bins = 30) +
facet_wrap(~dataset, nrow = colRow[2]) +
xlab("TMM normalization factor") + thm
plots[["tmmSepHist"]]
plots[["tmmOverlaidHist"]] <-
ggplot(sampleDF, aes(x = TMM, group = dataset, fill = dataset)) +
geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
xlab("TMM normalization factor") + thm
plots[["tmmOverlaidHist"]]
plots[["tmmDensity"]] <-
ggplot(sampleDF, aes(x = TMM, group = dataset, color = dataset)) +
geom_line(alpha = 0.5, size = 1.5, stat = "density") +
xlab("TMM normalization factor") + thm
plots[["tmmDensity"]]
plots[["tmmDensityFilled"]] <-
ggplot(sampleDF, aes(x = TMM, group = dataset, fill = dataset)) +
geom_density(alpha = 0.5) +
xlab("TMM normalization factor") + thm
plots[["tmmDensityFilled"]]
plots[["tmmBox"]] <-
ggplot(sampleDF, aes(x = dataset, y = TMM, color = dataset)) +
geom_boxplot(outlier.size = -1, alpha = 0.5) +
geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) +
ylab("TMM normalization factor") + xlab("") + thm +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["tmmBox"]]
plots[["tmmViolin"]] <-
ggplot(sampleDF, aes(x = dataset, y = TMM, fill = dataset)) +
geom_violin(alpha = 0.5) +
ylab("TMM normalization factor") + xlab("") + thm +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["tmmViolin"]]
plots[["tmmEcdf"]] <-
ggplot(sampleDF, aes(x = TMM, group = dataset, color = dataset)) +
stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") +
xlab("TMM normalization factor") + thm
plots[["tmmEcdf"]]
The table below contains a range of quantitative statistics and test results evaluating the degree of similarity between each pair of data sets based on the TMM factor. The following statistics and test results are included:
It should be noted that the reported statistics in some cases are highly dependent on the number of underlying observations, and especially with a large number of observations, very small p-values can correspond to small effective differences. Thus, interpretation should ideally be guided by both quantitative and qualitative, visual information.
if (calculateStatistics)
makeDF(df = sampleDF, column = "TMM",
permutationPvalues = permutationPvalues, nPermutations = nPermutations,
subsampleSize = subsampleSize, seed = seed, kmin = kmin, kfrac = kfrac)
These plots show the distribution of the “effective library sizes”, defined as the total count per sample multiplied by the corresponding TMM normalization factor.
plots[["effLibsizeSepHist"]] <-
ggplot(sampleDF, aes(x = EffLibsize)) + geom_histogram(bins = 30) +
facet_wrap(~dataset, nrow = colRow[2]) +
xlab("Effective library size") + thm
plots[["effLibsizeSepHist"]]
plots[["effLibsizeOverlaidHist"]] <-
ggplot(sampleDF, aes(x = EffLibsize, group = dataset, fill = dataset)) +
geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
xlab("Effective library size") + thm
plots[["effLibsizeOverlaidHist"]]
plots[["effLibsizeDensity"]] <-
ggplot(sampleDF, aes(x = EffLibsize, group = dataset, color = dataset)) +
geom_line(alpha = 0.5, size = 1.5, stat = "density") +
xlab("Effective library size") + thm
plots[["effLibsizeDensity"]]
plots[["effLibsizeDensityFilled"]] <-
ggplot(sampleDF, aes(x = EffLibsize, group = dataset, fill = dataset)) +
geom_density(alpha = 0.5) +
xlab("Effective library size") + thm
plots[["effLibsizeDensityFilled"]]
plots[["effLibsizeBox"]] <-
ggplot(sampleDF, aes(x = dataset, y = EffLibsize, color = dataset)) +
geom_boxplot(outlier.size = -1, alpha = 0.5) +
geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) +
ylab("Effective library size") + xlab("") + thm +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["effLibsizeBox"]]
plots[["effLibsizeViolin"]] <-
ggplot(sampleDF, aes(x = dataset, y = EffLibsize, fill = dataset)) +
geom_violin(alpha = 0.5) +
ylab("Effective library size") + xlab("") + thm +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["effLibsizeViolin"]]
plots[["effLibsizeEcdf"]] <-
ggplot(sampleDF, aes(x = EffLibsize, group = dataset, color = dataset)) +
stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") +
xlab("Effective library size") + thm
plots[["effLibsizeEcdf"]]
The table below contains a range of quantitative statistics and test results evaluating the degree of similarity between each pair of data sets based on the effective library size. The following statistics and test results are included:
It should be noted that the reported statistics in some cases are highly dependent on the number of underlying observations, and especially with a large number of observations, very small p-values can correspond to small effective differences. Thus, interpretation should ideally be guided by both quantitative and qualitative, visual information.
if (calculateStatistics)
makeDF(df = sampleDF, column = "EffLibsize",
permutationPvalues = permutationPvalues, nPermutations = nPermutations,
subsampleSize = subsampleSize, seed = seed, kmin = kmin, kfrac = kfrac)
The plots in this section show the distribution of average abundance values for the features. The abundances are log CPM values calculated by edgeR
.
plots[["logCPMSepHist"]] <-
ggplot(featureDF, aes(x = AveLogCPM)) + geom_histogram(bins = 30) +
facet_wrap(~dataset, nrow = colRow[2]) +
xlab("Average log CPM") + thm
plots[["logCPMSepHist"]]
plots[["logCPMOverlaidHist"]] <-
ggplot(featureDF, aes(x = AveLogCPM, group = dataset, fill = dataset)) +
geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
xlab("Average log CPM") + thm
plots[["logCPMOverlaidHist"]]
plots[["logCPMDensity"]] <-
ggplot(featureDF, aes(x = AveLogCPM, group = dataset, color = dataset)) +
geom_line(alpha = 0.5, size = 1.5, stat = "density") +
xlab("Average log CPM") + thm
plots[["logCPMDensity"]]
plots[["logCPMDensityFilled"]] <-
ggplot(featureDF, aes(x = AveLogCPM, group = dataset, fill = dataset)) +
geom_density(alpha = 0.5) +
xlab("Average log CPM") + thm
plots[["logCPMDensityFilled"]]
plots[["logCPMBox"]] <-
ggplot(featureDF, aes(x = dataset, y = AveLogCPM, color = dataset)) +
geom_boxplot(outlier.size = -1, alpha = 0.5) +
geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) +
ylab("Average log CPM") + xlab("") + thm +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["logCPMBox"]]
plots[["logCPMViolin"]] <-
ggplot(featureDF, aes(x = dataset, y = AveLogCPM, fill = dataset)) +
geom_violin(alpha = 0.5) +
ylab("Average log CPM") + xlab("") + thm +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["logCPMViolin"]]
plots[["logCPMEcdf"]] <-
ggplot(featureDF, aes(x = AveLogCPM, group = dataset, color = dataset)) +
stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") +
xlab("Average log CPM") + thm
plots[["logCPMEcdf"]]
The table below contains a range of quantitative statistics and test results evaluating the degree of similarity between each pair of data sets based on the average log CPM. The following statistics and test results are included:
It should be noted that the reported statistics in some cases are highly dependent on the number of underlying observations, and especially with a large number of observations, very small p-values can correspond to small effective differences. Thus, interpretation should ideally be guided by both quantitative and qualitative, visual information.
if (calculateStatistics)
makeDF(df = featureDF, column = "AveLogCPM",
permutationPvalues = permutationPvalues, nPermutations = nPermutations,
subsampleSize = subsampleSize, seed = seed, kmin = kmin, kfrac = kfrac)
These plots show the distribution of the fraction of zeros observed per sample (column) in the count matrices.
plots[["fraczeroSampleSepHist"]] <-
ggplot(sampleDF, aes(x = Fraczero)) + geom_histogram(bins = 30) +
facet_wrap(~dataset, nrow = colRow[2]) +
xlab("Fraction zeros per sample") + thm
plots[["fraczeroSampleSepHist"]]
plots[["fraczeroSampleOverlaidHist"]] <-
ggplot(sampleDF, aes(x = Fraczero, group = dataset, fill = dataset)) +
geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
xlab("Fraction zeros per sample") + thm
plots[["fraczeroSampleOverlaidHist"]]
plots[["fraczeroSampleDensity"]] <-
ggplot(sampleDF, aes(x = Fraczero, group = dataset, color = dataset)) +
geom_line(alpha = 0.5, size = 1.5, stat = "density") +
xlab("Fraction zeros per sample") + thm
plots[["fraczeroSampleDensity"]]
plots[["fraczeroSampleDensityFilled"]] <-
ggplot(sampleDF, aes(x = Fraczero, group = dataset, fill = dataset)) +
geom_density(alpha = 0.5) +
xlab("Fraction zeros per sample") + thm
plots[["fraczeroSampleDensityFilled"]]
plots[["fraczeroSampleBox"]] <-
ggplot(sampleDF, aes(x = dataset, y = Fraczero, color = dataset)) +
geom_boxplot(outlier.size = -1, alpha = 0.5) +
geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) +
ylab("Fraction zeros per sample") + xlab("") + thm +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["fraczeroSampleBox"]]
plots[["fraczeroSampleViolin"]] <-
ggplot(sampleDF, aes(x = dataset, y = Fraczero, fill = dataset)) +
geom_violin(alpha = 0.5) +
ylab("Fraction zeros per sample") + xlab("") + thm +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["fraczeroSampleViolin"]]
plots[["fraczeroSampleEcdf"]] <-
ggplot(sampleDF, aes(x = Fraczero, group = dataset, color = dataset)) +
stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") +
xlab("Fraction zeros per sample") + thm
plots[["fraczeroSampleEcdf"]]
The table below contains a range of quantitative statistics and test results evaluating the degree of similarity between each pair of data sets based on the fraction zeros. The following statistics and test results are included:
It should be noted that the reported statistics in some cases are highly dependent on the number of underlying observations, and especially with a large number of observations, very small p-values can correspond to small effective differences. Thus, interpretation should ideally be guided by both quantitative and qualitative, visual information.
if (calculateStatistics)
makeDF(df = sampleDF, column = "Fraczero",
permutationPvalues = permutationPvalues, nPermutations = nPermutations,
subsampleSize = subsampleSize, seed = seed, kmin = kmin, kfrac = kfrac)
These plots illustrate the distribution of the fraction of zeros observed per feature (row) in the count matrices.
plots[["fraczeroFeatureSepHist"]] <-
ggplot(featureDF, aes(x = Fraczero)) + geom_histogram(bins = 30) +
facet_wrap(~dataset, nrow = colRow[2]) +
xlab("Fraction zeros per feature") + thm
plots[["fraczeroFeatureSepHist"]]
plots[["fraczeroFeatureOverlaidHist"]] <-
ggplot(featureDF, aes(x = Fraczero, group = dataset, fill = dataset)) +
geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
xlab("Fraction zeros per feature") + thm
plots[["fraczeroFeatureOverlaidHist"]]
plots[["fraczeroFeatureDensity"]] <-
ggplot(featureDF, aes(x = Fraczero, group = dataset, color = dataset)) +
geom_line(alpha = 0.5, size = 1.5, stat = "density") +
xlab("Fraction zeros per feature") + thm
plots[["fraczeroFeatureDensity"]]
plots[["fraczeroFeatureDensityFilled"]] <-
ggplot(featureDF, aes(x = Fraczero, group = dataset, fill = dataset)) +
geom_density(alpha = 0.5) +
xlab("Fraction zeros per feature") + thm
plots[["fraczeroFeatureDensityFilled"]]
plots[["fraczeroFeatureBox"]] <-
ggplot(featureDF, aes(x = dataset, y = Fraczero, color = dataset)) +
geom_boxplot(outlier.size = -1, alpha = 0.5) +
geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) +
ylab("Fraction zeros per feature") + xlab("") + thm +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["fraczeroFeatureBox"]]
plots[["fraczeroFeatureViolin"]] <-
ggplot(featureDF, aes(x = dataset, y = Fraczero, fill = dataset)) +
geom_violin(alpha = 0.5) +
ylab("Fraction zeros per feature") + xlab("") + thm +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["fraczeroFeatureViolin"]]
plots[["fraczeroFeatureEcdf"]] <-
ggplot(featureDF, aes(x = Fraczero, group = dataset, color = dataset)) +
stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") +
xlab("Fraction zeros per feature") + thm
plots[["fraczeroFeatureEcdf"]]
The table below contains a range of quantitative statistics and test results evaluating the degree of similarity between each pair of data sets based on the fraction zeros. The following statistics and test results are included:
It should be noted that the reported statistics in some cases are highly dependent on the number of underlying observations, and especially with a large number of observations, very small p-values can correspond to small effective differences. Thus, interpretation should ideally be guided by both quantitative and qualitative, visual information.
if (calculateStatistics)
makeDF(df = featureDF, column = "Fraczero",
permutationPvalues = permutationPvalues, nPermutations = nPermutations,
subsampleSize = subsampleSize, seed = seed, kmin = kmin, kfrac = kfrac)
The plots below show the distribution of Spearman correlation coefficients for pairs of samples, calculated from the log(CPM) values obtained via the cpm
function from edgeR
, with a prior.count of 2. If there are more than 500 samples in a data set, the pairwise correlations between 500 randomly selected samples are shown.
plots[["sampleCorrSepHist"]] <-
ggplot(sampleCorrDF, aes(x = Correlation)) + geom_histogram(bins = 30) +
facet_wrap(~dataset, nrow = colRow[2]) +
xlab("Sample-sample correlation") + thm
plots[["sampleCorrSepHist"]]
plots[["sampleCorrOverlaidHist"]] <-
ggplot(sampleCorrDF, aes(x = Correlation, group = dataset, fill = dataset)) +
geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
xlab("Sample-sample correlation") + thm
plots[["sampleCorrOverlaidHist"]]
plots[["sampleCorrDensity"]] <-
ggplot(sampleCorrDF, aes(x = Correlation, group = dataset, color = dataset)) +
geom_line(alpha = 0.5, size = 1.5, stat = "density") +
xlab("Sample-sample correlation") + thm
plots[["sampleCorrDensity"]]
plots[["sampleCorrDensityFilled"]] <-
ggplot(sampleCorrDF, aes(x = Correlation, group = dataset, fill = dataset)) +
geom_density(alpha = 0.5) +
xlab("Sample-sample correlation") + thm
plots[["sampleCorrDensityFilled"]]
plots[["sampleCorrBox"]] <-
ggplot(sampleCorrDF, aes(x = dataset, y = Correlation, color = dataset)) +
geom_boxplot(outlier.size = -1, alpha = 0.5) +
geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) +
ylab("Sample-sample correlation") + xlab("") + thm +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["sampleCorrBox"]]
plots[["sampleCorrViolin"]] <-
ggplot(sampleCorrDF, aes(x = dataset, y = Correlation, fill = dataset)) +
geom_violin(alpha = 0.5) +
ylab("Sample-sample correlation") + xlab("") + thm +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["sampleCorrViolin"]]
plots[["sampleCorrEcdf"]] <-
ggplot(sampleCorrDF, aes(x = Correlation, group = dataset, color = dataset)) +
stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") +
xlab("Sample-sample correlation") + thm
plots[["sampleCorrEcdf"]]
The table below contains a range of quantitative statistics and test results evaluating the degree of similarity between each pair of data sets based on the Spearman correlation. The following statistics and test results are included:
It should be noted that the reported statistics in some cases are highly dependent on the number of underlying observations, and especially with a large number of observations, very small p-values can correspond to small effective differences. Thus, interpretation should ideally be guided by both quantitative and qualitative, visual information.
if (calculateStatistics)
makeDF(df = sampleCorrDF, column = "Correlation",
permutationPvalues = permutationPvalues, nPermutations = nPermutations,
subsampleSize = subsampleSize, seed = seed, kmin = kmin, kfrac = kfrac)
These plots illustrate the distribution of Spearman correlation coefficients for pairs of features, calculated from the log(CPM) values obtained via the cpm
function from edgeR
, with a prior.count of 2. Only non-constant features are considered, and if there are more than 500 such features in a data set, the pairwise correlations between 500 randomly selected features are shown.
plots[["featureCorrSepHist"]] <-
ggplot(featureCorrDF, aes(x = Correlation)) + geom_histogram(bins = 30) +
facet_wrap(~dataset, nrow = colRow[2]) +
xlab("Feature-feature correlation") + thm
plots[["featureCorrSepHist"]]
plots[["featureCorrOverlaidHist"]] <-
ggplot(featureCorrDF, aes(x = Correlation, group = dataset, fill = dataset)) +
geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
xlab("Feature-feature correlation") + thm
plots[["featureCorrOverlaidHist"]]
plots[["featureCorrDensity"]] <-
ggplot(featureCorrDF, aes(x = Correlation, group = dataset, color = dataset)) +
geom_line(alpha = 0.5, size = 1.5, stat = "density") +
xlab("Feature-feature correlation") + thm
plots[["featureCorrDensity"]]
plots[["featureCorrDensityFilled"]] <-
ggplot(featureCorrDF, aes(x = Correlation, group = dataset, fill = dataset)) +
geom_density(alpha = 0.5) +
xlab("Feature-feature correlation") + thm
plots[["featureCorrDensityFilled"]]
plots[["featureCorrBox"]] <-
ggplot(featureCorrDF, aes(x = dataset, y = Correlation, color = dataset)) +
geom_boxplot(outlier.size = -1, alpha = 0.5) +
geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) +
ylab("Feature-feature correlation") + xlab("") + thm +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["featureCorrBox"]]
plots[["featureCorrViolin"]] <-
ggplot(featureCorrDF, aes(x = dataset, y = Correlation, fill = dataset)) +
geom_violin(alpha = 0.5) +
ylab("Feature-feature correlation") + xlab("") + thm +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["featureCorrViolin"]]
plots[["featureCorrEcdf"]] <-
ggplot(featureCorrDF, aes(x = Correlation, group = dataset, color = dataset)) +
stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") +
xlab("Feature-feature correlation") + thm
plots[["featureCorrEcdf"]]
The table below contains a range of quantitative statistics and test results evaluating the degree of similarity between each pair of data sets based on the Spearman correlation. The following statistics and test results are included:
It should be noted that the reported statistics in some cases are highly dependent on the number of underlying observations, and especially with a large number of observations, very small p-values can correspond to small effective differences. Thus, interpretation should ideally be guided by both quantitative and qualitative, visual information.
if (calculateStatistics)
makeDF(df = featureCorrDF, column = "Correlation",
permutationPvalues = permutationPvalues, nPermutations = nPermutations,
subsampleSize = subsampleSize, seed = seed, kmin = kmin, kfrac = kfrac)
These scatter plots show the association between the total count (column sums) and the fraction of zeros observed per sample.
plots[["libsizeFraczeroSepScatter"]] <-
ggplot(sampleDF, aes(x = Libsize, y = Fraczero)) +
geom_point(size = 1, alpha = 0.5) +
facet_wrap(~dataset, nrow = colRow[2]) +
xlab("Library size") + ylab("Fraction zeros") + thm
plots[["libsizeFraczeroSepScatter"]]
plots[["libsizeFraczeroOverlaidScatter"]] <-
ggplot(sampleDF, aes(x = Libsize, y = Fraczero, color = dataset)) +
geom_point(size = 1, alpha = 0.5) +
geom_line(stat = "smooth", method = "loess", size = 1, alpha = 0.5) +
xlab("Library size") + ylab("Fraction zeros") + thm
plots[["libsizeFraczeroOverlaidScatter"]]
The table below contains a range of quantitative statistics and test results evaluating the degree of similarity between each pair of data sets based on the library size and fraction of zeros. The following statistics and test results are included:
if (calculateStatistics)
makeDF(df = sampleDF, column = c("Libsize", "Fraczero"),
permutationPvalues = permutationPvalues, nPermutations = nPermutations,
subsampleSize = subsampleSize, seed = seed, kmin = kmin, kfrac = kfrac)
These scatter plots show the association between the average abundance and the fraction of zeros observed per feature. The abundance is defined as the log(CPM) values as calculated by edgeR
.
plots[["logCPMFraczeroSepScatter"]] <-
ggplot(featureDF, aes(x = AveLogCPM, y = Fraczero)) +
geom_point(size = 0.75, alpha = 0.5) +
facet_wrap(~dataset, nrow = colRow[2]) +
xlab("Average log CPM") + ylab("Fraction zeros") + thm
plots[["logCPMFraczeroSepScatter"]]
plots[["logCPMFraczeroOverlaidScatter"]] <-
ggplot(featureDF, aes(x = AveLogCPM, y = Fraczero, color = dataset)) +
geom_point(size = 0.75, alpha = 0.5) +
xlab("Average log CPM") + ylab("Fraction zeros") + thm
plots[["logCPMFraczeroOverlaidScatter"]]
The table below contains a range of quantitative statistics and test results evaluating the degree of similarity between each pair of data sets based on the average log CPM and fraction of zeros. The following statistics and test results are included:
if (calculateStatistics)
makeDF(df = featureDF, column = c("AveLogCPM", "Fraczero"),
permutationPvalues = permutationPvalues, nPermutations = nPermutations,
subsampleSize = subsampleSize, seed = seed, kmin = kmin, kfrac = kfrac)
sessionInfo()
## R version 3.4.0 (2017-04-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/local/R/R-3.4.0/lib/libRblas.so
## LAPACK: /usr/local/R/R-3.4.0/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_CA.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_CA.UTF-8
## [5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_CA.UTF-8
## [7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] bindrcpp_0.2 countsimQC_0.5.2
## [3] DESeq2_1.16.1 SummarizedExperiment_1.6.3
## [5] DelayedArray_0.2.7 matrixStats_0.52.2
## [7] Biobase_2.36.2 GenomicRanges_1.28.3
## [9] GenomeInfoDb_1.12.2 IRanges_2.10.2
## [11] S4Vectors_0.14.3 BiocGenerics_0.22.0
##
## loaded via a namespace (and not attached):
## [1] tidyr_0.7.1 edgeR_3.19.0
## [3] jsonlite_1.5 bit64_0.9-7
## [5] splines_3.4.0 Formula_1.2-1
## [7] assertthat_0.2.0 latticeExtra_0.6-28
## [9] blob_1.1.0 GenomeInfoDbData_0.99.0
## [11] yaml_2.1.14 RSQLite_2.0
## [13] backports_1.1.0 lattice_0.20-35
## [15] limma_3.32.2 glue_1.1.1
## [17] digest_0.6.12 RColorBrewer_1.1-2
## [19] XVector_0.16.0 checkmate_1.8.2
## [21] colorspace_1.3-2 htmltools_0.3.6
## [23] Matrix_1.2-10 plyr_1.8.4
## [25] XML_3.98-1.9 pkgconfig_2.0.1
## [27] genefilter_1.58.1 zlibbioc_1.22.0
## [29] purrr_0.2.3 xtable_1.8-2
## [31] scales_0.4.1 BiocParallel_1.10.1
## [33] htmlTable_1.9 tibble_1.3.3
## [35] annotate_1.54.0 ggplot2_2.2.1
## [37] DT_0.2 nnet_7.3-12
## [39] lazyeval_0.2.0 survival_2.41-3
## [41] magrittr_1.5 evaluate_0.10.1
## [43] memoise_1.1.0 foreign_0.8-69
## [45] tools_3.4.0 data.table_1.10.4
## [47] stringr_1.2.0 munsell_0.4.3
## [49] locfit_1.5-9.1 cluster_2.0.6
## [51] AnnotationDbi_1.38.1 compiler_3.4.0
## [53] caTools_1.17.1 rlang_0.1.2
## [55] randtests_1.0 grid_3.4.0
## [57] RCurl_1.95-4.8 htmlwidgets_0.9
## [59] labeling_0.3 rmarkdown_1.6
## [61] bitops_1.0-6 base64enc_0.1-3
## [63] gtable_0.2.0 DBI_0.7
## [65] R6_2.2.2 gridExtra_2.2.1
## [67] knitr_1.16 dplyr_0.7.1
## [69] bit_1.1-12 rprojroot_1.2
## [71] bindr_0.1 Hmisc_4.0-3
## [73] stringi_1.1.5 Rcpp_0.12.12
## [75] geneplotter_1.54.0 rpart_4.1-11
## [77] acepack_1.4.1
Chen, Yunshun, Aaron TL Lun, and Gordon K Smyth. 2014. “Differential Expression Analysis of Complex RNA-Seq Experiments Using EdgeR.” In Statistical Analysis of Next Generation Sequence Data. Somnath Datta and Daniel S Nettleton (Eds), Springer, New York. https://link.springer.com/chapter/10.1007%2F978-3-319-07212-8_3.
Kolmogorov, Andrey. 1933. “Sulla Determinazione Empirica Di Una Legge Di Distribuzione.” Giornale Dell’Istituto Italiano Degli Attuari 4: 83–91.
Love, Michael I, Wolfgang Huber, and Simon Anders. 2014. “Moderated Estimation of Fold Change and Dispersion for RNA-Seq Data with DESeq2.” Genome Biology 15: 550. https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0550-8.
Robinson, Mark D, and Alicia Oshlack. 2010. “A Scaling Normalization Method for Differential Expression Analysis of RNA-Seq Data.” Genome Biology 11: R25. https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-3-r25.
Robinson, Mark D, Davis J McCarthy, and Gordon K Smyth. 2010. “EdgeR-a Bioconductor Package for Differential Expression Analysis of Digital Gene Expression Data.” Bioinformatics 26: 139–40. https://academic.oup.com/bioinformatics/article-lookup/doi/10.1093/bioinformatics/btp616.
Rousseeuw, Peter J. 1987. “Silhouettes a Graphical Aid to the Interpretation and Validation of Cluster Analysis.” Journal of Computational and Applied Mathematics, 53–65. http://www.sciencedirect.com/science/article/pii/0377042787901257.
Smirnov, Nikolai Vasilyevich. 1948. “Table for Estimating the Goodness of Fit of Empirical Distributions.” Annals of Mathematical Statistics 19: 279–81. https://projecteuclid.org/euclid.aoms/1177730256.
Wald, Abraham, and Jacob Wolfowitz. 1940. “On a Test Whether Two Samples Are from the Same Population.” The Annals of Mathematical Statistics 11: 147–62. https://projecteuclid.org/euclid.aoms/1177731909.