```{r setup, include=FALSE} library(tidyverse) library(tximport) library(biomaRt) library(RColorBrewer) library(reshape2) library(genefilter) library(edgeR) library(matrixStats) library(hrbrthemes) library(gplots) library(limma) library(heatmaply) library(d3heatmap) library(DT) library(gt) library(plotly) knitr::opts_chunk$set(message = FALSE) ``` Overview {data-icon="fa-signal"} ===================================== Row {data-height=400} ------------------------------------------------------------------------------ ### Signal distribution: raw data ```{r raw data} # importing data into R using Tximport package targets <- read_tsv("../../studyDesign.txt")# read in your study design path <- file.path("../readMapping", targets$sample, "abundance.h5") # set file paths to your mapped data targets <- mutate(targets, path) # add paths to your study design (only necessary for Sleuth) Hs.anno <- useMart(biomart="ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl") # select 'mart' from biomaRt for annotations Tx <- getBM(attributes=c('ensembl_transcript_id_version', # get gene symbols for each transcript ID 'external_gene_name'), mart = Hs.anno) Tx <- as_tibble(Tx) # convert this annotation mapping file to a tibble (the tidyverse version of a dataframe) Txi_gene <- tximport(path, #reading kallisto data into R type = "kallisto", tx2gene = Tx, txOut = FALSE, countsFromAbundance = "lengthScaledTPM") myCPM <- as_tibble(Txi_gene$abundance, rownames = "geneSymbol") # these are you counts after adjusting for transcript length myCounts <- as_tibble(Txi_gene$counts, rownames = "geneSymbol") # these are your transcript per million (TPM) values, or counts per million (CPM) if you collapsed data to gene level groups1 <- targets$treatment groups1 <- factor(groups1) sampleLabels <- targets$sample myDGEList <- DGEList(Txi_gene$counts) log2.cpm <- cpm(myDGEList, log=TRUE) nsamples <- ncol(log2.cpm) myColors <- brewer.pal(nsamples, "Paired") log2.cpm.df <- as_tibble(log2.cpm) colnames(log2.cpm.df) <- sampleLabels log2.cpm.df.melt <- melt(log2.cpm.df) ggplot(log2.cpm.df.melt, aes(x=variable, y=value, fill=variable)) + geom_violin(trim = FALSE, show.legend = FALSE) + stat_summary(fun.y = "median", geom = "point", shape = 124, size = 6, color = "black", show.legend = FALSE) + labs(y="log2 expression", x = "sample") + #title="Log2 Counts per Million (CPM)", #subtitle="unfiltered, non-normalized", #caption=paste0("produced on ", Sys.time())) + coord_flip() + theme_ipsum_rc() ``` ### Signal distribution: filtered and normalized data ```{r processed data} cpm <- cpm(myDGEList) keepers <- rowSums(cpm>1)>=3 #user defined myDGEList.filtered <- myDGEList[keepers,] myDGEList.filtered.norm <- calcNormFactors(myDGEList.filtered, method = "TMM") log2.cpm.filtered.norm <- cpm(myDGEList.filtered.norm, log=TRUE) log2.cpm.filtered.norm.df <- as_tibble(log2.cpm.filtered.norm) colnames(log2.cpm.filtered.norm.df) <- sampleLabels log2.cpm.filtered.norm.df.melt <- melt(log2.cpm.filtered.norm.df) ggplot(log2.cpm.filtered.norm.df.melt, aes(x=variable, y=value, fill=variable)) + geom_violin(trim = FALSE, show.legend = FALSE) + stat_summary(fun.y = "median", geom = "point", shape = 124, size = 6, color = "black", show.legend = FALSE) + labs(y="log2 expression", x = "sample") + #title="Log2 Counts per Million (CPM)", #subtitle="filtered, TMM normalized", #caption=paste0("produced on ", Sys.time())) + coord_flip() + theme_ipsum_rc() ``` ### PCA plot ```{r PCA} pca.res <- prcomp(t(log2.cpm.filtered.norm), scale.=F, retx=T) pc.var<-pca.res$sdev^2 pc.per<-round(pc.var/sum(pc.var)*100, 1) pca.res.df <- as_tibble(pca.res$x) ggplot(pca.res.df, aes(x=PC1, y=PC3, color=groups1)) + geom_point(size=5) + xlab(paste0("PC1 (",pc.per[1],"%",")")) + ylab(paste0("PC3 (",pc.per[3],"%",")")) + theme_ipsum_rc() + theme(legend.title=element_blank()) ``` Row {data-height=400} ------------------------------------------------------------------------------ ### Filtered and normalized counts per million (CPM) ```{r filtered and normalized data} mydata.df <- as_tibble(log2.cpm.filtered.norm, rownames = "geneSymbol") colnames(mydata.df) <- c("geneSymbol", sampleLabels) #user defined mydata.df <- mutate(mydata.df, uninfected.AVG = (uninf_rep1 + uninf_rep2 + uninf_rep1)/3, crypto.wt.AVG = (crypto.wt_rep1 + crypto.wt_rep2 + crypto.wt_rep3)/3, crypto.mut.AVG = (crypto.mut_rep1 + crypto.mut_rep2 + crypto.mut_rep3)/3, #now make columns comparing each of the averages above that you're interested in LogFC.crypto.wt_vs_uninfected = (crypto.wt.AVG - uninfected.AVG), LogFC.crypto.mut_vs_uninfected = (crypto.mut.AVG - uninfected.AVG)) %>% mutate_if(is.numeric, round, 2) mydata.sort <- mydata.df %>% dplyr::arrange(desc(LogFC.crypto.wt_vs_uninfected)) %>% dplyr::select(geneSymbol, uninfected.AVG, crypto.wt.AVG, LogFC.crypto.wt_vs_uninfected) datatable(mydata.df, extensions = c('KeyTable', "FixedHeader"), options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 5, lengthMenu = c("10", "25", "50", "100"))) %>% formatRound(columns=c(1:6), digits=2) ``` DEGs {data-orientation=columns data-icon="fa-random"} ===================================== Column {data-width=500} ------------------------------------------------------------------------------ ### Volcano plot ```{r volcano plot} design <- model.matrix(~0 + groups1) colnames(design) <- levels(groups1) v.DEGList.filtered.norm <- voom(myDGEList.filtered.norm, design, plot = FALSE) contrast.matrix <- makeContrasts(infection_with_WT = crypto.wt - uninfected, infection_with_Mut = crypto.mut - uninfected, levels=design) fit <- lmFit(v.DEGList.filtered.norm, design) fits <- contrasts.fit(fit, contrast.matrix) ebFit <- eBayes(fits) myTopHits <- topTable(ebFit, adjust ="BH", coef=1, number=40000, sort.by="logFC") myTopHits <- as_tibble(myTopHits, rownames = "geneSymbol") ggplotly(ggplot(myTopHits, aes(y=-log10(adj.P.Val), x=logFC, text = paste("Symbol:", geneSymbol))) + geom_point(size=2) + ylim(-0.5,12) + geom_hline(yintercept = -log10(0.01), linetype="longdash", colour="grey", size=1) + geom_vline(xintercept = 1, linetype="longdash", colour="#BE684D", size=1) + geom_vline(xintercept = -1, linetype="longdash", colour="#2C467A", size=1) + theme_ipsum_rc()) ``` Column {.tabset data-width=500} ------------------------------------------------------------------------------ ### heatmap ```{r heatmap} #heatmap of all DEGs #create additional heatmaps for co-expressed modules or a priori list of genes design <- model.matrix(~0 + groups1) colnames(design) <- levels(groups1) v.DEGList.filtered.norm <- voom(myDGEList.filtered.norm, design, plot = FALSE) fit <- lmFit(v.DEGList.filtered.norm, design) contrast.matrix <- makeContrasts(infection_with_WT = crypto.wt - uninfected, infection_with_Mut = crypto.mut - uninfected, levels=design) fits <- contrasts.fit(fit, contrast.matrix) ebFit <- eBayes(fits) results <- decideTests(ebFit, method="global", adjust.method="BH", p.value=0.01, lfc=2) colnames(v.DEGList.filtered.norm$E) <- sampleLabels diffGenes <- v.DEGList.filtered.norm$E[results[,1] !=0 | results[,2] !=0,] myheatcolors2 <- colorRampPalette(colors=c("yellow","white","blue"))(100) clustRows <- hclust(as.dist(1-cor(t(diffGenes), method="pearson")), method="complete") #cluster rows by pearson correlation clustColumns <- hclust(as.dist(1-cor(diffGenes, method="spearman")), method="complete") module.assign <- cutree(clustRows, k=2) module.color <- rainbow(length(unique(module.assign)), start=0.1, end=0.9) module.color <- module.color[as.vector(module.assign)] d3heatmap(diffGenes, colors = myheatcolors2, Rowv=as.dendrogram(clustRows), row_side_colors = module.color, dendrogram = "none", scale='row' ) ``` ### DEG table ```{r DEG table} diffGenes.df <- as_tibble(diffGenes, rownames = "geneSymbol") diffGenes.df <- mutate(diffGenes.df, uninfected.AVG = (uninf_rep1 + uninf_rep2 + uninf_rep1)/3, crypto.wt.AVG = (crypto.wt_rep1 + crypto.wt_rep2 + crypto.wt_rep3)/3, crypto.mut.AVG = (crypto.mut_rep1 + crypto.mut_rep2 + crypto.mut_rep3)/3, #now make columns comparing each of the averages above that you're interested in LogFC = (crypto.wt.AVG - uninfected.AVG), LogFC.crypto.mut_vs_uninfected = (crypto.mut.AVG - uninfected.AVG)) %>% mutate_if(is.numeric, round, 2) diffGenes.sort <- diffGenes.df %>% dplyr::arrange(desc(LogFC)) %>% dplyr::select(geneSymbol, uninfected.AVG, crypto.wt.AVG, LogFC) datatable(diffGenes.sort, extensions = c('KeyTable', "FixedHeader"), options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 25, lengthMenu = c("10", "25", "50", "100"))) %>% formatRound(columns=c(1:6), digits=2) ``` About {data-orientation=rows data-icon="fa-comment-alt"} ===================================== Row {data-height=1000} ------------------------------------------------------------------------------ Use this page to describe the results in your own words. Some things to think about: * What are the key takeaways from the analysis? * What types of analyses would you want to do next? * Based on your analysis, what wet-lab experiments would you pursue? * How could you expand on or otherwise enhance this data dashboard?