Abstract

Aedes aegypti mosquitoes cause high morbidity and mortality by transmitting pathogens such as dengue virus, chikungunya virus, and yellow fever virus. The midgut of the mosquito is the primary barrier that inhibits dissemination of pathogens acquired in a bloodmeal into the mosquito circulatory system. This inhibition is crucial because once a pathogen, such as a virus or parasite, has successfully entered the circulatory system they can be transmitted to another host upon a subsequent bite. Therefore, understanding regulation of the midgut environment is critical. Specifically, this work focuses on understanding the regulation of the midgut by the steroid hormone, 20-hydroxyecdysone, which is produced in direct response to bloodfeeding.

The analysis below is based on RNAseq data from 10 samples:
-Dissected female midguts (control)
-Dissected female carcasses (control)
-Dissected 20E-injected female midguts at 5hr
-Dissected 20E-injected female carcasses at 5hr
-Dissected PBS-injected female midguts at 5hr
-Dissected PBS-injected female carcasses at 5hr
-Dissected 20E-injected female midguts at 18hr
-Dissected 20E-injected female carcasses at 18hr
-Dissected PBS-injected female midguts at 18hr
-Dissected PBS-injected female carcasses at 18hr
*each sample is pooled RNA from 30 mosquitoes

R Packages Required for Analysis

library(Rsubread)
library(limma)
library(edgeR)
library(ShortRead)
library(biomaRt)
library(ggplot2)
library(reshape2)
library(dplyr)
library(ggvis)
library(gplots)
library(RColorBrewer)

This summary report was compiled using RPackages:

library(rmarkdown)
library(knitr)

Experimental Set Up

Prepare groups that represent the samples and incorporate information about their different components (tissue, timepoint, hormone-treatment).

#Read in study design file
targets <- readTargets("Aedes_studyDesign.txt", row.names=NULL)
targets
groups <- factor(paste(targets$treatment, targets$timepoint, targets$tissue, sep="."))
#Creates more understandable labels for samples using info in the study design file
sampleLabels <- paste(targets$treatment, targets$timepoint, targets$tissue, sep=".")
#Set-up of experimental design
design <- model.matrix(~0+groups)
colnames(design) <- levels(groups)
design

Reading in Primary Data using raw fastq files, these steps completed on CHMI desktop workstation

#Build index from reference genome; must have already downloaded the fasta file for genome of interest and have it in the working directory
buildindex(basename="Aedes",reference="Aedes_aegypti.AaegL3.26.dna.genome.fa")
#Aligns reads (in the fastq files) to indexed reference genome that above
reads <- targets$fastq
align(index="Aedes", readfile1=reads, input_format="FASTQ",output_format="BAM",
      output_file=targets$OutputFile,tieBreakHamming=TRUE,unique=TRUE,indels=5, nthreads=8)
#Read in text file with bam file names
MyBAM <- targets$OutputFile
#Summarize aligned reads to genomic features
fc <- featureCounts(files=MyBAM, annot.ext="Aedes-aegypti-Liverpool_BASEFEATURES_AaegL3.3.gtf", 
                    isGTFAnnotationFile=TRUE, useMetaFeatures=TRUE, strandSpecific=2, nthreads=8)
#Use the 'DGEList' function from EdgeR to make a 'digital gene expression list' object
DGEList <- DGEList(counts=fc$counts, genes=fc$annotation)
save(DGEList, file="DGEList")
load("DGEList")
#Retrieve all gene/transcript identifiers from this DGEList object
myGeneIDs <- DGEList$genes$GeneID

Filtering and Normalizing Data (for samples without replicates)

Filtered data so that only genes that have greater than 10 reads per million mapped reads in at least two samples are kept in the dataset for analysis. There are no replicates in this dataset, so the only standardization that is useful is converting both rpkm-filtered and rpkm-unfiltered datasets into binary logarithm (log2).

#RPKM (Reads Per Kilobase per Mapped reads) for unfiltered data
rpkm.unfiltered <- rpkm(DGEList, DGEList$genes$Length)
rpkm.unfiltered <- log2(rpkm.unfiltered + 1)
#Filtering the data to only keep in analysis genes that have >10 reads per million mapped reads in at least two libraries.
cpm.matrix.filtered <- rowSums(cpm(DGEList) > 10) >= 2
DGEList.filtered <- DGEList[cpm.matrix.filtered,]
dim(DGEList.filtered)
rpkm.filtered <- rpkm(DGEList.filtered, DGEList.filtered$genes$Length)
rpkm.filtered <- log2(rpkm.filtered + 1)
#Note that data is now in log2, so numbers smaller than 1 will be negative

Figure 1: Visualizing normalized data

Data can be visualized by histogram and boxplot

#Choose color scheme for graphs
cols.ALL <- topo.colors (n=10, alpha=1)
hist(rpkm.filtered, xlab = "log2 expression", main = "normalized data - histograms", col=cols.ALL)

boxplot(rpkm.filtered, ylab = "normalized log2 expression", main = "non-normalized data - boxplots", col=cols.ALL)

Annotating normalized data using mosquito-specific database

library(biomaRt)
#List all the available 'marts' to choose from
listMarts()
#Select mart of interest
vectorBase <- useMart("vb_gene_mart_1506")
#List the databases available in that mart
listDatasets(vectorBase)
#Select database to work with
aedesDB <- useDataset("aaegypti_eg_gene", mart=vectorBase)
#List the various filters, or keys, that can used to access this database
listFilters(aedesDB)
#List all the attributes that can be retrieved from the database
listAttributes(aedesDB)
#Pull out rownames of your dataset to use as one of the filters
myEnsemblIDs.filtered <- rownames(rpkm.filtered)
myEnsemblIDs.unfiltered <- rownames(rpkm.unfiltered)
#Attach annotation information to file
myAnnot.filtered <- getBM(attributes = c("external_gene_id", "ensembl_gene_id", "description"),
                      filters = "ensembl_gene_id",
                      values = myEnsemblIDs.filtered,
                      mart=aedesDB)
myAnnot.unfiltered <- getBM(attributes = c("external_gene_id", "ensembl_gene_id", "description"),
                          filters = "ensembl_gene_id",
                          values = myEnsemblIDs.unfiltered,
                          mart=aedesDB)
#Transform your identifiers to entrezIDs
resultTable.filtered <- merge(myAnnot.filtered, rpkm.filtered, by.x="ensembl_gene_id", by.y=0)
resultTable.unfiltered <- merge(myAnnot.unfiltered, rpkm.unfiltered, by.x="ensembl_gene_id", by.y=0)

Figure 2: Determine hierarchial clustering of filtered data

This dendrogram shows the relationship between the samples. Tissue-specific differences are responsible for the greatest difference (eg. between midgut and carcass), followed by treatement (20E vs. PBS), and then time point (5hr v. 18hr).

resultTable.filtered.alt <- resultTable.filtered[,-2:-3]
rownames(resultTable.filtered.alt) <- resultTable.filtered.alt[,1]
resultTable.filtered.alt <- resultTable.filtered.alt[,-1]
resultTable.filtered.alt.matrix <- as.matrix(resultTable.filtered.alt)
rownames(resultTable.filtered.alt.matrix)
distance <- dist(t(resultTable.filtered.alt.matrix),method="euclidean")
clusters <- hclust(distance, method = "average") 
plot(clusters, label = sampleLabels, hang = -1)

Figure 3: Principal component analysis of filtered data

pca.res <- prcomp(t(resultTable.filtered.alt.matrix), scale.=F, retx=T)
ls(pca.res)
summary(pca.res) #Prints variance summary for all principal components (PCs).
head(pca.res$rotation) # $rotation shows you how much each GENE influenced each PC (callled 'eigenvalues', or loadings)
head(pca.res$x) # $x shows you how much each SAMPLE influenced each PC (called 'scores')
plot(pca.res, las=1)

pc.var<-pca.res$sdev^2 # $sdev^2 gives you the eigenvalues
pc.per<-round(pc.var/sum(pc.var)*100, 1)
pc.per

#Making graphs to visualize PCA result
library(ggplot2)
#First plot any two PCs against each other
#Turn scores for each gene into a data frame
data.frame <- as.data.frame(pca.res$x)
ggplot(data.frame, aes(x=PC1, y=PC2, colour=factor(groups))) +
  geom_point(size=5) +
  theme(legend.position="right")

#Create a 'small multiples' chart to look at impact of each variable on each pricipal component
library(reshape2)
melted <- cbind(groups, melt(pca.res$x[,1:3]))
#Look at 'melted' data
ggplot(melted) +
  geom_bar(aes(x=Var1, y=value, fill=groups), stat="identity") +
  facet_wrap(~Var2)

Result: PC1 is tissue (midugt or carcass) and PC2 is treatment (20E-injection or PBS-injection)

Organizing and Exploring Data

Generating Organized Data Table with Differentially Expressed Genes

#clean-up data table
colnames(resultTable.filtered) <- c("geneID", "symbol", "description", sampleLabels)
colnames(resultTable.filtered)

#use the dplyr 'mutate' command to get averages and fold changes for all your replicates (since do not have replicates, these are only the difference in fold changes)
myIDs <- resultTable.filtered[,1]
myData <- mutate(resultTable.filtered,
                 Ecdysone.vs.PBS_5hr_carcass = (ecdysone.5hr.carcass - PBS.5hr.carcass),
                 Ecdysone.vs.PBS_18hr_carcass = (ecdysone.18hr.carcass - PBS.18hr.carcass),
                 Ecdysone.vs.PBS_5hr_gut = (ecdysone.5hr.midgut - PBS.5hr.midgut),
                 Ecdysone.vs.PBS_18hr_gut = (ecdysone.18hr.midgut - PBS.18hr.midgut),
                 geneID = myIDs)

Sorting Data Table by Magnitude of Differential Expression
The samples below are looking at the transciptional differences between an 18hr ecdysone treated gut and an 18hr PBS treated gut, to change which comparison is being analyzed insert one of the other 3 options: “Ecdysone.vs.PBS_5hr_carcass”, “Ecdysone.vs.PBS_5hr_gut”, “Ecdysone.vs.PBS_18hr_carcass”

##Down sorting (sorts genes by the greatest down-regulated fold change)
myData.sort <- myData %>%
  arrange(desc(Ecdysone.vs.PBS_18hr_carcass)) %>%
  select(geneID, description, Ecdysone.vs.PBS_18hr_gut)
head(myData.sort)

##Up sorting (sorts genes by greatest up-regulated fold change)
myData.sort <- myData %>%
  arrange(Ecdysone.vs.PBS_18hr_carcass) %>%
  select(geneID, description, Ecdysone.vs.PBS_18hr_gut)
head(myData.sort)

Filtering Data Table by Predetermined Groups of Genes

##Use dplyr "filter" and "select" functions to pick out genes of interest (filter)
#and again display only columns of interest (select)

#Selecting specific genes of interest to examine
myData.filter <- myData %>%
  filter(geneID=="AAEL000395" | geneID=="AAEL009600" | geneID == "AAEL001414") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter)

#TEPs
 myData.filter <- myData %>%
   filter(geneID=="AAEL001163" | geneID=="AAEL000087" | geneID == "AAEL001802" | geneID == "AAEL001794"| geneID == "AAEL014755" | geneID == "AAEL012267") %>%
   select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
 head(myData.filter)

#AMPS
myData.filter <- myData %>%
  filter(geneID=="AAEL003389" | geneID=="AAEL000627" | geneID=="AAEL004223" | geneID=="AAEL000598" | geneID=="AAEL000611" | geneID=="AAEL000625" | geneID=="AAEL015515" | geneID=="AAEL000775" | geneID=="AAEL000777" | geneID=="AAEL000621" | geneID=="AAEL003841" | geneID=="AAEL003832" | geneID=="AAEL003857" | geneID=="AAEL003849" | geneID=="AAEL004833" | geneID=="AAEL004522") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter)

#PGRPs
myData.filter <- myData %>%
  filter(geneID=="AAEL012380" | geneID=="AAEL010171" | geneID=="AAEL014640" | geneID=="AAEL011608" | geneID=="AAEL013112" | geneID=="AAEL009474" | geneID=="AAEL007037" | geneID=="AAEL007039") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter)

#CLIPs
myData.filter <- myData %>%
  filter(geneID=="AAEL002601" | geneID=="AAEL001675" | geneID=="AAEL002126" | geneID=="AAEL008404" | geneID=="AAEL007006" | geneID=="AAEL005718" | geneID=="AAEL000074" | geneID=="AAEL003243" | geneID=="AAEL003253" | geneID=="AAEL014349" | geneID=="AAEL005648" | geneID=="AAEL000059" | geneID=="AAEL001084" | geneID=="AAEL008668" | geneID=="AAEL012785" | geneID=="AAEL014140" | geneID=="AAEL014137" | geneID=="AAEL003280" | geneID=="AAEL007993" | geneID=="AAEL013245" | geneID=="AAEL006674" | geneID=="AAEL000760" | geneID=="AAEL006161" | geneID=="AAEL000086" | geneID=="AAEL000099" | geneID=="AAEL000028" | geneID=="AAEL000037" | geneID=="AAEL005431" | geneID=="AAEL003628" | geneID=="AAEL003632" | geneID=="AAEL003614" | geneID=="AAEL003631" | geneID=="AAEL006168" | geneID=="AAEL014354" | geneID=="AAEL005060" | geneID=="AAEL001077" | geneID=="AAEL005093" | geneID=="AAEL005064" | geneID=="AAEL000038" | geneID=="AAEL003625" | geneID=="AAEL003610" | geneID=="AAEL011991" | geneID=="AAEL011593" | geneID=="AAEL012711" | geneID=="AAEL012712" | geneID=="AAEL004948" | geneID=="AAEL010270" | geneID=="AAEL012713" | geneID=="AAEL007593" | geneID=="AAEL007597" | geneID=="AAEL004518" | geneID=="AAEL004524" | geneID=="AAEL004540" | geneID=="AAEL007796" | geneID=="AAEL015109" | geneID=="AAEL011375" | geneID=="AAEL004979" | geneID=="AAEL002997" | geneID=="AAEL002124" | geneID=="AAEL015439" | geneID=="AAEL005906" | geneID=="AAEL000238" | geneID=="AAEL010773" | geneID=="AAEL005800" | geneID=="AAEL005644" | geneID=="AAEL005792" | geneID=="AAEL001233" | geneID=="AAEL012087" | geneID=="AAEL007587") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter)

#PPOs
myData.filter <- myData %>%
  filter(geneID=="AAEL013498" | geneID=="AAEL011764" | geneID=="AAEL013499" | geneID=="AAEL011763" | geneID=="AAEL013501" | geneID=="AAEL013492" | geneID=="AAEL014544" | geneID=="AAEL013493" | geneID=="AAEL013496" | geneID=="AAEL014837") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter)

#lysozyme
myData.filter <- myData %>%
  filter(geneID=="AAEL005988" | geneID=="AAEL003712" | geneID=="AAEL003723" | geneID=="AAEL010100" | geneID=="AAEL015404" | geneID=="AAEL009670") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter)

#gnbp
myData.filter <- myData %>%
  filter(geneID=="AAEL007626" | geneID=="AAEL000652" | geneID=="AAEL003889" | geneID=="AAEL009176" | geneID=="AAEL009178" | geneID=="AAEL003894" | geneID=="AAEL007064") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter)

#imd
myData.filter <- myData %>%
  filter(geneID=="AAEL014738" | geneID=="AAEL003579" | geneID=="AAEL001932" | geneID=="AAEL003245" | geneID=="AAEL010548" | geneID=="AAEL012510" | geneID=="AAEL010083" | geneID=="AAEL007035") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter)

#tlr
myData.filter <- myData %>%
  filter(geneID=="AAEL004000" | geneID=="AAEL009551" | geneID=="AAEL007613" | geneID=="AAEL003507" | geneID=="AAEL007619" | geneID=="AAEL000057" | geneID=="AAEL000671" | geneID=="AAEL002583" | geneID=="AAEL000633" | geneID=="AAEL013441" | geneID=="AAEL011734") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter)

#jak/stat
myData.filter <- myData %>%
  filter(geneID=="AAEL012471" | geneID=="AAEL012553" | geneID=="AAEL009692") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter)

#rel-like
myData.filter <- myData %>%
  filter(geneID=="AAEL007696" | geneID=="AAEL006930" | geneID=="AAEL007624") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter)

#tlr pathway members
myData.filter <- myData %>%
  filter(geneID=="AAEL000709" | geneID=="AAEL007768" | geneID=="AAEL006571"| geneID=="AAEL011363"| geneID=="AAEL007642") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter)

#autophagy
myData.filter <- myData %>%
  filter(geneID=="AAEL013815" | geneID=="AAEL009089" | geneID=="AAEL007574" | geneID=="AAEL013995" | geneID=="AAEL003799" | geneID=="AAEL000955" | geneID=="AAEL010516" | geneID=="AAEL007228" | geneID=="AAEL002286" | geneID=="AAEL010427" | geneID=="AAEL010641" | geneID=="AAEL012306" | geneID=="AAEL007162" | geneID=="AAEL009105" | geneID=="AAEL001521" | geneID=="AAEL001515" | geneID=="AAEL003777" | geneID=="AAEL007570" | geneID=="AAEL000693") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut) 
head(myData.filter)

#inhibitors of apoptosis 
myData.filter <- myData %>%
  filter(geneID=="AAEL009074" | geneID=="AAEL006633" | geneID=="AAEL014251" | geneID=="AAEL012446" | geneID=="AAEL012512") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut) 
head(myData.filter)

#caspase
myData.filter <- myData %>%
  filter(geneID=="AAEL014148" | geneID=="AAEL011562" | geneID=="AAEL005963"| geneID=="AAEL005956"| geneID=="AAEL003439"| geneID=="AAEL003444"| geneID=="AAEL014658"| geneID=="AAEL012143"| geneID=="AAEL014348") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter)

#caspase activators 
myData.filter <- myData %>%
  filter(geneID=="AAEL00872" | geneID=="AAEL004392" | geneID=="AAEL014196") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter)

#catalases
myData.filter <- myData %>%
  filter(geneID=="AAEL013407") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter)

#c-type lectins
myData.filter <- myData %>%
  filter(geneID=="AAEL009338" | geneID=="AAEL008299" | geneID=="AAEL008681" | geneID=="AAEL004679" | geneID=="AAEL011453" | geneID=="AAEL012353" | geneID=="AAEL000533" | geneID=="AAEL011446" | geneID=="AAEL005482" | geneID=="AAEL011404" | geneID=="AAEL011407" | geneID=="AAEL011408" | geneID=="AAEL011609" | geneID=="AAEL006456" | geneID=="AAEL002524" | geneID=="AAEL000556" | geneID=="AAEL003119" | geneID=="AAEL010992" | geneID=="AAEL013748" | geneID=="AAEL011402" | geneID=="AAEL011078" | geneID=="AAEL013566" | geneID=="AAEL011070" | geneID=="AAEL005641" | geneID=="AAEL009209" | geneID=="AAEL011610" | geneID=="AAEL011619" | geneID=="AAEL014385" | geneID=="AAEL011079" | geneID=="AAEL000543" | geneID=="AAEL011455" | geneID=="AAEL011621" | geneID=="AAEL014382" | geneID=="AAEL000563" | geneID=="AAEL000283" | geneID=="AAEL011612" | geneID=="AAEL008929" | geneID=="AAEL014356" | geneID=="AAEL014390") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut) 
head(myData.filter)

#scavenger receptors (SCRs)
myData.filter <- myData %>%
  filter(geneID=="AAEL001914" | geneID=="AAEL015308" | geneID=="AAEL009192" | geneID=="AAEL010655" | geneID=="AAEL014367" | geneID=="AAEL005374" | geneID=="AAEL007748" | geneID=="AAEL008370" | geneID=="AAEL005987" | geneID=="AAEL005979" | geneID=="AAEL011222" | geneID=="AAEL002741" | geneID=="AAEL000234" | geneID=="AAEL000227" | geneID=="AAEL000256" | geneID=="AAEL009420" | geneID=="AAEL009423" | geneID=="AAEL009432" | geneID=="AAEL006355" | geneID=="AAEL006361") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut) 
head(myData.filter)

#superoxide dismutases (SODs)
myData.filter <- myData %>%
  filter(geneID=="AAEL009436" | geneID=="AAEL014091" | geneID=="AAEL006271" | geneID=="AAEL011498" | geneID=="AAEL000259" | geneID=="AAEL004823" | geneID=="AAEL005108") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut) 
head(myData.filter)

#spaetzle-like proteins (SPZ)
myData.filter <- myData %>%
  filter(geneID=="AAEL000499" | geneID=="AAEL013434" | geneID=="AAEL013433" | geneID=="AAEL001435" | geneID=="AAEL008596" | geneID=="AAEL014950" | geneID=="AAEL007897" | geneID=="AAEL001929" | geneID=="AAEL012164") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut) 
head(myData.filter)

#Peroxidases (PRDX)
myData.filter <- myData %>%
  filter(geneID=="AAEL012069" | geneID=="AAEL008397" | geneID=="AAEL000495" | geneID=="AAEL003933" | geneID=="AAEL007563" | geneID=="AAEL006014" | geneID=="AAEL013171" | geneID=="AAEL005416" | geneID=="AAEL000376" | geneID=="AAEL002354" | geneID=="AAEL012481" | geneID=="AAEL004401" | geneID=="AAEL004388" | geneID=="AAEL004390" | geneID=="AAEL004386" | geneID=="AAEL003612" | geneID=="AAEL000507" | geneID=="AAEL013528" | geneID=="AAEL004112" | geneID=="AAEL014548" | geneID=="AAEL002309" | geneID=="AAEL009051") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut) 
head(myData.filter)

Figure 4: Graphing Filtered Genes by Tissue (Carcass v. Midgut)

Filtered differentially expressed genes in carcass and gut tissues respectively, and pulled out genes of interest: AAEL000395 is Ultraspiracle; AAEL009600 is ecdysone receptor; AAEL001414 is LRIM9.

#Filtered carcass for specific genes; AAEL000395 is Ultraspiracle; AAEL009600 is ecdysone receptor; AAEL001414 is LRIM9
myData.filter.carcass <- myData %>%
  filter(geneID=="AAEL000395" | geneID=="AAEL009600" | geneID == "AAEL001414") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass)
head(myData.filter.carcass)

library(reshape2)
myData.filter.long.carcass <- melt(myData.filter.carcass, id.var="geneID", variable.name="timetissue", value.name="log2expression")

library(biomaRt)
myData.filter.long.annotated <- getBM(attributes = c("external_gene_id"),
                                      filters = "ensembl_gene_id",
                                      values = myData.filter.long.carcass,
                                      mart=aedesDB)
myData.filter.long.carcass[,1] <- myData.filter.long.annotated [,1]

ggplot(myData.filter.long.carcass, aes(y=log2expression)) +
  geom_bar(aes(x=geneID, fill=timetissue), position="dodge", stat="identity") +
  theme(axis.text.x=element_text(angle=-45)) 

#Filtered gut for specific genes; AAEL000395 is Ultraspiracle; AAEL009600 is ecdysone receptor; AAEL001414 is LRIM9
myData.filter.gut <- myData %>%
  filter(geneID=="AAEL000395" | geneID=="AAEL009600" | geneID == "AAEL001414") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter.gut)

library(reshape2)
myData.filter.long.gut <- melt(myData.filter.gut, id.var="geneID", variable.name="timetissue", value.name="log2expression")

library(biomaRt)
myData.filter.long.annotated <- getBM(attributes = c("external_gene_id"),
                                      filters = "ensembl_gene_id",
                                      values = myData.filter.long.gut,
                                      mart=aedesDB)
myData.filter.long.gut[,1] <- myData.filter.long.annotated [,1]

ggplot(myData.filter.long.gut, aes(y=log2expression)) +
  geom_bar(aes(x=geneID, fill=timetissue), position="dodge", stat="identity") +
  theme(axis.text.x=element_text(angle=-45)) 

Another example looking at antimicrobial peptide (AMP) expression

#AMPS carcass
myData.filter.carcass <- myData %>%
  filter(geneID=="AAEL003389" | geneID=="AAEL000627" | geneID=="AAEL004223" | geneID=="AAEL000598" | geneID=="AAEL000611" | geneID=="AAEL000625" | geneID=="AAEL015515" | geneID=="AAEL000775" | geneID=="AAEL000777" | geneID=="AAEL000621" | geneID=="AAEL003841" | geneID=="AAEL003832" | geneID=="AAEL003857" | geneID=="AAEL003849" | geneID=="AAEL004833" | geneID=="AAEL004522") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass)
head(myData.filter.carcass)

library(reshape2)
myData.filter.long.carcass <- melt(myData.filter.carcass, id.var="geneID", variable.name="timetissue", value.name="log2expression")

library(biomaRt)
myData.filter.long.annotated <- getBM(attributes = c("external_gene_id"),
                                      filters = "ensembl_gene_id",
                                      values = myData.filter.long.carcass,
                                      mart=aedesDB)
myData.filter.long.carcass[,1] <- myData.filter.long.annotated [,1]

ggplot(myData.filter.long.carcass, aes(y=log2expression)) +
  geom_bar(aes(x=geneID, fill=timetissue), position="dodge", stat="identity") +
  theme(axis.text.x=element_text(angle=-45)) 

#AMPs guts
myData.filter.gut <- myData %>%
  filter(geneID=="AAEL003389" | geneID=="AAEL000627" | geneID=="AAEL004223" | geneID=="AAEL000598" | geneID=="AAEL000611" | geneID=="AAEL000625" | geneID=="AAEL015515" | geneID=="AAEL000775" | geneID=="AAEL000777" | geneID=="AAEL000621" | geneID=="AAEL003841" | geneID=="AAEL003832" | geneID=="AAEL003857" | geneID=="AAEL003849" | geneID=="AAEL004833" | geneID=="AAEL004522") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut)
head(myData.filter.gut)

library(reshape2)
myData.filter.long.gut <- melt(myData.filter.gut, id.var="geneID", variable.name="timetissue", value.name="log2expression")

library(biomaRt)
myData.filter.long.annotated <- getBM(attributes = c("external_gene_id"),
                                      filters = "ensembl_gene_id",
                                      values = myData.filter.long.gut,
                                      mart=aedesDB)
myData.filter.long.gut[,1] <- myData.filter.long.annotated [,1]

ggplot(myData.filter.long.gut, aes(y=log2expression)) +
  geom_bar(aes(x=geneID, fill=timetissue), position="dodge", stat="identity") +
  theme(axis.text.x=element_text(angle=-45)) 

Figure 5: Simple bar graphs

Looking at a group of filtered genes within each condition individually

#Pick genes by which to filter data (eg. Peroxidases)
#Peroxidases (PRDX)
myData.filter <- myData %>%
  filter(geneID=="AAEL012069" | geneID=="AAEL008397" | geneID=="AAEL000495" | geneID=="AAEL003933" | geneID=="AAEL007563" | geneID=="AAEL006014" | geneID=="AAEL013171" | geneID=="AAEL005416" | geneID=="AAEL000376" | geneID=="AAEL002354" | geneID=="AAEL012481" | geneID=="AAEL004401" | geneID=="AAEL004388" | geneID=="AAEL004390" | geneID=="AAEL004386" | geneID=="AAEL003612" | geneID=="AAEL000507" | geneID=="AAEL013528" | geneID=="AAEL004112" | geneID=="AAEL014548" | geneID=="AAEL002309" | geneID=="AAEL009051") %>%
  select(geneID, Ecdysone.vs.PBS_5hr_carcass, Ecdysone.vs.PBS_18hr_carcass, Ecdysone.vs.PBS_5hr_gut, Ecdysone.vs.PBS_18hr_gut) 

library(dplyr)
myData.filter.transpose <- as.data.frame(t(myData.filter))
treatments <- row.names(myData.filter.transpose)
myData.filter.transpose <- mutate(myData.filter.transpose, treatments)
myData.filter.transpose

library(biomaRt)
myData.filter.long.annotated <- getBM(attributes = c("external_gene_id"),
                          filters = "ensembl_gene_id",
                          values = myData.filter.long,
                          mart=aedesDB)
myData.filter.long[,1] <- myData.filter.long.annotated [,1]

ggplot(myData.filter, aes(y=Ecdysone.vs.PBS_5hr_gut)) +
  geom_bar(aes(x=geneID, fill=Ecdysone.vs.PBS_5hr_gut), stat="identity") +
  theme(axis.text.x=element_text(angle=-45))

ggplot(myData.filter, aes(y=Ecdysone.vs.PBS_18hr_gut)) +
  geom_bar(aes(x=geneID, fill=Ecdysone.vs.PBS_18hr_gut), stat="identity") +
  theme(axis.text.x=element_text(angle=-45))

ggplot(myData.filter, aes(y=Ecdysone.vs.PBS_5hr_carcass)) +
  geom_bar(aes(x=geneID, fill=Ecdysone.vs.PBS_5hr_carcass), stat="identity") +
  theme(axis.text.x=element_text(angle=-45))

ggplot(myData.filter, aes(y=Ecdysone.vs.PBS_18hr_carcass)) +
  geom_bar(aes(x=geneID, fill=Ecdysone.vs.PBS_18hr_carcass), stat="identity") +
  theme(axis.text.x=element_text(angle=-45))

Figure 6: Create a scatterplot

Comparing differences in expression between midguts treated with 20E at 5hr and 18hr

#Create a basic scatterplot using ggplot
ggplot(myData, aes(x=Ecdysone.vs.PBS_5hr_gut, y=Ecdysone.vs.PBS_18hr_gut)) +
  geom_point(shape=1) +
  geom_point(size=4)

Creating an interactive graphic
Using this code chunk in RStudio will result in an interactive graphic where you can click on genes of interest on a scatterplot.

#define a tooltip that shows gene symbol and Log2 expression data when you mouse over each data point in the plot
tooltip <- function(data, ...) {
  paste0("<b>","Symbol: ", data$geneID, "</b><br>",
         "20E5hr_gut: ", data$ecdysone.5hr.midgut, "<br>",
         "PBS5hr_gut: ", data$PBS.5hr.midgut)
}

#plot the interactive graphic
myData %>% 
  ggvis(x= ~PBS.5hr.midgut, y= ~ecdysone.5hr.midgut, key := ~geneID) %>% 
  layer_points(fill = ~Ecdysone.vs.PBS_5hr_gut) %>%
  add_tooltip(tooltip)

Figure 7: Generating a Heat Map of Differentially Expressed Genes

This will generate a heatmap looking at differential expression across all samples

#Use the dplyr 'filter' command to capture all the genes that are up/down regulated x-fold in n conditions
#In this case, 'myData' is a dataframe generated with Log2 expression and annotation
myData.filter.log <- myData %>%
  filter((abs(ecdysone.5hr.midgut) >= 1) | (abs(PBS.5hr.midgut) >= 1)) %>%
  select(geneID, ecdysone.5hr.midgut, ecdysone.18hr.midgut, PBS.5hr.midgut, PBS.18hr.midgut, ecdysone.5hr.carcass, ecdysone.18hr.carcass, PBS.5hr.carcass, PBS.18hr.carcass)
head(myData.filter)

rownames(myData.filter.log)<- myData.filter.log[,1]
myData.filter.log <- myData.filter.log[-1]
myData.filter.log.matrix <- as.matrix(myData.filter.log)

#Generate a heatmap of differentially expressed transcripts using the entire dataset 
hr <- hclust(as.dist(1-cor(t(myData.filter.log.matrix), method="pearson")), method="complete") #Cluster rows by pearson correlation
hc <- hclust(as.dist(1-cor(myData.filter.log.matrix, method="spearman")), method="complete") #cluster columns by spearman correlation
#Cut the resulting tree and create color vector for clusters.  Vary the cut height to give more or fewer clusters, or you the 'k' argument to force n number of clusters
dim(myData.filter.log.matrix)
mycl <- cutree(hr, k=7)
mycolhc <- rainbow(length(unique(mycl)), start=0.1, end=0.9) 
mycolhc <- mycolhc[as.vector(mycl)] 
#load the gplots package for plotting the heatmap
library(gplots) 
#Assign your favorite heatmap color scheme. Some useful examples: colorpanel(40, "darkblue", "yellow", "white"); heat.colors(75); cm.colors(75); rainbow(75); redgreen(75); library(RColorBrewer); rev(brewer.pal(9,"Blues")[-1]).
myheatcol <- greenred(75)
library(RColorBrewer)
#Plot the hclust results as a heatmap
heatmap.2(myData.filter.log.matrix, Rowv=as.dendrogram(hr), Colv=NA, 
          col=myheatcol, scale="row", labRow=NA,
          density.info="none", trace="none", RowSideColors=mycolhc, 
          cexRow=1, cexCol=1, margins=c(10,25)) 

Figure 8: Midgut-specific Heat Map

#Notice that the heatmap includes ALL the columns from your dataset
#Lets fix this so that we are only showing appropriate columns 
myData.filter.log.matrix.subset <- myData.filter.log.matrix[,c(1,2,3,4)]

hr <- hclust(as.dist(1-cor(t(myData.filter.log.matrix.subset), method="pearson")), method="complete") #Cluster rows by pearson correlation
hc <- hclust(as.dist(1-cor(myData.filter.log.matrix.subset, method="spearman")), method="complete") #Cluster columns by spearman correlation

dim(myData.filter.log.matrix.subset)
mycl <- cutree(hr, k=7)
mycolhc <- rainbow(length(unique(mycl)), start=0.1, end=0.9) 
mycolhc <- mycolhc[as.vector(mycl)] 

heatmap.2(myData.filter.log.matrix.subset, Rowv=as.dendrogram(hr), Colv=NA, 
          col=myheatcol, scale="row", labRow=NA,
          density.info="none", trace="none", RowSideColors=mycolhc, 
          cexRow=1, cexCol=1, margins=c(10,25)) 

#Now repeat heatmap only on these selected columns
names(mycolhc) <- names(mycl) 
barplot(rep(10, max(mycl)), 
        col=unique(mycolhc[hr$labels[hr$order]]), 
        horiz=T, names=unique(mycl[hr$order])) # Prints color key for cluster assignments. The numbers next to the color boxes correspond to the cluster numbers in 'mycl'.

Figure 9: Heat Maps of Specific Midgut Clusters of Interest

Cluster 1, Genes Downregulated in 5 hr treated midgut

#Select sub-clusters of co-regulated transcripts for downstream analysis
clid <- c(1) 
ysub <- myData.filter.log.matrix.subset[names(mycl[mycl%in%clid]),] 
hrsub <- hclust(as.dist(1-cor(t(ysub), method="pearson")), method="complete") 
clusterIDs <- data.frame(Labels=rev(hrsub$labels[hrsub$order]))
clusterIDs <- as.vector(t(clusterIDs))
heatmap.2(ysub, Rowv=as.dendrogram(hrsub), Colv=NA, col=myheatcol, scale="row", density.info="none", labRow=NA, trace="none", RowSideColors=mycolhc[mycl%in%clid], margins=c(20,25)) # Create heatmap for chosen sub-cluster.

Cluster 2, Genes Upregulated in 5 hr 20E treated midgut

clid <- c(2) 
ysub <- myData.filter.log.matrix.subset[names(mycl[mycl%in%clid]),] 
hrsub <- hclust(as.dist(1-cor(t(ysub), method="pearson")), method="complete") 
clusterIDs <- data.frame(Labels=rev(hrsub$labels[hrsub$order]))
clusterIDs <- as.vector(t(clusterIDs))
heatmap.2(ysub, Rowv=as.dendrogram(hrsub), Colv=NA, col=myheatcol, scale="row", density.info="none", labRow=NA, trace="none", RowSideColors=mycolhc[mycl%in%clid], margins=c(20,25)) 

Cluster 5, Genes Upregulated in 18 hr 20E treated midgut

clid <- c(5) 
ysub <- myData.filter.log.matrix.subset[names(mycl[mycl%in%clid]),] 
hrsub <- hclust(as.dist(1-cor(t(ysub), method="pearson")), method="complete") 
clusterIDs <- data.frame(Labels=rev(hrsub$labels[hrsub$order]))
clusterIDs <- as.vector(t(clusterIDs))
heatmap.2(ysub, Rowv=as.dendrogram(hrsub), Colv=NA, col=myheatcol, scale="row", density.info="none", labRow=NA, trace="none", RowSideColors=mycolhc[mycl%in%clid], margins=c(20,25)) 

Cluster 6, Genes Down-regulated in 18 hr 20E treated midgut

clid <- c(6) 
ysub <- myData.filter.log.matrix.subset[names(mycl[mycl%in%clid]),] 
hrsub <- hclust(as.dist(1-cor(t(ysub), method="pearson")), method="complete") 
clusterIDs <- data.frame(Labels=rev(hrsub$labels[hrsub$order]))
clusterIDs <- as.vector(t(clusterIDs))
heatmap.2(ysub, Rowv=as.dendrogram(hrsub), Colv=NA, col=myheatcol, scale="row", density.info="none", labRow=NA, trace="none", RowSideColors=mycolhc[mycl%in%clid], margins=c(20,25)) 

Pull out genes from cluster groups and write them into a separate table

clusterIDsmatrix <- as.matrix (clusterIDs)
library(biomaRt)
clusterIDsentrez <- getBM(attributes = c("entrezgene", "ensembl_gene_id"),
                         filters = "ensembl_gene_id",
                         values = clusterIDsmatrix,
                         mart=aedesDB)

#How to make an excel sheet! 
write.table(clusterIDsentrez, "Cluster1entrezIDs.xls", sep="\t", quote=FALSE)