## edgeR differential expression analyses of RNA-seq data
## Based on the edgeR R Tutorial by Sean Ruddy, 2011
## Based on "edgeR: differential expression analysis of digital gene
## expression data" User's Guide. Yunshun Chen, Davis McCarthy, Mark Robinson,
## Gordon K. Smyth, 2014.
## Citation: Robinson MD, McCarthy DJ and Smyth GK (2010). "edgeR: a
## Bioconductor package for differential expression analysis of digital gene
## expression data." Bioinformatics, 26(1), pp. 139-140.

## Working on O2 ----
## NOTE: the indented commands in this section are shell commands to be typed
## in a terminal before R is started. They are kept as comments so that this
## file remains a valid R script.

# Log in to O2, with two-factor authentication when outside the HMS private
# network
#   ssh -XY ecommonas@o2.hms.harvard.edu

# Work on O2 interactively
# Interactive jobs - max of 12 hours
#   srun --pty -p interactive -t 0-12:00 --mem 8G bash
# You will then be assigned a compute node, which will allow you to work
# interactively for up to 12 hours.

# Interactive jobs - max of 12 hours & 32G of memory
#   srun --pty -p interactive -t 0-12:00 --mem 32G bash

# Setting up a personal R library on O2
#   mfk8@login00:~$ mkdir -p ~/R/library
#   mfk8@login00:~$ echo 'R_LIBS_USER="~/R/library"' > $HOME/.Renviron
#   mrk8@login00:~$ export R_LIBS_USER="/home/user123/R/library"

# Set working directory - example
#   cd /your/working/Directory/

# Directly set the R version
#   module load gcc/6.2.0 R/3.5.1-extra

# Launch R
#   R

## Or working on a Mac
## setwd("/your/working/Directory/")
dir()

#### Class 1-4 - Assessment of Differential Expression of RNA-seq Data with edgeR ####

# Install edgeR - updated 31st Mar,2020
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
# updated 31st Mar,2020
# Install edgeR only when it is missing, so that re-running the script does
# not reinstall the package every time.
if (!requireNamespace("edgeR", quietly = TRUE)) {
  BiocManager::install("edgeR")
}

# Load the edgeR package
library(edgeR)

# Set working directory (machine-specific path: edit it to point at the folder
# that holds the counts file before running)
#setwd("C:/Users/Owner/Documents/RCCG_HMS/RC_Biostatistics_Courses/Biostats_RNA-Seq/Supervised/Class_4")
#setwd("C:/Users/Tom/OneDrive/Documents/My Received Files/RCCG_HMS/RC_Biostatistics_Courses/Biostats_RNA-Seq/Supervised/Class_4")
setwd("C:/Users/Owner/Documents/WuXi_NextCODE_Genomics/HMS_MIT_Biostats_Course/Supervised/Class_1_4")
dir()

# Read in the entire counts data .txt file (first column becomes the row names)
x <- read.delim("TNBC10vNormal10_Raw_2.txt", row.names = 1,
                stringsAsFactors = FALSE)
head(x)
tail(x)
dim(x)

# Create groups: the first 10 count columns are labelled TNBC and the next 10
# Normal, so the column order of x must match this layout
group <- c(rep("TNBC", 10), rep("Normal", 10))
# Check
group

## Building the edgeR object
# Create a DGEList object to work on, where the columns are the counts.
# Put the counts and other information into a DGEList object.
#colnames(y) <- targets$Label
y <- DGEList(counts = x[, 1:20], group = group)
dim(y)
#[1] 20531 20

# Filter out low count reads since it would be impossible to detect
# differential expression. The method used in the edgeR vignette is to keep
# only those genes that have at least 1 read per million in at least 10
# samples. Therefore, filter out very lowly expressed tags, keeping genes that
# are expressed at a reasonable level in at least one condition. Since the
# group sizes are 10, keep genes that achieve at least one count per million
# (cpm) in 10 samples.
# Once this filtering is done, calculate the normalization factors which
# correct for the different compositions of the samples. The effective library
# sizes are then the product of the actual library sizes and these factors.
keep <- rowSums(cpm(y) > 1) >= 10
y <- y[keep, ]
dim(y)
#[1] 14332 20

# Re-compute the library sizes from the counts that survived the filter
y$samples$lib.size <- colSums(y$counts)
# Check
y$samples

## TMM Normalization
# Compute effective library sizes using Trimmed Mean of M Values (TMM)
# normalization.
# This accounts for differences in transcriptome sizes.
y <- calcNormFactors(y)
y$samples
# Effective library sizes
y$samples$lib.size * y$samples$norm.factors

## Data exploration
# Generate a Multi-dimensional Scaling (MDS) plot.
# An MDS plot measures the similarity of the samples and projects this measure
# into 2 dimensions.
# An MDS plot shows distances, in terms of biological coefficient of variation
# (BCV), between samples.
# Dimension 1 separates the Normal from the TNBC samples. The MDS plot shows
# the replicates are consistent.
# This is a good indication edgeR analysis will produce adequate DE genes.
# To view the MDS plot (samples are coloured by group)
plotMDS(y, col = as.numeric(as.factor(group)),
        main = "MDS Plot for TMM Normalized Count Data")
?cmdscale

# Write the plot as a pdf
pdf("MDS_Plot.pdf", width = 11, height = 7) # in inches
plotMDS(y, col = as.numeric(as.factor(group)),
        main = "MDS Plot for TMM Normalized Count Data")
dev.off() # this tells R to close and stop writing to the pdf

## Estimating common and tagwise dispersion
# The common dispersion estimates the overall BCV of the dataset, averaged
# over all genes, with the qCML method
y <- estimateCommonDisp(y, verbose = TRUE)
#Disp = 0.39294 , BCV = 0.6268
# BCV (square root of the common dispersion) is ~63%. BCV for a laboratory
# experiment with cell lines or model organisms is typically 10-15%.

# Now estimate gene-specific (tagwise) dispersions with the qCML method
y <- estimateTagwiseDisp(y)
summary(y$tagwise.dispersion)
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#0.1097  0.1948  0.2763  0.3864  0.4544  4.4520

# Plot the estimated dispersions
# In general, BCV is higher with lower average log CPM
# To view the estimated dispersions plot
plotBCV(y, main = "Plot of Estimated Dispersions")

# Write the plot as a pdf
pdf("Est_Dispersions_Plot.pdf", width = 11, height = 7) # in inches
plotBCV(y, main = "Plot of Estimated Dispersions")
dev.off() # this tells R to close and stop writing to the pdf

## Differential expression
# Compute NB exact genewise tests for differential expression between TNBC
# and Normal samples
et <- exactTest(y)
options(digits = 3) # print only 3 digits (session-wide setting)
top <- topTags(et)
top

# Check the individual cpm values for the top genes
# Check up/down expression to TNBC/Normal *VERY IMPORTANT*
cpm(y)[rownames(top), ]

# Print the total number of DE genes at 5% FDR; the test result is also kept
# in `de` for the smear plot
summary(de <- decideTestsDGE(et))
#   [,1]
#-1 2290
#0  9019
#1  3023
## 14,332 total genes after filtering for low expression
## Of the 5313 genes identified as DE, 3023 are up-regulated in TNBC samples
## and 2290 are down-regulated.
# Plot the log-fold-changes with a smear plot, highlighting the DE genes
# Blue lines indicate 2-fold changes
# To view the smear plot
detags <- rownames(y)[as.logical(de)] # genes with a non-zero call in `de`
plotSmear(et, de.tags = detags, main = "Smear Plot")
abline(h = c(-1, 1), col = "blue")

# Write the plot as a pdf
pdf("Smear_Plot.pdf", width = 7, height = 7) # in inches
plotSmear(et, de.tags = detags, main = "Smear Plot")
abline(h = c(-1, 1), col = "blue")
dev.off() # this tells R to close and stop writing to the pdf

# Take a look at the smear plot function
?plotSmear

# Create an object called "resultsTbl.tagwise" containing the four output
# parameters: logFC, logCPM, PValue, FDR (one row per tested gene)
resultsTbl.tagwise <- topTags(et, n = nrow(et$table))$table

# Check resultsTbl.tagwise
head(resultsTbl.tagwise)
dim(resultsTbl.tagwise)

# Generate objects according to three significance levels for tagwise
# dispersion estimates: 0.05, 0.001, 1e-06
# These will be used for GOseq analyses in classes 5 and 6.

# Names/IDs of DE genes
de.genes.tagwise_0.05 <-
  rownames(resultsTbl.tagwise)[resultsTbl.tagwise$FDR <= 0.05]
de.genes.tagwise_0.001 <-
  rownames(resultsTbl.tagwise)[resultsTbl.tagwise$FDR <= 0.001]
de.genes.tagwise_1e_06 <-
  rownames(resultsTbl.tagwise)[resultsTbl.tagwise$FDR <= 1e-06]

# Check the number of genes at each significance level
length(de.genes.tagwise_0.05)  #5313 DE genes
length(de.genes.tagwise_0.001) #2398 DE genes
length(de.genes.tagwise_1e_06) #944 DE genes

# Check the percentage of total genes
length(de.genes.tagwise_0.05) / nrow(resultsTbl.tagwise) * 100  #37.07089
length(de.genes.tagwise_0.001) / nrow(resultsTbl.tagwise) * 100 #16.73179
length(de.genes.tagwise_1e_06) / nrow(resultsTbl.tagwise) * 100 #6.586659

# Check Up/Down regulated summaries for tagwise results
# NOTE: each call below overwrites `de`, so after this point `de` holds the
# calls at the most recent threshold rather than the 5% FDR calls.
summary(de <- decideTestsDGE(et, p.value = 0.001))
#    [,1]
#-1  1179
#0  11934
#1   1219
summary(de <- decideTestsDGE(et, p.value = 1e-06))
#    [,1]
#-1   535
#0  13388
#1    409

# Check de.genes.tagwise_1e_06
head(de.genes.tagwise_1e_06)
# Note de.genes.tagwise_1e_06 is a vector holding the names/IDs of the genes
# with FDR <= 1e-06
# Double check
# Confirm that the 1e-06 gene list is a plain vector
is.vector(de.genes.tagwise_1e_06)
#[1] TRUE

## Format results and count data from the DGEList object (y)
colnames(resultsTbl.tagwise) <- c("logFC", "logCPM", "PValue", "FDR")
# Row positions in y$counts for each gene, in the row order of the results
# table, so the per-gene columns added below line up with it
wh.rows.tagwise <- match(rownames(resultsTbl.tagwise), rownames(y$counts))

# Check for all 14332 genes
length(wh.rows.tagwise)

# Combine exact test results, count data from the DGEList object (y) and
# tagwise dispersion
combResults.tagwise <- cbind(
  resultsTbl.tagwise,
  "Tgw.Disp" = y$tagwise.dispersion[wh.rows.tagwise],
  "UpDown.Tgw" = decideTestsDGE(et, p.value = 0.05)[wh.rows.tagwise],
  y$counts[wh.rows.tagwise, ]
)

# Check combResults.tagwise
head(combResults.tagwise)
dim(combResults.tagwise)

# Create significance level objects from the original counts data (x) and the
# FDR vectors: keep the rows of x whose names are in each DE gene list
de.tag_05 <- x[which(row.names(x) %in% (de.genes.tagwise_0.05)), ]
de.tag_001 <- x[which(row.names(x) %in% (de.genes.tagwise_0.001)), ]
de.tag_1e_06 <- x[which(row.names(x) %in% (de.genes.tagwise_1e_06)), ]

# Check
head(de.tag_05)
dim(de.tag_05)
head(de.tag_001)
dim(de.tag_001)
head(de.tag_1e_06)
dim(de.tag_1e_06)

# Check again
head(combResults.tagwise)
head(resultsTbl.tagwise)

## Merge all output into FDR-specific objects
# Only the first six columns of combResults.tagwise (logFC, logCPM, PValue,
# FDR, Tgw.Disp, UpDown.Tgw) are merged; all remaining columns come from x.
combResults.tagwise_05 <- merge(combResults.tagwise[, 1:6], de.tag_05,
                                by = "row.names")
head(combResults.tagwise_05)
# Almost there. Need to rename the first column and move the Ensembl and
# Entrez columns to the front. The Ensembl and Entrez columns are there as a
# double check.

# Rename column 1 as "HGNC"
colnames(combResults.tagwise_05)[1] <- "HGNC"
head(combResults.tagwise_05)
dim(combResults.tagwise_05)

# Create the edgeR_05 output object and reorder columns.
# NOTE: the hard-coded indices rely on the merged table having exactly 29
# columns, with 28:29 being the Ensembl and Entrez columns mentioned above;
# confirm with dim()/head() if the input file layout changes.
edgeR_05 <- cbind(combResults.tagwise_05[, 28:29],
                  combResults.tagwise_05[, 1:27])
head(edgeR_05)
dim(edgeR_05)

# Repeat for FDR of 0.001
combResults.tagwise_001 <- merge(combResults.tagwise[, 1:6], de.tag_001,
                                 by = "row.names")
head(combResults.tagwise_001)

# Rename column 1 as "HGNC"
colnames(combResults.tagwise_001)[1] <- "HGNC"
head(combResults.tagwise_001)
dim(combResults.tagwise_001)

# Create the edgeR_001 output object and reorder columns
edgeR_001 <- cbind(combResults.tagwise_001[, 28:29],
                   combResults.tagwise_001[, 1:27])
head(edgeR_001)
dim(edgeR_001)

# Repeat for FDR of 1e-06
combResults.tagwise_1e_06 <- merge(combResults.tagwise[, 1:6], de.tag_1e_06,
                                   by = "row.names")
head(combResults.tagwise_1e_06)

# Rename column 1 as "HGNC"
colnames(combResults.tagwise_1e_06)[1] <- "HGNC"
head(combResults.tagwise_1e_06)
dim(combResults.tagwise_1e_06)

# Create the edgeR_1e_06 output object and reorder columns
edgeR_1e_06 <- cbind(combResults.tagwise_1e_06[, 28:29],
                     combResults.tagwise_1e_06[, 1:27])
head(edgeR_1e_06)
dim(edgeR_1e_06)

# Sort results by FDR as a check for row names.
# After the reorder above, column 7 is FDR (two ID columns, HGNC, logFC,
# logCPM, PValue, FDR).
edgeR_05_out <- edgeR_05[order(edgeR_05[, 7], decreasing = FALSE), ]
head(edgeR_05_out)
edgeR_001_out <- edgeR_001[order(edgeR_001[, 7], decreasing = FALSE), ]
head(edgeR_001_out)
edgeR_1e_06_out <- edgeR_1e_06[order(edgeR_1e_06[, 7], decreasing = FALSE), ]
head(edgeR_1e_06_out)

# Write all three edgeR FDR-specific objects to .txt files - row.names=FALSE
# to maintain appropriate column order
write.table(edgeR_05_out, file = "edgeR_tagwise_05_output.txt",
            row.names = FALSE, sep = "\t")
write.table(edgeR_001_out, file = "edgeR_tagwise_001_output.txt",
            row.names = FALSE, sep = "\t")
write.table(edgeR_1e_06_out, file = "edgeR_tagwise_1e_06_output.txt",
            row.names = FALSE, sep = "\t")