#Install BioConductor packages goseq, GO.db, org.Hs.eg.db using the biocLite.R installation script.
source("http://bioconductor.org/biocLite.R")


#biocLite("gage")
biocLite("pathview")

#Load goseq library
library(goseq)

#Load GO.db library
library(KEGG.db)

#Load org.Hs.eg.db library
library(org.Hs.eg.db)

#Load gage library
#library(gage)

#Load pathview library
library(pathview)

#Using R on the laptop              
#setwd("C:/Users/Owner/Documents/RCCG_HMS/RC_Biostatistics_Courses/Biostats_RNA-Seq/Supervised/Class_6")
#setwd("C:/Users/Tom/OneDrive/Documents/My Received Files/RCCG_HMS/RC_Biostatistics_Courses/Biostats_RNA-Seq/Supervised/Class_6")
setwd("C:/Users/Owner/Documents/WuXi_NextCODE_Genomics/HMS_MIT_Biostats_Course/Supervised/Class_3_6")
dir() 

#Get transcript lengths for entire background  - start by transposing the data frame before converting to a vector
#read table and name genesinput
genesinput=read.table("background_geneSymbol.txt",as.is=TRUE,header=FALSE) 

#Check 
head(genesinput) #genes symbols listed in column 1

#Transpose data frame and convert to vector
genes <- as.vector(t(genesinput)) #Please note that, if you want to have by columns you should use as.vector(as.matrix(test))-(t implicitly converts to a matrix). c(t(test)) works too.

#check
is.vector(genes) #worked

#calculate median transcript lengths
transcriptlengths = getlength(genes, "hg19", "geneSymbol")

#check
head(transcriptlengths) #worked!
tail(transcriptlengths) #worked!

#create a data frame with both vectors and with no column headings
df <- data.frame(genes,transcriptlengths)
head(df)

#write table - no column names
write.table(df, file="background_geneSymbol_transcript_length.txt", sep="\t", quote=F, col.names=F, row.names=F)

##Set up for GOseq analysis
#Run each cell list with different input file
#Name GOSeqInputFile 'GOSeqInputMDA2'
GOSeqInputMDA2 = read.table("outedgeR05_geneSymbol.txt")
#GOSeqInputMDA2 = read.table("outedgeR001_geneSymbol.txt")
#GOSeqInputMDA2 = read.table("outedgeR1e_06_geneSymbol.txt")

#Convert to a single named vector containing two pieces of gene information: Gene Symbol & 0 (normal) & 1 (abnormal expression) calls
MDA2.gene.vector=GOSeqInputMDA2[,2]
names(MDA2.gene.vector)=GOSeqInputMDA2[,1]

#Name TranscriptLengthInputFile 'GOSeqGeneSymbolTranscriptLength'
#GOSeqGeneSymbolTranscriptLength = read.table("background_geneSymbol_transcript_length.txt")
GOSeqGeneSymbolTranscriptLength = df

#Convert to a single named vector containing two pieces of gene information: Gene Symbol & Associated Transcript Lengths
transcript.lengths=GOSeqGeneSymbolTranscriptLength[,2]
names(transcript.lengths)=GOSeqGeneSymbolTranscriptLength[,1]

#Grab transcript length information and Fit the probability Weighting Function and plot the resulting fit
MDA2.pwf=nullp(MDA2.gene.vector,"hg19","geneSymbol", bias.data=transcript.lengths)

#Wallenius approximation
#The use_genes_without_cat=TRUE argument allows unannotated genes in the analysis 
MDA2.KEGG.wall=goseq(MDA2.pwf,"hg19","geneSymbol", test.cats="KEGG", use_genes_without_cat=FALSE)
#MDA2.KEGG.wall=goseq(MDA2.pwf,"hg19","geneSymbol", test.cats="KEGG", use_genes_without_cat=TRUE)

#hypergeometric 
#The use_genes_without_cat=TRUE argument allows unannotated genes in the analysis 
#MDA2.KEGG.wall=goseq(MDA2.pwf,"hg19","geneSymbol", test.cats="KEGG", use_genes_without_cat=FALSE, method="Hypergeometric")
#MDA2.KEGG.wall=goseq(MDA2.pwf,"hg19","geneSymbol", test.cats="KEGG", use_genes_without_cat=TRUE, method="Hypergeometric")

#Find KEGG to gene symbol matches, only need to do this once for the data set.
pwf=MDA2.gene.vector
gene2cat = getgo(names(pwf), "hg19","geneSymbol", fetch.cats=c("KEGG"))
names(gene2cat) = names(pwf)
tmp=unlist(gene2cat,use.names=FALSE)
names(tmp)=rep(names(gene2cat),times=as.numeric(summary(gene2cat)[,1]))
cat2gene=split(names(tmp),as.vector(tmp))   

#number of genes for each KEGG
lens=lapply(cat2gene, length)

#add adjusted p_value
data=cbind(MDA2.KEGG.wall[,c(1,2)], p.adjust(MDA2.KEGG.wall$over_represented_pvalue,method="BH"))

#query by adjusted p-value, get a smaller list
table=data[which(data[,3]<=0.05),]

#get and add all terms
table$KEGG_PATH_NAME <- unlist(mget(table$category, KEGGPATHID2NAME, ifnotfound=NA))

#get total number of genes for each KEGG
table$Gene_total <- unlist(lens[table$category])

#Get all abnormal expressed genes
abexpress=names(pwf[pwf==1]) 

#Get gene list of each KEGG, require the gene exist in the input file (the file with 1 and 0)
genes=lapply(table$category, function(x) intersect(unlist(cat2gene[x]), abexpress))

#Add the number of observed gene to table
table$SNV_Gene_KEGG_Term=unlist(lapply(genes, length))

#Convert the list of genes to list of concatenated gene in string format
glist=lapply(genes, function(x) paste(x, collapse = ', '))

#Add all the gene string to table
table$Gene_ids=unlist(glist)

#Change Column Names
colnames(table)=c("KEGG_Id", "pvalue", "BH_pvalue", "KEGG_Term", "KEGG_Term_Gene_Total","Gene_Count_KEGG_Term", "Gene_Ids")

#Reorder Column Names
table=table[,c(1,4,2,3,5,6,7)]

#Write 05 tables
write.table(table, file="edgeR_05_GOseq_KEGG_restrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_05_GOseq_KEGG_hypergeo_restrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_05_GOseq_KEGG_unrestrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_05_GOseq_KEGG_hypergeo_unrestrict.txt", sep="\t", quote=F, col.names=T, row.names=F)

#Write 001 tables
#write.table(table, file="edgeR_001_GOseq_KEGG_restrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_001_GOseq_KEGG_hypergeo_restrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_001_GOseq_KEGG_unrestrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_001_GOseq_KEGG_hypergeo_unrestrict.txt", sep="\t", quote=F, col.names=T, row.names=F)

#Write 1e_06 tables
#write.table(table, file="edgeR_1e_06_GOseq_KEGG_restrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_1e_06_GOseq_KEGG_hypergeo_restrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_1e_06_GOseq_KEGG_unrestrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_1e_06_GOseq_KEGG_hypergeo_unrestrict.txt", sep="\t", quote=F, col.names=T, row.names=F)

##Print enriched KEGG pathway with Pathview
#Read in the Entrez Gene Ids from the 0.05, 0.001, and 1e-06 analyses as data frames (read.table)
#Grab enriched KEGG pathway Ids from column 1 of the table "table" above
#The apply loop iterates over all KEGG pathways in the table
entrezids<-read.table("edgeR_tagwise_05_out_Entrez.txt")
pv.out.list<-sapply(table[,1], function(pid) pathview(gene.data=entrezids, pathway.id= pid, species = "hsa"))