#Install BioConductor packages goseq, GO.db, org.Hs.eg.db using the biocLite.R installation script.
# updated 31st Mar,2020
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
    
# Load the packages - - updated 31st Mar,2020
BiocManager::install("goseq")
BiocManager::install("GO.db")
BiocManager::install("org.Hs.eg.db")

#Load goseq library
library(goseq)

#Load GO.db library
library(GO.db)

#Load org.Hs.eg.db library
library(org.Hs.eg.db)

#Using R on the laptop              
#setwd("C:/Users/Tom/OneDrive/Documents/My Received Files/RCCG_HMS/RC_Biostatistics_Courses/Biostats_RNA-Seq/Supervised/Class_5")
setwd("C:/Users/Owner/Documents/WuXi_NextCODE_Genomics/HMS_MIT_Biostats_Course/Supervised/Class_2_5")
dir() 

#Get transcript lengths for entire background  - start by transposing the data frame before converting to a vector
#read table and name genesinput
genesinput=read.table("background_geneSymbol.txt",as.is=TRUE,header=FALSE) 

#Check 
head(genesinput) #genes symbols listed in column 1

#Transpose data frame and convert to vector
genes <- as.vector(t(genesinput)) #Please note that, if you want to have by columns you should use as.vector(as.matrix(test))-(t implicitly converts to a matrix). c(t(test)) works too.

#check
is.vector(genes) #worked

#calculate median transcript lengths
transcriptlengths = getlength(genes, "hg19", "geneSymbol")

#check
head(transcriptlengths) #worked!
tail(transcriptlengths) #worked!

#create a data frame with both vectors and with no column headings
df <- data.frame(genes,transcriptlengths)
head(df)

#write table - no column names
write.table(df, file="background_geneSymbol_transcript_length.txt", sep="\t", quote=F, col.names=F, row.names=F)

##Set up for GOseq analysis
#Run each cell list with different input file
#Name GOSeqInputFile 'GOSeqInputMDA2'
GOSeqInputMDA2 = read.table("outedgeR05_geneSymbol.txt")
#GOSeqInputMDA2 = read.table("outedgeR001_geneSymbol.txt")
#GOSeqInputMDA2 = read.table("outedgeR1e_06_geneSymbol.txt")

#Convert to a single named vector containing two pieces of gene information: Gene Symbol & 0 (normal) & 1 (abnormal expression) calls
MDA2.gene.vector=GOSeqInputMDA2[,2]
names(MDA2.gene.vector)=GOSeqInputMDA2[,1]

#Name TranscriptLengthInputFile 'GOSeqGeneSymbolTranscriptLength'
#GOSeqGeneSymbolTranscriptLength = read.table("background_geneSymbol_transcript_length.txt")
GOSeqGeneSymbolTranscriptLength = df

#Convert to a single named vector containing two pieces of gene information: Gene Symbol & Associated Transcript Lengths
transcript.lengths=GOSeqGeneSymbolTranscriptLength[,2]
names(transcript.lengths)=GOSeqGeneSymbolTranscriptLength[,1]

#Grab transcript length information and Fit the probability Weighting Function and plot the resulting fit
MDA2.pwf=nullp(MDA2.gene.vector,"hg19","geneSymbol", bias.data=transcript.lengths)

#Wallenius approximation
#The use_genes_without_cat=TRUE argument allows unannotated genes in the analysis 
MDA2.GO.wall=goseq(MDA2.pwf,"hg19","geneSymbol", use_genes_without_cat=FALSE)
#MDA2.GO.wall=goseq(MDA2.pwf,"hg19","geneSymbol", use_genes_without_cat=TRUE)

#hypergeometric 
#The use_genes_without_cat=TRUE argument allows unannotated genes in the analysis 
#MDA2.GO.wall=goseq(MDA2.pwf,"hg19","geneSymbol", use_genes_without_cat=FALSE, method="Hypergeometric")
#MDA2.GO.wall=goseq(MDA2.pwf,"hg19","geneSymbol", use_genes_without_cat=TRUE, method="Hypergeometric")

#Find GO to gene symbol matches, only need to do this once for the data set.
pwf=MDA2.gene.vector
gene2cat = getgo(names(pwf), "hg19","geneSymbol")
names(gene2cat) = names(pwf)
tmp=unlist(gene2cat,use.names=FALSE)
names(tmp)=rep(names(gene2cat),times=as.numeric(summary(gene2cat)[,1]))
cat2gene=split(names(tmp),as.vector(tmp))   

#number of genes for each GO
lens=lapply(cat2gene, length)

#add adjusted p_value
data=cbind(MDA2.GO.wall[,c(1,2)], p.adjust(MDA2.GO.wall$over_represented_pvalue,method="BH"))

#query by adjusted p-value, get a smaller list
table=data[which(data[,3]<=0.05),]

#get all terms
terms <- stack(lapply(mget(table$category, GOTERM, ifnotfound=NA), Term))

#add term
table$GO_Term <- with(table, terms$values[match(terms$ind, table$category)])

#get total number of genes for each GO
table$Gene_total <- unlist(lens[table$category])

#Get all abnormal expressed genes
abexpress=names(pwf[pwf==1]) 

#Get gene list of each GO, require the gene exist in the input file (the file with 1 and 0)
genes=lapply(table$category, function(x) intersect(unlist(cat2gene[x]), abexpress))

#Add the number of observed gene to table
table$Gene_GO_Term=unlist(lapply(genes, length))

#convert the list of genes to list of concatenated gene in string format
glist=lapply(genes, function(x) paste(x, collapse = ', '))

#All the gene string to table
table$Gene_ids=unlist(glist)

#Change Column Names
colnames(table)=c("GO_Term_Id", "pvalue", "BH_pvalue", "GO_Term", "GO_Term_Gene_Total","Gene_Count_GO_Term", "Gene_Ids")

#Reorder Column Names
table=table[,c(1,4,2,3,5,6,7)]

#Write 05 tables
write.table(table, file="edgeR_05_GOseq_GO_restrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_05_GOseq_GO_hypergeo_restrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_05_GOseq_GO_unrestrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_05_GOseq_GO_hypergeo_unrestrict.txt", sep="\t", quote=F, col.names=T, row.names=F)

#Write 001 tables
#write.table(table, file="edgeR_001_GOseq_GO_restrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_001_GOseq_GO_hypergeo_restrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_001_GOseq_GO_unrestrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_001_GOseq_GO_hypergeo_unrestrict.txt", sep="\t", quote=F, col.names=T, row.names=F)

#Write 1e_06 tables
#write.table(table, file="edgeR_1e_06_GOseq_GO_restrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_1e_06_GOseq_GO_hypergeo_restrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_1e_06_GOseq_GO_unrestrict.txt", sep="\t", quote=F, col.names=T, row.names=F)
#write.table(table, file="edgeR_1e_06_GOseq_GO_hypergeo_unrestrict.txt", sep="\t", quote=F, col.names=T, row.names=F)