##### R CODE FOR PERMUATION TESTING OF PHOSPHOTRAP DATA #####
# Eva K Fischer
# May 2019


library(DESeq2)
library(edgeR)


### read in counts, TPM normalized values (an output from Kallisto), sample info
counts <- read.csv(file="trap.counts.csv", row.names=1)
tpm <- read.csv(file="trap_TPM.csv", row.names=1)
samps <- read.csv(file="samples.csv")


### keep only samples that have tpm values above desired cutoff
tpm <- tpm[rowSums(tpm) >= 1,]
keep <- rownames(tpm)
counts <- counts[which(rownames(counts) %in% keep),]


### normalize
# using DESeq method (RLE)
norm_factor=estimateSizeFactorsForMatrix(counts)
norm_counts <-mapply("/",as.data.frame(counts),norm_factor)
rownames(norm_counts) <- rownames(counts)


### calculate relative expression ratios for ip vs total
# this step is a problem because you always have an issue of zero division. initially i added one to all counts to deal with this problem. instead, i'm now only replacing all 0's with 1's. still not perfect, but i think it creates slightly less bias (e.g. consider for example a gene where ip=1, total=0)

# replace all 0's with 1's to deal with division problems
norm_counts <- replace(norm_counts, norm_counts == 0, 1)

# split into ip vs total
counts_ip <- norm_counts[,c(1,3,5,7,9,11)]
counts_t <- norm_counts[,c(2,4,6,8,10,12)]

# divide ip by total to get fold expression changes, change column names to be sample names
ratios <- counts_ip/counts_t
colnames(ratios) <- gsub("_IP", "", colnames(ratios), fixed=TRUE)
logratios <- log2(ratios)


# divide transport by control ratios to get 'differential fold enrichment' 
means <- data.frame(control=rowMeans(ratios[,1:3]), transport=rowMeans(ratios[,4:6]))
diffratios <- data.frame(row.names=rownames(means),diffratio=means$transport/means$control)
logdiffratios <- log2(diffratios)
names(logdiffratios) <- "logdiffratio"
logdiffratios <- logdiffratios[order(logdiffratios$logdiffratio),,drop=FALSE]

# write out ratio & logratio data
write.csv(ratios, file="ratios.csv")
write.csv(logratios, file="logratios.csv")
write.csv(diffratios, file="diffratios.csv")
write.csv(logdiffratios, file="logdiffratios.csv")


### t tests to compare ratios between control and transport
# make sure columns and groups line up below by sorting
logratios <- logratios[,order(colnames(logratios))]
group <- group[sort(group$fishID),]

# create empty dataframe to populate
out <- data.frame(contig=character(), t.value=numeric(), p.value=numeric(), stringsAsFactors=FALSE)

# for loop to run t test on each contig and put results into data frame
for (contig in (1:length(rownames(logratios))))
{
temp <- as.data.frame((logratios[contig,]))
temp$group <- group$group
colnames(temp) <- c("logratio", "group")

test <- t.test(logratio~group, data=temp, mu=0, var.equal=TRUE)

out[contig,] <- c((rownames(logratios)[contig]), test$statistic, test$p.value)
}

# fdr correction for pvalues
out$p.adj <- p.adjust(out$p.value, method="BH")

# how many genes survive fdr correction?
table(out$padj<0.05)

# write out t-test results
write.csv(out, file="ttests.csv", row.names=FALSE)


### permutations for call DE genes. permutations themselves are in separate code (see below). here uploading the output from that code to compare above p-values to. 
tests <- read.csv(file="ttests.csv", row.names=1)
perms <- read.csv(file="permutation_tval_percentiles.csv", row.names=1)

merge <- merge(tests, perms, by="row.names")
merge$de <- ifelse(abs(merge$t.value) > merge$p99, "DE", "notDE")

# how many DE?
table(merge$de)

write.csv(merge, file="perms_DE.csv")


### THIS PART OF THE CODE IS USED TO GENERATE DISTRIBUTIONS OF PERMUTED DATA. IT IS COMPUTATIONALLY HEAVY AND BEST RUN ON A CLUSTER.


### read in log ratios and sample info
logratios <- read.csv(file="logratios.csv", row.names=1)
samples <- read.csv(file="samples.csv")


### create permuted lists of samples
# number of permutations to do?
n=250

# create empty data frame to put permuted sample info into
permSamples <- data.frame(set=integer(), sample=factor(), group=factor())

# fill data frame with permuted sample lists
for (i in 1:n)
{
	temp <- samples
	perm <- sample(samples$group, length(samples$sample), replace=FALSE)
	temp$group <- perm
	temp$set <- i
	
	permSamples = rbind(permSamples, temp)
}


### permutations tests

# create empty data frame to put results into
permResults <- data.frame(contig=character(), set=integer(), t.value=numeric(), p.value=numeric())

# run permutation tests!
for (i in 1:n)
{
	tempSamples <- permSamples[which(permSamples$set==i),]
	out <- data.frame(contig=character(), set=integer(), t.value=numeric(), p.value=numeric(), stringsAsFactors=FALSE)
	
	for (contig in (1:length(rownames(logratios))))
	{
		temp <- as.data.frame(t(logratios[contig,]))
		temp$group <- tempSamples$group
		colnames(temp) <- c("logratio", "group")

		test <- t.test(logratio~group, data=temp, mu=0, var.equal=TRUE)

		out[contig,] <- c((rownames(logratios)[contig]), i, test$statistic, test$p.value)
	}
	
	permResults <- rbind(permResults, out)
		
}

# not sure why, but it changes t- and p-values to character, so had to force them back
permResults$t.value <- as.numeric(permResults$t.value)
permResults$p.value <- as.numeric(permResults$p.value)


### calculate percentile t-values for permuted datasets

# reformat data
perm <- permResults[,c(1,2,3)]
perm <- reshape(perm, timevar="set", idvar="contig", direction="wide")

percPerm <- as.data.frame(t(apply(perm[,2:length(colnames(perm))], 1, function(x) quantile(x, probs=c(0.01,0.05,0.1,0.9,0.95,0.99), na.rm=TRUE))))
rownames(percPerm) <- perm$contig
colnames(percPerm) <- c("p01","p05","p10","p90","p95","p99")


write.csv(percPerm, "permutation_tval_percentiles.csv")