# Author: Jonathan Heiss
# Date: 5/11/18

# Requires ~22GB of disk space and ~80GB of RAM

dir.create("detectionP")
setwd("detectionP")

library(stringi)
library(data.table)
library(magrittr)
library(ggplot2)
library(cowplot)
library(purrr)
library(car)
library(utils)

if( packageDescription("ewastools")$RemoteSha != "eebedc95fc71fb0131c1831086794be86b538067")
  devtools::install_github("hhhh5/ewastools@eebedc95fc71fb0131c1831086794be86b538067")
library(ewastools)

#-------------------------- detection p-values, the old way

detectionP.neg <- function(raw){

  if(!all(c('manifest','controls','ctrlG','ctrlR','M','U','oobG','oobR')%in%names(raw))) stop('Invalid argument') 
  
  with(raw,{

    bkgR = bkgG = controls[group=='NEGATIVE',index] 

    bkgR = ctrlR[bkgR,,drop=FALSE]
    bkgG = ctrlG[bkgG,,drop=FALSE]

    muG = apply(bkgG,2,median,na.rm=TRUE)
    sdG = apply(bkgG,2,mad   ,na.rm=TRUE)

    muR = apply(bkgR,2,median,na.rm=TRUE) 
    sdR = apply(bkgR,2,mad   ,na.rm=TRUE) 

    detP = matrix(NA_real_,nrow=nrow(U),ncol=ncol(U))

    i = manifest[channel=='Red' ,index] 
        for(j in 1:ncol(M))  
            detP[i,j] = pnorm(U[i,j]+M[i,j],mean=2*muR[j],sd=sqrt(2)*sdR[j],log.p=TRUE,lower.tail=FALSE) 
     
    i = manifest[channel=='Grn' ,index] 
        for(j in 1:ncol(M))  
            detP[i,j] = pnorm(U[i,j]+M[i,j],mean=2*muG[j],sd=sqrt(2)*sdG[j],log.p=TRUE,lower.tail=FALSE) 
     
    i = manifest[channel=='Both',index] 
        for(j in 1:ncol(M)) 
            detP[i,j] = pnorm(U[i,j]+M[i,j],mean=muR[j]+muG[j],sd=sqrt(sdR[j]^2+sdG[j]^2),log.p=TRUE,lower.tail=FALSE) 

    detP/log(10)

  })
}

#-------------------------- Fetch the data

### Select datasets by GSE accession 
selection = c(60655,61496,63106,65163,66459,69502,72120,74432,75196,75248,85042,85566,86961,87571,89251,90871,97362,102177)
selection = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE" %s+% selection %s+% "&targ=gsm&form=text&view=brief"
selection %<>% map(readLines) %>% unlist
selection = split( selection, cumsum(selection %like% "^\\^SAMPLE = GSM") )

names(selection) =  map(selection,1) %>% stri_match_first(regex="GSM\\d+")

selection %<>% imap(function(s,acc){
  s = strsplit(s,split=" = ",fixed=TRUE)  
  data.table(gsm=acc,variable=map_chr(s,1),value=map_chr(s,2))
})

selection %<>% rbindlist

### parse metadata
selection = selection[variable %chin% c("!Sample_characteristics_ch1","!Sample_supplementary_file","!Sample_series_id","!")]
i = selection[variable == "!Sample_characteristics_ch1",which=TRUE]
ch = selection$value[i] %>% stri_split(fixed=": ")
selection$variable[i] = map_chr(ch,1)
selection$value   [i] = map_chr(ch,2)
rm(ch,i)

selection[variable=="sex, 1=m, 2=f",value:=ifelse(value==1,"m","f")]
selection[variable %like% "[Ss]ex|[Gg]ender",variable:="sex"]
selection[variable=="sex",value:=recode(value,"c('Male','male','M')='m'; c('Female','female','F')='f'")]

selection[variable == "!Sample_supplementary_file" & value %like% "_Red\\.idat",variable:="red"]
selection[variable == "!Sample_supplementary_file" & value %like% "_Grn\\.idat",variable:="grn"]
selection[variable == "!Sample_series_id",variable:="gse"]

### keep `pair id` and `birth-weight` for the dataset of monozygotic twins
selection = selection[variable %in% c("sex","gse","red","grn","pair id","birth-weight")]
selection = dcast(selection, gsm ~ variable,paste0,collapse=";")

selection = selection[!is.na(sex)]

selection$sex %<>% factor(levels=c("m","f"))

### download the .idat files
map2(selection$red, selection$gsm %s+% "_Red.idat.gz", ~ download.file(.x,.y,quiet=TRUE) ) %>% invisible
map2(selection$grn, selection$gsm %s+% "_Grn.idat.gz", ~ download.file(.x,.y,quiet=TRUE) ) %>% invisible

### check whether all files were successfully downloaded
selection$gsm %s+% "_Grn.idat.gz" %>% file.exists %>% stopifnot
selection$gsm %s+% "_Red.idat.gz" %>% file.exists %>% stopifnot
selection$red = NULL; selection$grn = NULL

raw = read_idats(selection$gsm,quiet=FALSE)
raw$T = NULL
raw$S = NULL

#-------------------------- Quality control

### Remove samples failing according to control probes
selection$failed = control_metrics(raw) %>% sample_failure

### Dye-bias correction using RELIC
dye = correct_dye_bias(raw)

### Check for mislabeled samples
selection[,c("X","Y"):=check_sex(dye)]
selection[,predicted_sex:=predict_sex(X,Y,which(sex=="m"),which(sex=="f"))]

### Check for problematic samples (outliers or imprecise measurements for SNP probes, can indicate contamination)
SNPs = dye$manifest[probe_type=="rs"]$index
selection$contaminated = dont_normalize(dye) %>% extract(SNPs,) %>% call_genotypes %>% snp_outliers
rm(SNPs)

### Keep only samples that passed all checks
keep = selection[ !failed & sex==predicted_sex & contaminated < -4,which=TRUE]
drop = setdiff(1:nrow(selection),keep)

selection = selection[keep]
raw = drop_samples(raw,j=drop)
dye = drop_samples(dye,j=drop)
rm(drop,keep)

#-------------------------- Detection p-values
table(selection$sex)

### column indices of male and female samples
  males = selection[sex=="m",which=TRUE]
females = selection[sex=="f",which=TRUE]

### row indices of Y chromosome probes
Y = raw$manifest[chr=='Y']$index

### Lists to store the results
call_rates = list()
counts = list()

### Detection p-values using negative control probes
detP.neg = detectionP.neg(raw)

### Determine for various cutoff the number of detected probes, for males and females separately
cutoffs = -(0:80)
tmp = sapply(cutoffs,function(t){ colSums( detP.neg[Y,] < t,na.rm=TRUE) })
m = apply(tmp[  males,],2,quantile,prob=c(.025,.5,.975)) # median and 95% CI
f = apply(tmp[females,],2,quantile,prob=c(.025,.5,.975))

### Arrange data in data.table for use in ggplot
tmp = data.table(
  cutoff=-cutoffs
  ,sex=rep(c("m","f"),each=81)
  ,l=c(m[1,],f[1,])
  ,m=c(m[2,],f[2,])
  ,u=c(m[3,],f[3,])
)

t0.05 = log10(0.05)

### Generate figure 1A
p1 = 
( ggplot(tmp,aes(x=cutoff,y=m))
+ geom_ribbon(aes(ymin=l,ymax=u,fill=sex),show.legend=FALSE,alpha=0.5)
+ geom_line(aes(group=sex,color=sex),show.legend=FALSE)
+ geom_vline(xintercept=16,linetype=3)
+ geom_hline(yintercept=32,linetype=3)
+ scale_colour_manual(values=c("f"="red","m"="black"))
+ scale_fill_manual(values=c("f"="red","m"="black"))
+ scale_y_continuous(name="# detected Y-chromosome probes",breaks=c(0,32,100,200,300,400,416),limits=c(0,416))
+ scale_x_continuous(limits=c(0,80),breaks=c(0,16,40,60,80))
+ xlab(expression(-log[10]~p~cut-off))
+ ggtitle("Negative controls probes")
+ theme(panel.background=element_rect(fill="white",color=1),
        panel.border=element_rect(color=1,fill=NA,linetype=1,size=1),
        axis.text=element_text(color=1))
)

### How many probes are declared undetected in total (not counting Y-chromosome probes) for each sample?
counts$neg    = colSums(detP.neg[-Y,] > t0.05,na.rm=TRUE)
counts$neg16  = colSums(detP.neg[-Y,] >   -16,na.rm=TRUE)
counts$neg117 = colSums(detP.neg[-Y,] >  -117,na.rm=TRUE)

### What is the call rate among females for every Y chromosome probe?
(detP.neg[Y,females] <= t0.05) %>% apply(.,1,sum,na.rm=TRUE) -> call_rates$neg
(detP.neg[Y,females] <=   -16) %>% apply(.,1,sum,na.rm=TRUE) -> call_rates$neg16
(detP.neg[Y,females] <=  -117) %>% apply(.,1,sum,na.rm=TRUE) -> call_rates$neg117

rm(raw); gc()

### Detection p-values using out-of-band intensities
dye = ewastools::detectionP(dye)

cutoffs = seq(0,-3,by=-0.05)
cutoffs = c(10^cutoffs,c(1,0.5,0.1,0.05,0.01,0.001))
cutoffs %<>% unique %>% sort %>% rev

tmp = sapply(cutoffs,function(t){ colSums( dye$detP[Y,] < t,na.rm=TRUE) })
m = apply(tmp[  males,],2,quantile,prob=c(.025,.5,.975)) # median and 95% CI
f = apply(tmp[females,],2,quantile,prob=c(.025,.5,.975))

tmp = data.table(
  cutoff=cutoffs
  ,sex=rep(c("m","f"),each=length(cutoffs))
  ,l=c(m[1,],f[1,])
  ,m=c(m[2,],f[2,])
  ,u=c(m[3,],f[3,])
)

ticks = c(1,0.5,0.1,0.05,0.01,0.001)

### Generate figure 1B
p2 = 
( ggplot(tmp,aes(x=-log10(cutoff),y=m))
+ geom_ribbon(aes(ymin=l,ymax=u,fill=sex),show.legend=FALSE,alpha=0.5)
+ geom_line(aes(group=sex,color=sex),show.legend=FALSE)
+ geom_vline(xintercept=-log10(0.05),linetype=3)
+ geom_hline(yintercept=32,linetype=3)
+ scale_colour_manual(values=c("f"="red","m"="black"))
+ scale_fill_manual(values=c("f"="red","m"="black"))
+ scale_y_continuous(name="",breaks=c(0,32,100,200,300,400,416),limits=c(0,416))
+ scale_x_continuous(name="p cut-off",breaks=-log10(ticks),labels=ticks,limits=c(0,3))
+ annotate("text",x=2,y=300,label="Male"  ,color="black",size=5)
+ annotate("text",x=2,y= 80,label="Female",color="red"  ,size=5)
+ ggtitle("Out-of-band intensities")
+ theme(panel.background=element_rect(fill="white",color=1),
        panel.border=element_rect(color=1,fill=NA,linetype=1,size=1),
        axis.text=element_text(color=1))
)

p = plot_grid(p1,p2,labels=c("A","B"),ncol=2)
save_plot(filename="fig1.png",plot=p,base_width=10,base_height=5)
rm(p,p1,p2,cutoffs)

counts$oob  = colSums(dye$detP[-Y,] > 0.05,na.rm=TRUE)

(dye$detP[Y,females] <= 0.05) %>% apply(.,1,sum,na.rm=TRUE) -> call_rates$oob

rm(m,f)

### Generate figure 2
( ggplot()
+ geom_hline(yintercept=c(0,100),linetype=3)
+ geom_vline(xintercept=c(0,416),linetype=3)
+ geom_line(aes(x=1:416,y=call_rates$neg    %>%sort %>% divide_by(13.58)),color="black")
+ geom_line(aes(x=1:416,y=call_rates$neg117 %>%sort %>% divide_by(13.58)),color="black")
+ geom_line(aes(x=1:416,y=call_rates$oob    %>%sort %>% divide_by(13.58)),color="red"  )
+ annotate("text",x=120 ,y=30,label="Negative\np=0.05",color="black")
+ annotate("text",x=320,y=15,label="Negative\np=1e-117",color="black")
+ annotate("text",x=320,y=30,label="OOB\np=0.05",color="red")
+ xlab("Index")
+ ylab("Call rate (%)")
+ theme(panel.background=element_rect(fill="white",color=1),
          panel.border=element_rect(color=1,fill=NA,linetype=1,size=1),
          axis.text=element_text(color=1))
)

ggplot2::ggsave("fig2.png",height=5,width=5)
rm(dye,ticks)

lapply(counts,median)
# $neg
# [1] 64

# $neg16
# [1] 331.5

# $neg117
# [1] 4007

# $oob
# [1] 3780


#----------------------------------------------------------------------

### Are the dropped data less accurate? Check the technical replicates from the monozygotic twin dataset whether they show larger differences between duplicates
twins = copy(selection[gse%like%"GSE61496"]) # Dataset of monozygotic twins
twins = twins[,list(gsm,n=.N,g=1:.N),keyby=list(`pair id`,`birth-weight`)][n>1 & g %in% 1:2]

### indices of technical replicates
rep1 = seq(1,12,2) # index samples
rep2 = seq(2,12,2) # replicates

raw = read_idats(twins$gsm,quiet=TRUE)
detP.neg = detectionP.neg(raw)

dye = correct_dye_bias(raw)
dye = ewastools::detectionP(dye); detP = dye$detP

beta = normalize(dye)

D = abs(beta[,rep1] - beta[,rep2]) # Absolute difference between index samples and replicates

### Pairs for which index sample or replicate (or both) was undetected.
P1 = detP.neg > t0.05; P1 = P1[,rep1] | P1[,rep2]
P2 = detP.neg >   -16; P2 = P2[,rep1] | P2[,rep2]
P3 = detP     >  0.05; P3 = P3[,rep1] | P3[,rep2]

### Number of excluded outliers with a more than 20pp difference in methylation levels
sum((D>0.2)    ,na.rm=TRUE) # 3733
sum((D>0.2)[P1],na.rm=TRUE) #  283
sum((D>0.2)[P2],na.rm=TRUE) # 1058
sum((D>0.2)[P3],na.rm=TRUE) # 2576

### Median difference between detected/undetected probes
h1 = split(D,P1)
h2 = split(D,P2)
h3 = split(D,P3)

median(h1$`TRUE`,na.rm=TRUE) # 0.07691732
median(h2$`TRUE`,na.rm=TRUE) # 0.06152366
median(h3$`TRUE`,na.rm=TRUE) # 0.05047891

median(h1$`FALSE`,na.rm=TRUE) # 0.01298537
median(h2$`FALSE`,na.rm=TRUE) # 0.01294899
median(h3$`FALSE`,na.rm=TRUE) # 0.01273315