# Author: Jonathan Heiss # Date: 5/11/18 # Requires ~22GB of disk space and ~80GB of RAM dir.create("detectionP") setwd("detectionP") library(stringi) library(data.table) library(magrittr) library(ggplot2) library(cowplot) library(purrr) library(car) library(utils) if( packageDescription("ewastools")$RemoteSha != "eebedc95fc71fb0131c1831086794be86b538067") devtools::install_github("hhhh5/ewastools@eebedc95fc71fb0131c1831086794be86b538067") library(ewastools) #-------------------------- detection p-values, the old way detectionP.neg <- function(raw){ if(!all(c('manifest','controls','ctrlG','ctrlR','M','U','oobG','oobR')%in%names(raw))) stop('Invalid argument') with(raw,{ bkgR = bkgG = controls[group=='NEGATIVE',index] bkgR = ctrlR[bkgR,,drop=FALSE] bkgG = ctrlG[bkgG,,drop=FALSE] muG = apply(bkgG,2,median,na.rm=TRUE) sdG = apply(bkgG,2,mad ,na.rm=TRUE) muR = apply(bkgR,2,median,na.rm=TRUE) sdR = apply(bkgR,2,mad ,na.rm=TRUE) detP = matrix(NA_real_,nrow=nrow(U),ncol=ncol(U)) i = manifest[channel=='Red' ,index] for(j in 1:ncol(M)) detP[i,j] = pnorm(U[i,j]+M[i,j],mean=2*muR[j],sd=sqrt(2)*sdR[j],log.p=TRUE,lower.tail=FALSE) i = manifest[channel=='Grn' ,index] for(j in 1:ncol(M)) detP[i,j] = pnorm(U[i,j]+M[i,j],mean=2*muG[j],sd=sqrt(2)*sdG[j],log.p=TRUE,lower.tail=FALSE) i = manifest[channel=='Both',index] for(j in 1:ncol(M)) detP[i,j] = pnorm(U[i,j]+M[i,j],mean=muR[j]+muG[j],sd=sqrt(sdR[j]^2+sdG[j]^2),log.p=TRUE,lower.tail=FALSE) detP/log(10) }) } #-------------------------- Fetch the data ### Select datasets by GSE accession selection = c(60655,61496,63106,65163,66459,69502,72120,74432,75196,75248,85042,85566,86961,87571,89251,90871,97362,102177) selection = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE" %s+% selection %s+% "&targ=gsm&form=text&view=brief" selection %<>% map(readLines) %>% unlist selection = split( selection, cumsum(selection %like% "^\\^SAMPLE = GSM") ) names(selection) = map(selection,1) %>% stri_match_first(regex="GSM\\d+") selection %<>% imap(function(s,acc){ s = strsplit(s,split=" = ",fixed=TRUE) data.table(gsm=acc,variable=map_chr(s,1),value=map_chr(s,2)) }) selection %<>% rbindlist ### parse metadata selection = selection[variable %chin% c("!Sample_characteristics_ch1","!Sample_supplementary_file","!Sample_series_id","!")] i = selection[variable == "!Sample_characteristics_ch1",which=TRUE] ch = selection$value[i] %>% stri_split(fixed=": ") selection$variable[i] = map_chr(ch,1) selection$value [i] = map_chr(ch,2) rm(ch,i) selection[variable=="sex, 1=m, 2=f",value:=ifelse(value==1,"m","f")] selection[variable %like% "[Ss]ex|[Gg]ender",variable:="sex"] selection[variable=="sex",value:=recode(value,"c('Male','male','M')='m'; c('Female','female','F')='f'")] selection[variable == "!Sample_supplementary_file" & value %like% "_Red\\.idat",variable:="red"] selection[variable == "!Sample_supplementary_file" & value %like% "_Grn\\.idat",variable:="grn"] selection[variable == "!Sample_series_id",variable:="gse"] ### keep `pair id` and `birth-weight` for the dataset of monozygotic twins selection = selection[variable %in% c("sex","gse","red","grn","pair id","birth-weight")] selection = dcast(selection, gsm ~ variable,paste0,collapse=";") selection = selection[!is.na(sex)] selection$sex %<>% factor(levels=c("m","f")) ### download the .idat files map2(selection$red, selection$gsm %s+% "_Red.idat.gz", ~ download.file(.x,.y,quiet=TRUE) ) %>% invisible map2(selection$grn, selection$gsm %s+% "_Grn.idat.gz", ~ download.file(.x,.y,quiet=TRUE) ) %>% invisible ### check whether all files were successfully downloaded selection$gsm %s+% "_Grn.idat.gz" %>% file.exists %>% stopifnot selection$gsm %s+% "_Red.idat.gz" %>% file.exists %>% stopifnot selection$red = NULL; selection$grn = NULL raw = read_idats(selection$gsm,quiet=FALSE) raw$T = NULL raw$S = NULL #-------------------------- Quality control ### Remove samples failing according to control probes selection$failed = control_metrics(raw) %>% sample_failure ### Dye-bias correction using RELIC dye = correct_dye_bias(raw) ### Check for mislabeled samples selection[,c("X","Y"):=check_sex(dye)] selection[,predicted_sex:=predict_sex(X,Y,which(sex=="m"),which(sex=="f"))] ### Check for problematic samples (outliers or imprecise measurements for SNP probes, can indicate contamination) SNPs = dye$manifest[probe_type=="rs"]$index selection$contaminated = dont_normalize(dye) %>% extract(SNPs,) %>% call_genotypes %>% snp_outliers rm(SNPs) ### Keep only samples that passed all checks keep = selection[ !failed & sex==predicted_sex & contaminated < -4,which=TRUE] drop = setdiff(1:nrow(selection),keep) selection = selection[keep] raw = drop_samples(raw,j=drop) dye = drop_samples(dye,j=drop) rm(drop,keep) #-------------------------- Detection p-values table(selection$sex) ### column indices of male and female samples males = selection[sex=="m",which=TRUE] females = selection[sex=="f",which=TRUE] ### row indices of Y chromosome probes Y = raw$manifest[chr=='Y']$index ### Lists to store the results call_rates = list() counts = list() ### Detection p-values using negative control probes detP.neg = detectionP.neg(raw) ### Determine for various cutoff the number of detected probes, for males and females separately cutoffs = -(0:80) tmp = sapply(cutoffs,function(t){ colSums( detP.neg[Y,] < t,na.rm=TRUE) }) m = apply(tmp[ males,],2,quantile,prob=c(.025,.5,.975)) # median and 95% CI f = apply(tmp[females,],2,quantile,prob=c(.025,.5,.975)) ### Arrange data in data.table for use in ggplot tmp = data.table( cutoff=-cutoffs ,sex=rep(c("m","f"),each=81) ,l=c(m[1,],f[1,]) ,m=c(m[2,],f[2,]) ,u=c(m[3,],f[3,]) ) t0.05 = log10(0.05) ### Generate figure 1A p1 = ( ggplot(tmp,aes(x=cutoff,y=m)) + geom_ribbon(aes(ymin=l,ymax=u,fill=sex),show.legend=FALSE,alpha=0.5) + geom_line(aes(group=sex,color=sex),show.legend=FALSE) + geom_vline(xintercept=16,linetype=3) + geom_hline(yintercept=32,linetype=3) + scale_colour_manual(values=c("f"="red","m"="black")) + scale_fill_manual(values=c("f"="red","m"="black")) + scale_y_continuous(name="# detected Y-chromosome probes",breaks=c(0,32,100,200,300,400,416),limits=c(0,416)) + scale_x_continuous(limits=c(0,80),breaks=c(0,16,40,60,80)) + xlab(expression(-log[10]~p~cut-off)) + ggtitle("Negative controls probes") + theme(panel.background=element_rect(fill="white",color=1), panel.border=element_rect(color=1,fill=NA,linetype=1,size=1), axis.text=element_text(color=1)) ) ### How many probes are declared undetected in total (not counting Y-chromosome probes) for each sample? counts$neg = colSums(detP.neg[-Y,] > t0.05,na.rm=TRUE) counts$neg16 = colSums(detP.neg[-Y,] > -16,na.rm=TRUE) counts$neg117 = colSums(detP.neg[-Y,] > -117,na.rm=TRUE) ### What is the call rate among females for every Y chromosome probe? (detP.neg[Y,females] <= t0.05) %>% apply(.,1,sum,na.rm=TRUE) -> call_rates$neg (detP.neg[Y,females] <= -16) %>% apply(.,1,sum,na.rm=TRUE) -> call_rates$neg16 (detP.neg[Y,females] <= -117) %>% apply(.,1,sum,na.rm=TRUE) -> call_rates$neg117 rm(raw); gc() ### Detection p-values using out-of-band intensities dye = ewastools::detectionP(dye) cutoffs = seq(0,-3,by=-0.05) cutoffs = c(10^cutoffs,c(1,0.5,0.1,0.05,0.01,0.001)) cutoffs %<>% unique %>% sort %>% rev tmp = sapply(cutoffs,function(t){ colSums( dye$detP[Y,] < t,na.rm=TRUE) }) m = apply(tmp[ males,],2,quantile,prob=c(.025,.5,.975)) # median and 95% CI f = apply(tmp[females,],2,quantile,prob=c(.025,.5,.975)) tmp = data.table( cutoff=cutoffs ,sex=rep(c("m","f"),each=length(cutoffs)) ,l=c(m[1,],f[1,]) ,m=c(m[2,],f[2,]) ,u=c(m[3,],f[3,]) ) ticks = c(1,0.5,0.1,0.05,0.01,0.001) ### Generate figure 1B p2 = ( ggplot(tmp,aes(x=-log10(cutoff),y=m)) + geom_ribbon(aes(ymin=l,ymax=u,fill=sex),show.legend=FALSE,alpha=0.5) + geom_line(aes(group=sex,color=sex),show.legend=FALSE) + geom_vline(xintercept=-log10(0.05),linetype=3) + geom_hline(yintercept=32,linetype=3) + scale_colour_manual(values=c("f"="red","m"="black")) + scale_fill_manual(values=c("f"="red","m"="black")) + scale_y_continuous(name="",breaks=c(0,32,100,200,300,400,416),limits=c(0,416)) + scale_x_continuous(name="p cut-off",breaks=-log10(ticks),labels=ticks,limits=c(0,3)) + annotate("text",x=2,y=300,label="Male" ,color="black",size=5) + annotate("text",x=2,y= 80,label="Female",color="red" ,size=5) + ggtitle("Out-of-band intensities") + theme(panel.background=element_rect(fill="white",color=1), panel.border=element_rect(color=1,fill=NA,linetype=1,size=1), axis.text=element_text(color=1)) ) p = plot_grid(p1,p2,labels=c("A","B"),ncol=2) save_plot(filename="fig1.png",plot=p,base_width=10,base_height=5) rm(p,p1,p2,cutoffs) counts$oob = colSums(dye$detP[-Y,] > 0.05,na.rm=TRUE) (dye$detP[Y,females] <= 0.05) %>% apply(.,1,sum,na.rm=TRUE) -> call_rates$oob rm(m,f) ### Generate figure 2 ( ggplot() + geom_hline(yintercept=c(0,100),linetype=3) + geom_vline(xintercept=c(0,416),linetype=3) + geom_line(aes(x=1:416,y=call_rates$neg %>%sort %>% divide_by(13.58)),color="black") + geom_line(aes(x=1:416,y=call_rates$neg117 %>%sort %>% divide_by(13.58)),color="black") + geom_line(aes(x=1:416,y=call_rates$oob %>%sort %>% divide_by(13.58)),color="red" ) + annotate("text",x=120 ,y=30,label="Negative\np=0.05",color="black") + annotate("text",x=320,y=15,label="Negative\np=1e-117",color="black") + annotate("text",x=320,y=30,label="OOB\np=0.05",color="red") + xlab("Index") + ylab("Call rate (%)") + theme(panel.background=element_rect(fill="white",color=1), panel.border=element_rect(color=1,fill=NA,linetype=1,size=1), axis.text=element_text(color=1)) ) ggplot2::ggsave("fig2.png",height=5,width=5) rm(dye,ticks) lapply(counts,median) # $neg # [1] 64 # $neg16 # [1] 331.5 # $neg117 # [1] 4007 # $oob # [1] 3780 #---------------------------------------------------------------------- ### Are the dropped data less accurate? Check the technical replicates from the monozygotic twin dataset whether they show larger differences between duplicates twins = copy(selection[gse%like%"GSE61496"]) # Dataset of monozygotic twins twins = twins[,list(gsm,n=.N,g=1:.N),keyby=list(`pair id`,`birth-weight`)][n>1 & g %in% 1:2] ### indices of technical replicates rep1 = seq(1,12,2) # index samples rep2 = seq(2,12,2) # replicates raw = read_idats(twins$gsm,quiet=TRUE) detP.neg = detectionP.neg(raw) dye = correct_dye_bias(raw) dye = ewastools::detectionP(dye); detP = dye$detP beta = normalize(dye) D = abs(beta[,rep1] - beta[,rep2]) # Absolute difference between index samples and replicates ### Pairs for which index sample or replicate (or both) was undetected. P1 = detP.neg > t0.05; P1 = P1[,rep1] | P1[,rep2] P2 = detP.neg > -16; P2 = P2[,rep1] | P2[,rep2] P3 = detP > 0.05; P3 = P3[,rep1] | P3[,rep2] ### Number of excluded outliers with a more than 20pp difference in methylation levels sum((D>0.2) ,na.rm=TRUE) # 3733 sum((D>0.2)[P1],na.rm=TRUE) # 283 sum((D>0.2)[P2],na.rm=TRUE) # 1058 sum((D>0.2)[P3],na.rm=TRUE) # 2576 ### Median difference between detected/undetected probes h1 = split(D,P1) h2 = split(D,P2) h3 = split(D,P3) median(h1$`TRUE`,na.rm=TRUE) # 0.07691732 median(h2$`TRUE`,na.rm=TRUE) # 0.06152366 median(h3$`TRUE`,na.rm=TRUE) # 0.05047891 median(h1$`FALSE`,na.rm=TRUE) # 0.01298537 median(h2$`FALSE`,na.rm=TRUE) # 0.01294899 median(h3$`FALSE`,na.rm=TRUE) # 0.01273315