Quality Control

library(bigsnpr)
## Loading required package: bigstatsr
# Number of cores handed to the `ncores` argument of the parallel steps below.
# NOTE(review): nb_cores() presumably comes from bigstatsr, which is loaded
# together with bigsnpr — confirm.
NCORES <- nb_cores()

# POPRES, step 1: run snp_plinkQC() on the ped/map fileset with the
# geno / mind / maf / hwe thresholds below, keeping autosomes only.
popresQC.bed <- snp_plinkQC(
  prefix.in     = "../POPRES_data/POPRES_allchr",
  file.type     = "--file",  # input is in ped/map format
  autosome.only = TRUE,
  maf           = 0.05,
  geno          = 0.05,
  mind          = 0.05,
  hwe           = 1e-10
)

# POPRES, step 2: hand the result of step 1 to snp_plinkIBDQC(), using
# NCORES cores.
popresQC2.bed <- snp_plinkIBDQC(
  popresQC.bed,
  ncores = NCORES
)
# Celiac, step 1: run snp_plinkQC() with the geno / mind / maf / hwe
# thresholds below, keeping autosomes only. `file.type` is not passed, so
# the function's default input format applies.
celiacQC.bed <- snp_plinkQC(
  prefix.in     = "../thesis-celiac/Dubois2010_data/FinnuncorrNLITUK1UK3hap300",
  autosome.only = TRUE,
  maf           = 0.05,
  geno          = 0.05,
  mind          = 0.05,
  hwe           = 1e-10
)

# Celiac, step 2: hand the result of step 1 to snp_plinkIBDQC(), using
# NCORES cores.
celiacQC2.bed <- snp_plinkIBDQC(
  celiacQC.bed,
  ncores = NCORES
)

Imputation of Celiac dataset

# Attach the celiac data from the file referenced by `celiacQC.rds` and time
# the call.
# NOTE(review): `celiacQC.rds` is not assigned anywhere in the code shown
# here — confirm it is defined before this point.
system.time({
  celiac <- snp_attach(celiacQC.rds)
})
##    user  system elapsed 
##   0.242   0.000   0.241
# Print the top two levels of the attached object's structure.
str(celiac, max.level = 2)
## List of 3
##  $ genotypes:Reference class 'FBM.code256' [package "bigstatsr"] with 7 fields
##   ..and 21 methods, of which 7 are  possibly relevant:
##   ..  as.FBM, copy#envRefClass, initialize, initialize#FBM, save,
##   ..  show#envRefClass, show#FBM
##  $ fam      :'data.frame':   15155 obs. of  6 variables:
##   ..$ family.ID  : int [1:15155] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ sample.ID  : chr [1:15155] "74230_A08_WTCCCT511838" "74230_B08_WTCCCT511842" "74236_A01_BLOOD292527" "74236_A02_BLOOD292509" ...
##   ..$ paternal.ID: int [1:15155] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ maternal.ID: int [1:15155] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ sex        : int [1:15155] 2 2 2 1 2 1 1 1 1 1 ...
##   ..$ affection  : int [1:15155] 1 1 1 1 1 1 1 1 1 1 ...
##  $ map      :'data.frame':   281122 obs. of  6 variables:
##   ..$ chromosome  : int [1:281122] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ marker.ID   : chr [1:281122] "rs3934834" "rs3737728" "rs6687776" "rs9651273" ...
##   ..$ genetic.dist: int [1:281122] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ physical.pos: int [1:281122] 995669 1011278 1020428 1021403 1038818 1039813 1054842 1084601 1089205 1096336 ...
##   ..$ allele1     : chr [1:281122] "A" "A" "A" "A" ...
##   ..$ allele2     : chr [1:281122] "G" "G" "G" "G" ...
##  - attr(*, "class")= chr "bigSNP"
# Shortcut to the genotype data held by the bigSNP object.
G <- celiac[["genotypes"]]
# Tabulate the values (0 / 1 / 2 / NA) found in the first ten columns.
big_counts(G, ind.col = seq_len(10))
##       [,1] [,2]  [,3] [,4]  [,5]  [,6]  [,7] [,8]  [,9] [,10]
## 0      418 1166   424 1114   165   293   354 2044    81   601
## 1     4054 6050  4197 5967  2720  3553  3833 6831  2007  4618
## 2    10656 7932 10532 8073 12269 11307 10956 6267 13064  9931
## <NA>    27    7     2    1     1     2    12   13     3     5
# Fast imputation: snp_fastImpute() receives the genotypes and the chromosome
# column of the map and runs on NCORES cores; the whole call is timed.
system.time({
  infos <- snp_fastImpute(
    G,
    celiac[["map"]][["chromosome"]],
    ncores = NCORES
  )
})
##      user    system   elapsed 
##     0.556     0.107 34616.842
# Plot the imputation summary, restricted to rows with pNA > 0.001.
plot(
  subset(infos, pNA > 0.001),
  pch = 19,
  cex = 0.5
)

# Overlay one reference curve y = p / x per threshold p, each drawn in its
# own colour; the legend below maps colours back to thresholds.
pvals <- c(0.01, 0.005, 0.002, 0.001)
colvals <- 2:5
idc <- Map(
  function(p, colour) {
    curve(p / x, from = 0, lwd = 2, col = colour, add = TRUE)
  },
  pvals,
  colvals
)
legend(
  "topright",
  legend = pvals,
  title = "p(NA & Error)",
  lty = 1,
  lwd = 2,
  col = colvals
)

# Counts before switching the code: missing values are still reported as <NA>.
big_counts(G, ind.col = seq_len(10))
##       [,1] [,2]  [,3] [,4]  [,5]  [,6]  [,7] [,8]  [,9] [,10]
## 0      418 1166   424 1114   165   293   354 2044    81   601
## 1     4054 6050  4197 5967  2720  3553  3833 6831  2007  4618
## 2    10656 7932 10532 8073 12269 11307 10956 6267 13064  9931
## <NA>    27    7     2    1     1     2    12   13     3     5
# The imputed values only show up in the counts once the `code256` field of G
# is replaced by the imputation-aware code.
G$code256 <- bigsnpr:::CODE_IMPUTE_PRED
big_counts(G, ind.col = seq_len(10))
##       [,1] [,2]  [,3] [,4]  [,5]  [,6]  [,7] [,8]  [,9] [,10]
## 0      418 1166   425 1114   165   293   354 2044    81   601
## 1     4073 6054  4198 5968  2720  3553  3843 6837  2008  4620
## 2    10664 7935 10532 8073 12270 11309 10958 6274 13066  9934
## <NA>     0    0     0    0     0     0     0    0     0     0
# To make this permanent, write the updated bigSNP object back to the file it
# was attached from.
# `G` presumably shares its reference-class object with `celiac$genotypes`
# (str() reports a Reference class), so the field is likely set already; the
# assignment is repeated so that this step also works on its own.
celiac$genotypes$code256 <- bigsnpr:::CODE_IMPUTE_PRED
# Save to the same path that was given to snp_attach() rather than to a
# hard-coded relative path, so the file that was attached is the one that is
# updated, whatever the working directory is.
saveRDS(celiac, file = celiacQC.rds)