# Quality Control
library(bigsnpr)
## Loading required package: bigstatsr
# Core count returned by nb_cores(); passed as `ncores` to the parallel
# bigsnpr steps further down.
NCORES <- nb_cores()

# POPRES: first QC pass with snp_plinkQC() on the ped/map fileset.
# The threshold argument names mirror PLINK's --maf / --geno / --mind / --hwe
# filters (see the bigsnpr documentation for their exact meaning).
popresQC.bed <- snp_plinkQC(
  prefix.in = "../POPRES_data/POPRES_allchr",
  file.type = "--file", # ped/map
  autosome.only = TRUE,
  maf = 0.05,
  geno = 0.05,
  mind = 0.05,
  hwe = 1e-10
)

# Second QC pass (IBD-based, going by the function name) applied to the
# result of the first pass.
popresQC2.bed <- snp_plinkIBDQC(popresQC.bed, ncores = NCORES)
# Celiac: same QC thresholds as for POPRES. `file.type` is not passed here,
# so snp_plinkQC() falls back to its default input format.
celiacQC.bed <- snp_plinkQC(
  prefix.in = "../thesis-celiac/Dubois2010_data/FinnuncorrNLITUK1UK3hap300",
  autosome.only = TRUE,
  maf = 0.05,
  geno = 0.05,
  mind = 0.05,
  hwe = 1e-10
)

# Second QC pass (IBD-based, going by the function name) on the filtered
# celiac data; uses the NCORES value computed earlier in the script.
celiacQC2.bed <- snp_plinkIBDQC(celiacQC.bed, ncores = NCORES)
# Imputation of Celiac dataset
# Attach the celiac bigSNP object and time the call.
# NOTE(review): `celiacQC.rds` is a variable that is not defined in the code
# shown in this document -- presumably it holds the path of the .rds file
# built from the QC-ed celiac data (the final saveRDS() call writes to
# "backingfiles/celiacQC.rds"); confirm where it is set.
system.time(
celiac <- snp_attach(celiacQC.rds)
)
## user system elapsed
## 0.242 0.000 0.241
# Compact overview of the attached object, limited to two nesting levels.
str(celiac, max.level = 2)
## List of 3
## $ genotypes:Reference class 'FBM.code256' [package "bigstatsr"] with 7 fields
## ..and 21 methods, of which 7 are possibly relevant:
## .. as.FBM, copy#envRefClass, initialize, initialize#FBM, save,
## .. show#envRefClass, show#FBM
## $ fam :'data.frame': 15155 obs. of 6 variables:
## ..$ family.ID : int [1:15155] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ sample.ID : chr [1:15155] "74230_A08_WTCCCT511838" "74230_B08_WTCCCT511842" "74236_A01_BLOOD292527" "74236_A02_BLOOD292509" ...
## ..$ paternal.ID: int [1:15155] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ maternal.ID: int [1:15155] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ sex : int [1:15155] 2 2 2 1 2 1 1 1 1 1 ...
## ..$ affection : int [1:15155] 1 1 1 1 1 1 1 1 1 1 ...
## $ map :'data.frame': 281122 obs. of 6 variables:
## ..$ chromosome : int [1:281122] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ marker.ID : chr [1:281122] "rs3934834" "rs3737728" "rs6687776" "rs9651273" ...
## ..$ genetic.dist: int [1:281122] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ physical.pos: int [1:281122] 995669 1011278 1020428 1021403 1038818 1039813 1054842 1084601 1089205 1096336 ...
## ..$ allele1 : chr [1:281122] "A" "A" "A" "A" ...
## ..$ allele2 : chr [1:281122] "G" "G" "G" "G" ...
## - attr(*, "class")= chr "bigSNP"
# Keep a handle on the genotype matrix. Per the str() output above it is a
# reference-class object (FBM.code256), so G presumably refers to the same
# underlying object as celiac$genotypes rather than to a copy.
G <- celiac$genotypes
# Counts of each genotype value (0 / 1 / 2 / NA) for the first 10 columns
# (SNPs), before imputation.
big_counts(G, ind.col = 1:10)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## 0 418 1166 424 1114 165 293 354 2044 81 601
## 1 4054 6050 4197 5967 2720 3553 3833 6831 2007 4618
## 2 10656 7932 10532 8073 12269 11307 10956 6267 13064 9931
## <NA> 27 7 2 1 1 2 12 13 3 5
# Fast imputation
# Runs snp_fastImpute() on G, passing the per-SNP chromosome vector from the
# map as second argument and using NCORES cores; the result is kept in
# `infos` for the diagnostic plot that follows.
# The call is wrapped in system.time() because it is long: the run recorded
# below took about 34,600 s elapsed (roughly 9.6 hours).
system.time(
infos <- snp_fastImpute(G, celiac$map$chromosome, ncores = NCORES)
)
## user system elapsed
## 0.556 0.107 34616.842
# Scatter plot of the rows of `infos` whose `pNA` exceeds 0.001.
# NOTE(review): presumably this shows the proportion of missing values against
# the estimated imputation error for each SNP -- confirm against the
# snp_fastImpute() documentation.
plot(subset(infos, pNA > 0.001), pch = 19, cex = 0.5)

# Overlay one iso-curve y = p / x per threshold p, i.e. the points where the
# product of the two plotted quantities equals p (legend "p(NA & Error)").
pvals <- c(0.01, 0.005, 0.002, 0.001)
# One colour per threshold, starting at colour 2 (2, 3, 4, 5 for four values);
# derived from `pvals` so both vectors always have the same length.
colvals <- seq_along(pvals) + 1L

# curve() is called only for its drawing side effect, hence a plain loop
# rather than lapply() with a discarded result.
for (i in seq_along(pvals)) {
  curve(pvals[i] / x, from = 0, lwd = 2, col = colvals[i], add = TRUE)
}

legend("topright", legend = pvals, title = "p(NA & Error)",
       col = colvals, lty = 1, lwd = 2)

# Counts right after imputation: they are identical to the counts printed
# before it, because with the current code256 the imputed entries are still
# reported as <NA>.
big_counts(G, ind.col = 1:10)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## 0 418 1166 424 1114 165 293 354 2044 81 601
## 1 4054 6050 4197 5967 2720 3553 3833 6831 2007 4618
## 2 10656 7932 10532 8073 12269 11307 10956 6267 13064 9931
## <NA> 27 7 2 1 1 2 12 13 3 5
# You need to change the code of G
# Switching code256 to the imputation code changes how the stored values are
# decoded: the counts printed right after this show no <NA> left.
G$code256 <- bigsnpr:::CODE_IMPUTE_PRED
big_counts(G, ind.col = 1:10)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## 0 418 1166 425 1114 165 293 354 2044 81 601
## 1 4073 6054 4198 5968 2720 3553 3843 6837 2008 4620
## 2 10664 7935 10532 8073 12270 11309 10958 6274 13066 9934
## <NA> 0 0 0 0 0 0 0 0 0 0
# To make this permanent, you need to save (modify) the file on disk.
# G was taken from celiac$genotypes, which str() reports as a reference-class
# object, so this assignment presumably repeats the one already made through
# G; it makes the persisted object's code explicit at the point of saving.
celiac$genotypes$code256 <- bigsnpr:::CODE_IMPUTE_PRED
# NOTE(review): the path is hard-coded here, whereas snp_attach() earlier
# reads from the variable `celiacQC.rds` -- confirm both refer to the same
# file, otherwise the attached file is not the one being updated.
saveRDS(celiac, "backingfiles/celiacQC.rds")