% bioRxiv preprint: Bukszar and van den Oord (2020), the MIND method.
% The citation key is ASCII-only. The exported key embedded a LaTeX accent
% group, and classic BibTeX stops reading a key at the first closing brace,
% which truncated the key and discarded every field of the entry.
% The abstract text is kept as published; only the separators that the export
% had fused away (before "AUTHOR SUMMARY" and around "Competing Interest
% Statement") were restored.
% "fulltext-url" is not a standard field and is ignored by styles; it only
% preserves the direct PDF link that the export had stored in "eprint".
@article{Bukszar2020.09.15.298505,
  author        = {Buksz{\'a}r, J{\'o}zsef and van den Oord, Edwin J. C. G.},
  title         = {A rigorous method for integrating multiple heterogeneous databases in genetic studies},
  journal       = {{bioRxiv}},
  publisher     = {Cold Spring Harbor Laboratory},
  year          = {2020},
  elocation-id  = {2020.09.15.298505},
  doi           = {10.1101/2020.09.15.298505},
  eprint        = {2020.09.15.298505},
  eprinttype    = {bioRxiv},
  url           = {https://www.biorxiv.org/content/early/2020/09/16/2020.09.15.298505},
  fulltext-url  = {https://www.biorxiv.org/content/early/2020/09/16/2020.09.15.298505.full.pdf},
  abstract      = {The large number of existing databases provides a freely available independent source of information with a considerable potential to increase the likelihood of identifying genes for complex diseases.
    We developed a flexible framework for integrating such heterogeneous databases into novel large scale genetic studies and implemented the methods in a freely-available, user-friendly R package called MIND.
    For each marker, MIND computes the posterior probability that the marker has effect in the novel data collection based on the information in all available data.
    MIND 1) relies on a very general model, 2) is based on the mathematical formulas that provide us with the exact value of the posterior probability, and 3) has good estimation properties because of its very efficient parameterization.
    For an existing data set, only the ranks of the markers are needed, where ties among the ranks are allowed.
    Through simulations, cross-validation analyses involving 18 GWAS, and an independent replication study of 6,544 SNPs in 6,298 samples we show that MIND 1) is accurate, 2) outperforms marker selection for follow up studies based on p-values, and 3) identifies effects that would otherwise require replication of over 20 times as many markers.
    AUTHOR SUMMARY: The large number of existing databases provides a freely available independent source of information with a considerable potential to increase the likelihood of identifying genes for complex diseases.
    We developed a flexible framework for integrating such heterogeneous databases into novel large scale genetic studies and implemented the methods in a freely-available, user-friendly R package called MIND.
    For each marker, MIND computes an estimate of the (posterior) probability that the marker has effect in the novel data collection based on the information in all available data.
    For an existing data set, only the ranks of the markers are needed to be known, where ties among the ranks are allowed.
    MIND 1) relies on a realistic model that takes confounding effects into account, 2) is based on the mathematical formulas that provide us with the exact value of the posterior probability, and 3) has good estimation properties because of its very efficient parameterization.
    Simulation, validation, and a replication study in independent samples show that MIND is accurate and greatly outperforms marker selection without using existing data sets.
    Competing Interest Statement: The authors have declared no competing interest.},
}