% bioRxiv preprint record (journal = bioRxiv, DOI 10.1101/396127).
% The "pdf-url" and "elocation-id" fields are non-standard and ignored by
% bibliography styles; they are kept only as metadata.
@article{Nori396127,
  author        = {Nori, Vijay S. and
                   Hane, Christopher A. and
                   Martin, David C. and
                   Kravetz, Alexander D. and
                   Sanghavi, Darshak M.},
  title         = {Identifying incident dementia by applying machine learning to a very large administrative claims dataset},
  journal       = {bioRxiv},
  year          = {2018},
  publisher     = {Cold Spring Harbor Laboratory},
  doi           = {10.1101/396127},
  url           = {https://www.biorxiv.org/content/early/2018/09/07/396127},
  eprint        = {396127},
  eprinttype    = {bioRxiv},
  elocation-id  = {396127},
  pdf-url       = {https://www.biorxiv.org/content/early/2018/09/07/396127.full.pdf},
  abstract      = {INTRODUCTION Alzheimer's disease and related dementias (ADRD) are highly prevalent
                   conditions, and prior efforts to develop predictive models have relied on demographic
                   and clinical risk factors using traditional logistical regression methods. We
                   hypothesized that machine-learning algorithms using administrative claims data may
                   represent a novel approach to predicting ADRD.
                   METHODS Using a national de-identified dataset of more than 125 million patients
                   including over 10,000 clinical, pharmaceutical, and demographic variables, we
                   developed a cohort to train a machine learning model to predict ADRD 4--5 years in
                   advance.
                   RESULTS The Lasso algorithm selected a 50-variable model with an area under the curve
                   (AUC) of 0.693. Top diagnosis codes in the model were memory loss (780.93),
                   Parkinson's disease (332.0), mild cognitive impairment (331.83) and bipolar disorder
                   (296.80), and top pharmacy codes were psychoactive drugs.
                   DISCUSSION Machine learning algorithms can rapidly develop predictive models for ADRD
                   with massive datasets, without requiring hypothesis-driven feature engineering.
                   RESEARCH IN CONTEXT Systematic review: Previous attempts to predict incident dementia
                   have relied on extensive clinical evaluations, cognitive testing, laboratory testing,
                   neuro-imaging, genetic factors, demographics, and lifestyle variables. Applying
                   machine learning to a large administrative claims dataset to identify individuals at
                   increased likelihood for near-term diagnosis of dementia had not been tested.
                   Interpretation: A 50-variable model to identify those at risk for near-term diagnosis
                   of dementia was created and validated. Based on AUC analysis, the model compared
                   favorably with other historical attempts at modeling more traditional forms of data.
                   Future direction: Models, such as the one developed here, could be used to identify
                   populations of higher prior probability for near-term diagnosis of dementia. These
                   could then be subjected to more in-depth scrutiny for intervention or
                   dementia-related research eligibility.},
}