% Metalign preprint (LaPierre et al., 2020), exported from bioRxiv.
% The eprint field carries the bioRxiv identifier (same value as elocation-id
% and the DOI suffix); the link to the full-text PDF is kept in the
% non-rendered pdf field.
@article{LaPierre2020.01.17.910521,
  author        = {LaPierre, Nathan and Alser, Mohammed and Eskin, Eleazar and Koslicki, David and Mangul, Serghei},
  title         = {{Metalign}: Efficient alignment-based metagenomic profiling via containment min hash},
  journal       = {bioRxiv},
  publisher     = {Cold Spring Harbor Laboratory},
  year          = {2020},
  doi           = {10.1101/2020.01.17.910521},
  eprint        = {2020.01.17.910521},
  eprinttype    = {bioRxiv},
  elocation-id  = {2020.01.17.910521},
  url           = {https://www.biorxiv.org/content/early/2020/01/18/2020.01.17.910521},
  pdf           = {https://www.biorxiv.org/content/early/2020/01/18/2020.01.17.910521.full.pdf},
  abstract      = {Whole-genome shotgun sequencing enables the analysis of microbial communities in unprecedented detail, with major implications in medicine and ecology. Predicting the presence and relative abundances of microbes in a sample, known as {\textquotedblleft}metagenomic profiling{\textquotedblright}, is a critical first step in microbiome analysis. Existing profiling methods have been shown to suffer from poor false positive or false negative rates, while alignment-based approaches are often considered accurate but computationally infeasible. Here we present a novel method, Metalign, that addresses these concerns by performing efficient alignment-based metagenomic profiling. We use a containment min hash approach to reduce the reference database size dramatically before alignment and a method to estimate organism relative abundances in the sample by resolving reads aligned to multiple genomes. We show that Metalign achieves significantly improved results over existing methods on simulated datasets from a large benchmarking study, CAMI, and performs well on in vitro mock community data and environmental data from the Tara Oceans project. Metalign is freely available at https://github.com/nlapier2/Metalign, along with the results and plots used in this paper, and a docker image is also available at https://hub.docker.com/repository/docker/nlapier2/metalign.},
}