% bioRxiv preprint entry (journal = bioRxiv, publisher = Cold Spring Harbor Laboratory).
% The citation key is unchanged. eprint carries the bare preprint identifier with
% eprinttype naming the archive; the direct PDF link is kept in the style-ignored
% field pdfurl. Section headings in the abstract were separated from the text they
% had been fused with during export; the abstract wording itself is unchanged.
@article{Zentgraf2020.05.14.095604,
  author        = {Zentgraf, Jens and Rahmann, Sven},
  title         = {Fast lightweight accurate xenograft sorting},
  journal       = {bioRxiv},
  publisher     = {Cold Spring Harbor Laboratory},
  year          = {2020},
  elocation-id  = {2020.05.14.095604},
  doi           = {10.1101/2020.05.14.095604},
  url           = {https://www.biorxiv.org/content/early/2020/05/19/2020.05.14.095604},
  eprint        = {2020.05.14.095604},
  eprinttype    = {bioRxiv},
  pdfurl        = {https://www.biorxiv.org/content/early/2020/05/19/2020.05.14.095604.full.pdf},
  abstract      = {Motivation: With an increasing number of patient-derived xenograft (PDX) models being created and subsequently sequenced to study tumor heterogeneity and to guide therapy decisions, there is a similarly increasing need for methods to separate reads originating from the graft (human) tumor and reads originating from the host species{\textquoteright} (mouse) surrounding tissue. Two kinds of methods are in use: On the one hand, alignment-based tools require that reads are mapped and aligned (by an external mapper/aligner) to the host and graft genomes separately first; the tool itself then processes the resulting alignments and quality metrics (typically BAM files) to assign each read or read pair. On the other hand, alignment-free tools work directly on the raw read data (typically FASTQ files). Recent studies compare different approaches and tools, with varying results.
    Results: We show that alignment-free methods for xenograft sorting are superior concerning CPU time usage and equivalent in accuracy. We improve upon the state of the art sorting by presenting a fast lightweight approach based on three-way bucketed quotiented Cuckoo hashing. Our hash table requires memory comparable to an FM index typically used for read alignment and less than other alignment-free approaches. It allows extremely fast lookups and uses less CPU time than other alignment-free methods and alignment-based methods at similar accuracy.
    Availability: Our software xengsort is available under the MIT license at http://gitlab.com/genomeinformatics/xengsort. It is written in numba-compiled Python and comes with Snakemake workflows for hash table construction and dataset processing.
    Contact: Sven.Rahmann{at}uni-due.de
    Competing Interest Statement: The authors have declared no competing interest.},
}