% bioRxiv preprint (Cold Spring Harbor Laboratory), DOI 10.1101/166835.
@article{Parkhomchuk166835,
  author        = {Parkhomchuk, Dmitri and Bremges, Andreas and McHardy, Alice C.},
  title         = {Fast and memory-efficient noisy read overlapping with {KD}-trees},
  journal       = {bioRxiv},
  publisher     = {Cold Spring Harbor Laboratory},
  year          = {2017},
  elocation-id  = {166835},
  doi           = {10.1101/166835},
  url           = {https://www.biorxiv.org/content/early/2017/07/21/166835},
  eprint        = {https://www.biorxiv.org/content/early/2017/07/21/166835.full.pdf},
  abstract      = {Motivation: Third-generation sequencing technologies produce long, but noisy reads with increasing sequencing throughput and decreasing per-base costs. Detecting read-to-read overlaps in such data is the most computationally intensive step in de novo assembly. Recently, efficient algorithms were developed for this task; nearly all of these utilize long k-mers ($>$10 bp) to compare reads, but vary in their approaches to indexing, hashing, filtering, and dimensionality reduction. Results: We describe an algorithm for efficient overlap detection that directly compares the full spectrum of short k-mers, namely tetramers, through geometric embedding and approximate nearest neighbor search in multidimensional KD-trees. A proof of concept implementation detected read-to-read overlaps in bacterial PacBio and ONT datasets with notably lower memory consumption than state-of-the-art approaches and allowed downstream de novo assembly into single contigs. We also introduce a sequence-context dependent tagging scheme that contributes to memory and computational efficiency and could be used with other aligning and overlapping algorithms. Availability: A C++14 implementation is available under the open source Apache License 2.0 at: https://github.com/dzif/kd-tree-overlapper},
}