@article{Nelson095372,
  author        = {Nelson, Michael G. and Linheiro, Raquel S. and Bergman, Casey M.},
  title         = {{McClintock}: An integrated pipeline for detecting transposable element insertions in whole genome shotgun sequencing data},
  journal       = {bioRxiv},
  year          = {2016},
  elocation-id  = {095372},
  publisher     = {Cold Spring Harbor Laboratory},
  doi           = {10.1101/095372},
  url           = {https://www.biorxiv.org/content/early/2016/12/19/095372},
  eprint        = {https://www.biorxiv.org/content/early/2016/12/19/095372.full.pdf},
  abstract      = {Background: Transposable element (TE) insertions are among the most challenging type of variants to detect in genomic data because of their repetitive nature and complex mechanisms of replication. Nevertheless, the recent availability of large resequencing datasets has spurred the development of many new methods to detect TE insertions in whole genome shotgun sequences. These methods generate output in diverse formats and have a large number of software and data dependencies, making their comparative evaluation challenging for potential users. Results: Here we develop an integrated bioinformatics pipeline for the detection of TE insertions in whole genome shotgun data, called McClintock, that automatically runs and generates standardized output for multiple TE detection methods. We demonstrate the utility of the McClintock system by performing comparative evaluation of six TE detection methods using simulated and real genome data from the model microbial eukaryote, Saccharomyces cerevisiae. We find substantial variation among McClintock component methods in their ability to detect non-reference insertions in the yeast genome, but show that non-reference TEs at nearly all biologically-realistic locations can be detected in simulated data by combining multiple methods that use split-read and read-pair evidence. In general, our results reveal that split-read methods detect fewer non-reference TE insertions than read-pair methods, but generally have much higher positional accuracy. Analysis of a large sample of real yeast genomes reveals that most, but not all, McClintock component methods can recover known aspects of TE biology in yeast such as the transpositional activity status of families, tRNA gene target preferences, and TSD structure, albeit with varying levels of positional accuracy. Conclusions: Our results suggest that no single TE detection method currently provides comprehensive detection of non-reference TEs, even in the context of a simplified model eukaryotic genome like S. cerevisiae. In spite of these limitations, the McClintock system provides a framework for testing, developing and integrating results from multiple TE detection methods to achieve this ultimate aim, as well as useful guidance for yeast researchers to select appropriate TE detection tools.},
}