% bioRxiv preprint record (2018), exported from biorxiv.org. The pdf link is kept in `eprint` exactly as exported.
@article{Modlin358986,
  author       = {Modlin, Samuel J. and Gunasekaran, Deepika and Zlotnicki, Alyssa M. and Elghraoui, Afif and Kuo, Norman and Chan, Carmela K. and Valafar, Faramarz},
  title        = {Resolving the hypotheticome: annotating {M.} tuberculosis gene function through bibliomic reconciliation and structural modeling},
  journal      = {bioRxiv},
  year         = {2018},
  elocation-id = {358986},
  publisher    = {Cold Spring Harbor Laboratory},
  doi          = {10.1101/358986},
  url          = {https://www.biorxiv.org/content/early/2018/07/03/358986},
  eprint       = {https://www.biorxiv.org/content/early/2018/07/03/358986.full.pdf},
  abstract     = {Each decade, billions are invested in Tuberculosis (TB) research to further characterize M. tuberculosis pathogenesis. Despite this investment, nearly half of the 4,031 M. tuberculosis protein-coding genes lack descriptive annotation in community databases, due largely to incomplete reconciliation with the literature and a lack of structure-based methods for functional inference. We coin the term {\textquotedblleft}hypotheticome{\textquotedblright} as the set of genes in an organism without known function. For M. tuberculosis{\textquoteright} hypotheticome, we compiled the set of genes lacking functional assignment in the most frequently used Mycobacteria annotation database through systematic, exhaustive manual literature curation and 3D-protein structure-based inference, and reconciled these annotations with frequented functional databases, creating a comprehensive M. tuberculosis functional knowledge-base. In doing so, we also introduce standard usage of qualifying adjectives based on quantitative measures of certainty with the hope that this approach is adopted in choosing qualifiers for future functional assignments. Through these methods we functionally annotated 41.3\% of the M. tuberculosis hypotheticome, and provide insight into its pathogenesis, antibiotic-resistance, and virulence. Processes implicated in the unique lifestyle of M. tuberculosis of long-term persistence and obligate pathogenesis in genotoxic host microenvironments {\textendash} lipid metabolism, polyketide biosynthesis, and membrane transport and efflux {\textendash} were overrepresented in our annotation. Our structural similarity approach unturned proteins that appear critical in host-interaction through apparent host mimicry, particularly involving the phagosome and vesicle-mediated transport, as well as putative structural analogs for highly mutable protein classes, including dozens of PE/PPE family proteins which are major players at the host-pathogen interface, and sixteen potential efflux pumps which are integral to M. tuberculosis drug tolerance. Hypotheses drawn from these proteins{\textquoteright} function may help characterize the onset of latency and identify therapeutic targets. A unified annotation is essential for clear communication about M. tuberculosis. These improvements provide the most comprehensive M. tuberculosis genome annotation to date, and the approach presented can be applied to systematically annotate the genome of other organisms. We provide our novel annotations in General Feature Format with Enzyme Commission and Gene Ontology terms for integration into existing annotation frameworks.},
}