% bioRxiv preprint by Meheust, Castelle, Jaffe and Banfield (2020).
% The citation key is plain ASCII on purpose: a brace group inside the key
% makes classic BibTeX end the key (and the entry) at the first closing brace.
@article{Meheust2020.07.16.207365,
  author         = {M{\'e}heust, Rapha{\"e}l and Castelle, Cindy J. and Jaffe, Alexander L. and Banfield, Jillian F.},
  title          = {Early acquisition of conserved, lineage-specific proteins currently lacking functional predictions were central to the rise and diversification of archaea},
  journal        = {bioRxiv},
  publisher      = {Cold Spring Harbor Laboratory},
  year           = {2020},
  elocation-id   = {2020.07.16.207365},
  doi            = {10.1101/2020.07.16.207365},
  url            = {https://www.biorxiv.org/content/early/2020/07/17/2020.07.16.207365},
  eprint         = {https://www.biorxiv.org/content/early/2020/07/17/2020.07.16.207365.full.pdf},
  abstract       = {Recent genomic analyses of Archaea have profoundly reshaped our understanding of their distribution, functionalities and roles in eukaryotic evolution. Within the domain, major supergroups are Euryarchaeota, which includes many methanogens, the TACK, which includes Thaumarchaeaota that impact ammonia oxidation in soils and the ocean, the Asgard, which includes lineages inferred to be ancestral to eukaryotes, and the DPANN, a group of mostly symbiotic small-celled archaea. Here, we investigated the extent to which clustering based on protein family content recapitulates archaeal phylogeny and identified the proteins that distinguish the major subdivisions. We also defined 10,866 archaeal protein families that will serve as a community resource. Clustering based on these families broadly recovers the archaeal phylogenetic tree. Interestingly, all major groups are distinguished primarily by the presence of families of conserved hypothetical proteins that are either novel or so highly diverged that their functions are obscured. Given that these hypothetical proteins are near ubiquitous within phyla, we conclude that they were important in the origin of most of the major archaeal lineages. Competing Interest Statement: J.F.B. is a founder of Metagenomi.},
}