% bioRxiv preprint (Cold Spring Harbor Laboratory), identified by DOI 10.1101/137547.
% `eprint` carries the bare bioRxiv identifier; the landing page is in `url` and the
% full-text PDF link is kept in the style-ignored `pdf` field.
@article{Cuperus137547,
  author        = {Cuperus, Josh and Groves, Benjamin and Kuchina, Anna and Rosenberg, Alexander B. and Jojic, Nebojsa and Fields, Stanley and Seelig, Georg},
  title         = {Deep learning of the regulatory grammar of yeast 5{\textquoteright} untranslated regions from 500,000 random sequences},
  journal       = {bioRxiv},
  publisher     = {Cold Spring Harbor Laboratory},
  year          = {2017},
  elocation-id  = {137547},
  doi           = {10.1101/137547},
  eprint        = {137547},
  eprinttype    = {bioRxiv},
  url           = {https://www.biorxiv.org/content/early/2017/05/12/137547},
  pdf           = {https://www.biorxiv.org/content/early/2017/05/12/137547.full.pdf},
  abstract      = {Our ability to predict protein expression from DNA sequence alone remains poor, reflecting our limited understanding of cis-regulatory grammar and hampering the design of engineered genes for synthetic biology applications. Here, we generate a model that predicts the translational efficiency of the 5{\textquoteright} untranslated region (UTR) of mRNAs in the yeast Saccharomyces cerevisiae. We constructed a library of half a million 50-nucleotide-long random 5{\textquoteright} UTRs and assayed their activity in a massively parallel growth selection experiment. The resulting data allow us to quantify the impact on translation of Kozak sequence composition, upstream open reading frames (uORFs) and secondary structure. We trained a convolutional neural network (CNN) on the random library and showed that it performs well at predicting the translational efficiency of both a held-out set of the random 5{\textquoteright} UTRs as well as native S. cerevisiae 5{\textquoteright} UTRs. The model additionally was used to computationally evolve highly translating 5{\textquoteright} UTRs. We confirmed experimentally that the great majority of the evolved sequences lead to higher translation rates than the starting sequences, demonstrating the predictive power of this model.},
}