% bioRxiv preprint: Brookes, Aghazadeh and Listgarten (2021),
% "On the sparsity of fitness functions and implications for learning".
% Notes on this record:
%   - eprint holds the bare bioRxiv identifier (same value as elocation-id and
%     the DOI suffix); eprinttype names the archive.
%   - The full-text PDF link that the export had placed in eprint is kept in
%     the non-standard pdf field.
%   - The competing-interest statement that the export had fused onto the end
%     of the abstract is kept in the non-standard competing-interests field.
%   - Non-standard fields (elocation-id, pdf, competing-interests) are not
%     expected to be printed by standard styles.
@article{Brookes2021.05.24.445506,
  author              = {Brookes, David H. and Aghazadeh, Amirali and Listgarten, Jennifer},
  title               = {On the sparsity of fitness functions and implications for learning},
  journal             = {bioRxiv},
  publisher           = {Cold Spring Harbor Laboratory},
  year                = {2021},
  elocation-id        = {2021.05.24.445506},
  doi                 = {10.1101/2021.05.24.445506},
  url                 = {https://www.biorxiv.org/content/early/2021/05/25/2021.05.24.445506},
  eprint              = {2021.05.24.445506},
  eprinttype          = {bioRxiv},
  pdf                 = {https://www.biorxiv.org/content/early/2021/05/25/2021.05.24.445506.full.pdf},
  competing-interests = {The authors have declared no competing interest.},
  abstract            = {Fitness functions map biological sequences to a scalar property of interest. Accurate estimation of these functions yields biological insight and sets the foundation for model-based sequence design. However, the amount of fitness data available to learn these functions is typically small relative to the large combinatorial space of sequences; characterizing how much data is needed for accurate estimation remains an open problem. There is a growing body of evidence demonstrating that empirical fitness functions display substantial sparsity when represented in terms of epistatic interactions. Moreover, the theory of Compressed Sensing provides scaling laws for the number of samples required to exactly recover a sparse function. Motivated by these results, we study the sparsity of fitness functions sampled from a generalization of the NK model, a widely-used random field model of fitness functions. In particular, we present theoretical results that allow us to test the effect of the Generalized NK (GNK) model{\textquoteright}s interpretable parameters{\textemdash}sequence length, alphabet size, and assumed interactions between sequence positions{\textemdash}on the sparsity of fitness functions sampled from the model and, consequently, the number of measurements required to exactly recover these functions.
Further, we show that GNK fitness functions with parameters set according to protein structural contacts can be used to accurately approximate the number of samples required to estimate two empirical protein fitness functions, and are able to identify important higher-order epistatic interactions in these functions using only structural information.},
}