% bioRxiv preprint (Schreiber et al., 2018) describing Rambutan. The closing
% NOTE in the abstract is the authors' own caveat about an error in their
% calling of statistically significant contacts.
% NOTE(review): `eprint` holds a full-text PDF URL rather than an archive
% identifier; left unchanged here -- confirm how the bibliography style in use
% renders it before normalising.
@article{Schreiber103614,
  author        = {Schreiber, Jacob and Libbrecht, Maxwell and Bilmes, Jeffrey and Noble, William Stafford},
  title         = {Nucleotide sequence and {DNaseI} sensitivity are predictive of {3D} chromatin architecture},
  journal       = {bioRxiv},
  publisher     = {Cold Spring Harbor Laboratory},
  year          = {2018},
  elocation-id  = {103614},
  doi           = {10.1101/103614},
  url           = {https://www.biorxiv.org/content/early/2018/07/10/103614},
  eprint        = {https://www.biorxiv.org/content/early/2018/07/10/103614.full.pdf},
  abstract      = {Recently, Hi-C has been used to probe the 3D chromatin architecture of multiple organisms and cell types. The resulting collections of pairwise contacts across the genome have connected chromatin architecture to many cellular phenomena, including replication timing and gene regulation. However, high resolution (10 kb or finer) contact maps remain scarce due to the expense and time required for collection. A computational method for predicting pairwise contacts without the need to run a Hi-C experiment would be invaluable in understanding the role that 3D chromatin architecture plays in genome biology. We describe Rambutan, a deep convolutional neural network that predicts Hi-C contacts at 1 kb resolution using nucleotide sequence and DNaseI assay signal as inputs. Specifically, Rambutan identifies locus pairs that engage in high confidence contacts according to Fit-Hi-C, a previously described method for assigning statistical confidence estimates to Hi-C contacts. We first demonstrate Rambutan{\textquoteright}s performance across chromosomes at 1 kb resolution in the GM12878 cell line. Subsequently, we measure Rambutan{\textquoteright}s performance across six cell types. In this setting, the model achieves an area under the receiver operating characteristic curve between 0.7662 and 0.8246 and an area under the precision-recall curve between 0.3737 and 0.9008. We further demonstrate that the predicted contacts exhibit expected trends relative to histone modification ChIP-seq data, replication timing measurements, and annotations of functional elements such as promoters and enhancers. Finally, we predict Hi-C contacts for 53 human cell types and show that the predictions cluster by cellular function. [NOTE: After our original submission we discovered an error in our calling of statistically significant contacts. Briefly, when calculating the prior probability of a contact, we used the number of contacts at a certain genomic distance in a chromosome but divided by the total number of bins in the full genome. While we investigate what impact this had on our results, we ask that readers treat this manuscript skeptically.]},
}