@comment{Dilthey006973: bioRxiv preprint record. The glossary after "Abbreviations:" in the abstract is kept verbatim from the publisher export; its symbol/definition pairs are run together and should be checked against the paper before being quoted.}
@article{Dilthey006973,
  author        = {Dilthey, Alexander and Cox, Charles and Iqbal, Zamin and Nelson, Matthew R. and McVean, Gil},
  title         = {Improved genome inference in the {MHC} using a population reference graph},
  journal       = {bioRxiv},
  publisher     = {Cold Spring Harbor Laboratory},
  year          = {2014},
  elocation-id  = {006973},
  doi           = {10.1101/006973},
  url           = {https://www.biorxiv.org/content/early/2014/07/08/006973},
  eprint        = {https://www.biorxiv.org/content/early/2014/07/08/006973.full.pdf},
  abstract      = {In humans and many other species, while much is known about the extent and structure of genetic variation, such information is typically not used in assembling novel genomes.
    Rather, a single reference is used against which to map reads, which can lead to poor characterisation of regions of high sequence or structural diversity.
    Here, we introduce a population reference graph, which combines multiple reference sequences as well as catalogues of SNPs and short indels.
    The genomes of novel samples are reconstructed as paths through the graph using an efficient hidden Markov Model, allowing for recombination between different haplotypes and variants.
    By applying the method to the 4.5Mb extended MHC region on chromosome 6, combining eight assembled haplotypes, sequences of known classical HLA alleles and 87,640 SNP variants from the 1000 Genomes Project, we demonstrate, using simulations, SNP genotyping, short-read and long-read data, how the method improves the accuracy of genome inference.
    Moreover, the analysis reveals regions where the current set of reference sequences is substantially incomplete, particularly within the Class II region, indicating the need for continued development of reference-quality genome sequences.
    Abbreviations: AAlignment AQ=(q1,{\textellipsis},qNQ)Query sequence of NQ charactersAall(Q, G)Set of alignments between Q and G.PRGPopulation Reference GraphCOVCatalogue of VariationGThe specific PRGVSet of verticesESet of edgesPn(e)Edge probability distribution at node nPsubSet of all subpathsPtraversalSet of all subpaths, constrained to complete traversalsVmTwo verticesvneOne edgel(v)The level of vertex vLScaffold haplotype MSA length; last level of haplotype graphH(v)The set of scaffold haplotypes attached to vK(v)The set of kMer-edges attached to vcvCurrent vertexr{\textquotedblleft}Recombination{\textquotedblright} parameterSNNumber of scaffold haplotypesSn,ii-th position (MSA) of haplotype nOiSet of kMers output from level io(kMer)Sample count of kMer kMerxGeneric variableXAdditional variant specifiers.suffix(v, r)Suffix function for vertex v of length rQAlignment query sequenceNQLength of QqiIndex for QQ{\textquoteright}Aligned query sequenceE{\textquoteright}Aligned edge sequenceALAlignment lengthMAlignment scoring matrixZlNumber of nodes at levelnode(l, z)Retrieve node z at level l.},
}