Contents

1 Setup

library(subtypeHeterogeneity)
library(consensusOV)
library(RaggedExperiment)
library(curatedTCGAData)
library(DropletUtils)
library(ComplexHeatmap)
library(ggplot2)
# Named hex colours used for plotting (the values coincide with the
# Okabe-Ito colour-blind-safe palette).
cb.pink      <- "#CC79A7"
cb.red       <- "#D55E00"
cb.blue      <- "#0072B2"
cb.yellow    <- "#F0E442"
cb.green     <- "#009E73"
cb.lightblue <- "#56B4E9"
cb.orange    <- "#E69F00"

# Subtype -> colour lookup; element order is PRO, MES, DIF, IMR.
stcols <- c(
    PRO = cb.lightblue,
    MES = cb.green,
    DIF = cb.orange,
    IMR = cb.pink
)

2 Data sources

2.1 Cancer types

# List the cancer type abbreviations of the Firehose datasets known to
# RTCGAToolbox (called via `::`, so the package does not need to be attached).
RTCGAToolbox::getFirehoseDatasets()
##  [1] "ACC"      "BLCA"     "BRCA"     "CESC"     "CHOL"     "COADREAD"
##  [7] "COAD"     "DLBC"     "ESCA"     "FPPP"     "GBMLGG"   "GBM"     
## [13] "HNSC"     "KICH"     "KIPAN"    "KIRC"     "KIRP"     "LAML"    
## [19] "LGG"      "LIHC"     "LUAD"     "LUSC"     "MESO"     "OV"      
## [25] "PAAD"     "PCPG"     "PRAD"     "READ"     "SARC"     "SKCM"    
## [31] "STAD"     "STES"     "TGCT"     "THCA"     "THYM"     "UCEC"    
## [37] "UCS"      "UVM"

2.2 ABSOLUTE

# Locate the extdata directory shipped with the subtypeHeterogeneity package.
# mustWork=TRUE turns a missing directory into an immediate, explicit error;
# by default system.file() returns "" and the problem would only surface
# later as a readRDS() failure on the path "/ABSOLUTE_grangeslist.rds".
data.dir <- system.file("extdata", package="subtypeHeterogeneity",
                        mustWork=TRUE)
absFile <- file.path(data.dir, "ABSOLUTE_grangeslist.rds")
# ABSOLUTE calls, deserialized as stored in the RDS file
# (prints as a GRangesList with one element per sample barcode).
absGRL <- readRDS(absFile)
absGRL
## GRangesList object of length 10803:
## $TCGA-02-0001-01 
## GRanges object with 59 ranges and 3 metadata columns:
##        seqnames             ranges strand | Modal_HSCN_1 Modal_HSCN_2
##           <Rle>          <IRanges>  <Rle> |    <numeric>    <numeric>
##    [1]     chr1   3218923-74907316      * |            2            2
##    [2]     chr1  74918027-75377480      * |            0            2
##    [3]     chr1 75377891-247812431      * |            2            2
##    [4]     chr2    499482-57091538      * |            2            3
##    [5]     chr2  57097020-74690378      * |            0            2
##    ...      ...                ...    ... .          ...          ...
##   [55]    chr19  10050184-58866434      * |            1            2
##   [56]    chr20    456452-53835558      * |            0            2
##   [57]    chr20  53845647-62219837      * |            2            2
##   [58]    chr21  15372509-47678774      * |            2            3
##   [59]    chr22  17423930-49331012      * |            1            2
##            score
##        <integer>
##    [1]         0
##    [2]         0
##    [3]         1
##    [4]         0
##    [5]         0
##    ...       ...
##   [55]         0
##   [56]         1
##   [57]         1
##   [58]         0
##   [59]         0
## 
## ...
## <10802 more elements>
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths

2.3 GISTIC2

# Load the GISTIC2 calls for the OV cohort, requesting the "wide" peak
# definition. The object prints as a RangedSummarizedExperiment
# (rows: GISTIC2 regions, columns: samples).
gisticOV <- gistic2RSE(ctype="OV", peak="wide")
gisticOV
## class: RangedSummarizedExperiment 
## dim: 70 579 
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(1): type
## colnames(579): TCGA-04-1331-01 TCGA-04-1332-01 ... TCGA-VG-A8LO-01
##   TCGA-WR-A838-01
## colData names(0):
# Show the genomic coordinates of the GISTIC2 regions together with their
# `type` metadata column (Amplification / Deletion).
rowRanges(gisticOV)
## GRanges object with 70 ranges and 1 metadata column:
##        seqnames              ranges strand |          type
##           <Rle>           <IRanges>  <Rle> |   <character>
##    [1]     chr1   26963410-27570286      * |      Deletion
##    [2]     chr1   39887948-40168864      * | Amplification
##    [3]     chr1 150483517-150739128      * | Amplification
##    [4]     chr1 207316102-220705691      * |      Deletion
##    [5]     chr1 234417530-235727932      * | Amplification
##    ...      ...                 ...    ... .           ...
##   [66]    chr20   30061713-30332464      * | Amplification
##   [67]    chr20   62137482-63025520      * | Amplification
##   [68]    chr21   42519390-48129895      * |      Deletion
##   [69]    chr22   30113489-30692352      * | Amplification
##   [70]    chr22   48668761-51304566      * |      Deletion
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths
# Peek at the top-left 5 x 5 corner of the region-by-sample call matrix.
assay(gisticOV)[seq_len(5), seq_len(5)]
##      TCGA-04-1331-01 TCGA-04-1332-01 TCGA-04-1335-01 TCGA-04-1336-01
## [1,]               1               0               1               0
## [2,]               0               1               0               1
## [3,]               1               1               0               0
## [4,]               0               0               0               0
## [5,]               2               0               0               0
##      TCGA-04-1337-01
## [1,]               1
## [2,]               0
## [3,]               0
## [4,]               0
## [5,]               0

2.4 Expression-based subtypes

2.4.1 Broad subtypes

# Retrieve the Broad subtype clustering for the OV cohort, selecting the
# "CNMF" clustering algorithm; one row per sample.
ovsubs <- getBroadSubtypes(ctype = "OV", clust.alg = "CNMF")
dim(ovsubs)
## [1] 569   2
# First six samples: cluster id and silhouette value.
head(ovsubs, n = 6L)
##                 cluster silhouetteValue
## TCGA-04-1331-01       1      0.07993633
## TCGA-04-1341-01       1      0.04587895
## TCGA-04-1342-01       1      0.09203928
## TCGA-04-1347-01       1      0.02585325
## TCGA-04-1350-01       1      0.26985422
## TCGA-04-1351-01       1      0.26021068
# Number of samples assigned to each cluster.
table(ovsubs[, "cluster"])
## 
##   1   2   3   4 
## 183 114 120 152

2.4.2 OV subtypes from different studies

# Subtype calls pooled across several studies, read from the package's
# extdata directory.
pooled.file <- file.path(data.dir, "pooled_subtypes.rds")
pooled.subs <- readRDS(pooled.file)
# Sample count per contributing data source.
table(pooled.subs[, "data.source"])
## 
##   E.MTAB.386     GSE13876     GSE14764     GSE17260     GSE18520 
##          128           95           41           43           48 
##     GSE26193     GSE26712     GSE32062     GSE49997     GSE51088 
##           45          174          128          122           78 
##      GSE9891 PMID17290060         TCGA 
##          139           56          448
# Sample count per label in the "Verhaak" column.
table(pooled.subs[, "Verhaak"])
## 
## IMR DIF PRO MES 
## 441 450 309 345

2.4.3 TCGA subtype consistency

# Cross-tabulate the pooled subtype labels (rows) against the Broad cluster
# ids (columns).
tab <- mapOVSubtypes(ovsubs, pooled.subs)
tab
##      1  2  3  4
## IMR 17 21 18 98
## DIF 26  5 66 16
## PRO 85  5  7  3
## MES 11 62  7  1
# For every subtype (row) take the column position with the largest count;
# sorting orders the subtypes by the cluster they map to.
# NOTE(review): which.max() yields column *positions*; this assumes the
# columns of `tab` are ordered by cluster id (true for the table above).
(ind <- sort( apply(tab, 1, which.max) ))
## PRO MES DIF IMR 
##   1   2   3   4
# Two subtypes claiming the same cluster would make the labelling ambiguous;
# fail loudly instead of silently mislabelling samples.
if (anyDuplicated(ind) > 0) {
    stop("subtype-to-cluster mapping is not one-to-one", call.=FALSE)
}
# Look each sample's cluster id up among the mapped values rather than using
# it as a position into names(ind): the positional form is only correct when
# `ind` is exactly 1..k. Clusters without a mapped subtype become NA.
sts <- names(ind)[match(ovsubs[,"cluster"], ind)]
ovsubs <- data.frame(ovsubs, subtype=sts, stringsAsFactors=FALSE)

2.4.4 Subtype purity & ploidy

# Per-sample ABSOLUTE summary table (purity, ploidy, genome doublings,
# subclonal genome fraction), read from the package's extdata directory.
pp.file <- file.path(data.dir, "ABSOLUTE_Purity_Ploidy.rds")
puri.ploi <- readRDS(pp.file)
head(puri.ploi, n = 6L)
##                 purity ploidy Genome.doublings Subclonal.genome.fraction
## TCGA-04-1331-01   0.88   1.85                0                      0.19
## TCGA-04-1341-01   0.82   2.79                1                      0.32
## TCGA-04-1342-01   0.79   2.35                0                      0.23
## TCGA-04-1347-01   0.94   4.29                2                      0.46
## TCGA-04-1350-01   0.99   3.04                1                        NA
## TCGA-04-1351-01   0.72   4.37                2                        NA
# Plot the ABSOLUTE estimates against the subtype assignment.
plotSubtypePurityPloidy(ovsubs, puri.ploi)

Assessing the significance of differences between subtypes:

# Restrict to samples that have both a subtype call and ABSOLUTE estimates.
cids <- intersect(rownames(ovsubs), rownames(puri.ploi))
# Reuse the subtype labels already stored in `ovsubs` (derived from the
# cluster/subtype cross-tabulation) instead of re-deriving them from the
# hard-coded order of names(stcols), so the two cannot drift apart.
subtys <- ovsubs[cids, "subtype"]
pp <- puri.ploi[cids, ]
# One-way ANOVA: does tumor purity differ between subtypes?
summary(aov(purity ~ subtype, 
            data.frame(purity=pp[,"purity"], subtype=subtys)))
##              Df Sum Sq Mean Sq F value Pr(>F)    
## subtype       3  4.244   1.415   109.1 <2e-16 ***
## Residuals   511  6.626   0.013                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 7 observations deleted due to missingness
# One-way ANOVA: does ploidy differ between subtypes?
summary(aov(ploidy ~ subtype, 
            data.frame(ploidy=pp[,"ploidy"], subtype=subtys)))
##              Df Sum Sq Mean Sq F value Pr(>F)   
## subtype       3   14.4   4.792   4.807 0.0026 **
## Residuals   511  509.4   0.997                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 7 observations deleted due to missingness
# One-way ANOVA: does the subclonal genome fraction differ between subtypes?
summary(aov(subcl ~ subtype, 
            data.frame(subcl=pp[,"Subclonal.genome.fraction"], subtype=subtys)))
##              Df Sum Sq Mean Sq F value Pr(>F)   
## subtype       3  0.322 0.10728   4.496 0.0041 **
## Residuals   365  8.708 0.02386                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 153 observations deleted due to missingness
# Genome doublings is a small count (0/1/2 in the rows shown), so test its
# association with subtype on the contingency table instead.
chisq.test(pp[,"Genome.doublings"], subtys)
## 
##  Pearson's Chi-squared test
## 
## data:  pp[, "Genome.doublings"] and subtys
## X-squared = 28.319, df = 6, p-value = 8.182e-05

Stratifying by purity:

# Stratify samples by purity using method "equal.bin"
# (the bins printed below have equal width).
sebin <- stratifyByPurity(ovsubs, puri.ploi, method = "equal.bin")
lengths(sebin)
## (0.209,0.368] (0.368,0.526] (0.526,0.684] (0.684,0.842]     (0.842,1] 
##             4            36            79           199           197
# Same stratification using method "quintile"
# (the bins printed below hold roughly equal numbers of samples).
squint <- stratifyByPurity(ovsubs, puri.ploi, method = "quintile")
lengths(squint)
## (0.21,0.658] (0.658,0.77]  (0.77,0.84]   (0.84,0.9]      (0.9,1] 
##          102          112          103           95          102

3 Subtype association

# Request unadjusted p-values (padj.method = "none") from the per-region
# subtype association test, then apply the Benjamini-Hochberg correction
# explicitly so that both raw and adjusted values are available.
pvals <- testSubtypes(gisticOV, ovsubs, padj.method = "none")
adj.pvals <- p.adjust(pvals, method = "BH")
length(adj.pvals)
## [1] 70
head(adj.pvals, n = 6L)
## [1] 0.80025111 0.14279265 0.11317350 0.90288094 0.03365597 0.45058975
# Number of regions with adjusted p-value below 0.1.
sum(adj.pvals < 0.1)
## [1] 35
# Distribution of the raw (unadjusted) p-values.
hist(
    pvals,
    breaks = 25,
    col = "firebrick",
    xlab = "Subtype association p-value",
    main = ""
)

3.1 Genomic distribution of subtype-associated CNAs

cnv.genes <- getCnvGenesFromTCGA()
# Flag regions whose adjusted p-value is below 0.1.
sig.ind <- adj.pvals < 0.1
# Store the per-region result of testSubtypes(what = "subtype") and a "*"
# marker for significant regions as metadata columns on the regions.
mcols(gisticOV)$subtype <- testSubtypes(gisticOV, ovsubs, what = "subtype")
mcols(gisticOV)$significance <- ifelse(sig.ind, "*", "")
circosSubtypeAssociation(gisticOV, cnv.genes)

# Alteration type vs. associated subtype, significant regions only.
plotNrCNAsPerSubtype(
    mcols(gisticOV)$type[sig.ind],
    mcols(gisticOV)$subtype[sig.ind]
)

Annotate cytogenetic bands:

# Read the hg19 cytogenetic band table (no header row in the file).
bands.file <- file.path(data.dir, "cytoBand_hg19.txt")
cbands <- read.delim(bands.file, header=FALSE)
# Drop the fifth column and give the remaining ones the names that
# makeGRangesFromDataFrame() recognizes.
cbands <- cbands[,-5]
colnames(cbands) <- c("seqnames", "start", "end", "band")
# Prefix each band with its chromosome (without "chr"), e.g. "1" + "p36.33".
cbands[,4] <- paste0(sub("^chr", "", cbands[,1]), cbands[,4])
# The table uses 0-based, half-open coordinates (first band starts at 0 and
# each band starts where the previous one ends). Convert the starts to the
# 1-based closed convention of GRanges; otherwise every band is one base too
# long on the left and overlaps its predecessor.
cbands <- makeGRangesFromDataFrame(cbands, keep.extra.columns=TRUE,
                                   starts.in.df.are.0based=TRUE)
genome(cbands) <- "hg19"
cbands
## GRanges object with 862 ranges and 1 metadata column:
##         seqnames            ranges strand |        band
##            <Rle>         <IRanges>  <Rle> | <character>
##     [1]     chr1         1-2300000      * |     1p36.33
##     [2]     chr1   2300001-5400000      * |     1p36.32
##     [3]     chr1   5400001-7200000      * |     1p36.31
##     [4]     chr1   7200001-9200000      * |     1p36.23
##     [5]     chr1  9200001-12700000      * |     1p36.22
##     ...      ...               ...    ... .         ...
##   [858]     chrY 15100001-19800000      * |    Yq11.221
##   [859]     chrY 19800001-22100000      * |    Yq11.222
##   [860]     chrY 22100001-26200000      * |    Yq11.223
##   [861]     chrY 26200001-28800000      * |     Yq11.23
##   [862]     chrY 28800001-59373566      * |        Yq12
##   -------
##   seqinfo: 24 sequences from hg19 genome; no seqlengths
# Annotate the GISTIC2 regions with cytogenetic bands; the region metadata
# printed below then carries an additional `band` column.
gisticOV <- annotateCytoBands(gisticOV, cbands)
mcols(gisticOV)
## DataFrame with 70 rows and 4 columns
##              type   subtype significance        band
##       <character> <integer>  <character> <character>
## 1        Deletion         2                  1p36.11
## 2   Amplification         1                   1p34.3
## 3   Amplification         2                   1q21.3
## 4        Deletion         1                     1q41
## 5   Amplification         1            *      1q42.3
## ...           ...       ...          ...         ...
## 66  Amplification         1            *    20q11.21
## 67  Amplification         1            *    20q13.33
## 68       Deletion         1                  21q22.3
## 69  Amplification         4                  22q12.2
## 70       Deletion         1            *    22q13.33
# Region-by-region co-occurrence matrix; rows are labelled with the
# cytogenetic band of each region.
coocc <- analyzeCooccurence(gisticOV)
rownames(coocc) <- mcols(gisticOV)$band
# Heatmap of the matrix: row labels shown in a small font, column labels
# hidden.
ComplexHeatmap::Heatmap(
    coocc,
    name = "Co-occurrence",
    row_title = "SCNAs",
    column_title = "SCNAs",
    show_row_names = TRUE,
    show_column_names = FALSE,
    row_names_gp = gpar(fontsize = 8)
)