Introduction

The GRIN2 package is an improved version of GRIN software that streamlines its use in practice to analyze genomic lesion data, accelerate its computing, and expand its analysis capabilities to answer additional scientific questions including a rigorous evaluation of the association of genomic lesions with RNA expression.

T-ALL Example dataset

1) Obtain clinical, lesion and gene expression data

data(clin.data)
data(lesion.data)
data(expr.data)

head(lesion.data)
#>       ID chrom loc.start   loc.end lsn.type
#> 1 PARFIH    16  67650782  67650782 mutation
#> 2 PARFIH     3  49759671  49759671 mutation
#> 3 PARFIH     1   6313912   6313912 mutation
#> 4 PARFIH     1 235929422 235929422 mutation
#> 5 PARFIH     4 142053653 142053653 mutation
#> 6 PARFIH     9  27206737  27206737 mutation

# Specify a folder on your local machine to store the analysis results:
# resultsPath=tempdir()
# knitr::opts_knit$set(root.dir = normalizePath(path = resultsPath))

2) Retrieve Genomic Annotations for Genes and Regulatory Features

hg19.ann=get.ensembl.annotation("Human_GRCh37") 
# "Human_GRCh38" can be used instead of "Human_GRCh37" to retrieve data for hg38

# 1) Gene annotation data that include around 20,000 coding genes and 25,000 Non-coding processed transcripts such as lncRNAs, miRNAs, snRNA and snoRNAs:
gene.annotation=hg19.ann$gene.annotation

# 2)Annotation data for regulatory features retrieved from ensembl regulatory build that  include around 500,000 feauters (promoters, enhancer, TF and CTCF binding sites, etc...). Ensembl imports publicly available data from different large epigenomic consortia that  includes ENCODE, Roadmap Epigenomics and Blueprint (118 epigenome):
hg19.reg.annotation=hg19.ann$reg.annotation.predicted

# 3)Annotation data for experimentally validated regulatory features retrieved from FANTOM5  project:
hg19.reg.FANTOM=hg19.ann$reg.annotation.validated

# Instead of retrieving annotation data from Ensembl BioMart, users can use their own gene  annotation data files. File should has four required columns that include "gene" which is the ensembl ID of annotated genes to which the lesion data will be overlapped, "chrom" which is the chromosome on which the gene is located, "loc.start" which is the gene start position, and "loc.end" the gene end position. hg19.gene annotation will be used as an example gene annotation data file:
data(hg19.gene.annotation)

head(hg19.gene.annotation)
#>              gene chrom loc.start   loc.end
#> 1 ENSG00000177076     9  19408925  19452018
#> 2 ENSG00000122729     9  32384618  32454767
#> 3 ENSG00000167107    17  48503519  48552206
#> 4 ENSG00000130402    19  39138289  39222223
#> 5 ENSG00000173137     8 145596790 145618457
#> 6 ENSG00000035687     1 244571796 244615436
#>                                                          description gene.name
#> 1               alkaline ceramidase 2 [Source:HGNC Symbol;Acc:23675]     ACER2
#> 2                  aconitase 1, soluble [Source:HGNC Symbol;Acc:117]      ACO1
#> 3 acyl-CoA synthetase family member 2 [Source:HGNC Symbol;Acc:26101]     ACSF2
#> 4                      actinin, alpha 4 [Source:HGNC Symbol;Acc:166]     ACTN4
#> 5     aarF domain containing kinase 5 [Source:HGNC Symbol;Acc:21738]     ADCK5
#> 6             adenylosuccinate synthase [Source:HGNC Symbol;Acc:292]      ADSS
#>          biotype chrom.strand chrom.band
#> 1 protein_coding            1      p22.1
#> 2 protein_coding            1      p21.1
#> 3 protein_coding            1     q21.33
#> 4 protein_coding            1      q13.2
#> 5 protein_coding            1      q24.3
#> 6 protein_coding           -1        q44

3) Retrieve Chromosome Size Data

# To retrieve chromosome size data for GRCh37 (hg19) genome build from chr.info txt file  available on UCSC genome browser
hg19.chrom.size=get.chrom.length("Human_GRCh37")
# "Human_GRCh38" can be used to retrieve chrom size data for hg38

# Instead of retrieving chromosome size data from UCSC genome browser, users can use their own files that should has two required columns that include "chrom" with the chromosome number and "size" for the size of the chromosome in base pairs:
# data(hg19.chrom.size)

head(hg19.chrom.size)
#>   chrom      size
#> 1     1 249250621
#> 2     2 243199373
#> 3     3 198022430
#> 4     4 191154276
#> 5     5 180915260
#> 6     6 171115067

4) Run Genomic Random Interval (GRIN) Analysis

# Users can run GRIN analysis by just specifying the genome.version in grin.stats function. 
# A) Gene annotation data will be directly retrieved from Ensembl BioMart for the specified  genome assembly using get.ensembl.annotation function and chromosome size data will be also retrieved from UCSC genome browser:
# grin.results=grin.stats(lesion.data, 
#                         genome.version="Human_GRCh37")
# "Human_GRCh38" can be used instead of "Human_GRCh37" for hg38 genome assembly

# Users can also use their own annotation and chromosome size data files to run GRIN analysis:
grin.results=grin.stats(lesion.data, 
                        hg19.gene.annotation, 
                        hg19.chrom.size)
# it takes around 2 minutes to map 6,887 lesions to around 57,000 annotated genes and return the GRIN results.

# B) To run GRIN for computationally predicted regulatory features from Ensembl regulatory build:
# First get a group of 500 regulatory features for an example run: 
hg19.reg.example=hg19.reg.annotation[396500:397000,]
# whole file with around 500,000 feature takes around 25 minutes to return the results:
# Run GRIN analysis:
grin.results.reg=grin.stats(lesion.data, 
                            hg19.reg.example, 
                            hg19.chrom.size)

# C) To run GRIN analysis for experimentally verified regulatory features from FANTOM5 project:
# First get a group of 500 FANTOM5 regulatory features for an example run:
hg19.fantom.example=hg19.reg.FANTOM[232500:233000,]

grin.results.fantom=grin.stats(lesion.data, 
                               hg19.fantom.example, 
                               hg19.chrom.size)

5) Now, let’s Take a Look on the GRIN Output Results:

# Extract GRIN results table:
grin.table=grin.results$gene.hits
sorted.results <- grin.table[order(as.numeric(as.character(grin.table$p2.nsubj))),]

First section of GRIN results table will include gene annotation in addition to the number of subjects affected by each type of lesions:

head(sorted.results[,c(7,11:14)])
#>     gene.name nsubj.fusion nsubj.gain nsubj.loss nsubj.mutation
#> 273      PTEN            0          8         23             37
#> 219       MYB            4         26          2             13
#> 64     CDKN1B            0          1         26              4
#> 105      ETV6            2          2         21              7
#> 395       WT1            0          1          9             24
#> 86      DDX3X            2          1          3              4

Results will also include the probability (p) and FDR adjusted q-value for each gene to be affected by each type of lesion:

head(sorted.results[,c(7,19:22)])
#>     gene.name q.nsubj.fusion q.nsubj.gain q.nsubj.loss q.nsubj.mutation
#> 273      PTEN   1.000000e+00 1.000000e+00 4.797673e-35     8.227521e-77
#> 219       MYB   2.515949e-09 3.203506e-50 8.633060e-01     7.315274e-27
#> 64     CDKN1B   1.000000e+00 1.000000e+00 1.095989e-25     8.487617e-09
#> 105      ETV6   5.295505e-03 1.000000e+00 2.363783e-16     1.401030e-06
#> 395       WT1   1.000000e+00 1.000000e+00 1.439453e-06     6.683362e-50
#> 86      DDX3X   4.803643e-05 1.000000e+00 8.633060e-01     1.536521e-06

Another important part of the output is the constellation results testing if the gene is affected by one type of lesions (p1.nusubj) or a constellation of two types of lesions (p2.nsubj), three types of lesions (p3.nsubj), etc.. with FDR adjusted q-values added to the table as well:

head(sorted.results[,c(7,27:30)])
#>     gene.name     q1.nsubj     q2.nsubj     q3.nsubj q4.nsubj
#> 273      PTEN 6.483477e-77 1.102996e-69 1.000000e+00        1
#> 219       MYB 4.733319e-51 5.504218e-53 4.514161e-29        1
#> 64     CDKN1B 7.503116e-26 2.500805e-16 1.000000e+00        1
#> 105      ETV6 3.034201e-16 6.736903e-12 1.227281e-09        1
#> 395       WT1 6.380742e-50 9.437348e-11 1.000000e+00        1
#> 86      DDX3X 6.227043e-07 2.692079e-10 1.000000e+00        1

The second part of the results table report the same set of results but for the number of hits affecting each gene for each lesion type instead of the number of unique affected subjects. For example, if NOTCH1 gene is affected by 4 mutations in the same subject, this event will be counted as 4 hits in the n.hits stats but 1 subject in the n.subj stats:

head(sorted.results[,c(7,31:34)])
#>     gene.name nhit.fusion nhit.gain nhit.loss nhit.mutation
#> 273      PTEN           0         8        38            45
#> 219       MYB           4        27         2            14
#> 64     CDKN1B           0         1        26             4
#> 105      ETV6           2         2        22             7
#> 395       WT1           0         1         9            27
#> 86      DDX3X           2         1         3             5

6) Write GRIN Results

# write.grin.xlsx function return an excel file with multiple sheets that include GRIN results table, interpretation of each column in the results, and methods paragraph
# write.grin.xlsx(grin.results, "T-ALL_GRIN_result_annotated_genes.xlsx")

# To return the results table without other information (will be helpful in case of large lesion data files where the gene.lsn.data sheet will be > 1 million rows that halt the write.grin.xlsx function).
grin.res.table=grin.results$gene.hits

7) Genome-wide Lesion Plot

genomewide.plot=genomewide.lsn.plot(grin.results, 
                                    max.log10q=150) 
Figure 1. Genome-wide lesion plot

Figure 1. Genome-wide lesion plot

# This function use the list of grin.results

8) Stacked Barplot for a List of Genes of Interest

# This barplot shows the number of patients affected by different types of lesions in a list of genes of interest:
count.genes=as.vector(c("CDKN2A", "NOTCH1", "CDKN2B", "TAL1", "FBXW7", "PTEN", "IRF8",
                        "NRAS", "BCL11B", "MYB", "LEF1","RB1", "MLLT3", "EZH2", "ETV6",
                        "CTCF", "JAK1", "KRAS", "RUNX1", "IKZF1", "KMT2A", "RPL11", "TCF7",
                        "WT1", "JAK2", "JAK3", "FLT3"))

# return the stacked barplot
grin.barplt(grin.results,
            count.genes)
Figure 2. stacked barplot with number of patients affected by different types of lesions in a list of genes of interest

Figure 2. stacked barplot with number of patients affected by different types of lesions in a list of genes of interest

9) Prepare an OncoPrint Lesion Matrix

# First identify the list of genes to be included in the oncoprint:
oncoprint.genes=as.vector(c("ENSG00000101307", "ENSG00000171862", "ENSG00000138795", 
                            "ENSG00000139083", "ENSG00000162434", "ENSG00000134371",
                            "ENSG00000118058", "ENSG00000171843", "ENSG00000139687",
                            "ENSG00000184674", "ENSG00000118513", "ENSG00000197888",
                            "ENSG00000111276", "ENSG00000258223", "ENSG00000187266",
                            "ENSG00000174473", "ENSG00000133433", "ENSG00000159216",
                            "ENSG00000107104", "ENSG00000099984", "ENSG00000078403",
                            "ENSG00000183150", "ENSG00000081059", "ENSG00000175354",
                            "ENSG00000164438"))

# Prepare a lesion matrix for the selected list of genes with each row as a gene and each column is a patient (this matrix is compatible with oncoPrint function in ComplexHeatmap package):
oncoprint.mtx=grin.oncoprint.mtx(grin.results, 
                                 oncoprint.genes)

head(oncoprint.mtx[,1:6])
#>        PASGFH    PASKSY    PASTDU PASYWF PATDRC PATFWF
#> TCF7    loss;     loss; mutation;  loss;  loss;  loss;
#> KANK1             gain;                               
#> CDKN1B  loss;     loss;                   loss;  loss;
#> MYB           mutation;                               
#> LEF1                                                  
#> ETV6    loss;     loss;                   loss;  loss;

10) Pass the Lesion Matrix to OncoPrint Function

# Use onco.print.props function to specify a hgt for each lesion category to show all lesions that might affect a certain patient. For example, if the same patient is affected by gain and mutation, only 25% of the oncoprint rectangle will be filled with the mutations green color and the rest will appear as the gain red color.
onco.props<-onco.print.props(lesion.data,
                             hgt = c("gain"=5, "loss"=4, "mutation"=2, "fusion"=1))
column_title = "" # optional

# use oncoprint function from ComplexHeatmap library to plot the oncoprint:
oncoPrint(oncoprint.mtx,
          alter_fun = onco.props$alter_func,
          col = onco.props$col,
          column_title = column_title,
          heatmap_legend_param = onco.props$heatmap_legend_param)