BiocManager::install("regioneR")
BiocManager::install("regioneReloaded")
library(regioneR)
library(regioneReloaded)
library(plyranges)Overview
The regioneR package is an R tool designed to perform permutation-based statistical analyses of overlaps between genomic regions. Its successor, regioneReloaded, extends the functionality with improved performance and additional features for high-throughput workflows.
Both tools are integral for genomic data exploration, especially in identifying significant patterns in genomic overlaps. They integrate seamlessly into the Bioconductor ecosystem.
Installation
Usage
regioneR
regioneR is an R package specifically designed for the analysis of genomic regions. It focuses on studying overlaps between different sets of genomic regions, such as promoters, enhancers, genes, or peaks from ChIP-Seq and ATAC-Seq data. The core functionality relies on permutation tests to assess the statistical significance of observed overlaps.
Typical Use Cases for regioneR:
- Investigating Genomic Interactions:
- Analyzing whether specific DNA elements (e.g., TF binding sites) are enriched in certain genomic regions.
- Testing if chromatin modifications are preferentially found in a group of genes.
- Assessing Statistical Significance of Overlaps:
- Comparing peak sets from experiments with annotated regions (e.g., promoters, enhancers).
- Determining whether two genomic features overlap more frequently than expected by chance.
- Simulations and Permutations:
- Generating random regions for comparisons.
- Exploring the background distribution of genomic data.
regioneReloaded
regioneReloaded builds upon regioneR by adding additional methods and tools for analysis and visualization. It is particularly useful when working with large datasets or when advanced visualizations are required.
Typical Use Cases for regioneReloaded:
- More Efficient Analyses:
- Faster handling of large datasets through optimized algorithms.
- Support for high-performance computing workflows.
- Enhanced Visualizations:
- Improved representation of permutation results and overlap analyses.
- Creation of complex plots to intuitively present findings.
- New Analytical Functions:
- Introduction of novel metrics to evaluate overlaps.
- Integration of additional data formats (e.g., BigWig, BAM).
Practical Applications
These packages are ideal for tasks such as:
- Testing whether transcription factors are enriched in specific genomic regions.
- Identifying whether chromatin interactions preferentially occur within enhancers or introns.
- Simulating random genomic regions and comparing them to real datasets.
Examples from Documentation
Simple Permutationtest
special <- toGRanges(system.file("extdata", "my.special.genes.txt", package="regioneR"))
all.genes <- toGRanges(system.file("extdata", "all.genes.txt", package="regioneR"))
altered <- toGRanges(system.file("extdata", "my.altered.regions.txt", package="regioneR"))
pt <- permTest(A=special, ntimes=100, randomize.function=resampleRegions, universe=all.genes,
evaluate.function=numOverlaps, B=altered, verbose=FALSE)
plot(pt)lz <- localZScore(pt=pt, A=special, B=altered)
plot(lz)genome <- filterChromosomes(getGenome("hg38"), keep.chr="chr1")
Attaching package: 'Biostrings'
The following object is masked from 'package:base':
strsplit
Attaching package: 'rtracklayer'
The following object is masked from 'package:BiocIO':
FileForFormat
B <- createRandomRegions(nregions=100, length.mean=10000, length.sd=20000, genome=genome,
non.overlapping=FALSE)
A <- B[sample(20)]
pt <- overlapPermTest(A=A, B=B, ntimes=100, genome=genome, non.overlapping=FALSE)
lz <- localZScore(A=A, B=B, pt=pt, window=10*mean(width(A)), step=mean(width(A))/2 )
plot(lz)Crosswise Permtest
Create example genome
# Create genome
AlienGenome <-
toGRanges(data.frame(
chr = c("AlChr1", "AlChr2", "AlChr3", "AlChr4"),
start = c(rep(1, 4)),
end = c(2e6, 1e6, 5e5, 1e5)
))
gnm <- AlienGenome
nreg <- 100
# Create 3 different region sets
regA <-
createRandomRegions(
nregions = nreg,
length.mean = 100,
length.sd = 50,
non.overlapping = TRUE,
genome = gnm
)
regB <-
createRandomRegions(
nregions = nreg,
length.mean = 100,
length.sd = 50 ,
non.overlapping = TRUE,
genome = gnm
)
regC <-
createRandomRegions(
nregions = nreg,
length.mean = 100,
length.sd = 50,
non.overlapping = TRUE,
genome = gnm
)
# Create region sets with ranging similarity from 10% to 90%
vectorPerc <- seq(0.1, 0.9, 0.1)
RsetA <-
similarRegionSet(
GR = regA,
name = "regA",
genome = gnm,
vectorPerc = vectorPerc
)
RsetB <-
similarRegionSet(
GR = regB,
name = "regB",
genome = gnm,
vectorPerc = vectorPerc
)
RsetC <-
similarRegionSet(
GR = regC,
name = "regC",
genome = gnm,
vectorPerc = vectorPerc
)
# Create shared region set from A and B
vectorPerc2 <- seq(0.2, 0.8, 0.2)
regAB <- c(sample(regA, nreg / 2), sample(regB, nreg / 2))
RsetAB <-
similarRegionSet(
GR = regAB,
name = "regAB",
genome = gnm,
vectorPerc = vectorPerc2
)
# Create no similar region sets
reg_no_A <-
createRandomRegions(
nregions = nreg,
length.mean = 100,
length.sd = 50,
non.overlapping = TRUE,
genome = subtractRegions(gnm, regA)
)
reg_no_B <-
createRandomRegions(
nregions = nreg,
length.mean = 100,
length.sd = 50,
non.overlapping = TRUE,
genome = subtractRegions(gnm, regB)
)
reg_no_C <-
createRandomRegions(
nregions = nreg,
length.mean = 100,
length.sd = 50,
non.overlapping = TRUE,
genome = subtractRegions(gnm, regC)
)
reg_no_AB <-
createRandomRegions(
nregions = nreg,
length.mean = 100,
length.sd = 50,
non.overlapping = TRUE,
genome = subtractRegions(gnm, c(regA, regB))
)
reg_no_ABC <-
createRandomRegions(
nregions = nreg,
length.mean = 100,
length.sd = 50,
non.overlapping = TRUE,
genome = subtractRegions(gnm, c(regA, regB, regC))
)
# create close proximity region set but no overlaps
dst <- sample(c(-300,300),length(regA),replace = TRUE)
regD <- regioneR::toGRanges(
data.frame(
chr=as.vector(GenomeInfoDb::seqnames(regA)),
start = start(regA) + dst,
end = end (regA) + dst
)
)
RsetD <-
similarRegionSet(
GR = regD,
name = "regD",
genome = gnm,
vectorPerc = vectorPerc2
)
# Store all regions in a list
Rset_NO <- list(reg_no_A, reg_no_B, reg_no_C, reg_no_AB, reg_no_ABC)
names(Rset_NO) <- c("reg_no_A", "reg_no_B", "reg_no_C", "reg_no_AB", "reg_no_ABC")
AlienRSList_narrow <- c(RsetA, RsetB, RsetC, RsetD, RsetAB, Rset_NO)data("cw_Alien")Matrix Calculation and visualization
Crosswise Permtest
set.seed(42)
cw_Alien_RaR <- crosswisePermTest(
Alist = AlienRSList_narrow,
Blist = AlienRSList_narrow,
sampling = FALSE,
genome = AlienGenome,
per.chromosome=TRUE,
ranFUN = "randomizeRegions",
evFUN = "numOverlaps",
ntimes= 1000,
mc.cores = 20
)
set.seed(42)
cw_Alien_ReR <- crosswisePermTest(
Alist = AlienRSList_narrow,
Blist = AlienRSList_narrow,
sampling = FALSE,
genome = AlienGenome,
per.chromosome=TRUE,
ranFUN = "resampleRegions",
evFUN = "numOverlaps",
ntimes= 1000,
mc.cores = 20
)
set.seed(42)
cw_Alien_ReG <- crosswisePermTest(
Alist = AlienRSList_narrow,
Blist = AlienRSList_narrow,
sampling = FALSE,
genome = AlienGenome,
per.chromosome=TRUE,
ranFUN = "resampleGenome",
evFUN = "numOverlaps",
ntimes= 100,
mc.cores = 20
)cw_Alien_RaR <- makeCrosswiseMatrix(cw_Alien_RaR)
cw_Alien_ReG <- makeCrosswiseMatrix(cw_Alien_ReG)
cw_Alien_ReR <- makeCrosswiseMatrix(cw_Alien_ReR)
modX <- getHClust(cw_Alien_ReG,hctype = "rows")
modY <- getHClust(cw_Alien_ReG,hctype = "cols")
X<-modX$labels[modX$order]
Y<-modY$labels[modX$order]
ord<-list(X=X,Y=Y)the matrix_type parameter plotCrosswiseMatrix() allows to choose between association matrix (where the z-score value obtained from the permutation test calculation will be shown), or correlation (where the Pearson’s R-value between each Regions Set will be used).
p_ReG <- plotCrosswiseMatrix(cw_Alien_ReG, matrix_type = "association", ord_mat = ord)
plot(p_ReG)p_ReG_cor <- plotCrosswiseMatrix(cw_Alien_ReG, matrix_type = "correlation", ord_mat = ord)
plot(p_ReG_cor)plot(modX,cex = 0.8)Plot Single Permutation
Similar to the regioneR results and plots returning ggplot objects
plotSinglePT(cw_Alien_ReG, RS1 = "regA", RS2 = "regA_05")Warning in ggplot2::geom_segment(ggplot2::aes(x = mean.1, y = max_curve/2, : All aesthetics have length 1, but the data has 2 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
a single row.
plotSinglePT(cw_Alien_ReG, RS1 = "regA", RS2 = "regC")Warning in ggplot2::geom_segment(ggplot2::aes(x = mean.1, y = max_curve/2, : All aesthetics have length 1, but the data has 2 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
a single row.
Dimension Reduction
Possible algorithms:
- PCA
- tSNE
- UMAP
Possible clustering algorithms:
- kmeans
- hclust
- pam
set.seed(42)
plotCrosswiseDimRed(cw_Alien_ReG, nc = 6, type="PCA", clust_met = "kmeans")Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
Multi Local Zscore
mlz_Alien_ReG<-multiLocalZscore(A = AlienRSList_narrow$regA,
Blist = AlienRSList_narrow,
ranFUN = "resampleGenome",
evFUN = "numOverlaps",
window = 100,
step = 1,
max_pv = 1,
genome = AlienGenome)getParameters(mLZ_regA_ReG) parameter value
1 A AlienRSList_narrow$regA
2 Blist AlienRSList_narrow
3 sampling FALSE
4 fraction 0.15
5 min_sampling 0.15
6 ranFUN resampleGenome
7 evFUN numOverlaps
8 universe NULL
9 window 600
10 step 10
11 adj_pv_method BH
12 max_pv 0.05
13 Nregions 100
mlz_Me <- getMultiEvaluation(rR = mLZ_regA_ReG )
head(mlz_Me$resumeTable) name p_value z_score mean_perm_test sd_perm_test n_overlaps
1 regA_01 0.000999001 115.5593 0.576 0.7738364 90
2 regA_02 0.000999001 104.9830 0.559 0.7662285 81
3 regA_03 0.000999001 91.9693 0.568 0.7549476 70
4 regA_04 0.000999001 79.3280 0.555 0.7493574 60
5 regA_05 0.000999001 65.0379 0.598 0.7595874 50
6 regA_06 0.000999001 52.3927 0.600 0.7710996 41
norm_zscore adj.p_value
1 11.55593 0.003
2 10.49830 0.003
3 9.19693 0.003
4 7.93280 0.003
5 6.50379 0.003
6 5.23927 0.003
mLZ_regA_ReG <- makeLZMatrix(mLZ_regA_ReG)[1] "method selected for hclustering: complete"
complete average single ward.D2 median centroid mcquitty
0.9098230 0.8541174 0.7986368 0.8320493 0.8348816 0.8510903 0.8475492
[1] "method selected for hclustering: average"
complete average single ward.D2 median centroid mcquitty
0.8909663 0.8944326 0.7892161 0.8311822 0.8813696 0.8857083 0.8784679
plotLocalZScoreMatrix(mLZ_regA_ReG, maxVal = "max")Single Local ZScore
plotSingleLZ(mLZ = mLZ_regA_ReG, RS =c("regA"), smoothing = TRUE)plotSingleLZ(mLZ = mLZ_regA_ReG,RS =c("regA","regA_02","regA_06","regA_08","regD"),smoothing = TRUE)Own Example
Enrichment with Annotation with chromHMM overlaps
Annotation Enrichment Code
chrom.stat <- chrom.stat %>% mutate(state=case_when(name=="TSS"~"Promoter_TSS",
name=="PF"~"Promoter_flanking",
name=="E"~"Enhancer",
name=="WE"~"weak_enhancer_open_chromatin",
name=="CTCF"~"CTCF_enriched",
name=="T"~"predicted_transcribed",
name=="R"~"predicted_repressed"))
chrom.stat %>% dplyr::select(state) %>% unique()
chrom.stat.gr <- GRanges(seqnames=chrom.stat$chrom,
ranges = IRanges(start=chrom.stat$start,
end=chrom.stat$end),
state=chrom.stat$name,
state.exp=chrom.stat$state)
chrom.stat.list <- split(chrom.stat.gr, chrom.stat.gr$state.exp)
seqlevelsStyle(chrom.stat.list) <- "UCSC"
chrom.stat.hg38.gr <- liftOver(chrom.stat.list, ch)
set.seed(42)
histone_state_ReG<-crosswisePermTest(Alist = chrom.stat.hg38.gr, #GRanges list of chromatin state annotations (chromHMM)
Blist = TTS.tmp, #GRanges list of Triplex Target Sites
sampling = FALSE,
mc.cores= 40,
ranFUN = "resampleGenome",
evFUN = "numOverlaps",
genome = "hg38",
ntimes= 100)Code
histone_state_ReG <- makeCrosswiseMatrix(histone_state_ReG)
plotCrosswiseMatrix(histone_state_ReG, matrix_type = "correlation")Code
colMatrix <-
rev(c(rev(RColorBrewer::brewer.pal(9, "PuBu")), RColorBrewer::brewer.pal(9, "YlOrRd")))
maxVal<-max(c(abs(max(df$value)),abs(min(df$value))))
maxVal<-max(c(abs(max(df$norm_value)),abs(min(df$norm_value))))
plot_combined <- df %>% filter(Affinity!="total") %>% mutate(order.ord=as.factor(order),
order.ord=fct_relevel(order.ord, "high_blocked", "high_accessible", "mid_blocked", "mid_accessible", "low_blocked", "low_accessible")) %>%
filter(Y %in% c("predicted_repressed", "Promoter_flanking", "Promoter_TSS","Enhancer"),
Nucleosome=="accessible") %>%
ggplot()+
geom_tile(aes(order.ord, Y, fill=norm_value))+
scale_fill_gradientn(
colours = rev(colMatrix),
limits = c(-2.4, 2.4),
oob = scales::squish
)+
facet_wrap(RNA~.)+
theme(axis.text.x = element_text(angle=45, hjust = 1))Code
plotCrosswiseDimRed(histone_state_ReG, nc = 4, type="PCA", ellipse = FALSE)References
- Bernat Gel, Anna Díez-Villanueva, Eduard Serra, Marcus Buschbeck, Miguel A. Peinado, Roberto Malinverni, regioneR: an R/Bioconductor package for the association analysis of genomic regions based on permutation tests, Bioinformatics, Volume 32, Issue 2, January 2016, Pages 289–291, https://doi.org/10.1093/bioinformatics/btv562
- Malinverni R, Corujo D, Gel B, Buschbeck M. regioneReloaded: evaluating the association of multiple genomic region sets. Bioinformatics. 2023 Nov 1;39(11):btad704. doi: 10.1093/bioinformatics/btad704. PMID: 37988135; PMCID: PMC10681856.