BiocManager::install("biomaRt")
library(biomaRt)
Overview
The biomaRt package provides an interface to the BioMart databases, allowing access to a wide range of genomic data directly within R. It simplifies the process of querying BioMart web services for data retrieval. In essence, it is an SQL-like query interface to BioMart for R.
Durinck S, Spellman P, Birney E, Huber W (2009). “Mapping identifiers for the integration of genomic datasets with the R/Bioconductor package biomaRt.” Nature Protocols, 4, 1184–1191.
The best-known author is probably Wolfgang Huber, who is a professor at the European Molecular Biology Laboratory (EMBL) and one of the main contributors to the Bioconductor project. The package maintainer is Mike Smith (https://support.bioconductor.org/u/3986/).
Installation
Install the biomaRt package from Bioconductor:
Usage
Selecting a BioMart Database and Dataset
List available BioMart databases:
listMarts()
 biomart version
1 ENSEMBL_MART_ENSEMBL Ensembl Genes 113
2 ENSEMBL_MART_MOUSE Mouse strains 113
3 ENSEMBL_MART_SNP Ensembl Variation 113
4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 113
The most used database is probably the Ensembl database containing a wide range of genomic data:
ensembl <- useMart("ensembl")
Connect to the Ensembl BioMart service and select the human gene dataset
List available datasets in Ensembl:
# Retrieve the list of available datasets
datasets <- listDatasets(ensembl)
# Display the datasets
datatable(datasets, options = list(pageLength = 5, autoWidth = TRUE))
Protists, Plants, Metazoa and Fungi are also available using listEnsemblGenomes() and useEnsemblGenomes(). Only Ensembl Bacteria data is not available, because it is too large to be handled by BioMart.
Let's select the human gene dataset:
ensembl_hs <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
useMart("ensembl") and useDataset("hsapiens_gene_ensembl", mart = ensembl) can be combined in one step using the useEnsembl() function if the dataset is known.
ensembl_hs <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
Retrieving Data
Fetch gene information for a list of gene symbols:
# Gene symbols used as values for the "hgnc_symbol" filter
genes <- c("BRCA1", "TP53", "EGFR")
# Retrieve position and biotype for each symbol from the human gene dataset
gene_data <- getBM(
  attributes = c(
    "hgnc_symbol", "chromosome_name", "start_position", "end_position",
    "gene_biotype"
  ),
  filters = "hgnc_symbol",
  values = genes,
  mart = ensembl_hs
)
print(gene_data)
  hgnc_symbol chromosome_name start_position end_position gene_biotype
1 BRCA1 17 43044295 43170245 protein_coding
2 EGFR 7 55019017 55211628 protein_coding
3 TP53 17 7661779 7687546 protein_coding
Customize
What can we get from Ensembl BioMart?
# Filters that can be used to restrict a query on the human dataset
filters <- listFilters(ensembl_hs)
datatable(filters, options = list(pageLength = 10, autoWidth = TRUE))
# Attributes (output columns) that can be retrieved from the human dataset
attributes <- listAttributes(ensembl_hs)
datatable(attributes, options = list(pageLength = 10, autoWidth = TRUE))
Example: Retrieve all HUGO gene symbols of genes that are located on chromosomes 17, 20 or Y, and are associated with specific GO terms
The GO terms we are interested in are: GO:0051330, GO:0000080, GO:0000114, GO:0000082. The key to performing this query is to understand that the getBM() function enables you to use more than one filter at the same time. In order to do this, the filter argument should be a vector with the filter names. The values should be a list, where the first element of the list corresponds to the first filter and the second list element to the second filter and so on. The elements of this list are vectors containing the possible values for the corresponding filters.
# Values for the "go" filter
go <- c("GO:0051330", "GO:0000080", "GO:0000114", "GO:0000082")
# Values for the "chromosome_name" filter; written as strings because "Y"
# forces the whole vector to character anyway
chrom <- c("17", "20", "Y")
# One list element per filter, in the same order as the filter names
getBM(
  attributes = "hgnc_symbol",
  filters = c("go", "chromosome_name"),
  values = list(go, chrom),
  mart = ensembl_hs
)
  hgnc_symbol
1 CDK3
2 RPS6KB1
Other marts ?
listMarts()
 biomart version
1 ENSEMBL_MART_ENSEMBL Ensembl Genes 113
2 ENSEMBL_MART_MOUSE Mouse strains 113
3 ENSEMBL_MART_SNP Ensembl Variation 113
4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 113
listMarts(host = "parasite.wormbase.org")
 biomart version
1 parasite_mart WBPS 19 Mart
# Connect to the WormBase ParaSite mart instead of Ensembl
wormbase <- useMart(
  biomart = "parasite_mart",
  host = "https://parasite.wormbase.org",
  port = 443
)
# Select the gene dataset of that mart
wormbase <- useDataset(mart = wormbase, dataset = "wbps_gene")
# Filters available for the selected dataset
filters <- listFilters(wormbase)
datatable(filters, options = list(pageLength = 5, autoWidth = TRUE))
How I use it mostly
#' Annotate a set of genes via BioMart
#'
#' Connects to a BioMart dataset, retrieves `addattributes` for `genes` with
#' `getBM()` and optionally collapses the result to one row per gene.
#'
#' @param genes Values passed to the `reference` filter (e.g. gene symbols).
#' @param reference Name of the BioMart filter that `genes` are matched
#'   against. The result column with this name is renamed to `reference`.
#' @param addattributes Character vector of BioMart attributes to retrieve.
#' @param dataset BioMart dataset to query.
#' @param biomart BioMart database to connect to.
#' @param collapseGoTerms If `TRUE`, rows are collapsed to one row per gene:
#'   values within one `namespace_1003` group are joined with " | ", groups
#'   are joined with " -- ". Rows whose `namespace_1003` is "" or
#'   "cellular_component" are dropped unless the gene has no other rows.
#'   Requires `reference` and "namespace_1003" to be part of `addattributes`.
#' @return A data frame with the requested attributes. If "description" was
#'   requested, everything from the first "[" onwards is removed from it.
My_BioMatr <- function(
    genes,
    reference = "mgi_symbol",
    addattributes = c(
      "mgi_symbol", "ensembl_gene_id", "entrezgene_id", "description",
      "namespace_1003", "go_id", "name_1006"
    ),
    dataset = "mmusculus_gene_ensembl",
    biomart = "ENSEMBL_MART_ENSEMBL",
    collapseGoTerms = TRUE
) {
  # Fail before the (slow) remote query: without these columns the grouping
  # below would silently fall back to the `reference` argument itself and
  # collapse every gene into a single row.
  if (collapseGoTerms) {
    missing_attributes <- setdiff(c(reference, "namespace_1003"), addattributes)
    if (length(missing_attributes) > 0) {
      stop(
        "collapseGoTerms = TRUE requires these attributes in `addattributes`: ",
        paste(missing_attributes, collapse = ", "),
        call. = FALSE
      )
    }
  }

  # Join the distinct non-missing values of `x`. NAs are removed first because
  # str_c() returns NA for the whole string as soon as one element is NA.
  .collapse_unique <- function(x, sep) {
    values <- unique(x[!is.na(x)])
    if (length(values) == 0) {
      return(NA_character_)
    }
    stringr::str_c(values, collapse = sep)
  }

  # Collapse every column of `data` within the groups given in `...`
  .collapse_by <- function(data, ..., sep) {
    dplyr::summarize(
      dplyr::group_by(data, ...),
      dplyr::across(
        .cols = dplyr::everything(),
        .fns = function(x) .collapse_unique(x, sep)
      ),
      .groups = "drop"
    )
  }

  # Annotates additional information to input genes and exports a dataframe
  .Biomatr_annotation <- function() {
    ensembl <- biomaRt::useMart(biomart = biomart, dataset = dataset)
    message("getBM")
    gene <- biomaRt::getBM(
      attributes = addattributes,
      filters = reference,
      values = genes,
      mart = ensembl,
      verbose = TRUE,
      useCache = FALSE
    )
    colnames(gene)[colnames(gene) == reference] <- "reference"
    gene
  }

  GeneInfos <- .Biomatr_annotation()

  if (collapseGoTerms) {
    # Collapse GO terms into a single line per gene and namespace
    Infos_withGo <- dplyr::filter(
      .collapse_by(GeneInfos, reference, namespace_1003, sep = " | "),
      !namespace_1003 %in% c("", "cellular_component")
    )

    # Recover genes that were removed completely by the filter above
    filtered_out_genes <- setdiff(
      unique(GeneInfos$reference),
      unique(Infos_withGo$reference)
    )
    Infos_withoutGo <- GeneInfos[GeneInfos$reference %in% filtered_out_genes, ]

    # Concatenate both datasets and reduce them to one row per gene
    GeneInfos <- .collapse_by(
      dplyr::bind_rows(
        .collapse_by(Infos_withoutGo, reference, sep = " | "),
        Infos_withGo
      ),
      reference,
      sep = " -- "
    )
  }

  if ("description" %in% addattributes) {
    # Remove source from description
    GeneInfos <- tidyr::separate(
      GeneInfos,
      col = description,
      sep = "\\[",
      into = c("description", NA),
      extra = "drop",
      fill = "right"
    )
  }

  GeneInfos
}
Problems commonly occurring and how to solve them
Connection timeouts, especially when querying a large number of genes and attributes.
Solution: Try to reduce the number of genes and attributes you are querying in one go and combine them afterwards.
"Operation timed out after 300012 milliseconds": There’s a 5-minute time limit on queries to the Ensembl BioMart. biomaRt tries to help with this by automatically splitting large queries into smaller batches and submitting them for you, but a query will still take a really long time if it results in thousands of batches. Source
Servers are down.
Solution: You can use the mirror argument to change to a different one.
ensembl_hs <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl",mirror = "asia")
Values for the mirror argument are: useast, asia, and www.
Looping could potentially lead to a ban of your IP address!
# Placeholder: replace with your own (long) vector of gene IDs
gns <- long_vector_of_gene_IDS
# Anti-pattern: one separate request to the server per gene
for (i in seq_along(gns)) {
  getBM(
    attributes = c("ensembl_transcript_id", "transcript_tsl"),
    filters = "external_gene_name",
    values = gns[i],
    mart = ensembl,
    verbose = FALSE,
    uniqueRows = TRUE,
    quote = "\'"
  )
}
## is pernicious and may get you banned. As compared to
getBM(
  attributes = c("ensembl_transcript_id", "transcript_tsl"),
  filters = "external_gene_name",
  values = gns,
  mart = ensembl
)
## where you ask for everything at once and biomaRt does any necessary looping
Local Biomart Database ?
### BioMart Website not available ?
Alternatives
Access Biomart using RestAPI or Perl
Download the data and use it locally
AnnotationHub
Short Overview over AnnotationHub
library(AnnotationHub) # connection to DBs
library(AnnotationDbi) # access to the content of DBs
ah <- AnnotationHub() # connects to Bioconductor's AnnotationHub service
colnames(mcols(ah)) # available metadata columns
# Return an AnnotationHub subset containing only those elements whose metadata
# matches the pattern (as in grepl)
Human_datasets <- query(ah, pattern = "homo sapiens")
unique(Human_datasets$dataprovider)
 [1] "UCSC"
[2] "Ensembl"
[3] "RefNet"
[4] "Inparanoid8"
[5] "NHLBI"
[6] "NIH Pathway Interaction Database"
[7] "BroadInstitute"
[8] "Gencode"
[9] "MISO, VAST-TOOLS, UCSC"
[10] "Stanford"
[11] "dbSNP"
[12] "BioMart"
[13] "KEGG"
[14] "EMBL-EBI"
[15] "GENCODE"
[16] "RMBase v2.0"
[17] "snoRNAdb"
[18] "tRNAdb"
[19] "NCBI"
[20] "DrugAge, DrugBank, Broad Institute"
[21] "DrugAge"
[22] "DrugBank"
[23] "Broad Institute"
[24] "PathBank"
[25] "EBI/EMBL"
[26] "NCBI,DBCLS"
[27] "FANTOM5,DLRP,IUPHAR,HPRD,STRING,SWISSPROT,TREMBL,ENSEMBL,CELLPHONEDB,BADERLAB,SINGLECELLSIGNALR,HOMOLOGENE"
[28] "WikiPathways"
[29] "VAST-TOOLS"
[30] "NA"
[31] "TargetScan,miRTarBase,USCS,ENSEMBL"
[32] "TargetScan"
[33] "QuickGO"
[34] "CIS-BP"
[35] "CTCFBSDB 2.0"
[36] "HOCOMOCO v11"
[37] "JASPAR 2022"
[38] "Jolma 2013"
[39] "SwissRegulon"
[40] "ENCODE SCREEN v3"
[41] "excluderanges"
[42] "ENCODE"
[43] "GitHub"
[44] "CHM13"
[45] "UCSChub"
[46] "Google DeepMind"
[47] "UWashington"
[48] "Bioconductor"
[49] "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/"
[50] "ENCODE cCREs"
# Most used is probably OrgDb, containing gene information and IDs
human_orgDB <- query(
  ah,
  pattern = c("OrgDb", "homo sapiens"),
  pattern.op = `&`
)[[1]]
loading from cache
# OrgDbs can alternatively be installed and accessed directly via Bioconductor:
# BiocManager::install("org.Hs.eg.db")
AnnotationDbi::keytypes(human_orgDB) # Available Information
 [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
[6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
[11] "GENETYPE" "GO" "GOALL" "IPI" "MAP"
[16] "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM"
[21] "PMID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
[26] "UNIPROT"
genes <- c("BRCA1", "TP53", "EGFR")
# Parallel to BioMart getBM(): keys = values, columns = attributes,
# keytype = filters
AnnotationDbi::select(
  human_orgDB,
  keys = genes,
  columns = c("SYMBOL", "GENENAME", "GENETYPE", "ENSEMBL"),
  keytype = "SYMBOL"
)
'select()' returned 1:1 mapping between keys and columns
SYMBOL GENENAME GENETYPE ENSEMBL
1 BRCA1 BRCA1 DNA repair associated protein-coding ENSG00000012048
2 TP53 tumor protein p53 protein-coding ENSG00000141510
3 EGFR epidermal growth factor receptor protein-coding ENSG00000146648
Conclusion
The biomaRt package is a powerful tool for accessing BioMart databases. It has direct access to regularly updated databases with the latest information.
Connection timeouts limit access.
References
Original BioMart: Smedley D, Haider S, Ballester B, Holland R, London D, Thorisson G, Kasprzyk A. BioMart–biological queries made easy. BMC Genomics. 2009 Jan 14;10:22. doi: 10.1186/1471-2164-10-22. PMID: 19144180; PMCID: PMC2649164.