# Install the coding.genes package from its GitLab archive, attach it, and
# call its entry point.
# NOTE(review): generate.rdata = TRUE presumably makes the call write its
# result to an .RData file as well -- confirm against the package docs.
devtools::install_url(
  url = "https://sels.tecnico.ulisboa.pt/gitlab/averissimo/coding.genes/repository/archive.zip"
)
library("coding.genes")
retrieve.protein.coding.genes(generate.rdata = TRUE)
Run manually if needed:
# Install the Bioconductor packages used by the rest of this document.
# The installer script is fetched over HTTPS rather than plain HTTP, because
# source() executes whatever it downloads.
# NOTE(review): biocLite() is the legacy Bioconductor installer; on recent R
# releases BiocManager::install() is the replacement -- confirm the target R
# version before running this.
source("https://bioconductor.org/biocLite.R")
biocLite(c("ensembldb", "EnsDb.Hsapiens.v86"))
# Attach the annotation and logging packages while muting their startup
# banners.
suppressPackageStartupMessages({
  library("ensembldb")
  library("EnsDb.Hsapiens.v86")
  library("futile.logger")
  # Switch the logger to the '~m' layout. The return value is captured in an
  # assignment, presumably so that nothing is echoed -- TODO confirm.
  .Last.value <- flog.layout(layout.format("~m"))
})
Ensembl is a genome browser for vertebrate genomes that supports research in comparative genomics, evolution, sequence variation and transcriptional regulation. Ensembl annotates genes, computes multiple alignments, predicts regulatory function and collects disease data. Ensembl tools include BLAST, BLAT, BioMart and the Variant Effect Predictor (VEP) for all supported species.
# Short alias for the annotation database object provided by the
# EnsDb.Hsapiens.v86 package.
edb <- EnsDb.Hsapiens.v86
# hasProteinData(edb) # from devel only, as of 3.4

# Query the genes whose biotype is 'protein_coding', requesting the gene id
# and gene name columns.
ensembl.protein.coding <- genes(
  edb,
  filter  = list(GenebiotypeFilter("protein_coding")),
  columns = c("gene_id", "gene_name")
)

# Log how many ranges came back and how many metadata columns they carry.
# The braces keep both log calls together as a single expression.
{
  flog.info(" Granges: %d", nrow(ensembl.protein.coding@elementMetadata))
  flog.info("Metadata columns: %d", ncol(ensembl.protein.coding@elementMetadata))
}
## Granges: 22285
## Metadata columns: 3
The Consensus CDS (CCDS) project is a collaborative effort to identify a core set of human and mouse protein coding regions that are consistently annotated and of high quality. The long-term goal is to support convergence towards a standard set of gene annotations.
# Download the current human CCDS table from the NCBI FTP server into a
# data frame and log its dimensions.
ccds <- read.table(
  url("ftp://ftp.ncbi.nih.gov/pub/CCDS/current_human/CCDS.current.txt"),
  sep = "\t",
  header = TRUE, # spelled out: `T` is an ordinary variable that can be reassigned
  comment.char = "|", # necessary as header line has a # character
  stringsAsFactors = FALSE
)
flog.info("Size of ccds: %d x %d", nrow(ccds), ncol(ccds))
## Size of ccds: 34297 x 11
# Sorted unique gene names, and sorted gene ids, of the ensembldb protein
# coding genes.
ensembl.genes <- sort(unique(ensembl.protein.coding@elementMetadata$gene_name))
ensembl.genes.ensg.id <- sort(ensembl.protein.coding@elementMetadata$gene_id)

# Gene names listed in ccds, and the subset of them that is absent from the
# ensembldb protein coding gene names.
ccds.genes <- sort(unique(ccds$gene))
ccds.extra.genes <- sort(setdiff(ccds.genes, ensembl.genes))

# Look the ccds-only gene names up in ensembldb (no biotype restriction) to
# obtain their gene ids. Names unknown to ensembldb yield no row here.
ccds.extra.genes.ensg.id <- genes(edb,
                                  filter = list(GenenameFilter(ccds.extra.genes)),
                                  columns = c("gene_id", "gene_name"))

# Union of the ensembldb protein coding gene ids and the ids found for the
# ccds-only genes. The previous code combined the ensembldb ids with
# themselves, so the ccds-derived ids never made it into ensg.id.
ensg.id <- sort(unique(c(ensembl.genes.ensg.id,
                         ccds.extra.genes.ensg.id@elementMetadata$gene_id)))

# Union of the gene names from both sources.
all.genes <- sort(unique(c(ensembl.genes, ccds.extra.genes)))
Gene names:
all.genes
: Union of genes from ensembldb
and ccds
ensembl.genes
: Protein coding genes from ensembldb
ccds.extra.genes
: Protein coding genes from ccds that are not present in ensembldb
Ensembl ID (ENSG****):
ensg.id
: Contains the union of the protein coding gene ids from the ensembldb
package and from the ccds
current release.
## WARNING! Not all genes from ccds have an ENSG id (387 out of 616 have one)
# devtools::use_data(all.genes, overwrite = TRUE)
# devtools::use_data(ensembl.genes, overwrite = TRUE)
# devtools::use_data(ensg.id, overwrite = TRUE)
# devtools::use_data(ccds.extra.genes, overwrite = TRUE)
# save(all.genes, ensembl.genes, ensg.id, ccds.extra.genes, file = 'protein.coding.genes.RData')
##
## Unique genes from ensembldb: 20063
## Extra genes from ccds: + 616
## Overall unique genes: 20679
## -------------------------------------------
## Head from ensembl.genes
##
## [1] "A1BG" "A1CF" "A2M" "A2ML1" "A3GALT2" "A4GALT"
##
## Head from ccds.genes (additional ones not present in ensembl)
##
## [1] "AATBC" "ABCA11P" "ABCC13" "ADA2" "AFDN-AS1" "AFG1L"
## -------------------------------------------
## Head from ensg.id
##
## [1] "ENSG00000000003" "ENSG00000000005" "ENSG00000000419" "ENSG00000000457" "ENSG00000000460" "ENSG00000000938"
## -------------------------------------------
## CCDS genes that don't have an ENSG** id: 268
##
##
## [1] "ADA2" "AFG1L" "AGAP8" "AKD1" "ALKAL1" "ALKAL2" "ANKRD20A20P"
## [8] "ANXA8L2" "BUD23" "C10orf12" "C10orf93" "C11orf39" "C11orf69" "C12orf63"
## [15] "C14orf181" "C14orf184" "C16orf67" "C19orf49" "C1orf188" "C1orf215" "C1orf218"
## [22] "C1orf222" "C1QTNF12" "C20orf158" "C21orf124" "C21orf69" "C21orf77" "C21orf81"
## [29] "C2CD6" "C5orf23" "C6orf114" "C6orf224" "C6orf98" "C7orf13" "C7orf76"
## [36] "C8orf79" "C9orf102" "C9orf164" "CASTOR1" "CASTOR2" "CATSPERE" "CATSPERZ"
## [43] "CBWD6" "CCL4L1" "CENPS" "CENPX" "CNMD" "CT45A4" "CXorf30"
## [50] "CYorf15B" "CYTOR" "DCDC5" "DKFZp451M2119" "DKFZp566H0824" "DKFZp686K16132" "DNHL1"
## [57] "EEF1AKMT2" "EEF1AKMT3" "ELOA" "ELOA2" "ELOA3B" "ELOA3D" "ELOB"
## [64] "ELOC" "EPOP" "ERVFC1-1" "ERVFRD-2" "ETFRF1" "FAM108A3P" "FAM21B"
## [71] "FAM23B" "FAM92A" "FDX1L" "FLJ10246" "FLJ12684" "FLJ33534" "FLJ34503"
## [78] "FLJ35816" "FLJ36116" "FLJ37396" "FLJ40536" "FLJ41327" "FLJ41821" "FLJ42220"
## [85] "FLJ42986" "FLJ44076" "FLJ44674" "FLJ44874" "FLJ44955" "FLJ45337" "FLJ45422"
## [92] "FLJ45684" "FLJ46230" "FLJ46347" "FLJ46365" "FOXD4L2" "FRMPD2P2" "GAGE12I"
## [99] "GAGE2B" "GAGE2C" "GAGE2D" "GAGE4" "GAGE8" "GATSL1" "GCNA"
## [106] "GOLGA6L20" "GPR89C" "HDGFRP2" "HDGFRP3" "HDHD5" "INTS11" "INTS13"
## [113] "INTS14" "KIAA0754" "KIAA0889" "LINC00341" "LINC00597" "LINC00610" "LINC00696"
## [120] "LINC01599" "LINC01879" "LOC100129216" "LOC100130705" "LOC100131303" "LOC100132146" "LOC100133267"
## [127] "LOC100505478" "LOC100506127" "LOC100652758" "LOC100996634" "LOC100996693" "LOC101928917" "LOC101929726"
## [134] "LOC101929926" "LOC102724428" "LOC102724652" "LOC107984640" "LOC113386" "LOC124216" "LOC153684"
## [141] "LOC158434" "LOC200383" "LOC200726" "LOC283874" "LOC284912" "LOC285097" "LOC285398"
## [148] "LOC285696" "LOC339010" "LOC339047" "LOC389199" "LOC391343" "LOC400002" "LOC400576"
## [155] "LOC400707" "LOC400943" "LOC401040" "LOC401072" "LOC401123" "LOC401252" "LOC401281"
## [162] "LOC401610" "LOC403312" "LOC403323" "LOC440157" "LOC440742" "LOC541473" "LOC642980"
## [169] "LOC651503" "LOC729164" "LOC730098" "LOC730159" "LOC81691" "LOC90768" "LOC91431"
## [176] "LOC93622" "MAIP1" "MAP3K20" "MAP3K21" "MGC12965" "MGC14276" "MINDY1"
## [183] "MINDY2" "MINDY3" "MINDY4" "MPIG6B" "MRC1L1" "MRE11" "MRNIP"
## [190] "NBPF16" "NCRNA00173" "NCRNA00300" "NDUFAF8" "NOP53" "NPIPB13" "NSD2"
## [197] "NSD3" "PCLAF" "PFDN6L" "PIMREG" "PJCG6" "PLPBP" "PLSCR3"
## [204] "PNMA6C" "PNMA6D" "PRAG1" "PRAMEF16" "PRAMEF21" "PRAMEF23" "PRAMEF3"
## [211] "PRKN" "PTPN20A" "RFLNA" "RGSL2" "RIOX1" "RIOX2" "RIPOR1"
## [218] "RIPOR2" "RIPOR3" "RPL17P39" "RPS17L" "RUNDC2B" "SCXA" "SEC63D1"
## [225] "SELENOF" "SELENOH" "SELENOI" "SELENOK" "SELENOM" "SELENON" "SELENOO"
## [232] "SELENOP" "SELENOS" "SELENOT" "SELENOV" "SELENOW" "SEM1" "SFTPA2B"
## [239] "SPATA31A2" "SPATA46" "SQOR" "STN1" "TEX44" "THEG5" "TIMM29"
## [246] "TMEM137" "TMEM269" "TMPRSS11E2" "TOGARAM1" "TOGARAM2" "TWNK" "UFD1"
## [253] "UNQ5810" "VSIR" "WASHC1" "WASHC2A" "WASHC2C" "WASHC3" "WASHC4"
## [260] "WASHC5" "WBSCR23" "XAGE1C" "XAGE1D" "XAGE1E" "XAGE2B" "ZBTB8"
## [267] "ZC3H18-AS1" "ZNF663"
# Print the R version, platform, locale and the attached/loaded package
# versions of the current session.
sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 8 (jessie)
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] coding.genes_0.1.0 BiocStyle_2.2.1 futile.logger_1.4.3 EnsDb.Hsapiens.v86_2.1.0
## [5] ensembldb_1.6.2 GenomicFeatures_1.26.3 AnnotationDbi_1.36.2 Biobase_2.34.0
## [9] GenomicRanges_1.26.3 GenomeInfoDb_1.10.3 IRanges_2.8.1 S4Vectors_0.12.1
## [13] BiocGenerics_0.20.0 devtools_1.12.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.9 lattice_0.20-34 Rsamtools_1.26.1 Biostrings_2.42.1
## [5] assertthat_0.1 rprojroot_1.2 digest_0.6.12 mime_0.5
## [9] R6_2.2.0 futile.options_1.0.0 backports_1.0.5 RSQLite_1.1-2
## [13] evaluate_0.10 httr_1.2.1 BiocInstaller_1.24.0 zlibbioc_1.20.0
## [17] Matrix_1.2-8 rmarkdown_1.3 desc_1.1.0 BiocParallel_1.8.1
## [21] AnnotationHub_2.6.4 stringr_1.2.0 RCurl_1.95-4.8 biomaRt_2.30.0
## [25] shiny_1.0.0 httpuv_1.3.3 rtracklayer_1.34.2 htmltools_0.3.5
## [29] SummarizedExperiment_1.4.0 interactiveDisplayBase_1.12.0 roxygen2_6.0.1 XML_3.98-1.5
## [33] crayon_1.3.2 withr_1.0.2 GenomicAlignments_1.10.0 bitops_1.0-6
## [37] commonmark_1.2 grid_3.3.2 xtable_1.8-2 DBI_0.6
## [41] magrittr_1.5 stringi_1.1.2 XVector_0.14.0 xml2_1.1.1
## [45] lambda.r_1.1.9 tools_3.3.2 yaml_2.1.14 memoise_1.0.0
## [49] knitr_1.15.1