Contents

1 Install coding.genes package

# Install the coding.genes package from the zip archive of its GitLab repository.
devtools::install_url(
  url = 'https://sels.tecnico.ulisboa.pt/gitlab/averissimo/coding.genes/repository/archive.zip'
)
# Attach the freshly installed package so its functions can be called unqualified.
library(coding.genes)
# Run the package's retrieval routine.
# NOTE(review): generate.rdata = TRUE presumably also writes the result to an
# RData file -- confirm against the package documentation.
retrieve.protein.coding.genes(generate.rdata = TRUE)

2 Replication script

2.1 Install necessary packages

Run this chunk manually if the packages are not yet installed.

# Fetch the Bioconductor installer over HTTPS: the script is executed as soon as
# it is downloaded, so it must not travel over an unencrypted connection.
# NOTE(review): biocLite belongs to the Bioconductor releases matching the R
# version recorded in the session info; newer R versions use
# BiocManager::install() instead -- confirm which one applies to your setup.
source("https://bioconductor.org/biocLite.R")
# Install the two Bioconductor packages that are attached in the next section.
biocLite(c('ensembldb', 'EnsDb.Hsapiens.v86'))

2.2 Load libraries

# Attach the packages used by the rest of the script without printing their
# startup banners.
suppressPackageStartupMessages({
  library(ensembldb)
  library(EnsDb.Hsapiens.v86)
  library(futile.logger)
})
# Set the logger layout to the '~m' format. invisible() keeps the return value
# out of the report without assigning to .Last.value, which would create a
# global variable shadowing the one maintained by base R.
invisible(flog.layout(layout.format('~m')))

3 Load data

3.1 Load ensembl data

Ensembl is a genome browser for vertebrate genomes that supports research in comparative genomics, evolution, sequence variation and transcriptional regulation. Ensembl annotates genes, computes multiple alignments, predicts regulatory function and collects disease data. Ensembl tools include BLAST, BLAT, BioMart and the Variant Effect Predictor (VEP) for all supported species.

# Annotation database object provided by the EnsDb.Hsapiens.v86 package.
edb <- EnsDb.Hsapiens.v86
# hasProteinData(edb) # from devel only, as of 3.4

# Query the database for genes whose biotype is 'protein_coding', asking for
# the gene id and gene name columns.
ensembl.protein.coding <- genes(
  edb,
  columns = c('gene_id', 'gene_name'),
  filter  = list(GenebiotypeFilter('protein_coding'))
)

# Log the number of rows and columns of the metadata attached to the result.
{
  flog.info('         Granges: %d', nrow(ensembl.protein.coding@elementMetadata))
  flog.info('Metadata columns: %d', ncol(ensembl.protein.coding@elementMetadata))
}
##          Granges: 22285
## Metadata columns: 3

3.2 Load ccds data

The Consensus CDS (CCDS) project is a collaborative effort to identify a core set of human and mouse protein coding regions that are consistently annotated and of high quality. The long term goal is to support convergence towards a standard set of gene annotations.

# Download the current human CCDS table and parse it as tab-separated text.
ccds <- read.table(url('ftp://ftp.ncbi.nih.gov/pub/CCDS/current_human/CCDS.current.txt'),
                   sep = '\t',
                   header = TRUE, # spelled out: T is an ordinary variable that can be reassigned
                   comment.char = "|", # necessary as header line has a # character
                   stringsAsFactors = FALSE)
# Log the dimensions of the parsed table.
flog.info('Size of ccds: %d x %d', nrow(ccds), ncol(ccds))
## Size of ccds: 34297 x 11

4 Build unique gene set

# Sorted unique gene names, and sorted gene ids, of the ensembl protein coding genes.
ensembl.genes         <- sort(unique(ensembl.protein.coding@elementMetadata$gene_name))
ensembl.genes.ensg.id <- sort(ensembl.protein.coding@elementMetadata$gene_id)

# Sorted unique gene names listed in ccds, and those of them that are absent
# from the ensembl protein coding names.
ccds.genes       <- sort(unique(ccds$gene))
ccds.extra.genes <- sort(setdiff(ccds.genes, ensembl.genes))
# Look the extra ccds gene names up in the ensembl database (no biotype
# restriction) to obtain their gene ids; names unknown to the database yield no row.
ccds.extra.genes.ensg.id <- genes(edb, filter = list(GenenameFilter(ccds.extra.genes)),
                                  columns = c('gene_id', 'gene_name'))

# Union of the ensembl protein coding ids and the ids found for the extra ccds
# genes. The second operand has to be the ccds lookup result: combining the
# ensembl ids with themselves would leave every ccds-only id out.
ensg.id <- sort(unique(c(ensembl.genes.ensg.id,
                         ccds.extra.genes.ensg.id@elementMetadata$gene_id)))

# Union of the gene names from both sources.
all.genes <- sort(unique(c(ensembl.genes, ccds.extra.genes)))

4.1 Export to RData

Gene names:

  • all.genes: Union of genes from ensembldb and ccds
  • ensembl.genes: Protein coding genes from ensembldb
  • ccds.extra.genes: Genes from ccds that are not present in the ensembldb protein coding genes

Ensembl ID (ENSG****):

  • ensg.id: Contains the union of both protein coding genes of ensembldb package and ccds current release.
    • Note: If the chunk below shows a warning, then not all genes from ccds have an Ensembl id
## WARNING! not all genes from ccds have a ensg id (387 have out of 616)
# devtools::use_data(all.genes,        overwrite = TRUE)
# devtools::use_data(ensembl.genes,    overwrite = TRUE)
# devtools::use_data(ensg.id,          overwrite = TRUE)
# devtools::use_data(ccds.extra.genes, overwrite = TRUE)
# save(all.genes, ensembl.genes, ensg.id, ccds.extra.genes, file = 'protein.coding.genes.RData')

4.2 Information on protein coding gene set

## 
## Unique genes from ensembldb:   20063
##       Extra genes from ccds: +   616
##        Overall unique genes:   20679
## -------------------------------------------
## Head from ensembl.genes
## 
## [1] "A1BG"    "A1CF"    "A2M"     "A2ML1"   "A3GALT2" "A4GALT" 
## 
## Head from ccds.genes (additional ones not present in ensembl)
## 
## [1] "AATBC"    "ABCA11P"  "ABCC13"   "ADA2"     "AFDN-AS1" "AFG1L"   
## -------------------------------------------
## Head from ensg.id
## 
## [1] "ENSG00000000003" "ENSG00000000005" "ENSG00000000419" "ENSG00000000457" "ENSG00000000460" "ENSG00000000938"
## -------------------------------------------
## CCDS genes that don't have an ENSG** id: 268
## 
## 
##   [1] "ADA2"           "AFG1L"          "AGAP8"          "AKD1"           "ALKAL1"         "ALKAL2"         "ANKRD20A20P"   
##   [8] "ANXA8L2"        "BUD23"          "C10orf12"       "C10orf93"       "C11orf39"       "C11orf69"       "C12orf63"      
##  [15] "C14orf181"      "C14orf184"      "C16orf67"       "C19orf49"       "C1orf188"       "C1orf215"       "C1orf218"      
##  [22] "C1orf222"       "C1QTNF12"       "C20orf158"      "C21orf124"      "C21orf69"       "C21orf77"       "C21orf81"      
##  [29] "C2CD6"          "C5orf23"        "C6orf114"       "C6orf224"       "C6orf98"        "C7orf13"        "C7orf76"       
##  [36] "C8orf79"        "C9orf102"       "C9orf164"       "CASTOR1"        "CASTOR2"        "CATSPERE"       "CATSPERZ"      
##  [43] "CBWD6"          "CCL4L1"         "CENPS"          "CENPX"          "CNMD"           "CT45A4"         "CXorf30"       
##  [50] "CYorf15B"       "CYTOR"          "DCDC5"          "DKFZp451M2119"  "DKFZp566H0824"  "DKFZp686K16132" "DNHL1"         
##  [57] "EEF1AKMT2"      "EEF1AKMT3"      "ELOA"           "ELOA2"          "ELOA3B"         "ELOA3D"         "ELOB"          
##  [64] "ELOC"           "EPOP"           "ERVFC1-1"       "ERVFRD-2"       "ETFRF1"         "FAM108A3P"      "FAM21B"        
##  [71] "FAM23B"         "FAM92A"         "FDX1L"          "FLJ10246"       "FLJ12684"       "FLJ33534"       "FLJ34503"      
##  [78] "FLJ35816"       "FLJ36116"       "FLJ37396"       "FLJ40536"       "FLJ41327"       "FLJ41821"       "FLJ42220"      
##  [85] "FLJ42986"       "FLJ44076"       "FLJ44674"       "FLJ44874"       "FLJ44955"       "FLJ45337"       "FLJ45422"      
##  [92] "FLJ45684"       "FLJ46230"       "FLJ46347"       "FLJ46365"       "FOXD4L2"        "FRMPD2P2"       "GAGE12I"       
##  [99] "GAGE2B"         "GAGE2C"         "GAGE2D"         "GAGE4"          "GAGE8"          "GATSL1"         "GCNA"          
## [106] "GOLGA6L20"      "GPR89C"         "HDGFRP2"        "HDGFRP3"        "HDHD5"          "INTS11"         "INTS13"        
## [113] "INTS14"         "KIAA0754"       "KIAA0889"       "LINC00341"      "LINC00597"      "LINC00610"      "LINC00696"     
## [120] "LINC01599"      "LINC01879"      "LOC100129216"   "LOC100130705"   "LOC100131303"   "LOC100132146"   "LOC100133267"  
## [127] "LOC100505478"   "LOC100506127"   "LOC100652758"   "LOC100996634"   "LOC100996693"   "LOC101928917"   "LOC101929726"  
## [134] "LOC101929926"   "LOC102724428"   "LOC102724652"   "LOC107984640"   "LOC113386"      "LOC124216"      "LOC153684"     
## [141] "LOC158434"      "LOC200383"      "LOC200726"      "LOC283874"      "LOC284912"      "LOC285097"      "LOC285398"     
## [148] "LOC285696"      "LOC339010"      "LOC339047"      "LOC389199"      "LOC391343"      "LOC400002"      "LOC400576"     
## [155] "LOC400707"      "LOC400943"      "LOC401040"      "LOC401072"      "LOC401123"      "LOC401252"      "LOC401281"     
## [162] "LOC401610"      "LOC403312"      "LOC403323"      "LOC440157"      "LOC440742"      "LOC541473"      "LOC642980"     
## [169] "LOC651503"      "LOC729164"      "LOC730098"      "LOC730159"      "LOC81691"       "LOC90768"       "LOC91431"      
## [176] "LOC93622"       "MAIP1"          "MAP3K20"        "MAP3K21"        "MGC12965"       "MGC14276"       "MINDY1"        
## [183] "MINDY2"         "MINDY3"         "MINDY4"         "MPIG6B"         "MRC1L1"         "MRE11"          "MRNIP"         
## [190] "NBPF16"         "NCRNA00173"     "NCRNA00300"     "NDUFAF8"        "NOP53"          "NPIPB13"        "NSD2"          
## [197] "NSD3"           "PCLAF"          "PFDN6L"         "PIMREG"         "PJCG6"          "PLPBP"          "PLSCR3"        
## [204] "PNMA6C"         "PNMA6D"         "PRAG1"          "PRAMEF16"       "PRAMEF21"       "PRAMEF23"       "PRAMEF3"       
## [211] "PRKN"           "PTPN20A"        "RFLNA"          "RGSL2"          "RIOX1"          "RIOX2"          "RIPOR1"        
## [218] "RIPOR2"         "RIPOR3"         "RPL17P39"       "RPS17L"         "RUNDC2B"        "SCXA"           "SEC63D1"       
## [225] "SELENOF"        "SELENOH"        "SELENOI"        "SELENOK"        "SELENOM"        "SELENON"        "SELENOO"       
## [232] "SELENOP"        "SELENOS"        "SELENOT"        "SELENOV"        "SELENOW"        "SEM1"           "SFTPA2B"       
## [239] "SPATA31A2"      "SPATA46"        "SQOR"           "STN1"           "TEX44"          "THEG5"          "TIMM29"        
## [246] "TMEM137"        "TMEM269"        "TMPRSS11E2"     "TOGARAM1"       "TOGARAM2"       "TWNK"           "UFD1"          
## [253] "UNQ5810"        "VSIR"           "WASHC1"         "WASHC2A"        "WASHC2C"        "WASHC3"         "WASHC4"        
## [260] "WASHC5"         "WBSCR23"        "XAGE1C"         "XAGE1D"         "XAGE1E"         "XAGE2B"         "ZBTB8"         
## [267] "ZC3H18-AS1"     "ZNF663"

5 Session info

# Print the R version, platform, locale and package versions of this session.
sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 8 (jessie)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C              LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] coding.genes_0.1.0       BiocStyle_2.2.1          futile.logger_1.4.3      EnsDb.Hsapiens.v86_2.1.0
##  [5] ensembldb_1.6.2          GenomicFeatures_1.26.3   AnnotationDbi_1.36.2     Biobase_2.34.0          
##  [9] GenomicRanges_1.26.3     GenomeInfoDb_1.10.3      IRanges_2.8.1            S4Vectors_0.12.1        
## [13] BiocGenerics_0.20.0      devtools_1.12.0         
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.9                   lattice_0.20-34               Rsamtools_1.26.1              Biostrings_2.42.1            
##  [5] assertthat_0.1                rprojroot_1.2                 digest_0.6.12                 mime_0.5                     
##  [9] R6_2.2.0                      futile.options_1.0.0          backports_1.0.5               RSQLite_1.1-2                
## [13] evaluate_0.10                 httr_1.2.1                    BiocInstaller_1.24.0          zlibbioc_1.20.0              
## [17] Matrix_1.2-8                  rmarkdown_1.3                 desc_1.1.0                    BiocParallel_1.8.1           
## [21] AnnotationHub_2.6.4           stringr_1.2.0                 RCurl_1.95-4.8                biomaRt_2.30.0               
## [25] shiny_1.0.0                   httpuv_1.3.3                  rtracklayer_1.34.2            htmltools_0.3.5              
## [29] SummarizedExperiment_1.4.0    interactiveDisplayBase_1.12.0 roxygen2_6.0.1                XML_3.98-1.5                 
## [33] crayon_1.3.2                  withr_1.0.2                   GenomicAlignments_1.10.0      bitops_1.0-6                 
## [37] commonmark_1.2                grid_3.3.2                    xtable_1.8-2                  DBI_0.6                      
## [41] magrittr_1.5                  stringi_1.1.2                 XVector_0.14.0                xml2_1.1.1                   
## [45] lambda.r_1.1.9                tools_3.3.2                   yaml_2.1.14                   memoise_1.0.0                
## [49] knitr_1.15.1