Skip to content

RNA Benchmarking

Mike Lloyd edited this page Jul 27, 2023 · 3 revisions

RNA Benchmarking Documentation

What follows is text from an R based Jupyter notebook that generate simulated RNA data. The output from the pipeline is then compared against truth.

The Jupyter notebook is found here: /projects/omics_share/meta/scripts/benchmarking

Library Load

#BiocManager::install("Rsubread")
# https://rdrr.io/bioc/Rsubread/man/simReads.html
library(Rsubread)
library(dplyr)
library(ggplot2)
library(tidyr)
library(stringr)
Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Mouse

Files used:

wget http://ftp.ensembl.org/pub/release-102/fasta/mus_musculus/cds/Mus_musculus.GRCm38.cds.all.fa.gz

gunzip Mus_musculus.GRCm38.cds.all.fa.gz

reformat.sh in=Mus_musculus.GRCm38.cds.all.fa out=Mus_musculus.GRCm38.cds.all.LINEWRAP.fa fastawrap=80

BBMap is the source of reformat.sh. This script takes a single line fasta and turns it to 80 character line wrapped fasta. simReads can't use single line FASTA. An error is thrown.

# Scan through the fasta file to get transcript names and lengths
transcripts <- scanFasta('/projects/omics_share/mouse/GRCm38/supporting_files/benchmarking_data/RNA/mouse_cds/Mus_musculus.GRCm38.cds.all.LINEWRAP.fa')
nsequences <- nrow(transcripts) - sum(transcripts$Duplicate)
Warning: 17539 duplicate sequences were found in the input. Please check the summary table.
head(transcripts)

#transcripts %>% filter(Duplicated == TRUE)
A data.frame: 6 × 6
TranscriptID Length MD5 Unique Occurrence Duplicated
<chr> <int> <chr> <lgl> <int> <lgl>
1 ENSMUST00000177564.1 cds chromosome:GRCm38:14:54122226:54122241:1 gene:ENSMUSG00000096176.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:Trdd2 description:T cell receptor delta diversity 2 [Source:MGI Symbol;Acc:MGI:4439546] 16 587E802B733230F3DA2F88F6EB24F77E TRUE 1 FALSE
2 ENSMUST00000196221.1 cds chromosome:GRCm38:14:54113468:54113476:1 gene:ENSMUSG00000096749.2 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:Trdd1 description:T cell receptor delta diversity 1 [Source:MGI Symbol;Acc:MGI:4439547] 9 E694792CBB1DF84D2D6CCBE57D8F3C66 TRUE 1 FALSE
3 ENSMUST00000178537.1 cds chromosome:GRCm38:6:41533201:41533212:1 gene:ENSMUSG00000095668.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:Trbd1 description:T cell receptor beta, D region 1 [Source:MGI Symbol;Acc:MGI:4439571] 12 1F47B55923E2D23090F894C439974B55 TRUE 1 FALSE
4 ENSMUST00000178862.1 cds chromosome:GRCm38:6:41542163:41542176:1 gene:ENSMUSG00000094569.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:Trbd2 description:T cell receptor beta, D region 2 [Source:MGI Symbol;Acc:MGI:4439727] 14 3AC456B17D30D58ED16896AE1FB699C7 TRUE 1 FALSE
5 ENSMUST00000179520.1 cds chromosome:GRCm38:12:113430528:113430538:-1 gene:ENSMUSG00000094028.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd4-1 description:immunoglobulin heavy diversity 4-1 [Source:MGI Symbol;Acc:MGI:4439801] 11 1B514044F866F7820E6F5A474D288132 TRUE 1 FALSE
6 ENSMUST00000179883.1 cds chromosome:GRCm38:12:113448214:113448229:-1 gene:ENSMUSG00000094552.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd3-2 description:immunoglobulin heavy diversity 3-2 [Source:MGI Symbol;Acc:MGI:4439707] 16 5A0A74B83FAC2445F1585E0D3E36018E TRUE 1 FALSE
nsequences
nrow(transcripts)

58141

68381

# Assign a random TPM value to each non-duplicated transcript sequence
TPMs <- rep(0, nrow(transcripts))
TPMs[!transcripts$Duplicate] <- rexp(nsequences)

# those that are duplicated are given '0' TPM values. 
true.counts <- simReads(

    # the transcript database and the wanted abundances
    '/projects/omics_share/mouse/GRCm38/supporting_files/benchmarking_data/RNA/mouse_cds/Mus_musculus.GRCm38.cds.all.LINEWRAP.fa',
    TPMs,
  
    # the name of the output
    "/projects/omics_share/mouse/GRCm38/supporting_files/benchmarking_data/RNA/mouse_cds/Mus_musculus_simRNA_1mReads",
  
    # options on the output
    library.size = 1000000,
    read.length = 100,
    truth.in.read.names = FALSE,
  
    # simulating sequencing errors
    simulate.sequencing.error = TRUE,
    quality.reference = NULL,
  
    # parameters for generating paired-end reads.
    paired.end = TRUE,
    fragment.length.min = 125,
    fragment.length.max = 500,
    fragment.length.mean = 250,
    fragment.length.sd = 40,
  
    # manipulating transcript names
    simplify.transcript.names = FALSE)

write.table(x = true.counts, file = '/projects/omics_share/mouse/GRCm38/supporting_files/benchmarking_data/RNA/mouse_cds/Mus_musculus.GRCm38.1m_isoform_true_expression.txt', sep = '\t', quote = F, row.names = F)
true.counts <- simReads(

    # the transcript database and the wanted abundances
    '/projects/omics_share/mouse/GRCm38/supporting_files/benchmarking_data/RNA/mouse_cds/Mus_musculus.GRCm38.cds.all.LINEWRAP.fa',
    TPMs,
  
    # the name of the output
    "/projects/omics_share/mouse/GRCm38/supporting_files/benchmarking_data/RNA/mouse_cds/Mus_musculus_simRNA_30mReads",
  
    # options on the output
    library.size = 30000000,
    read.length = 100,
    truth.in.read.names = FALSE,
  
    # simulating sequencing errors
    simulate.sequencing.error = TRUE,
    quality.reference = NULL,
  
    # parameters for generating paired-end reads.
    paired.end = TRUE,
    fragment.length.min = 125,
    fragment.length.max = 500,
    fragment.length.mean = 250,
    fragment.length.sd = 40,
  
    # manipulating transcript names
    simplify.transcript.names = FALSE)

write.table(x = true.counts, file = '/projects/omics_share/mouse/GRCm38/supporting_files/benchmarking_data/RNA/mouse_cds/Mus_musculus.GRCm38.30m_isoform_true_expression.txt', sep = '\t', quote = F, row.names = F)

Human

Files used:

wget http://ftp.ensembl.org/pub/release-104/fasta/homo_sapiens/cds/Homo_sapiens.GRCh38.cds.all.fa.gz

gunzip Homo_sapiens.GRCh38.cds.all.fa.gz

reformat.sh in=Homo_sapiens.GRCh38.cds.all.fa out=Homo_sapiens.GRCh38.cds.all.LINEWRAP.fa fastawrap=80

BBMap is the source of reformat.sh. This script takes a single line fasta and turns it to 80 character line wrapped fasta. simReads can't use single line FASTA. An error is thrown.

# Scan through the fasta file to get transcript names and lengths
transcripts <- scanFasta('/projects/omics_share/human/GRCh38/supporting_files/benchmarking_data/RNA/human_cds/Homo_sapiens.GRCh38.cds.all.LINEWRAP.fa')
nsequences <- nrow(transcripts) - sum(transcripts$Duplicate)
Warning: 36435 duplicate sequences were found in the input. Please check the summary table.
head(transcripts)
A data.frame: 6 × 6
TranscriptID Length MD5 Unique Occurrence Duplicated
<chr> <int> <chr> <lgl> <int> <lgl>
1 ENST00000631435.1 cds chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142847306:142847317:1 gene:ENSG00000282253.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRBD1 description:T cell receptor beta diversity 1 [Source:HGNC Symbol;Acc:HGNC:12158] 12 1F47B55923E2D23090F894C439974B55 FALSE 2 FALSE
2 ENST00000415118.1 cds chromosome:GRCh38:14:22438547:22438554:1 gene:ENSG00000223997.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRDD1 description:T cell receptor delta diversity 1 [Source:HGNC Symbol;Acc:HGNC:12254] 8 940CA0BE6717D9330B746F88BBF1FC93 TRUE 1 FALSE
3 ENST00000448914.1 cds chromosome:GRCh38:14:22449113:22449125:1 gene:ENSG00000228985.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRDD3 description:T cell receptor delta diversity 3 [Source:HGNC Symbol;Acc:HGNC:12256] 13 8DFFD69B083D7B2ACC3879B5936B6D16 TRUE 1 FALSE
4 ENST00000434970.2 cds chromosome:GRCh38:14:22439007:22439015:1 gene:ENSG00000237235.2 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRDD2 description:T cell receptor delta diversity 2 [Source:HGNC Symbol;Acc:HGNC:12255] 9 00786AB95D4186F71651FE88A9AFD016 TRUE 1 FALSE
5 ENST00000632684.1 cds chromosome:GRCh38:7:142786213:142786224:1 gene:ENSG00000282431.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRBD1 description:T cell receptor beta diversity 1 [Source:HGNC Symbol;Acc:HGNC:12158] 12 1F47B55923E2D23090F894C439974B55 FALSE 2 TRUE
6 ENST00000633010.1 cds chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105895279:105895294:-1 gene:ENSG00000282274.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-17 description:immunoglobulin heavy diversity 4-17 [Source:HGNC Symbol;Acc:HGNC:5503] 16 71E47CB9367D2425A5C45BEB905C00FB FALSE 2 FALSE
nsequences
nrow(transcripts)

93450

116646

# Assign a random TPM value to each non-duplicated transcript sequence
TPMs <- rep(0, nrow(transcripts))
TPMs[!transcripts$Duplicate] <- rexp(nsequences)

# those that are duplicated are given '0' TPM values. 
true.counts <- simReads(

    # the transcript database and the wanted abundances
    '/projects/omics_share/human/GRCh38/supporting_files/benchmarking_data/RNA/human_cds/Homo_sapiens.GRCh38.cds.all.LINEWRAP.fa',
    TPMs,
  
    # the name of the output
    "/projects/omics_share/human/GRCh38/supporting_files/benchmarking_data/RNA/human_cds/Homo_sapiens_simRNA_1mReads",
  
    # options on the output
    library.size = 1000000,
    read.length = 100,
    truth.in.read.names = FALSE,
  
    # simulating sequencing errors
    simulate.sequencing.error = TRUE,
    quality.reference = NULL,
  
    # parameters for generating paired-end reads.
    paired.end = TRUE,
    fragment.length.min = 125,
    fragment.length.max = 500,
    fragment.length.mean = 250,
    fragment.length.sd = 40,
  
    # manipulating transcript names
    simplify.transcript.names = FALSE)

write.table(x = true.counts, file = '/projects/omics_share/human/GRCh38/supporting_files/benchmarking_data/RNA/human_cds/Homo_sapiens.GRCh38.1m_isoform_true_expression.txt', sep = '\t', quote = F, row.names = F)
true.counts <- simReads(

    # the transcript database and the wanted abundances
    '/projects/omics_share/human/GRCh38/supporting_files/benchmarking_data/RNA/human_cds/Homo_sapiens.GRCh38.cds.all.LINEWRAP.fa',
    TPMs,
  
    # the name of the output
    "/projects/omics_share/human/GRCh38/supporting_files/benchmarking_data/RNA/human_cds/Homo_sapiens_simRNA_30mReads",
  
    # options on the output
    library.size = 30000000,
    read.length = 100,
    truth.in.read.names = FALSE,
  
    # simulating sequencing errors
    simulate.sequencing.error = TRUE,
    quality.reference = NULL,
  
    # parameters for generating paired-end reads.
    paired.end = TRUE,
    fragment.length.min = 125,
    fragment.length.max = 500,
    fragment.length.mean = 250,
    fragment.length.sd = 40,
  
    # manipulating transcript names
    simplify.transcript.names = FALSE)

write.table(x = true.counts, file = '/projects/omics_share/human/GRCh38/supporting_files/benchmarking_data/RNA/human_cds/Homo_sapiens.GRCh38.30m_isoform_true_expression.txt', sep = '\t', quote = F, row.names = F)

Compare to Truth

Mouse

mouse_simulated_truth_1m <- read.delim('/projects/omics_share/mouse/GRCm38/supporting_files/benchmarking_data/RNA/mouse_cds/Mus_musculus.GRCm38.1m_isoform_true_expression.txt', sep="\t", header=T)
mouse_simulated_test_1m <- read.delim('/projects/omics_share/mouse/GRCm38/supporting_files/benchmarking_data/RESULTS/RNA_mouse_nxf_bench/Mus_musculus_simRNA_1mReads/Mus_musculus_simRNA_1mReads.isoforms.results', sep="\t", header=T)

mouse_simulated_truth_30m <- read.delim('/projects/omics_share/mouse/GRCm38/supporting_files/benchmarking_data/RNA/mouse_cds/Mus_musculus.GRCm38.30m_isoform_true_expression.txt', sep="\t", header=T)
mouse_simulated_test_30m <- read.delim('/projects/omics_share/mouse/GRCm38/supporting_files/benchmarking_data/RESULTS/RNA_mouse_nxf_bench/Mus_musculus_simRNA_30mReads/Mus_musculus_simRNA_30mReads.isoforms.results', sep="\t", header=T)
head(mouse_simulated_truth_1m)
head(mouse_simulated_test_1m)
A data.frame: 6 × 3
TranscriptID Length NReads
<fct> <int> <int>
1 ENSMUST00000177564.1 cds chromosome:GRCm38:14:54122226:54122241:1 gene:ENSMUSG00000096176.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:Trdd2 description:T cell receptor delta diversity 2 [Source:MGI Symbol;Acc:MGI:4439546] 16 0
2 ENSMUST00000196221.1 cds chromosome:GRCm38:14:54113468:54113476:1 gene:ENSMUSG00000096749.2 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:Trdd1 description:T cell receptor delta diversity 1 [Source:MGI Symbol;Acc:MGI:4439547] 9 0
3 ENSMUST00000178537.1 cds chromosome:GRCm38:6:41533201:41533212:1 gene:ENSMUSG00000095668.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:Trbd1 description:T cell receptor beta, D region 1 [Source:MGI Symbol;Acc:MGI:4439571] 12 0
4 ENSMUST00000178862.1 cds chromosome:GRCm38:6:41542163:41542176:1 gene:ENSMUSG00000094569.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:Trbd2 description:T cell receptor beta, D region 2 [Source:MGI Symbol;Acc:MGI:4439727] 14 0
5 ENSMUST00000179520.1 cds chromosome:GRCm38:12:113430528:113430538:-1 gene:ENSMUSG00000094028.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd4-1 description:immunoglobulin heavy diversity 4-1 [Source:MGI Symbol;Acc:MGI:4439801] 11 0
6 ENSMUST00000179883.1 cds chromosome:GRCm38:12:113448214:113448229:-1 gene:ENSMUSG00000094552.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd3-2 description:immunoglobulin heavy diversity 3-2 [Source:MGI Symbol;Acc:MGI:4439707] 16 0
A data.frame: 6 × 8
transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct
<fct> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ENSMUST00000000001_Gnai3-201 ENSMUSG00000000001_Gnai3 3262 3013.28 0 0.00 0.00 0
2 ENSMUST00000000003_Pbsn-201 ENSMUSG00000000003_Pbsn 902 653.28 1 6.18 4.43 100
3 ENSMUST00000114041_Pbsn-202 ENSMUSG00000000003_Pbsn 697 448.30 0 0.00 0.00 0
4 ENSMUST00000000028_Cdc45-201 ENSMUSG00000000028_Cdc45 2143 1894.28 0 0.00 0.00 0
5 ENSMUST00000096990_Cdc45-202 ENSMUSG00000000028_Cdc45 1747 1498.28 7 18.87 13.51 100
6 ENSMUST00000115585_Cdc45-203 ENSMUSG00000000028_Cdc45 832 583.29 0 0.00 0.00 0
mouse_simulated_truth_1m_clean <- mouse_simulated_truth_1m %>% 
      separate( TranscriptID, into=c("transcript_id", "CDS", 'location', 'gene_id', 'gene_biotype', 'transcript_biotype', 'gene_symbol'), sep= " ", extra = 'drop') %>%
      mutate(location = str_remove(location, "chromosome:GRCm38:"), 
             gene_id = str_remove(gene_id, 'gene:'), 
             gene_biotype = str_remove(gene_biotype, 'gene_biotype:'), 
             transcript_biotype = str_remove(transcript_biotype, 'transcript_biotype:'), 
             gene_symbol = str_remove(gene_symbol, 'gene_symbol:'),
             transcript_id = sub('\\.[0-9]*$', '', transcript_id),
             gene_id = sub('\\.[0-9]*$', '', gene_id))
mouse_simulated_truth_30m_clean <- mouse_simulated_truth_30m %>% 
      separate( TranscriptID, into=c("transcript_id", "CDS", 'location', 'gene_id', 'gene_biotype', 'transcript_biotype', 'gene_symbol'), sep= " ", extra = 'drop') %>%
      mutate(location = str_remove(location, "chromosome:GRCm38:"), 
             gene_id = str_remove(gene_id, 'gene:'), 
             gene_biotype = str_remove(gene_biotype, 'gene_biotype:'), 
             transcript_biotype = str_remove(transcript_biotype, 'transcript_biotype:'), 
             gene_symbol = str_remove(gene_symbol, 'gene_symbol:'),
             transcript_id = sub('\\.[0-9]*$', '', transcript_id),
             gene_id = sub('\\.[0-9]*$', '', gene_id))
#This was used for ensembl CDS file. That file included additional transcripts relative to what we run against in the pipeline. 
library(stringi)

mouse_simulated_test_1m <- cbind(mouse_simulated_test_1m, `colnames<-`(stri_split_fixed(mouse_simulated_test_1m$transcript_id, '_', n = 2, simplify = TRUE), c("TranscriptID", "gene_symbol")))
mouse_simulated_test_30m <- cbind(mouse_simulated_test_30m, `colnames<-`(stri_split_fixed(mouse_simulated_test_30m$transcript_id, '_', n = 2, simplify = TRUE), c("TranscriptID", "gene_symbol")))
head(mouse_simulated_test_1m)
A data.frame: 6 × 10
transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct TranscriptID gene_symbol
<fct> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct>
1 ENSMUST00000000001_Gnai3-201 ENSMUSG00000000001_Gnai3 3262 3013.28 0 0.00 0.00 0 ENSMUST00000000001 Gnai3-201
2 ENSMUST00000000003_Pbsn-201 ENSMUSG00000000003_Pbsn 902 653.28 1 6.18 4.43 100 ENSMUST00000000003 Pbsn-201
3 ENSMUST00000114041_Pbsn-202 ENSMUSG00000000003_Pbsn 697 448.30 0 0.00 0.00 0 ENSMUST00000114041 Pbsn-202
4 ENSMUST00000000028_Cdc45-201 ENSMUSG00000000028_Cdc45 2143 1894.28 0 0.00 0.00 0 ENSMUST00000000028 Cdc45-201
5 ENSMUST00000096990_Cdc45-202 ENSMUSG00000000028_Cdc45 1747 1498.28 7 18.87 13.51 100 ENSMUST00000096990 Cdc45-202
6 ENSMUST00000115585_Cdc45-203 ENSMUSG00000000028_Cdc45 832 583.29 0 0.00 0.00 0 ENSMUST00000115585 Cdc45-203
mouse_1m_joined_data <- dplyr::left_join(mouse_simulated_truth_1m_clean, mouse_simulated_test_1m, by = c('transcript_id' = 'TranscriptID'))
mouse_30m_joined_data <- dplyr::left_join(mouse_simulated_truth_30m_clean, mouse_simulated_test_30m, by = c('transcript_id' = 'TranscriptID'))
head(mouse_1m_joined_data)
A data.frame: 6 × 18
transcript_id CDS location gene_id.x gene_biotype transcript_biotype gene_symbol.x Length NReads transcript_id.y gene_id.y length effective_length expected_count TPM FPKM IsoPct gene_symbol.y
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <int> <int> <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
1 ENSMUST00000177564 cds 14:54122226:54122241:1 ENSMUSG00000096176 TR_D_gene TR_D_gene Trdd2 16 0 ENSMUST00000177564_Trdd2-201 ENSMUSG00000096176_Trdd2 16 0 0 0 0 0 Trdd2-201
2 ENSMUST00000196221 cds 14:54113468:54113476:1 ENSMUSG00000096749 TR_D_gene TR_D_gene Trdd1 9 0 ENSMUST00000196221_Trdd1-202 ENSMUSG00000096749_Trdd1 9 0 0 0 0 0 Trdd1-202
3 ENSMUST00000178537 cds 6:41533201:41533212:1 ENSMUSG00000095668 TR_D_gene TR_D_gene Trbd1 12 0 ENSMUST00000178537_Trbd1-201 ENSMUSG00000095668_Trbd1 12 0 0 0 0 0 Trbd1-201
4 ENSMUST00000178862 cds 6:41542163:41542176:1 ENSMUSG00000094569 TR_D_gene TR_D_gene Trbd2 14 0 ENSMUST00000178862_Trbd2-201 ENSMUSG00000094569_Trbd2 14 0 0 0 0 0 Trbd2-201
5 ENSMUST00000179520 cds 12:113430528:113430538:-1 ENSMUSG00000094028 IG_D_gene IG_D_gene Ighd4-1 11 0 ENSMUST00000179520_Ighd4-1-201 ENSMUSG00000094028_Ighd4-1 11 0 0 0 0 0 Ighd4-1-201
6 ENSMUST00000179883 cds 12:113448214:113448229:-1 ENSMUSG00000094552 IG_D_gene IG_D_gene Ighd3-2 16 0 ENSMUST00000179883_Ighd3-2-201 ENSMUSG00000094552_Ighd3-2 16 0 0 0 0 0 Ighd3-2-201
head(mouse_1m_joined_data)
A data.frame: 6 × 18
transcript_id CDS location gene_id.x gene_biotype transcript_biotype gene_symbol.x Length NReads transcript_id.y gene_id.y length effective_length expected_count TPM FPKM IsoPct gene_symbol.y
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <int> <int> <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
1 ENSMUST00000177564 cds 14:54122226:54122241:1 ENSMUSG00000096176 TR_D_gene TR_D_gene Trdd2 16 0 ENSMUST00000177564_Trdd2-201 ENSMUSG00000096176_Trdd2 16 0 0 0 0 0 Trdd2-201
2 ENSMUST00000196221 cds 14:54113468:54113476:1 ENSMUSG00000096749 TR_D_gene TR_D_gene Trdd1 9 0 ENSMUST00000196221_Trdd1-202 ENSMUSG00000096749_Trdd1 9 0 0 0 0 0 Trdd1-202
3 ENSMUST00000178537 cds 6:41533201:41533212:1 ENSMUSG00000095668 TR_D_gene TR_D_gene Trbd1 12 0 ENSMUST00000178537_Trbd1-201 ENSMUSG00000095668_Trbd1 12 0 0 0 0 0 Trbd1-201
4 ENSMUST00000178862 cds 6:41542163:41542176:1 ENSMUSG00000094569 TR_D_gene TR_D_gene Trbd2 14 0 ENSMUST00000178862_Trbd2-201 ENSMUSG00000094569_Trbd2 14 0 0 0 0 0 Trbd2-201
5 ENSMUST00000179520 cds 12:113430528:113430538:-1 ENSMUSG00000094028 IG_D_gene IG_D_gene Ighd4-1 11 0 ENSMUST00000179520_Ighd4-1-201 ENSMUSG00000094028_Ighd4-1 11 0 0 0 0 0 Ighd4-1-201
6 ENSMUST00000179883 cds 12:113448214:113448229:-1 ENSMUSG00000094552 IG_D_gene IG_D_gene Ighd3-2 16 0 ENSMUST00000179883_Ighd3-2-201 ENSMUSG00000094552_Ighd3-2 16 0 0 0 0 0 Ighd3-2-201
plot(mouse_1m_joined_data$NReads, mouse_1m_joined_data$expected_count, xlab = 'True Read Counts', ylab = 'RSEM Expected Count')

plot(mouse_30m_joined_data$NReads, mouse_30m_joined_data$expected_count, xlab = 'True Read Counts', ylab = 'RSEM Expected Count')

mouse_1m_joined_data %>% filter(is.na(effective_length))
mouse_30m_joined_data %>% filter(is.na(effective_length))

# checking the join. 
A data.frame: 0 × 18
transcript_id CDS location gene_id.x gene_biotype transcript_biotype gene_symbol.x Length NReads transcript_id.y gene_id.y length effective_length expected_count TPM FPKM IsoPct gene_symbol.y
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <int> <int> <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
A data.frame: 0 × 18
transcript_id CDS location gene_id.x gene_biotype transcript_biotype gene_symbol.x Length NReads transcript_id.y gene_id.y length effective_length expected_count TPM FPKM IsoPct gene_symbol.y
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <int> <int> <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>

I have an Ensembl ID, what can I tell about it from the ID?

An Ensembl stable ID consists of five parts: ENS(species)(object type)(identifier).(version).

The first part, 'ENS', tells you that it's an Ensembl ID
The second part is a three-letter species code. For human, there is no species code so IDs are in the form ENS(object type)(identifier).(version). A list of the other species codes can be found here.
The third part is a one- or two-letter object type. For example E for exon, FM for protein family, G for gene, GT for gene tree, P for protein, R for regulatory feature and T for transcript.
The identifier is the number to that object. Combinations of prefixes and identifiers are unique.
Versions indicate how many times that model has changed during its time in Ensembl. This document explains how we determine that a model has changed sufficiently to update version number. History pages for features show you when these changes took place.

Using this information we can make assertions about an Ensembl ID. For example ENSMUSG00000017167.6. From this we can see that it's an Ensembl ID (ENS), from mouse (MUS), it's a gene (G) and it's on its sixth version (.6).

Human

human_simulated_truth_1m <- read.delim('/projects/omics_share/human/GRCh38/supporting_files/benchmarking_data/RNA/human_cds/Homo_sapiens.GRCh38.1m_isoform_true_expression.txt', sep="\t", header=T)
human_simulated_test_1m <- read.delim('/projects/omics_share/human/GRCh38/supporting_files/benchmarking_data/RESULTS/RNA_human_nxf_bench/Homo_sapiens_simRNA_1mReads/Homo_sapiens_simRNA_1mReads.isoforms.results', sep="\t", header=T)

human_simulated_truth_30m <- read.delim('/projects/omics_share/human/GRCh38/supporting_files/benchmarking_data/RNA/human_cds/Homo_sapiens.GRCh38.30m_isoform_true_expression.txt', sep="\t", header=T)
human_simulated_test_30m <- read.delim('/projects/omics_share/human/GRCh38/supporting_files/benchmarking_data/RESULTS/RNA_human_nxf_bench/Homo_sapiens_simRNA_30mReads/Homo_sapiens_simRNA_30mReads.isoforms.results', sep="\t", header=T)
head(human_simulated_test_1m)
head(human_simulated_truth_1m)
A data.frame: 6 × 8
transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct
<fct> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ENST00000373020_TSPAN6-201 ENSG00000000003_TSPAN6 3768 3519.41 1.23 1.33 1.01 2.85
2 ENST00000494424_TSPAN6-202 ENSG00000000003_TSPAN6 820 571.41 0.00 0.00 0.00 0.00
3 ENST00000496771_TSPAN6-203 ENSG00000000003_TSPAN6 1025 776.41 0.00 0.00 0.00 0.00
4 ENST00000612152_TSPAN6-204 ENSG00000000003_TSPAN6 3796 3547.41 0.00 0.00 0.00 0.00
5 ENST00000614008_TSPAN6-205 ENSG00000000003_TSPAN6 900 651.41 7.77 45.42 34.38 97.15
6 ENST00000373031_TNMD-201 ENSG00000000005_TNMD 1205 956.41 0.00 0.00 0.00 0.00
A data.frame: 6 × 3
TranscriptID Length NReads
<fct> <int> <int>
1 ENST00000631435.1 cds chromosome:GRCh38:CHR_HSCHR7_2_CTG6:142847306:142847317:1 gene:ENSG00000282253.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRBD1 description:T cell receptor beta diversity 1 [Source:HGNC Symbol;Acc:HGNC:12158] 12 0
2 ENST00000415118.1 cds chromosome:GRCh38:14:22438547:22438554:1 gene:ENSG00000223997.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRDD1 description:T cell receptor delta diversity 1 [Source:HGNC Symbol;Acc:HGNC:12254] 8 0
3 ENST00000448914.1 cds chromosome:GRCh38:14:22449113:22449125:1 gene:ENSG00000228985.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRDD3 description:T cell receptor delta diversity 3 [Source:HGNC Symbol;Acc:HGNC:12256] 13 0
4 ENST00000434970.2 cds chromosome:GRCh38:14:22439007:22439015:1 gene:ENSG00000237235.2 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRDD2 description:T cell receptor delta diversity 2 [Source:HGNC Symbol;Acc:HGNC:12255] 9 0
5 ENST00000632684.1 cds chromosome:GRCh38:7:142786213:142786224:1 gene:ENSG00000282431.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRBD1 description:T cell receptor beta diversity 1 [Source:HGNC Symbol;Acc:HGNC:12158] 12 0
6 ENST00000633010.1 cds chromosome:GRCh38:CHR_HSCHR14_3_CTG1:105895279:105895294:-1 gene:ENSG00000282274.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:IGHD4-17 description:immunoglobulin heavy diversity 4-17 [Source:HGNC Symbol;Acc:HGNC:5503] 16 0
human_simulated_truth_1m_clean <- human_simulated_truth_1m %>% 
      separate( TranscriptID, into=c("transcript_id", "CDS", 'location', 'gene_id', 'gene_biotype', 'transcript_biotype', 'gene_symbol'), sep= " ", extra = 'drop') %>%
      mutate(location = str_remove(location, "chromosome:GRCh38:"), 
             gene_id = str_remove(gene_id, 'gene:'), 
             gene_biotype = str_remove(gene_biotype, 'gene_biotype:'), 
             transcript_biotype = str_remove(transcript_biotype, 'transcript_biotype:'), 
             gene_symbol = str_remove(gene_symbol, 'gene_symbol:'),
             transcript_id = sub('\\.[0-9]*$', '', transcript_id),
             gene_id = sub('\\.[0-9]*$', '', gene_id))
#This was used for ensembl CDS file. That file included additional transcripts relative to what we run against in the pipeline. 
Warning message:
“Expected 7 pieces. Missing pieces filled with `NA` in 387 rows [776, 777, 778, 779, 780, 961, 1182, 1183, 1205, 1206, 1248, 1256, 1364, 1365, 1444, 1445, 1523, 1524, 1630, 1720, ...].”
human_simulated_truth_30m_clean <- human_simulated_truth_30m %>% 
      separate( TranscriptID, into=c("transcript_id", "CDS", 'location', 'gene_id', 'gene_biotype', 'transcript_biotype', 'gene_symbol'), sep= " ", extra = 'drop') %>%
      mutate(location = str_remove(location, "chromosome:GRCh38:"), 
             gene_id = str_remove(gene_id, 'gene:'), 
             gene_biotype = str_remove(gene_biotype, 'gene_biotype:'), 
             transcript_biotype = str_remove(transcript_biotype, 'transcript_biotype:'), 
             gene_symbol = str_remove(gene_symbol, 'gene_symbol:'),
             transcript_id = sub('\\.[0-9]*$', '', transcript_id),
             gene_id = sub('\\.[0-9]*$', '', gene_id))
#This was used for ensembl CDS file. That file included additional transcripts relative to what we run against in the pipeline. 
Warning message:
“Expected 7 pieces. Missing pieces filled with `NA` in 387 rows [776, 777, 778, 779, 780, 961, 1182, 1183, 1205, 1206, 1248, 1256, 1364, 1365, 1444, 1445, 1523, 1524, 1630, 1720, ...].”
head(human_simulated_truth_1m_clean)
A data.frame: 6 × 9
transcript_id CDS location gene_id gene_biotype transcript_biotype gene_symbol Length NReads
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <int> <int>
1 ENST00000631435 cds CHR_HSCHR7_2_CTG6:142847306:142847317:1 ENSG00000282253 TR_D_gene TR_D_gene TRBD1 12 0
2 ENST00000415118 cds 14:22438547:22438554:1 ENSG00000223997 TR_D_gene TR_D_gene TRDD1 8 0
3 ENST00000448914 cds 14:22449113:22449125:1 ENSG00000228985 TR_D_gene TR_D_gene TRDD3 13 0
4 ENST00000434970 cds 14:22439007:22439015:1 ENSG00000237235 TR_D_gene TR_D_gene TRDD2 9 0
5 ENST00000632684 cds 7:142786213:142786224:1 ENSG00000282431 TR_D_gene TR_D_gene TRBD1 12 0
6 ENST00000633010 cds CHR_HSCHR14_3_CTG1:105895279:105895294:-1 ENSG00000282274 IG_D_gene IG_D_gene IGHD4-17 16 0
library(stringi)

human_simulated_test_1m <- cbind(human_simulated_test_1m, `colnames<-`(stri_split_fixed(human_simulated_test_1m$transcript_id, '_', n = 2, simplify = TRUE), c("TranscriptID", "gene_symbol")))
human_simulated_test_30m <- cbind(human_simulated_test_30m, `colnames<-`(stri_split_fixed(human_simulated_test_30m$transcript_id, '_', n = 2, simplify = TRUE), c("TranscriptID", "gene_symbol")))
human_1m_joined_data <- dplyr::left_join(human_simulated_truth_1m_clean, human_simulated_test_1m, by = c('transcript_id' = 'TranscriptID'))
human_30m_joined_data <- dplyr::left_join(human_simulated_truth_30m_clean, human_simulated_test_30m, by = c('transcript_id' = 'TranscriptID'))
head(human_1m_joined_data)
A data.frame: 6 × 18
transcript_id CDS location gene_id.x gene_biotype transcript_biotype gene_symbol.x Length NReads transcript_id.y gene_id.y length effective_length expected_count TPM FPKM IsoPct gene_symbol.y
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <int> <int> <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
1 ENST00000631435 cds CHR_HSCHR7_2_CTG6:142847306:142847317:1 ENSG00000282253 TR_D_gene TR_D_gene TRBD1 12 0 ENST00000631435_TRBD1-201 ENSG00000282253_TRBD1 12 0 0 0 0 0 TRBD1-201
2 ENST00000415118 cds 14:22438547:22438554:1 ENSG00000223997 TR_D_gene TR_D_gene TRDD1 8 0 ENST00000415118_TRDD1-201 ENSG00000223997_TRDD1 8 0 0 0 0 0 TRDD1-201
3 ENST00000448914 cds 14:22449113:22449125:1 ENSG00000228985 TR_D_gene TR_D_gene TRDD3 13 0 ENST00000448914_TRDD3-201 ENSG00000228985_TRDD3 13 0 0 0 0 0 TRDD3-201
4 ENST00000434970 cds 14:22439007:22439015:1 ENSG00000237235 TR_D_gene TR_D_gene TRDD2 9 0 ENST00000434970_TRDD2-201 ENSG00000237235_TRDD2 9 0 0 0 0 0 TRDD2-201
5 ENST00000632684 cds 7:142786213:142786224:1 ENSG00000282431 TR_D_gene TR_D_gene TRBD1 12 0 ENST00000632684_TRBD1-202 ENSG00000282431_TRBD1 12 0 0 0 0 0 TRBD1-202
6 ENST00000633010 cds CHR_HSCHR14_3_CTG1:105895279:105895294:-1 ENSG00000282274 IG_D_gene IG_D_gene IGHD4-17 16 0 ENST00000633010_IGHD4-17-202 ENSG00000282274_IGHD4-17 16 0 0 0 0 0 IGHD4-17-202
plot(human_1m_joined_data$NReads, human_1m_joined_data$expected_count, xlab = 'True Read Counts', ylab = 'RSEM Expected Count')

plot(human_30m_joined_data$NReads, human_30m_joined_data$expected_count, xlab = 'True Read Counts', ylab = 'RSEM Expected Count')

human_1m_joined_data %>% filter(is.na(effective_length))
human_30m_joined_data %>% filter(is.na(effective_length))
# checking the join. 
A data.frame: 0 × 18
transcript_id CDS location gene_id.x gene_biotype transcript_biotype gene_symbol.x Length NReads transcript_id.y gene_id.y length effective_length expected_count TPM FPKM IsoPct gene_symbol.y
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <int> <int> <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
A data.frame: 0 × 18
transcript_id CDS location gene_id.x gene_biotype transcript_biotype gene_symbol.x Length NReads transcript_id.y gene_id.y length effective_length expected_count TPM FPKM IsoPct gene_symbol.y
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <int> <int> <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
Clone this wiki locally