Skip to contents

Given a set of exons (encompassing the CDS and UTRs) and cds regions, add_utr() will calculate and add the corresponding UTR regions as ranges. This can be useful when combined with shorten_gaps() to visualize transcripts with long introns, whilst differentiating UTRs from CDS regions.

Usage

add_utr(exons, cds, group_var = NULL)

Arguments

exons

data.frame() contains exons which can originate from multiple transcripts differentiated by group_var.

cds

data.frame() contains coding sequence ranges for the transcripts in exons.

group_var

character() if input data originates from more than 1 transcript, group_var must specify the column that differentiates transcripts (e.g. "transcript_id").

Value

data.frame() contains differentiated CDS and UTR ranges.

Details

The definition of the inputted cds regions are expected to range from the beginning of the start codon to the end of the stop codon. Sometimes, for example in the case of Ensembl, reference annotation will omit the stop codons from the CDS definition. In such cases, users should manually ensure that the cds includes both the start and stop codons.

Examples


library(magrittr)
library(ggplot2)

# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
pknox1_annotation %>% head()
#> # A tibble: 6 × 8
#>   seqnames  start    end strand type  gene_name transcript_name transcript_biot…
#>   <fct>     <int>  <int> <fct>  <fct> <chr>     <chr>           <chr>           
#> 1 21       4.30e7 4.30e7 +      gene  PKNOX1    NA              NA              
#> 2 21       4.30e7 4.30e7 +      tran… PKNOX1    PKNOX1-203      protein_coding  
#> 3 21       4.30e7 4.30e7 +      exon  PKNOX1    PKNOX1-203      protein_coding  
#> 4 21       4.30e7 4.30e7 +      exon  PKNOX1    PKNOX1-203      protein_coding  
#> 5 21       4.30e7 4.30e7 +      exon  PKNOX1    PKNOX1-203      protein_coding  
#> 6 21       4.30e7 4.30e7 +      exon  PKNOX1    PKNOX1-203      protein_coding  

# extract exons
pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon")
pknox1_exons %>% head()
#> # A tibble: 6 × 8
#>   seqnames  start    end strand type  gene_name transcript_name transcript_biot…
#>   <fct>     <int>  <int> <fct>  <fct> <chr>     <chr>           <chr>           
#> 1 21       4.30e7 4.30e7 +      exon  PKNOX1    PKNOX1-203      protein_coding  
#> 2 21       4.30e7 4.30e7 +      exon  PKNOX1    PKNOX1-203      protein_coding  
#> 3 21       4.30e7 4.30e7 +      exon  PKNOX1    PKNOX1-203      protein_coding  
#> 4 21       4.30e7 4.30e7 +      exon  PKNOX1    PKNOX1-203      protein_coding  
#> 5 21       4.30e7 4.30e7 +      exon  PKNOX1    PKNOX1-203      protein_coding  
#> 6 21       4.30e7 4.30e7 +      exon  PKNOX1    PKNOX1-203      protein_coding  

# extract cds
pknox1_cds <- pknox1_annotation %>% dplyr::filter(type == "CDS")
pknox1_cds %>% head()
#> # A tibble: 6 × 8
#>   seqnames  start    end strand type  gene_name transcript_name transcript_biot…
#>   <fct>     <int>  <int> <fct>  <fct> <chr>     <chr>           <chr>           
#> 1 21       4.30e7 4.30e7 +      CDS   PKNOX1    PKNOX1-203      protein_coding  
#> 2 21       4.30e7 4.30e7 +      CDS   PKNOX1    PKNOX1-203      protein_coding  
#> 3 21       4.30e7 4.30e7 +      CDS   PKNOX1    PKNOX1-203      protein_coding  
#> 4 21       4.30e7 4.30e7 +      CDS   PKNOX1    PKNOX1-203      protein_coding  
#> 5 21       4.30e7 4.30e7 +      CDS   PKNOX1    PKNOX1-203      protein_coding  
#> 6 21       4.30e7 4.30e7 +      CDS   PKNOX1    PKNOX1-203      protein_coding  

# the CDS definition originating from the Ensembl reference annotation
# does not include the stop codon
# we must incorporate the stop codons into the CDS manually
# by adding 3 base pairs to the end of the CDS of each transcript
pknox1_cds_w_stop <- pknox1_cds %>%
    dplyr::group_by(transcript_name) %>%
    dplyr::mutate(
        end = ifelse(end == max(end), end + 3, end)
    ) %>%
    dplyr::ungroup()

# add_utr() adds ranges that represent the UTRs
pknox1_cds_utr <- add_utr(
    pknox1_exons,
    pknox1_cds_w_stop,
    group_var = "transcript_name"
)

pknox1_cds_utr %>% head()
#> # A tibble: 6 × 9
#>   seqnames    start      end width strand transcript_name type  gene_name
#>   <fct>       <int>    <int> <int> <fct>  <chr>           <chr> <chr>    
#> 1 21       42974510 42974664   155 +      PKNOX1-203      UTR   NA       
#> 2 21       43004326 43004432   107 +      PKNOX1-203      UTR   NA       
#> 3 21       43007491 43007618   128 +      PKNOX1-203      UTR   NA       
#> 4 21       43013068 43013238   171 +      PKNOX1-203      CDS   PKNOX1   
#> 5 21       43016908 43017007   100 +      PKNOX1-203      CDS   PKNOX1   
#> 6 21       43018133 43018230    98 +      PKNOX1-203      CDS   PKNOX1   
#> # … with 1 more variable: transcript_biotype <chr>

# this can be useful when combined with shorten_gaps()
# to visualize transcripts with long introns whilst differentiating UTRs
pknox1_cds_utr_rescaled <-
    shorten_gaps(
        exons = pknox1_cds_utr,
        introns = to_intron(pknox1_cds_utr, "transcript_name"),
        group_var = "transcript_name"
    )

pknox1_cds_utr_rescaled %>%
    dplyr::filter(type == "CDS") %>%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range() +
    geom_range(
        data = pknox1_cds_utr_rescaled %>% dplyr::filter(type == "UTR"),
        height = 0.25,
        fill = "white"
    ) +
    geom_intron(
        data = to_intron(
            pknox1_cds_utr_rescaled %>% dplyr::filter(type != "intron"),
            "transcript_name"
        ),
        arrow.min.intron.length = 110
    )