Add untranslated regions (UTRs) — add

Given a set of exons (encompassing the CDS and UTRs) and cds regions, add_utr() will calculate and add the corresponding UTR regions as ranges. This can be useful when combined with shorten_gaps() to visualize transcripts with long introns, whilst differentiating UTRs from CDS regions.

Usage

add_utr(exons, cds, group_var = NULL)

Arguments

exons: data.frame() contains exons which can originate from multiple transcripts differentiated by group_var.
cds: data.frame() contains coding sequence ranges for the transcripts in exons.
group_var: character() if input data originates from more than 1 transcript, group_var must specify the column that differentiates transcripts (e.g. "transcript_id").

Value

data.frame() contains differentiated CDS and UTR ranges.

Details

The definition of the inputted cds regions are expected to range from the beginning of the start codon to the end of the stop codon. Sometimes, for example in the case of Ensembl, reference annotation will omit the stop codons from the CDS definition. In such cases, users should manually ensure that the cds includes both the start and stop codons.

Examples


library(magrittr)
library(ggplot2)

# to illustrate the package's functionality
# ggtranscript includes example transcript annotation
pknox1_annotation %>% head()
#> # A tibble: 6 × 8
#>   seqnames    start      end strand type       gene_name transcript_name
#>   <fct>       <int>    <int> <fct>  <fct>      <chr>     <chr>          
#> 1 21       42974510 43033931 +      gene       PKNOX1    NA             
#> 2 21       42974510 43033931 +      transcript PKNOX1    PKNOX1-203     
#> 3 21       42974510 42974664 +      exon       PKNOX1    PKNOX1-203     
#> 4 21       43004326 43004432 +      exon       PKNOX1    PKNOX1-203     
#> 5 21       43007491 43007618 +      exon       PKNOX1    PKNOX1-203     
#> 6 21       43013068 43013238 +      exon       PKNOX1    PKNOX1-203     
#> # ℹ 1 more variable: transcript_biotype <chr>

# extract exons
pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon")
pknox1_exons %>% head()
#> # A tibble: 6 × 8
#>   seqnames    start      end strand type  gene_name transcript_name
#>   <fct>       <int>    <int> <fct>  <fct> <chr>     <chr>          
#> 1 21       42974510 42974664 +      exon  PKNOX1    PKNOX1-203     
#> 2 21       43004326 43004432 +      exon  PKNOX1    PKNOX1-203     
#> 3 21       43007491 43007618 +      exon  PKNOX1    PKNOX1-203     
#> 4 21       43013068 43013238 +      exon  PKNOX1    PKNOX1-203     
#> 5 21       43016908 43017007 +      exon  PKNOX1    PKNOX1-203     
#> 6 21       43018133 43018230 +      exon  PKNOX1    PKNOX1-203     
#> # ℹ 1 more variable: transcript_biotype <chr>

# extract cds
pknox1_cds <- pknox1_annotation %>% dplyr::filter(type == "CDS")
pknox1_cds %>% head()
#> # A tibble: 6 × 8
#>   seqnames    start      end strand type  gene_name transcript_name
#>   <fct>       <int>    <int> <fct>  <fct> <chr>     <chr>          
#> 1 21       43013068 43013238 +      CDS   PKNOX1    PKNOX1-203     
#> 2 21       43016908 43017007 +      CDS   PKNOX1    PKNOX1-203     
#> 3 21       43018133 43018230 +      CDS   PKNOX1    PKNOX1-203     
#> 4 21       43021303 43021431 +      CDS   PKNOX1    PKNOX1-203     
#> 5 21       43024871 43024947 +      CDS   PKNOX1    PKNOX1-203     
#> 6 21       43028702 43028874 +      CDS   PKNOX1    PKNOX1-203     
#> # ℹ 1 more variable: transcript_biotype <chr>

# the CDS definition originating from the Ensembl reference annotation
# does not include the stop codon
# we must incorporate the stop codons into the CDS manually
# by adding 3 base pairs to the end of the CDS of each transcript
pknox1_cds_w_stop <- pknox1_cds %>%
    dplyr::group_by(transcript_name) %>%
    dplyr::mutate(
        end = ifelse(end == max(end), end + 3, end)
    ) %>%
    dplyr::ungroup()

# add_utr() adds ranges that represent the UTRs
pknox1_cds_utr <- add_utr(
    pknox1_exons,
    pknox1_cds_w_stop,
    group_var = "transcript_name"
)

pknox1_cds_utr %>% head()
#> # A tibble: 6 × 9
#>   seqnames    start      end width strand transcript_name type  gene_name
#>   <fct>       <int>    <int> <int> <fct>  <chr>           <chr> <chr>    
#> 1 21       42974510 42974664   155 +      PKNOX1-203      UTR   NA       
#> 2 21       43004326 43004432   107 +      PKNOX1-203      UTR   NA       
#> 3 21       43007491 43007618   128 +      PKNOX1-203      UTR   NA       
#> 4 21       43013068 43013238   171 +      PKNOX1-203      CDS   PKNOX1   
#> 5 21       43016908 43017007   100 +      PKNOX1-203      CDS   PKNOX1   
#> 6 21       43018133 43018230    98 +      PKNOX1-203      CDS   PKNOX1   
#> # ℹ 1 more variable: transcript_biotype <chr>

# this can be useful when combined with shorten_gaps()
# to visualize transcripts with long introns whilst differentiating UTRs
pknox1_cds_utr_rescaled <-
    shorten_gaps(
        exons = pknox1_cds_utr,
        introns = to_intron(pknox1_cds_utr, "transcript_name"),
        group_var = "transcript_name"
    )

pknox1_cds_utr_rescaled %>%
    dplyr::filter(type == "CDS") %>%
    ggplot(aes(
        xstart = start,
        xend = end,
        y = transcript_name
    )) +
    geom_range() +
    geom_range(
        data = pknox1_cds_utr_rescaled %>% dplyr::filter(type == "UTR"),
        height = 0.25,
        fill = "white"
    ) +
    geom_intron(
        data = to_intron(
            pknox1_cds_utr_rescaled %>% dplyr::filter(type != "intron"),
            "transcript_name"
        ),
        arrow.min.intron.length = 110
    )