vg
tools for working with variation graphs
Public Member Functions | Public Attributes | Private Member Functions | Private Attributes | List of all members
vg::Transcriptome Class Reference

#include <transcriptome.hpp>

Public Member Functions

 Transcriptome (unique_ptr< MutablePathDeletableHandleGraph > &&graph_in)
 
int32_t add_intron_splice_junctions (vector< istream * > intron_streams, unique_ptr< gbwt::GBWT > &haplotype_index, const bool update_haplotypes)
 
int32_t add_reference_transcripts (vector< istream * > transcript_streams, unique_ptr< gbwt::GBWT > &haplotype_index, const bool use_haplotype_paths, const bool update_haplotypes)
 
int32_t add_haplotype_transcripts (vector< istream * > transcript_streams, const gbwt::GBWT &haplotype_index, const bool proj_emded_paths)
 
const vector< CompletedTranscriptPath > & transcript_paths () const
 Returns transcript paths. More...
 
vector< CompletedTranscriptPath > reference_transcript_paths () const
 Returns the reference transcript paths. More...
 
vector< CompletedTranscriptPath > haplotype_transcript_paths () const
 Returns the haplotype transcript paths. More...
 
const MutablePathDeletableHandleGraph & graph () const
 Returns the graph. More...
 
void remove_non_transcribed_nodes ()
 Removes non-transcribed (not in transcript paths) nodes. More...
 
void chop_nodes (const uint32_t max_node_length)
 
bool sort_compact_nodes ()
 
void embed_transcript_paths (const bool add_reference_transcripts, const bool add_haplotype_transcripts)
 
void add_transcripts_to_gbwt (gbwt::GBWTBuilder *gbwt_builder, const bool add_bidirectional, const bool exclude_reference_transcripts) const
 
void write_transcript_sequences (ostream *fasta_ostream, const bool exclude_reference_transcripts) const
 
void write_transcript_info (ostream *tsv_ostream, const gbwt::GBWT &haplotype_index, const bool exclude_reference_transcripts) const
 
void write_graph (ostream *graph_ostream) const
 Writes the graph to a file. More...
 

Public Attributes

bool show_progress = false
 Write progress to stderr. More...
 
int32_t num_threads = 1
 Number of threads used for transcript path construction. More...
 
string feature_type = "exon"
 Feature type to parse in the gtf/gff file. Parse all types if empty. More...
 
string transcript_tag = "transcript_id"
 Attribute tag used to parse the transcript id/name in the gtf/gff file. More...
 
string path_collapse_type = "haplotype"
 
bool error_on_missing_path = true
 Treat a missing path in the transcripts/introns as a data error. More...
 

Private Member Functions

void parse_introns (vector< Transcript > *introns, istream *intron_stream, const bdsg::PositionOverlay &graph_path_pos_overlay) const
 Parse BED file of introns. More...
 
int32_t parse_transcripts (vector< Transcript > *transcripts, uint32_t *number_of_excluded_transcripts, istream *transcript_stream, const bdsg::PositionOverlay &graph_path_pos_overlay, const gbwt::GBWT &haplotype_index, const bool use_haplotype_paths) const
 Parse gtf/gff3 file of transcripts. Returns the number of non-header lines in the parsed file. More...
 
string get_base_gbwt_path_name (const gbwt::GBWT &haplotype_index, const size_t path_id, const unordered_set< string > &gbwt_reference_samples) const
 Returns gbwt path name without phaseblock and subrange. More...
 
string parse_attribute_value (const string &attribute, const string &name) const
 Parse gtf/gff3 attribute value. More...
 
float mean_node_length () const
 Returns the mean node length of the graph. More...
 
void add_exon (Transcript *transcript, const pair< int32_t, int32_t > &exon_pos) const
 Adds the exon coordinates to a transcript. More...
 
void add_exon (Transcript *transcript, const pair< int32_t, int32_t > &exon_pos, const bdsg::PositionOverlay &graph_path_pos_overlay) const
 
void reorder_exons (Transcript *transcript) const
 
bool has_overlapping_exons (const vector< Exon > &exons) const
 Checks whether any adjacent exons overlap. More...
 
list< EditedTranscriptPath > construct_reference_transcript_paths_embedded (const vector< Transcript > &transcripts, const bdsg::PositionOverlay &graph_path_pos_overlay) const
 
void construct_reference_transcript_paths_embedded_callback (list< EditedTranscriptPath > *edited_transcript_paths, spp::sparse_hash_map< handle_t, vector< EditedTranscriptPath * > > *edited_transcript_paths_index, mutex *edited_transcript_paths_mutex, const int32_t thread_idx, const vector< Transcript > &transcripts, const bdsg::PositionOverlay &graph_path_pos_overlay) const
 Threaded reference transcript path construction using embedded paths. More...
 
list< EditedTranscriptPath > project_transcript_embedded (const Transcript &cur_transcript, const bdsg::PositionOverlay &graph_path_pos_overlay, const bool use_reference_paths, const bool use_haplotype_paths) const
 Projects transcripts onto embedded paths in a graph and returns the resulting transcript paths. More...
 
list< EditedTranscriptPath > construct_reference_transcript_paths_gbwt (const vector< Transcript > &transcripts, const gbwt::GBWT &haplotype_index) const
 
void construct_reference_transcript_paths_gbwt_callback (list< EditedTranscriptPath > *edited_transcript_paths, spp::sparse_hash_map< handle_t, vector< EditedTranscriptPath * > > *edited_transcript_paths_index, uint32_t *excluded_transcripts, mutex *edited_transcript_paths_mutex, const int32_t thread_idx, const vector< pair< uint32_t, uint32_t > > &chrom_transcript_sets, const vector< Transcript > &transcripts, const gbwt::GBWT &haplotype_index, const spp::sparse_hash_map< string, map< uint32_t, uint32_t > > &haplotype_name_index) const
 Threaded reference transcript path construction using GBWT haplotype paths. More...
 
void project_haplotype_transcripts (const vector< Transcript > &transcripts, const gbwt::GBWT &haplotype_index, const bdsg::PositionOverlay &graph_path_pos_overlay, const bool proj_emded_paths, const float mean_node_length)
 
void project_haplotype_transcripts_callback (list< CompletedTranscriptPath > *completed_transcript_paths, spp::sparse_hash_map< handle_t, vector< CompletedTranscriptPath * > > *completed_transcript_paths_index, mutex *completed_transcript_paths_mutex, const int32_t thread_idx, const vector< Transcript > &transcripts, const gbwt::GBWT &haplotype_index, const bdsg::PositionOverlay &graph_path_pos_overlay, const bool proj_emded_paths, const float mean_node_length)
 Threaded haplotype transcript projecting. More...
 
list< EditedTranscriptPath > project_transcript_gbwt (const Transcript &cur_transcript, const gbwt::GBWT &haplotype_index, const unordered_set< string > &reference_samples, const float mean_node_length) const
 Projects transcripts onto haplotypes in a GBWT index and returns the resulting transcript paths. More...
 
vector< pair< exon_nodes_t, thread_ids_t > > get_exon_haplotypes (const vg::id_t start_node, const vg::id_t end_node, const gbwt::GBWT &haplotype_index, const unordered_set< string > &reference_samples, const int32_t expected_length) const
 
template<class T >
void remove_redundant_transcript_paths (list< T > *new_transcript_paths, spp::sparse_hash_map< handle_t, vector< T * > > *transcript_paths_index) const
 Remove redundant transcript paths and update index. More...
 
list< CompletedTranscriptPath > construct_completed_transcript_paths (const list< EditedTranscriptPath > &edited_transcript_paths) const
 
void add_edited_transcript_paths (const list< EditedTranscriptPath > &edited_transcript_paths)
 
bool has_novel_exon_boundaries (const list< EditedTranscriptPath > &edited_transcript_paths, const bool include_transcript_ends) const
 
void augment_graph (const list< EditedTranscriptPath > &edited_transcript_paths, const bool is_introns, unique_ptr< gbwt::GBWT > &haplotype_index, const bool update_haplotypes, const bool add_reference_transcript_paths)
 
void update_haplotype_index (unique_ptr< gbwt::GBWT > &haplotype_index, const spp::sparse_hash_map< handle_t, vector< pair< int32_t, handle_t > > > &update_index) const
 Update threads in gbwt index using graph translations. More...
 
void update_transcript_paths (const spp::sparse_hash_map< handle_t, vector< pair< int32_t, handle_t > > > &update_index)
 Update/split node handles in transcriptome transcript paths according to index. More...
 
void add_splice_junction_edges (const list< EditedTranscriptPath > &edited_transcript_paths)
 Adds transcript path splice-junction edges to the graph. More...
 
void add_splice_junction_edges (const list< CompletedTranscriptPath > &completed_transcript_paths)
 
void add_splice_junction_edges (const vector< CompletedTranscriptPath > &completed_transcript_paths)
 
void collect_transcribed_nodes (spp::sparse_hash_set< nid_t > *transcribed_nodes) const
 Collects all unique nodes in transcriptome transcript paths. More...
 
void sort_transcript_paths_update_copy_id ()
 

Private Attributes

vector< CompletedTranscriptPath > _transcript_paths
 Transcript paths representing the transcriptome. More...
 
mutex mutex_transcript_paths
 
unique_ptr< MutablePathDeletableHandleGraph > _graph
 Spliced pangenome graph. More...
 
mutex mutex_graph
 

Detailed Description

Class that defines a transcriptome represented by a set of transcript paths.

Constructor & Destructor Documentation

◆ Transcriptome()

vg::Transcriptome::Transcriptome ( unique_ptr< MutablePathDeletableHandleGraph > &&  graph_in)

Member Function Documentation

◆ add_edited_transcript_paths()

void vg::Transcriptome::add_edited_transcript_paths ( const list< EditedTranscriptPath > &  edited_transcript_paths)
private

Adds edited transcript paths to the transcriptome. Checks that the paths contain no edits compared to the graph.

◆ add_exon() [1/2]

void vg::Transcriptome::add_exon ( Transcript *  transcript,
const pair< int32_t, int32_t > &  exon_pos 
) const
private

Adds the exon coordinates to a transcript.

◆ add_exon() [2/2]

void vg::Transcriptome::add_exon ( Transcript *  transcript,
const pair< int32_t, int32_t > &  exon_pos,
const bdsg::PositionOverlay &  graph_path_pos_overlay 
) const
private

Adds the exon coordinates to a transcript and finds the position of each end of an exon on the contig path in the graph.

◆ add_haplotype_transcripts()

int32_t vg::Transcriptome::add_haplotype_transcripts ( vector< istream * >  transcript_streams,
const gbwt::GBWT &  haplotype_index,
const bool  proj_emded_paths 
)

Adds haplotype-specific transcript paths by projecting transcripts in gtf/gff3 files onto either non-reference embedded paths and/or haplotypes in a GBWT index. Returns the number of haplotype transcript paths projected.

◆ add_intron_splice_junctions()

int32_t vg::Transcriptome::add_intron_splice_junctions ( vector< istream * >  intron_streams,
unique_ptr< gbwt::GBWT > &  haplotype_index,
const bool  update_haplotypes 
)

Adds splice-junctions from intron BED files to the graph. Optionally updates the haplotype GBWT index with the new splice-junctions. Returns the number of introns parsed.

◆ add_reference_transcripts()

int32_t vg::Transcriptome::add_reference_transcripts ( vector< istream * >  transcript_streams,
unique_ptr< gbwt::GBWT > &  haplotype_index,
const bool  use_haplotype_paths,
const bool  update_haplotypes 
)

Adds splice-junctions from transcript gtf/gff3 files to the graph and creates reference transcript paths. Optionally updates the haplotype GBWT index with the new splice-junctions. Returns the number of transcripts parsed.

◆ add_splice_junction_edges() [1/3]

void vg::Transcriptome::add_splice_junction_edges ( const list< CompletedTranscriptPath > &  completed_transcript_paths)
private

◆ add_splice_junction_edges() [2/3]

void vg::Transcriptome::add_splice_junction_edges ( const list< EditedTranscriptPath > &  edited_transcript_paths)
private

Adds transcript path splice-junction edges to the graph.

◆ add_splice_junction_edges() [3/3]

void vg::Transcriptome::add_splice_junction_edges ( const vector< CompletedTranscriptPath > &  completed_transcript_paths)
private

◆ add_transcripts_to_gbwt()

void vg::Transcriptome::add_transcripts_to_gbwt ( gbwt::GBWTBuilder *  gbwt_builder,
const bool  add_bidirectional,
const bool  exclude_reference_transcripts 
) const

Adds transcriptome transcript paths as threads to a GBWT index. Returns the number of added threads.

◆ augment_graph()

void vg::Transcriptome::augment_graph ( const list< EditedTranscriptPath > &  edited_transcript_paths,
const bool  is_introns,
unique_ptr< gbwt::GBWT > &  haplotype_index,
const bool  update_haplotypes,
const bool  add_reference_transcript_paths 
)
private

Augments the graph with transcript path exon boundaries and splice-junctions. Updates threads in gbwt index to match the augmented graph. Optionally adds transcript paths to the transcriptome.

◆ chop_nodes()

void vg::Transcriptome::chop_nodes ( const uint32_t  max_node_length)

Chop nodes so that they are not longer than the supplied maximum node length. Returns number of chopped nodes.

◆ collect_transcribed_nodes()

void vg::Transcriptome::collect_transcribed_nodes ( spp::sparse_hash_set< nid_t > *  transcribed_nodes) const
private

Collects all unique nodes in transcriptome transcript paths.

◆ construct_completed_transcript_paths()

list< CompletedTranscriptPath > vg::Transcriptome::construct_completed_transcript_paths ( const list< EditedTranscriptPath > &  edited_transcript_paths) const
private

Constructs completed transcripts paths from edited transcript paths. Checks that the paths contain no edits compared to the graph.

◆ construct_reference_transcript_paths_embedded()

list< EditedTranscriptPath > vg::Transcriptome::construct_reference_transcript_paths_embedded ( const vector< Transcript > &  transcripts,
const bdsg::PositionOverlay &  graph_path_pos_overlay 
) const
private

Constructs edited reference transcript paths from a set of transcripts using embedded graph paths.

◆ construct_reference_transcript_paths_embedded_callback()

void vg::Transcriptome::construct_reference_transcript_paths_embedded_callback ( list< EditedTranscriptPath > *  edited_transcript_paths,
spp::sparse_hash_map< handle_t, vector< EditedTranscriptPath * > > *  edited_transcript_paths_index,
mutex *  edited_transcript_paths_mutex,
const int32_t  thread_idx,
const vector< Transcript > &  transcripts,
const bdsg::PositionOverlay &  graph_path_pos_overlay 
) const
private

Threaded reference transcript path construction using embedded paths.

◆ construct_reference_transcript_paths_gbwt()

list< EditedTranscriptPath > vg::Transcriptome::construct_reference_transcript_paths_gbwt ( const vector< Transcript > &  transcripts,
const gbwt::GBWT &  haplotype_index 
) const
private

Constructs edited reference transcript paths from a set of transcripts using haplotype paths in a GBWT index.

◆ construct_reference_transcript_paths_gbwt_callback()

void vg::Transcriptome::construct_reference_transcript_paths_gbwt_callback ( list< EditedTranscriptPath > *  edited_transcript_paths,
spp::sparse_hash_map< handle_t, vector< EditedTranscriptPath * > > *  edited_transcript_paths_index,
uint32_t *  excluded_transcripts,
mutex *  edited_transcript_paths_mutex,
const int32_t  thread_idx,
const vector< pair< uint32_t, uint32_t > > &  chrom_transcript_sets,
const vector< Transcript > &  transcripts,
const gbwt::GBWT &  haplotype_index,
const spp::sparse_hash_map< string, map< uint32_t, uint32_t > > &  haplotype_name_index 
) const
private

Threaded reference transcript path construction using GBWT haplotype paths.

◆ embed_transcript_paths()

void vg::Transcriptome::embed_transcript_paths ( const bool  add_reference_transcripts,
const bool  add_haplotype_transcripts 
)

Embeds transcriptome transcript paths in the graph.
Returns the number of paths embedded.

◆ get_base_gbwt_path_name()

string vg::Transcriptome::get_base_gbwt_path_name ( const gbwt::GBWT &  haplotype_index,
const size_t  path_id,
const unordered_set< string > &  gbwt_reference_samples 
) const
private

Returns gbwt path name without phaseblock and subrange.

◆ get_exon_haplotypes()

vector< pair< exon_nodes_t, thread_ids_t > > vg::Transcriptome::get_exon_haplotypes ( const vg::id_t  start_node,
const vg::id_t  end_node,
const gbwt::GBWT &  haplotype_index,
const unordered_set< string > &  reference_samples,
const int32_t  expected_length 
) const
private

Extracts all unique haplotype paths between two nodes from a GBWT index and returns the resulting paths and the corresponding haplotype ids for each path.

◆ graph()

const MutablePathDeletableHandleGraph & vg::Transcriptome::graph ( ) const

Returns the graph.

◆ haplotype_transcript_paths()

vector< CompletedTranscriptPath > vg::Transcriptome::haplotype_transcript_paths ( ) const

Returns the haplotype transcript paths.

◆ has_novel_exon_boundaries()

bool vg::Transcriptome::has_novel_exon_boundaries ( const list< EditedTranscriptPath > &  edited_transcript_paths,
const bool  include_transcript_ends 
) const
private

Checks whether the transcript paths consist only of whole nodes (complete).

◆ has_overlapping_exons()

bool vg::Transcriptome::has_overlapping_exons ( const vector< Exon > &  exons) const
private

Checks whether any adjacent exons overlap.

◆ mean_node_length()

float vg::Transcriptome::mean_node_length ( ) const
private

Returns the mean node length of the graph.

◆ parse_attribute_value()

string vg::Transcriptome::parse_attribute_value ( const string &  attribute,
const string &  name 
) const
private

Parse gtf/gff3 attribute value.

◆ parse_introns()

void vg::Transcriptome::parse_introns ( vector< Transcript > *  introns,
istream *  intron_stream,
const bdsg::PositionOverlay &  graph_path_pos_overlay 
) const
private

Parse BED file of introns.

◆ parse_transcripts()

int32_t vg::Transcriptome::parse_transcripts ( vector< Transcript > *  transcripts,
uint32_t *  number_of_excluded_transcripts,
istream *  transcript_stream,
const bdsg::PositionOverlay &  graph_path_pos_overlay,
const gbwt::GBWT &  haplotype_index,
const bool  use_haplotype_paths 
) const
private

Parse gtf/gff3 file of transcripts. Returns the number of non-header lines in the parsed file.

◆ project_haplotype_transcripts()

void vg::Transcriptome::project_haplotype_transcripts ( const vector< Transcript > &  transcripts,
const gbwt::GBWT &  haplotype_index,
const bdsg::PositionOverlay &  graph_path_pos_overlay,
const bool  proj_emded_paths,
const float  mean_node_length 
)
private

Constructs haplotype transcript paths by projecting transcripts onto embedded paths in a graph and/or haplotypes in a GBWT index. Adds haplotype transcript to transcriptome.

◆ project_haplotype_transcripts_callback()

void vg::Transcriptome::project_haplotype_transcripts_callback ( list< CompletedTranscriptPath > *  completed_transcript_paths,
spp::sparse_hash_map< handle_t, vector< CompletedTranscriptPath * > > *  completed_transcript_paths_index,
mutex *  completed_transcript_paths_mutex,
const int32_t  thread_idx,
const vector< Transcript > &  transcripts,
const gbwt::GBWT &  haplotype_index,
const bdsg::PositionOverlay &  graph_path_pos_overlay,
const bool  proj_emded_paths,
const float  mean_node_length 
)
private

Threaded haplotype transcript projecting.

◆ project_transcript_embedded()

list< EditedTranscriptPath > vg::Transcriptome::project_transcript_embedded ( const Transcript &  cur_transcript,
const bdsg::PositionOverlay &  graph_path_pos_overlay,
const bool  use_reference_paths,
const bool  use_haplotype_paths 
) const
private

Projects transcripts onto embedded paths in a graph and returns the resulting transcript paths.

◆ project_transcript_gbwt()

list< EditedTranscriptPath > vg::Transcriptome::project_transcript_gbwt ( const Transcript &  cur_transcript,
const gbwt::GBWT &  haplotype_index,
const unordered_set< string > &  reference_samples,
const float  mean_node_length 
) const
private

Projects transcripts onto haplotypes in a GBWT index and returns the resulting transcript paths.

◆ reference_transcript_paths()

vector< CompletedTranscriptPath > vg::Transcriptome::reference_transcript_paths ( ) const

Returns the reference transcript paths.

◆ remove_non_transcribed_nodes()

void vg::Transcriptome::remove_non_transcribed_nodes ( )

Removes non-transcribed (not in transcript paths) nodes.

◆ remove_redundant_transcript_paths()

template<class T >
void vg::Transcriptome::remove_redundant_transcript_paths ( list< T > *  new_transcript_paths,
spp::sparse_hash_map< handle_t, vector< T * > > *  transcript_paths_index 
) const
private

Remove redundant transcript paths and update index.

◆ reorder_exons()

void vg::Transcriptome::reorder_exons ( Transcript *  transcript) const
private

Reverses exon order if the transcript is on the reverse strand and the exons are ordered in reverse.

◆ sort_compact_nodes()

bool vg::Transcriptome::sort_compact_nodes ( )

Topologically sorts the graph and compacts node ids. Only works for graphs in the PackedGraph format. Returns false if not sorted.

◆ sort_transcript_paths_update_copy_id()

void vg::Transcriptome::sort_transcript_paths_update_copy_id ( )
private

Sorts transcriptome transcript paths by name and by whether they are reference paths, and updates their copy ids.

◆ transcript_paths()

const vector< CompletedTranscriptPath > & vg::Transcriptome::transcript_paths ( ) const

Returns transcript paths.

◆ update_haplotype_index()

void vg::Transcriptome::update_haplotype_index ( unique_ptr< gbwt::GBWT > &  haplotype_index,
const spp::sparse_hash_map< handle_t, vector< pair< int32_t, handle_t > > > &  update_index 
) const
private

Update threads in gbwt index using graph translations.

◆ update_transcript_paths()

void vg::Transcriptome::update_transcript_paths ( const spp::sparse_hash_map< handle_t, vector< pair< int32_t, handle_t > > > &  update_index)
private

Update/split node handles in transcriptome transcript paths according to index.

◆ write_graph()

void vg::Transcriptome::write_graph ( ostream *  graph_ostream) const

Writes the graph to a file.

◆ write_transcript_info()

void vg::Transcriptome::write_transcript_info ( ostream *  tsv_ostream,
const gbwt::GBWT &  haplotype_index,
const bool  exclude_reference_transcripts 
) const

Writes info on transcriptome transcript paths to tsv file. Returns the number of written transcripts.

◆ write_transcript_sequences()

void vg::Transcriptome::write_transcript_sequences ( ostream *  fasta_ostream,
const bool  exclude_reference_transcripts 
) const

Writes transcriptome transcript path sequences to a fasta file.
Returns the number of written sequences.

Member Data Documentation

◆ _graph

unique_ptr<MutablePathDeletableHandleGraph> vg::Transcriptome::_graph
private

Spliced pangenome graph.

◆ _transcript_paths

vector<CompletedTranscriptPath> vg::Transcriptome::_transcript_paths
private

Transcript paths representing the transcriptome.

◆ error_on_missing_path

bool vg::Transcriptome::error_on_missing_path = true

Treat a missing path in the transcripts/introns as a data error.

◆ feature_type

string vg::Transcriptome::feature_type = "exon"

Feature type to parse in the gtf/gff file. Parse all types if empty.

◆ mutex_graph

mutex vg::Transcriptome::mutex_graph
private

◆ mutex_transcript_paths

mutex vg::Transcriptome::mutex_transcript_paths
private

◆ num_threads

int32_t vg::Transcriptome::num_threads = 1

Number of threads used for transcript path construction.

◆ path_collapse_type

string vg::Transcriptome::path_collapse_type = "haplotype"

Specifies which paths should be compared when collapsing identical paths. Can be no, haplotype or all.

◆ show_progress

bool vg::Transcriptome::show_progress = false

Write progress to stderr.

◆ transcript_tag

string vg::Transcriptome::transcript_tag = "transcript_id"

Attribute tag used to parse the transcript id/name in the gtf/gff file.


The documentation for this class was generated from the following files: