vg
tools for working with variation graphs
|
#include <transcriptome.hpp>
Public Member Functions | |
Transcriptome (unique_ptr< MutablePathDeletableHandleGraph > &&graph_in) | |
int32_t | add_intron_splice_junctions (vector< istream * > intron_streams, unique_ptr< gbwt::GBWT > &haplotype_index, const bool update_haplotypes) |
int32_t | add_reference_transcripts (vector< istream * > transcript_streams, unique_ptr< gbwt::GBWT > &haplotype_index, const bool use_haplotype_paths, const bool update_haplotypes) |
int32_t | add_haplotype_transcripts (vector< istream * > transcript_streams, const gbwt::GBWT &haplotype_index, const bool proj_emded_paths) |
const vector< CompletedTranscriptPath > & | transcript_paths () const |
Returns transcript paths. More... | |
vector< CompletedTranscriptPath > | reference_transcript_paths () const |
Returns the reference transcript paths. More... | |
vector< CompletedTranscriptPath > | haplotype_transcript_paths () const |
Returns the haplotype transcript paths. More... | |
const MutablePathDeletableHandleGraph & | graph () const |
Returns the graph. More... | |
void | remove_non_transcribed_nodes () |
Removes non-transcribed (not in transcript paths) nodes. More... | |
void | chop_nodes (const uint32_t max_node_length) |
bool | sort_compact_nodes () |
void | embed_transcript_paths (const bool add_reference_transcripts, const bool add_haplotype_transcripts) |
void | add_transcripts_to_gbwt (gbwt::GBWTBuilder *gbwt_builder, const bool add_bidirectional, const bool exclude_reference_transcripts) const |
void | write_transcript_sequences (ostream *fasta_ostream, const bool exclude_reference_transcripts) const |
void | write_transcript_info (ostream *tsv_ostream, const gbwt::GBWT &haplotype_index, const bool exclude_reference_transcripts) const |
void | write_graph (ostream *graph_ostream) const |
Writes the graph to a file. More... | |
Public Attributes | |
bool | show_progress = false |
Write progress to stderr. More... | |
int32_t | num_threads = 1 |
Number of threads used for transcript path construction. More... | |
string | feature_type = "exon" |
Feature type to parse in the gtf/gff file. Parse all types if empty. More... | |
string | transcript_tag = "transcript_id" |
Attribute tag used to parse the transcript id/name in the gtf/gff file. More... | |
string | path_collapse_type = "haplotype" |
bool | error_on_missing_path = true |
Treat a missing path in the transcripts/introns as a data error. More... | |
Private Member Functions | |
void | parse_introns (vector< Transcript > *introns, istream *intron_stream, const bdsg::PositionOverlay &graph_path_pos_overlay) const |
Parse BED file of introns. More... | |
int32_t | parse_transcripts (vector< Transcript > *transcripts, uint32_t *number_of_excluded_transcripts, istream *transcript_stream, const bdsg::PositionOverlay &graph_path_pos_overlay, const gbwt::GBWT &haplotype_index, const bool use_haplotype_paths) const |
Parse gtf/gff3 file of transcripts. Returns the number of non-header lines in the parsed file. More... | |
string | get_base_gbwt_path_name (const gbwt::GBWT &haplotype_index, const size_t path_id, const unordered_set< string > &gbwt_reference_samples) const |
Returns gbwt path name without phaseblock and subrange. More... | |
string | parse_attribute_value (const string &attribute, const string &name) const |
Parse gtf/gff3 attribute value. More... | |
float | mean_node_length () const |
Returns the mean node length of the graph. More... | |
void | add_exon (Transcript *transcript, const pair< int32_t, int32_t > &exon_pos) const |
Adds the exon coordinates to a transcript. More... | |
void | add_exon (Transcript *transcript, const pair< int32_t, int32_t > &exon_pos, const bdsg::PositionOverlay &graph_path_pos_overlay) const |
void | reorder_exons (Transcript *transcript) const |
bool | has_overlapping_exons (const vector< Exon > &exons) const |
Checks whether any adjacent exons overlap. More... | |
list< EditedTranscriptPath > | construct_reference_transcript_paths_embedded (const vector< Transcript > &transcripts, const bdsg::PositionOverlay &graph_path_pos_overlay) const |
void | construct_reference_transcript_paths_embedded_callback (list< EditedTranscriptPath > *edited_transcript_paths, spp::sparse_hash_map< handle_t, vector< EditedTranscriptPath * > > *edited_transcript_paths_index, mutex *edited_transcript_paths_mutex, const int32_t thread_idx, const vector< Transcript > &transcripts, const bdsg::PositionOverlay &graph_path_pos_overlay) const |
Threaded reference transcript path construction using embedded paths. More... | |
list< EditedTranscriptPath > | project_transcript_embedded (const Transcript &cur_transcript, const bdsg::PositionOverlay &graph_path_pos_overlay, const bool use_reference_paths, const bool use_haplotype_paths) const |
Projects transcripts onto embedded paths in a graph and returns the resulting transcript paths. More... | |
list< EditedTranscriptPath > | construct_reference_transcript_paths_gbwt (const vector< Transcript > &transcripts, const gbwt::GBWT &haplotype_index) const |
void | construct_reference_transcript_paths_gbwt_callback (list< EditedTranscriptPath > *edited_transcript_paths, spp::sparse_hash_map< handle_t, vector< EditedTranscriptPath * > > *edited_transcript_paths_index, uint32_t *excluded_transcripts, mutex *edited_transcript_paths_mutex, const int32_t thread_idx, const vector< pair< uint32_t, uint32_t > > &chrom_transcript_sets, const vector< Transcript > &transcripts, const gbwt::GBWT &haplotype_index, const spp::sparse_hash_map< string, map< uint32_t, uint32_t > > &haplotype_name_index) const |
Threaded reference transcript path construction using GBWT haplotype paths. More... | |
void | project_haplotype_transcripts (const vector< Transcript > &transcripts, const gbwt::GBWT &haplotype_index, const bdsg::PositionOverlay &graph_path_pos_overlay, const bool proj_emded_paths, const float mean_node_length) |
void | project_haplotype_transcripts_callback (list< CompletedTranscriptPath > *completed_transcript_paths, spp::sparse_hash_map< handle_t, vector< CompletedTranscriptPath * > > *completed_transcript_paths_index, mutex *completed_transcript_paths_mutex, const int32_t thread_idx, const vector< Transcript > &transcripts, const gbwt::GBWT &haplotype_index, const bdsg::PositionOverlay &graph_path_pos_overlay, const bool proj_emded_paths, const float mean_node_length) |
Threaded haplotype transcript projecting. More... | |
list< EditedTranscriptPath > | project_transcript_gbwt (const Transcript &cur_transcript, const gbwt::GBWT &haplotype_index, const unordered_set< string > &reference_samples, const float mean_node_length) const |
Projects transcripts onto haplotypes in a GBWT index and returns the resulting transcript paths. More... | |
vector< pair< exon_nodes_t, thread_ids_t > > | get_exon_haplotypes (const vg::id_t start_node, const vg::id_t end_node, const gbwt::GBWT &haplotype_index, const unordered_set< string > &reference_samples, const int32_t expected_length) const |
template<class T > | |
void | remove_redundant_transcript_paths (list< T > *new_transcript_paths, spp::sparse_hash_map< handle_t, vector< T * > > *transcript_paths_index) const |
Remove redundant transcript paths and update index. More... | |
list< CompletedTranscriptPath > | construct_completed_transcript_paths (const list< EditedTranscriptPath > &edited_transcript_paths) const |
void | add_edited_transcript_paths (const list< EditedTranscriptPath > &edited_transcript_paths) |
bool | has_novel_exon_boundaries (const list< EditedTranscriptPath > &edited_transcript_paths, const bool include_transcript_ends) const |
void | augment_graph (const list< EditedTranscriptPath > &edited_transcript_paths, const bool is_introns, unique_ptr< gbwt::GBWT > &haplotype_index, const bool update_haplotypes, const bool add_reference_transcript_paths) |
void | update_haplotype_index (unique_ptr< gbwt::GBWT > &haplotype_index, const spp::sparse_hash_map< handle_t, vector< pair< int32_t, handle_t > > > &update_index) const |
Update threads in gbwt index using graph translations. More... | |
void | update_transcript_paths (const spp::sparse_hash_map< handle_t, vector< pair< int32_t, handle_t > > > &update_index) |
Update/split node handles in transcriptome transcript paths according to index. More... | |
void | add_splice_junction_edges (const list< EditedTranscriptPath > &edited_transcript_paths) |
Adds transcript path splice-junction edges to the graph. More... | |
void | add_splice_junction_edges (const list< CompletedTranscriptPath > &completed_transcript_paths) |
void | add_splice_junction_edges (const vector< CompletedTranscriptPath > &completed_transcript_paths) |
void | collect_transcribed_nodes (spp::sparse_hash_set< nid_t > *transcribed_nodes) const |
Collects all unique nodes in transcriptome transcript paths. More... | |
void | sort_transcript_paths_update_copy_id () |
Private Attributes | |
vector< CompletedTranscriptPath > | _transcript_paths |
Transcript paths representing the transcriptome. More... | |
mutex | mutex_transcript_paths |
unique_ptr< MutablePathDeletableHandleGraph > | _graph |
Spliced pangenome graph. More... | |
mutex | mutex_graph |
Class that defines a transcriptome represented by a set of transcript paths.
vg::Transcriptome::Transcriptome | ( | unique_ptr< MutablePathDeletableHandleGraph > && | graph_in | ) |
|
private |
Adds edited transcript paths to the transcriptome. Checks that the paths contain no edits compared to the graph.
|
private |
Adds the exon coordinates to a transcript.
|
private |
Adds the exon coordinates to a transcript and finds the position of each end of an exon on the contig path in the graph.
int32_t vg::Transcriptome::add_haplotype_transcripts | ( | vector< istream * > | transcript_streams, |
const gbwt::GBWT & | haplotype_index, | ||
const bool | proj_emded_paths | ||
) |
Adds haplotype-specific transcript paths by projecting transcripts in gtf/gff3 files onto either non-reference embedded paths and/or haplotypes in a GBWT index. Returns the number of haplotype transcript paths projected.
int32_t vg::Transcriptome::add_intron_splice_junctions | ( | vector< istream * > | intron_streams, |
unique_ptr< gbwt::GBWT > & | haplotype_index, | ||
const bool | update_haplotypes | ||
) |
Adds splice-junctions from intron BED files to the graph. Optionally updates the haplotype GBWT index with the new splice-junctions. Returns the number of introns parsed.
int32_t vg::Transcriptome::add_reference_transcripts | ( | vector< istream * > | transcript_streams, |
unique_ptr< gbwt::GBWT > & | haplotype_index, | ||
const bool | use_haplotype_paths, | ||
const bool | update_haplotypes | ||
) |
Adds splice-junctions from transcript gtf/gff3 files to the graph and creates reference transcript paths. Optionally updates the haplotype GBWT index with the new splice-junctions. Returns the number of transcripts parsed.
|
private |
|
private |
Adds transcript path splice-junction edges to the graph.
|
private |
void vg::Transcriptome::add_transcripts_to_gbwt | ( | gbwt::GBWTBuilder * | gbwt_builder, |
const bool | add_bidirectional, | ||
const bool | exclude_reference_transcripts | ||
) | const |
Adds transcriptome transcript paths as threads to a GBWT index. Returns the number of added threads.
|
private |
Augments the graph with transcript path exon boundaries and splice-junctions. Updates threads in the gbwt index to match the augmented graph. Optionally adds transcript paths to the transcriptome.
void vg::Transcriptome::chop_nodes | ( | const uint32_t | max_node_length | ) |
Chop nodes so that they are not longer than the supplied maximum node length. Returns number of chopped nodes.
|
private |
Collects all unique nodes in transcriptome transcript paths.
|
private |
Constructs completed transcript paths from edited transcript paths. Checks that the paths contain no edits compared to the graph.
|
private |
Constructs edited reference transcript paths from a set of transcripts using embedded graph paths.
|
private |
Threaded reference transcript path construction using embedded paths.
|
private |
Constructs edited reference transcript paths from a set of transcripts using haplotype paths in a GBWT index.
|
private |
Threaded reference transcript path construction using GBWT haplotype paths.
void vg::Transcriptome::embed_transcript_paths | ( | const bool | add_reference_transcripts, |
const bool | add_haplotype_transcripts | ||
) |
Embeds transcriptome transcript paths in the graph.
Returns the number of paths embedded.
|
private |
Returns gbwt path name without phaseblock and subrange.
|
private |
Extracts all unique haplotype paths between two nodes from a GBWT index and returns the resulting paths and the corresponding haplotype ids for each path.
const MutablePathDeletableHandleGraph & vg::Transcriptome::graph | ( | ) | const |
Returns the graph.
vector< CompletedTranscriptPath > vg::Transcriptome::haplotype_transcript_paths | ( | ) | const |
Returns the haplotype transcript paths.
|
private |
Checks whether a transcript path consists only of whole nodes (complete).
|
private |
Checks whether any adjacent exons overlap.
|
private |
Returns the mean node length of the graph.
|
private |
Parse gtf/gff3 attribute value.
|
private |
Parse BED file of introns.
|
private |
Parse gtf/gff3 file of transcripts. Returns the number of non-header lines in the parsed file.
|
private |
Constructs haplotype transcript paths by projecting transcripts onto embedded paths in a graph and/or haplotypes in a GBWT index. Adds haplotype transcript to transcriptome.
|
private |
Threaded haplotype transcript projecting.
|
private |
Projects transcripts onto embedded paths in a graph and returns the resulting transcript paths.
|
private |
Projects transcripts onto haplotypes in a GBWT index and returns the resulting transcript paths.
vector< CompletedTranscriptPath > vg::Transcriptome::reference_transcript_paths | ( | ) | const |
Returns the reference transcript paths.
void vg::Transcriptome::remove_non_transcribed_nodes | ( | ) |
Removes non-transcribed (not in transcript paths) nodes.
|
private |
Remove redundant transcript paths and update index.
|
private |
Reverses exon order if the transcript is on the reverse strand and the exons are ordered in reverse.
bool vg::Transcriptome::sort_compact_nodes | ( | ) |
Topologically sorts the graph and compacts node ids. Only works for graphs in the PackedGraph format. Returns false if not sorted.
|
private |
Sorts transcriptome transcript paths by name and by whether they are reference paths, and updates their copy ids.
const vector< CompletedTranscriptPath > & vg::Transcriptome::transcript_paths | ( | ) | const |
Returns transcript paths.
|
private |
Update threads in gbwt index using graph translations.
|
private |
Update/split node handles in transcriptome transcript paths according to index.
void vg::Transcriptome::write_graph | ( | ostream * | graph_ostream | ) | const |
Writes the graph to a file.
void vg::Transcriptome::write_transcript_info | ( | ostream * | tsv_ostream, |
const gbwt::GBWT & | haplotype_index, | ||
const bool | exclude_reference_transcripts | ||
) | const |
Writes info on transcriptome transcript paths to tsv file. Returns the number of written transcripts.
void vg::Transcriptome::write_transcript_sequences | ( | ostream * | fasta_ostream, |
const bool | exclude_reference_transcripts | ||
) | const |
Writes transcriptome transcript path sequences to a fasta file.
Returns the number of written sequences.
|
private |
Spliced pangenome graph.
|
private |
Transcript paths representing the transcriptome.
bool vg::Transcriptome::error_on_missing_path = true |
Treat a missing path in the transcripts/introns as a data error.
string vg::Transcriptome::feature_type = "exon" |
Feature type to parse in the gtf/gff file. Parse all types if empty.
|
private |
|
private |
int32_t vg::Transcriptome::num_threads = 1 |
Number of threads used for transcript path construction.
string vg::Transcriptome::path_collapse_type = "haplotype" |
Specifies which paths should be compared when collapsing identical paths. Can be no, haplotype or all.
bool vg::Transcriptome::show_progress = false |
Write progress to stderr.
string vg::Transcriptome::transcript_tag = "transcript_id" |
Attribute tag used to parse the transcript id/name in the gtf/gff file.