vg
tools for working with variation graphs
|
#include <genotyper.hpp>
Classes | |
struct | Affinity |
struct | hash_ambiguous_allele_set |
struct | hash_node_traversal |
struct | hash_oriented_edge |
Public Types | |
enum | TraversalAlg { Reads, Exhaustive, Representative, Adaptive } |
Public Member Functions | |
void | run (AugmentedGraph &graph, ostream &out, string ref_path_name, string contig_name="", string sample_name="", string augmented_file_name="", bool output_vcf=false, bool output_json=false, int length_override=0, int variant_offset=0) |
int | alignment_qual_score (VG &graph, const Snarl *snarl, const Alignment &alignment) |
vector< SnarlTraversal > | get_snarl_traversals (AugmentedGraph &augmented_graph, SnarlManager &manager, map< string, const Alignment * > &reads_by_name, const Snarl *snarl, const pair< unordered_set< id_t >, unordered_set< edge_t > > &contents, PathIndex *reference_index, TraversalAlg use_traversal_alg) |
string | get_qualities_in_snarl (VG &graph, const Snarl *snarl, const Alignment &alignment) |
map< const Alignment *, vector< Affinity > > | get_affinities (AugmentedGraph &aug, const map< string, const Alignment * > &reads_by_name, const Snarl *snarl, const pair< unordered_set< id_t >, unordered_set< edge_t > > &contents, const SnarlManager &manager, const vector< SnarlTraversal > &superbubble_paths) |
map< const Alignment *, vector< Affinity > > | get_affinities_fast (AugmentedGraph &aug, const map< string, const Alignment * > &reads_by_name, const Snarl *snarl, const pair< unordered_set< id_t >, unordered_set< edge_t > > &contents, const SnarlManager &manager, const vector< SnarlTraversal > &superbubble_paths, bool allow_internal_alignments=false) |
Locus | genotype_snarl (VG &graph, const Snarl *snarl, const vector< SnarlTraversal > &superbubble_paths, const map< const Alignment *, vector< Affinity >> &affinities) |
double | get_genotype_log_likelihood (VG &graph, const Snarl *snarl, const vector< int > &genotype, const vector< pair< const Alignment *, vector< Affinity >>> &alignment_consistency) |
double | get_genotype_log_prior (const vector< int > &genotype) |
vector< vcflib::Variant > | locus_to_variant (VG &graph, const Snarl *snarl, const pair< unordered_set< id_t >, unordered_set< edge_t > > &contents, const SnarlManager &manager, const PathIndex &index, vcflib::VariantCallFile &vcf, const Locus &locus, const string &sample_name="SAMPLE") |
void | write_vcf_header (std::ostream &stream, const std::string &sample_name, const std::string &contig_name, size_t contig_size) |
vcflib::VariantCallFile * | start_vcf (std::ostream &stream, const PathIndex &index, const string &sample_name, const string &contig_name, size_t contig_size) |
pair< pair< int64_t, int64_t >, bool > | get_snarl_reference_bounds (const Snarl *snarl, const PathIndex &index, const HandleGraph *graph) |
void | report_snarl (const Snarl *snarl, const SnarlManager &manager, const PathIndex *index, VG &graph, PathIndex *reference_index) |
void | report_snarl_traversal (const Snarl *snarl, const SnarlManager &manager, VG &graph) |
void | report_affinities (map< const Alignment *, vector< Genotyper::Affinity >> &affinities, vector< SnarlTraversal > &paths, VG &graph) |
void | write_variant () |
void | print_statistics (ostream &out) |
VG | make_subset_graph (VG &graph, const string &ref_path_name, map< string, const Alignment * > &reads_by_name) |
void | edge_allele_labels (const VG &graph, const Snarl *snarl, const vector< list< NodeTraversal >> &superbubble_paths, unordered_map< pair< NodeTraversal, NodeTraversal >, unordered_set< size_t >, hash_oriented_edge > *out_edge_allele_sets) |
void | allele_ambiguity_log_probs (const VG &graph, const Snarl *snarl, const vector< list< NodeTraversal >> &superbubble_paths, const unordered_map< pair< NodeTraversal, NodeTraversal >, unordered_set< size_t >, hash_oriented_edge > &edge_allele_sets, vector< unordered_map< vector< size_t >, double, hash_ambiguous_allele_set >> *out_allele_ambiguity_probs) |
Static Public Member Functions | |
static bool | mapping_enters_side (const Mapping &mapping, const handle_t &side, const HandleGraph *graph) |
static bool | mapping_exits_side (const Mapping &mapping, const handle_t &side, const HandleGraph *graph) |
static bool | is_snarl_smaller_than_reads (AugmentedGraph &augmented_graph, const Snarl *snarl, const pair< unordered_set< id_t >, unordered_set< edge_t > > &contents, map< string, const Alignment * > &reads_by_name) |
Public Attributes | |
size_t | max_path_search_steps = 100 |
int | unfold_max_length = 200 |
int | dagify_steps = 1 |
double | max_het_bias = 4 |
bool | use_mapq = false |
bool | realign_indels = false |
int | default_sequence_quality = 15 |
int | min_recurrence = 2 |
int | min_unique_per_strand = 2 |
double | min_score_per_base = 0.90 |
double | diploid_prior_logprob = prob_to_logprob(0.999) |
double | het_prior_logprob = prob_to_logprob(0.1) |
double | haploid_prior_logprob = prob_to_logprob(0.5) |
double | deleted_prior_logprob = prob_to_logprob(0.1) |
double | polyploid_prior_success_logprob = prob_to_logprob(0.5) |
Translator | translator |
unordered_map< size_t, size_t > | snarl_reference_length_histogram |
unordered_set< const Snarl * > | snarl_traversals |
unordered_set< const Snarl * > | all_snarls |
vector< Aligner > | normal_aligners |
vector< QualAdjAligner > | quality_aligners |
map< TraversalAlg, string > | alg2name |
TraversalAlg | traversal_alg = TraversalAlg::Reads |
bool | show_progress = false |
Class to hold on to genotyping parameters and genotyping functions.
int vg::Genotyper::alignment_qual_score | ( | VG & | graph, |
const Snarl * | snarl, | ||
const Alignment & | alignment | ||
) |
Given an Alignment and a Snarl, compute a phred score for the quality of the alignment's bases within the snarl overall (not counting the start and end nodes), which is supposed to be interpretable as the probability that the call of the sequence is wrong (to the degree that it would no longer support the alleles it appears to support).
In practice we're just going to average the quality scores for all the bases interior to the snarl (i.e. not counting the start and end nodes).
If the alignment doesn't have base qualities, or no qualities are available for bases internal to the snarl, returns a default value.
void vg::Genotyper::allele_ambiguity_log_probs | ( | const VG & | graph, |
const Snarl * | snarl, | ||
const vector< list< NodeTraversal >> & | superbubble_paths, | ||
const unordered_map< pair< NodeTraversal, NodeTraversal >, unordered_set< size_t >, hash_oriented_edge > & | edge_allele_sets, | ||
vector< unordered_map< vector< size_t >, double, hash_ambiguous_allele_set >> * | out_allele_ambiguity_probs | ||
) |
void vg::Genotyper::edge_allele_labels | ( | const VG & | graph, |
const Snarl * | snarl, | ||
const vector< list< NodeTraversal >> & | superbubble_paths, | ||
unordered_map< pair< NodeTraversal, NodeTraversal >, unordered_set< size_t >, hash_oriented_edge > * | out_edge_allele_sets | ||
) |
Locus vg::Genotyper::genotype_snarl | ( | VG & | graph, |
const Snarl * | snarl, | ||
const vector< SnarlTraversal > & | superbubble_paths, | ||
const map< const Alignment *, vector< Affinity >> & | affinities | ||
) |
Compute annotated genotype from affinities and superbubble paths. Needs access to the graph so it can chop up the alignments, which requires node sizes.
map< const Alignment *, vector< Genotyper::Affinity > > vg::Genotyper::get_affinities | ( | AugmentedGraph & | aug, |
const map< string, const Alignment * > & | reads_by_name, | ||
const Snarl * | snarl, | ||
const pair< unordered_set< id_t >, unordered_set< edge_t > > & | contents, | ||
const SnarlManager & | manager, | ||
const vector< SnarlTraversal > & | superbubble_paths | ||
) |
Get the affinity of all the reads relevant to the superbubble to all the paths through the superbubble.
Affinity is a double out of 1.0. Higher is better.
map< const Alignment *, vector< Genotyper::Affinity > > vg::Genotyper::get_affinities_fast | ( | AugmentedGraph & | aug, |
const map< string, const Alignment * > & | reads_by_name, | ||
const Snarl * | snarl, | ||
const pair< unordered_set< id_t >, unordered_set< edge_t > > & | contents, | ||
const SnarlManager & | manager, | ||
const vector< SnarlTraversal > & | superbubble_paths, | ||
bool | allow_internal_alignments = false |
||
) |
Get affinities as above but using only string comparison instead of alignment. Affinities are 0 for mismatch and 1 for a perfect match.
double vg::Genotyper::get_genotype_log_likelihood | ( | VG & | graph, |
const Snarl * | snarl, | ||
const vector< int > & | genotype, | ||
const vector< pair< const Alignment *, vector< Affinity >>> & | alignment_consistency | ||
) |
Compute the probability of the observed alignments given the genotype.
Takes a genotype as a vector of allele numbers, and support data as a collection of pairs of Alignments and vectors of bools marking whether each alignment is consistent with each allele.
Alignments should have had their quality values trimmed down to just the part covering the superbubble.
Returns a natural log likelihood.
double vg::Genotyper::get_genotype_log_prior | ( | const vector< int > & | genotype | ) |
Compute the prior probability of the given genotype.
Takes a genotype as a vector of allele numbers. It is not guaranteed that allele 0 corresponds to any notion of primary reference-ness.
Returns a natural log prior probability.
TODO: add in strand bias
string vg::Genotyper::get_qualities_in_snarl | ( | VG & | graph, |
const Snarl * | snarl, | ||
const Alignment & | alignment | ||
) |
Get all the quality values in the alignment between the start and end nodes of a snarl. Handles alignments that enter the snarl from the end, and alignments that never make it through the snarl.
If we run out of qualities, or qualities aren't present, returns no qualities.
If an alignment goes through the snarl multipe times, we get all the qualities from when it is in the snarl.
Does not return qualities on the start and end nodes. May return an empty string.
pair< pair< int64_t, int64_t >, bool > vg::Genotyper::get_snarl_reference_bounds | ( | const Snarl * | snarl, |
const PathIndex & | index, | ||
const HandleGraph * | graph | ||
) |
Utility function for getting the reference bounds (start and past-end) of a snarl with relation to a given reference index in the given graph. Computes bounds of the variable region, not including the fixed start and end node lengths. Also returns whether the reference path goes through the snarl forwards (false) or backwards (true).
vector< SnarlTraversal > vg::Genotyper::get_snarl_traversals | ( | AugmentedGraph & | augmented_graph, |
SnarlManager & | manager, | ||
map< string, const Alignment * > & | reads_by_name, | ||
const Snarl * | snarl, | ||
const pair< unordered_set< id_t >, unordered_set< edge_t > > & | contents, | ||
PathIndex * | reference_index, | ||
TraversalAlg | use_traversal_alg | ||
) |
Get traversals of a snarl in one of several ways.
|
static |
Check if a snarl is small enough to be covered by reads (very conservative)
vector< vcflib::Variant > vg::Genotyper::locus_to_variant | ( | VG & | graph, |
const Snarl * | snarl, | ||
const pair< unordered_set< id_t >, unordered_set< edge_t > > & | contents, | ||
const SnarlManager & | manager, | ||
const PathIndex & | index, | ||
vcflib::VariantCallFile & | vcf, | ||
const Locus & | locus, | ||
const string & | sample_name = "SAMPLE" |
||
) |
Make a VCFlib variant from a called Locus. Depends on an index of the reference path we want to call against.
Returns 0 or more variants we can articulate from the superbubble. Sometimes if we can't make a variant for the superbubble against the reference path, we'll emit 0 variants.
VG vg::Genotyper::make_subset_graph | ( | VG & | graph, |
const string & | ref_path_name, | ||
map< string, const Alignment * > & | reads_by_name | ||
) |
Subset the graph
|
static |
Check if a mapping corresponds to the beginning or end of snarl by making sure it crosses the given side in the expected direction. The handle should be forward for the left side and reverse for the right side.
|
static |
void vg::Genotyper::print_statistics | ( | ostream & | out | ) |
Print snarl statistics to the given stream.
void vg::Genotyper::report_affinities | ( | map< const Alignment *, vector< Genotyper::Affinity >> & | affinities, |
vector< SnarlTraversal > & | paths, | ||
VG & | graph | ||
) |
Print some information about affinities
void vg::Genotyper::report_snarl | ( | const Snarl * | snarl, |
const SnarlManager & | manager, | ||
const PathIndex * | index, | ||
VG & | graph, | ||
PathIndex * | reference_index | ||
) |
Tell the statistics tracking code that a snarl exists. We can do things like count up the snarl length in the reference and so on. Called only once per snarl, but may be called on multiple threads simultaneously.
void vg::Genotyper::report_snarl_traversal | ( | const Snarl * | snarl, |
const SnarlManager & | manager, | ||
VG & | graph | ||
) |
Tell the statistics tracking code that a read traverses a snarl completely. May be called multiple times for a given read and snarl, and may be called in parallel.
void vg::Genotyper::run | ( | AugmentedGraph & | graph, |
ostream & | out, | ||
string | ref_path_name, | ||
string | contig_name = "" , |
||
string | sample_name = "" , |
||
string | augmented_file_name = "" , |
||
bool | output_vcf = false , |
||
bool | output_json = false , |
||
int | length_override = 0 , |
||
int | variant_offset = 0 |
||
) |
Process and write output. Alignments must be embedded in the AugmentedGraph.
vcflib::VariantCallFile * vg::Genotyper::start_vcf | ( | std::ostream & | stream, |
const PathIndex & | index, | ||
const string & | sample_name, | ||
const string & | contig_name, | ||
size_t | contig_size | ||
) |
Start VCF output to a stream. Returns a VCFlib VariantCallFile that needs to be deleted.
void vg::Genotyper::write_variant | ( | ) |
Write the variant as output
void vg::Genotyper::write_vcf_header | ( | std::ostream & | stream, |
const std::string & | sample_name, | ||
const std::string & | contig_name, | ||
size_t | contig_size | ||
) |
Make a VCF header
map<TraversalAlg, string> vg::Genotyper::alg2name |
unordered_set<const Snarl*> vg::Genotyper::all_snarls |
int vg::Genotyper::dagify_steps = 1 |
int vg::Genotyper::default_sequence_quality = 15 |
double vg::Genotyper::deleted_prior_logprob = prob_to_logprob(0.1) |
double vg::Genotyper::diploid_prior_logprob = prob_to_logprob(0.999) |
double vg::Genotyper::haploid_prior_logprob = prob_to_logprob(0.5) |
double vg::Genotyper::het_prior_logprob = prob_to_logprob(0.1) |
double vg::Genotyper::max_het_bias = 4 |
size_t vg::Genotyper::max_path_search_steps = 100 |
int vg::Genotyper::min_recurrence = 2 |
double vg::Genotyper::min_score_per_base = 0.90 |
int vg::Genotyper::min_unique_per_strand = 2 |
vector<Aligner> vg::Genotyper::normal_aligners |
double vg::Genotyper::polyploid_prior_success_logprob = prob_to_logprob(0.5) |
vector<QualAdjAligner> vg::Genotyper::quality_aligners |
bool vg::Genotyper::realign_indels = false |
bool vg::Genotyper::show_progress = false |
unordered_map<size_t, size_t> vg::Genotyper::snarl_reference_length_histogram |
unordered_set<const Snarl*> vg::Genotyper::snarl_traversals |
Translator vg::Genotyper::translator |
TraversalAlg vg::Genotyper::traversal_alg = TraversalAlg::Reads |
int vg::Genotyper::unfold_max_length = 200 |
bool vg::Genotyper::use_mapq = false |