vg
tools for working with variation graphs
Classes | Public Types | Public Member Functions | Static Public Member Functions | Public Attributes | List of all members
vg::Genotyper Class Reference

#include <genotyper.hpp>

Classes

struct  Affinity
 
struct  hash_ambiguous_allele_set
 
struct  hash_node_traversal
 
struct  hash_oriented_edge
 

Public Types

enum  TraversalAlg { Reads, Exhaustive, Representative, Adaptive }
 

Public Member Functions

void run (AugmentedGraph &graph, ostream &out, string ref_path_name, string contig_name="", string sample_name="", string augmented_file_name="", bool output_vcf=false, bool output_json=false, int length_override=0, int variant_offset=0)
 
int alignment_qual_score (VG &graph, const Snarl *snarl, const Alignment &alignment)
 
vector< SnarlTraversalget_snarl_traversals (AugmentedGraph &augmented_graph, SnarlManager &manager, map< string, const Alignment * > &reads_by_name, const Snarl *snarl, const pair< unordered_set< id_t >, unordered_set< edge_t > > &contents, PathIndex *reference_index, TraversalAlg use_traversal_alg)
 
string get_qualities_in_snarl (VG &graph, const Snarl *snarl, const Alignment &alignment)
 
map< const Alignment *, vector< Affinity > > get_affinities (AugmentedGraph &aug, const map< string, const Alignment * > &reads_by_name, const Snarl *snarl, const pair< unordered_set< id_t >, unordered_set< edge_t > > &contents, const SnarlManager &manager, const vector< SnarlTraversal > &superbubble_paths)
 
map< const Alignment *, vector< Affinity > > get_affinities_fast (AugmentedGraph &aug, const map< string, const Alignment * > &reads_by_name, const Snarl *snarl, const pair< unordered_set< id_t >, unordered_set< edge_t > > &contents, const SnarlManager &manager, const vector< SnarlTraversal > &superbubble_paths, bool allow_internal_alignments=false)
 
Locus genotype_snarl (VG &graph, const Snarl *snarl, const vector< SnarlTraversal > &superbubble_paths, const map< const Alignment *, vector< Affinity >> &affinities)
 
double get_genotype_log_likelihood (VG &graph, const Snarl *snarl, const vector< int > &genotype, const vector< pair< const Alignment *, vector< Affinity >>> &alignment_consistency)
 
double get_genotype_log_prior (const vector< int > &genotype)
 
vector< vcflib::Variant > locus_to_variant (VG &graph, const Snarl *snarl, const pair< unordered_set< id_t >, unordered_set< edge_t > > &contents, const SnarlManager &manager, const PathIndex &index, vcflib::VariantCallFile &vcf, const Locus &locus, const string &sample_name="SAMPLE")
 
void write_vcf_header (std::ostream &stream, const std::string &sample_name, const std::string &contig_name, size_t contig_size)
 
vcflib::VariantCallFile * start_vcf (std::ostream &stream, const PathIndex &index, const string &sample_name, const string &contig_name, size_t contig_size)
 
pair< pair< int64_t, int64_t >, bool > get_snarl_reference_bounds (const Snarl *snarl, const PathIndex &index, const HandleGraph *graph)
 
void report_snarl (const Snarl *snarl, const SnarlManager &manager, const PathIndex *index, VG &graph, PathIndex *reference_index)
 
void report_snarl_traversal (const Snarl *snarl, const SnarlManager &manager, VG &graph)
 
void report_affinities (map< const Alignment *, vector< Genotyper::Affinity >> &affinities, vector< SnarlTraversal > &paths, VG &graph)
 
void write_variant ()
 
void print_statistics (ostream &out)
 
VG make_subset_graph (VG &graph, const string &ref_path_name, map< string, const Alignment * > &reads_by_name)
 
void edge_allele_labels (const VG &graph, const Snarl *snarl, const vector< list< NodeTraversal >> &superbubble_paths, unordered_map< pair< NodeTraversal, NodeTraversal >, unordered_set< size_t >, hash_oriented_edge > *out_edge_allele_sets)
 
void allele_ambiguity_log_probs (const VG &graph, const Snarl *snarl, const vector< list< NodeTraversal >> &superbubble_paths, const unordered_map< pair< NodeTraversal, NodeTraversal >, unordered_set< size_t >, hash_oriented_edge > &edge_allele_sets, vector< unordered_map< vector< size_t >, double, hash_ambiguous_allele_set >> *out_allele_ambiguity_probs)
 

Static Public Member Functions

static bool mapping_enters_side (const Mapping &mapping, const handle_t &side, const HandleGraph *graph)
 
static bool mapping_exits_side (const Mapping &mapping, const handle_t &side, const HandleGraph *graph)
 
static bool is_snarl_smaller_than_reads (AugmentedGraph &augmented_graph, const Snarl *snarl, const pair< unordered_set< id_t >, unordered_set< edge_t > > &contents, map< string, const Alignment * > &reads_by_name)
 

Public Attributes

size_t max_path_search_steps = 100
 
int unfold_max_length = 200
 
int dagify_steps = 1
 
double max_het_bias = 4
 
bool use_mapq = false
 
bool realign_indels = false
 
int default_sequence_quality = 15
 
int min_recurrence = 2
 
int min_unique_per_strand = 2
 
double min_score_per_base = 0.90
 
double diploid_prior_logprob = prob_to_logprob(0.999)
 
double het_prior_logprob = prob_to_logprob(0.1)
 
double haploid_prior_logprob = prob_to_logprob(0.5)
 
double deleted_prior_logprob = prob_to_logprob(0.1)
 
double polyploid_prior_success_logprob = prob_to_logprob(0.5)
 
Translator translator
 
unordered_map< size_t, size_t > snarl_reference_length_histogram
 
unordered_set< const Snarl * > snarl_traversals
 
unordered_set< const Snarl * > all_snarls
 
vector< Alignernormal_aligners
 
vector< QualAdjAlignerquality_aligners
 
map< TraversalAlg, string > alg2name
 
TraversalAlg traversal_alg = TraversalAlg::Reads
 
bool show_progress = false
 

Detailed Description

Class to hold on to genotyping parameters and genotyping functions.

Member Enumeration Documentation

◆ TraversalAlg

Enumerator
Reads 
Exhaustive 
Representative 
Adaptive 

Member Function Documentation

◆ alignment_qual_score()

int vg::Genotyper::alignment_qual_score ( VG graph,
const Snarl snarl,
const Alignment alignment 
)

Given an Alignment and a Snarl, compute a phred score for the quality of the alignment's bases within the snarl overall (not counting the start and end nodes), which is supposed to be interpretable as the probability that the call of the sequence is wrong (to the degree that it would no longer support the alleles it appears to support).

In practice we're just going to average the quality scores for all the bases interior to the snarl (i.e. not counting the start and end nodes).

If the alignment doesn't have base qualities, or no qualities are available for bases internal to the snarl, returns a default value.

◆ allele_ambiguity_log_probs()

void vg::Genotyper::allele_ambiguity_log_probs ( const VG graph,
const Snarl snarl,
const vector< list< NodeTraversal >> &  superbubble_paths,
const unordered_map< pair< NodeTraversal, NodeTraversal >, unordered_set< size_t >, hash_oriented_edge > &  edge_allele_sets,
vector< unordered_map< vector< size_t >, double, hash_ambiguous_allele_set >> *  out_allele_ambiguity_probs 
)

◆ edge_allele_labels()

void vg::Genotyper::edge_allele_labels ( const VG graph,
const Snarl snarl,
const vector< list< NodeTraversal >> &  superbubble_paths,
unordered_map< pair< NodeTraversal, NodeTraversal >, unordered_set< size_t >, hash_oriented_edge > *  out_edge_allele_sets 
)

◆ genotype_snarl()

Locus vg::Genotyper::genotype_snarl ( VG graph,
const Snarl snarl,
const vector< SnarlTraversal > &  superbubble_paths,
const map< const Alignment *, vector< Affinity >> &  affinities 
)

Compute annotated genotype from affinities and superbubble paths. Needs access to the graph so it can chop up the alignments, which requires node sizes.

◆ get_affinities()

map< const Alignment *, vector< Genotyper::Affinity > > vg::Genotyper::get_affinities ( AugmentedGraph aug,
const map< string, const Alignment * > &  reads_by_name,
const Snarl snarl,
const pair< unordered_set< id_t >, unordered_set< edge_t > > &  contents,
const SnarlManager manager,
const vector< SnarlTraversal > &  superbubble_paths 
)

Get the affinity of all the reads relevant to the superbubble to all the paths through the superbubble.

Affinity is a double out of 1.0. Higher is better.

◆ get_affinities_fast()

map< const Alignment *, vector< Genotyper::Affinity > > vg::Genotyper::get_affinities_fast ( AugmentedGraph aug,
const map< string, const Alignment * > &  reads_by_name,
const Snarl snarl,
const pair< unordered_set< id_t >, unordered_set< edge_t > > &  contents,
const SnarlManager manager,
const vector< SnarlTraversal > &  superbubble_paths,
bool  allow_internal_alignments = false 
)

Get affinities as above but using only string comparison instead of alignment. Affinities are 0 for mismatch and 1 for a perfect match.

◆ get_genotype_log_likelihood()

double vg::Genotyper::get_genotype_log_likelihood ( VG graph,
const Snarl snarl,
const vector< int > &  genotype,
const vector< pair< const Alignment *, vector< Affinity >>> &  alignment_consistency 
)

Compute the probability of the observed alignments given the genotype.

Takes a genotype as a vector of allele numbers, and support data as a collection of pairs of Alignments and vectors of bools marking whether each alignment is consistent with each allele.

Alignments should have had their quality values trimmed down to just the part covering the superbubble.

Returns a natural log likelihood.

◆ get_genotype_log_prior()

double vg::Genotyper::get_genotype_log_prior ( const vector< int > &  genotype)

Compute the prior probability of the given genotype.

Takes a genotype as a vector of allele numbers. It is not guaranteed that allele 0 corresponds to any notion of primary reference-ness.

Returns a natural log prior probability.

TODO: add in strand bias

◆ get_qualities_in_snarl()

string vg::Genotyper::get_qualities_in_snarl ( VG graph,
const Snarl snarl,
const Alignment alignment 
)

Get all the quality values in the alignment between the start and end nodes of a snarl. Handles alignments that enter the snarl from the end, and alignments that never make it through the snarl.

If we run out of qualities, or qualities aren't present, returns no qualities.

If an alignment goes through the snarl multipe times, we get all the qualities from when it is in the snarl.

Does not return qualities on the start and end nodes. May return an empty string.

◆ get_snarl_reference_bounds()

pair< pair< int64_t, int64_t >, bool > vg::Genotyper::get_snarl_reference_bounds ( const Snarl snarl,
const PathIndex index,
const HandleGraph graph 
)

Utility function for getting the reference bounds (start and past-end) of a snarl with relation to a given reference index in the given graph. Computes bounds of the variable region, not including the fixed start and end node lengths. Also returns whether the reference path goes through the snarl forwards (false) or backwards (true).

◆ get_snarl_traversals()

vector< SnarlTraversal > vg::Genotyper::get_snarl_traversals ( AugmentedGraph augmented_graph,
SnarlManager manager,
map< string, const Alignment * > &  reads_by_name,
const Snarl snarl,
const pair< unordered_set< id_t >, unordered_set< edge_t > > &  contents,
PathIndex reference_index,
TraversalAlg  use_traversal_alg 
)

Get traversals of a snarl in one of several ways.

◆ is_snarl_smaller_than_reads()

bool vg::Genotyper::is_snarl_smaller_than_reads ( AugmentedGraph augmented_graph,
const Snarl snarl,
const pair< unordered_set< id_t >, unordered_set< edge_t > > &  contents,
map< string, const Alignment * > &  reads_by_name 
)
static

Check if a snarl is small enough to be covered by reads (very conservative)

◆ locus_to_variant()

vector< vcflib::Variant > vg::Genotyper::locus_to_variant ( VG graph,
const Snarl snarl,
const pair< unordered_set< id_t >, unordered_set< edge_t > > &  contents,
const SnarlManager manager,
const PathIndex index,
vcflib::VariantCallFile &  vcf,
const Locus locus,
const string &  sample_name = "SAMPLE" 
)

Make a VCFlib variant from a called Locus. Depends on an index of the reference path we want to call against.

Returns 0 or more variants we can articulate from the superbubble. Sometimes if we can't make a variant for the superbubble against the reference path, we'll emit 0 variants.

◆ make_subset_graph()

VG vg::Genotyper::make_subset_graph ( VG graph,
const string &  ref_path_name,
map< string, const Alignment * > &  reads_by_name 
)

Subset the graph

◆ mapping_enters_side()

static bool vg::Genotyper::mapping_enters_side ( const Mapping mapping,
const handle_t side,
const HandleGraph graph 
)
static

Check if a mapping corresponds to the beginning or end of snarl by making sure it crosses the given side in the expected direction. The handle should be forward for the left side and reverse for the right side.

◆ mapping_exits_side()

static bool vg::Genotyper::mapping_exits_side ( const Mapping mapping,
const handle_t side,
const HandleGraph graph 
)
static

◆ print_statistics()

void vg::Genotyper::print_statistics ( ostream &  out)

Print snarl statistics to the given stream.

◆ report_affinities()

void vg::Genotyper::report_affinities ( map< const Alignment *, vector< Genotyper::Affinity >> &  affinities,
vector< SnarlTraversal > &  paths,
VG graph 
)

Print some information about affinities

◆ report_snarl()

void vg::Genotyper::report_snarl ( const Snarl snarl,
const SnarlManager manager,
const PathIndex index,
VG graph,
PathIndex reference_index 
)

Tell the statistics tracking code that a snarl exists. We can do things like count up the snarl length in the reference and so on. Called only once per snarl, but may be called on multiple threads simultaneously.

◆ report_snarl_traversal()

void vg::Genotyper::report_snarl_traversal ( const Snarl snarl,
const SnarlManager manager,
VG graph 
)

Tell the statistics tracking code that a read traverses a snarl completely. May be called multiple times for a given read and snarl, and may be called in parallel.

◆ run()

void vg::Genotyper::run ( AugmentedGraph graph,
ostream &  out,
string  ref_path_name,
string  contig_name = "",
string  sample_name = "",
string  augmented_file_name = "",
bool  output_vcf = false,
bool  output_json = false,
int  length_override = 0,
int  variant_offset = 0 
)

Process and write output. Alignments must be embedded in the AugmentedGraph.

◆ start_vcf()

vcflib::VariantCallFile * vg::Genotyper::start_vcf ( std::ostream &  stream,
const PathIndex index,
const string &  sample_name,
const string &  contig_name,
size_t  contig_size 
)

Start VCF output to a stream. Returns a VCFlib VariantCallFile that needs to be deleted.

◆ write_variant()

void vg::Genotyper::write_variant ( )

Write the variant as output

◆ write_vcf_header()

void vg::Genotyper::write_vcf_header ( std::ostream &  stream,
const std::string &  sample_name,
const std::string &  contig_name,
size_t  contig_size 
)

Make a VCF header

Member Data Documentation

◆ alg2name

map<TraversalAlg, string> vg::Genotyper::alg2name
Initial value:
= {
{Reads, "Reads"},
{Exhaustive, "Exhaustive"},
{Representative, "Representative"},
{Adaptive, "Adaptive"}
}

◆ all_snarls

unordered_set<const Snarl*> vg::Genotyper::all_snarls

◆ dagify_steps

int vg::Genotyper::dagify_steps = 1

◆ default_sequence_quality

int vg::Genotyper::default_sequence_quality = 15

◆ deleted_prior_logprob

double vg::Genotyper::deleted_prior_logprob = prob_to_logprob(0.1)

◆ diploid_prior_logprob

double vg::Genotyper::diploid_prior_logprob = prob_to_logprob(0.999)

◆ haploid_prior_logprob

double vg::Genotyper::haploid_prior_logprob = prob_to_logprob(0.5)

◆ het_prior_logprob

double vg::Genotyper::het_prior_logprob = prob_to_logprob(0.1)

◆ max_het_bias

double vg::Genotyper::max_het_bias = 4

◆ max_path_search_steps

size_t vg::Genotyper::max_path_search_steps = 100

◆ min_recurrence

int vg::Genotyper::min_recurrence = 2

◆ min_score_per_base

double vg::Genotyper::min_score_per_base = 0.90

◆ min_unique_per_strand

int vg::Genotyper::min_unique_per_strand = 2

◆ normal_aligners

vector<Aligner> vg::Genotyper::normal_aligners

◆ polyploid_prior_success_logprob

double vg::Genotyper::polyploid_prior_success_logprob = prob_to_logprob(0.5)

◆ quality_aligners

vector<QualAdjAligner> vg::Genotyper::quality_aligners

◆ realign_indels

bool vg::Genotyper::realign_indels = false

◆ show_progress

bool vg::Genotyper::show_progress = false

◆ snarl_reference_length_histogram

unordered_map<size_t, size_t> vg::Genotyper::snarl_reference_length_histogram

◆ snarl_traversals

unordered_set<const Snarl*> vg::Genotyper::snarl_traversals

◆ translator

Translator vg::Genotyper::translator

◆ traversal_alg

TraversalAlg vg::Genotyper::traversal_alg = TraversalAlg::Reads

◆ unfold_max_length

int vg::Genotyper::unfold_max_length = 200

◆ use_mapq

bool vg::Genotyper::use_mapq = false

The documentation for this class was generated from the following files:
vg::Genotyper::Representative
@ Representative
Definition: genotyper.hpp:140
vg::Genotyper::Adaptive
@ Adaptive
Definition: genotyper.hpp:140
vg::Genotyper::Reads
@ Reads
Definition: genotyper.hpp:140
vg::Genotyper::Exhaustive
@ Exhaustive
Definition: genotyper.hpp:140