vg
tools for working with variation graphs
|
#include <mapper.hpp>
Public Member Functions | |
BaseMapper (PathPositionHandleGraph *xidex, gcsa::GCSA *g, gcsa::LCPArray *a, haplo::ScoreProvider *haplo_score_provider=nullptr) | |
BaseMapper (void) | |
int | random_match_length (double chance_random) |
void | set_alignment_scores (int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus, double haplotype_consistency_exponent=1) |
Override alignment score setting to support haplotype consistency exponent. More... | |
void | set_alignment_scores (istream &matrix_stream, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus, double haplotype_consistency_exponent=1) |
Same, but loading a 4x4 substitution score matrix from a stream. More... | |
vector< MaximalExactMatch > | find_mems_deep (string::const_iterator seq_begin, string::const_iterator seq_end, double &lcp_avg, double &fraction_filtered, int max_mem_length=0, int min_mem_length=1, int reseed_length=0, bool use_lcp_reseed_heuristic=false, bool use_diff_based_fast_reseed=false, bool include_parent_in_sub_mem_count=false, bool record_max_lcp=false, int reseed_below_count=0) |
vector< MaximalExactMatch > | find_mems_simple (string::const_iterator seq_begin, string::const_iterator seq_end, int max_mem_length=0, int min_mem_length=1, int reseed_length=0) |
vector< MaximalExactMatch > | find_stripped_matches (string::const_iterator seq_begin, string::const_iterator seq_end, size_t strip_length, size_t max_match_length, size_t target_count) |
vector< MaximalExactMatch > | find_fanout_mems (string::const_iterator seq_begin, string::const_iterator seq_end, string::const_iterator qual_begin, int max_fans_out, char max_fanout_base_quality, vector< deque< pair< string::const_iterator, char >>> *mem_fanout_breaks=nullptr) |
vector< pos_t > | walk_fanout_path (string::const_iterator begin, string::const_iterator end, const deque< pair< string::const_iterator, char >> &fanout_breaks, gcsa::node_type pos) |
void | rescue_high_count_order_length_mems (vector< MaximalExactMatch > &mems, size_t max_rescue_hit_count) |
void | precollapse_order_length_runs (string::const_iterator seq_begin, vector< MaximalExactMatch > &mems) |
void | prefilter_redundant_sub_mems (vector< MaximalExactMatch > &mems, vector< pair< int, vector< size_t >>> &sub_mem_containment_graph) |
void | find_sub_mems (const vector< MaximalExactMatch > &mems, int parent_layer_begin, int parent_layer_end, int mem_idx, string::const_iterator next_mem_end, int min_mem_length, vector< pair< MaximalExactMatch, vector< size_t >>> &sub_mems_out) |
void | find_sub_mems_fast (const vector< MaximalExactMatch > &mems, int parent_layer_begin, int parent_layer_end, int mem_idx, string::const_iterator leftmost_guaranteed_disjoint_bound, string::const_iterator leftmost_seeding_bound, int min_sub_mem_length, vector< pair< MaximalExactMatch, vector< size_t >>> &sub_mems_out) |
gcsa::range_type | accelerate_mem_query (string::const_iterator begin, string::const_iterator &cursor) const |
set< pos_t > | sequence_positions (const string &seq) |
size_t | get_adaptive_min_reseed_length (size_t parent_mem_length) |
void | apply_haplotype_consistency_scores (const vector< Alignment * > &alns) |
Public Member Functions inherited from vg::AlignerClient | |
virtual void | set_alignment_scores (int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) |
Set all the aligner scoring parameters and create the stored aligner instances. More... | |
virtual void | set_alignment_scores (std::istream &matrix_stream, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) |
virtual void | set_alignment_scores (const int8_t *score_matrix, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) |
Public Member Functions inherited from vg::PairedEndMapper | |
void | set_fragment_length_distr_params (size_t maximum_sample_size=1000, size_t reestimation_frequency=1000, double robust_estimation_fraction=0.95) |
bool | has_fixed_fragment_length_distr () |
Returns true if fragment length distribution has been fixed. More... | |
void | force_fragment_length_distr (double mean, double stddev) |
Static Public Member Functions | |
static double | estimate_gc_content (const gcsa::GCSA *gcsa) |
Static Public Member Functions inherited from vg::AlignerClient | |
static int8_t * | parse_matrix (std::istream &matrix_stream) |
Allocates an array to hold a 4x4 substitution matrix and returns it. More... | |
Static Public Attributes | |
static thread_local vector< size_t > | adaptive_reseed_length_memo |
Additional Inherited Members | |
Protected Member Functions inherited from vg::AlignerClient | |
AlignerClient (double gc_content_estimate=vg::default_gc_content) | |
const GSSWAligner * | get_aligner (bool have_qualities=true) const |
const QualAdjAligner * | get_qual_adj_aligner () const |
const Aligner * | get_regular_aligner () const |
Protected Attributes inherited from vg::PairedEndMapper | |
FragmentLengthDistribution | fragment_length_distr |
Holds the actual fragment length distribution and estimation information. More... | |
Base class for basic mapping functionality shared between the Mapper, MultipathMapper, etc. Handles holding on to the random access and text indexes needed for mapping operations.
vg::BaseMapper::BaseMapper | ( | PathPositionHandleGraph * | xidex, |
gcsa::GCSA * | g, | ||
gcsa::LCPArray * | a, | ||
haplo::ScoreProvider * | haplo_score_provider = nullptr |
||
) |
Make a BaseMapper that pulls from a PathPositionHandleGraph succinct graph and a GCSA2 kmer index + LCP array, and which can score reads against haplotypes using the given ScoreProvider.
If the GCSA and LCPArray are null, cannot do search, only alignment.
vg::BaseMapper::BaseMapper | ( | void | ) |
gcsa::range_type vg::BaseMapper::accelerate_mem_query | ( | string::const_iterator | begin, |
string::const_iterator & | cursor | ||
) | const |
If possible, use the MEMAccelerator to get the initial range for a MEM and update the cursor accordingly. If this is not possible, return the full GCSA2 range and leave the cursor unaltered.
void vg::BaseMapper::apply_haplotype_consistency_scores | ( | const vector< Alignment * > & | alns | ) |
Score all of the alignments in the vector for haplotype consistency. If all of them can be scored (i.e. none of them visit nodes/edges with no haplotypes), adjust all of their scores to reflect haplotype consistency. If one or more cannot be scored for haplotype consistency, leave the alignment scores alone.
static double vg::BaseMapper::estimate_gc_content | ( | const gcsa::GCSA * | gcsa | ) |
We need to be able to estimate the GC content from the GCSA index in the constructor. The given index may be null.
vector< MaximalExactMatch > vg::BaseMapper::find_fanout_mems | ( | string::const_iterator | seq_begin, |
string::const_iterator | seq_end, | ||
string::const_iterator | qual_begin, | ||
int | max_fans_out, | ||
char | max_fanout_base_quality, | ||
vector< deque< pair< string::const_iterator, char >>> * | mem_fanout_breaks = nullptr |
||
) |
vector< MaximalExactMatch > vg::BaseMapper::find_mems_deep | ( | string::const_iterator | seq_begin, |
string::const_iterator | seq_end, | ||
double & | lcp_avg, | ||
double & | fraction_filtered, | ||
int | max_mem_length = 0 , |
||
int | min_mem_length = 1 , |
||
int | reseed_length = 0 , |
||
bool | use_lcp_reseed_heuristic = false , |
||
bool | use_diff_based_fast_reseed = false , |
||
bool | include_parent_in_sub_mem_count = false , |
||
bool | record_max_lcp = false , |
||
int | reseed_below_count = 0 |
||
) |
vector< MaximalExactMatch > vg::BaseMapper::find_mems_simple | ( | string::const_iterator | seq_begin, |
string::const_iterator | seq_end, | ||
int | max_mem_length = 0 , |
||
int | min_mem_length = 1 , |
||
int | reseed_length = 0 |
||
) |
vector< MaximalExactMatch > vg::BaseMapper::find_stripped_matches | ( | string::const_iterator | seq_begin, |
string::const_iterator | seq_end, | ||
size_t | strip_length, | ||
size_t | max_match_length, | ||
size_t | target_count | ||
) |
void vg::BaseMapper::find_sub_mems | ( | const vector< MaximalExactMatch > & | mems, |
int | parent_layer_begin, | ||
int | parent_layer_end, | ||
int | mem_idx, | ||
string::const_iterator | next_mem_end, | ||
int | min_mem_length, | ||
vector< pair< MaximalExactMatch, vector< size_t >>> & | sub_mems_out | ||
) |
Locate the sub-MEMs contained in the last MEM of the mems vector that have ending positions before the end of the next SMEM, and label each of the sub-MEMs with the indices of all of the SMEMs that contain it.
void vg::BaseMapper::find_sub_mems_fast | ( | const vector< MaximalExactMatch > & | mems, |
int | parent_layer_begin, | ||
int | parent_layer_end, | ||
int | mem_idx, | ||
string::const_iterator | leftmost_guaranteed_disjoint_bound, | ||
string::const_iterator | leftmost_seeding_bound, | ||
int | min_sub_mem_length, | ||
vector< pair< MaximalExactMatch, vector< size_t >>> & | sub_mems_out | ||
) |
Provides the same semantics as find_sub_mems but with a different algorithm. This algorithm uses the min_mem_length as a pruning tool instead of the LCP index. It can be expected to be faster when the min_mem_length is reasonably large relative to the reseed_length (e.g. 1/2 of SMEM size or similar).
size_t vg::BaseMapper::get_adaptive_min_reseed_length | ( | size_t | parent_mem_length | ) |
void vg::BaseMapper::precollapse_order_length_runs | ( | string::const_iterator | seq_begin, |
vector< MaximalExactMatch > & | mems | ||
) |
Identifies hits for order-length MEMs that are actually part of longer MEMs above the GCSA's limit and merges them. For speed's sake, this can have false negatives but no false positives.
void vg::BaseMapper::prefilter_redundant_sub_mems | ( | vector< MaximalExactMatch > & | mems, |
vector< pair< int, vector< size_t >>> & | sub_mem_containment_graph | ||
) |
Identifies hits for sub-MEMs that are redundant hits to the parent MEMs and removes them from the hit lists. For speed's sake, this can have false negatives but no false positives.
int vg::BaseMapper::random_match_length | ( | double | chance_random | ) |
void vg::BaseMapper::rescue_high_count_order_length_mems | ( | vector< MaximalExactMatch > & | mems, |
size_t | max_rescue_hit_count | ||
) |
Identifies tracts of order-length MEMs that were unfilled because their hit count was above the max and fills one MEM in the tract (the one with the smallest hit count). Assumes MEMs are lexicographically ordered by read index.
set<pos_t> vg::BaseMapper::sequence_positions | ( | const string & | seq | ) |
void vg::BaseMapper::set_alignment_scores | ( | int8_t | match, |
int8_t | mismatch, | ||
int8_t | gap_open, | ||
int8_t | gap_extend, | ||
int8_t | full_length_bonus, | ||
double | haplotype_consistency_exponent = 1 |
||
) |
Override alignment score setting to support haplotype consistency exponent.
void vg::BaseMapper::set_alignment_scores | ( | istream & | matrix_stream, |
int8_t | gap_open, | ||
int8_t | gap_extend, | ||
int8_t | full_length_bonus, | ||
double | haplotype_consistency_exponent = 1 |
||
) |
Same, but loading a 4x4 substitution score matrix from a stream.
vector< pos_t > vg::BaseMapper::walk_fanout_path | ( | string::const_iterator | begin, |
string::const_iterator | end, | ||
const deque< pair< string::const_iterator, char >> & | fanout_breaks, | ||
gcsa::node_type | pos | ||
) |
MEMAccelerator* vg::BaseMapper::accelerator = nullptr |
double vg::BaseMapper::adaptive_diff_exponent = 0.065 |
bool vg::BaseMapper::adaptive_reseed_diff = true |
static thread_local vector< size_t > vg::BaseMapper::adaptive_reseed_length_memo |
bool vg::BaseMapper::assume_acyclic |
double vg::BaseMapper::avg_node_length = 0 |
bool vg::BaseMapper::debug = false |
Set to enable debugging messages to cerr from the mapper, so a user can understand why a read maps the way it does.
bool vg::BaseMapper::exclude_unaligned = false |
int vg::BaseMapper::fanout_length_threshold = 0 |
bool vg::BaseMapper::fast_reseed = true |
double vg::BaseMapper::fast_reseed_length_diff = 0.45 |
bool vg::BaseMapper::filter_short_mems = false |
gcsa::GCSA* vg::BaseMapper::gcsa = nullptr |
bool vg::BaseMapper::greedy_restart_assume_substitution = false |
int vg::BaseMapper::greedy_restart_max_count = 2 |
int vg::BaseMapper::greedy_restart_max_lcp = 0 |
int vg::BaseMapper::greedy_restart_min_length = 40 |
haplo::ScoreProvider* vg::BaseMapper::haplo_score_provider = nullptr |
double vg::BaseMapper::haplotype_consistency_exponent = 1 |
int vg::BaseMapper::hard_hit_max = 0 |
int vg::BaseMapper::hit_max = 0 |
gcsa::LCPArray* vg::BaseMapper::lcp = nullptr |
MappingQualityMethod vg::BaseMapper::mapping_quality_method |
int vg::BaseMapper::max_mapping_quality |
int vg::BaseMapper::max_sub_mem_recursion_depth = 2 |
int vg::BaseMapper::mem_reseed_length |
int vg::BaseMapper::min_mem_length |
bool vg::BaseMapper::precollapse_order_length_hits = true |
bool vg::BaseMapper::prefilter_redundant_hits = true |
double vg::BaseMapper::recombination_penalty = 20.7 |
double vg::BaseMapper::short_mem_filter_factor = 0.45 |
bool vg::BaseMapper::strip_bonuses |
int vg::BaseMapper::sub_mem_count_thinning = 4 |
int vg::BaseMapper::sub_mem_thinning_burn_in = 16 |
size_t vg::BaseMapper::total_seq_length = 0 |
int vg::BaseMapper::unpaired_penalty = 17 |
bool vg::BaseMapper::use_approx_sub_mem_count = false |
bool vg::BaseMapper::use_greedy_mem_restarts = false |
PathPositionHandleGraph* vg::BaseMapper::xindex = nullptr |