vg
tools for working with variation graphs
Public Member Functions | Static Public Member Functions | Public Attributes | Static Public Attributes | List of all members
vg::BaseMapper Class Reference

#include <mapper.hpp>

Inheritance diagram for vg::BaseMapper:
vg::AlignerClient vg::PairedEndMapper vg::Mapper vg::MultipathMapper

Public Member Functions

 BaseMapper (PathPositionHandleGraph *xidex, gcsa::GCSA *g, gcsa::LCPArray *a, haplo::ScoreProvider *haplo_score_provider=nullptr)
 
 BaseMapper (void)
 
int random_match_length (double chance_random)
 
void set_alignment_scores (int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus, double haplotype_consistency_exponent=1)
 Override alignment score setting to support haplotype consistency exponent. More...
 
void set_alignment_scores (istream &matrix_stream, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus, double haplotype_consistency_exponent=1)
 Same, but loading a 4x4 substitution score matrix from a stream. More...
 
vector< MaximalExactMatch > find_mems_deep (string::const_iterator seq_begin, string::const_iterator seq_end, double &lcp_avg, double &fraction_filtered, int max_mem_length=0, int min_mem_length=1, int reseed_length=0, bool use_lcp_reseed_heuristic=false, bool use_diff_based_fast_reseed=false, bool include_parent_in_sub_mem_count=false, bool record_max_lcp=false, int reseed_below_count=0)
 
vector< MaximalExactMatch > find_mems_simple (string::const_iterator seq_begin, string::const_iterator seq_end, int max_mem_length=0, int min_mem_length=1, int reseed_length=0)
 
vector< MaximalExactMatch > find_stripped_matches (string::const_iterator seq_begin, string::const_iterator seq_end, size_t strip_length, size_t max_match_length, size_t target_count)
 
vector< MaximalExactMatch > find_fanout_mems (string::const_iterator seq_begin, string::const_iterator seq_end, string::const_iterator qual_begin, int max_fans_out, char max_fanout_base_quality, vector< deque< pair< string::const_iterator, char >>> *mem_fanout_breaks=nullptr)
 
vector< pos_t > walk_fanout_path (string::const_iterator begin, string::const_iterator end, const deque< pair< string::const_iterator, char >> &fanout_breaks, gcsa::node_type pos)
 
void rescue_high_count_order_length_mems (vector< MaximalExactMatch > &mems, size_t max_rescue_hit_count)
 
void precollapse_order_length_runs (string::const_iterator seq_begin, vector< MaximalExactMatch > &mems)
 
void prefilter_redundant_sub_mems (vector< MaximalExactMatch > &mems, vector< pair< int, vector< size_t >>> &sub_mem_containment_graph)
 
void find_sub_mems (const vector< MaximalExactMatch > &mems, int parent_layer_begin, int parent_layer_end, int mem_idx, string::const_iterator next_mem_end, int min_mem_length, vector< pair< MaximalExactMatch, vector< size_t >>> &sub_mems_out)
 
void find_sub_mems_fast (const vector< MaximalExactMatch > &mems, int parent_layer_begin, int parent_layer_end, int mem_idx, string::const_iterator leftmost_guaranteed_disjoint_bound, string::const_iterator leftmost_seeding_bound, int min_sub_mem_length, vector< pair< MaximalExactMatch, vector< size_t >>> &sub_mems_out)
 
gcsa::range_type accelerate_mem_query (string::const_iterator begin, string::const_iterator &cursor) const
 
set< pos_t > sequence_positions (const string &seq)
 
size_t get_adaptive_min_reseed_length (size_t parent_mem_length)
 
void apply_haplotype_consistency_scores (const vector< Alignment * > &alns)
 
- Public Member Functions inherited from vg::AlignerClient
virtual void set_alignment_scores (int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus)
 Set all the aligner scoring parameters and create the stored aligner instances. More...
 
virtual void set_alignment_scores (std::istream &matrix_stream, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus)
 
virtual void set_alignment_scores (const int8_t *score_matrix, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus)
 
- Public Member Functions inherited from vg::PairedEndMapper
void set_fragment_length_distr_params (size_t maximum_sample_size=1000, size_t reestimation_frequency=1000, double robust_estimation_fraction=0.95)
 
bool has_fixed_fragment_length_distr ()
 Returns true if fragment length distribution has been fixed. More...
 
void force_fragment_length_distr (double mean, double stddev)
 

Static Public Member Functions

static double estimate_gc_content (const gcsa::GCSA *gcsa)
 
- Static Public Member Functions inherited from vg::AlignerClient
static int8_t * parse_matrix (std::istream &matrix_stream)
 Allocates an array to hold a 4x4 substitution matrix and returns it. More...
 

Public Attributes

int sub_mem_thinning_burn_in = 16
 
int sub_mem_count_thinning = 4
 
int min_mem_length
 
int mem_reseed_length
 
bool fast_reseed = true
 
double fast_reseed_length_diff = 0.45
 
bool adaptive_reseed_diff = true
 
double adaptive_diff_exponent = 0.065
 
int hit_max = 0
 
int hard_hit_max = 0
 
bool use_approx_sub_mem_count = false
 
bool prefilter_redundant_hits = true
 
int max_sub_mem_recursion_depth = 2
 
bool use_greedy_mem_restarts = false
 
int greedy_restart_min_length = 40
 
int greedy_restart_max_count = 2
 
int greedy_restart_max_lcp = 0
 
bool greedy_restart_assume_substitution = false
 
bool filter_short_mems = false
 
double short_mem_filter_factor = 0.45
 
int unpaired_penalty = 17
 
bool precollapse_order_length_hits = true
 
double avg_node_length = 0
 
size_t total_seq_length = 0
 
int fanout_length_threshold = 0
 
double recombination_penalty = 20.7
 
bool strip_bonuses
 
bool assume_acyclic
 
MappingQualityMethod mapping_quality_method
 
int max_mapping_quality
 
bool exclude_unaligned = false
 
bool debug = false
 Set to enable debugging messages to cerr from the mapper, so a user can understand why a read maps the way it does. More...
 
PathPositionHandleGraph * xindex = nullptr
 
gcsa::GCSA * gcsa = nullptr
 
gcsa::LCPArray * lcp = nullptr
 
MEMAccelerator * accelerator = nullptr
 
haplo::ScoreProvider * haplo_score_provider = nullptr
 
double haplotype_consistency_exponent = 1
 
- Public Attributes inherited from vg::AlignerClient
bool adjust_alignments_for_base_quality = false
 

Static Public Attributes

static thread_local vector< size_t > adaptive_reseed_length_memo
 

Additional Inherited Members

- Protected Member Functions inherited from vg::AlignerClient
 AlignerClient (double gc_content_estimate=vg::default_gc_content)
 
const GSSWAligner * get_aligner (bool have_qualities=true) const
 
const QualAdjAligner * get_qual_adj_aligner () const
 
const Aligner * get_regular_aligner () const
 
- Protected Attributes inherited from vg::PairedEndMapper
FragmentLengthDistribution fragment_length_distr
 Holds the actual fragment length distribution and estimation information. More...
 

Detailed Description

Base class for basic mapping functionality shared between the Mapper, MultipathMapper, etc. Handles holding on to the random access and text indexes needed for mapping operations.

Constructor & Destructor Documentation

◆ BaseMapper() [1/2]

vg::BaseMapper::BaseMapper ( PathPositionHandleGraph *  xidex,
gcsa::GCSA *  g,
gcsa::LCPArray *  a,
haplo::ScoreProvider *  haplo_score_provider = nullptr 
)

Make a BaseMapper that pulls from a PathPositionHandleGraph succinct graph and a GCSA2 kmer index + LCP array, and which can score reads against haplotypes using the given ScoreProvider.

If the GCSA and LCPArray are null, cannot do search, only alignment.

◆ BaseMapper() [2/2]

vg::BaseMapper::BaseMapper ( void  )

Member Function Documentation

◆ accelerate_mem_query()

gcsa::range_type vg::BaseMapper::accelerate_mem_query ( string::const_iterator  begin,
string::const_iterator &  cursor 
) const

If possible, use the MEMAccelerator to get the initial range for a MEM and update the cursor accordingly. If this is not possible, return the full GCSA2 range and leave the cursor unaltered.

◆ apply_haplotype_consistency_scores()

void vg::BaseMapper::apply_haplotype_consistency_scores ( const vector< Alignment * > &  alns)

Score all of the alignments in the vector for haplotype consistency. If all of them can be scored (i.e. none of them visit nodes/edges with no haplotypes), adjust all of their scores to reflect haplotype consistency. If one or more cannot be scored for haplotype consistency, leave the alignment scores alone.

◆ estimate_gc_content()

double vg::BaseMapper::estimate_gc_content ( const gcsa::GCSA *  gcsa)
static

We need to be able to estimate the GC content from the GCSA index in the constructor. The given index may be null.

◆ find_fanout_mems()

vector< MaximalExactMatch > vg::BaseMapper::find_fanout_mems ( string::const_iterator  seq_begin,
string::const_iterator  seq_end,
string::const_iterator  qual_begin,
int  max_fans_out,
char  max_fanout_base_quality,
vector< deque< pair< string::const_iterator, char >>> *  mem_fanout_breaks = nullptr 
)

◆ find_mems_deep()

vector< MaximalExactMatch > vg::BaseMapper::find_mems_deep ( string::const_iterator  seq_begin,
string::const_iterator  seq_end,
double &  lcp_avg,
double &  fraction_filtered,
int  max_mem_length = 0,
int  min_mem_length = 1,
int  reseed_length = 0,
bool  use_lcp_reseed_heuristic = false,
bool  use_diff_based_fast_reseed = false,
bool  include_parent_in_sub_mem_count = false,
bool  record_max_lcp = false,
int  reseed_below_count = 0 
)

◆ find_mems_simple()

vector< MaximalExactMatch > vg::BaseMapper::find_mems_simple ( string::const_iterator  seq_begin,
string::const_iterator  seq_end,
int  max_mem_length = 0,
int  min_mem_length = 1,
int  reseed_length = 0 
)

◆ find_stripped_matches()

vector< MaximalExactMatch > vg::BaseMapper::find_stripped_matches ( string::const_iterator  seq_begin,
string::const_iterator  seq_end,
size_t  strip_length,
size_t  max_match_length,
size_t  target_count 
)

◆ find_sub_mems()

void vg::BaseMapper::find_sub_mems ( const vector< MaximalExactMatch > &  mems,
int  parent_layer_begin,
int  parent_layer_end,
int  mem_idx,
string::const_iterator  next_mem_end,
int  min_mem_length,
vector< pair< MaximalExactMatch, vector< size_t >>> &  sub_mems_out 
)

Locate the sub-MEMs contained in the last MEM of the mems vector that have ending positions before the end of the next SMEM, and label each of the sub-MEMs with the indices of all of the SMEMs that contain it.

◆ find_sub_mems_fast()

void vg::BaseMapper::find_sub_mems_fast ( const vector< MaximalExactMatch > &  mems,
int  parent_layer_begin,
int  parent_layer_end,
int  mem_idx,
string::const_iterator  leftmost_guaranteed_disjoint_bound,
string::const_iterator  leftmost_seeding_bound,
int  min_sub_mem_length,
vector< pair< MaximalExactMatch, vector< size_t >>> &  sub_mems_out 
)

Provides the same semantics as find_sub_mems but with a different algorithm. This algorithm uses the min_mem_length as a pruning tool instead of the LCP index. It can be expected to be faster when the min_mem_length is reasonably large relative to the reseed_length (e.g. 1/2 of SMEM size or similar).

◆ get_adaptive_min_reseed_length()

size_t vg::BaseMapper::get_adaptive_min_reseed_length ( size_t  parent_mem_length)

◆ precollapse_order_length_runs()

void vg::BaseMapper::precollapse_order_length_runs ( string::const_iterator  seq_begin,
vector< MaximalExactMatch > &  mems 
)

Identifies hits for order-length MEMs that are actually part of longer MEMs above the GCSA's limit and merges them. For speed's sake, this can have false negatives but no false positives.

◆ prefilter_redundant_sub_mems()

void vg::BaseMapper::prefilter_redundant_sub_mems ( vector< MaximalExactMatch > &  mems,
vector< pair< int, vector< size_t >>> &  sub_mem_containment_graph 
)

Identifies hits for sub-MEMs that are redundant hits to the parent MEMs and removes them from the hit lists. For speed's sake, this can have false negatives but no false positives.

◆ random_match_length()

int vg::BaseMapper::random_match_length ( double  chance_random)

◆ rescue_high_count_order_length_mems()

void vg::BaseMapper::rescue_high_count_order_length_mems ( vector< MaximalExactMatch > &  mems,
size_t  max_rescue_hit_count 
)

Identifies tracts of order-length MEMs that were unfilled because their hit count was above the max and fills one MEM in the tract (the one with the smallest hit count). Assumes MEMs are lexicographically ordered by read index.

◆ sequence_positions()

set<pos_t> vg::BaseMapper::sequence_positions ( const string &  seq)

◆ set_alignment_scores() [1/2]

void vg::BaseMapper::set_alignment_scores ( int8_t  match,
int8_t  mismatch,
int8_t  gap_open,
int8_t  gap_extend,
int8_t  full_length_bonus,
double  haplotype_consistency_exponent = 1 
)

Override alignment score setting to support haplotype consistency exponent.

◆ set_alignment_scores() [2/2]

void vg::BaseMapper::set_alignment_scores ( istream &  matrix_stream,
int8_t  gap_open,
int8_t  gap_extend,
int8_t  full_length_bonus,
double  haplotype_consistency_exponent = 1 
)

Same, but loading a 4x4 substitution score matrix from a stream.

◆ walk_fanout_path()

vector< pos_t > vg::BaseMapper::walk_fanout_path ( string::const_iterator  begin,
string::const_iterator  end,
const deque< pair< string::const_iterator, char >> &  fanout_breaks,
gcsa::node_type  pos 
)

Member Data Documentation

◆ accelerator

MEMAccelerator* vg::BaseMapper::accelerator = nullptr

◆ adaptive_diff_exponent

double vg::BaseMapper::adaptive_diff_exponent = 0.065

◆ adaptive_reseed_diff

bool vg::BaseMapper::adaptive_reseed_diff = true

◆ adaptive_reseed_length_memo

thread_local vector< size_t > vg::BaseMapper::adaptive_reseed_length_memo
static

◆ assume_acyclic

bool vg::BaseMapper::assume_acyclic

◆ avg_node_length

double vg::BaseMapper::avg_node_length = 0

◆ debug

bool vg::BaseMapper::debug = false

Set to enable debugging messages to cerr from the mapper, so a user can understand why a read maps the way it does.

◆ exclude_unaligned

bool vg::BaseMapper::exclude_unaligned = false

◆ fanout_length_threshold

int vg::BaseMapper::fanout_length_threshold = 0

◆ fast_reseed

bool vg::BaseMapper::fast_reseed = true

◆ fast_reseed_length_diff

double vg::BaseMapper::fast_reseed_length_diff = 0.45

◆ filter_short_mems

bool vg::BaseMapper::filter_short_mems = false

◆ gcsa

gcsa::GCSA* vg::BaseMapper::gcsa = nullptr

◆ greedy_restart_assume_substitution

bool vg::BaseMapper::greedy_restart_assume_substitution = false

◆ greedy_restart_max_count

int vg::BaseMapper::greedy_restart_max_count = 2

◆ greedy_restart_max_lcp

int vg::BaseMapper::greedy_restart_max_lcp = 0

◆ greedy_restart_min_length

int vg::BaseMapper::greedy_restart_min_length = 40

◆ haplo_score_provider

haplo::ScoreProvider* vg::BaseMapper::haplo_score_provider = nullptr

◆ haplotype_consistency_exponent

double vg::BaseMapper::haplotype_consistency_exponent = 1

◆ hard_hit_max

int vg::BaseMapper::hard_hit_max = 0

◆ hit_max

int vg::BaseMapper::hit_max = 0

◆ lcp

gcsa::LCPArray* vg::BaseMapper::lcp = nullptr

◆ mapping_quality_method

MappingQualityMethod vg::BaseMapper::mapping_quality_method

◆ max_mapping_quality

int vg::BaseMapper::max_mapping_quality

◆ max_sub_mem_recursion_depth

int vg::BaseMapper::max_sub_mem_recursion_depth = 2

◆ mem_reseed_length

int vg::BaseMapper::mem_reseed_length

◆ min_mem_length

int vg::BaseMapper::min_mem_length

◆ precollapse_order_length_hits

bool vg::BaseMapper::precollapse_order_length_hits = true

◆ prefilter_redundant_hits

bool vg::BaseMapper::prefilter_redundant_hits = true

◆ recombination_penalty

double vg::BaseMapper::recombination_penalty = 20.7

◆ short_mem_filter_factor

double vg::BaseMapper::short_mem_filter_factor = 0.45

◆ strip_bonuses

bool vg::BaseMapper::strip_bonuses

◆ sub_mem_count_thinning

int vg::BaseMapper::sub_mem_count_thinning = 4

◆ sub_mem_thinning_burn_in

int vg::BaseMapper::sub_mem_thinning_burn_in = 16

◆ total_seq_length

size_t vg::BaseMapper::total_seq_length = 0

◆ unpaired_penalty

int vg::BaseMapper::unpaired_penalty = 17

◆ use_approx_sub_mem_count

bool vg::BaseMapper::use_approx_sub_mem_count = false

◆ use_greedy_mem_restarts

bool vg::BaseMapper::use_greedy_mem_restarts = false

◆ xindex

PathPositionHandleGraph* vg::BaseMapper::xindex = nullptr

The documentation for this class was generated from the following files: