vg
tools for working with variation graphs
Classes | Public Types | Public Member Functions | Public Attributes | Static Public Attributes | Protected Types | Protected Member Functions | Static Protected Member Functions | Protected Attributes | Static Protected Attributes | Friends | List of all members
vg::MinimizerMapper Class Reference

#include <minimizer_mapper.hpp>

Inheritance diagram for vg::MinimizerMapper:
vg::AlignerClient

Classes

struct  Minimizer
 

Public Types

enum  RescueAlgorithm { rescue_none, rescue_dozeu, rescue_gssw }
 Implemented rescue algorithms: no rescue, dozeu, GSSW. More...
 
typedef SnarlDistanceIndexClusterer::Seed Seed
 The information we store for each seed. More...
 

Public Member Functions

 MinimizerMapper (const gbwtgraph::GBWTGraph &graph, const gbwtgraph::DefaultMinimizerIndex &minimizer_index, SnarlDistanceIndex *distance_index, const PathPositionHandleGraph *path_graph=nullptr)
 
virtual void set_alignment_scores (const int8_t *score_matrix, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus)
 
void map (Alignment &aln, AlignmentEmitter &alignment_emitter)
 
vector< Alignment > map (Alignment &aln)
 
vector< Alignment > map_from_chains (Alignment &aln)
 
vector< Alignment > map_from_extensions (Alignment &aln)
 
pair< vector< Alignment >, vector< Alignment > > map_paired (Alignment &aln1, Alignment &aln2, vector< pair< Alignment, Alignment >> &ambiguous_pair_buffer)
 
pair< vector< Alignment >, vector< Alignment > > map_paired (Alignment &aln1, Alignment &aln2)
 
bool fragment_distr_is_finalized ()
 
void finalize_fragment_length_distr ()
 
void force_fragment_length_distr (double mean, double stdev)
 
double get_fragment_length_mean () const
 
double get_fragment_length_stdev () const
 
size_t get_fragment_length_sample_size () const
 
size_t get_distance_limit (size_t read_length) const
 
virtual void set_alignment_scores (int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus)
 Set all the aligner scoring parameters and create the stored aligner instances. More...
 
virtual void set_alignment_scores (std::istream &matrix_stream, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus)
 
virtual void set_alignment_scores (const int8_t *score_matrix, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus)
 
- Public Member Functions inherited from vg::AlignerClient
virtual void set_alignment_scores (int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus)
 Set all the aligner scoring parameters and create the stored aligner instances. More...
 
virtual void set_alignment_scores (std::istream &matrix_stream, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus)
 

Public Attributes

size_t hit_cap = default_hit_cap
 
size_t hard_hit_cap = default_hard_hit_cap
 
double minimizer_score_fraction = default_minimizer_score_fraction
 
size_t max_unique_min = default_max_unique_min
 
size_t num_bp_per_min = default_num_bp_per_min
 
bool exclude_overlapping_min = default_exclude_overlapping_min
 
size_t min_extensions = default_min_extensions
 
size_t max_extensions = default_max_extensions
 
double extension_set_score_threshold = default_extension_set_score_threshold
 
int extension_score_threshold = default_extension_score_threshold
 
int min_extension_sets = default_min_extension_sets
 
int extension_set_min_score = default_extension_set_min_score
 
size_t max_alignments = default_max_alignments
 
size_t max_local_extensions = default_max_local_extensions
 
double cluster_score_threshold = default_cluster_score_threshold
 
double pad_cluster_score_threshold = default_pad_cluster_score_threshold
 
double cluster_coverage_threshold = default_cluster_coverage_threshold
 
bool align_from_chains = default_align_from_chains
 
size_t chaining_cluster_distance = default_chaining_cluster_distance
 
double precluster_connection_coverage_threshold = default_precluster_connection_coverage_threshold
 
size_t min_precluster_connections = default_min_precluster_connections
 
size_t max_precluster_connections = default_max_precluster_connections
 
size_t reseed_search_distance = default_reseed_search_distance
 
size_t min_clusters_to_chain = default_min_clusters_to_chain
 
size_t max_clusters_to_chain = default_max_clusters_to_chain
 
size_t max_chain_connection = default_max_chain_connection
 
size_t max_tail_length = default_max_tail_length
 
size_t max_lookback_bases = default_max_lookback_bases
 
size_t min_lookback_items = default_min_lookback_items
 
size_t lookback_item_hard_cap = default_lookback_item_hard_cap
 
size_t initial_lookback_threshold = default_initial_lookback_threshold
 
double lookback_scale_factor = default_lookback_scale_factor
 
double min_good_transition_score_per_base = default_min_good_transition_score_per_base
 
int item_bonus = default_item_bonus
 
size_t max_indel_bases = default_max_indel_bases
 
double chain_score_threshold = default_chain_score_threshold
 
int min_chains = default_min_chains
 
int chain_min_score = default_chain_min_score
 
size_t max_dp_cells = default_max_dp_cells
 
size_t max_multimaps = default_max_multimaps
 
size_t distance_limit = default_distance_limit
 
bool do_dp = default_do_dp
 
bool track_provenance = default_track_provenance
 
bool track_correctness = default_track_correctness
 
bool show_work = default_show_work
 
double paired_distance_stdevs = default_paired_distance_stdevs
 
double paired_rescue_score_limit = default_paired_rescue_score_limit
 
double rescue_subgraph_stdevs = default_rescue_subgraph_stdevs
 
size_t rescue_seed_limit = default_rescue_seed_limit
 
size_t max_rescue_attempts = default_max_rescue_attempts
 
size_t max_dozeu_cells = default_max_dozeu_cells
 
size_t max_fragment_length = default_max_fragment_length
 
RescueAlgorithm rescue_algorithm = rescue_dozeu
 The algorithm used for rescue. More...
 
string sample_name
 Apply this sample name. More...
 
string read_group
 Apply this read group name. More...
 
atomic_flag warned_about_rescue_size = ATOMIC_FLAG_INIT
 Have we complained about hitting the size limit for rescue? More...
 
atomic_flag warned_about_tail_size = ATOMIC_FLAG_INIT
 Have we complained about hitting the size limit for tails? More...
 
- Public Attributes inherited from vg::AlignerClient
bool adjust_alignments_for_base_quality = false
 

Static Public Attributes

static constexpr size_t default_hit_cap = 10
 Use all minimizers with at most hit_cap hits. More...
 
static constexpr size_t default_hard_hit_cap = 500
 Ignore all minimizers with more than hard_hit_cap hits. More...
 
static constexpr double default_minimizer_score_fraction = 0.9
 
static constexpr size_t default_max_unique_min = 500
 Maximum number of distinct minimizers to take. More...
 
static constexpr size_t default_num_bp_per_min = 1000
 Number of minimizers to select based on read_len/num_bp_per_min. More...
 
static constexpr bool default_exclude_overlapping_min = false
 If set, exclude overlapping minimizers. More...
 
static constexpr size_t default_min_extensions = 2
 Accept at least this many clusters for gapless extension. More...
 
static constexpr size_t default_max_extensions = 800
 How many clusters should we produce gapless extensions for, max? More...
 
static constexpr double default_extension_set_score_threshold = 20
 
static constexpr int default_extension_score_threshold = 1
 
static constexpr int default_min_extension_sets = 2
 
static constexpr int default_extension_set_min_score = 20
 
static constexpr size_t default_max_alignments = 8
 How many extended clusters should we align, max? More...
 
static constexpr size_t default_max_local_extensions = numeric_limits<size_t>::max()
 How many extensions should we try as seeds within a mapping location? More...
 
static constexpr double default_cluster_score_threshold = 50
 
static constexpr double default_pad_cluster_score_threshold = 20
 
static constexpr double default_cluster_coverage_threshold = 0.3
 
static constexpr bool default_align_from_chains = false
 
static constexpr size_t default_chaining_cluster_distance = 100
 What read-length-independent distance threshold do we want to use for clustering? More...
 
static constexpr double default_precluster_connection_coverage_threshold = 0.3
 
static constexpr size_t default_min_precluster_connections = 10
 How many connections between preclusters should we reseed over, minimum? More...
 
static constexpr size_t default_max_precluster_connections = 50
 How many connections between preclusters should we reseed over, maximum? More...
 
static constexpr size_t default_reseed_search_distance = 10000
 When connecting subclusters for reseeding, how far should we search? More...
 
static constexpr size_t default_min_clusters_to_chain = 2
 Accept at least this many clusters for chain generation. More...
 
static constexpr size_t default_max_clusters_to_chain = 20
 How many clusters should we produce chains for, max? More...
 
static constexpr size_t default_max_chain_connection = 100
 
static constexpr size_t default_max_tail_length = 100
 Similarly, what is the maximum tail length we will try to align? More...
 
static constexpr size_t default_max_lookback_bases = 100
 
static constexpr size_t default_min_lookback_items = 1
 How many chaining sources should we make sure to consider regardless of distance? More...
 
static constexpr size_t default_lookback_item_hard_cap = 15
 How many chaining sources should we allow ourselves to consider ever? More...
 
static constexpr size_t default_initial_lookback_threshold = 10
 How many bases should we try to look back initially when chaining? More...
 
static constexpr double default_lookback_scale_factor = 2.0
 How much should we increase lookback when we can't find anything good? More...
 
static constexpr double default_min_good_transition_score_per_base = -0.1
 How bad can a transition be per base before lookback accepts it? More...
 
static constexpr int default_item_bonus = 0
 How much of a bonus should we give to each item in chaining? More...
 
static constexpr size_t default_max_indel_bases = 50
 How many bases of indel should we allow in chaining? More...
 
static constexpr double default_chain_score_threshold = 100
 
static constexpr int default_min_chains = 1
 
static constexpr int default_chain_min_score = 100
 
static constexpr int MAX_DP_LENGTH = 30000
 
static constexpr size_t default_max_dp_cells = 16UL * 1024UL * 1024UL
 
static constexpr size_t default_max_multimaps = 1
 
static constexpr size_t default_distance_limit = 200
 
static constexpr bool default_do_dp = true
 If false, skip computing base-level alignments. More...
 
static constexpr bool default_track_provenance = false
 
static constexpr bool default_track_correctness = false
 
static constexpr bool default_show_work = false
 If set, log what the mapper is thinking in its mapping of each read. More...
 
static constexpr double default_paired_distance_stdevs = 2.0
 
static constexpr double default_paired_rescue_score_limit = 0.9
 How close does an alignment have to be to the best alignment for us to rescue on it. More...
 
static constexpr double default_rescue_subgraph_stdevs = 4.0
 How many stdevs from the mean do we extract a subgraph from? More...
 
static constexpr size_t default_rescue_seed_limit = 100
 Do not attempt rescue if there are more seeds in the rescue subgraph. More...
 
static constexpr size_t default_max_rescue_attempts = 15
 For paired end mapping, how many times should we attempt rescue (per read)? More...
 
static constexpr size_t default_max_dozeu_cells = (size_t)(1.5 * 1024 * 1024)
 
static constexpr size_t default_max_fragment_length = 2000
 What is the maximum fragment length that we accept as valid for paired-end reads? More...
 

Protected Types

typedef SnarlDistanceIndexClusterer::Cluster Cluster
 The information we store for each cluster. More...
 
using ImmutablePath = structures::ImmutableList< Mapping >
 

Protected Member Functions

double distance_to_annotation (int64_t distance) const
 
std::vector< algorithms::Anchor > to_anchors (const Alignment &aln, const VectorView< Minimizer > &minimizers, const std::vector< Seed > &seeds) const
 Convert a collection of seeds to a collection of chaining anchors. More...
 
algorithms::Anchor to_anchor (const Alignment &aln, const VectorView< Minimizer > &minimizers, const Seed &seed) const
 Convert a single seed to a single chaining anchor. More...
 
WFAAlignment to_wfa_alignment (const algorithms::Anchor &anchor) const
 Convert an Anchor to a WFAAlignment. More...
 
std::vector< Minimizer > find_minimizers (const std::string &sequence, Funnel &funnel) const
 
std::vector< size_t > sort_minimizers_by_score (const std::vector< Minimizer > &minimizers) const
 
std::vector< Seed > find_seeds (const VectorView< Minimizer > &minimizers, const Alignment &aln, Funnel &funnel) const
 
void tag_seeds (const Alignment &aln, const std::vector< Seed >::const_iterator &begin, const std::vector< Seed >::const_iterator &end, const VectorView< Minimizer > &minimizers, size_t funnel_offset, Funnel &funnel) const
 
void score_cluster (Cluster &cluster, size_t i, const VectorView< Minimizer > &minimizers, const std::vector< Seed > &seeds, size_t seq_length, Funnel &funnel) const
 
void score_merged_cluster (Cluster &cluster, size_t i, const VectorView< Minimizer > &minimizers, const std::vector< Seed > &seeds, size_t first_new_seed, const std::vector< size_t > &seed_to_precluster, const std::vector< Cluster > &preclusters, size_t seq_length, Funnel &funnel) const
 
std::vector< Seed > reseed_between (size_t read_region_start, size_t read_region_end, pos_t left_graph_pos, pos_t right_graph_pos, const HandleGraph &graph, const VectorView< Minimizer > &minimizers, const std::function< void(const Minimizer &, const std::vector< nid_t > &, const std::function< void(const pos_t &)> &)> &for_each_pos_for_source_in_subgraph) const
 
vector< GaplessExtension > extend_cluster (const Cluster &cluster, size_t cluster_num, const VectorView< Minimizer > &minimizers, const std::vector< Seed > &seeds, const string &sequence, vector< vector< size_t >> &minimizer_kept_cluster_count, Funnel &funnel) const
 
std::vector< int > score_extensions (const std::vector< std::vector< GaplessExtension >> &extensions, const Alignment &aln, Funnel &funnel) const
 
std::vector< int > score_extensions (const std::vector< std::pair< std::vector< GaplessExtension >, size_t >> &extensions, const Alignment &aln, Funnel &funnel) const
 
Alignment find_chain_alignment (const Alignment &aln, const VectorView< algorithms::Anchor > &to_chain, const std::vector< size_t > &chain) const
 
void find_optimal_tail_alignments (const Alignment &aln, const vector< GaplessExtension > &extended_seeds, LazyRNG &rng, Alignment &best, Alignment &second_best) const
 
void attempt_rescue (const Alignment &aligned_read, Alignment &rescued_alignment, const VectorView< Minimizer > &minimizers, bool rescue_forward)
 
GaplessExtender::cluster_type seeds_in_subgraph (const VectorView< Minimizer > &minimizers, const std::unordered_set< nid_t > &subgraph) const
 
void fix_dozeu_score (Alignment &rescued_alignment, const HandleGraph &rescue_graph, const std::vector< handle_t > &topological_order) const
 
void fix_dozeu_end_deletions (Alignment &rescued_alignment) const
 
int64_t distance_between (const pos_t &pos1, const pos_t &pos2)
 
int64_t distance_between (const Alignment &aln1, const Alignment &aln2)
 
int64_t unoriented_distance_between (const pos_t &pos1, const pos_t &pos2) const
 
void extension_to_alignment (const GaplessExtension &extension, Alignment &alignment) const
 
void wfa_alignment_to_alignment (const WFAAlignment &wfa_alignment, Alignment &alignment) const
 
void pair_all (std::array< vector< Alignment >, 2 > &mappings) const
 
void annotate_with_minimizer_statistics (Alignment &target, const VectorView< Minimizer > &minimizers, const std::vector< Seed > &seeds, size_t old_seed_count, size_t new_seed_offset, const Funnel &funnel) const
 
double compute_mapq_caps (const Alignment &aln, const VectorView< Minimizer > &minimizers, const SmallBitset &explored)
 
vector< TreeSubgraph > get_tail_forest (const GaplessExtension &extended_seed, size_t read_length, bool left_tails, size_t *longest_detectable_gap=nullptr) const
 
pair< Path, size_t > get_best_alignment_against_any_tree (const vector< TreeSubgraph > &trees, const string &sequence, const Position &default_position, bool pin_left, size_t longest_detectable_gap, LazyRNG &rng) const
 
void dfs_gbwt (const Position &from, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const
 
void dfs_gbwt (handle_t from_handle, size_t from_offset, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const
 
void dfs_gbwt (const gbwt::SearchState &start_state, size_t from_offset, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const
 
double score_alignment_pair (Alignment &aln1, Alignment &aln2, int64_t fragment_distance)
 
template<typename Score = double>
void process_until_threshold_a (size_t items, const function< Score(size_t)> &get_score, double threshold, size_t min_count, size_t max_count, LazyRNG &rng, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const
 
template<typename Score = double>
void process_until_threshold_b (const vector< Score > &scores, double threshold, size_t min_count, size_t max_count, LazyRNG &rng, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const
 
template<typename Score = double>
void process_until_threshold_c (size_t items, const function< Score(size_t)> &get_score, const function< bool(size_t, size_t)> &comparator, double threshold, size_t min_count, size_t max_count, LazyRNG &get_seed, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const
 
bool validate_clusters (const std::vector< std::vector< Cluster >> &clusters, const std::vector< std::vector< Seed >> &seeds, size_t read_limit, size_t fragment_limit) const
 Do a brute check of the clusters. Print errors to stderr. More...
 
- Protected Member Functions inherited from vg::AlignerClient
 AlignerClient (double gc_content_estimate=vg::default_gc_content)
 
const GSSWAligner * get_aligner (bool have_qualities=true) const
 
const QualAdjAligner * get_qual_adj_aligner () const
 
const Aligner * get_regular_aligner () const
 

Static Protected Member Functions

static gbwtgraph::Payload no_chain_info ()
 How should we initialize chain info when it's not stored in the minimizer index? More...
 
static Seed chain_info_to_seed (const pos_t &hit, size_t minimizer, const gbwtgraph::Payload &chain_info)
 
static int score_extension_group (const Alignment &aln, const vector< GaplessExtension > &extended_seeds, int gap_open_penalty, int gap_extend_penalty)
 
static void with_dagified_local_graph (const pos_t &left_anchor, const pos_t &right_anchor, size_t max_path_length, const HandleGraph &graph, const std::function< void(DeletableHandleGraph &, const std::function< std::pair< nid_t, bool >(const handle_t &)> &)> &callback)
 
static void align_sequence_between (const pos_t &left_anchor, const pos_t &right_anchor, size_t max_path_length, const HandleGraph *graph, const GSSWAligner *aligner, Alignment &alignment, size_t max_dp_cells=std::numeric_limits< size_t >::max())
 
static double window_breaking_quality (const VectorView< Minimizer > &minimizers, vector< size_t > &broken, const string &sequence, const string &quality_bytes)
 
static double faster_cap (const VectorView< Minimizer > &minimizers, vector< size_t > &minimizers_explored, const string &sequence, const string &quality_bytes)
 
static void for_each_agglomeration_interval (const VectorView< Minimizer > &minimizers, const string &sequence, const string &quality_bytes, const vector< size_t > &minimizer_indices, const function< void(size_t, size_t, size_t, size_t)> &iteratee)
 
static double get_log10_prob_of_disruption_in_interval (const VectorView< Minimizer > &minimizers, const string &sequence, const string &quality_bytes, const vector< size_t >::iterator &disrupt_begin, const vector< size_t >::iterator &disrupt_end, size_t left, size_t right)
 
static double get_prob_of_disruption_in_column (const VectorView< Minimizer > &minimizers, const string &sequence, const string &quality_bytes, const vector< size_t >::iterator &disrupt_begin, const vector< size_t >::iterator &disrupt_end, size_t index)
 
static size_t immutable_path_from_length (const ImmutablePath &path)
 
static Path to_path (const ImmutablePath &path)
 
static string log_name ()
 Get the thread identifier prefix for logging. More...
 
static string log_alignment (const Alignment &aln)
 Turn an Alignment into a conveniently-sized string for logging. More...
 
static string log_alignment (const Path &path, bool force_condensed=false)
 Turn a Path from an alignment into a conveniently-sized string for logging. More...
 
static string log_bits (const std::vector< bool > &bits)
 Turn a list of bit flags into a compact representation. More...
 
static void dump_chaining_problem (const std::vector< algorithms::Anchor > &anchors, const std::vector< size_t > &cluster_seeds_sorted, const HandleGraph &graph)
 Dump a whole chaining problem. More...
 
static void dump_debug_minimizers (const VectorView< Minimizer > &minimizers, const string &sequence, const vector< size_t > *to_include=nullptr, size_t start_offset=0, size_t length_limit=std::numeric_limits< size_t >::max())
 Dump all the given minimizers, with optional subset restriction. More...
 
static void dump_debug_extension_set (const HandleGraph &graph, const Alignment &aln, const vector< GaplessExtension > &extended_seeds)
 Dump all the extensions in an extension set. More...
 
static void dump_debug_sequence (ostream &out, const string &sequence, size_t start_offset=0, size_t length_limit=std::numeric_limits< size_t >::max())
 Print a sequence with base numbering. More...
 
static void dump_debug_clustering (const Cluster &cluster, size_t cluster_number, const VectorView< Minimizer > &minimizers, const std::vector< Seed > &seeds)
 Print the seed content of a cluster. More...
 
static void dump_debug_seeds (const VectorView< Minimizer > &minimizers, const std::vector< Seed > &seeds, const std::vector< size_t > &selected_seeds)
 Print information about a selected set of seeds. More...
 
static void dump_debug_query (const Alignment &aln)
 Print information about a read to be aligned. More...
 
static void dump_debug_query (const Alignment &aln1, const Alignment &aln2)
 Print information about a read pair to be aligned. More...
 

Protected Attributes

const PathPositionHandleGraph * path_graph
 
const gbwtgraph::DefaultMinimizerIndex & minimizer_index
 
SnarlDistanceIndex * distance_index
 
const gbwtgraph::GBWTGraph & gbwt_graph
 This is our primary graph. More...
 
std::unique_ptr< GaplessExtender > extender
 
SnarlDistanceIndexClusterer clusterer
 We have a clusterer. More...
 
FragmentLengthDistribution fragment_length_distr
 
atomic_flag warned_about_bad_distribution = ATOMIC_FLAG_INIT
 We may need to complain exactly once that the distribution is bad. More...
 

Static Protected Attributes

const static size_t LONG_LIMIT = 256
 Length at which we cut over to long-alignment logging. More...
 
const static size_t MANY_LIMIT = 20
 Count at which we cut over to summary logging. More...
 

Friends

class TestMinimizerMapper
 

Additional Inherited Members

- Static Public Member Functions inherited from vg::AlignerClient
static int8_t * parse_matrix (std::istream &matrix_stream)
 Allocates an array to hold a 4x4 substitution matrix and returns it. More...
 

Member Typedef Documentation

◆ Cluster

The information we store for each cluster.

◆ ImmutablePath

using vg::MinimizerMapper::ImmutablePath = structures::ImmutableList<Mapping>
protected

We define a type for shared-tail lists of Mappings, to avoid constantly copying Path objects.

◆ Seed

The information we store for each seed.

Member Enumeration Documentation

◆ RescueAlgorithm

Implemented rescue algorithms: no rescue, dozeu, GSSW.

Enumerator
rescue_none 
rescue_dozeu 
rescue_gssw 

Constructor & Destructor Documentation

◆ MinimizerMapper()

vg::MinimizerMapper::MinimizerMapper ( const gbwtgraph::GBWTGraph &  graph,
const gbwtgraph::DefaultMinimizerIndex &  minimizer_index,
SnarlDistanceIndex *  distance_index,
const PathPositionHandleGraph *  path_graph = nullptr 
)

Construct a new MinimizerMapper using the given indexes. The PathPositionHandleGraph can be nullptr, as we only use it for correctness tracking.

Member Function Documentation

◆ align_sequence_between()

void vg::MinimizerMapper::align_sequence_between ( const pos_t &  left_anchor,
const pos_t &  right_anchor,
size_t  max_path_length,
const HandleGraph *  graph,
const GSSWAligner *  aligner,
Alignment &  alignment,
size_t  max_dp_cells = std::numeric_limits<size_t>::max() 
)
staticprotected

Clip out the part of the graph between the given positions and global-align the sequence of the given Alignment to it. Populate the Alignment's path and score.

Finds an alignment against a graph path if it is <= max_path_length, and uses <= max_dp_cells GSSW cells.

If one of the anchor positions is empty, does pinned alignment against the other position.

◆ annotate_with_minimizer_statistics()

void vg::MinimizerMapper::annotate_with_minimizer_statistics ( Alignment &  target,
const VectorView< Minimizer > &  minimizers,
const std::vector< Seed > &  seeds,
size_t  old_seed_count,
size_t  new_seed_offset,
const Funnel &  funnel 
) const
protected

Add annotations to an Alignment with statistics about the minimizers.

old_seed_count is the number of seeds in the seed vector actually created at the "seed" stage of the alignment process. new_seed_offset is where the first of those seeds appears in the funnel at the reseed stage.

◆ attempt_rescue()

void vg::MinimizerMapper::attempt_rescue ( const Alignment &  aligned_read,
Alignment &  rescued_alignment,
const VectorView< Minimizer > &  minimizers,
bool  rescue_forward 
)
protected

Given an aligned read, extract a subgraph of the graph within a distance range based on the fragment length distribution and attempt to align the unaligned read to it. Rescue_forward is true if the aligned read is the first and false otherwise. Assumes that both reads are facing the same direction. TODO: This should be const, but some of the function calls are not.

◆ chain_info_to_seed()

static Seed vg::MinimizerMapper::chain_info_to_seed ( const pos_t &  hit,
size_t  minimizer,
const gbwtgraph::Payload &  chain_info 
)
inlinestaticprotected

How do we convert chain info to an actual seed of the type we are using? Also needs to know the hit position, and the minimizer number.

◆ compute_mapq_caps()

double vg::MinimizerMapper::compute_mapq_caps ( const Alignment &  aln,
const VectorView< Minimizer > &  minimizers,
const SmallBitset &  explored 
)
protected

Compute MAPQ caps based on all minimizers that are explored, for some definition of explored.

Needs access to the input alignment for sequence and quality information.

Returns only an "extended" cap at the moment.

◆ dfs_gbwt() [1/3]

void vg::MinimizerMapper::dfs_gbwt ( const gbwt::SearchState &  start_state,
size_t  from_offset,
size_t  walk_distance,
const function< void(const handle_t &)> &  enter_handle,
const function< void(void)>  exit_handle 
) const
protected

The same as dfs_gbwt on a handle and an offset, but takes a gbwt::SearchState that defines only some haplotypes on a handle to start with.

◆ dfs_gbwt() [2/3]

void vg::MinimizerMapper::dfs_gbwt ( const Position &  from,
size_t  walk_distance,
const function< void(const handle_t &)> &  enter_handle,
const function< void(void)>  exit_handle 
) const
protected

Run a DFS on valid haplotypes in the GBWT starting from the given Position, and continuing up to the given number of bases.

Calls enter_handle when the DFS enters a haplotype visit to a particular handle, and exit_handle when it exits a visit. These let the caller maintain a stack and track the traversals.

The starting node is only entered if its offset isn't equal to its length (i.e. bases remain to be visited).

Stopping early is not permitted.

◆ dfs_gbwt() [3/3]

void vg::MinimizerMapper::dfs_gbwt ( handle_t  from_handle,
size_t  from_offset,
size_t  walk_distance,
const function< void(const handle_t &)> &  enter_handle,
const function< void(void)>  exit_handle 
) const
protected

The same as dfs_gbwt on a Position, but takes a handle in the backing gbwt_graph and an offset from the start of the handle instead.

◆ distance_between() [1/2]

int64_t vg::MinimizerMapper::distance_between ( const Alignment &  aln1,
const Alignment &  aln2 
)
protected

Get the distance between a pair of read alignments, or std::numeric_limits<int64_t>::max() if unreachable.

◆ distance_between() [2/2]

int64_t vg::MinimizerMapper::distance_between ( const pos_t &  pos1,
const pos_t &  pos2 
)
protected

Get the distance between a pair of positions, or std::numeric_limits<int64_t>::max() if unreachable.

◆ distance_to_annotation()

double vg::MinimizerMapper::distance_to_annotation ( int64_t  distance) const
protected

Convert an integer distance, with limits standing for no distance, to a double annotation that can safely be parsed back from JSON into an integer if it is integral.

◆ dump_chaining_problem()

void vg::MinimizerMapper::dump_chaining_problem ( const std::vector< algorithms::Anchor > &  anchors,
const std::vector< size_t > &  cluster_seeds_sorted,
const HandleGraph &  graph 
)
staticprotected

Dump a whole chaining problem.

◆ dump_debug_clustering()

void vg::MinimizerMapper::dump_debug_clustering ( const Cluster &  cluster,
size_t  cluster_number,
const VectorView< Minimizer > &  minimizers,
const std::vector< Seed > &  seeds 
)
staticprotected

Print the seed content of a cluster.

◆ dump_debug_extension_set()

void vg::MinimizerMapper::dump_debug_extension_set ( const HandleGraph &  graph,
const Alignment &  aln,
const vector< GaplessExtension > &  extended_seeds 
)
staticprotected

Dump all the extensions in an extension set.

◆ dump_debug_minimizers()

void vg::MinimizerMapper::dump_debug_minimizers ( const VectorView< Minimizer > &  minimizers,
const string &  sequence,
const vector< size_t > *  to_include = nullptr,
size_t  start_offset = 0,
size_t  length_limit = std::numeric_limits<size_t>::max() 
)
staticprotected

Dump all the given minimizers, with optional subset restriction.

◆ dump_debug_query() [1/2]

void vg::MinimizerMapper::dump_debug_query ( const Alignment &  aln)
staticprotected

Print information about a read to be aligned.

◆ dump_debug_query() [2/2]

void vg::MinimizerMapper::dump_debug_query ( const Alignment &  aln1,
const Alignment &  aln2 
)
staticprotected

Print information about a read pair to be aligned.

◆ dump_debug_seeds()

void vg::MinimizerMapper::dump_debug_seeds ( const VectorView< Minimizer > &  minimizers,
const std::vector< Seed > &  seeds,
const std::vector< size_t > &  selected_seeds 
)
staticprotected

Print information about a selected set of seeds.

◆ dump_debug_sequence()

void vg::MinimizerMapper::dump_debug_sequence ( ostream &  out,
const string &  sequence,
size_t  start_offset = 0,
size_t  length_limit = std::numeric_limits<size_t>::max() 
)
staticprotected

Print a sequence with base numbering.

◆ extend_cluster()

vector< GaplessExtension > vg::MinimizerMapper::extend_cluster ( const Cluster &  cluster,
size_t  cluster_num,
const VectorView< Minimizer > &  minimizers,
const std::vector< Seed > &  seeds,
const string &  sequence,
vector< vector< size_t >> &  minimizer_kept_cluster_count,
Funnel &  funnel 
) const
protected

Extends the seeds in a cluster into a collection of GaplessExtension objects.

◆ extension_to_alignment()

void vg::MinimizerMapper::extension_to_alignment ( const GaplessExtension extension,
Alignment alignment 
) const
protected

Convert the GaplessExtension into an alignment. This assumes that the extension is a full-length alignment and that the sequence field of the alignment has been set.

◆ faster_cap()

double vg::MinimizerMapper::faster_cap ( const VectorView< Minimizer > &  minimizers,
vector< size_t > &  minimizers_explored,
const string &  sequence,
const string &  quality_bytes 
)
staticprotected

Compute a bound on the Phred score probability of a mapping being wrong due to base errors and unlocated minimizer hits that prevented us from finding the true alignment.

Algorithm uses a "sweep line" dynamic programming approach. For a read with minimizers aligned to it:

             000000000011111111112222222222
             012345678901234567890123456789
Read:        ******************************
Minimizer 1:    *****
Minimizer 2:       *****
Minimizer 3:                   *****
Minimizer 4:                      *****

For each distinct read interval of overlapping minimizers, e.g. in the example the intervals 3,4,5; 6,7; 8,9,10; 18,19,20; 21,22; and 23,24,25 we consider base errors that would result in the minimizers in the interval being incorrect

We use dynamic programming sweeping left-to-right over the intervals to compute the probability of the minimum number of base errors needed to disrupt all the minimizers.

Will sort minimizers_explored (which is indices into minimizers) by minimizer start position.

◆ finalize_fragment_length_distr()

void vg::MinimizerMapper::finalize_fragment_length_distr ( )
inline

◆ find_chain_alignment()

Alignment vg::MinimizerMapper::find_chain_alignment ( const Alignment aln,
const VectorView< algorithms::Anchor > &  to_chain,
const std::vector< size_t > &  chain 
) const
protected

Turn a chain into an Alignment.

Operating on the given input alignment, align the tails and intervening sequences along the given chain of perfect-match seeds, and return an optimal Alignment.

◆ find_minimizers()

std::vector< MinimizerMapper::Minimizer > vg::MinimizerMapper::find_minimizers ( const std::string &  sequence,
Funnel funnel 
) const
protected

Find the minimizers in the sequence using the minimizer index, and return them sorted in read order.

◆ find_optimal_tail_alignments()

void vg::MinimizerMapper::find_optimal_tail_alignments ( const Alignment aln,
const vector< GaplessExtension > &  extended_seeds,
LazyRNG rng,
Alignment best,
Alignment second_best 
) const
protected

Operating on the given input alignment, align the tails dangling off the given extended perfect-match seeds and produce an optimal alignment into the given output Alignment object, best, and the second best alignment into second_best.

Uses the given RNG to break ties.

◆ find_seeds()

std::vector< MinimizerMapper::Seed > vg::MinimizerMapper::find_seeds ( const VectorView< Minimizer > &  minimizers,
const Alignment aln,
Funnel funnel 
) const
protected

Find seeds for all minimizers passing the filters.

◆ fix_dozeu_end_deletions()

void vg::MinimizerMapper::fix_dozeu_end_deletions ( Alignment rescued_alignment) const
protected

When dozeu doesn't have any seeds, its scan heuristic can lead to inaccurate anchoring with the end result that one end of the alignment has a deletion that doesn't connect to an aligned base. This function removes those deletions.

◆ fix_dozeu_score()

void vg::MinimizerMapper::fix_dozeu_score ( Alignment rescued_alignment,
const HandleGraph rescue_graph,
const std::vector< handle_t > &  topological_order 
) const
protected

When we use dozeu for rescue, the reported alignment score is incorrect. 1) Dozeu only gives the full-length bonus once. 2) There is no penalty for a softclip at the edge of the subgraph. This function calculates the score correctly. If the score is <= 0, we realign the read using GSSW. TODO: This should be unnecessary.

◆ for_each_agglomeration_interval()

void vg::MinimizerMapper::for_each_agglomeration_interval ( const VectorView< Minimizer > &  minimizers,
const string &  sequence,
const string &  quality_bytes,
const vector< size_t > &  minimizer_indices,
const function< void(size_t, size_t, size_t, size_t)> &  iteratee 
)
staticprotected

Given a collection of minimizers, and a list of the minimizers we actually care about (as indices into the collection), iterate over common intervals of overlapping minimizer agglomerations.

Calls the given callback with (left, right, bottom, top), where left is the first base of the agglomeration interval (inclusive), right is the last base of the agglomeration interval (exclusive), bottom is the index of the first minimizer with an agglomeration in the interval and top is the index of the last minimizer with an agglomeration in the interval (exclusive).

minimizer_indices must be sorted by agglomeration end, and then by agglomeration start, so they can be decomposed into nice rectangles.

Note that bottom and top are offsets into minimizer_indices, NOT minimizers itself. Only contiguous ranges in minimizer_indices actually make sense.

◆ force_fragment_length_distr()

void vg::MinimizerMapper::force_fragment_length_distr ( double  mean,
double  stdev 
)
inline

◆ fragment_distr_is_finalized()

bool vg::MinimizerMapper::fragment_distr_is_finalized ( )
inline

◆ get_best_alignment_against_any_tree()

pair< Path, size_t > vg::MinimizerMapper::get_best_alignment_against_any_tree ( const vector< TreeSubgraph > &  trees,
const string &  sequence,
const Position default_position,
bool  pin_left,
size_t  longest_detectable_gap,
LazyRNG rng 
) const
protected

Find the best alignment of the given sequence against any of the trees provided in trees, where each tree is a TreeSubgraph over the GBWT graph. Each tree subgraph is rooted at the left in its own local coordinate space, even if we are pinning on the right.

If no mapping is possible (for example, because there are no trees), produce a pure insert at default_position.

Alignment is always pinned.

If pin_left is true, pin the alignment on the left to the root of each tree. Otherwise pin it on the right to the root of each tree.

Limits the length of the longest gap to longest_detectable_gap.

Returns alignments in gbwt_graph space.

◆ get_distance_limit()

size_t vg::MinimizerMapper::get_distance_limit ( size_t  read_length) const
inline

Get the distance limit for the given read length

◆ get_fragment_length_mean()

double vg::MinimizerMapper::get_fragment_length_mean ( ) const
inline

◆ get_fragment_length_sample_size()

size_t vg::MinimizerMapper::get_fragment_length_sample_size ( ) const
inline

◆ get_fragment_length_stdev()

double vg::MinimizerMapper::get_fragment_length_stdev ( ) const
inline

◆ get_log10_prob_of_disruption_in_interval()

double vg::MinimizerMapper::get_log10_prob_of_disruption_in_interval ( const VectorView< Minimizer > &  minimizers,
const string &  sequence,
const string &  quality_bytes,
const vector< size_t >::iterator &  disrupt_begin,
const vector< size_t >::iterator &  disrupt_end,
size_t  left,
size_t  right 
)
staticprotected

Gives the log10 prob of a base error in the given interval of the read, accounting for the disruption of specified minimizers.

minimizers is the collection of all minimizers

disrupt_begin and disrupt_end are iterators defining a sequence of indices of minimizers in minimizers that are disrupted.

left and right are the inclusive and exclusive bounds of the interval of the read where the disruption occurs.

◆ get_prob_of_disruption_in_column()

double vg::MinimizerMapper::get_prob_of_disruption_in_column ( const VectorView< Minimizer > &  minimizers,
const string &  sequence,
const string &  quality_bytes,
const vector< size_t >::iterator &  disrupt_begin,
const vector< size_t >::iterator &  disrupt_end,
size_t  index 
)
staticprotected

Gives the raw probability of a base error in the given column of the read, accounting for the disruption of specified minimizers.

minimizers is the collection of all minimizers

disrupt_begin and disrupt_end are iterators defining a sequence of indices of minimizers in minimizers that are disrupted.

index is the position in the read where the disruption occurs.

◆ get_tail_forest()

vector< TreeSubgraph > vg::MinimizerMapper::get_tail_forest ( const GaplessExtension extended_seed,
size_t  read_length,
bool  left_tails,
size_t *  longest_detectable_gap = nullptr 
) const
protected

Get all the trees defining tails off the specified side of the specified gapless extension. Should only be called if a tail on that side exists, or this is a waste of time.

If the gapless extension starts or ends at a node boundary, there may be multiple trees produced, each with a distinct root.

If the gapless extension abuts the edge of the read, an empty forest will be produced.

Each tree is represented as a TreeSubgraph over our gbwt_graph.

If left_tails is true, the trees read out of the left sides of the gapless extension. Otherwise they read out of the right side.

As a side effect, saves the length of the longest detectable gap in an alignment of a tail to the forest into the provided location, if set.

◆ immutable_path_from_length()

size_t vg::MinimizerMapper::immutable_path_from_length ( const ImmutablePath path)
staticprotected

Get the from length of an ImmutablePath.

Can't be called path_from_length or it will shadow the one for Paths instead of overloading.

◆ log_alignment() [1/2]

string vg::MinimizerMapper::log_alignment ( const Alignment aln)
staticprotected

Turn an Alignment into a conveniently-sized string for logging.

◆ log_alignment() [2/2]

string vg::MinimizerMapper::log_alignment ( const Path path,
bool  force_condensed = false 
)
staticprotected

Turn a Path from an alignment into a conveniently-sized string for logging.

◆ log_bits()

string vg::MinimizerMapper::log_bits ( const std::vector< bool > &  bits)
staticprotected

Turn a list of bit flags into a compact representation.

◆ log_name()

string vg::MinimizerMapper::log_name ( )
staticprotected

Get the thread identifier prefix for logging.

◆ map() [1/2]

vector< Alignment > vg::MinimizerMapper::map ( Alignment aln)

Map the given read. Return a vector of alignments that it maps to, winner first.

◆ map() [2/2]

void vg::MinimizerMapper::map ( Alignment aln,
AlignmentEmitter alignment_emitter 
)

Map the given read, and send output to the given AlignmentEmitter. May be run from any thread. TODO: Can't be const because the clusterer's cluster_seeds isn't const.

◆ map_from_chains()

vector< Alignment > vg::MinimizerMapper::map_from_chains ( Alignment aln)

Map the given read using chaining of seeds. Return a vector of alignments that it maps to, winner first.

◆ map_from_extensions()

vector< Alignment > vg::MinimizerMapper::map_from_extensions ( Alignment aln)

Map the given read using gapless extensions. Return a vector of alignments that it maps to, winner first.

◆ map_paired() [1/2]

pair< vector< Alignment >, vector< Alignment > > vg::MinimizerMapper::map_paired ( Alignment aln1,
Alignment aln2 
)

Map the given pair of reads, where aln1 is upstream of aln2 and they are oriented towards each other in the graph.

If the fragment length distribution is not yet fixed, reads will be mapped independently. Otherwise, they will be mapped according to the fragment length distribution.

◆ map_paired() [2/2]

pair< vector< Alignment >, vector< Alignment > > vg::MinimizerMapper::map_paired ( Alignment aln1,
Alignment aln2,
vector< pair< Alignment, Alignment >> &  ambiguous_pair_buffer 
)

Map the given pair of reads, where aln1 is upstream of aln2 and they are oriented towards each other in the graph.

If the reads are ambiguous and there's no fragment length distribution fixed yet, they will be dropped into ambiguous_pair_buffer.

Otherwise, at least one result will be returned for them (although it may be the unmapped alignment).

◆ no_chain_info()

static gbwtgraph::Payload vg::MinimizerMapper::no_chain_info ( )
inlinestaticprotected

How should we initialize chain info when it's not stored in the minimizer index?

◆ pair_all()

void vg::MinimizerMapper::pair_all ( std::array< vector< Alignment >, 2 > &  mappings) const
protected

Set pair partner references for paired mapping results.

◆ process_until_threshold_a()

template<typename Score >
void vg::MinimizerMapper::process_until_threshold_a ( size_t  items,
const function< Score(size_t)> &  get_score,
double  threshold,
size_t  min_count,
size_t  max_count,
LazyRNG rng,
const function< bool(size_t)> &  process_item,
const function< void(size_t)> &  discard_item_by_count,
const function< void(size_t)> &  discard_item_by_score 
) const
protected

Given a count of items, a function to get the score of each, a score-difference-from-the-best cutoff, a min and max processed item count, and a function to get a sort-shuffling seed for breaking ties, process items in descending score order by calling process_item with the item's number, until min_count items are processed and either max_count items are processed or the score difference threshold is hit (or we run out of items).

If process_item returns false, the item is skipped and does not count against min_count or max_count.

Call discard_item_by_count with the item's number for all remaining items that would pass the score threshold.

Call discard_item_by_score with the item's number for all remaining items that would fail the score threshold.

◆ process_until_threshold_b()

template<typename Score >
void vg::MinimizerMapper::process_until_threshold_b ( const vector< Score > &  scores,
double  threshold,
size_t  min_count,
size_t  max_count,
LazyRNG rng,
const function< bool(size_t)> &  process_item,
const function< void(size_t)> &  discard_item_by_count,
const function< void(size_t)> &  discard_item_by_score 
) const
protected

Same as the other process_until_threshold functions, except using a vector to supply scores.

◆ process_until_threshold_c()

template<typename Score >
void vg::MinimizerMapper::process_until_threshold_c ( size_t  items,
const function< Score(size_t)> &  get_score,
const function< bool(size_t, size_t)> &  comparator,
double  threshold,
size_t  min_count,
size_t  max_count,
LazyRNG get_seed,
const function< bool(size_t)> &  process_item,
const function< void(size_t)> &  discard_item_by_count,
const function< void(size_t)> &  discard_item_by_score 
) const
protected

Same as the other process_until_threshold functions, except user supplies comparator to sort the items (must still be sorted by score).

◆ reseed_between()

std::vector< MinimizerMapper::Seed > vg::MinimizerMapper::reseed_between ( size_t  read_region_start,
size_t  read_region_end,
pos_t  left_graph_pos,
pos_t  right_graph_pos,
const HandleGraph graph,
const VectorView< Minimizer > &  minimizers,
const std::function< void(const Minimizer &, const std::vector< nid_t > &, const std::function< void(const pos_t &)> &)> &  for_each_pos_for_source_in_subgraph 
) const
protected

Reseed between the given graph and read positions. Produces new seeds by asking the given callback for minimizers' occurrence positions. Up to one end of the graph region can be a read end, with a pos_t matching is_empty(). The read region always needs to be fully defined.

◆ score_alignment_pair()

double vg::MinimizerMapper::score_alignment_pair ( Alignment aln1,
Alignment aln2,
int64_t  fragment_distance 
)
protected

Score a pair of alignments given the distance between them

◆ score_cluster()

void vg::MinimizerMapper::score_cluster ( Cluster cluster,
size_t  i,
const VectorView< Minimizer > &  minimizers,
const std::vector< Seed > &  seeds,
size_t  seq_length,
Funnel funnel 
) const
protected

Determine cluster score, read coverage, and a vector of flags for the minimizers present in the cluster. Score is the sum of the scores of distinct minimizers in the cluster, while read coverage is the fraction of the read covered by seeds in the cluster.

Puts the cluster in the funnel as coming from its seeds.

◆ score_extension_group()

int vg::MinimizerMapper::score_extension_group ( const Alignment aln,
const vector< GaplessExtension > &  extended_seeds,
int  gap_open_penalty,
int  gap_extend_penalty 
)
staticprotected

Score the given group of gapless extensions. Determines the best score that can be obtained by chaining extensions together, using the given gap open and gap extend penalties to charge for either overlaps or gaps in coverage of the read.

Enforces that overlaps cannot result in containment.

Input extended seeds must be sorted by start position.

◆ score_extensions() [1/2]

std::vector< int > vg::MinimizerMapper::score_extensions ( const std::vector< std::pair< std::vector< GaplessExtension >, size_t >> &  extensions,
const Alignment aln,
Funnel funnel 
) const
protected

Score the set of extensions for each cluster using score_extension_group(). Return the scores in the same order as the extensions.

This version allows the collections of extensions to be scored to come with annotating read numbers, which are ignored.

◆ score_extensions() [2/2]

std::vector< int > vg::MinimizerMapper::score_extensions ( const std::vector< std::vector< GaplessExtension >> &  extensions,
const Alignment aln,
Funnel funnel 
) const
protected

Score the set of extensions for each cluster using score_extension_group(). Return the scores in the same order as the extension groups.

◆ score_merged_cluster()

void vg::MinimizerMapper::score_merged_cluster ( Cluster cluster,
size_t  i,
const VectorView< Minimizer > &  minimizers,
const std::vector< Seed > &  seeds,
size_t  first_new_seed,
const std::vector< size_t > &  seed_to_precluster,
const std::vector< Cluster > &  preclusters,
size_t  seq_length,
Funnel funnel 
) const
protected

Determine cluster score, read coverage, and a vector of flags for the minimizers present in the cluster. Score is the sum of the scores of distinct minimizers in the cluster, while read coverage is the fraction of the read covered by seeds in the cluster.

Thinks of the cluster as being made out of some previous clusters and some new seeds from the tail end of seeds, which are already in the funnel, clusters first. seed_to_precluster maps from seed to the old cluster it is part of, or std::numeric_limits<size_t>::max() if it isn't from an old cluster.

Puts the cluster in the funnel.

◆ seeds_in_subgraph()

GaplessExtender::cluster_type vg::MinimizerMapper::seeds_in_subgraph ( const VectorView< Minimizer > &  minimizers,
const std::unordered_set< nid_t > &  subgraph 
) const
protected

Return all the non-redundant seeds in the subgraph, including those from minimizers not used for mapping.

◆ set_alignment_scores() [1/4]

void vg::AlignerClient::set_alignment_scores

Set the aligner scoring parameters and create the stored aligner instances. The score matrix should be a 4 x 4 array in the order (ACGT). Other overloads of set_alignment_scores all call this one.

◆ set_alignment_scores() [2/4]

void vg::MinimizerMapper::set_alignment_scores ( const int8_t *  score_matrix,
int8_t  gap_open,
int8_t  gap_extend,
int8_t  full_length_bonus 
)
virtual

Set the aligner scoring parameters and create the stored aligner instances. The score matrix should be a 4 x 4 array in the order (ACGT). Other overloads of set_alignment_scores all call this one.

Reimplemented from vg::AlignerClient.

◆ set_alignment_scores() [3/4]

void vg::AlignerClient::set_alignment_scores

Set all the aligner scoring parameters and create the stored aligner instances.

◆ set_alignment_scores() [4/4]

void vg::AlignerClient::set_alignment_scores

Set the aligner scoring parameters and create the stored aligner instances. The stream should contain a 4 x 4 whitespace-separated substitution matrix (in the order ACGT).

◆ sort_minimizers_by_score()

std::vector< size_t > vg::MinimizerMapper::sort_minimizers_by_score ( const std::vector< Minimizer > &  minimizers) const
protected

Return the indices of all the minimizers, sorted in descending order by their minimizers' scores.

◆ tag_seeds()

void vg::MinimizerMapper::tag_seeds ( const Alignment aln,
const std::vector< Seed >::const_iterator &  begin,
const std::vector< Seed >::const_iterator &  end,
const VectorView< Minimizer > &  minimizers,
size_t  funnel_offset,
Funnel funnel 
) const
protected

If tracking correctness, mark seeds that are correctly mapped as correct in the funnel, based on proximity along paths to the input read's refpos. Otherwise, tag just as placed, with the seed's read interval. Assumes we are tracking provenance.

◆ to_anchor()

algorithms::Anchor vg::MinimizerMapper::to_anchor ( const Alignment aln,
const VectorView< Minimizer > &  minimizers,
const Seed seed 
) const
protected

Convert a single seed to a single chaining anchor.

◆ to_anchors()

std::vector< algorithms::Anchor > vg::MinimizerMapper::to_anchors ( const Alignment aln,
const VectorView< Minimizer > &  minimizers,
const std::vector< Seed > &  seeds 
) const
protected

Convert a collection of seeds to a collection of chaining anchors.

◆ to_path()

Path vg::MinimizerMapper::to_path ( const ImmutablePath path)
staticprotected

Convert an ImmutablePath to a Path.

◆ to_wfa_alignment()

WFAAlignment vg::MinimizerMapper::to_wfa_alignment ( const algorithms::Anchor anchor) const
protected

Convert an Anchor to a WFAAlignment.

◆ unoriented_distance_between()

int64_t vg::MinimizerMapper::unoriented_distance_between ( const pos_t pos1,
const pos_t pos2 
) const
protected

Get the unoriented distance between a pair of positions

◆ validate_clusters()

bool vg::MinimizerMapper::validate_clusters ( const std::vector< std::vector< Cluster >> &  clusters,
const std::vector< std::vector< Seed >> &  seeds,
size_t  read_limit,
size_t  fragment_limit 
) const
protected

Do a brute check of the clusters. Print errors to stderr.

◆ wfa_alignment_to_alignment()

void vg::MinimizerMapper::wfa_alignment_to_alignment ( const WFAAlignment wfa_alignment,
Alignment alignment 
) const
protected

Convert a WFAAlignment into a vg Alignment. This assumes that the WFAAlignment is a full-length alignment and that the sequence field of the vg Alignment has been set.

◆ window_breaking_quality()

static double vg::MinimizerMapper::window_breaking_quality ( const VectorView< Minimizer > &  minimizers,
vector< size_t > &  broken,
const string &  sequence,
const string &  quality_bytes 
)
staticprotected

Compute a bound on the Phred score probability of having created the agglomerations of the specified minimizers by base errors from the given sequence, which was sequenced with the given qualities.

No limit is imposed if broken is empty.

Takes the collection of all minimizers found, and a vector of the indices of minimizers we are interested in the agglomerations of. May modify the order of that index vector.

Also takes the sequence of the read (to avoid Ns) and the quality string (interpreted as a byte array).

Currently computes a lower-score-bound, upper-probability-bound, suitable for use as a mapping quality cap, by assuming the easiest-to-disrupt possible layout of the windows, and the lowest possible qualities for the disrupting bases.

◆ with_dagified_local_graph()

void vg::MinimizerMapper::with_dagified_local_graph ( const pos_t left_anchor,
const pos_t right_anchor,
size_t  max_path_length,
const HandleGraph graph,
const std::function< void(DeletableHandleGraph &, const std::function< std::pair< nid_t, bool >(const handle_t &)> &)> &  callback 
)
staticprotected

Clip out the part of the graph between the given positions, and dagify it from the perspective of the anchors. If a left anchor is set, all heads should correspond to the left anchor, and if a right anchor is set, all tails should correspond to the right anchor. At least one anchor must be set.

Calls the callback with an extracted, strand-split, dagified graph, and a function that translates from handle in the dagified graph to node ID and orientation in the base graph.

Friends And Related Function Documentation

◆ TestMinimizerMapper

friend class TestMinimizerMapper
friend

Member Data Documentation

◆ align_from_chains

bool vg::MinimizerMapper::align_from_chains = default_align_from_chains

◆ chain_min_score

int vg::MinimizerMapper::chain_min_score = default_chain_min_score

◆ chain_score_threshold

double vg::MinimizerMapper::chain_score_threshold = default_chain_score_threshold

◆ chaining_cluster_distance

size_t vg::MinimizerMapper::chaining_cluster_distance = default_chaining_cluster_distance

◆ cluster_coverage_threshold

double vg::MinimizerMapper::cluster_coverage_threshold = default_cluster_coverage_threshold

◆ cluster_score_threshold

double vg::MinimizerMapper::cluster_score_threshold = default_cluster_score_threshold

◆ clusterer

SnarlDistanceIndexClusterer vg::MinimizerMapper::clusterer
protected

We have a clusterer.

◆ default_align_from_chains

constexpr bool vg::MinimizerMapper::default_align_from_chains = false
staticconstexpr

If true, produce alignments from extension sets by chaining gapless extensions up and aligning the sequences between them. If false, produce alignments by aligning the tails off of individual gapless extensions.

◆ default_chain_min_score

constexpr int vg::MinimizerMapper::default_chain_min_score = 100
staticconstexpr

Even if we would have fewer than min_chains results, don't process anything with a score smaller than this.

◆ default_chain_score_threshold

constexpr double vg::MinimizerMapper::default_chain_score_threshold = 100
staticconstexpr

If a chain's score is smaller than the best chain's score by more than this much, don't align it

◆ default_chaining_cluster_distance

constexpr size_t vg::MinimizerMapper::default_chaining_cluster_distance = 100
staticconstexpr

What read-length-independent distance threshold do we want to use for clustering?

◆ default_cluster_coverage_threshold

constexpr double vg::MinimizerMapper::default_cluster_coverage_threshold = 0.3
staticconstexpr

If the read coverage of a cluster is less than the best coverage of any cluster by more than this much, don't extend it

◆ default_cluster_score_threshold

constexpr double vg::MinimizerMapper::default_cluster_score_threshold = 50
staticconstexpr

If a cluster's score is smaller than the best score of any cluster by more than this much, then don't extend it

◆ default_distance_limit

constexpr size_t vg::MinimizerMapper::default_distance_limit = 200
staticconstexpr

◆ default_do_dp

constexpr bool vg::MinimizerMapper::default_do_dp = true
staticconstexpr

If false, skip computing base-level alignments.

◆ default_exclude_overlapping_min

constexpr bool vg::MinimizerMapper::default_exclude_overlapping_min = false
staticconstexpr

If set, exclude overlapping minimizers.

◆ default_extension_score_threshold

constexpr int vg::MinimizerMapper::default_extension_score_threshold = 1
staticconstexpr

◆ default_extension_set_min_score

constexpr int vg::MinimizerMapper::default_extension_set_min_score = 20
staticconstexpr

Even if we would have fewer than min_extension_sets results, don't process anything with a score smaller than this.

◆ default_extension_set_score_threshold

constexpr double vg::MinimizerMapper::default_extension_set_score_threshold = 20
staticconstexpr

◆ default_hard_hit_cap

constexpr size_t vg::MinimizerMapper::default_hard_hit_cap = 500
staticconstexpr

Ignore all minimizers with more than hard_hit_cap hits.

◆ default_hit_cap

constexpr size_t vg::MinimizerMapper::default_hit_cap = 10
staticconstexpr

Use all minimizers with at most hit_cap hits.

◆ default_initial_lookback_threshold

constexpr size_t vg::MinimizerMapper::default_initial_lookback_threshold = 10
staticconstexpr

How many bases should we try to look back initially when chaining?

◆ default_item_bonus

constexpr int vg::MinimizerMapper::default_item_bonus = 0
staticconstexpr

How much of a bonus should we give to each item in chaining?

◆ default_lookback_item_hard_cap

constexpr size_t vg::MinimizerMapper::default_lookback_item_hard_cap = 15
staticconstexpr

How many chaining sources should we allow ourselves to consider ever?

◆ default_lookback_scale_factor

constexpr double vg::MinimizerMapper::default_lookback_scale_factor = 2.0
staticconstexpr

How much should we increase lookback when we can't find anything good?

◆ default_max_alignments

constexpr size_t vg::MinimizerMapper::default_max_alignments = 8
staticconstexpr

How many extended clusters should we align, max?

◆ default_max_chain_connection

constexpr size_t vg::MinimizerMapper::default_max_chain_connection = 100
staticconstexpr

When converting chains to alignments, what's the longest gap between items we will actually try to align? Passing strings longer than ~100bp can cause WFAAligner to run for a pathologically long amount of time. May not be 0.

◆ default_max_clusters_to_chain

constexpr size_t vg::MinimizerMapper::default_max_clusters_to_chain = 20
staticconstexpr

How many clusters should we produce chains for, max?

◆ default_max_dozeu_cells

constexpr size_t vg::MinimizerMapper::default_max_dozeu_cells = (size_t)(1.5 * 1024 * 1024)
staticconstexpr

How big of an alignment in POA cells should we ever try to do with Dozeu? TODO: Lift this when Dozeu's allocator is able to work with >4 MB of memory. Each cell is 16 bits in Dozeu, and we leave some room for the query and padding to full SSE registers. Note that a very chopped graph might still break this!

◆ default_max_dp_cells

constexpr size_t vg::MinimizerMapper::default_max_dp_cells = 16UL * 1024UL * 1024UL
staticconstexpr

How many DP cells should we be willing to do in GSSW for an end-pinned alignment? If we want to do more than this, just leave tail unaligned.

◆ default_max_extensions

constexpr size_t vg::MinimizerMapper::default_max_extensions = 800
staticconstexpr

How many clusters should we produce gapless extensions for, max?

◆ default_max_fragment_length

constexpr size_t vg::MinimizerMapper::default_max_fragment_length = 2000
staticconstexpr

What is the maximum fragment length that we accept as valid for paired-end reads?

◆ default_max_indel_bases

constexpr size_t vg::MinimizerMapper::default_max_indel_bases = 50
staticconstexpr

How many bases of indel should we allow in chaining?

◆ default_max_local_extensions

constexpr size_t vg::MinimizerMapper::default_max_local_extensions = numeric_limits<size_t>::max()
staticconstexpr

How many extensions should we try as seeds within a mapping location?

◆ default_max_lookback_bases

constexpr size_t vg::MinimizerMapper::default_max_lookback_bases = 100
staticconstexpr

How many bases should we look back when chaining? Needs to be about the same as the clustering distance or we will be able to cluster but not chain.

◆ default_max_multimaps

constexpr size_t vg::MinimizerMapper::default_max_multimaps = 1
staticconstexpr

◆ default_max_precluster_connections

constexpr size_t vg::MinimizerMapper::default_max_precluster_connections = 50
staticconstexpr

How many connections between preclusters should we reseed over, maximum?

◆ default_max_rescue_attempts

constexpr size_t vg::MinimizerMapper::default_max_rescue_attempts = 15
staticconstexpr

For paired end mapping, how many times should we attempt rescue (per read)?

◆ default_max_tail_length

constexpr size_t vg::MinimizerMapper::default_max_tail_length = 100
staticconstexpr

Similarly, what is the maximum tail length we will try to align?

◆ default_max_unique_min

constexpr size_t vg::MinimizerMapper::default_max_unique_min = 500
staticconstexpr

Maximum number of distinct minimizers to take.

◆ default_min_chains

constexpr int vg::MinimizerMapper::default_min_chains = 1
staticconstexpr

Disregard the chain score thresholds when they would give us fewer than this many chains.

◆ default_min_clusters_to_chain

constexpr size_t vg::MinimizerMapper::default_min_clusters_to_chain = 2
staticconstexpr

Accept at least this many clusters for chain generation.

◆ default_min_extension_sets

constexpr int vg::MinimizerMapper::default_min_extension_sets = 2
staticconstexpr

Disregard the extension set score thresholds when they would give us fewer than this many extension sets.

◆ default_min_extensions

constexpr size_t vg::MinimizerMapper::default_min_extensions = 2
staticconstexpr

Accept at least this many clusters for gapless extension.

◆ default_min_good_transition_score_per_base

constexpr double vg::MinimizerMapper::default_min_good_transition_score_per_base = -0.1
staticconstexpr

How bad can a transition be per base before lookback accepts it?

◆ default_min_lookback_items

constexpr size_t vg::MinimizerMapper::default_min_lookback_items = 1
staticconstexpr

How many chaining sources should we make sure to consider regardless of distance?

◆ default_min_precluster_connections

constexpr size_t vg::MinimizerMapper::default_min_precluster_connections = 10
staticconstexpr

How many connections between preclusters should we reseed over, minimum?

◆ default_minimizer_score_fraction

constexpr double vg::MinimizerMapper::default_minimizer_score_fraction = 0.9
staticconstexpr

Take minimizers between hit_cap and hard_hit_cap hits until this fraction of the total score is reached.

◆ default_num_bp_per_min

constexpr size_t vg::MinimizerMapper::default_num_bp_per_min = 1000
staticconstexpr

Number of minimizers to select based on read_len/num_bp_per_min.

◆ default_pad_cluster_score_threshold

constexpr double vg::MinimizerMapper::default_pad_cluster_score_threshold = 20
staticconstexpr

If the second best cluster's score is no more than this many points below the cutoff set by cluster_score_threshold, snap that cutoff down to the second best cluster's score, to avoid throwing away promising secondaries.

◆ default_paired_distance_stdevs

constexpr double vg::MinimizerMapper::default_paired_distance_stdevs = 2.0
staticconstexpr

◆ default_paired_rescue_score_limit

constexpr double vg::MinimizerMapper::default_paired_rescue_score_limit = 0.9
staticconstexpr

How close does an alignment have to be to the best alignment for us to rescue on it?

◆ default_precluster_connection_coverage_threshold

constexpr double vg::MinimizerMapper::default_precluster_connection_coverage_threshold = 0.3
staticconstexpr

If the read coverage of a precluster connection is less than the best of any by more than this much, don't extend it.

◆ default_rescue_seed_limit

constexpr size_t vg::MinimizerMapper::default_rescue_seed_limit = 100
staticconstexpr

Do not attempt rescue if there are more than this many seeds in the rescue subgraph.

◆ default_rescue_subgraph_stdevs

constexpr double vg::MinimizerMapper::default_rescue_subgraph_stdevs = 4.0
staticconstexpr

How many stdevs from the mean do we extract a subgraph from?

◆ default_reseed_search_distance

constexpr size_t vg::MinimizerMapper::default_reseed_search_distance = 10000
staticconstexpr

When connecting subclusters for reseeding, how far should we search?

◆ default_show_work

constexpr bool vg::MinimizerMapper::default_show_work = false
staticconstexpr

If set, log what the mapper is thinking in its mapping of each read.

◆ default_track_correctness

constexpr bool vg::MinimizerMapper::default_track_correctness = false
staticconstexpr

Guess which seed hits are correct by location in the linear reference and track if/when their descendants make it through stages of the algorithm. Only works if track_provenance is true.

◆ default_track_provenance

constexpr bool vg::MinimizerMapper::default_track_provenance = false
staticconstexpr

Track which internal work items came from which others during each stage of the mapping algorithm.

◆ distance_index

SnarlDistanceIndex* vg::MinimizerMapper::distance_index
protected

◆ distance_limit

size_t vg::MinimizerMapper::distance_limit = default_distance_limit

◆ do_dp

bool vg::MinimizerMapper::do_dp = default_do_dp

◆ exclude_overlapping_min

bool vg::MinimizerMapper::exclude_overlapping_min = default_exclude_overlapping_min

◆ extender

std::unique_ptr<GaplessExtender> vg::MinimizerMapper::extender
protected

We have a gapless extender to extend seed hits in haplotype space. Because this needs a reference to an Aligner, and because changing the scoring parameters deletes all the aligners, we need to keep this somewhere we can clear out.

◆ extension_score_threshold

int vg::MinimizerMapper::extension_score_threshold = default_extension_score_threshold

◆ extension_set_min_score

int vg::MinimizerMapper::extension_set_min_score = default_extension_set_min_score

◆ extension_set_score_threshold

double vg::MinimizerMapper::extension_set_score_threshold = default_extension_set_score_threshold

◆ fragment_length_distr

FragmentLengthDistribution vg::MinimizerMapper::fragment_length_distr
protected

We have a distribution for read fragment lengths that takes care of knowing when we've observed enough good ones to learn a good distribution.

◆ gbwt_graph

const gbwtgraph::GBWTGraph& vg::MinimizerMapper::gbwt_graph
protected

This is our primary graph.

◆ hard_hit_cap

size_t vg::MinimizerMapper::hard_hit_cap = default_hard_hit_cap

◆ hit_cap

size_t vg::MinimizerMapper::hit_cap = default_hit_cap

◆ initial_lookback_threshold

size_t vg::MinimizerMapper::initial_lookback_threshold = default_initial_lookback_threshold

◆ item_bonus

int vg::MinimizerMapper::item_bonus = default_item_bonus

◆ LONG_LIMIT

const static size_t vg::MinimizerMapper::LONG_LIMIT = 256
staticprotected

Length at which we cut over to long-alignment logging.

◆ lookback_item_hard_cap

size_t vg::MinimizerMapper::lookback_item_hard_cap = default_lookback_item_hard_cap

◆ lookback_scale_factor

double vg::MinimizerMapper::lookback_scale_factor = default_lookback_scale_factor

◆ MANY_LIMIT

const static size_t vg::MinimizerMapper::MANY_LIMIT = 20
staticprotected

Count at which we cut over to summary logging.

◆ max_alignments

size_t vg::MinimizerMapper::max_alignments = default_max_alignments

◆ max_chain_connection

size_t vg::MinimizerMapper::max_chain_connection = default_max_chain_connection

◆ max_clusters_to_chain

size_t vg::MinimizerMapper::max_clusters_to_chain = default_max_clusters_to_chain

◆ max_dozeu_cells

size_t vg::MinimizerMapper::max_dozeu_cells = default_max_dozeu_cells

◆ max_dp_cells

size_t vg::MinimizerMapper::max_dp_cells = default_max_dp_cells

◆ MAX_DP_LENGTH

constexpr int vg::MinimizerMapper::MAX_DP_LENGTH = 30000
staticconstexpr

How long of a DP can we do before GSSW crashes due to 16-bit score overflow?

◆ max_extensions

size_t vg::MinimizerMapper::max_extensions = default_max_extensions

◆ max_fragment_length

size_t vg::MinimizerMapper::max_fragment_length = default_max_fragment_length

◆ max_indel_bases

size_t vg::MinimizerMapper::max_indel_bases = default_max_indel_bases

◆ max_local_extensions

size_t vg::MinimizerMapper::max_local_extensions = default_max_local_extensions

◆ max_lookback_bases

size_t vg::MinimizerMapper::max_lookback_bases = default_max_lookback_bases

◆ max_multimaps

size_t vg::MinimizerMapper::max_multimaps = default_max_multimaps

◆ max_precluster_connections

size_t vg::MinimizerMapper::max_precluster_connections = default_max_precluster_connections

◆ max_rescue_attempts

size_t vg::MinimizerMapper::max_rescue_attempts = default_max_rescue_attempts

◆ max_tail_length

size_t vg::MinimizerMapper::max_tail_length = default_max_tail_length

◆ max_unique_min

size_t vg::MinimizerMapper::max_unique_min = default_max_unique_min

◆ min_chains

int vg::MinimizerMapper::min_chains = default_min_chains

◆ min_clusters_to_chain

size_t vg::MinimizerMapper::min_clusters_to_chain = default_min_clusters_to_chain

◆ min_extension_sets

int vg::MinimizerMapper::min_extension_sets = default_min_extension_sets

◆ min_extensions

size_t vg::MinimizerMapper::min_extensions = default_min_extensions

◆ min_good_transition_score_per_base

double vg::MinimizerMapper::min_good_transition_score_per_base = default_min_good_transition_score_per_base

◆ min_lookback_items

size_t vg::MinimizerMapper::min_lookback_items = default_min_lookback_items

◆ min_precluster_connections

size_t vg::MinimizerMapper::min_precluster_connections = default_min_precluster_connections

◆ minimizer_index

const gbwtgraph::DefaultMinimizerIndex& vg::MinimizerMapper::minimizer_index
protected

◆ minimizer_score_fraction

double vg::MinimizerMapper::minimizer_score_fraction = default_minimizer_score_fraction

◆ num_bp_per_min

size_t vg::MinimizerMapper::num_bp_per_min = default_num_bp_per_min

◆ pad_cluster_score_threshold

double vg::MinimizerMapper::pad_cluster_score_threshold = default_pad_cluster_score_threshold

◆ paired_distance_stdevs

double vg::MinimizerMapper::paired_distance_stdevs = default_paired_distance_stdevs

◆ paired_rescue_score_limit

double vg::MinimizerMapper::paired_rescue_score_limit = default_paired_rescue_score_limit

◆ path_graph

const PathPositionHandleGraph* vg::MinimizerMapper::path_graph
protected

◆ precluster_connection_coverage_threshold

double vg::MinimizerMapper::precluster_connection_coverage_threshold = default_precluster_connection_coverage_threshold

◆ read_group

string vg::MinimizerMapper::read_group

Apply this read group name.

◆ rescue_algorithm

RescueAlgorithm vg::MinimizerMapper::rescue_algorithm = rescue_dozeu

The algorithm used for rescue.

◆ rescue_seed_limit

size_t vg::MinimizerMapper::rescue_seed_limit = default_rescue_seed_limit

◆ rescue_subgraph_stdevs

double vg::MinimizerMapper::rescue_subgraph_stdevs = default_rescue_subgraph_stdevs

◆ reseed_search_distance

size_t vg::MinimizerMapper::reseed_search_distance = default_reseed_search_distance

◆ sample_name

string vg::MinimizerMapper::sample_name

Apply this sample name.

◆ show_work

bool vg::MinimizerMapper::show_work = default_show_work

◆ track_correctness

bool vg::MinimizerMapper::track_correctness = default_track_correctness

◆ track_provenance

bool vg::MinimizerMapper::track_provenance = default_track_provenance

◆ warned_about_bad_distribution

atomic_flag vg::MinimizerMapper::warned_about_bad_distribution = ATOMIC_FLAG_INIT
protected

We may need to complain exactly once that the distribution is bad.

◆ warned_about_rescue_size

atomic_flag vg::MinimizerMapper::warned_about_rescue_size = ATOMIC_FLAG_INIT

Have we complained about hitting the size limit for rescue?

◆ warned_about_tail_size

atomic_flag vg::MinimizerMapper::warned_about_tail_size = ATOMIC_FLAG_INIT
mutable

Have we complained about hitting the size limit for tails?


The documentation for this class was generated from the following files: