vg
tools for working with variation graphs
|
#include <minimizer_mapper.hpp>
Classes | |
struct | Minimizer |
Public Types | |
enum | RescueAlgorithm { rescue_none, rescue_dozeu, rescue_gssw } |
Implemented rescue algorithms: no rescue, dozeu, GSSW. More... | |
typedef SnarlDistanceIndexClusterer::Seed | Seed |
The information we store for each seed. More... | |
Public Member Functions | |
MinimizerMapper (const gbwtgraph::GBWTGraph &graph, const gbwtgraph::DefaultMinimizerIndex &minimizer_index, SnarlDistanceIndex *distance_index, const PathPositionHandleGraph *path_graph=nullptr) | |
virtual void | set_alignment_scores (const int8_t *score_matrix, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) |
void | map (Alignment &aln, AlignmentEmitter &alignment_emitter) |
vector< Alignment > | map (Alignment &aln) |
vector< Alignment > | map_from_chains (Alignment &aln) |
vector< Alignment > | map_from_extensions (Alignment &aln) |
pair< vector< Alignment >, vector< Alignment > > | map_paired (Alignment &aln1, Alignment &aln2, vector< pair< Alignment, Alignment >> &ambiguous_pair_buffer) |
pair< vector< Alignment >, vector< Alignment > > | map_paired (Alignment &aln1, Alignment &aln2) |
bool | fragment_distr_is_finalized () |
void | finalize_fragment_length_distr () |
void | force_fragment_length_distr (double mean, double stdev) |
double | get_fragment_length_mean () const |
double | get_fragment_length_stdev () const |
size_t | get_fragment_length_sample_size () const |
size_t | get_distance_limit (size_t read_length) const |
virtual void | set_alignment_scores (int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) |
Set all the aligner scoring parameters and create the stored aligner instances. More... | |
virtual void | set_alignment_scores (std::istream &matrix_stream, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) |
virtual void | set_alignment_scores (const int8_t *score_matrix, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) |
Public Member Functions inherited from vg::AlignerClient | |
virtual void | set_alignment_scores (int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) |
Set all the aligner scoring parameters and create the stored aligner instances. More... | |
virtual void | set_alignment_scores (std::istream &matrix_stream, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) |
Static Public Attributes | |
static constexpr size_t | default_hit_cap = 10 |
Use all minimizers with at most hit_cap hits. More... | |
static constexpr size_t | default_hard_hit_cap = 500 |
Ignore all minimizers with more than hard_hit_cap hits. More... | |
static constexpr double | default_minimizer_score_fraction = 0.9 |
static constexpr size_t | default_max_unique_min = 500 |
Maximum number of distinct minimizers to take. More... | |
static constexpr size_t | default_num_bp_per_min = 1000 |
Number of minimzers to select based on read_len/num_min_per_bp. More... | |
static constexpr bool | default_exclude_overlapping_min = false |
If set, exclude overlapping minimizers. More... | |
static constexpr size_t | default_min_extensions = 2 |
Accept at least this many clusters for gapless extension. More... | |
static constexpr size_t | default_max_extensions = 800 |
How many clusters should we produce gapless extensions for, max? More... | |
static constexpr double | default_extension_set_score_threshold = 20 |
static constexpr int | default_extension_score_threshold = 1 |
static constexpr int | default_min_extension_sets = 2 |
static constexpr int | default_extension_set_min_score = 20 |
static constexpr size_t | default_max_alignments = 8 |
How many extended clusters should we align, max? More... | |
static constexpr size_t | default_max_local_extensions = numeric_limits<size_t>::max() |
How many extensions should we try as seeds within a mapping location? More... | |
static constexpr double | default_cluster_score_threshold = 50 |
static constexpr double | default_pad_cluster_score_threshold = 20 |
static constexpr double | default_cluster_coverage_threshold = 0.3 |
static constexpr bool | default_align_from_chains = false |
static constexpr size_t | default_chaining_cluster_distance = 100 |
What read-length-independent distance threshold do we want to use for clustering? More... | |
static constexpr double | default_precluster_connection_coverage_threshold = 0.3 |
static constexpr size_t | default_min_precluster_connections = 10 |
How many connections between preclusters should we reseed over, minimum? More... | |
static constexpr size_t | default_max_precluster_connections = 50 |
How many connections between preclusters should we reseed over, maximum? More... | |
static constexpr size_t | default_reseed_search_distance = 10000 |
When connecting subclusters for reseeding, how far should we search? More... | |
static constexpr size_t | default_min_clusters_to_chain = 2 |
Accept at least this many clusters for chain generation. More... | |
static constexpr size_t | default_max_clusters_to_chain = 20 |
How many clusters should we produce chains for, max? More... | |
static constexpr size_t | default_max_chain_connection = 100 |
static constexpr size_t | default_max_tail_length = 100 |
Similarly, what is the maximum tail length we will try to align? More... | |
static constexpr size_t | default_max_lookback_bases = 100 |
static constexpr size_t | default_min_lookback_items = 1 |
How many chaining sources should we make sure to consider regardless of distance? More... | |
static constexpr size_t | default_lookback_item_hard_cap = 15 |
How many chaining sources should we allow ourselves to consider ever? More... | |
static constexpr size_t | default_initial_lookback_threshold = 10 |
How many bases should we try to look back initially when chaining? More... | |
static constexpr double | default_lookback_scale_factor = 2.0 |
How much chould we increase lookback when we can't find anything good? More... | |
static constexpr double | default_min_good_transition_score_per_base = -0.1 |
How bad can a transition be per base before lookback accepts it? More... | |
static constexpr int | default_item_bonus = 0 |
How much of a bonus should we give to each item in chaining? More... | |
static constexpr size_t | default_max_indel_bases = 50 |
How many bases of indel should we allow in chaining? More... | |
static constexpr double | default_chain_score_threshold = 100 |
static constexpr int | default_min_chains = 1 |
static constexpr int | default_chain_min_score = 100 |
static constexpr int | MAX_DP_LENGTH = 30000 |
static constexpr size_t | default_max_dp_cells = 16UL * 1024UL * 1024UL |
static constexpr size_t | default_max_multimaps = 1 |
static constexpr size_t | default_distance_limit = 200 |
static constexpr bool | default_do_dp = true |
If false, skip computing base-level alignments. More... | |
static constexpr bool | default_track_provenance = false |
static constexpr bool | default_track_correctness = false |
static constexpr bool | default_show_work = false |
If set, log what the mapper is thinking in its mapping of each read. More... | |
static constexpr double | default_paired_distance_stdevs = 2.0 |
static constexpr double | default_paired_rescue_score_limit = 0.9 |
How close does an alignment have to be to the best alignment for us to rescue on it. More... | |
static constexpr double | default_rescue_subgraph_stdevs = 4.0 |
How many stdevs from the mean do we extract a subgraph from? More... | |
static constexpr size_t | default_rescue_seed_limit = 100 |
Do not attempt rescue if there are more seeds in the rescue subgraph. More... | |
static constexpr size_t | default_max_rescue_attempts = 15 |
For paired end mapping, how many times should we attempt rescue (per read)? More... | |
static constexpr size_t | default_max_dozeu_cells = (size_t)(1.5 * 1024 * 1024) |
static constexpr size_t | default_max_fragment_length = 2000 |
What is the maximum fragment length that we accept as valid for paired-end reads? More... | |
Protected Types | |
typedef SnarlDistanceIndexClusterer::Cluster | Cluster |
The information we store for each cluster. More... | |
using | ImmutablePath = structures::ImmutableList< Mapping > |
Protected Member Functions | |
double | distance_to_annotation (int64_t distance) const |
std::vector< algorithms::Anchor > | to_anchors (const Alignment &aln, const VectorView< Minimizer > &minimizers, const std::vector< Seed > &seeds) const |
Convert a collection of seeds to a collection of chaining anchors. More... | |
algorithms::Anchor | to_anchor (const Alignment &aln, const VectorView< Minimizer > &minimizers, const Seed &seed) const |
Convert a single seed to a single chaining anchor. More... | |
WFAAlignment | to_wfa_alignment (const algorithms::Anchor &anchor) const |
Convert an Anchor to a WFAAlignment. More... | |
std::vector< Minimizer > | find_minimizers (const std::string &sequence, Funnel &funnel) const |
std::vector< size_t > | sort_minimizers_by_score (const std::vector< Minimizer > &minimizers) const |
std::vector< Seed > | find_seeds (const VectorView< Minimizer > &minimizers, const Alignment &aln, Funnel &funnel) const |
void | tag_seeds (const Alignment &aln, const std::vector< Seed >::const_iterator &begin, const std::vector< Seed >::const_iterator &end, const VectorView< Minimizer > &minimizers, size_t funnel_offset, Funnel &funnel) const |
void | score_cluster (Cluster &cluster, size_t i, const VectorView< Minimizer > &minimizers, const std::vector< Seed > &seeds, size_t seq_length, Funnel &funnel) const |
void | score_merged_cluster (Cluster &cluster, size_t i, const VectorView< Minimizer > &minimizers, const std::vector< Seed > &seeds, size_t first_new_seed, const std::vector< size_t > &seed_to_precluster, const std::vector< Cluster > &preclusters, size_t seq_length, Funnel &funnel) const |
std::vector< Seed > | reseed_between (size_t read_region_start, size_t read_region_end, pos_t left_graph_pos, pos_t right_graph_pos, const HandleGraph &graph, const VectorView< Minimizer > &minimizers, const std::function< void(const Minimizer &, const std::vector< nid_t > &, const std::function< void(const pos_t &)> &)> &for_each_pos_for_source_in_subgraph) const |
vector< GaplessExtension > | extend_cluster (const Cluster &cluster, size_t cluster_num, const VectorView< Minimizer > &minimizers, const std::vector< Seed > &seeds, const string &sequence, vector< vector< size_t >> &minimizer_kept_cluster_count, Funnel &funnel) const |
std::vector< int > | score_extensions (const std::vector< std::vector< GaplessExtension >> &extensions, const Alignment &aln, Funnel &funnel) const |
std::vector< int > | score_extensions (const std::vector< std::pair< std::vector< GaplessExtension >, size_t >> &extensions, const Alignment &aln, Funnel &funnel) const |
Alignment | find_chain_alignment (const Alignment &aln, const VectorView< algorithms::Anchor > &to_chain, const std::vector< size_t > &chain) const |
void | find_optimal_tail_alignments (const Alignment &aln, const vector< GaplessExtension > &extended_seeds, LazyRNG &rng, Alignment &best, Alignment &second_best) const |
void | attempt_rescue (const Alignment &aligned_read, Alignment &rescued_alignment, const VectorView< Minimizer > &minimizers, bool rescue_forward) |
GaplessExtender::cluster_type | seeds_in_subgraph (const VectorView< Minimizer > &minimizers, const std::unordered_set< nid_t > &subgraph) const |
void | fix_dozeu_score (Alignment &rescued_alignment, const HandleGraph &rescue_graph, const std::vector< handle_t > &topological_order) const |
void | fix_dozeu_end_deletions (Alignment &rescued_alignment) const |
int64_t | distance_between (const pos_t &pos1, const pos_t &pos2) |
int64_t | distance_between (const Alignment &aln1, const Alignment &aln2) |
int64_t | unoriented_distance_between (const pos_t &pos1, const pos_t &pos2) const |
void | extension_to_alignment (const GaplessExtension &extension, Alignment &alignment) const |
void | wfa_alignment_to_alignment (const WFAAlignment &wfa_alignment, Alignment &alignment) const |
void | pair_all (std::array< vector< Alignment >, 2 > &mappings) const |
void | annotate_with_minimizer_statistics (Alignment &target, const VectorView< Minimizer > &minimizers, const std::vector< Seed > &seeds, size_t old_seed_count, size_t new_seed_offset, const Funnel &funnel) const |
double | compute_mapq_caps (const Alignment &aln, const VectorView< Minimizer > &minimizers, const SmallBitset &explored) |
vector< TreeSubgraph > | get_tail_forest (const GaplessExtension &extended_seed, size_t read_length, bool left_tails, size_t *longest_detectable_gap=nullptr) const |
pair< Path, size_t > | get_best_alignment_against_any_tree (const vector< TreeSubgraph > &trees, const string &sequence, const Position &default_position, bool pin_left, size_t longest_detectable_gap, LazyRNG &rng) const |
void | dfs_gbwt (const Position &from, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const |
void | dfs_gbwt (handle_t from_handle, size_t from_offset, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const |
void | dfs_gbwt (const gbwt::SearchState &start_state, size_t from_offset, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const |
double | score_alignment_pair (Alignment &aln1, Alignment &aln2, int64_t fragment_distance) |
template<typename Score = double> | |
void | process_until_threshold_a (size_t items, const function< Score(size_t)> &get_score, double threshold, size_t min_count, size_t max_count, LazyRNG &rng, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const |
template<typename Score = double> | |
void | process_until_threshold_b (const vector< Score > &scores, double threshold, size_t min_count, size_t max_count, LazyRNG &rng, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const |
template<typename Score = double> | |
void | process_until_threshold_c (size_t items, const function< Score(size_t)> &get_score, const function< bool(size_t, size_t)> &comparator, double threshold, size_t min_count, size_t max_count, LazyRNG &get_seed, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const |
bool | validate_clusters (const std::vector< std::vector< Cluster >> &clusters, const std::vector< std::vector< Seed >> &seeds, size_t read_limit, size_t fragment_limit) const |
Do a brute check of the clusters. Print errors to stderr. More... | |
Protected Member Functions inherited from vg::AlignerClient | |
AlignerClient (double gc_content_estimate=vg::default_gc_content) | |
const GSSWAligner * | get_aligner (bool have_qualities=true) const |
const QualAdjAligner * | get_qual_adj_aligner () const |
const Aligner * | get_regular_aligner () const |
Static Protected Member Functions | |
static gbwtgraph::Payload | no_chain_info () |
How should we initialize chain info when it's not stored in the minimizer index? More... | |
static Seed | chain_info_to_seed (const pos_t &hit, size_t minimizer, const gbwtgraph::Payload &chain_info) |
static int | score_extension_group (const Alignment &aln, const vector< GaplessExtension > &extended_seeds, int gap_open_penalty, int gap_extend_penalty) |
static void | with_dagified_local_graph (const pos_t &left_anchor, const pos_t &right_anchor, size_t max_path_length, const HandleGraph &graph, const std::function< void(DeletableHandleGraph &, const std::function< std::pair< nid_t, bool >(const handle_t &)> &)> &callback) |
static void | align_sequence_between (const pos_t &left_anchor, const pos_t &right_anchor, size_t max_path_length, const HandleGraph *graph, const GSSWAligner *aligner, Alignment &alignment, size_t max_dp_cells=std::numeric_limits< size_t >::max()) |
static double | window_breaking_quality (const VectorView< Minimizer > &minimizers, vector< size_t > &broken, const string &sequence, const string &quality_bytes) |
static double | faster_cap (const VectorView< Minimizer > &minimizers, vector< size_t > &minimizers_explored, const string &sequence, const string &quality_bytes) |
static void | for_each_agglomeration_interval (const VectorView< Minimizer > &minimizers, const string &sequence, const string &quality_bytes, const vector< size_t > &minimizer_indices, const function< void(size_t, size_t, size_t, size_t)> &iteratee) |
static double | get_log10_prob_of_disruption_in_interval (const VectorView< Minimizer > &minimizers, const string &sequence, const string &quality_bytes, const vector< size_t >::iterator &disrupt_begin, const vector< size_t >::iterator &disrupt_end, size_t left, size_t right) |
static double | get_prob_of_disruption_in_column (const VectorView< Minimizer > &minimizers, const string &sequence, const string &quality_bytes, const vector< size_t >::iterator &disrupt_begin, const vector< size_t >::iterator &disrupt_end, size_t index) |
static size_t | immutable_path_from_length (const ImmutablePath &path) |
static Path | to_path (const ImmutablePath &path) |
static string | log_name () |
Get the thread identifier prefix for logging. More... | |
static string | log_alignment (const Alignment &aln) |
Turn an Alignment into a conveniently-sized string for logging. More... | |
static string | log_alignment (const Path &path, bool force_condensed=false) |
Turn an Path from an alignment into a conveniently-sized string for logging. More... | |
static string | log_bits (const std::vector< bool > &bits) |
Turn a list of bit flags into a compact representation. More... | |
static void | dump_chaining_problem (const std::vector< algorithms::Anchor > &anchors, const std::vector< size_t > &cluster_seeds_sorted, const HandleGraph &graph) |
Dump a whole chaining problem. More... | |
static void | dump_debug_minimizers (const VectorView< Minimizer > &minimizers, const string &sequence, const vector< size_t > *to_include=nullptr, size_t start_offset=0, size_t length_limit=std::numeric_limits< size_t >::max()) |
Dump all the given minimizers, with optional subset restriction. More... | |
static void | dump_debug_extension_set (const HandleGraph &graph, const Alignment &aln, const vector< GaplessExtension > &extended_seeds) |
Dump all the extansions in an extension set. More... | |
static void | dump_debug_sequence (ostream &out, const string &sequence, size_t start_offset=0, size_t length_limit=std::numeric_limits< size_t >::max()) |
Print a sequence with base numbering. More... | |
static void | dump_debug_clustering (const Cluster &cluster, size_t cluster_number, const VectorView< Minimizer > &minimizers, const std::vector< Seed > &seeds) |
Print the seed content of a cluster. More... | |
static void | dump_debug_seeds (const VectorView< Minimizer > &minimizers, const std::vector< Seed > &seeds, const std::vector< size_t > &selected_seeds) |
Print information about a selected set of seeds. More... | |
static void | dump_debug_query (const Alignment &aln) |
Print information about a read to be aligned. More... | |
static void | dump_debug_query (const Alignment &aln1, const Alignment &aln2) |
Print information about a read pair to be aligned. More... | |
Protected Attributes | |
const PathPositionHandleGraph * | path_graph |
const gbwtgraph::DefaultMinimizerIndex & | minimizer_index |
SnarlDistanceIndex * | distance_index |
const gbwtgraph::GBWTGraph & | gbwt_graph |
This is our primary graph. More... | |
std::unique_ptr< GaplessExtender > | extender |
SnarlDistanceIndexClusterer | clusterer |
We have a clusterer. More... | |
FragmentLengthDistribution | fragment_length_distr |
atomic_flag | warned_about_bad_distribution = ATOMIC_FLAG_INIT |
We may need to complain exactly once that the distribution is bad. More... | |
Static Protected Attributes | |
const static size_t | LONG_LIMIT = 256 |
Length at which we cut over to long-alignment logging. More... | |
const static size_t | MANY_LIMIT = 20 |
Count at which we cut over to summary logging. More... | |
Friends | |
class | TestMinimizerMapper |
Additional Inherited Members | |
Static Public Member Functions inherited from vg::AlignerClient | |
static int8_t * | parse_matrix (std::istream &matrix_stream) |
Allocates an array to hold a 4x4 substitution matrix and returns it. More... | |
|
protected |
The information we store for each cluster.
|
protected |
We define a type for shared-tail lists of Mappings, to avoid constantly copying Path objects.
The information we store for each seed.
vg::MinimizerMapper::MinimizerMapper | ( | const gbwtgraph::GBWTGraph & | graph, |
const gbwtgraph::DefaultMinimizerIndex & | minimizer_index, | ||
SnarlDistanceIndex * | distance_index, | ||
const PathPositionHandleGraph * | path_graph = nullptr |
||
) |
Construct a new MinimizerMapper using the given indexes. The PathPositionhandleGraph can be nullptr, as we only use it for correctness tracking.
|
staticprotected |
Clip out the part of the graph between the given positions and global-align the sequence of the given Alignment to it. Populate the Alignment's path and score.
Finds an alignment against a graph path if it is <= max_path_length, and uses <= max_dp_cells GSSW cells.
If one of the anchor positions is empty, does pinned alighnment against the other position.
|
protected |
Add annotations to an Alignment with statistics about the minimizers.
old_seed_count is the number of seeds in the seed vector actually created at the "seed" stage of the alignment process. new_seed_offset is where the first of thos eseeds appears in the funnel at the reseed stage.
|
protected |
Given an aligned read, extract a subgraph of the graph within a distance range based on the fragment length distribution and attempt to align the unaligned read to it. Rescue_forward is true if the aligned read is the first and false otherwise. Assumes that both reads are facing the same direction. TODO: This should be const, but some of the function calls are not.
|
inlinestaticprotected |
How do we convert chain info to an actual seed of the type we are using? Also needs to know the hit position, and the minimizer number.
|
protected |
Compute MAPQ caps based on all minimizers that are explored, for some definition of explored.
Needs access to the input alignment for sequence and quality information.
Returns only an "extended" cap at the moment.
|
protected |
The same as dfs_gbwt on a handle and an offset, but takes a gbwt::SearchState that defines only some haplotypes on a handle to start with.
|
protected |
Run a DFS on valid haplotypes in the GBWT starting from the given Position, and continuing up to the given number of bases.
Calls enter_handle when the DFS enters a haplotype visit to a particular handle, and exit_handle when it exits a visit. These let the caller maintain a stack and track the traversals.
The starting node is only entered if its offset isn't equal to its length (i.e. bases remain to be visited).
Stopping early is not permitted.
|
protected |
The same as dfs_gbwt on a Position, but takes a handle in the backing gbwt_graph and an offset from the start of the handle instead.
|
protected |
Get the distance between a pair of read alignments, or std::numeric_limits<int64_t>::max() if unreachable.
Get the distance between a pair of positions, or std::numeric_limits<int64_t>::max() if unreachable.
|
protected |
Convert an integer distance, with limits standing for no distance, to a double annotation that can safely be parsed back from JSON into an integer if it is integral.
|
staticprotected |
Dump a whole chaining problem.
|
staticprotected |
Print the seed content of a cluster.
|
staticprotected |
Dump all the extansions in an extension set.
|
staticprotected |
Dump all the given minimizers, with optional subset restriction.
|
staticprotected |
Print information about a read to be aligned.
|
staticprotected |
Print information about a read pair to be aligned.
|
staticprotected |
Print information about a selected set of seeds.
|
staticprotected |
Print a sequence with base numbering.
|
protected |
Extends the seeds in a cluster into a collection of GaplessExtension objects.
|
protected |
Convert the GaplessExtension into an alignment. This assumes that the extension is a full-length alignment and that the sequence field of the alignment has been set.
|
staticprotected |
Compute a bound on the Phred score probability of a mapping beign wrong due to base errors and unlocated minimizer hits prevented us from finding the true alignment.
Algorithm uses a "sweep line" dynamic programming approach. For a read with minimizers aligned to it:
000000000011111111112222222222 012345678901234567890123456789
Read: ****************************** Minimizer 1: ***** Minimizer 2: ***** Minimizer 3: ***** Minimizer 4: *****
For each distinct read interval of overlapping minimizers, e.g. in the example the intervals 3,4,5; 6,7; 8,9,10; 18,19,20; 21,22; and 23,24,25 we consider base errors that would result in the minimizers in the interval being incorrect
We use dynamic programming sweeping left-to-right over the intervals to compute the probability of the minimum number of base errors needed to disrupt all the minimizers.
Will sort minimizers_explored (which is indices into minimizers) by minimizer start position.
|
inline |
|
protected |
|
protected |
Find the minimizers in the sequence using the minimizer index, and return them sorted in read order.
|
protected |
Operating on the given input alignment, align the tails dangling off the given extended perfect-match seeds and produce an optimal alignment into the given output Alignment object, best, and the second best alignment into second_best.
Uses the given RNG to break ties.
|
protected |
Find seeds for all minimizers passing the filters.
|
protected |
When dozeu doesn't have any seeds, it's scan heuristic can lead to inaccurate anchoring with the end result that one end of the alignment has a deletion that doesn't connect to an aligned base. This function removes those deletions
|
protected |
When we use dozeu for rescue, the reported alignment score is incorrect. 1) Dozeu only gives the full-length bonus once. 2) There is no penalty for a softclip at the edge of the subgraph. This function calculates the score correctly. If the score is <= 0, we realign the read using GSSW. TODO: This should be unnecessary.
|
staticprotected |
Given a collection of minimizers, and a list of the minimizers we actually care about (as indices into the collection), iterate over common intervals of overlapping minimizer agglomerations.
Calls the given callback with (left, right, bottom, top), where left is the first base of the agglomeration interval (inclusive), right is the last base of the agglomeration interval (exclusive), bottom is the index of the first minimizer with an agglomeration in the interval and top is the index of the last minimizer with an agglomeration in the interval (exclusive).
minimizer_indices must be sorted by agglomeration end, and then by agglomeration start, so they can be decomposed into nice rectangles.
Note that bottom and top are offsets into minimizer_indices, NOT minimizers itself. Only contiguous ranges in minimizer_indices actually make sense.
|
inline |
|
inline |
|
protected |
Find the best alignment of the given sequence against any of the trees provided in trees, where each tree is a TreeSubgraph over the GBWT graph. Each tree subgraph is rooted at the left in its own local coordinate space, even if we are pinning on the right.
If no mapping is possible (for example, because there are no trees), produce a pure insert at default_position.
Alignment is always pinned.
If pin_left is true, pin the alignment on the left to the root of each tree. Otherwise pin it on the right to the root of each tree.
Limits the length of the longest gap to longest_detectable_gap.
Returns alignments in gbwt_graph space.
|
inline |
Get the distance limit for the given read length
|
inline |
|
inline |
|
inline |
|
staticprotected |
Gives the log10 prob of a base error in the given interval of the read, accounting for the disruption of specified minimizers.
minimizers is the collection of all minimizers
disrupt_begin and disrupt_end are iterators defining a sequence of indices of minimizers in minimizers that are disrupted.
left and right are the inclusive and exclusive bounds of the interval of the read where the disruption occurs.
|
staticprotected |
Gives the raw probability of a base error in the given column of the read, accounting for the disruption of specified minimizers.
minimizers is the collection of all minimizers
disrupt_begin and disrupt_end are iterators defining a sequence of indices of minimizers in minimizers that are disrupted.
index is the position in the read where the disruption occurs.
|
protected |
Get all the trees defining tails off the specified side of the specified gapless extension. Should only be called if a tail on that side exists, or this is a waste of time.
If the gapless extension starts or ends at a node boundary, there may be multiple trees produced, each with a distinct root.
If the gapless extension abuts the edge of the read, an empty forest will be produced.
Each tree is represented as a TreeSubgraph over our gbwt_graph.
If left_tails is true, the trees read out of the left sides of the gapless extension. Otherwise they read out of the right side.
As a side effect, saves the length of the longest detectable gap in an alignment of a tail to the forest into the provided location, if set.
|
staticprotected |
Get the from length of an ImmutabelPath.
Can't be called path_from_length or it will shadow the one for Paths instead of overloading.
|
staticprotected |
Turn an Alignment into a conveniently-sized string for logging.
|
staticprotected |
Turn an Path from an alignment into a conveniently-sized string for logging.
|
staticprotected |
Turn a list of bit flags into a compact representation.
|
staticprotected |
Get the thread identifier prefix for logging.
Map the given read. Return a vector of alignments that it maps to, winner first.
void vg::MinimizerMapper::map | ( | Alignment & | aln, |
AlignmentEmitter & | alignment_emitter | ||
) |
Map the given read, and send output to the given AlignmentEmitter. May be run from any thread. TODO: Can't be const because the clusterer's cluster_seeds isn't const.
Map the given read using chaining of seeds. Return a vector of alignments that it maps to, winner first.
Map the given read using gapless extensions. Return a vector of alignments that it maps to, winner first.
pair< vector< Alignment >, vector< Alignment > > vg::MinimizerMapper::map_paired | ( | Alignment & | aln1, |
Alignment & | aln2 | ||
) |
Map the given pair of reads, where aln1 is upstream of aln2 and they are oriented towards each other in the graph.
If the fragment length distribution is not yet fixed, reads will be mapped independently. Otherwise, they will be mapped according to the fragment length distribution.
pair< vector< Alignment >, vector< Alignment > > vg::MinimizerMapper::map_paired | ( | Alignment & | aln1, |
Alignment & | aln2, | ||
vector< pair< Alignment, Alignment >> & | ambiguous_pair_buffer | ||
) |
Map the given pair of reads, where aln1 is upstream of aln2 and they are oriented towards each other in the graph.
If the reads are ambiguous and there's no fragment length distribution fixed yet, they will be dropped into ambiguous_pair_buffer.
Otherwise, at least one result will be returned for them (although it may be the unmapped alignment).
|
inlinestaticprotected |
How should we initialize chain info when it's not stored in the minimizer index?
|
protected |
Set pair partner references for paired mapping results.
|
protected |
Given a count of items, a function to get the score of each, a score-difference-from-the-best cutoff, a min and max processed item count, and a function to get a sort-shuffling seed for breaking ties, process items in descending score order by calling process_item with the item's number, until min_count items are processed and either max_count items are processed or the score difference threshold is hit (or we run out of items).
If process_item returns false, the item is skipped and does not count against min_count or max_count.
Call discard_item_by_count with the item's number for all remaining items that would pass the score threshold.
Call discard_item_by_score with the item's number for all remaining items that would fail the score threshold.
|
protected |
Same as the other process_until_threshold functions, except using a vector to supply scores.
|
protected |
Same as the other process_until_threshold functions, except user supplies comparator to sort the items (must still be sorted by score).
|
protected |
Reseed between the given graph and read positions. Produces new seeds by asking the given callback for minimizers' occurrence positions. Up to one end of the graph region can be a read end, with a pos_t matching is_empty(). The read region always needs to be fully defined.
|
protected |
Score a pair of alignments given the distance between them
|
protected |
Determine cluster score, read coverage, and a vector of flags for the minimizers present in the cluster. Score is the sum of the scores of distinct minimizers in the cluster, while read coverage is the fraction of the read covered by seeds in the cluster.
Puts the cluster in the funnel as coming from its seeds.
|
staticprotected |
Score the given group of gapless extensions. Determines the best score that can be obtained by chaining extensions together, using the given gap open and gap extend penalties to charge for either overlaps or gaps in coverage of the read.
Enforces that overlaps cannot result in containment.
Input extended seeds must be sorted by start position.
|
protected |
Score the set of extensions for each cluster using score_extension_group(). Return the scores in the same order as the extensions.
This version allows the collections of extensions to be scored to come with annotating read numbers, which are ignored.
|
protected |
Score the set of extensions for each cluster using score_extension_group(). Return the scores in the same order as the extension groups.
|
protected |
Determine cluster score, read coverage, and a vector of flags for the minimizers present in the cluster. Score is the sum of the scores of distinct minimizers in the cluster, while read coverage is the fraction of the read covered by seeds in the cluster.
Thinks of the cluster as being made out of some previous clusters and some new seeds from the tail end of seeds, which are already in the funnel, clusters first. seed_to_precluster maps from seed to the old cluster it is part of, or std::numeric_limits<size_t>::max() if it isn't from an old cluster.
Puts the cluster in the funnel.
|
protected |
Return the all non-redundant seeds in the subgraph, including those from minimizers not used for mapping.
void vg::AlignerClient::set_alignment_scores |
Set the algner scoring parameters and create the stored aligner instances. The score matrix should by a 4 x 4 array in the order (ACGT). Other overloads of set_alignment_scores all call this one.
|
virtual |
Set the algner scoring parameters and create the stored aligner instances. The score matrix should by a 4 x 4 array in the order (ACGT). Other overloads of set_alignment_scores all call this one.
Reimplemented from vg::AlignerClient.
void vg::AlignerClient::set_alignment_scores |
Set all the aligner scoring parameters and create the stored aligner instances.
void vg::AlignerClient::set_alignment_scores |
Set the algner scoring parameters and create the stored aligner instances. The stream should contain a 4 x 4 whitespace-separated substitution matrix (in the order ACGT)
|
protected |
Return the indices of all the minimizers, sorted in descending order by theit minimizers' scores.
|
protected |
If tracking correctness, mark seeds that are correctly mapped as correct in the funnel, based on proximity along paths to the input read's refpos. Otherwise, tag just as placed, with the seed's read interval. Assumes we are tracking provenance.
|
protected |
Convert a single seed to a single chaining anchor.
|
protected |
Convert a collection of seeds to a collection of chaining anchors.
|
staticprotected |
Convert an ImmutablePath to a Path.
|
protected |
Convert an Anchor to a WFAAlignment.
|
protected |
Get the unoriented distance between a pair of positions
|
protected |
Do a brute check of the clusters. Print errors to stderr.
|
protected |
Convert a WFAAlignment into a vg Alignment. This assumes that the WFAAlignment is a full-length alignment and that the sequence field of the vg Alignment has been set.
|
staticprotected |
Compute a bound on the Phred score probability of having created the agglomerations of the specified minimizers by base errors from the given sequence, which was sequenced with the given qualities.
No limit is imposed if broken is empty.
Takes the collection of all minimizers found, and a vector of the indices of minimizers we are interested in the agglomerations of. May modify the order of that index vector.
Also takes the sequence of the read (to avoid Ns) and the quality string (interpreted as a byte array).
Currently computes a lower-score-bound, upper-probability-bound, suitable for use as a mapping quality cap, by assuming the easiest-to-disrupt possible layout of the windows, and the lowest possible qualities for the disrupting bases.
|
staticprotected |
Clip out the part of the graph between the given positions, and dagify it from the perspective of the anchors. If a left anchor is set, all heads should correspond to the left anchor, and if a right anchor is set, all tails should correspond to the right anchor. At least one anchor must be set.
Calls the callback with an extracted, strand-split, dagified graph, and a function that translates from handle in the dagified graph to node ID and orientation in the base graph.
|
friend |
bool vg::MinimizerMapper::align_from_chains = default_align_from_chains |
int vg::MinimizerMapper::chain_min_score = default_chain_min_score |
double vg::MinimizerMapper::chain_score_threshold = default_chain_score_threshold |
size_t vg::MinimizerMapper::chaining_cluster_distance = default_chaining_cluster_distance |
double vg::MinimizerMapper::cluster_coverage_threshold = default_cluster_coverage_threshold |
double vg::MinimizerMapper::cluster_score_threshold = default_cluster_score_threshold |
|
protected |
We have a clusterer.
|
staticconstexpr |
If true, produce alignments from extension sets by chaining gapless extensions up and aligning the sequences between them. If false, produce alignments by aligning the tails off of individual gapless extensions.
|
staticconstexpr |
Even if we would have fewer than min_chains results, don't process anything with a score smaller than this.
|
staticconstexpr |
If a chain's score is smaller than the best chain's score by more than this much, don't align it
|
staticconstexpr |
What read-length-independent distance threshold do we want to use for clustering?
|
staticconstexpr |
If the read coverage of a cluster is less than the best coverage of any cluster by more than this much, don't extend it
|
staticconstexpr |
If a cluster's score is smaller than the best score of any cluster by more than this much, then don't extend it
|
staticconstexpr |
|
staticconstexpr |
If false, skip computing base-level alignments.
|
staticconstexpr |
If set, exclude overlapping minimizers.
|
staticconstexpr |
|
staticconstexpr |
Even if we would have fewer than min_extension_sets results, don't process anything with a score smaller than this.
|
staticconstexpr |
|
staticconstexpr |
Ignore all minimizers with more than hard_hit_cap hits.
|
staticconstexpr |
Use all minimizers with at most hit_cap hits.
|
staticconstexpr |
How many bases should we try to look back initially when chaining?
|
staticconstexpr |
How much of a bonus should we give to each item in chaining?
|
staticconstexpr |
How many chaining sources should we allow ourselves to consider ever?
|
staticconstexpr |
How much chould we increase lookback when we can't find anything good?
|
staticconstexpr |
How many extended clusters should we align, max?
|
staticconstexpr |
When converting chains to alignments, what's the longest gap between items we will actually try to align? Passing strings longer than ~100bp can cause WFAAligner to run for a pathologically long amount of time. May not be 0.
|
staticconstexpr |
How many clusters should we produce chains for, max?
|
staticconstexpr |
How big of an alignment in POA cells should we ever try to do with Dozeu? TODO: Lift this when Dozeu's allocator is able to work with >4 MB of memory. Each cell is 16 bits in Dozeu, and we leave some room for the query and padding to full SSE registers. Note that a very chopped graph might still break this!
|
staticconstexpr |
How many DP cells should we be willing to do in GSSW for an end-pinned alignment? If we want to do more than this, just leave tail unaligned.
|
staticconstexpr |
How many clusters should we produce gapless extensions for, max?
|
staticconstexpr |
What is the maximum fragment length that we accept as valid for paired-end reads?
|
staticconstexpr |
How many bases of indel should we allow in chaining?
|
staticconstexpr |
How many extensions should we try as seeds within a mapping location?
|
staticconstexpr |
How many bases should we look back when chaining? Needs to be about the same as the clustering distance or we will be able to cluster but not chain.
|
staticconstexpr |
|
staticconstexpr |
How many connections between preclusters should we reseed over, maximum?
|
staticconstexpr |
For paired end mapping, how many times should we attempt rescue (per read)?
|
staticconstexpr |
Similarly, what is the maximum tail length we will try to align?
|
staticconstexpr |
Maximum number of distinct minimizers to take.
|
staticconstexpr |
Disregard the chain score thresholds when they would give us fewer than this many chains.
|
staticconstexpr |
Accept at least this many clusters for chain generation.
|
staticconstexpr |
Disregard the extension set score thresholds when they would give us fewer than this many extension sets.
|
staticconstexpr |
Accept at least this many clusters for gapless extension.
|
staticconstexpr |
How bad can a transition be per base before lookback accepts it?
|
staticconstexpr |
How many chaining sources should we make sure to consider regardless of distance?
|
staticconstexpr |
How many connections between preclusters should we reseed over, minimum?
|
staticconstexpr |
Take minimizers between hit_cap and hard_hit_cap hits until this fraction of total score
|
staticconstexpr |
Number of minimzers to select based on read_len/num_min_per_bp.
|
staticconstexpr |
If the second best cluster's score is no more than this many points below the cutoff set by cluster_score_threshold, snap that cutoff down to the second best cluster's score, to avoid throwing away promising secondaries.
|
staticconstexpr |
|
staticconstexpr |
How close does an alignment have to be to the best alignment for us to rescue on it.
|
staticconstexpr |
If the read coverage of a precluster connection is less than the best of any by more than this much, don't extend it
|
staticconstexpr |
Do not attempt rescue if there are more seeds in the rescue subgraph.
|
staticconstexpr |
How many stdevs from the mean do we extract a subgraph from?
|
staticconstexpr |
When connecting subclusters for reseeding, how far should we search?
|
staticconstexpr |
If set, log what the mapper is thinking in its mapping of each read.
|
staticconstexpr |
Guess which seed hits are correct by location in the linear reference and track if/when their descendants make it through stages of the algorithm. Only works if track_provenance is true.
|
staticconstexpr |
Track which internal work items came from which others during each stage of the mapping algorithm.
|
protected |
size_t vg::MinimizerMapper::distance_limit = default_distance_limit |
bool vg::MinimizerMapper::do_dp = default_do_dp |
bool vg::MinimizerMapper::exclude_overlapping_min = default_exclude_overlapping_min |
|
protected |
We have a gapless extender to extend seed hits in haplotype space. Because this needs a reference to an Aligner, and because changing the scoring parameters deletes all the alignmers, we need to keep this somewhere we can clear out.
int vg::MinimizerMapper::extension_score_threshold = default_extension_score_threshold |
int vg::MinimizerMapper::extension_set_min_score = default_extension_set_min_score |
double vg::MinimizerMapper::extension_set_score_threshold = default_extension_set_score_threshold |
|
protected |
We have a distribution for read fragment lengths that takes care of knowing when we've observed enough good ones to learn a good distribution.
|
protected |
This is our primary graph.
size_t vg::MinimizerMapper::hard_hit_cap = default_hard_hit_cap |
size_t vg::MinimizerMapper::hit_cap = default_hit_cap |
size_t vg::MinimizerMapper::initial_lookback_threshold = default_initial_lookback_threshold |
int vg::MinimizerMapper::item_bonus = default_item_bonus |
|
staticprotected |
Length at which we cut over to long-alignment logging.
size_t vg::MinimizerMapper::lookback_item_hard_cap = lookback_item_hard_cap |
double vg::MinimizerMapper::lookback_scale_factor = default_lookback_scale_factor |
|
staticprotected |
Count at which we cut over to summary logging.
size_t vg::MinimizerMapper::max_alignments = default_max_alignments |
size_t vg::MinimizerMapper::max_chain_connection = default_max_chain_connection |
size_t vg::MinimizerMapper::max_clusters_to_chain = default_max_clusters_to_chain |
size_t vg::MinimizerMapper::max_dozeu_cells = default_max_dozeu_cells |
size_t vg::MinimizerMapper::max_dp_cells = default_max_dp_cells |
|
staticconstexpr |
How long of a DP can we do before GSSW crashes due to 16-bit score overflow?
size_t vg::MinimizerMapper::max_extensions = default_max_extensions |
size_t vg::MinimizerMapper::max_fragment_length = default_max_fragment_length |
size_t vg::MinimizerMapper::max_indel_bases = default_max_indel_bases |
size_t vg::MinimizerMapper::max_local_extensions = default_max_local_extensions |
size_t vg::MinimizerMapper::max_lookback_bases = default_max_lookback_bases |
size_t vg::MinimizerMapper::max_multimaps = default_max_multimaps |
size_t vg::MinimizerMapper::max_precluster_connections = default_max_precluster_connections |
size_t vg::MinimizerMapper::max_rescue_attempts = default_max_rescue_attempts |
size_t vg::MinimizerMapper::max_tail_length = default_max_tail_length |
size_t vg::MinimizerMapper::max_unique_min = default_max_unique_min |
int vg::MinimizerMapper::min_chains = default_min_chains |
size_t vg::MinimizerMapper::min_clusters_to_chain = default_min_clusters_to_chain |
int vg::MinimizerMapper::min_extension_sets = default_min_extension_sets |
size_t vg::MinimizerMapper::min_extensions = default_min_extensions |
double vg::MinimizerMapper::min_good_transition_score_per_base = default_min_good_transition_score_per_base |
size_t vg::MinimizerMapper::min_lookback_items = default_min_lookback_items |
size_t vg::MinimizerMapper::min_precluster_connections = default_min_precluster_connections |
|
protected |
double vg::MinimizerMapper::minimizer_score_fraction = default_minimizer_score_fraction |
size_t vg::MinimizerMapper::num_bp_per_min = default_num_bp_per_min |
double vg::MinimizerMapper::pad_cluster_score_threshold = default_pad_cluster_score_threshold |
double vg::MinimizerMapper::paired_distance_stdevs = default_paired_distance_stdevs |
double vg::MinimizerMapper::paired_rescue_score_limit = default_paired_rescue_score_limit |
|
protected |
double vg::MinimizerMapper::precluster_connection_coverage_threshold = default_precluster_connection_coverage_threshold |
string vg::MinimizerMapper::read_group |
Apply this read group name.
RescueAlgorithm vg::MinimizerMapper::rescue_algorithm = rescue_dozeu |
The algorithm used for rescue.
size_t vg::MinimizerMapper::rescue_seed_limit = default_rescue_seed_limit |
double vg::MinimizerMapper::rescue_subgraph_stdevs = default_rescue_subgraph_stdevs |
size_t vg::MinimizerMapper::reseed_search_distance = default_reseed_search_distance |
string vg::MinimizerMapper::sample_name |
Apply this sample name.
bool vg::MinimizerMapper::show_work = default_show_work |
bool vg::MinimizerMapper::track_correctness = default_track_correctness |
bool vg::MinimizerMapper::track_provenance = default_track_provenance |
|
protected |
We may need to complain exactly once that the distribution is bad.
atomic_flag vg::MinimizerMapper::warned_about_rescue_size = ATOMIC_FLAG_INIT |
Have we complained about hitting the size limit for rescue?
|
mutable |
Have we complained about hitting the size limit for tails?