vg
tools for working with variation graphs
|
#include <sampler.hpp>
Classes | |
class | MarkovDistribution |
Public Member Functions | |
NGSSimulator (PathPositionHandleGraph &graph, const string &ngs_fastq_file, const string &ngs_paired_fastq_file="", bool interleaved_fastq=false, const vector< string > &source_paths={}, const vector< double > &source_path_ploidies={}, const vector< pair< string, double >> &transcript_expressions={}, const vector< tuple< string, string, size_t >> &haplotype_transcripts={}, double substition_polymorphism_rate=0.001, double indel_polymorphism_rate=0.0002, double indel_error_proportion=0.01, double fragment_length_mean=300.0, double fragment_length_stdev=50.0, double error_multiplier=1.0, bool retry_on_Ns=true, bool sample_unsheared_paths=false, uint64_t seed=0) | |
Alignment | sample_read () |
Sample an individual read and alignment. More... | |
pair< Alignment, Alignment > | sample_read_pair () |
Sample a pair of reads and alignments. More... | |
void | connect_to_position_file (const string &filename) |
Open up a stream to output read positions to. More... | |
Public Member Functions inherited from vg::AbstractReadSampler | |
virtual | ~AbstractReadSampler ()=default |
AbstractReadSampler (PathPositionHandleGraph &graph) | |
Make a new sampler using the given graph. More... | |
Private Member Functions | |
NGSSimulator (void)=delete | |
void | record_read_quality (const Alignment &aln, bool read_2=false) |
Add a quality string to the training data. More... | |
void | record_read_pair_quality (const Alignment &aln_1, const Alignment &aln_2) |
Add a pair of quality strings to the training data. More... | |
void | finalize () |
Indicate that there is no more training data. More... | |
pair< string, vector< bool > > | sample_read_quality () |
Get a quality string and a vector of 'N'-masks that mimics the training data. More... | |
pair< pair< string, vector< bool > >, pair< string, vector< bool > > > | sample_read_quality_pair () |
Get a pair of quality strings and vectors of 'N'-masks that mimic the training data. More... | |
pair< string, vector< bool > > | sample_read_quality_internal (pair< uint8_t, bool > first, bool transitions_1) |
Wrapped internal function for quality sampling. More... | |
void | sample_read_internal (Alignment &aln, int64_t &offset, bool &is_reverse, pos_t &curr_pos, const string &source_path) |
size_t | sample_path () |
Return the index of a path if using source_paths or else numeric_limits<size_t>::max() More... | |
void | register_sampled_position (const Alignment &aln, const string &path_name, size_t offset, bool is_reverse) |
Output a sampled position to the path position file. More... | |
void | sample_start_pos (const size_t &source_path_idx, const int64_t &fragment_length, int64_t &offset, bool &is_reverse, pos_t &pos) |
pos_t | sample_start_graph_pos () |
Get a random position in the graph. More... | |
tuple< int64_t, bool, pos_t > | sample_start_path_pos (const size_t &source_path_idx, const int64_t &fragment_length) |
string | get_read_name () |
Get an unclashing read name. More... | |
bool | advance (int64_t &offset, bool &is_reverse, pos_t &pos, char &graph_char, const string &source_path) |
bool | advance_by_distance (int64_t &offset, bool &is_reverse, pos_t &pos, int64_t distance, const string &source_path) |
bool | advance_on_path (int64_t &offset, bool &is_reverse, pos_t &pos, char &graph_char, const string &source_path) |
bool | advance_on_path_by_distance (int64_t &offset, bool &is_reverse, pos_t &pos, int64_t distance, const string &source_path) |
bool | advance_on_graph (pos_t &pos, char &graph_char) |
bool | advance_on_graph_by_distance (pos_t &pos, int64_t distance) |
void | apply_N_mask (string &sequence, const vector< bool > &n_mask) |
Mask out bases with 'N's if the mask is true. More... | |
bool | walk_backwards (int64_t &offset, bool &is_reverse, pos_t &pos, int64_t distance, const string &source_path, const Path &path) |
Walk backwards either along an alignment path or a source path, updates positions. More... | |
bool | walk_backwards_along_alignment (const Path &path, int64_t distance, pos_t &pos) |
Walk backwards along the alignment path. More... | |
void | apply_deletion (Alignment &aln, const pos_t &pos) |
Add a deletion to the alignment. More... | |
void | apply_insertion (Alignment &aln, const pos_t &pos) |
Add an insertion to the alignment. More... | |
void | apply_aligned_base (Alignment &aln, const pos_t &pos, char graph_char, char read_char) |
Add a match/mismatch to the alignment. More... | |
mt19937_64 & | prng () |
Private Attributes | |
unordered_map< char, string > | mutation_alphabets |
Remainder of the alphabet after removing a given character. More... | |
size_t | total_seq_length = 0 |
The total sequence length in our graph. More... | |
vector< double > | phred_prob |
Memo for pre-multiplied Phred -> probability conversion. More... | |
vector< MarkovDistribution< pair< uint8_t, bool >, pair< uint8_t, bool > > > | transition_distrs_1 |
A Markov distribution for each read position indicating quality and whether the base is an 'N'. More... | |
vector< MarkovDistribution< pair< uint8_t, bool >, pair< uint8_t, bool > > > | transition_distrs_2 |
A second set of Markov distributions for the second read in a pair. More... | |
MarkovDistribution< pair< uint8_t, bool >, pair< pair< uint8_t, bool >, pair< uint8_t, bool > > > | joint_initial_distr |
A distribution for the joint initial qualities of a read pair. More... | |
vector< mt19937_64 > | prngs |
vg::discrete_distribution | path_sampler |
vector< vg::uniform_int_distribution< size_t > > | start_pos_samplers |
vg::uniform_int_distribution< uint8_t > | strand_sampler |
vg::uniform_int_distribution< size_t > | background_sampler |
vg::uniform_int_distribution< size_t > | mut_sampler |
vg::uniform_real_distribution< double > | prob_sampler |
const double | sub_poly_rate |
const double | indel_poly_rate |
const double | indel_error_prop |
const double | fragment_mean |
const double | fragment_sd |
size_t | sample_counter = 0 |
uint64_t | seed |
const bool | retry_on_Ns |
Should we try again for a read without Ns if we get Ns? More... | |
const bool | sample_unsheared_paths |
vector< string > | source_paths |
Restrict reads to just these paths (path-only mode) if nonempty. More... | |
ofstream | position_file |
Static Private Attributes | |
static const string | alphabet = "ACGT" |
DNA alphabet. More... | |
Additional Inherited Members | |
Public Attributes inherited from vg::AbstractReadSampler | |
bool | multi_position_annotations = false |
size_t | max_tries = 100 |
What limit should we use for retry loops before giving up or failing? More... | |
std::unique_ptr< std::function< bool(const path_handle_t &)> > | annotation_path_filter |
Protected Member Functions inherited from vg::AbstractReadSampler | |
void | annotate_with_path_positions (Alignment &aln) |
Protected Attributes inherited from vg::AbstractReadSampler | |
PathPositionHandleGraph & | graph |
The graph being simulated against. More... | |
Class that simulates reads with alignments to a graph that mimic the error profile of NGS sequencing data.
vg::NGSSimulator::NGSSimulator | ( | PathPositionHandleGraph & | graph, |
const string & | ngs_fastq_file, | ||
const string & | ngs_paired_fastq_file = "" , |
||
bool | interleaved_fastq = false , |
||
const vector< string > & | source_paths = {} , |
||
const vector< double > & | source_path_ploidies = {} , |
||
const vector< pair< string, double >> & | transcript_expressions = {} , |
||
const vector< tuple< string, string, size_t >> & | haplotype_transcripts = {} , |
||
double | substition_polymorphism_rate = 0.001 , |
||
double | indel_polymorphism_rate = 0.0002 , |
||
double | indel_error_proportion = 0.01 , |
||
double | fragment_length_mean = 300.0 , |
||
double | fragment_length_stdev = 50.0 , |
||
double | error_multiplier = 1.0 , |
||
bool | retry_on_Ns = true , |
||
bool | sample_unsheared_paths = false , |
||
uint64_t | seed = 0 |
||
) |
Initialize simulator. FASTQ file will be used to train an error distribution. Most reads in the FASTQ should be the same length. Polymorphism rates apply uniformly along a read, whereas errors are distributed as indicated by the learned distribution. The simulation can also be restricted to named paths in the graph. Alternatively, it can match an expression profile. However, it cannot be simultaneously restricted to paths and to an expression profile.
|
private delete |
|
private |
Move forward one position in either the source path or the graph, depending on mode. Update the arguments. Return true if we can't because we hit a tip or false otherwise
|
private |
Move forward a certain distance in either the source path or the graph, depending on mode. Update the arguments. Return true if we can't because we hit a tip or false otherwise
|
private |
Move forward one position in the graph along a random path, return true if we can't because we hit a tip or false otherwise
|
private |
Move forward a certain distance in the graph along a random path, return true if we can't because we hit a tip or false otherwise
|
private |
Move forward one position in the source path, return true if we can't because we hit a tip or false otherwise
|
private |
Move forward a certain distance in the source path, return true if we can't because we hit a tip or false otherwise
|
private |
Add a match/mismatch to the alignment.
Add a deletion to the alignment.
Add an insertion to the alignment.
|
private |
Mask out bases with 'N's if the mask is true.
void vg::NGSSimulator::connect_to_position_file | ( | const string & | filename | ) |
Open up a stream to output read positions to.
|
private |
Indicate that there is no more training data.
|
private |
Get an unclashing read name.
|
private |
|
private |
Add a pair of quality strings to the training data.
|
private |
Add a quality string to the training data.
|
private |
Output a sampled position to the path position file.
|
private |
Return the index of a path if using source_paths or else numeric_limits<size_t>::max()
Alignment vg::NGSSimulator::sample_read | ( | ) |
Sample an individual read and alignment.
|
private |
Internal method called by paired and unpaired samplers for both whole-graph and path sources. Offset and is_reverse are only used (and drive the iteration and update of curr_pos) in path mode. Otherwise, in whole-graph mode, they are ignored and curr_pos is used to traverse the graph directly.
Sample a pair of reads and alignments.
|
private |
Get a quality string and a vector of 'N'-masks that mimics the training data.
|
private |
Wrapped internal function for quality sampling.
|
private |
Get a pair of quality strings and vectors of 'N'-masks that mimic the training data.
|
private |
Get a random position in the graph.
|
private |
Get a random position along the source path. Enforce fragment length restrictions if argument is positive.
|
private |
Sample an appropriate starting position according to the mode. Updates the arguments. Providing a negative number for fragment length indicates no fragment length restrictions.
|
private |
Walk backwards either along an alignment path or a source path, updates positions.
|
private |
Walk backwards along the alignment path.
|
static private |
DNA alphabet.
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
A distribution for the joint initial qualities of a read pair.
|
private |
|
private |
Remainder of the alphabet after removing a given character.
|
private |
|
private |
Memo for pre-multiplied Phred -> probability conversion.
|
private |
|
private |
|
private |
|
private |
Should we try again for a read without Ns if we get Ns?
|
private |
|
private |
|
private |
|
private |
Restrict reads to just these paths (path-only mode) if nonempty.
|
private |
|
private |
|
private |
|
private |
The total sequence length in our graph.
|
private |
A Markov distribution for each read position indicating quality and whether the base is an 'N'.
|
private |
A second set of Markov distributions for the second read in a pair.