vg
tools for working with variation graphs
Classes | Public Member Functions | Private Member Functions | Private Attributes | Static Private Attributes | List of all members
vg::NGSSimulator Class Reference

#include <sampler.hpp>

Inheritance diagram for vg::NGSSimulator:
vg::AbstractReadSampler

Classes

class  MarkovDistribution
 

Public Member Functions

 NGSSimulator (PathPositionHandleGraph &graph, const string &ngs_fastq_file, const string &ngs_paired_fastq_file="", bool interleaved_fastq=false, const vector< string > &source_paths={}, const vector< double > &source_path_ploidies={}, const vector< pair< string, double >> &transcript_expressions={}, const vector< tuple< string, string, size_t >> &haplotype_transcripts={}, double substition_polymorphism_rate=0.001, double indel_polymorphism_rate=0.0002, double indel_error_proportion=0.01, double fragment_length_mean=300.0, double fragment_length_stdev=50.0, double error_multiplier=1.0, bool retry_on_Ns=true, bool sample_unsheared_paths=false, uint64_t seed=0)
 
Alignment sample_read ()
 Sample an individual read and alignment. More...
 
pair< Alignment, Alignmentsample_read_pair ()
 Sample a pair of reads an alignments. More...
 
void connect_to_position_file (const string &filename)
 Open up a stream to output read positions to. More...
 
- Public Member Functions inherited from vg::AbstractReadSampler
virtual ~AbstractReadSampler ()=default
 
 AbstractReadSampler (PathPositionHandleGraph &graph)
 Make a new sampler using the given graph. More...
 

Private Member Functions

 NGSSimulator (void)=delete
 
void record_read_quality (const Alignment &aln, bool read_2=false)
 Add a quality string to the training data. More...
 
void record_read_pair_quality (const Alignment &aln_1, const Alignment &aln_2)
 Add a pair of quality strings to the training data. More...
 
void finalize ()
 Indicate that there is no more training data. More...
 
pair< string, vector< bool > > sample_read_quality ()
 Get a quality string and a vector of 'N'-masks that mimics the training data. More...
 
pair< pair< string, vector< bool > >, pair< string, vector< bool > > > sample_read_quality_pair ()
 Get a pair of quality strings and vectors of 'N'-masks that mimic the training data. More...
 
pair< string, vector< bool > > sample_read_quality_internal (pair< uint8_t, bool > first, bool transitions_1)
 Wrapped internal function for quality sampling. More...
 
void sample_read_internal (Alignment &aln, int64_t &offset, bool &is_reverse, pos_t &curr_pos, const string &source_path)
 
size_t sample_path ()
 Return the index of a path if using source_paths or else numeric_limits<size_t>::max() More...
 
void register_sampled_position (const Alignment &aln, const string &path_name, size_t offset, bool is_reverse)
 Ouput a sampled position to the path position file. More...
 
void sample_start_pos (const size_t &source_path_idx, const int64_t &fragment_length, int64_t &offset, bool &is_reverse, pos_t &pos)
 
pos_t sample_start_graph_pos ()
 Get a random position in the graph. More...
 
tuple< int64_t, bool, pos_tsample_start_path_pos (const size_t &source_path_idx, const int64_t &fragment_length)
 
string get_read_name ()
 Get an unclashing read name. More...
 
bool advance (int64_t &offset, bool &is_reverse, pos_t &pos, char &graph_char, const string &source_path)
 
bool advance_by_distance (int64_t &offset, bool &is_reverse, pos_t &pos, int64_t distance, const string &source_path)
 
bool advance_on_path (int64_t &offset, bool &is_reverse, pos_t &pos, char &graph_char, const string &source_path)
 
bool advance_on_path_by_distance (int64_t &offset, bool &is_reverse, pos_t &pos, int64_t distance, const string &source_path)
 
bool advance_on_graph (pos_t &pos, char &graph_char)
 
bool advance_on_graph_by_distance (pos_t &pos, int64_t distance)
 
void apply_N_mask (string &sequence, const vector< bool > &n_mask)
 Mask out bases with 'N's if the mask is true. More...
 
bool walk_backwards (int64_t &offset, bool &is_reverse, pos_t &pos, int64_t distance, const string &source_path, const Path &path)
 Walk backwards either along an alignment path or a source path, updates positions. More...
 
bool walk_backwards_along_alignment (const Path &path, int64_t distance, pos_t &pos)
 Walk backwards along the alignment path. More...
 
void apply_deletion (Alignment &aln, const pos_t &pos)
 Add a deletion to the alignment. More...
 
void apply_insertion (Alignment &aln, const pos_t &pos)
 Add an insertion to the alignment. More...
 
void apply_aligned_base (Alignment &aln, const pos_t &pos, char graph_char, char read_char)
 Add a match/mismatch to the alignment. More...
 
mt19937_64 & prng ()
 

Private Attributes

unordered_map< char, string > mutation_alphabets
 Remainder of the alphabet after removing a given character. More...
 
size_t total_seq_length = 0
 The total sequence length in our graph. More...
 
vector< double > phred_prob
 Memo for pre-multiplied Phred -> probability conversion. More...
 
vector< MarkovDistribution< pair< uint8_t, bool >, pair< uint8_t, bool > > > transition_distrs_1
 A Markov distribution for each read position indicating quality and whether the base is an 'N'. More...
 
vector< MarkovDistribution< pair< uint8_t, bool >, pair< uint8_t, bool > > > transition_distrs_2
 A second set of Markov distributions for the second read in a pair. More...
 
MarkovDistribution< pair< uint8_t, bool >, pair< pair< uint8_t, bool >, pair< uint8_t, bool > > > joint_initial_distr
 A distribution for the joint initial qualities of a read pair. More...
 
vector< mt19937_64 > prngs
 
vg::discrete_distribution path_sampler
 
vector< vg::uniform_int_distribution< size_t > > start_pos_samplers
 
vg::uniform_int_distribution< uint8_t > strand_sampler
 
vg::uniform_int_distribution< size_t > background_sampler
 
vg::uniform_int_distribution< size_t > mut_sampler
 
vg::uniform_real_distribution< double > prob_sampler
 
const double sub_poly_rate
 
const double indel_poly_rate
 
const double indel_error_prop
 
const double fragment_mean
 
const double fragment_sd
 
size_t sample_counter = 0
 
uint64_t seed
 
const bool retry_on_Ns
 Should we try again for a read without Ns of we get Ns? More...
 
const bool sample_unsheared_paths
 
vector< string > source_paths
 Restrict reads to just these paths (path-only mode) if nonempty. More...
 
ofstream position_file
 

Static Private Attributes

static const string alphabet = "ACGT"
 DNA alphabet. More...
 

Additional Inherited Members

- Public Attributes inherited from vg::AbstractReadSampler
bool multi_position_annotations = false
 
size_t max_tries = 100
 What limit should we use for retry loops before giving up or failing? More...
 
std::unique_ptr< std::function< bool(const path_handle_t &)> > annotation_path_filter
 
- Protected Member Functions inherited from vg::AbstractReadSampler
void annotate_with_path_positions (Alignment &aln)
 
- Protected Attributes inherited from vg::AbstractReadSampler
PathPositionHandleGraphgraph
 The graph being simulated against. More...
 

Detailed Description

Class that simulates reads with alignments to a graph that mimic the error profile of NGS sequencing data.

Constructor & Destructor Documentation

◆ NGSSimulator() [1/2]

vg::NGSSimulator::NGSSimulator ( PathPositionHandleGraph graph,
const string &  ngs_fastq_file,
const string &  ngs_paired_fastq_file = "",
bool  interleaved_fastq = false,
const vector< string > &  source_paths = {},
const vector< double > &  source_path_ploidies = {},
const vector< pair< string, double >> &  transcript_expressions = {},
const vector< tuple< string, string, size_t >> &  haplotype_transcripts = {},
double  substition_polymorphism_rate = 0.001,
double  indel_polymorphism_rate = 0.0002,
double  indel_error_proportion = 0.01,
double  fragment_length_mean = 300.0,
double  fragment_length_stdev = 50.0,
double  error_multiplier = 1.0,
bool  retry_on_Ns = true,
bool  sample_unsheared_paths = false,
uint64_t  seed = 0 
)

Initialize simulator. FASTQ file will be used to train an error distribution. Most reads in the FASTQ should be the same length. Polymorphism rates apply uniformly along a read, whereas errors are distributed as indicated by the learned distribution. The simulation can also be restricted to named paths in the graph. Alternatively, it can match an expression profile. However, it cannot be simulateously restricted to paths and to an expression profile.

◆ NGSSimulator() [2/2]

vg::NGSSimulator::NGSSimulator ( void  )
privatedelete

Member Function Documentation

◆ advance()

bool vg::NGSSimulator::advance ( int64_t &  offset,
bool &  is_reverse,
pos_t pos,
char &  graph_char,
const string &  source_path 
)
private

Move forward one position in either the source path or the graph, depending on mode. Update the arguments. Return true if we can't because we hit a tip or false otherwise

◆ advance_by_distance()

bool vg::NGSSimulator::advance_by_distance ( int64_t &  offset,
bool &  is_reverse,
pos_t pos,
int64_t  distance,
const string &  source_path 
)
private

Move forward a certain distance in either the source path or the graph, depending on mode. Update the arguments. Return true if we can't because we hit a tip or false otherwise

◆ advance_on_graph()

bool vg::NGSSimulator::advance_on_graph ( pos_t pos,
char &  graph_char 
)
private

Move forward one position in the graph along a random path, return true if we can't because we hit a tip or false otherwise

◆ advance_on_graph_by_distance()

bool vg::NGSSimulator::advance_on_graph_by_distance ( pos_t pos,
int64_t  distance 
)
private

Move forward a certain distance in the graph along a random path, return true if we can't because we hit a tip or false otherwise

◆ advance_on_path()

bool vg::NGSSimulator::advance_on_path ( int64_t &  offset,
bool &  is_reverse,
pos_t pos,
char &  graph_char,
const string &  source_path 
)
private

Move forward one position in the source path, return true if we can't because we hit a tip or false otherwise

◆ advance_on_path_by_distance()

bool vg::NGSSimulator::advance_on_path_by_distance ( int64_t &  offset,
bool &  is_reverse,
pos_t pos,
int64_t  distance,
const string &  source_path 
)
private

Move forward a certain distance in the source path, return true if we can't because we hit a tip or false otherwise

◆ apply_aligned_base()

void vg::NGSSimulator::apply_aligned_base ( Alignment aln,
const pos_t pos,
char  graph_char,
char  read_char 
)
private

Add a match/mismatch to the alignment.

◆ apply_deletion()

void vg::NGSSimulator::apply_deletion ( Alignment aln,
const pos_t pos 
)
private

Add a deletion to the alignment.

◆ apply_insertion()

void vg::NGSSimulator::apply_insertion ( Alignment aln,
const pos_t pos 
)
private

Add an insertion to the alignment.

◆ apply_N_mask()

void vg::NGSSimulator::apply_N_mask ( string &  sequence,
const vector< bool > &  n_mask 
)
private

Mask out bases with 'N's if the mask is true.

◆ connect_to_position_file()

void vg::NGSSimulator::connect_to_position_file ( const string &  filename)

Open up a stream to output read positions to.

◆ finalize()

void vg::NGSSimulator::finalize ( )
private

Indicate that there is no more training data.

◆ get_read_name()

string vg::NGSSimulator::get_read_name ( )
private

Get an unclashing read name.

◆ prng()

mt19937_64 & vg::NGSSimulator::prng ( )
private

◆ record_read_pair_quality()

void vg::NGSSimulator::record_read_pair_quality ( const Alignment aln_1,
const Alignment aln_2 
)
private

Add a pair of quality strings to the training data.

◆ record_read_quality()

void vg::NGSSimulator::record_read_quality ( const Alignment aln,
bool  read_2 = false 
)
private

Add a quality string to the training data.

◆ register_sampled_position()

void vg::NGSSimulator::register_sampled_position ( const Alignment aln,
const string &  path_name,
size_t  offset,
bool  is_reverse 
)
private

Ouput a sampled position to the path position file.

◆ sample_path()

size_t vg::NGSSimulator::sample_path ( )
private

Return the index of a path if using source_paths or else numeric_limits<size_t>::max()

◆ sample_read()

Alignment vg::NGSSimulator::sample_read ( )

Sample an individual read and alignment.

◆ sample_read_internal()

void vg::NGSSimulator::sample_read_internal ( Alignment aln,
int64_t &  offset,
bool &  is_reverse,
pos_t curr_pos,
const string &  source_path 
)
private

Internal method called by paired and unpaired samplers for both whole- graph and path sources. Offset and is_reverse are only used (and drive the iteration and update of curr_pos) in path node. Otherwise, in whole graph mode, they are ignored and curr_pos is used to traverse the graph directly.

◆ sample_read_pair()

pair< Alignment, Alignment > vg::NGSSimulator::sample_read_pair ( )

Sample a pair of reads an alignments.

◆ sample_read_quality()

pair< string, vector< bool > > vg::NGSSimulator::sample_read_quality ( )
private

Get a quality string and a vector of 'N'-masks that mimics the training data.

◆ sample_read_quality_internal()

pair< string, vector< bool > > vg::NGSSimulator::sample_read_quality_internal ( pair< uint8_t, bool >  first,
bool  transitions_1 
)
private

Wrapped internal function for quality sampling.

◆ sample_read_quality_pair()

pair< pair< string, vector< bool > >, pair< string, vector< bool > > > vg::NGSSimulator::sample_read_quality_pair ( )
private

Get a pair of quality strings and vectors of 'N'-masks that mimic the training data.

◆ sample_start_graph_pos()

pos_t vg::NGSSimulator::sample_start_graph_pos ( )
private

Get a random position in the graph.

◆ sample_start_path_pos()

tuple< int64_t, bool, pos_t > vg::NGSSimulator::sample_start_path_pos ( const size_t &  source_path_idx,
const int64_t &  fragment_length 
)
private

Get a random position along the source path. Enforce fragment length restrictions if argument is positive.

◆ sample_start_pos()

void vg::NGSSimulator::sample_start_pos ( const size_t &  source_path_idx,
const int64_t &  fragment_length,
int64_t &  offset,
bool &  is_reverse,
pos_t pos 
)
private

Sample an appropriate starting position according to the mode. Updates the arguments. Providing a negative number for fragment length indicates no fragment length restrictions.

◆ walk_backwards()

bool vg::NGSSimulator::walk_backwards ( int64_t &  offset,
bool &  is_reverse,
pos_t pos,
int64_t  distance,
const string &  source_path,
const Path path 
)
private

Walk backwards either along an alignment path or a source path, updates positions.

◆ walk_backwards_along_alignment()

bool vg::NGSSimulator::walk_backwards_along_alignment ( const Path path,
int64_t  distance,
pos_t pos 
)
private

Walk backwards along the alignment path.

Member Data Documentation

◆ alphabet

const string vg::NGSSimulator::alphabet = "ACGT"
staticprivate

DNA alphabet.

◆ background_sampler

vg::uniform_int_distribution<size_t> vg::NGSSimulator::background_sampler
private

◆ fragment_mean

const double vg::NGSSimulator::fragment_mean
private

◆ fragment_sd

const double vg::NGSSimulator::fragment_sd
private

◆ indel_error_prop

const double vg::NGSSimulator::indel_error_prop
private

◆ indel_poly_rate

const double vg::NGSSimulator::indel_poly_rate
private

◆ joint_initial_distr

MarkovDistribution<pair<uint8_t, bool>, pair<pair<uint8_t, bool>, pair<uint8_t, bool> > > vg::NGSSimulator::joint_initial_distr
private

A distribution for the joint initial qualities of a read pair.

◆ mut_sampler

vg::uniform_int_distribution<size_t> vg::NGSSimulator::mut_sampler
private

◆ mutation_alphabets

unordered_map<char, string> vg::NGSSimulator::mutation_alphabets
private

Remainder of the alphabet after removing a given character.

◆ path_sampler

vg::discrete_distribution vg::NGSSimulator::path_sampler
private

◆ phred_prob

vector<double> vg::NGSSimulator::phred_prob
private

Memo for pre-multiplied Phred -> probability conversion.

◆ position_file

ofstream vg::NGSSimulator::position_file
private

◆ prngs

vector<mt19937_64> vg::NGSSimulator::prngs
private

◆ prob_sampler

vg::uniform_real_distribution<double> vg::NGSSimulator::prob_sampler
private

◆ retry_on_Ns

const bool vg::NGSSimulator::retry_on_Ns
private

Should we try again for a read without Ns of we get Ns?

◆ sample_counter

size_t vg::NGSSimulator::sample_counter = 0
private

◆ sample_unsheared_paths

const bool vg::NGSSimulator::sample_unsheared_paths
private

◆ seed

uint64_t vg::NGSSimulator::seed
private

◆ source_paths

vector<string> vg::NGSSimulator::source_paths
private

Restrict reads to just these paths (path-only mode) if nonempty.

◆ start_pos_samplers

vector<vg::uniform_int_distribution<size_t> > vg::NGSSimulator::start_pos_samplers
private

◆ strand_sampler

vg::uniform_int_distribution<uint8_t> vg::NGSSimulator::strand_sampler
private

◆ sub_poly_rate

const double vg::NGSSimulator::sub_poly_rate
private

◆ total_seq_length

size_t vg::NGSSimulator::total_seq_length = 0
private

The total sequence length in our graph.

◆ transition_distrs_1

vector<MarkovDistribution<pair<uint8_t, bool>, pair<uint8_t, bool> > > vg::NGSSimulator::transition_distrs_1
private

A Markov distribution for each read position indicating quality and whether the base is an 'N'.

◆ transition_distrs_2

vector<MarkovDistribution<pair<uint8_t, bool>, pair<uint8_t, bool> > > vg::NGSSimulator::transition_distrs_2
private

A second set of Markov distributions for the second read in a pair.


The documentation for this class was generated from the following files: