#include <sampler.hpp>

Inheritance diagram for vg::NGSSimulator:

Classes
class	MarkovDistribution

Public Member Functions
	NGSSimulator (PathPositionHandleGraph &graph, const string &ngs_fastq_file, const string &ngs_paired_fastq_file="", bool interleaved_fastq=false, const vector< string > &source_paths={}, const vector< double > &source_path_ploidies={}, const vector< pair< string, double >> &transcript_expressions={}, const vector< tuple< string, string, size_t >> &haplotype_transcripts={}, double substition_polymorphism_rate=0.001, double indel_polymorphism_rate=0.0002, double indel_error_proportion=0.01, double fragment_length_mean=300.0, double fragment_length_stdev=50.0, double error_multiplier=1.0, bool retry_on_Ns=true, bool sample_unsheared_paths=false, uint64_t seed=0)

Alignment	sample_read ()
	Sample an individual read and alignment. More...

pair< Alignment, Alignment >	sample_read_pair ()
	Sample a pair of reads an alignments. More...

void	connect_to_position_file (const string &filename)
	Open up a stream to output read positions to. More...

Public Member Functions inherited from vg::AbstractReadSampler
virtual	~AbstractReadSampler ()=default

	AbstractReadSampler (PathPositionHandleGraph &graph)
	Make a new sampler using the given graph. More...

Private Member Functions
	NGSSimulator (void)=delete

void	record_read_quality (const Alignment &aln, bool read_2=false)
	Add a quality string to the training data. More...

void	record_read_pair_quality (const Alignment &aln_1, const Alignment &aln_2)
	Add a pair of quality strings to the training data. More...

void	finalize ()
	Indicate that there is no more training data. More...

pair< string, vector< bool > >	sample_read_quality ()
	Get a quality string and a vector of 'N'-masks that mimics the training data. More...

pair< pair< string, vector< bool > >, pair< string, vector< bool > > >	sample_read_quality_pair ()
	Get a pair of quality strings and vectors of 'N'-masks that mimic the training data. More...

pair< string, vector< bool > >	sample_read_quality_internal (pair< uint8_t, bool > first, bool transitions_1)
	Wrapped internal function for quality sampling. More...

void	sample_read_internal (Alignment &aln, int64_t &offset, bool &is_reverse, pos_t &curr_pos, const string &source_path)

size_t	sample_path ()
	Return the index of a path if using source_paths or else numeric_limits<size_t>::max() More...

void	register_sampled_position (const Alignment &aln, const string &path_name, size_t offset, bool is_reverse)
	Ouput a sampled position to the path position file. More...

void	sample_start_pos (const size_t &source_path_idx, const int64_t &fragment_length, int64_t &offset, bool &is_reverse, pos_t &pos)

pos_t	sample_start_graph_pos ()
	Get a random position in the graph. More...

tuple< int64_t, bool, pos_t >	sample_start_path_pos (const size_t &source_path_idx, const int64_t &fragment_length)

string	get_read_name ()
	Get an unclashing read name. More...

bool	advance (int64_t &offset, bool &is_reverse, pos_t &pos, char &graph_char, const string &source_path)

bool	advance_by_distance (int64_t &offset, bool &is_reverse, pos_t &pos, int64_t distance, const string &source_path)

bool	advance_on_path (int64_t &offset, bool &is_reverse, pos_t &pos, char &graph_char, const string &source_path)

bool	advance_on_path_by_distance (int64_t &offset, bool &is_reverse, pos_t &pos, int64_t distance, const string &source_path)

bool	advance_on_graph (pos_t &pos, char &graph_char)

bool	advance_on_graph_by_distance (pos_t &pos, int64_t distance)

void	apply_N_mask (string &sequence, const vector< bool > &n_mask)
	Mask out bases with 'N's if the mask is true. More...

bool	walk_backwards (int64_t &offset, bool &is_reverse, pos_t &pos, int64_t distance, const string &source_path, const Path &path)
	Walk backwards either along an alignment path or a source path, updates positions. More...

bool	walk_backwards_along_alignment (const Path &path, int64_t distance, pos_t &pos)
	Walk backwards along the alignment path. More...

void	apply_deletion (Alignment &aln, const pos_t &pos)
	Add a deletion to the alignment. More...

void	apply_insertion (Alignment &aln, const pos_t &pos)
	Add an insertion to the alignment. More...

void	apply_aligned_base (Alignment &aln, const pos_t &pos, char graph_char, char read_char)
	Add a match/mismatch to the alignment. More...

mt19937_64 &	prng ()

Private Attributes
unordered_map< char, string >	mutation_alphabets
	Remainder of the alphabet after removing a given character. More...

size_t	total_seq_length = 0
	The total sequence length in our graph. More...

vector< double >	phred_prob
	Memo for pre-multiplied Phred -> probability conversion. More...

vector< MarkovDistribution< pair< uint8_t, bool >, pair< uint8_t, bool > > >	transition_distrs_1
	A Markov distribution for each read position indicating quality and whether the base is an 'N'. More...

vector< MarkovDistribution< pair< uint8_t, bool >, pair< uint8_t, bool > > >	transition_distrs_2
	A second set of Markov distributions for the second read in a pair. More...

MarkovDistribution< pair< uint8_t, bool >, pair< pair< uint8_t, bool >, pair< uint8_t, bool > > >	joint_initial_distr
	A distribution for the joint initial qualities of a read pair. More...

vector< mt19937_64 >	prngs

vg::discrete_distribution	path_sampler

vector< vg::uniform_int_distribution< size_t > >	start_pos_samplers

vg::uniform_int_distribution< uint8_t >	strand_sampler

vg::uniform_int_distribution< size_t >	background_sampler

vg::uniform_int_distribution< size_t >	mut_sampler

vg::uniform_real_distribution< double >	prob_sampler

const double	sub_poly_rate

const double	indel_poly_rate

const double	indel_error_prop

const double	fragment_mean

const double	fragment_sd

size_t	sample_counter = 0

uint64_t	seed

const bool	retry_on_Ns
	Should we try again for a read without Ns of we get Ns? More...

const bool	sample_unsheared_paths

vector< string >	source_paths
	Restrict reads to just these paths (path-only mode) if nonempty. More...

ofstream	position_file

Static Private Attributes
static const string	alphabet = "ACGT"
	DNA alphabet. More...

Additional Inherited Members
Public Attributes inherited from vg::AbstractReadSampler
bool	multi_position_annotations = false

size_t	max_tries = 100
	What limit should we use for retry loops before giving up or failing? More...

std::unique_ptr< std::function< bool(const path_handle_t &)> >	annotation_path_filter

Protected Member Functions inherited from vg::AbstractReadSampler
void	annotate_with_path_positions (Alignment &aln)

Protected Attributes inherited from vg::AbstractReadSampler
PathPositionHandleGraph &	graph
	The graph being simulated against. More...

Detailed Description

Class that simulates reads with alignments to a graph that mimic the error profile of NGS sequencing data.

Constructor & Destructor Documentation

◆ NGSSimulator() [1/2]

vg::NGSSimulator::NGSSimulator	(	PathPositionHandleGraph &	graph,
		const string &	ngs_fastq_file,
		const string &	ngs_paired_fastq_file = `""`,
		bool	interleaved_fastq = `false`,
		const vector< string > &	source_paths = `{}`,
		const vector< double > &	source_path_ploidies = `{}`,
		const vector< pair< string, double >> &	transcript_expressions = `{}`,
		const vector< tuple< string, string, size_t >> &	haplotype_transcripts = `{}`,
		double	substition_polymorphism_rate = `0.001`,
		double	indel_polymorphism_rate = `0.0002`,
		double	indel_error_proportion = `0.01`,
		double	fragment_length_mean = `300.0`,
		double	fragment_length_stdev = `50.0`,
		double	error_multiplier = `1.0`,
		bool	retry_on_Ns = `true`,
		bool	sample_unsheared_paths = `false`,
		uint64_t	seed = `0`
	)

Initialize simulator. FASTQ file will be used to train an error distribution. Most reads in the FASTQ should be the same length. Polymorphism rates apply uniformly along a read, whereas errors are distributed as indicated by the learned distribution. The simulation can also be restricted to named paths in the graph. Alternatively, it can match an expression profile. However, it cannot be simulateously restricted to paths and to an expression profile.

◆ NGSSimulator() [2/2]

vg::NGSSimulator::NGSSimulator ( void )

privatedelete

Member Function Documentation

◆ advance()

bool vg::NGSSimulator::advance	(	int64_t &	offset,
		bool &	is_reverse,
		pos_t &	pos,
		char &	graph_char,
		const string &	source_path
	)

private

Move forward one position in either the source path or the graph, depending on mode. Update the arguments. Return true if we can't because we hit a tip or false otherwise

◆ advance_by_distance()

bool vg::NGSSimulator::advance_by_distance	(	int64_t &	offset,
		bool &	is_reverse,
		pos_t &	pos,
		int64_t	distance,
		const string &	source_path
	)

private

Move forward a certain distance in either the source path or the graph, depending on mode. Update the arguments. Return true if we can't because we hit a tip or false otherwise

◆ advance_on_graph()

bool vg::NGSSimulator::advance_on_graph	(	pos_t &	pos,
		char &	graph_char
	)

private

Move forward one position in the graph along a random path, return true if we can't because we hit a tip or false otherwise

◆ advance_on_graph_by_distance()

bool vg::NGSSimulator::advance_on_graph_by_distance	(	pos_t &	pos,
		int64_t	distance
	)

private

Move forward a certain distance in the graph along a random path, return true if we can't because we hit a tip or false otherwise

◆ advance_on_path()

bool vg::NGSSimulator::advance_on_path	(	int64_t &	offset,
		bool &	is_reverse,
		pos_t &	pos,
		char &	graph_char,
		const string &	source_path
	)

private

Move forward one position in the source path, return true if we can't because we hit a tip or false otherwise

◆ advance_on_path_by_distance()

bool vg::NGSSimulator::advance_on_path_by_distance	(	int64_t &	offset,
		bool &	is_reverse,
		pos_t &	pos,
		int64_t	distance,
		const string &	source_path
	)

private

Move forward a certain distance in the source path, return true if we can't because we hit a tip or false otherwise

◆ apply_aligned_base()

void vg::NGSSimulator::apply_aligned_base	(	Alignment &	aln,
		const pos_t &	pos,
		char	graph_char,
		char	read_char
	)

private

Add a match/mismatch to the alignment.

◆ apply_deletion()

void vg::NGSSimulator::apply_deletion	(	Alignment &	aln,
		const pos_t &	pos
	)

private

Add a deletion to the alignment.

◆ apply_insertion()

void vg::NGSSimulator::apply_insertion	(	Alignment &	aln,
		const pos_t &	pos
	)

private

Add an insertion to the alignment.

◆ apply_N_mask()

void vg::NGSSimulator::apply_N_mask	(	string &	sequence,
		const vector< bool > &	n_mask
	)

private

Mask out bases with 'N's if the mask is true.

◆ connect_to_position_file()

void vg::NGSSimulator::connect_to_position_file ( const string & filename )

Open up a stream to output read positions to.

◆ finalize()

void vg::NGSSimulator::finalize ( )

private

Indicate that there is no more training data.

◆ get_read_name()

string vg::NGSSimulator::get_read_name ( )

private

Get an unclashing read name.

◆ prng()

mt19937_64 & vg::NGSSimulator::prng ( )

private

◆ record_read_pair_quality()

void vg::NGSSimulator::record_read_pair_quality	(	const Alignment &	aln_1,
		const Alignment &	aln_2
	)

private

Add a pair of quality strings to the training data.

◆ record_read_quality()

void vg::NGSSimulator::record_read_quality	(	const Alignment &	aln,
		bool	read_2 = `false`
	)

private

Add a quality string to the training data.

◆ register_sampled_position()

void vg::NGSSimulator::register_sampled_position	(	const Alignment &	aln,
		const string &	path_name,
		size_t	offset,
		bool	is_reverse
	)

private

Ouput a sampled position to the path position file.

◆ sample_path()

size_t vg::NGSSimulator::sample_path ( )

private

Return the index of a path if using source_paths or else numeric_limits<size_t>::max()

◆ sample_read()

Alignment vg::NGSSimulator::sample_read ( )

Sample an individual read and alignment.

◆ sample_read_internal()

void vg::NGSSimulator::sample_read_internal	(	Alignment &	aln,
		int64_t &	offset,
		bool &	is_reverse,
		pos_t &	curr_pos,
		const string &	source_path
	)

private

Internal method called by paired and unpaired samplers for both whole- graph and path sources. Offset and is_reverse are only used (and drive the iteration and update of curr_pos) in path node. Otherwise, in whole graph mode, they are ignored and curr_pos is used to traverse the graph directly.

◆ sample_read_pair()

pair< Alignment, Alignment > vg::NGSSimulator::sample_read_pair ( )

Sample a pair of reads an alignments.

◆ sample_read_quality()

pair< string, vector< bool > > vg::NGSSimulator::sample_read_quality ( )

private

Get a quality string and a vector of 'N'-masks that mimics the training data.

◆ sample_read_quality_internal()

pair< string, vector< bool > > vg::NGSSimulator::sample_read_quality_internal	(	pair< uint8_t, bool >	first,
		bool	transitions_1
	)

private

Wrapped internal function for quality sampling.

◆ sample_read_quality_pair()

pair< pair< string, vector< bool > >, pair< string, vector< bool > > > vg::NGSSimulator::sample_read_quality_pair ( )

private

Get a pair of quality strings and vectors of 'N'-masks that mimic the training data.

◆ sample_start_graph_pos()

pos_t vg::NGSSimulator::sample_start_graph_pos ( )

private

Get a random position in the graph.

◆ sample_start_path_pos()

tuple< int64_t, bool, pos_t > vg::NGSSimulator::sample_start_path_pos	(	const size_t &	source_path_idx,
		const int64_t &	fragment_length
	)

private

Get a random position along the source path. Enforce fragment length restrictions if argument is positive.

◆ sample_start_pos()

void vg::NGSSimulator::sample_start_pos	(	const size_t &	source_path_idx,
		const int64_t &	fragment_length,
		int64_t &	offset,
		bool &	is_reverse,
		pos_t &	pos
	)

private

Sample an appropriate starting position according to the mode. Updates the arguments. Providing a negative number for fragment length indicates no fragment length restrictions.

◆ walk_backwards()

bool vg::NGSSimulator::walk_backwards	(	int64_t &	offset,
		bool &	is_reverse,
		pos_t &	pos,
		int64_t	distance,
		const string &	source_path,
		const Path &	path
	)

private

Walk backwards either along an alignment path or a source path, updates positions.

◆ walk_backwards_along_alignment()

bool vg::NGSSimulator::walk_backwards_along_alignment	(	const Path &	path,
		int64_t	distance,
		pos_t &	pos
	)

private

Walk backwards along the alignment path.

Member Data Documentation

◆ alphabet

const string vg::NGSSimulator::alphabet = "ACGT"

staticprivate

DNA alphabet.

◆ background_sampler

vg::uniform_int_distribution<size_t> vg::NGSSimulator::background_sampler

private

◆ fragment_mean

const double vg::NGSSimulator::fragment_mean

private

◆ fragment_sd

const double vg::NGSSimulator::fragment_sd

private

◆ indel_error_prop

const double vg::NGSSimulator::indel_error_prop

private

◆ indel_poly_rate

const double vg::NGSSimulator::indel_poly_rate

private

◆ joint_initial_distr

MarkovDistribution<pair<uint8_t, bool>, pair<pair<uint8_t, bool>, pair<uint8_t, bool> > > vg::NGSSimulator::joint_initial_distr

private

A distribution for the joint initial qualities of a read pair.

◆ mut_sampler

vg::uniform_int_distribution<size_t> vg::NGSSimulator::mut_sampler

private

◆ mutation_alphabets

unordered_map<char, string> vg::NGSSimulator::mutation_alphabets

private

Remainder of the alphabet after removing a given character.

◆ path_sampler

vg::discrete_distribution vg::NGSSimulator::path_sampler

private

◆ phred_prob

vector<double> vg::NGSSimulator::phred_prob

private

Memo for pre-multiplied Phred -> probability conversion.

◆ position_file

ofstream vg::NGSSimulator::position_file

private

◆ prngs

vector<mt19937_64> vg::NGSSimulator::prngs

private

◆ prob_sampler

vg::uniform_real_distribution<double> vg::NGSSimulator::prob_sampler

private

◆ retry_on_Ns

const bool vg::NGSSimulator::retry_on_Ns

private

Should we try again for a read without Ns of we get Ns?

◆ sample_counter

size_t vg::NGSSimulator::sample_counter = 0

private

◆ sample_unsheared_paths

const bool vg::NGSSimulator::sample_unsheared_paths

private

◆ seed

uint64_t vg::NGSSimulator::seed

private

◆ source_paths

vector<string> vg::NGSSimulator::source_paths

private

Restrict reads to just these paths (path-only mode) if nonempty.

◆ start_pos_samplers

vector<vg::uniform_int_distribution<size_t> > vg::NGSSimulator::start_pos_samplers

private

◆ strand_sampler

vg::uniform_int_distribution<uint8_t> vg::NGSSimulator::strand_sampler

private

◆ sub_poly_rate

const double vg::NGSSimulator::sub_poly_rate

private

◆ total_seq_length

size_t vg::NGSSimulator::total_seq_length = 0

private

The total sequence length in our graph.

◆ transition_distrs_1

vector<MarkovDistribution<pair<uint8_t, bool>, pair<uint8_t, bool> > > vg::NGSSimulator::transition_distrs_1

private

A Markov distribution for each read position indicating quality and whether the base is an 'N'.

◆ transition_distrs_2

vector<MarkovDistribution<pair<uint8_t, bool>, pair<uint8_t, bool> > > vg::NGSSimulator::transition_distrs_2

private

A second set of Markov distributions for the second read in a pair.

The documentation for this class was generated from the following files:

src/sampler.hpp
src/sampler.cpp

Classes

Public Member Functions

Private Member Functions

Private Attributes

Static Private Attributes

Additional Inherited Members

Detailed Description

Constructor & Destructor Documentation

◆ NGSSimulator() [1/2]

◆ NGSSimulator() [2/2]

Member Function Documentation

◆ advance()

◆ advance_by_distance()

◆ advance_on_graph()

◆ advance_on_graph_by_distance()

◆ advance_on_path()

◆ advance_on_path_by_distance()

◆ apply_aligned_base()

◆ apply_deletion()

◆ apply_insertion()

◆ apply_N_mask()

◆ connect_to_position_file()

◆ finalize()

◆ get_read_name()

◆ prng()

◆ record_read_pair_quality()

◆ record_read_quality()

◆ register_sampled_position()

◆ sample_path()

◆ sample_read()

◆ sample_read_internal()

◆ sample_read_pair()

◆ sample_read_quality()

◆ sample_read_quality_internal()

◆ sample_read_quality_pair()

◆ sample_start_graph_pos()

◆ sample_start_path_pos()

◆ sample_start_pos()

◆ walk_backwards()

◆ walk_backwards_along_alignment()

Member Data Documentation

◆ alphabet

◆ background_sampler

◆ fragment_mean

◆ fragment_sd

◆ indel_error_prop

◆ indel_poly_rate

◆ joint_initial_distr

◆ mut_sampler

◆ mutation_alphabets

◆ path_sampler

◆ phred_prob

◆ position_file

◆ prngs

◆ prob_sampler

◆ retry_on_Ns

◆ sample_counter

◆ sample_unsheared_paths

◆ seed

◆ source_paths

◆ start_pos_samplers

◆ strand_sampler

◆ sub_poly_rate

◆ total_seq_length

◆ transition_distrs_1

◆ transition_distrs_2