vg
tools for working with variation graphs
Classes | Public Member Functions | Public Attributes | Private Types | Private Member Functions | Static Private Member Functions | List of all members
vg::ZipCodeForest Class Reference

#include <zip_code_tree.hpp>

Classes

struct  child_info_t
 
struct  forest_growing_state_t
 This stores information about the state of the forest as we fill it in. More...
 
struct  interval_state_t
 
struct  sort_value_t
 

Public Member Functions

 ZipCodeForest ()
 Constructor. More...
 
template<typename Minimizer >
void fill_in_forest (const vector< Seed > &seeds, const VectorView< Minimizer > &minimizers, const SnarlDistanceIndex &distance_index, size_t gap_distance_limit, size_t distance_limit=std::numeric_limits< size_t >::max())
 
template<typename Minimizer >
void print_self (const vector< Seed > *seeds, const VectorView< Minimizer > *minimizers) const
 
void validate_zip_forest (const SnarlDistanceIndex &distance_index, const vector< Seed > *seeds, size_t distance_limit=std::numeric_limits< size_t >::max()) const
 
template<typename Minimizer >
void get_cyclic_snarl_intervals (forest_growing_state_t &forest_state, const VectorView< Minimizer > &minimizers, const ZipCodeForest::interval_state_t &snarl_interval, const ZipCodeForest::interval_state_t &parent_interval, const forward_list< ZipCodeForest::interval_state_t > &child_intervals, forward_list< ZipCodeForest::interval_state_t > &next_intervals) const
 

Public Attributes

vector< ZipCodeTreetrees
 The actual data, a collection of ZipCodeTrees. More...
 

Private Types

typedef SnarlDistanceIndexClusterer::Seed Seed
 
typedef ZipCodeTree::tree_item_type_t tree_item_type_t
 
typedef ZipCodeTree::tree_item_t tree_item_t
 

Private Member Functions

void sort_one_interval (forest_growing_state_t &forest_state, const interval_state_t &interval) const
 
void radix_sort_zipcodes (vector< size_t > &zipcode_sort_order, const vector< sort_value_t > &sort_values_by_seed, const interval_state_t &interval, bool reverse_order, size_t min_value, size_t max_value, bool sort_by_chain_component=false) const
 
void default_sort_zipcodes (vector< size_t > &zipcode_sort_order, const vector< sort_value_t > &sort_values_by_seed, const interval_state_t &interval, bool reverse_order) const
 
void get_next_intervals (forest_growing_state_t &forest_state, const interval_state_t &interval, std::forward_list< interval_state_t > &next_intervals) const
 
template<typename Minimizer >
void get_cyclic_snarl_intervals (forest_growing_state_t &forest_state, const VectorView< Minimizer > &minimizers, const interval_state_t &snarl_interval, const interval_state_t &parent_interval, const forward_list< interval_state_t > &child_intervals, forward_list< interval_state_t > &next_intervals) const
 
void open_chain (forest_growing_state_t &forest_state, const size_t &depth, size_t seed_index, bool chain_is_reversed)
 
void close_chain (forest_growing_state_t &forest_state, const size_t &depth, const Seed &last_seed, bool chain_is_reversed)
 
void add_child_to_chain (forest_growing_state_t &forest_state, const size_t &depth, const size_t &seed_index, bool child_is_reversed, bool chain_is_reversed)
 
void open_snarl (forest_growing_state_t &forest_state, const size_t &depth)
 
void close_snarl (forest_growing_state_t &forest_state, const size_t &depth, const Seed &last_seed, bool last_is_reversed, bool is_cyclic_snarl)
 
void add_snarl_distances (forest_growing_state_t &forest_state, const size_t &depth, const Seed &seed, bool child_is_reversed, bool snarl_is_reversed, bool to_snarl_end, bool is_cyclic_snarl)
 

Static Private Member Functions

static double get_correlation (const vector< std::pair< size_t, size_t >> &values)
 

Detailed Description

A collection of ZipCodeTrees The ZipCodeForest takes a set of seeds and makes ZipCodeTrees There will be a separate tree for each connected component or slice of a chain that is too far from anything else on both sides, using the given distance limit

Member Typedef Documentation

◆ Seed

◆ tree_item_t

◆ tree_item_type_t

Constructor & Destructor Documentation

◆ ZipCodeForest()

vg::ZipCodeForest::ZipCodeForest ( )
inline

Member Function Documentation

◆ add_child_to_chain()

void vg::ZipCodeForest::add_child_to_chain ( forest_growing_state_t forest_state,
const size_t &  depth,
const size_t &  seed_index,
bool  child_is_reversed,
bool  chain_is_reversed 
)
private

◆ add_snarl_distances()

void vg::ZipCodeForest::add_snarl_distances ( forest_growing_state_t forest_state,
const size_t &  depth,
const Seed seed,
bool  child_is_reversed,
bool  snarl_is_reversed,
bool  to_snarl_end,
bool  is_cyclic_snarl 
)
private

◆ close_chain()

void vg::ZipCodeForest::close_chain ( forest_growing_state_t forest_state,
const size_t &  depth,
const Seed last_seed,
bool  chain_is_reversed 
)
private

◆ close_snarl()

void vg::ZipCodeForest::close_snarl ( forest_growing_state_t forest_state,
const size_t &  depth,
const Seed last_seed,
bool  last_is_reversed,
bool  is_cyclic_snarl 
)
private

◆ default_sort_zipcodes()

void vg::ZipCodeForest::default_sort_zipcodes ( vector< size_t > &  zipcode_sort_order,
const vector< sort_value_t > &  sort_values_by_seed,
const interval_state_t interval,
bool  reverse_order 
) const
private

Helper function to sort the seeds using std::sort Sorts the slice of seeds in the given interval of zipcode_sort_order, which is a vector of indices into seeds

◆ fill_in_forest()

template<typename Minimizer >
void vg::ZipCodeForest::fill_in_forest ( const vector< Seed > &  seeds,
const VectorView< Minimizer > &  minimizers,
const SnarlDistanceIndex &  distance_index,
size_t  gap_distance_limit,
size_t  distance_limit = std::numeric_limits<size_t>::max() 
)

Populate the zip forest If a distance limit is given, then also partition the tree into subtrees that are farther than the distance_limit from each other Otherwise, the forest will just be connected components The gap_distance_limit is the limit for making runs of seeds in a cyclic snarl- it should be roughly the distance that the dynamic programming is willing to jump to connect two consecutive minimizers TODO: I think the distance_limit should just be the same as the gap_distance_limit If a distance_limit is given, then distances larger than the distance limit are not guaranteed to be accurate, but will be greater than the distance_limit

◆ get_correlation()

double vg::ZipCodeForest::get_correlation ( const vector< std::pair< size_t, size_t >> &  values)
staticprivate

Given a vector of value pairs, and a bool indicating if the pair is used for the correlation, return the correlation. This is the spearman correlation for now

◆ get_cyclic_snarl_intervals() [1/2]

template<typename Minimizer >
void vg::ZipCodeForest::get_cyclic_snarl_intervals ( forest_growing_state_t forest_state,
const VectorView< Minimizer > &  minimizers,
const interval_state_t snarl_interval,
const interval_state_t parent_interval,
const forward_list< interval_state_t > &  child_intervals,
forward_list< interval_state_t > &  next_intervals 
) const
private

Given intervals representing child chains on a cyclic snarl, re-partition them and get new intervals representing runs of seeds that are "close" in each chain. Like in get_next_intervals, new intervals are added to next_intervals in their sort order. Two seeds are close to each other if: (1) the distance between them on the read is <= t, where t is a given distance limit, (2) the minimum distance between them on the chain is <= t, and (3) they are on the same strand in the read. Runs are sorted by their latest position in the read, and oriented according to the orientation of the read through the snarl. The orientation of the read in the snarl's parent chain and in the snarl children are estimated by finding the spearman correlation of the seeds. If the orientation of a run is unclear, then it is duplicated to be oriented in each direction

◆ get_cyclic_snarl_intervals() [2/2]

template<typename Minimizer >
void vg::ZipCodeForest::get_cyclic_snarl_intervals ( forest_growing_state_t forest_state,
const VectorView< Minimizer > &  minimizers,
const ZipCodeForest::interval_state_t snarl_interval,
const ZipCodeForest::interval_state_t parent_interval,
const forward_list< ZipCodeForest::interval_state_t > &  child_intervals,
forward_list< ZipCodeForest::interval_state_t > &  next_intervals 
) const

◆ get_next_intervals()

void vg::ZipCodeForest::get_next_intervals ( forest_growing_state_t forest_state,
const interval_state_t interval,
std::forward_list< interval_state_t > &  next_intervals 
) const
private

Assuming that the range of seeds in sort_values_by_seeds given by the interval is sorted, add the intervals of the children of the interval to the front of next_intervals. The new intervals get added in their sort order, so the start of a chain will be at the start of the list, to be popped first. For children of chains, seeds that are on the chain itself and not nested will be put on the same interval if there are no seeds in snarls between them, even if they are not on the same node

◆ open_chain()

void vg::ZipCodeForest::open_chain ( forest_growing_state_t forest_state,
const size_t &  depth,
size_t  seed_index,
bool  chain_is_reversed 
)
private

◆ open_snarl()

void vg::ZipCodeForest::open_snarl ( forest_growing_state_t forest_state,
const size_t &  depth 
)
private

◆ print_self()

template<typename Minimizer >
void vg::ZipCodeForest::print_self ( const vector< Seed > *  seeds,
const VectorView< Minimizer > *  minimizers 
) const
inline

◆ radix_sort_zipcodes()

void vg::ZipCodeForest::radix_sort_zipcodes ( vector< size_t > &  zipcode_sort_order,
const vector< sort_value_t > &  sort_values_by_seed,
const interval_state_t interval,
bool  reverse_order,
size_t  min_value,
size_t  max_value,
bool  sort_by_chain_component = false 
) const
private

Helper function to sort the seeds using radix sort Sorts the slice of seeds in the given interval of zipcode_sort_order, which is a vector of indices into seeds reverse_order is true if the order should be reversed. The interval also has an is_reversed field, which refers to the orientation in the snarl tree This should run in linear time, but it is dependent on the values being sorted on to have a small range min_ and max_value are the minimum and maximum value being sorted on If sort_by_chain_component is true, then sort on the chain component in sort_values

◆ sort_one_interval()

void vg::ZipCodeForest::sort_one_interval ( forest_growing_state_t forest_state,
const interval_state_t interval 
) const
private

Sorts the given interval (which must contain seeds on the same snarl/chain/node at the given depth) Sorting is roughly linear along the top-level chains, in a topological-ish order in snarls. Uses radix_sort_zipcodes and default_sort_zipcodes For chains, everything is sorted with the prefix sum value of the chain itself from the distance index, not the order in the chain in the zip code tree. Everything will be sorted in the order of the zip code tree, but the values will be set from the distance index. This means that later, the values may be out of order or may need to be subtracted from the length of the chain to get the distances to the ends of the chain

◆ validate_zip_forest()

void vg::ZipCodeForest::validate_zip_forest ( const SnarlDistanceIndex &  distance_index,
const vector< Seed > *  seeds,
size_t  distance_limit = std::numeric_limits<size_t>::max() 
) const

Member Data Documentation

◆ trees

vector<ZipCodeTree> vg::ZipCodeForest::trees

The actual data, a collection of ZipCodeTrees.


The documentation for this class was generated from the following files: