|
tuple< vector< structures::UnionFind >, structures::UnionFind > | cluster_seeds_internal (vector< vector< SeedCache > * > &all_seeds, size_t read_distance_limit, size_t fragment_distance_limit=0) const |
|
void | get_nodes (ClusteringProblem &clustering_problem, vector< vector< net_handle_t >> &chains_by_level) const |
|
void | cluster_snarl_level (ClusteringProblem &clustering_problem) const |
|
void | cluster_chain_level (ClusteringProblem &clustering_problem, size_t depth) const |
|
void | cluster_one_node (ClusteringProblem &clustering_problem, SnarlTreeNodeProblem *node_problem) const |
|
void | cluster_one_snarl (ClusteringProblem &clustering_problem, SnarlTreeNodeProblem *snarl_problem) const |
|
void | cluster_one_chain (ClusteringProblem &clustering_problem, SnarlTreeNodeProblem *chain_problem, bool is_top_level_chain) const |
|
void | add_seed_to_chain_problem (ClusteringProblem &clustering_problem, SnarlTreeNodeProblem *chain_problem, SnarlTreeNodeProblem::SnarlTreeChild &last_child, size_t &last_prefix_sum, size_t &last_length, size_t &last_chain_component_end, vector< ClusterHead > &cluster_heads_to_add_again, bool &found_first_node, pair< bool, bool > &found_first_node_by_read, const SnarlTreeNodeProblem::SnarlTreeChild ¤t_child, bool is_first_child, bool is_last_child, bool skip_distances_to_ends) const |
|
void | add_snarl_to_chain_problem (ClusteringProblem &clustering_problem, SnarlTreeNodeProblem *chain_problem, SnarlTreeNodeProblem::SnarlTreeChild &last_child, size_t &last_prefix_sum, size_t &last_length, size_t &last_chain_component_end, vector< ClusterHead > &cluster_heads_to_add_again, bool &found_first_node, pair< bool, bool > &found_first_node_by_read, const SnarlTreeNodeProblem::SnarlTreeChild ¤t_child, bool is_first_child, bool is_last_child, bool skip_distances_to_ends) const |
|
void | cluster_root (ClusteringProblem &clustering_problem) const |
|
void | cluster_seeds_on_linear_structure (ClusteringProblem &clustering_problem, SnarlTreeNodeProblem *problem, size_t structure_length, bool include_prefix_sum, bool skip_distances_to_ends) const |
|
void | compare_and_combine_cluster_on_child_structures (ClusteringProblem &clustering_problem, SnarlTreeNodeProblem *child_problem1, SnarlTreeNodeProblem *child_problem2, SnarlTreeNodeProblem *parent_problem, const vector< pair< size_t, size_t >> &child_distances, bool is_root, bool first_child) const |
|
void | compare_and_combine_cluster_on_one_child (ClusteringProblem &clustering_problem, SnarlTreeNodeProblem *child_problem) const |
|
SnarlDistanceIndexClusterer is used for clustering seeds (positions on the graph) A "cluster" is a partition of seeds that is based on the minimum distance between them in the graph Consider a graph where each seed is a node and two seeds are connected if the minimum distance between them is smaller than a given distance limit. Each connected component of this graph is a cluster
The clustering algorithm is based on the snarl tree Clusters are formed on nodes of the snarl tree, which represent nodes/snarls/chains Each node/snarl/chain represents a subgraph of the variation graph A clustered snarl tree node contains all seeds that occur on its subgraph, and the seeds have been partitioned into clusters Each cluster knows the shortest distance from any seed it contains to both ends of the snarl tree node containing it Clustering is done progressively by walking up the snarl tree and forming clusters on each snarl tree node (only visiting nodes that have seeds on them) At each snarl tree node, assume that its children have already been clustered. The clusters of the children are compared to each other, and any pair that are close enough are combined to produce clusters on the parent The distances from each cluster to the ends of the parent are updated
The algorithm starts by assigning each seed to its node on the snarl tree Since nodes are all on chains, this fills in all the children of chains that are nodes It then walks up the snarl tree, level by level, and clusters each snarl tree node that contains seeds At a given level, first cluster each chain in the level. After clustering a chain, assign it to its parent snarl. Then, go through each of the snarls that have just been given children, and cluster the snarls. Each snarl then gets assigned to its parent chain This completes one level of the snarl tree. Each chain in the next level has just been populated by the snarls from this level, and already knew about its nodes from the first step, so it is ready to be clustered
Every time the clusterer is run, a ClusteringProblem is made to store information about the state of the clusterer The ClusteringProblem keeps track of which level of the snarl tree is currently being clustered, and keeps track of the children of the current and next level of the snarl tree. Each snarl tree node that contains seeds is represented by a SnarlTreeNodeProblem. The SnarlTreeNodeProblem represents the problem of clustering one snarl tree node. It knows the identities of its children and keeps track of its cluster heads