vg
tools for working with variation graphs
|
#include <constructor.hpp>
Public Member Functions | |
ConstructedChunk | construct_chunk (string reference_sequence, string reference_path_name, vector< vcflib::Variant > variants, size_t chunk_offset) const |
void | construct_graph (string vcf_contig, FastaReference &reference, VcfBuffer &variant_source, const vector< FastaReference * > &insertion, const function< void(Graph &)> &callback) |
void | construct_graph (const vector< FastaReference * > &references, const vector< vcflib::VariantCallFile * > &variant_files, const vector< FastaReference * > &insertions, const function< void(Graph &)> &callback) |
void | construct_graph (const vector< string > &reference_filenames, const vector< string > &variant_filenames, const vector< string > &insertion_filenames, const function< void(Graph &)> &callback) |
void | construct_graph (const vector< FastaReference * > &references, const vector< vcflib::VariantCallFile * > &variant_files, const vector< FastaReference * > &insertions, MutablePathMutableHandleGraph *destination) |
void | construct_graph (const vector< string > &reference_filenames, const vector< string > &variant_filenames, const vector< string > &insertion_filenames, MutablePathMutableHandleGraph *destination) |
Public Member Functions inherited from vg::Progressive | |
void | preload_progress (const string &message) |
void | create_progress (const string &message, long count) |
void | create_progress (long count) |
void | update_progress (long i) |
void | increment_progress () |
void | destroy_progress (void) |
Public Member Functions inherited from vg::NameMapper | |
void | add_name_mapping (const string &vcf_name, const string &fasta_name) |
string | vcf_to_fasta (const string &vcf_name) const |
string | fasta_to_vcf (const string &fasta_name) const |
Public Attributes | |
bool | flat = false |
size_t | max_parsed_variant_size = 100 |
bool | alt_paths = false |
bool | sha1_variant_name = true |
bool | do_svs = false |
bool | trim_indels = true |
bool | alts_as_loci = false |
bool | greedy_pieces = false |
bool | chain_deletions = true |
bool | warn_on_lowercase = true |
bool | warn_on_ambiguous = true |
size_t | max_node_size = 1000 |
size_t | vars_per_chunk = 1024 |
size_t | bases_per_chunk = 1024 * 1024 |
set< string > | allowed_vcf_names |
map< string, pair< size_t, size_t > > | allowed_vcf_regions |
Public Attributes inherited from vg::Progressive | |
bool | show_progress = false |
Protected Attributes | |
set< string > | symbolic_allele_warnings |
id_t | max_id = 0 |
Protected Attributes inherited from vg::NameMapper | |
map< string, string > | vcf_to_fasta_renames |
map< string, string > | fasta_to_vcf_renames |
This is the reverse map from FASTA sequence name to VCF sequence name. More... | |
Private Member Functions | |
bool | sanitize_sequence_in_place (string &sequence, const string *sequence_name=nullptr, size_t sequence_start_offset=0, const vcflib::Variant *variant=nullptr) const |
Static Private Member Functions | |
static void | trim_to_variable (vector< list< vcflib::VariantAllele >> &parsed_alleles) |
static void | condense_edits (list< vcflib::VariantAllele > &parsed_allele) |
static pair< int64_t, int64_t > | get_bounds (const vector< list< vcflib::VariantAllele >> &trimmed_variant) |
static pair< int64_t, int64_t > | get_symbolic_bounds (vcflib::Variant var) |
Private Attributes | |
unordered_set< string > | lowercase_warned_sequences |
What sequences have we warned about containing lowercase characters? More... | |
bool | lowercase_warned_alt = false |
Have we given a warning yet about lowercase alt alleles? More... | |
bool | multiallelic_sv_warned = false |
Have we given a warning yet about multiallelic SVs? More... | |
bool | uncanonicalizable_sv_warned = false |
Have we given a warning yet about uncanonicalizable SVs? More... | |
unordered_set< string > | ambiguous_warned_sequences |
What sequences have we warned about containing unsupported ambiguity codes? More... | |
|
staticprivate |
Given a list of VariantAllele edits, condense adjacent perfect match edits to be maximally long.
ConstructedChunk vg::Constructor::construct_chunk | ( | string | reference_sequence, |
string | reference_path_name, | ||
vector< vcflib::Variant > | variants, | ||
size_t | chunk_offset | ||
) | const |
Construct a ConstructedChunk of graph from the given piece of sequence, with the given name, applying the given variants. The variants need to be sorted by start position, and have their start positions set to be ZERO-BASED. However, they also need to have their start positions relative to the global start of the contig, so that hash-based names come out right for them. They also need to not overlap with any variants not in the vector we have (i.e. we need access to all overlapping variants for this region). The variants must not extend beyond the given sequence, though they can abut its edges.
Variants in the vector may not use symbolic alleles.
All variants must have had their canonical field set, either manually to false or by canonicalize() to true.
chunk_offset gives the global 0-based position at which this chunk starts in the reference contig it is part of, which is used to correctly place variants.
Find the next breakpoint.
Takes in the search position, like the position of the next un-made reference base. So searches from the left edge of the passed inclusive position.
Finds the next required breakpoint within this clump, after the given position, given created nodes and deletions that already exist.
Returns the inclusive position of the base to the left of this breakpoint, so the breakpoint is immediately to the right of the base at the returned position. This means that sometimes, such as if the next piece of the reference would be 1 bp long, this function will return the same value it was passed.
void vg::Constructor::construct_graph | ( | const vector< FastaReference * > & | references, |
const vector< vcflib::VariantCallFile * > & | variant_files, | ||
const vector< FastaReference * > & | insertions, | ||
const function< void(Graph &)> & | callback | ||
) |
Construct a graph using the given FASTA references and VCFlib VCF files. The VCF files are assumed to be grouped by contig and then sorted by position within the contig, such that each contig is present in only one file. If multiple FASTAs are used, each contig must be present in only one FASTA file. Reference and VCF vectors may not contain nulls.
insertions contains FASTAs containing sequences for resolving symbolic insert alleles in the VCFs.
Calls the given callback with constructed graph chunks, eventually (hopefully) in multiple threads. Chunks may contain dangling edges into the next chunk.
void vg::Constructor::construct_graph | ( | const vector< FastaReference * > & | references, |
const vector< vcflib::VariantCallFile * > & | variant_files, | ||
const vector< FastaReference * > & | insertions, | ||
MutablePathMutableHandleGraph * | destination | ||
) |
Construct a graph using the given FASTA references and VCFlib VCF files. The VCF files are assumed to be grouped by contig and then sorted by position within the contig, such that each contig is present in only one file. If multiple FASTAs are used, each contig must be present in only one FASTA file. Reference and VCF vectors may not contain nulls.
insertions contains FASTAs containing sequences for resolving symbolic insert alleles in the VCFs.
Builds the graph into the given mutable graph object, which may not be thread safe.
void vg::Constructor::construct_graph | ( | const vector< string > & | reference_filenames, |
const vector< string > & | variant_filenames, | ||
const vector< string > & | insertion_filenames, | ||
const function< void(Graph &)> & | callback | ||
) |
Construct a graph using the given FASTA references and VCF files on disk. The VCF files are assumed to be grouped by contig and then sorted by position within the contig, such that each contig is present in only one file. If multiple FASTAs are used, each contig must be present in only one FASTA file.
insertions contains FASTA filenames containing sequences for resolving symbolic insert alleles in the VCFs.
Calls the given callback with constructed graph chunks, eventually (hopefully) in multiple threads. Chunks may contain dangling edges into the next chunk.
void vg::Constructor::construct_graph | ( | const vector< string > & | reference_filenames, |
const vector< string > & | variant_filenames, | ||
const vector< string > & | insertion_filenames, | ||
MutablePathMutableHandleGraph * | destination | ||
) |
Construct a graph using the given FASTA references and VCF files on disk. The VCF files are assumed to be grouped by contig and then sorted by position within the contig, such that each contig is present in only one file. If multiple FASTAs are used, each contig must be present in only one FASTA file.
insertions contains FASTA filenames containing sequences for resolving symbolic insert alleles in the VCFs.
Builds the graph into the given mutable graph object, which may not be thread safe.
void vg::Constructor::construct_graph | ( | string | vcf_contig, |
FastaReference & | reference, | ||
VcfBuffer & | variant_source, | ||
const vector< FastaReference * > & | insertion, | ||
const function< void(Graph &)> & | callback | ||
) |
Construct a graph for the given VCF contig name, using the given reference and the variants from the given buffered VCF file. Emits a sequence of Graph chunks, which may be too big to serialize directly.
Doesn't handle any of the setup for VCF indexing. Just scans all the variants that can come out of the buffer, so make sure indexing is set on the file first before passing it in.
insertion contains FASTAs containing sequences for resolving symbolic insert alleles in the VCF.
Calls the given callback with constructed graph chunks, in a single thread. Chunks may contain dangling edges into the next chunk.
|
staticprivate |
Given a vector of lists of VariantAllele edits that have been trimmed with trim_to_variable() above, one per non-reference alt for a variant, return the position of the first variable base, and the position of the last variable base. If there's no variable region, the result is max int64_t and -1, and if there's a 0-length variable region, the result is the base after it and the base before it.
|
staticprivate |
Given a symbolic variant, check its bounds and return them. This function is needed to handle SVs properly, since they won't always have their ref and alt fields put in. Note that insertions may have an end bound before their start, because the anchoring base isn't included.
|
private |
Given a sequence, get rid of all the lowercase characters and all the ambiguity codes. Warn if configured, and the sequence has a name assigned, and no warning has yet been issued for that name, or if a variant is specified.
Will error if this results in a string with anything other than A, C, G, T, and N.
sequence_start_offset can be set to produce useful messages if the sequence we are looking at is an excerpt from a longer sequence.
Sanitizing may move the stored string data in memory.
Returns true if the string was modified.
We need this as a function because vcflib reaches back and reads the FASTA files directly, so we can't just preprocess the reference and we need to constantly clean up the variants.
|
staticprivate |
Given a vector of lists of VariantAllele edits, trim in from the left and right, leaving a core of edits bounded by edits that actually change the reference in at least one allele.
Postcondition: either all lists of VariantAlleles are empty, or at least one begins with a non-match and at least one ends with a non-match. Adjacent edits in the list abut; there are no uncovered gaps in the edits. This means that internal perfect match edits will be preserved.
set<string> vg::Constructor::allowed_vcf_names |
map<string, pair<size_t, size_t> > vg::Constructor::allowed_vcf_regions |
bool vg::Constructor::alt_paths = false |
bool vg::Constructor::alts_as_loci = false |
|
mutableprivate |
What sequences have we warned about containing unsupported ambiguity codes?
size_t vg::Constructor::bases_per_chunk = 1024 * 1024 |
bool vg::Constructor::chain_deletions = true |
bool vg::Constructor::do_svs = false |
bool vg::Constructor::flat = false |
bool vg::Constructor::greedy_pieces = false |
|
mutableprivate |
Have we given a warning yet about lowercase alt alleles?
|
mutableprivate |
What sequences have we warned about containing lowercase characters?
|
protected |
All chunks are generated with IDs starting at 1, but graphs emitted from construct_graph need to have the IDs rewritten so they don't overlap. Moreover, multiple calls to construct_graph need to not have conflicting IDs, because some construct_graph implementations call other ones. What we do for now is globally track the max ID already used, so all calls to construct_graph follow a single ID ordering.
size_t vg::Constructor::max_node_size = 1000 |
size_t vg::Constructor::max_parsed_variant_size = 100 |
|
mutableprivate |
Have we given a warning yet about multiallelic SVs?
bool vg::Constructor::sha1_variant_name = true |
|
protected |
Remembers which unusable symbolic alleles we've already emitted a warning about during construction.
bool vg::Constructor::trim_indels = true |
|
mutableprivate |
Have we given a warning yet about uncanonicalizable SVs?
size_t vg::Constructor::vars_per_chunk = 1024 |
bool vg::Constructor::warn_on_ambiguous = true |
bool vg::Constructor::warn_on_lowercase = true |