vg
tools for working with variation graphs
|
#include <haplotype_indexer.hpp>
Public Member Functions | |
HaplotypeIndexer () | |
Perform initialization of backing libraries. More... | |
std::vector< std::string > | parse_vcf (const std::string &filename, const PathHandleGraph &graph, const std::string &job_name="GBWT") const |
std::vector< std::string > | parse_vcf (const std::string &filename, const PathHandleGraph &graph, const std::vector< path_handle_t > &paths, const std::string &job_name="GBWT") const |
std::unique_ptr< gbwt::DynamicGBWT > | build_gbwt (const std::vector< std::string > &vcf_parse_files, const std::string &job_name="GBWT", const PathHandleGraph *graph=nullptr, const std::unordered_set< path_handle_t > *paths=nullptr, bool skip_unvisited_paths=false) const |
std::unique_ptr< gbwt::DynamicGBWT > | build_gbwt (const PathHandleGraph &graph) const |
std::unique_ptr< gbwt::GBWT > | build_gbwt (const HandleGraph &graph, const std::vector< std::string > &aln_filenames, const std::string &aln_format, size_t parallel_jobs=1) const |
Public Member Functions inherited from vg::Progressive | |
void | preload_progress (const string &message) |
void | create_progress (const string &message, long count) |
void | create_progress (long count) |
void | update_progress (long i) |
void | increment_progress () |
void | destroy_progress (void) |
Public Attributes | |
bool | warn_on_missing_variants = true |
Print a warning if variants in the VCF can't be found in the graph. More... | |
size_t | max_missing_variant_warnings = 10 |
Only report up to this many missing-variant warnings. More... | |
std::map< std::string, std::string > | path_to_vcf |
bool | rename_variants = true |
std::string | batch_file_prefix = "" |
bool | phase_homozygous = true |
Phase homozygous unphased variants. More... | |
bool | force_phasing = false |
Arbitrarily phase all unphased variants. More... | |
bool | discard_overlaps = false |
Join together overlapping haplotypes. More... | |
size_t | samples_in_batch = 200 |
Number of samples to process together in a haplotype batch. More... | |
size_t | gbwt_buffer_size = gbwt::DynamicGBWT::INSERT_BATCH_SIZE / gbwt::MILLION |
Size of the GBWT buffer in millions of nodes. More... | |
size_t | id_interval = gbwt::DynamicGBWT::SAMPLE_INTERVAL |
Interval at which to sample for GBWT locate. More... | |
std::pair< size_t, size_t > | sample_range = std::pair<size_t, size_t>(0, std::numeric_limits<size_t>::max()) |
Range of VCF samples to process (first to past-last). More... | |
std::map< std::string, std::pair< size_t, size_t > > | regions |
std::unordered_set< std::string > | excluded_samples |
Public Attributes inherited from vg::Progressive | |
bool | show_progress = false |
Allows indexing haplotypes, either to pre-parsed haplotype files or to a GBWT.
vg::HaplotypeIndexer::HaplotypeIndexer ()
Perform initialization of backing libraries.
std::unique_ptr< gbwt::GBWT > vg::HaplotypeIndexer::build_gbwt (const HandleGraph &graph, const std::vector< std::string > &aln_filenames, const std::string &aln_format, size_t parallel_jobs=1) const
Build a GBWT from the alignments. Each distinct alignment name becomes a sample in the GBWT metadata. If there are multiple alignments with the same name, the corresponding GBWT path names will have the same sample identifier but different values in the count field.
aln_format can be "GAM" or "GAF".
Runs approximately the given number of jobs in parallel. The exact number depends on the sizes of weakly connected components in the graph. Each job uses at most 2 threads.
std::unique_ptr< gbwt::DynamicGBWT > vg::HaplotypeIndexer::build_gbwt (const PathHandleGraph &graph) const
Build a GBWT from the embedded non-alt paths in the graph.
std::unique_ptr< gbwt::DynamicGBWT > vg::HaplotypeIndexer::build_gbwt (const std::vector< std::string > &vcf_parse_files, const std::string &job_name="GBWT", const PathHandleGraph *graph=nullptr, const std::unordered_set< path_handle_t > *paths=nullptr, bool skip_unvisited_paths=false) const
Build a GBWT from the haplotypes in the given VCF parse files.
Respects excluded_samples and does not produce threads for them.
We expect that all parse files contain sample/contig names and that the sample names are the same in all files.
There may be no parse files.
If graph is provided and is not null, also includes embedded non-alt paths from the graph.
If paths is provided and is not null, include only those specified paths from the graph. If skip_unvisited_paths is set, paths whose contigs are not visited by VCF parse files will be skipped.
std::vector< std::string > vg::HaplotypeIndexer::parse_vcf (const std::string &filename, const PathHandleGraph &graph, const std::string &job_name="GBWT") const
Parse the VCF file into the types needed for GBWT indexing.
Returns the file names for the VCF parses of non-alt paths. If batch_file_prefix is set, these are permanent files. Otherwise they are temporary files that persist until the program exits.
std::vector< std::string > vg::HaplotypeIndexer::parse_vcf (const std::string &filename, const PathHandleGraph &graph, const std::vector< path_handle_t > &paths, const std::string &job_name="GBWT") const
Parse the VCF file into the types needed for GBWT indexing.
Returns the file names for the VCF parses of the specified paths. If batch_file_prefix is set, these are permanent files. Otherwise they are temporary files that persist until the program exits.
std::string vg::HaplotypeIndexer::batch_file_prefix = "" |
If batch_file_prefix is nonempty, a file for each contig is saved to PREFIX_VCFCONTIG, and files for each batch of haplotypes are saved to files named like PREFIX_VCFCONTIG_STARTSAMPLE_ENDSAMPLE. Otherwise, the batch files are still saved, but to temporary files.
bool vg::HaplotypeIndexer::discard_overlaps = false |
Join together overlapping haplotypes.
std::unordered_set<std::string> vg::HaplotypeIndexer::excluded_samples |
Excluded VCF sample names, for which threads will not be generated. Ignored during VCF parsing.
bool vg::HaplotypeIndexer::force_phasing = false |
Arbitrarily phase all unphased variants.
size_t vg::HaplotypeIndexer::gbwt_buffer_size = gbwt::DynamicGBWT::INSERT_BATCH_SIZE / gbwt::MILLION |
Size of the GBWT buffer in millions of nodes.
size_t vg::HaplotypeIndexer::id_interval = gbwt::DynamicGBWT::SAMPLE_INTERVAL |
Interval at which to sample for GBWT locate.
size_t vg::HaplotypeIndexer::max_missing_variant_warnings = 10 |
Only report up to this many missing-variant warnings.
std::map<std::string, std::string> vg::HaplotypeIndexer::path_to_vcf |
Path names in the graph are mapped to VCF contig names via path_to_vcf, or used as-is if no entry there is found.
bool vg::HaplotypeIndexer::phase_homozygous = true |
Phase homozygous unphased variants.
std::map<std::string, std::pair<size_t, size_t> > vg::HaplotypeIndexer::regions |
Region restrictions for contigs, in VCF name space, as 0-based exclusive-end ranges.
bool vg::HaplotypeIndexer::rename_variants = true |
Use graph path names instead of VCF path names when composing variant alt paths.
std::pair<size_t, size_t> vg::HaplotypeIndexer::sample_range = std::pair<size_t, size_t>(0, std::numeric_limits<size_t>::max()) |
Range of VCF samples to process (first to past-last).
size_t vg::HaplotypeIndexer::samples_in_batch = 200 |
Number of samples to process together in a haplotype batch.
bool vg::HaplotypeIndexer::warn_on_missing_variants = true |
Print a warning if variants in the VCF can't be found in the graph.