#include <haplotype_indexer.hpp>

Inheritance diagram for vg::HaplotypeIndexer:

Public Member Functions
	HaplotypeIndexer ()
	Perform initialization of backing libraries. More...

std::vector< std::string >	parse_vcf (const std::string &filename, const PathHandleGraph &graph, const std::string &job_name="GBWT") const

std::vector< std::string >	parse_vcf (const std::string &filename, const PathHandleGraph &graph, const std::vector< path_handle_t > &paths, const std::string &job_name="GBWT") const

std::unique_ptr< gbwt::DynamicGBWT >	build_gbwt (const std::vector< std::string > &vcf_parse_files, const std::string &job_name="GBWT", const PathHandleGraph *graph=nullptr, const std::unordered_set< path_handle_t > *paths=nullptr, bool skip_unvisited_paths=false) const

std::unique_ptr< gbwt::DynamicGBWT >	build_gbwt (const PathHandleGraph &graph) const

std::unique_ptr< gbwt::GBWT >	build_gbwt (const HandleGraph &graph, const std::vector< std::string > &aln_filenames, const std::string &aln_format, size_t parallel_jobs=1) const

Public Member Functions inherited from vg::Progressive
void	preload_progress (const string &message)

void	create_progress (const string &message, long count)

void	create_progress (long count)

void	ensure_progress (long count)

void	update_progress (long i)

void	increment_progress ()

void	destroy_progress (void)

Public Attributes
bool	warn_on_missing_variants = true
	Print a warning if variants in the VCF can't be found in the graph. More...

size_t	max_missing_variant_warnings = 10
	Only report up to this many of them. More...

std::map< std::string, std::string >	path_to_vcf

bool	rename_variants = true

std::string	batch_file_prefix = ""

bool	phase_homozygous = true
	Phase homozygous unphased variants. More...

bool	force_phasing = false
	Arbitrarily phase all unphased variants. More...

bool	discard_overlaps = false
	Join together overlapping haplotypes. More...

size_t	samples_in_batch = 200
	Number of samples to process together in a haplotype batch. More...

size_t	gbwt_buffer_size = gbwt::DynamicGBWT::INSERT_BATCH_SIZE / gbwt::MILLION
	Size of the GBWT buffer in millions of nodes. More...

size_t	id_interval = gbwt::DynamicGBWT::SAMPLE_INTERVAL
	Interval at which to sample for GBWT locate. More...

std::pair< size_t, size_t >	sample_range = std::pair<size_t, size_t>(0, std::numeric_limits<size_t>::max())
	Range of VCF samples to process (first to past-last). More...

std::map< std::string, std::pair< size_t, size_t > >	regions

std::unordered_set< std::string >	excluded_samples

Public Attributes inherited from vg::Progressive
bool	show_progress = false

Additional Inherited Members
Static Public Member Functions inherited from vg::Progressive
static void	with_progress (bool show_progress, const std::string &task, const std::function< void(const std::function< void(size_t, size_t)> &progress)> &callback)

Detailed Description

Allows indexing haplotypes, either to pre-parsed haplotype files or to a GBWT.

Constructor & Destructor Documentation

◆ HaplotypeIndexer()

vg::HaplotypeIndexer::HaplotypeIndexer ( )

Perform initialization of backing libraries.

Member Function Documentation

◆ build_gbwt() [1/3]

std::unique_ptr< gbwt::GBWT > vg::HaplotypeIndexer::build_gbwt	(	const HandleGraph &	graph,
		const std::vector< std::string > &	aln_filenames,
		const std::string &	aln_format,
		size_t	parallel_jobs = `1`
	)		const

Build a GBWT from the alignments. Each distinct alignment name becomes a sample in the GBWT metadata. If there are multiple alignments with the same name, the corresponding GBWT path names will have the same sample identifier but different values in the count field.

aln_format can be "GAM" or "GAF".

Runs approximately the given number of jobs in parallel. The exact number depends on the sizes of weakly connected components in the graph. Each job uses at most 2 threads.

In order to save space, the longest common prefix of alignment names is stored as GBWT tag sample_prefix. Only the diverging suffixes are stored as sample names.

Additionally, the list of values used in quality strings is stored as GBWT tag quality_values. Tag quality_codes stores a comma-separated list of canonical Huffman code lengths for the quality values. The values are sorted by code length.

◆ build_gbwt() [2/3]

std::unique_ptr< gbwt::DynamicGBWT > vg::HaplotypeIndexer::build_gbwt ( const PathHandleGraph & graph ) const

Build a GBWT from the embedded non-alt paths in the graph.

◆ build_gbwt() [3/3]

std::unique_ptr< gbwt::DynamicGBWT > vg::HaplotypeIndexer::build_gbwt	(	const std::vector< std::string > &	vcf_parse_files,
		const std::string &	job_name = `"GBWT"`,
		const PathHandleGraph *	graph = `nullptr`,
		const std::unordered_set< path_handle_t > *	paths = `nullptr`,
		bool	skip_unvisited_paths = `false`
	)		const

Build a GBWT from the haplotypes in the given VCF parse files.

Respects excluded_samples and does not produce threads for them.

We expect that all parse files contain sample/contig names and that the sample names are the same in all files.

There may be no parse files.

If graph is provided and is not null, also includes embedded non-alt paths from the graph.

If paths is provided and is not null, includes only those specified paths from the graph. If skip_unvisited_paths is set, paths whose contigs are not visited by the VCF parse files will be skipped.

◆ parse_vcf() [1/2]

std::vector< std::string > vg::HaplotypeIndexer::parse_vcf	(	const std::string &	filename,
		const PathHandleGraph &	graph,
		const std::string &	job_name = `"GBWT"`
	)		const

Parse the VCF file into the types needed for GBWT indexing.

Returns the file names for the VCF parses of non-alt paths. If batch_file_prefix is set, these are permanent files. Otherwise they are temporary files that persist until the program exits.

◆ parse_vcf() [2/2]

std::vector< std::string > vg::HaplotypeIndexer::parse_vcf	(	const std::string &	filename,
		const PathHandleGraph &	graph,
		const std::vector< path_handle_t > &	paths,
		const std::string &	job_name = `"GBWT"`
	)		const

Parse the VCF file into the types needed for GBWT indexing.

Returns the file names for the VCF parses of the specified paths. If batch_file_prefix is set, these are permanent files. Otherwise they are temporary files that persist until the program exits.

Member Data Documentation

◆ batch_file_prefix

std::string vg::HaplotypeIndexer::batch_file_prefix = ""

If batch_file_prefix is nonempty, a file for each contig is saved to PREFIX_VCFCONTIG, and the files for each batch of haplotypes are saved under names like PREFIX_VCFCONTIG_STARTSAMPLE_ENDSAMPLE. Otherwise, the batch files are still saved, but to temporary files.

◆ discard_overlaps

bool vg::HaplotypeIndexer::discard_overlaps = false

Join together overlapping haplotypes.

◆ excluded_samples

std::unordered_set<std::string> vg::HaplotypeIndexer::excluded_samples

Excluded VCF sample names, for which threads will not be generated. Ignored during VCF parsing.

◆ force_phasing

bool vg::HaplotypeIndexer::force_phasing = false

Arbitrarily phase all unphased variants.

◆ gbwt_buffer_size

size_t vg::HaplotypeIndexer::gbwt_buffer_size = gbwt::DynamicGBWT::INSERT_BATCH_SIZE / gbwt::MILLION

Size of the GBWT buffer in millions of nodes.

◆ id_interval

size_t vg::HaplotypeIndexer::id_interval = gbwt::DynamicGBWT::SAMPLE_INTERVAL

Interval at which to sample for GBWT locate.

◆ max_missing_variant_warnings

size_t vg::HaplotypeIndexer::max_missing_variant_warnings = 10

Only report up to this many of them.

◆ path_to_vcf

std::map<std::string, std::string> vg::HaplotypeIndexer::path_to_vcf

Path names in the graph are mapped to VCF contig names via path_to_vcf, or used as-is if no entry is found there.

◆ phase_homozygous

bool vg::HaplotypeIndexer::phase_homozygous = true

Phase homozygous unphased variants.

◆ regions

std::map<std::string, std::pair<size_t, size_t> > vg::HaplotypeIndexer::regions

Region restrictions for contigs, in VCF name space, as 0-based exclusive-end ranges.

◆ rename_variants

bool vg::HaplotypeIndexer::rename_variants = true

Use graph path names instead of VCF path names when composing variant alt paths.

◆ sample_range

std::pair<size_t, size_t> vg::HaplotypeIndexer::sample_range = std::pair<size_t, size_t>(0, std::numeric_limits<size_t>::max())

Range of VCF samples to process (first to past-last).

◆ samples_in_batch

size_t vg::HaplotypeIndexer::samples_in_batch = 200

Number of samples to process together in a haplotype batch.

◆ warn_on_missing_variants

bool vg::HaplotypeIndexer::warn_on_missing_variants = true

Print a warning if variants in the VCF can't be found in the graph.

The documentation for this class was generated from the following files:

src/haplotype_indexer.hpp
src/haplotype_indexer.cpp

Public Member Functions

Public Attributes

Additional Inherited Members