vg
tools for working with variation graphs
Public Member Functions | Public Attributes | Private Member Functions | Private Attributes | List of all members
vg::ReadFilter< Read > Class Template Reference

#include <readfilter.hpp>

Public Member Functions

Counts filter_alignment (Read &aln)
 
int filter (istream *alignment_stream)
 
bool trim_ambiguous_ends (Read &read, int k) const
 
int filter (istream *alignment_stream)
 
int filter (istream *alignment_stream)
 
bool trim_ambiguous_ends (Alignment &alignment, int k) const
 
bool trim_ambiguous_ends (MultipathAlignment &aln, int k) const
 

Public Attributes

bool complement_filter = false
 Actually take the complement of the filter. More...
 
vector< string > name_prefixes
 
bool exact_name = false
 Read name must not have anything in it besides the prefix. More...
 
vector< regex > excluded_refpos_contigs
 Read must not have a refpos set with a contig name containing a match to any of these. More...
 
vector< string > subsequences
 Read must contain at least one of these strings as a subsequence. More...
 
unordered_set< string > excluded_features
 
double min_secondary = numeric_limits<double>::lowest()
 
double min_primary = numeric_limits<double>::lowest()
 
bool rescore = false
 
bool frac_score = false
 
bool sub_score = false
 
int max_overhang = numeric_limits<int>::max() / 2
 
int min_end_matches = numeric_limits<int>::min() / 2
 
bool verbose = false
 
double min_mapq = numeric_limits<double>::lowest()
 
int repeat_size = 0
 
bool drop_split = false
 Should we drop split reads that follow edges not in the graph? More...
 
double downsample_probability = 1.0
 We can also pseudorandomly drop reads. What's the probability that we keep a read? More...
 
uint32_t downsample_seed_mask = 0
 
int defray_length = 0
 
int defray_count = 99999
 Limit defray recursion to visit this many nodes. More...
 
bool only_proper_pairs = true
 Filter to proper pairs. More...
 
bool only_mapped = true
 Filter to only mapped reads. More...
 
int threads = -1
 Number of threads from omp. More...
 
int buffer_size = 512
 GAM output buffer size. More...
 
bool write_output = true
 
bool write_tsv = false
 Sometimes we want to pick out fields and write a tsv instead of a gam. More...
 
vector< string > output_fields
 What fields do we want to write to the tsv? More...
 
const HandleGraph * graph = nullptr
 A HandleGraph is required for some filters (Note: ReadFilter doesn't own/free this) More...
 
bool interleaved = false
 Interleaved input. More...
 
bool filter_on_all = false
 
int min_base_quality = numeric_limits<int>::min() / 2
 
double min_base_quality_fraction = numeric_limits<double>::lowest()
 
string annotation_to_match = ""
 
bool only_correctly_mapped = false
 Filter to only correctly mapped reads. More...
 

Private Member Functions

bool has_repeat (const Read &read, int k) const
 
bool is_mapped (const Read &read) const
 
bool trim_ambiguous_end (Alignment &alignment, int k) const
 
bool is_split (const Read &read) const
 
bool sample_read (const Read &read) const
 
Alignment to_alignment (const MultipathAlignment &multipath_aln) const
 
double get_score (const Read &read) const
 
bool matches_name (const Read &read) const
 
bool has_excluded_refpos (const Read &read) const
 
bool has_excuded_feature (const Read &read) const
 
bool is_secondary (const Read &read) const
 
int get_overhang (const Read &read) const
 
int alignment_overhang (const Alignment &aln) const
 
int get_end_matches (const Read &read) const
 
int alignment_end_matches (const Alignment &aln) const
 
int get_mapq (const Read &read) const
 
double get_min_base_qual_fraction (const Read &read) const
 
bool get_is_paired (const Read &read) const
 
bool is_proper_pair (const Read &read) const
 
bool contains_subsequence (const Read &read) const
 
bool matches_annotation (const Read &read) const
 
bool is_correctly_mapped (const Read &read) const
 
void emit (Read &read)
 
void emit (Read &read1, Read &read2)
 
void emit_tsv (Read &read)
 
void filter_internal (istream *in)
 Helper function for filter. More...
 
bool is_mapped (const Alignment &alignment) const
 
bool is_mapped (const MultipathAlignment &mp_alignment) const
 
bool is_correctly_mapped (const Alignment &alignment) const
 
bool is_correctly_mapped (const MultipathAlignment &alignment) const
 
void emit (Alignment &aln)
 
void emit (Alignment &aln1, Alignment &aln2)
 
void emit (MultipathAlignment &mp_aln)
 
void emit (MultipathAlignment &mp_aln1, MultipathAlignment &mp_aln2)
 
double get_score (const Alignment &aln) const
 
double get_score (const MultipathAlignment &read) const
 
bool has_excluded_refpos (const MultipathAlignment &read) const
 
bool has_excluded_refpos (const Alignment &aln) const
 
bool is_secondary (const MultipathAlignment &mp_aln) const
 
bool is_secondary (const Alignment &aln) const
 
int get_overhang (const Alignment &aln) const
 
int get_overhang (const MultipathAlignment &mp_aln) const
 
int get_end_matches (const MultipathAlignment &read) const
 
int get_end_matches (const Alignment &read) const
 
bool is_split (const Alignment &alignment) const
 
bool is_split (const MultipathAlignment &mp_aln) const
 
bool get_is_paired (const Alignment &aln) const
 
bool get_is_paired (const MultipathAlignment &mp_aln) const
 
void emit_tsv (MultipathAlignment &read)
 
void emit_tsv (Alignment &read)
 

Private Attributes

unique_ptr< AlignmentEmitter > aln_emitter
 The two specializations have different writing infrastructure. More...
 
unique_ptr< MultipathAlignmentEmitter > mp_aln_emitter
 

Member Function Documentation

◆ alignment_end_matches()

template<typename Read >
int vg::ReadFilter< Read >::alignment_end_matches ( const Alignment &  aln) const
private

Internal helper for get_end_matches.

◆ alignment_overhang()

template<typename Read >
int vg::ReadFilter< Read >::alignment_overhang ( const Alignment &  aln) const
private

Internal helper for get_overhang

◆ contains_subsequence()

template<typename Read >
bool vg::ReadFilter< Read >::contains_subsequence ( const Read &  read) const
private

Does the read contain at least one of the indicated sequences?

◆ emit() [1/6]

void vg::ReadFilter< Alignment >::emit ( Alignment &  aln)
inlineprivate

◆ emit() [2/6]

void vg::ReadFilter< Alignment >::emit ( Alignment &  aln1,
Alignment &  aln2 
)
inlineprivate

◆ emit() [3/6]

void vg::ReadFilter< MultipathAlignment >::emit ( MultipathAlignment &  mp_aln)
inlineprivate

◆ emit() [4/6]

void vg::ReadFilter< MultipathAlignment >::emit ( MultipathAlignment &  mp_aln1,
MultipathAlignment &  mp_aln2 
)
inlineprivate

◆ emit() [5/6]

template<typename Read >
void vg::ReadFilter< Read >::emit ( Read &  read)
private

Write the read to stdout

◆ emit() [6/6]

template<typename Read >
void vg::ReadFilter< Read >::emit ( Read &  read1,
Read &  read2 
)
private

Write a read pair to stdout

◆ emit_tsv() [1/3]

void vg::ReadFilter< Alignment >::emit_tsv ( Alignment &  read)
inlineprivate

◆ emit_tsv() [2/3]

void vg::ReadFilter< MultipathAlignment >::emit_tsv ( MultipathAlignment &  read)
inlineprivate

◆ emit_tsv() [3/3]

template<typename Read >
void vg::ReadFilter< Read >::emit_tsv ( Read &  read)
private

Write a tsv line for a read to stdout

◆ filter() [1/3]

template<typename Read >
int vg::ReadFilter< Read >::filter ( istream *  alignment_stream)

Filter the alignments available from the given stream, placing them on standard output or in the appropriate file. Returns 0 on success, exit code to use on error.

◆ filter() [2/3]

int vg::ReadFilter< Alignment >::filter ( istream *  alignment_stream)
inline

◆ filter() [3/3]

int vg::ReadFilter< MultipathAlignment >::filter ( istream *  alignment_stream)
inline

◆ filter_alignment()

template<typename Read >
Counts vg::ReadFilter< Read >::filter_alignment ( Read &  aln)

Run all the filters on an alignment. The alignment may get modified in-place by the defray filter

◆ filter_internal()

template<typename Read >
void vg::ReadFilter< Read >::filter_internal ( istream *  in)
private

Helper function for filter.

Template implementations

◆ get_end_matches() [1/3]

int vg::ReadFilter< Alignment >::get_end_matches ( const Alignment &  read) const
inlineprivate

◆ get_end_matches() [2/3]

int vg::ReadFilter< MultipathAlignment >::get_end_matches ( const MultipathAlignment &  read) const
inlineprivate

◆ get_end_matches() [3/3]

template<typename Read >
int vg::ReadFilter< Read >::get_end_matches ( const Read &  read) const
private

What is the shortest run of end matches on either end?

◆ get_is_paired() [1/3]

bool vg::ReadFilter< Alignment >::get_is_paired ( const Alignment &  aln) const
inlineprivate

◆ get_is_paired() [2/3]

bool vg::ReadFilter< MultipathAlignment >::get_is_paired ( const MultipathAlignment &  mp_aln) const
inlineprivate

◆ get_is_paired() [3/3]

template<typename Read >
bool vg::ReadFilter< Read >::get_is_paired ( const Read &  read) const
private

Is the read paired?

◆ get_mapq()

template<typename Read >
int vg::ReadFilter< Read >::get_mapq ( const Read &  read) const
private

What is the read's mapping quality?

◆ get_min_base_qual_fraction()

template<typename Read >
double vg::ReadFilter< Read >::get_min_base_qual_fraction ( const Read &  read) const
private

What fraction of the base qualities are at least as large as the min base quality?

◆ get_overhang() [1/3]

int vg::ReadFilter< Alignment >::get_overhang ( const Alignment &  aln) const
inlineprivate

◆ get_overhang() [2/3]

int vg::ReadFilter< MultipathAlignment >::get_overhang ( const MultipathAlignment &  mp_aln) const
inlineprivate

◆ get_overhang() [3/3]

template<typename Read >
int vg::ReadFilter< Read >::get_overhang ( const Read &  read) const
private

How long are the read overhangs?

◆ get_score() [1/3]

double vg::ReadFilter< Alignment >::get_score ( const Alignment &  aln) const
inlineprivate

◆ get_score() [2/3]

double vg::ReadFilter< MultipathAlignment >::get_score ( const MultipathAlignment &  read) const
inlineprivate

◆ get_score() [3/3]

template<typename Read >
double vg::ReadFilter< Read >::get_score ( const Read &  read) const
private

Get the score indicated by the params

◆ has_excluded_refpos() [1/3]

bool vg::ReadFilter< Alignment >::has_excluded_refpos ( const Alignment &  aln) const
inlineprivate

◆ has_excluded_refpos() [2/3]

bool vg::ReadFilter< MultipathAlignment >::has_excluded_refpos ( const MultipathAlignment &  read) const
inlineprivate

◆ has_excluded_refpos() [3/3]

template<typename Read >
bool vg::ReadFilter< Read >::has_excluded_refpos ( const Read &  read) const
private

Does the read match one of the excluded refpos contigs?

◆ has_excuded_feature()

template<typename Read >
bool vg::ReadFilter< Read >::has_excuded_feature ( const Read &  read) const
private

Is the read annotated with any of the excluded features?

◆ has_repeat()

template<typename Read >
bool vg::ReadFilter< Read >::has_repeat ( const Read &  read,
int  k 
) const
private

Quick and dirty filter to see if removing reads that can slip around and still map perfectly helps vg call. Returns true if, at either end of the read sequence, at least k bases are repetitive, checking repeats of up to size 2k.

◆ is_correctly_mapped() [1/3]

bool vg::ReadFilter< Alignment >::is_correctly_mapped ( const Alignment &  alignment) const
inlineprivate

◆ is_correctly_mapped() [2/3]

bool vg::ReadFilter< MultipathAlignment >::is_correctly_mapped ( const MultipathAlignment &  alignment) const
inlineprivate

◆ is_correctly_mapped() [3/3]

template<typename Read >
bool vg::ReadFilter< Read >::is_correctly_mapped ( const Read &  read) const
private

Check if the alignment is marked as being correctly mapped

◆ is_mapped() [1/3]

bool vg::ReadFilter< Alignment >::is_mapped ( const Alignment &  alignment) const
private

◆ is_mapped() [2/3]

bool vg::ReadFilter< MultipathAlignment >::is_mapped ( const MultipathAlignment &  mp_alignment) const
private

◆ is_mapped() [3/3]

template<typename Read >
bool vg::ReadFilter< Read >::is_mapped ( const Read &  read) const
private

Check if the alignment includes any aligned bases

◆ is_proper_pair()

template<typename Read >
bool vg::ReadFilter< Read >::is_proper_pair ( const Read &  read) const
private

Is the read in a proper-mapped pair?

◆ is_secondary() [1/3]

bool vg::ReadFilter< Alignment >::is_secondary ( const Alignment &  aln) const
inlineprivate

◆ is_secondary() [2/3]

bool vg::ReadFilter< MultipathAlignment >::is_secondary ( const MultipathAlignment &  mp_aln) const
inlineprivate

◆ is_secondary() [3/3]

template<typename Read >
bool vg::ReadFilter< Read >::is_secondary ( const Read &  read) const
private

Is the read a secondary alignment?

◆ is_split() [1/3]

bool vg::ReadFilter< Alignment >::is_split ( const Alignment &  alignment) const
inlineprivate

◆ is_split() [2/3]

bool vg::ReadFilter< MultipathAlignment >::is_split ( const MultipathAlignment &  mp_aln) const
inlineprivate

◆ is_split() [3/3]

template<typename Read >
bool vg::ReadFilter< Read >::is_split ( const Read &  read) const
private

Return false if the read only follows edges in the graph, and true if the read is split (or just incorrect) and takes edges not in the index.

Throws an error if no graph is specified.

◆ matches_annotation()

template<typename Read >
bool vg::ReadFilter< Read >::matches_annotation ( const Read &  read) const
private

Does the read have the given annotation, and does it match?

◆ matches_name()

template<typename Read >
bool vg::ReadFilter< Read >::matches_name ( const Read &  read) const
private

Does the read name have one of the indicated prefixes? If exact_name is set, only finds complete matches of a "prefix" to the whole read name.

◆ sample_read()

template<typename Read >
bool vg::ReadFilter< Read >::sample_read ( const Read &  read) const
private

Based on the read name and paired-ness, compute the SAM-style QNAME and use that and the configured sampling probability and seed in the Samtools read sampling algorithm, to determine if the read should be kept. Returns true if the read should stay, and false if it should be removed. Always accepts or rejects paired reads together.

◆ to_alignment()

template<typename Read >
Alignment vg::ReadFilter< Read >::to_alignment ( const MultipathAlignment &  multipath_aln) const
private

Convert a multipath alignment to a single path

◆ trim_ambiguous_end()

template<typename Read >
bool vg::ReadFilter< Read >::trim_ambiguous_end ( Alignment &  alignment,
int  k 
) const
private

Trim only the end of the given alignment, leaving the start alone. Two calls of this implement trim_ambiguous_ends above.

◆ trim_ambiguous_ends() [1/3]

bool vg::ReadFilter< Alignment >::trim_ambiguous_ends ( Alignment &  alignment,
int  k 
) const
inline

◆ trim_ambiguous_ends() [2/3]

bool vg::ReadFilter< MultipathAlignment >::trim_ambiguous_ends ( MultipathAlignment &  aln,
int  k 
) const
inline

◆ trim_ambiguous_ends() [3/3]

template<typename Read >
bool vg::ReadFilter< Read >::trim_ambiguous_ends ( Read &  read,
int  k 
) const

Look at either end of the given alignment, up to k bases in from the end. See if that tail of the alignment is mapped such that another embedding in the given graph can produce the same sequence as the sequence along the embedding that the read actually has, and if so trim back the read.

In the case of softclips, the aligned portion of the read is considered, and if trimming is required, the softclips are hard-clipped off.

Returns true if the read had to be modified, and false otherwise.

MUST NOT be called with a null index.

Member Data Documentation

◆ aln_emitter

template<typename Read >
unique_ptr<AlignmentEmitter> vg::ReadFilter< Read >::aln_emitter
private

The two specializations have different writing infrastructure.

◆ annotation_to_match

template<typename Read >
string vg::ReadFilter< Read >::annotation_to_match = ""

A string formatted "annotation[.subfield]*:value". Value is optional if the key is a flag. Used like jq select.

◆ buffer_size

template<typename Read >
int vg::ReadFilter< Read >::buffer_size = 512

GAM output buffer size.

◆ complement_filter

template<typename Read >
bool vg::ReadFilter< Read >::complement_filter = false

Actually take the complement of the filter.

◆ defray_count

template<typename Read >
int vg::ReadFilter< Read >::defray_count = 99999

Limit defray recursion to visit this many nodes.

◆ defray_length

template<typename Read >
int vg::ReadFilter< Read >::defray_length = 0

How far in from the end should we look for ambiguous end alignment to clip off?

◆ downsample_probability

template<typename Read >
double vg::ReadFilter< Read >::downsample_probability = 1.0

We can also pseudorandomly drop reads. What's the probability that we keep a read?

◆ downsample_seed_mask

template<typename Read >
uint32_t vg::ReadFilter< Read >::downsample_seed_mask = 0

Samtools-compatible internal seed mask, for deciding which read pairs to keep. To be generated with rand() after srand() from the user-visible seed.

◆ drop_split

template<typename Read >
bool vg::ReadFilter< Read >::drop_split = false

Should we drop split reads that follow edges not in the graph?

◆ exact_name

template<typename Read >
bool vg::ReadFilter< Read >::exact_name = false

Read name must not have anything in it besides the prefix.

◆ excluded_features

template<typename Read >
unordered_set<string> vg::ReadFilter< Read >::excluded_features

If a read has one of the features in this set as annotations, the read is filtered out.

◆ excluded_refpos_contigs

template<typename Read >
vector<regex> vg::ReadFilter< Read >::excluded_refpos_contigs

Read must not have a refpos set with a contig name containing a match to any of these.

◆ filter_on_all

template<typename Read >
bool vg::ReadFilter< Read >::filter_on_all = false

When outputting paired reads, fail the pair only if both (all) reads fail (true) instead of if either (any) read fails (false)

◆ frac_score

template<typename Read >
bool vg::ReadFilter< Read >::frac_score = false

◆ graph

template<typename Read >
const HandleGraph* vg::ReadFilter< Read >::graph = nullptr

A HandleGraph is required for some filters (Note: ReadFilter doesn't own/free this)

◆ interleaved

template<typename Read >
bool vg::ReadFilter< Read >::interleaved = false

Interleaved input.

◆ max_overhang

template<typename Read >
int vg::ReadFilter< Read >::max_overhang = numeric_limits<int>::max() / 2

◆ min_base_quality

template<typename Read >
int vg::ReadFilter< Read >::min_base_quality = numeric_limits<int>::min() / 2

◆ min_base_quality_fraction

template<typename Read >
double vg::ReadFilter< Read >::min_base_quality_fraction = numeric_limits<double>::lowest()

◆ min_end_matches

template<typename Read >
int vg::ReadFilter< Read >::min_end_matches = numeric_limits<int>::min() / 2

◆ min_mapq

template<typename Read >
double vg::ReadFilter< Read >::min_mapq = numeric_limits<double>::lowest()

◆ min_primary

template<typename Read >
double vg::ReadFilter< Read >::min_primary = numeric_limits<double>::lowest()

◆ min_secondary

template<typename Read >
double vg::ReadFilter< Read >::min_secondary = numeric_limits<double>::lowest()

◆ mp_aln_emitter

template<typename Read >
unique_ptr<MultipathAlignmentEmitter> vg::ReadFilter< Read >::mp_aln_emitter
private

◆ name_prefixes

template<typename Read >
vector<string> vg::ReadFilter< Read >::name_prefixes

Read name must have one of these prefixes, if any are present. TODO: This should be a trie but I don't have one handy. Must be sorted for vaguely efficient search.

◆ only_correctly_mapped

template<typename Read >
bool vg::ReadFilter< Read >::only_correctly_mapped = false

Filter to only correctly mapped reads.

◆ only_mapped

template<typename Read >
bool vg::ReadFilter< Read >::only_mapped = true

Filter to only mapped reads.

◆ only_proper_pairs

template<typename Read >
bool vg::ReadFilter< Read >::only_proper_pairs = true

Filter to proper pairs.

◆ output_fields

template<typename Read >
vector<string> vg::ReadFilter< Read >::output_fields

What fields do we want to write to the tsv?

◆ repeat_size

template<typename Read >
int vg::ReadFilter< Read >::repeat_size = 0

◆ rescore

template<typename Read >
bool vg::ReadFilter< Read >::rescore = false

Should we rescore each alignment with default parameters and no e.g. haplotype info?

◆ sub_score

template<typename Read >
bool vg::ReadFilter< Read >::sub_score = false

◆ subsequences

template<typename Read >
vector<string> vg::ReadFilter< Read >::subsequences

Read must contain at least one of these strings as a subsequence.

◆ threads

template<typename Read >
int vg::ReadFilter< Read >::threads = -1

Number of threads from omp.

◆ verbose

template<typename Read >
bool vg::ReadFilter< Read >::verbose = false

◆ write_output

template<typename Read >
bool vg::ReadFilter< Read >::write_output = true

Sometimes we only want a report, and not a filtered gam. Toggling off output speeds things up considerably.

◆ write_tsv

template<typename Read >
bool vg::ReadFilter< Read >::write_tsv = false

Sometimes we want to pick out fields and write a tsv instead of a gam.


The documentation for this class was generated from the following file: