Skip to content

Commit

Permalink
variant representation changes and adjusting is_good_alt criteria
Browse files Browse the repository at this point in the history
  • Loading branch information
hannespetur committed Nov 2, 2023
1 parent a8cfb38 commit a27904d
Show file tree
Hide file tree
Showing 8 changed files with 62 additions and 25 deletions.
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
ignore = dirty
[submodule "htslib"]
path = htslib
url = https://github.com/samtools/htslib.git
url = [email protected]:hannese/htslib.git
ignore = dirty
[submodule "catch"]
path = catch
Expand Down
3 changes: 1 addition & 2 deletions include/graphtyper/typer/variant.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,10 @@ struct VariantHash
};

std::vector<Variant> break_down_variant(Variant && variant,
long const reach,
bool const is_no_variant_overlapping,
bool const is_all_biallelic);

std::vector<Variant> break_down_skyr(Variant && var, long const reach);
std::vector<Variant> break_down_skyr(Variant && var);
std::vector<Variant> extract_sequences_from_aligned_variant(Variant const && variant, std::size_t const THRESHOLD);
std::vector<Variant> simplify_complex_haplotype(Variant && variant, std::size_t const THRESHOLD);
std::vector<Variant> break_multi_snps(Variant && var);
Expand Down
2 changes: 1 addition & 1 deletion paw
63 changes: 51 additions & 12 deletions src/typer/variant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1063,10 +1063,9 @@ std::vector<int8_t> Variant::generate_infos()
double const qd = qd_alt[a];

is_good_alt[a] =
static_cast<int>(aa_score[a] >= 0.05 && qd > 1.0 && per_al.maximum_alt_support >= 3 &&
per_al.maximum_alt_support_ratio >= 0.175 &&
(seqs.size() < 71 || (qd > 1.25 && per_al.maximum_alt_support_ratio >= 0.2)) &&
(seqs.size() < 131 || (qd > 1.5 && per_al.maximum_alt_support_ratio >= 0.225)));
static_cast<int>(qd >= 1.0 && per_al.maximum_alt_support >= 2 &&
(seqs.size() < 71 || (qd >= 1.5 && per_al.maximum_alt_support_ratio >= 0.2)) &&
(seqs.size() < 131 || (qd >= 2.0 && per_al.maximum_alt_support_ratio >= 0.225)));
}

#ifndef NDEBUG
Expand Down Expand Up @@ -1651,7 +1650,6 @@ std::vector<Variant> make_biallelic(Variant && var)

// Variants functions
std::vector<Variant> break_down_variant(Variant && var,
long const reach,
bool const is_no_variant_overlapping,
bool const is_all_biallelic)
{
Expand Down Expand Up @@ -1686,7 +1684,7 @@ std::vector<Variant> break_down_variant(Variant && var,
{
// Use the skyr
print_debug("Using the skyr");
std::vector<Variant> new_broken_down_vars = break_down_skyr(std::move(var), reach);
std::vector<Variant> new_broken_down_vars = break_down_skyr(std::move(var));
print_debug("skyr finished.");

std::move(new_broken_down_vars.begin(), new_broken_down_vars.end(), std::back_inserter(broken_down_vars));
Expand Down Expand Up @@ -2001,13 +1999,33 @@ std::vector<Variant> break_multi_snps(Variant && var)
std::vector<std::vector<char>> const & seqs = var.seqs;
std::vector<Variant> new_vars;

// find which sequence is called
std::vector<int> ac(seqs.size(), 0);

for (SampleCall const & call : var.calls)
{
auto const & gt_call = call.get_gt_call();

assert(gt_call.first < ac.size());
assert(gt_call.second < ac.size());

ac[gt_call.first]++;
ac[gt_call.second]++;
}

for (long j = 0; j < static_cast<long>(seqs[0].size()); ++j)
{
std::vector<char> new_seqs(1, seqs[0][j]);
std::vector<uint16_t> old_phred_to_new_phred(1, 0);

for (long k = 1; k < static_cast<long>(seqs.size()); ++k)
{
if (ac[k] == 0)
{
old_phred_to_new_phred.push_back(0);
continue;
}

auto find_it = std::find(new_seqs.begin(), new_seqs.end(), seqs[k][j]);

if (find_it == new_seqs.end())
Expand Down Expand Up @@ -2092,29 +2110,50 @@ std::vector<Variant> break_multi_snps(Variant && var)
return new_vars;
}

std::vector<Variant> break_down_skyr(Variant && var, long const reach)
std::vector<Variant> break_down_skyr(Variant && var)
{
std::vector<Variant> new_vars;

long constexpr OPTIMAL_EXTRA = 50l;
bool const use_asterisks = !Options::instance()->no_asterisks;

long const extra_bases_before =
use_asterisks ? std::min(OPTIMAL_EXTRA, static_cast<long>(var.abs_pos) - reach - 1l) : OPTIMAL_EXTRA;
long const extra_bases_before = OPTIMAL_EXTRA;
// use_asterisks ? std::min(OPTIMAL_EXTRA, static_cast<long>(var.abs_pos) - reach - 1l) : OPTIMAL_EXTRA;

long constexpr extra_bases_after{OPTIMAL_EXTRA};
// long constexpr extra_bases_after{OPTIMAL_EXTRA};

for (long i = 0; i < extra_bases_before && var.add_base_in_front(false); ++i)
{
}

for (long i = 0; i < extra_bases_after && var.add_base_in_back(false); ++i)
// for (long i = 0; i < extra_bases_after && var.add_base_in_back(false); ++i)
// {
// }

auto const & seqs = var.seqs;

// find which sequence is called
std::vector<int> ac(var.seqs.size(), 0);

for (SampleCall const & call : var.calls)
{
auto const & gt_call = call.get_gt_call();

assert(gt_call.first < ac.size());
assert(gt_call.second < ac.size());

ac[gt_call.first]++;
ac[gt_call.second]++;
}

auto const & seqs = var.seqs;
paw::Skyr skyr(var.seqs);

for (int i{1}; i < static_cast<int>(var.seqs.size()); ++i)
{
if (ac[i] == 0)
skyr.seqs[i] = skyr.seqs[0];
}

bool constexpr is_normalize = true; // The events must the normalized
skyr.find_all_edits(is_normalize);
skyr.find_variants_from_edits();
Expand Down
3 changes: 1 addition & 2 deletions src/typer/variant_map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -394,14 +394,13 @@ void VariantMap::filter_varmap_for_all()
var_cp.seqs = var.seqs;

// Force that no variant is overlapping
long const reach{-1};
bool const is_no_variant_overlapping{true}; // Force it to be true so we wont use skyr
bool const is_all_biallelic{false};
Variant new_var(var_cp);
new_var.infos["SBF1"] = ".";

std::vector<Variant> new_broken_down_vars =
break_down_variant(std::move(new_var), reach, is_no_variant_overlapping, is_all_biallelic);
break_down_variant(std::move(new_var), is_no_variant_overlapping, is_all_biallelic);

assert(new_broken_down_vars.size() != 0);

Expand Down
2 changes: 1 addition & 1 deletion src/typer/vcf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ void Vcf::write_header(bool const is_dropping_genotypes)
<< "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele frequency.\">\n"
<< "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Number of alleles in called genotypes.\">\n"
<< "##INFO=<ID=CR,Number=1,Type=Integer,Description=\"Number of clipped reads in the graph alignment.\">\n"
<< "##INFO=<ID=CRal,Number=.,Type=String,Description=\"Number of clipped reads per allele.\">\n"
<< "##INFO=<ID=CRal,Number=.,Type=String,Description=\"Number of clipped bp per allele.\">\n"
<< "##INFO=<ID=CRalt,Number=A,Type=Float,Description=\"Percent of clipped reads per allele.\">\n"
<< "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of an SV.\">\n"
<< "##INFO=<ID=FEATURE,Number=1,Type=String,Description=\"Gene feature.\">\n"
Expand Down
5 changes: 3 additions & 2 deletions src/typer/vcf_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -626,7 +626,7 @@ void vcf_merge_and_break(std::vector<std::string> const & vcfs,
}
else
{
new_variants = break_down_variant(std::move(var), reach, is_no_variant_overlapping, is_all_biallelic);
new_variants = break_down_variant(std::move(var), is_no_variant_overlapping, is_all_biallelic);
}

assert(new_variants.size() > 0);
Expand Down Expand Up @@ -963,8 +963,9 @@ void vcf_break_down(std::string const & vcf, std::string const & output, std::st
bool const is_no_variant_overlapping{Options::const_instance()->no_variant_overlapping};
bool const is_all_biallelic{Options::const_instance()->is_all_biallelic};
assert(vcf_in.variants[0].infos.count("SBF1") == 1);

std::vector<Variant> new_variants =
break_down_variant(std::move(vcf_in.variants[0]), reach, is_no_variant_overlapping, is_all_biallelic);
break_down_variant(std::move(vcf_in.variants[0]), is_no_variant_overlapping, is_all_biallelic);

update_reach(new_variants);
std::move(new_variants.begin(), new_variants.end(), std::back_inserter(vcf_out.variants));
Expand Down
7 changes: 3 additions & 4 deletions src/utilities/bamshrink.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -682,11 +682,10 @@ void qualityFilterSlice2(Options const & opts,
std::exit(1);
}

int const begin = std::max(chr_start_end.i2 - (opts.maxFragLen - 100), 0);

// bamshrink regions are expanded by 100 which should be removed here
if (!setRegion(bamFileIn,
toCString(chr_start_end.i1),
chr_start_end.i2 - (opts.maxFragLen - 100),
chr_start_end.i3 + (opts.maxFragLen - 100)))
if (!setRegion(bamFileIn, toCString(chr_start_end.i1), begin, chr_start_end.i3 + (opts.maxFragLen - 100)))
{
print_log(gyper::log_severity::error,
__HERE__,
Expand Down

0 comments on commit a27904d

Please sign in to comment.