Skip to content

Commit

Permalink
add normalizer to vg paths
Browse files Browse the repository at this point in the history
  • Loading branch information
glennhickey committed Sep 11, 2024
1 parent 3bfb4b9 commit b57adcf
Show file tree
Hide file tree
Showing 3 changed files with 168 additions and 9 deletions.
34 changes: 26 additions & 8 deletions src/subcommand/paths_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "../vg.hpp"
#include "../xg.hpp"
#include "../gbwt_helper.hpp"
#include "../traversal_clusters.hpp"
#include <vg/io/vpkg.hpp>
#include <vg/io/stream.hpp>
#include <vg/io/alignment_emitter.hpp>
Expand All @@ -37,6 +38,7 @@ void help_paths(char** argv) {
<< " -V, --extract-vg output a path-only graph covering the selected paths" << endl
<< " -d, --drop-paths output a graph with the selected paths removed" << endl
<< " -r, --retain-paths output a graph with only the selected paths retained" << endl
<< " -n, --normalize-paths output a graph where all equivalent paths in a site a merged (using selected paths to snap to if possible)" << endl
<< " output path data:" << endl
<< " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl
<< " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl
Expand Down Expand Up @@ -117,6 +119,7 @@ int main_paths(int argc, char** argv) {
size_t input_formats = 0;
bool coverage = false;
const size_t coverage_bins = 10;
bool normalize_paths = false;

int c;
optind = 2; // force optind past command positional argument
Expand All @@ -130,6 +133,7 @@ int main_paths(int argc, char** argv) {
{"extract-vg", no_argument, 0, 'V'},
{"drop-paths", no_argument, 0, 'd'},
{"retain-paths", no_argument, 0, 'r'},
{"normalize-paths", no_argument, 0, 'n'},
{"extract-gam", no_argument, 0, 'X'},
{"extract-gaf", no_argument, 0, 'A'},
{"list", no_argument, 0, 'L'},
Expand All @@ -154,7 +158,7 @@ int main_paths(int argc, char** argv) {
};

int option_index = 0;
c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:draGRHp:c",
c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drnaGRHp:c",
long_options, &option_index);

// Detect the end of the options.
Expand Down Expand Up @@ -189,6 +193,11 @@ int main_paths(int argc, char** argv) {
retain_paths = true;
output_formats++;
break;

case 'n':
normalize_paths = true;
output_formats++;
break;

case 'X':
extract_as_gam = true;
Expand Down Expand Up @@ -309,7 +318,7 @@ int main_paths(int argc, char** argv) {
if (!gbwt_file.empty()) {
bool need_graph = (extract_as_gam || extract_as_gaf || extract_as_vg || drop_paths || retain_paths || extract_as_fasta || list_lengths);
if (need_graph && graph_file.empty()) {
std::cerr << "error: [vg paths] a graph is needed for extracting threads in -X, -A, -V, -d, -r, -E or -F format" << std::endl;
std::cerr << "error: [vg paths] a graph is needed for extracting threads in -X, -A, -V, -d, -r, -n, -E or -F format" << std::endl;
std::exit(EXIT_FAILURE);
}
if (!need_graph && !graph_file.empty()) {
Expand All @@ -320,7 +329,7 @@ int main_paths(int argc, char** argv) {
}
}
if (output_formats != 1) {
std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -L, -F, -E, -C or -c) must be specified" << std::endl;
std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -n, -L, -F, -E, -C or -c) must be specified" << std::endl;
std::exit(EXIT_FAILURE);
}
if (selection_criteria > 1) {
Expand All @@ -335,8 +344,8 @@ int main_paths(int argc, char** argv) {
std::cerr << "error: [vg paths] listing path metadata is not compatible with a GBWT index" << std::endl;
std::exit(EXIT_FAILURE);
}
if ((drop_paths || retain_paths) && !gbwt_file.empty()) {
std::cerr << "error: [vg paths] dropping or retaining paths only works on embedded graph paths, not GBWT threads" << std::endl;
if ((drop_paths || retain_paths || normalize_paths) && !gbwt_file.empty()) {
std::cerr << "error: [vg paths] dropping, retaining or normalizing paths only works on embedded graph paths, not GBWT threads" << std::endl;
std::exit(EXIT_FAILURE);
}
if (coverage && !gbwt_file.empty()) {
Expand Down Expand Up @@ -566,7 +575,7 @@ int main_paths(int argc, char** argv) {

};

if (drop_paths || retain_paths) {
if (drop_paths || retain_paths || normalize_paths) {
MutablePathMutableHandleGraph* mutable_graph = dynamic_cast<MutablePathMutableHandleGraph*>(graph.get());
if (!mutable_graph) {
std::cerr << "error[vg paths]: graph cannot be modified" << std::endl;
Expand All @@ -583,12 +592,21 @@ int main_paths(int argc, char** argv) {
for_each_selected_path([&](const path_handle_t& path_handle) {
to_destroy.push_back(path_handle);
});
} else {
} else if (retain_paths) {
for_each_unselected_path([&](const path_handle_t& path_handle) {
to_destroy.push_back(path_handle);
});
} else {
assert(normalize_paths);
unordered_set<path_handle_t> selected_paths;
for_each_selected_path([&](const path_handle_t& path_handle) {
selected_paths.insert(path_handle);
});
merge_equivalent_traversals_in_graph(mutable_graph, selected_paths);
}
if (!to_destroy.empty()) {
mutable_graph->destroy_paths(to_destroy);
}
mutable_graph->destroy_paths(to_destroy);

// output the graph
serializable_graph->serialize(cout);
Expand Down
132 changes: 131 additions & 1 deletion src/traversal_clusters.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
#include "traversal_clusters.hpp"
#include "traversal_finder.hpp"
#include "integrated_snarl_finder.hpp"
#include "snarl_distance_index.hpp"

//#define debug

namespace vg {

using namespace bdsg;

// specialized version of jaccard_coefficient() that weights jaccard by node lenths.
double weighted_jaccard_coefficient(const PathHandleGraph* graph,
Expand Down Expand Up @@ -279,9 +283,135 @@ vector<vector<int>> assign_child_snarls_to_traversals(const PathHandleGraph* gra
return trav_to_child_snarls;
}

static void merge_equivalent_traversals_in_snarl(MutablePathHandleGraph* graph, const unordered_set<path_handle_t>& selected_paths,
PathTraversalFinder& path_trav_finder,
const handle_t& start_handle, const handle_t& end_handle ) {
// find the path traversals through the snarl
vector<Traversal> path_travs;
vector<PathInterval> path_intervals;
std::tie(path_travs, path_intervals) = path_trav_finder.find_path_traversals(start_handle, end_handle);
vector<string> trav_names(path_travs.size());
vector<path_handle_t> trav_paths(path_travs.size());
vector<bool> trav_reversed(path_travs.size());

// organize paths by their alleles strings
// todo: we could use a fancier index to save memory here (prob not necessary tho)
unordered_map<string, vector<int64_t>> string_to_travs;
for (int64_t i = 0; i < path_travs.size(); ++i) {
const Traversal& trav = path_travs[i];
string allele;
for (int64_t j = 1; j < trav.size() - 1; ++j) {
allele += toUppercase(graph->get_sequence(trav[j]));
}
string_to_travs[allele].push_back(i);
trav_paths[i] = graph->get_path_handle_of_step(path_intervals[i].first);
trav_names[i] = graph->get_path_name(trav_paths[i]);
trav_reversed[i] = graph->get_is_reverse(graph->get_handle_of_step(path_intervals[i].first)) !=
graph->get_is_reverse(start_handle);
#ifdef debug
cerr << "trav " << i << ": ";
cerr << "n=" << trav_names[i] << " "
<< "i=" << (graph->get_is_reverse(graph->get_handle_of_step(path_intervals[i].first)) ? "<" : ">")
<< graph->get_id(graph->get_handle_of_step(path_intervals[i].first))
<< (graph->get_is_reverse(graph->get_handle_of_step(path_intervals[i].second)) ? "<" : ">")
<< graph->get_id(graph->get_handle_of_step(path_intervals[i].second)) << " t=";
for (auto xx : trav) {
cerr << (graph->get_is_reverse(xx) ? "<" : ">") << graph->get_id(xx);
}
cerr << " a=" << allele << " r=" << trav_reversed[i] << endl;
#endif
}

// rank the traversals, putting selected ones first otherwise alphabetical
function<bool(int64_t, int64_t)> trav_idx_less = [&](int64_t i, int64_t j) {
bool iref = selected_paths.count(trav_paths[i]);
bool jref = selected_paths.count(trav_paths[j]);
if (iref != jref) {
return iref;
}
return trav_names[i] < trav_names[j];
};

// merge up the paths
for (const auto& allele_travs : string_to_travs) {
if (allele_travs.first.length() > 0 && allele_travs.second.size() > 1) {
const vector<int64_t>& eq_travs = allele_travs.second;
auto canonical_it = std::min_element(eq_travs.begin(), eq_travs.end(), trav_idx_less);
assert(canonical_it != eq_travs.end());
int64_t canonical_idx = *canonical_it;
Traversal canonical_trav = path_travs[canonical_idx];
if (trav_reversed[canonical_idx]) {
Traversal flip_trav;
for (auto i = canonical_trav.rbegin(); i != canonical_trav.rend(); ++i) {
flip_trav.push_back(graph->flip(*i));
}
std::swap(canonical_trav, flip_trav);
}

// edit it into every other path
//
// WARNING: in the case of loops, we could be editing the same path more than once
// so making the big undocumented assumption that step handles outside edit
// are unaffected!!
for (int64_t i = 0; i < eq_travs.size(); ++i) {
int64_t replace_idx = eq_travs[i];
if (replace_idx != canonical_idx && path_travs[replace_idx] != path_travs[canonical_idx]) {
PathInterval interval_to_replace = path_intervals[replace_idx];
if (trav_reversed[replace_idx]) {
std::swap(interval_to_replace.first, interval_to_replace.second);
}
#ifdef debug
cerr << "editing interval of size " << path_travs[replace_idx].size()
<< " from path " << trav_names[replace_idx] << " with canonical interval of size "
<< path_travs[canonical_idx].size() << " from path "
<< trav_names[canonical_idx] << endl;


cerr << "interval to replace first " << graph->get_id(graph->get_handle_of_step(interval_to_replace.first))
<< " second " << graph->get_id(graph->get_handle_of_step(interval_to_replace.second)) << endl;

#endif
graph->rewrite_segment(interval_to_replace.first, graph->get_next_step(interval_to_replace.second),
canonical_trav);
}
}
}
}
}

void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set<path_handle_t>& selected_paths) {
// compute the snarls
SnarlDistanceIndex distance_index;
{
IntegratedSnarlFinder snarl_finder(*graph);
// todo: why can't I pass in 0 below -- I don't want any dinstances!
fill_in_distance_index(&distance_index, graph, &snarl_finder, 1);
}

// only consider embedded paths that span snarl
PathTraversalFinder path_trav_finder(*graph);

// do every snarl top-down. this is because it's possible (tho probably rare) for a child snarl to
// be redundant after normalizing its parent. don't think the opposite (normalizing child)
// causes redundant parent.. todo: can we guarantee?!
net_handle_t root = distance_index.get_root();
deque<net_handle_t> queue = {root};

while (!queue.empty()) {
net_handle_t net_handle = queue.front();
queue.pop_front();
if (distance_index.is_snarl(net_handle)) {
net_handle_t start_bound = distance_index.get_bound(net_handle, false, true);
net_handle_t end_bound = distance_index.get_bound(net_handle, true, false);
handle_t start_handle = distance_index.get_handle(start_bound, graph);
handle_t end_handle = distance_index.get_handle(end_bound, graph);
merge_equivalent_traversals_in_snarl(graph, selected_paths, path_trav_finder, start_handle, end_handle);
}
if (net_handle == root || distance_index.is_snarl(net_handle) || distance_index.is_chain(net_handle)) {
distance_index.for_each_child(net_handle, [&](net_handle_t child_handle) {
queue.push_back(child_handle);
});
}
}
}

}
11 changes: 11 additions & 0 deletions src/traversal_clusters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,15 @@ vector<vector<int>> assign_child_snarls_to_traversals(const PathHandleGraph* gra
const vector<Traversal>& traversals,
const vector<pair<handle_t, handle_t>>& child_snarls);

/// For every top-level snarl in the graph, compute the traversal strings of every embedded path that spans it
/// If two or more traversals share an allele string, then a "canoncial" path is chosen and all remaining paths
/// are edited so that they share the exact same interval through the snarl as the canonical path's traversal.
/// A path is considered "canoncial" if it's in the "selected_paths" and the other paths are not
/// (otherwise the lowest name is used as a fallback)
///
/// Note: this doesn't modify the graph toplogy, so uncovered nodes and edges as a result of path editing
/// would usually need removale with vg clip afterwards
///
void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set<path_handle_t>& selected_paths);

}

1 comment on commit b57adcf

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch path-normalize. View the full report here.

12 tests passed, 0 tests failed and 0 tests skipped in 9442 seconds

Please sign in to comment.