From 9a837c6ca781560dcb03fdd78ccabb77eed15e2d Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 20 Jun 2024 17:29:05 -0400 Subject: [PATCH 01/12] Add --max-tail-len option to vg surject --- src/multipath_alignment_graph.cpp | 45 ++++++++++++++++++---- src/multipath_alignment_graph.hpp | 19 +++++---- src/multipath_mapper.cpp | 10 ++--- src/subcommand/surject_main.cpp | 14 +++++-- src/surjector.cpp | 1 + src/surjector.hpp | 3 ++ src/unittest/multipath_alignment_graph.cpp | 12 +++--- 7 files changed, 75 insertions(+), 29 deletions(-) diff --git a/src/multipath_alignment_graph.cpp b/src/multipath_alignment_graph.cpp index 6af414160c6..2e3791e2511 100644 --- a/src/multipath_alignment_graph.cpp +++ b/src/multipath_alignment_graph.cpp @@ -2289,7 +2289,7 @@ namespace vg { // Align the tails, not collecting a set of source subpaths. // TODO: factor of 1/2 is arbitray, but i do think it should be fewer than the max auto tail_alignments = align_tails(alignment, align_graph, aligner, max(1, max_alt_alns / 2), - dynamic_alt_alns, max_gap, pessimistic_tail_gap_multiplier, max_alt_alns, nullptr); + dynamic_alt_alns, max_gap, pessimistic_tail_gap_multiplier, max_alt_alns, std::numeric_limits::max(), nullptr); for (bool handling_right_tail : {false, true}) { @@ -4225,8 +4225,9 @@ namespace vg { void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, bool score_anchors_as_matches, size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, - double pessimistic_tail_gap_multiplier, bool simplify_topologies, size_t unmergeable_len, - size_t band_padding, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls, + double pessimistic_tail_gap_multiplier, size_t max_tail_length, bool simplify_topologies, + size_t unmergeable_len, size_t band_padding, + multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls, SnarlDistanceIndex* dist_index, const function(id_t)>* project, bool allow_negative_scores, unordered_map* left_align_strand) { @@ -4242,6 +4243,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap dynamic_alt_alns, max_gap, pessimistic_tail_gap_multiplier, + max_tail_length, simplify_topologies, unmergeable_len, constant_padding, @@ -5182,7 +5184,8 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, bool score_anchors_as_matches, size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, - double pessimistic_tail_gap_multiplier, bool simplify_topologies, size_t unmergeable_len, + double pessimistic_tail_gap_multiplier, size_t max_tail_length, + bool simplify_topologies, size_t unmergeable_len, function band_padding_function, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls, SnarlDistanceIndex* dist_index, const function(id_t)>* project, @@ -5448,7 +5451,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap // Actually align the tails auto tail_alignments = align_tails(alignment, align_graph, aligner, max_alt_alns, dynamic_alt_alns, - max_gap, pessimistic_tail_gap_multiplier, 0, &sources); + max_gap, pessimistic_tail_gap_multiplier, 0, max_tail_length, &sources); // TODO: merge and simplify the tail alignments? rescoring would be kind of a pain... @@ -5992,7 +5995,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap unordered_map>> MultipathAlignmentGraph::align_tails(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, - size_t min_paths, unordered_set* sources) { + size_t min_paths, size_t max_tail_length, unordered_set* sources) { #ifdef debug_multipath_alignment cerr << "doing tail alignments to:" << endl; @@ -6108,7 +6111,20 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap // align against the graph auto& alt_alignments = right_alignments[j]; - if (num_alt_alns == 1) { + if (right_tail_sequence.sequence().size() > max_tail_length) { +#ifdef debug_multipath_alignment + cerr << "softclip long right" << endl; +#endif + alt_alignments.emplace_back(std::move(right_tail_sequence)); + Mapping* m = alt_alignments.back().mutable_path()->add_mapping(); + m->mutable_position()->set_node_id(id(end_pos)); + m->mutable_position()->set_is_reverse(is_rev(end_pos)); + m->mutable_position()->set_offset(offset(end_pos)); + Edit* e = m->add_edit(); + e->set_to_length(alt_alignments.back().sequence().size()); + e->set_sequence(alt_alignments.back().sequence()); + } + else if (num_alt_alns == 1) { #ifdef debug_multipath_alignment cerr << "align right with dozeu with gap " << gap << endl; #endif @@ -6229,7 +6245,20 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap // align against the graph auto& alt_alignments = left_alignments[j]; - if (num_alt_alns == 1) { + if (left_tail_sequence.sequence().size() > max_tail_length) { +#ifdef debug_multipath_alignment + cerr << "softclip long left" << endl; +#endif + alt_alignments.emplace_back(std::move(left_tail_sequence)); + Mapping* m = alt_alignments.back().mutable_path()->add_mapping(); + m->mutable_position()->set_node_id(id(begin_pos)); + m->mutable_position()->set_is_reverse(is_rev(begin_pos)); + m->mutable_position()->set_offset(offset(begin_pos)); + Edit* e = m->add_edit(); + e->set_to_length(alt_alignments.back().sequence().size()); + e->set_sequence(alt_alignments.back().sequence()); + } + else if (num_alt_alns == 1) { #ifdef debug_multipath_alignment cerr << "align left with dozeu using gap " << gap << endl; #endif diff --git a/src/multipath_alignment_graph.hpp b/src/multipath_alignment_graph.hpp index 752c9f56cd0..e687fd43037 100644 --- a/src/multipath_alignment_graph.hpp +++ b/src/multipath_alignment_graph.hpp @@ -194,10 +194,11 @@ namespace vg { /// order, even if this MultipathAlignmentGraph is. You MUST sort it /// with topologically_order_subpaths() before trying to run DP on it. void align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, bool score_anchors_as_matches, - size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, bool simplify_topologies, - size_t unmergeable_len, size_t band_padding, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr, - SnarlDistanceIndex* dist_index = nullptr, const function(id_t)>* project = nullptr, - bool allow_negative_scores = false, unordered_map* left_align_strand = nullptr); + size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, size_t max_tail_length, + bool simplify_topologies, size_t unmergeable_len, size_t band_padding, multipath_alignment_t& multipath_aln_out, + SnarlManager* cutting_snarls = nullptr, SnarlDistanceIndex* dist_index = nullptr, + const function(id_t)>* project = nullptr, bool allow_negative_scores = false, + unordered_map* left_align_strand = nullptr); /// Do intervening and tail alignments between the anchoring paths and /// store the result in a multipath_alignment_t. Reachability edges must @@ -212,8 +213,8 @@ namespace vg { /// order, even if this MultipathAlignmentGraph is. You MUST sort it /// with topologically_order_subpaths() before trying to run DP on it. void align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, bool score_anchors_as_matches, - size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, bool simplify_topologies, - size_t unmergeable_len, function band_padding_function, + size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, size_t max_tail_length, + bool simplify_topologies, size_t unmergeable_len, function band_padding_function, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr, SnarlDistanceIndex* dist_index = nullptr, const function(id_t)>* project = nullptr, bool allow_negative_scores = false, unordered_map* left_align_strand = nullptr); @@ -312,12 +313,16 @@ namespace vg { /// Alignments of the tail off of that subpath. Also computes the /// source subpaths and adds their numbers to the given set if not /// null. + /// + /// If a tail is longer than max_tail_length, produces an alignment + /// softclipping it. + /// /// If dynamic alignment count is also selected, can indicate a minimum number /// of paths that must be in the extending graph in order to do an alignment unordered_map>> align_tails(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, - size_t min_paths, unordered_set* sources = nullptr); + size_t min_paths, size_t max_tail_length, unordered_set* sources = nullptr); /// Removes alignments that follow the same path through the graph, retaining only the /// highest scoring ones. If deduplicating leftward, then also removes paths that take a diff --git a/src/multipath_mapper.cpp b/src/multipath_mapper.cpp index 55f765f3fbc..ccf1a9e8246 100644 --- a/src/multipath_mapper.cpp +++ b/src/multipath_mapper.cpp @@ -6256,9 +6256,9 @@ namespace vg { // do the connecting alignments and fill out the multipath_alignment_t object multi_aln_graph.align(alignment, *align_dag, aligner, true, num_alt_alns, dynamic_max_alt_alns, max_alignment_gap, - use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, simplify_topologies, - max_tail_merge_supress_length, choose_band_padding, multipath_aln_out, snarl_manager, - distance_index, &translator); + use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, std::numeric_limits::max(), + simplify_topologies, max_tail_merge_supress_length, choose_band_padding, multipath_aln_out, + snarl_manager, distance_index, &translator); // Note that we do NOT topologically order the multipath_alignment_t. The // caller has to do that, after it is finished breaking it up into @@ -6313,8 +6313,8 @@ namespace vg { // do the connecting alignments and fill out the multipath_alignment_t object multi_aln_graph.align(alignment, subgraph, aligner, false, num_alt_alns, dynamic_max_alt_alns, max_alignment_gap, - use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, simplify_topologies, - max_tail_merge_supress_length, choose_band_padding, multipath_aln_out); + use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, std::numeric_limits::max(), + simplify_topologies, max_tail_merge_supress_length, choose_band_padding, multipath_aln_out); for (size_t j = 0; j < multipath_aln_out.subpath_size(); j++) { translate_oriented_node_ids(*multipath_aln_out.mutable_subpath(j)->mutable_path(), translator); diff --git a/src/subcommand/surject_main.cpp b/src/subcommand/surject_main.cpp index 8121ababf47..f1a3b48a4d3 100644 --- a/src/subcommand/surject_main.cpp +++ b/src/subcommand/surject_main.cpp @@ -47,6 +47,7 @@ void help_surject(char** argv) { << " -b, --bam-output write BAM to stdout" << endl << " -s, --sam-output write SAM to stdout" << endl << " -l, --subpath-local let the multipath mapping surjection produce local (rather than global) alignments" << endl + << " -T, --max-tail-len N do not align read tails longer than N" << endl << " -P, --prune-low-cplx prune short and low complexity anchors during realignment" << endl << " -a, --max-anchors N use no more than N anchors per target path (default: 200)" << endl << " -S, --spliced interpret long deletions against paths as spliced alignments" << endl @@ -96,6 +97,7 @@ int main_surject(int argc, char** argv) { int min_splice_length = 20; size_t watchdog_timeout = 10; bool subpath_global = true; // force full length alignments in mpmap resolution + size_t max_tail_len = std::numeric_limits::max(); bool qual_adj = false; bool prune_anchors = false; size_t max_anchors = 200; @@ -114,7 +116,8 @@ int main_surject(int argc, char** argv) { {"into-path", required_argument, 0, 'p'}, {"into-paths", required_argument, 0, 'F'}, {"ref-paths", required_argument, 0, 'F'}, // Now an alias for --into-paths - {"subpath-local", required_argument, 0, 'l'}, + {"subpath-local", no_argument, 0, 'l'}, + {"max-tail-len", no_argument, 0, 'T'}, {"interleaved", no_argument, 0, 'i'}, {"multimap", no_argument, 0, 'M'}, {"gaf-input", no_argument, 0, 'G'}, @@ -137,7 +140,7 @@ int main_surject(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hx:p:F:liGmcbsN:R:f:C:t:SPa:ALMVw:", + c = getopt_long (argc, argv, "hx:p:F:lT:iGmcbsN:R:f:C:t:SPa:ALMVw:", long_options, &option_index); // Detect the end of the options. @@ -163,6 +166,10 @@ int main_surject(int argc, char** argv) { subpath_global = false; break; + case 'T': + max_tail_len = parse(optarg); + break; + case 'i': interleaved = true; break; @@ -303,8 +310,9 @@ int main_surject(int argc, char** argv) { else { surjector.min_splice_length = numeric_limits::max(); } + surjector.max_tail_length = max_tail_len; surjector.annotate_with_all_path_scores = annotate_with_all_path_scores; - + // Count our threads int thread_count = vg::get_thread_count(); diff --git a/src/surjector.cpp b/src/surjector.cpp index b3068efea1a..9e993815bac 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -3089,6 +3089,7 @@ using namespace std; false, // dynamic alt alns numeric_limits::max(), // max gap 0.0, // pessimistic tail gap multiplier + max_tail_length, // max length of tail to align false, // simplify topologies 0, // unmergeable len 1, // band padding diff --git a/src/surjector.hpp b/src/surjector.hpp index e3c3acec96a..0b45591a4b1 100644 --- a/src/surjector.hpp +++ b/src/surjector.hpp @@ -107,6 +107,9 @@ using namespace std; /// the minimum length apparent intron that we will try to repair int64_t min_splice_repair_length = 250; + + /// the maximum length of a tail that we will try to align + size_t max_tail_length = std::numeric_limits::max(); /// How big of a graph in bp should we ever try to align against for realigning surjection? size_t max_subgraph_bases = 100 * 1024; diff --git a/src/unittest/multipath_alignment_graph.cpp b/src/unittest/multipath_alignment_graph.cpp index 7d323e33675..9b08215dfd7 100644 --- a/src/unittest/multipath_alignment_graph.cpp +++ b/src/unittest/multipath_alignment_graph.cpp @@ -110,7 +110,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath mpg.resect_snarls_from_paths(&snarl_manager, identity, 5); // Make it align, with alignments per gap/tail - mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, 5, out); + mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits::max(), false, 5, out); // Make sure to topologically sort the resulting alignment. TODO: Should // the MultipathAlignmentGraph guarantee this for us by construction? @@ -148,7 +148,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath SECTION("Handles tails when anchors for them are not generated") { // Make it align, with alignments per gap/tail - mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, 5, out); + mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits::max(), false, 5, out); // Make sure to topologically sort the resulting alignment. TODO: Should // the MultipathAlignmentGraph guarantee this for us by construction? @@ -208,7 +208,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath MultipathAlignmentGraph::create_projector(identity), 5); // Make it align, with alignments per gap/tail - mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, false, 0, 5, out); + mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits::max(), false, 0, 5, out); // Make sure to topologically sort the resulting alignment. TODO: Should // the MultipathAlignmentGraph guarantee this for us by construction? @@ -244,7 +244,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath SECTION("Handles tails when anchors for them are not generated") { // Make it align, with alignments per gap/tail - mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, false, 0, 5, out); + mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits::max(), false, 0, 5, out); // Make sure to topologically sort the resulting alignment. TODO: Should // the MultipathAlignmentGraph guarantee this for us by construction? @@ -301,7 +301,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath MultipathAlignmentGraph::create_projector(identity), 5); // Make it align, with alignments per gap/tail - mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, false, 0, 5, out); + mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits::max(), false, 0, 5, out); // Make sure to topologically sort the resulting alignment. TODO: Should // the MultipathAlignmentGraph guarantee this for us by construction? @@ -338,7 +338,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath SECTION("Handles tails when anchors for them are not generated") { // Make it align, with alignments per gap/tail - mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, false, 0, 5, out); + mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits::max(), false, 0, 5, out); // Make sure to topologically sort the resulting alignment. TODO: Should // the MultipathAlignmentGraph guarantee this for us by construction? From b888fa8dff10c3bad77b427450d4307403c9a5c2 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 25 Jun 2024 16:55:12 -0400 Subject: [PATCH 02/12] Parse an argument for max-tail-len --- src/subcommand/surject_main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/subcommand/surject_main.cpp b/src/subcommand/surject_main.cpp index f1a3b48a4d3..78e397431c8 100644 --- a/src/subcommand/surject_main.cpp +++ b/src/subcommand/surject_main.cpp @@ -117,7 +117,7 @@ int main_surject(int argc, char** argv) { {"into-paths", required_argument, 0, 'F'}, {"ref-paths", required_argument, 0, 'F'}, // Now an alias for --into-paths {"subpath-local", no_argument, 0, 'l'}, - {"max-tail-len", no_argument, 0, 'T'}, + {"max-tail-len", required_argument, 0, 'T'}, {"interleaved", no_argument, 0, 'i'}, {"multimap", no_argument, 0, 'M'}, {"gaf-input", no_argument, 0, 'G'}, From 094b9561875fcd2cd992f3211dab52d1085e4a21 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 25 Jun 2024 14:47:42 -0700 Subject: [PATCH 03/12] Make surject generate tail softclips in the original graph space --- src/multipath_alignment_graph.cpp | 190 +++++++++++++++++------------- 1 file changed, 107 insertions(+), 83 deletions(-) diff --git a/src/multipath_alignment_graph.cpp b/src/multipath_alignment_graph.cpp index 2e3791e2511..02cd7e285a7 100644 --- a/src/multipath_alignment_graph.cpp +++ b/src/multipath_alignment_graph.cpp @@ -6061,17 +6061,19 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap } int64_t target_length = tail_length + gap; - - pos_t end_pos = final_position(path_node.path); bdsg::HashGraph tail_graph; - unordered_map tail_trans = algorithms::extract_extending_graph(&align_graph, - &tail_graph, - target_length, - end_pos, - false, // search forward - false); // no need to preserve cycles (in a DAG) + unordered_map tail_trans; + if (tail_length <= max_tail_length || dynamic_alt_alns) { + // We need to pull out the tail graph + tail_trans = algorithms::extract_extending_graph(&align_graph, + &tail_graph, + target_length, + end_pos, + false, // search forward + false); // no need to preserve cycles (in a DAG) + } size_t num_alt_alns; if (dynamic_alt_alns) { @@ -6084,47 +6086,42 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap else { num_alt_alns = max_alt_alns; } + + if (num_alt_alns == 0) { + // Don't do any alignments + continue; + } + + // Otherwise we need an alignment to fill. + // get the sequence remaining in the right tail + Alignment right_tail_sequence; + right_tail_sequence.set_sequence(alignment.sequence().substr(path_node.end - alignment.sequence().begin(), + alignment.sequence().end() - path_node.end)); + if (!alignment.quality().empty()) { + right_tail_sequence.set_quality(alignment.quality().substr(path_node.end - alignment.sequence().begin(), + alignment.sequence().end() - path_node.end)); + } + + // And the place to put it + auto& alt_alignments = right_alignments[j]; - if (num_alt_alns > 0) { - - // get the sequence remaining in the right tail - Alignment right_tail_sequence; - right_tail_sequence.set_sequence(alignment.sequence().substr(path_node.end - alignment.sequence().begin(), - alignment.sequence().end() - path_node.end)); - if (!alignment.quality().empty()) { - right_tail_sequence.set_quality(alignment.quality().substr(path_node.end - alignment.sequence().begin(), - alignment.sequence().end() - path_node.end)); - } - #ifdef debug_multipath_alignment - cerr << "making " << num_alt_alns << " alignments of sequence: " << right_tail_sequence.sequence() << endl << "to right tail graph" << endl; - tail_graph.for_each_handle([&](const handle_t& handle) { - cerr << tail_graph.get_id(handle) << " " << tail_graph.get_sequence(handle) << endl; - tail_graph.follow_edges(handle, true, [&](const handle_t& prev) { - cerr << "\t" << tail_graph.get_id(prev) << " <-" << endl; - }); - tail_graph.follow_edges(handle, false, [&](const handle_t& next) { - cerr << "\t-> " << tail_graph.get_id(next) << endl; - }); + cerr << "making " << num_alt_alns << " alignments of sequence: " << right_tail_sequence.sequence() << endl << "to right tail graph" << endl; + tail_graph.for_each_handle([&](const handle_t& handle) { + cerr << tail_graph.get_id(handle) << " " << tail_graph.get_sequence(handle) << endl; + tail_graph.follow_edges(handle, true, [&](const handle_t& prev) { + cerr << "\t" << tail_graph.get_id(prev) << " <-" << endl; + }); + tail_graph.follow_edges(handle, false, [&](const handle_t& next) { + cerr << "\t-> " << tail_graph.get_id(next) << endl; }); + }); #endif - + + if (tail_length <= max_tail_length) { // align against the graph - auto& alt_alignments = right_alignments[j]; - if (right_tail_sequence.sequence().size() > max_tail_length) { -#ifdef debug_multipath_alignment - cerr << "softclip long right" << endl; -#endif - alt_alignments.emplace_back(std::move(right_tail_sequence)); - Mapping* m = alt_alignments.back().mutable_path()->add_mapping(); - m->mutable_position()->set_node_id(id(end_pos)); - m->mutable_position()->set_is_reverse(is_rev(end_pos)); - m->mutable_position()->set_offset(offset(end_pos)); - Edit* e = m->add_edit(); - e->set_to_length(alt_alignments.back().sequence().size()); - e->set_sequence(alt_alignments.back().sequence()); - } - else if (num_alt_alns == 1) { + + if (num_alt_alns == 1) { #ifdef debug_multipath_alignment cerr << "align right with dozeu with gap " << gap << endl; #endif @@ -6170,6 +6167,20 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap cerr << i << ": " << pb2json(alt_alignments[i]) << endl; } #endif + } else { + // Tail is too long. Just make a softclip directly in the base graph ID space. + // TODO: What if we just don't produce this? Do we get softclips for free? +#ifdef debug_multipath_alignment + cerr << "softclip long right" << endl; +#endif + alt_alignments.emplace_back(std::move(right_tail_sequence)); + Mapping* m = alt_alignments.back().mutable_path()->add_mapping(); + m->mutable_position()->set_node_id(id(end_pos)); + m->mutable_position()->set_is_reverse(is_rev(end_pos)); + m->mutable_position()->set_offset(offset(end_pos)); + Edit* e = m->add_edit(); + e->set_to_length(alt_alignments.back().sequence().size()); + e->set_sequence(alt_alignments.back().sequence()); } } } @@ -6201,14 +6212,17 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap pos_t begin_pos = initial_position(path_node.path); - bdsg::HashGraph tail_graph; - unordered_map tail_trans = algorithms::extract_extending_graph(&align_graph, - &tail_graph, - target_length, - begin_pos, - true, // search backward - false); // no need to preserve cycles (in a DAG) + unordered_map tail_trans; + if (tail_length <= max_tail_length || dynamic_alt_alns) { + // We need to pull out the tail graph + tail_trans = algorithms::extract_extending_graph(&align_graph, + &tail_graph, + target_length, + begin_pos, + true, // search backward + false); // no need to preserve cycles (in a DAG) + } size_t num_alt_alns; if (dynamic_alt_alns) { @@ -6221,44 +6235,40 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap else { num_alt_alns = max_alt_alns; } - - if (num_alt_alns > 0) { - - Alignment left_tail_sequence; - left_tail_sequence.set_sequence(alignment.sequence().substr(0, path_node.begin - alignment.sequence().begin())); - if (!alignment.quality().empty()) { - left_tail_sequence.set_quality(alignment.quality().substr(0, path_node.begin - alignment.sequence().begin())); - } + + if (num_alt_alns == 0) { + // Don't do any alignments + continue; + } + + // Otherwise we need an alignment to fill. + // get the sequence remaining in the left tail + Alignment left_tail_sequence; + left_tail_sequence.set_sequence(alignment.sequence().substr(0, path_node.begin - alignment.sequence().begin())); + if (!alignment.quality().empty()) { + left_tail_sequence.set_quality(alignment.quality().substr(0, path_node.begin - alignment.sequence().begin())); + } + + // And the place to put it + auto& alt_alignments = left_alignments[j]; #ifdef debug_multipath_alignment - cerr << "making " << num_alt_alns << " alignments of sequence: " << left_tail_sequence.sequence() << endl << "to left tail graph" << endl; - tail_graph.for_each_handle([&](const handle_t& handle) { - cerr << tail_graph.get_id(handle) << " " << tail_graph.get_sequence(handle) << endl; - tail_graph.follow_edges(handle, false, [&](const handle_t& next) { - cerr << "\t-> " << tail_graph.get_id(next) << endl; - }); - tail_graph.follow_edges(handle, true, [&](const handle_t& prev) { - cerr << "\t" << tail_graph.get_id(prev) << " <-" << endl; - }); + cerr << "making " << num_alt_alns << " alignments of sequence: " << left_tail_sequence.sequence() << endl << "to left tail graph" << endl; + tail_graph.for_each_handle([&](const handle_t& handle) { + cerr << tail_graph.get_id(handle) << " " << tail_graph.get_sequence(handle) << endl; + tail_graph.follow_edges(handle, false, [&](const handle_t& next) { + cerr << "\t-> " << tail_graph.get_id(next) << endl; }); + tail_graph.follow_edges(handle, true, [&](const handle_t& prev) { + cerr << "\t" << tail_graph.get_id(prev) << " <-" << endl; + }); + }); #endif + if (tail_length <= max_tail_length) { // align against the graph - auto& alt_alignments = left_alignments[j]; - if (left_tail_sequence.sequence().size() > max_tail_length) { -#ifdef debug_multipath_alignment - cerr << "softclip long left" << endl; -#endif - alt_alignments.emplace_back(std::move(left_tail_sequence)); - Mapping* m = alt_alignments.back().mutable_path()->add_mapping(); - m->mutable_position()->set_node_id(id(begin_pos)); - m->mutable_position()->set_is_reverse(is_rev(begin_pos)); - m->mutable_position()->set_offset(offset(begin_pos)); - Edit* e = m->add_edit(); - e->set_to_length(alt_alignments.back().sequence().size()); - e->set_sequence(alt_alignments.back().sequence()); - } - else if (num_alt_alns == 1) { + + if (num_alt_alns == 1) { #ifdef debug_multipath_alignment cerr << "align left with dozeu using gap " << gap << endl; #endif @@ -6302,6 +6312,20 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap cerr << i << ": " << pb2json(alt_alignments[i]) << endl; } #endif + } else { + // Tail is too long. Just make a softclip directly in the base graph ID space. + // TODO: What if we just don't produce this? Do we get softclips for free? +#ifdef debug_multipath_alignment + cerr << "softclip long left" << endl; +#endif + alt_alignments.emplace_back(std::move(left_tail_sequence)); + Mapping* m = alt_alignments.back().mutable_path()->add_mapping(); + m->mutable_position()->set_node_id(id(begin_pos)); + m->mutable_position()->set_is_reverse(is_rev(begin_pos)); + m->mutable_position()->set_offset(offset(begin_pos)); + Edit* e = m->add_edit(); + e->set_to_length(alt_alignments.back().sequence().size()); + e->set_sequence(alt_alignments.back().sequence()); } } } From 3602e58bd9c2bb6d8a9a1d98c89fc248c88e54ba Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 11 Jul 2024 07:47:06 -0700 Subject: [PATCH 04/12] Make surject stop if unused command line arguments are kicking around --- src/subcommand/surject_main.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/subcommand/surject_main.cpp b/src/subcommand/surject_main.cpp index 8121ababf47..cb53cce7948 100644 --- a/src/subcommand/surject_main.cpp +++ b/src/subcommand/surject_main.cpp @@ -251,6 +251,14 @@ int main_surject(int argc, char** argv) { } } + string file_name = get_input_file_name(optind, argc, argv); + + if (have_input_file(optind, argc, argv)) { + // We only take one input file. + cerr << "error[vg surject] Extra argument provided: " << get_input_file_name(optind, argc, argv, false) << endl; + exit(1); + } + // Create a preprocessor to apply read group and sample name overrides in place auto set_metadata = [&](Alignment& update) { if (!sample_name.empty()) { @@ -261,8 +269,6 @@ int main_surject(int argc, char** argv) { } }; - string file_name = get_input_file_name(optind, argc, argv); - PathPositionHandleGraph* xgidx = nullptr; unique_ptr path_handle_graph; // If we add an overlay for path position queries, use one optimized for From 462dc6b96757eadb903f6272f818f5698c9338ed Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 15 Jul 2024 11:27:43 -0400 Subject: [PATCH 05/12] Remove extra `-headerpad` linker argument According to the Clang 15 man page it needs a size argument, and I don't think it does anything we need that `-headerpad_max_install_names` doesn't do. And it is causing build failures for some people (see ) because it doesn't have an argument. --- Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 6c8fd7262a1..9dbfe97f2ea 100644 --- a/Makefile +++ b/Makefile @@ -216,8 +216,9 @@ ifeq ($(shell uname -s),Darwin) START_STATIC = END_STATIC = - # We need to use special flags to let us rename libraries - LD_RENAMEABLE_FLAGS = -Wl,-headerpad -Wl,-headerpad_max_install_names + # We need to use special flags to let us rename libraries. + # See + LD_RENAMEABLE_FLAGS = -Wl,-headerpad_max_install_names else # We are not running on OS X $(info OS is Linux) From d20f82b6622530cf871fc79ee97015a861d2d00e Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Mon, 15 Jul 2024 14:59:03 -0700 Subject: [PATCH 06/12] convert max tail len into soft limit --- src/multipath_alignment_graph.cpp | 271 ++++++++++++++++-------------- src/subcommand/surject_main.cpp | 4 +- 2 files changed, 147 insertions(+), 128 deletions(-) diff --git a/src/multipath_alignment_graph.cpp b/src/multipath_alignment_graph.cpp index 02cd7e285a7..fdee5addcdd 100644 --- a/src/multipath_alignment_graph.cpp +++ b/src/multipath_alignment_graph.cpp @@ -6054,26 +6054,24 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap #endif // figure out how long we need to try to align out to - int64_t tail_length = alignment.sequence().end() - path_node.end; - int64_t gap = min(aligner->longest_detectable_gap(alignment, path_node.end), max_gap); + size_t tail_length = alignment.sequence().end() - path_node.end; + size_t aligning_tail_length = min(tail_length, max_tail_length); + int64_t gap = min(max_gap, aligner->longest_detectable_gap(alignment.sequence().size() - tail_length + aligning_tail_length, + path_node.end - alignment.sequence().begin())); if (pessimistic_tail_gap_multiplier) { - gap = min(gap, pessimistic_tail_gap(tail_length, pessimistic_tail_gap_multiplier)); + gap = min(gap, pessimistic_tail_gap(aligning_tail_length, pessimistic_tail_gap_multiplier)); } - int64_t target_length = tail_length + gap; + int64_t target_length = aligning_tail_length + gap; pos_t end_pos = final_position(path_node.path); bdsg::HashGraph tail_graph; - unordered_map tail_trans; - if (tail_length <= max_tail_length || dynamic_alt_alns) { - // We need to pull out the tail graph - tail_trans = algorithms::extract_extending_graph(&align_graph, - &tail_graph, - target_length, - end_pos, - false, // search forward - false); // no need to preserve cycles (in a DAG) - } + unordered_map tail_trans = algorithms::extract_extending_graph(&align_graph, + &tail_graph, + target_length, + end_pos, + false, // search forward + false); // no need to preserve cycles (in a DAG) size_t num_alt_alns; if (dynamic_alt_alns) { @@ -6096,10 +6094,10 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap // get the sequence remaining in the right tail Alignment right_tail_sequence; right_tail_sequence.set_sequence(alignment.sequence().substr(path_node.end - alignment.sequence().begin(), - alignment.sequence().end() - path_node.end)); + aligning_tail_length)); if (!alignment.quality().empty()) { right_tail_sequence.set_quality(alignment.quality().substr(path_node.end - alignment.sequence().begin(), - alignment.sequence().end() - path_node.end)); + aligning_tail_length)); } // And the place to put it @@ -6117,70 +6115,78 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap }); }); #endif - - if (tail_length <= max_tail_length) { - // align against the graph - - if (num_alt_alns == 1) { + + // align against the graph + + if (num_alt_alns == 1) { #ifdef debug_multipath_alignment - cerr << "align right with dozeu with gap " << gap << endl; + cerr << "align right with dozeu with gap " << gap << endl; #endif - // we can speed things up by using the dozeu pinned alignment - alt_alignments.emplace_back(move(right_tail_sequence)); - aligner->align_pinned(alt_alignments.back(), tail_graph, true, true, gap); - } - else { - + // we can speed things up by using the dozeu pinned alignment + alt_alignments.emplace_back(move(right_tail_sequence)); + aligner->align_pinned(alt_alignments.back(), tail_graph, true, true, gap); + } + else { + #ifdef debug_multipath_alignment - cerr << "align right with gssw" << endl; + cerr << "align right with gssw" << endl; #endif - - // TODO: it would be cleaner to handle these in one multiplier, but i already tuned - // this for the tails and i don't feel like doing that again... - double tail_multiplier = low_complexity_multiplier(right_tail_sequence.sequence().begin(), - right_tail_sequence.sequence().end()); - double anchor_multiplier = low_complexity_multiplier(max(path_node.end - anchor_low_cmplx_len, - alignment.sequence().begin()), - path_node.end); - double multiplier = max(tail_multiplier, anchor_multiplier); - if (multiplier != 1.0) { - num_alt_alns = round(multiplier * num_alt_alns); + + // TODO: it would be cleaner to handle these in one multiplier, but i already tuned + // this for the tails and i don't feel like doing that again... + double tail_multiplier = low_complexity_multiplier(right_tail_sequence.sequence().begin(), + right_tail_sequence.sequence().end()); + double anchor_multiplier = low_complexity_multiplier(max(path_node.end - anchor_low_cmplx_len, + alignment.sequence().begin()), + path_node.end); + double multiplier = max(tail_multiplier, anchor_multiplier); + if (multiplier != 1.0) { + num_alt_alns = round(multiplier * num_alt_alns); #ifdef debug_multipath_alignment - cerr << "increase num alns for low complexity sequence to " << num_alt_alns << endl; + cerr << "increase num alns for low complexity sequence to " << num_alt_alns << endl; #endif - } - aligner->align_pinned_multi(right_tail_sequence, alt_alignments, tail_graph, true, num_alt_alns); - } - - // Translate back into non-extracted graph. - // Make sure to account for having removed the left end of the cut node relative to end_pos - for (auto& aln : alt_alignments) { - // We always remove end_pos's offset, since we - // search forward from it to extract the subgraph, - // but that may be from the left or right end of - // its node depending on orientation. - translate_node_ids(*aln.mutable_path(), tail_trans, id(end_pos), offset(end_pos), is_rev(end_pos)); } + aligner->align_pinned_multi(right_tail_sequence, alt_alignments, tail_graph, true, num_alt_alns); + } + + // Translate back into non-extracted graph. + // Make sure to account for having removed the left end of the cut node relative to end_pos + for (auto& aln : alt_alignments) { + // We always remove end_pos's offset, since we + // search forward from it to extract the subgraph, + // but that may be from the left or right end of + // its node depending on orientation. + translate_node_ids(*aln.mutable_path(), tail_trans, id(end_pos), offset(end_pos), is_rev(end_pos)); + } #ifdef debug_multipath_alignment - cerr << "made " << alt_alignments.size() << " tail alignments" << endl; - for (size_t i = 0; i < alt_alignments.size(); ++i) { - cerr << i << ": " << pb2json(alt_alignments[i]) << endl; - } + cerr << "made " << alt_alignments.size() << " tail alignments" << endl; + for (size_t i = 0; i < alt_alignments.size(); ++i) { + cerr << i << ": " << pb2json(alt_alignments[i]) << endl; + } #endif - } else { + + if (aligning_tail_length < tail_length) { // Tail is too long. Just make a softclip directly in the base graph ID space. // TODO: What if we just don't produce this? Do we get softclips for free? #ifdef debug_multipath_alignment cerr << "softclip long right" << endl; #endif - alt_alignments.emplace_back(std::move(right_tail_sequence)); - Mapping* m = alt_alignments.back().mutable_path()->add_mapping(); - m->mutable_position()->set_node_id(id(end_pos)); - m->mutable_position()->set_is_reverse(is_rev(end_pos)); - m->mutable_position()->set_offset(offset(end_pos)); - Edit* e = m->add_edit(); - e->set_to_length(alt_alignments.back().sequence().size()); - e->set_sequence(alt_alignments.back().sequence()); + size_t cut_point = (path_node.end - alignment.sequence().begin()) + aligning_tail_length; + size_t tail_remaining = tail_length - aligning_tail_length; + for (auto& alt_aln : alt_alignments) { + alt_aln.mutable_sequence()->append(alignment.sequence(), cut_point, tail_remaining); + if (!alt_aln.quality().empty()) { + alt_aln.mutable_sequence()->append(alignment.quality(),cut_point, tail_remaining); + } + + Mapping* m = alt_aln.mutable_path()->add_mapping(); + m->mutable_position()->set_node_id(id(end_pos)); + m->mutable_position()->set_is_reverse(is_rev(end_pos)); + m->mutable_position()->set_offset(offset(end_pos)); + Edit* e = m->add_edit(); + e->set_to_length(tail_remaining); + e->set_sequence(alignment.sequence().substr(cut_point, tail_remaining)); + } } } } @@ -6203,8 +6209,10 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap if (path_node.begin != alignment.sequence().begin()) { // figure out how far we need to try to align out to - int64_t tail_length = path_node.begin - alignment.sequence().begin(); - int64_t gap = min(aligner->longest_detectable_gap(alignment, path_node.begin), max_gap); + size_t tail_length = path_node.begin - alignment.sequence().begin(); + size_t aligning_tail_length = min(tail_length, max_tail_length); + int64_t gap = min(max_gap, aligner->longest_detectable_gap(alignment.sequence().size() - tail_length + aligning_tail_length, + aligning_tail_length)); if (pessimistic_tail_gap_multiplier) { gap = min(gap, pessimistic_tail_gap(tail_length, pessimistic_tail_gap_multiplier)); } @@ -6213,16 +6221,12 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap pos_t begin_pos = initial_position(path_node.path); bdsg::HashGraph tail_graph; - unordered_map tail_trans; - if (tail_length <= max_tail_length || dynamic_alt_alns) { - // We need to pull out the tail graph - tail_trans = algorithms::extract_extending_graph(&align_graph, - &tail_graph, - target_length, - begin_pos, - true, // search backward - false); // no need to preserve cycles (in a DAG) - } + unordered_map tail_trans = algorithms::extract_extending_graph(&align_graph, + &tail_graph, + target_length, + begin_pos, + true, // search backward + false); // no need to preserve cycles (in a DAG) size_t num_alt_alns; if (dynamic_alt_alns) { @@ -6244,9 +6248,11 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap // Otherwise we need an alignment to fill. // get the sequence remaining in the left tail Alignment left_tail_sequence; - left_tail_sequence.set_sequence(alignment.sequence().substr(0, path_node.begin - alignment.sequence().begin())); + left_tail_sequence.set_sequence(alignment.sequence().substr(tail_length - aligning_tail_length, + path_node.begin - alignment.sequence().begin())); if (!alignment.quality().empty()) { - left_tail_sequence.set_quality(alignment.quality().substr(0, path_node.begin - alignment.sequence().begin())); + left_tail_sequence.set_quality(alignment.quality().substr(tail_length - aligning_tail_length, + path_node.begin - alignment.sequence().begin())); } // And the place to put it @@ -6265,67 +6271,80 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap }); #endif - if (tail_length <= max_tail_length) { - // align against the graph - - if (num_alt_alns == 1) { + // align against the graph + + if (num_alt_alns == 1) { #ifdef debug_multipath_alignment - cerr << "align left with dozeu using gap " << gap << endl; + cerr << "align left with dozeu using gap " << gap << endl; #endif - // we can speed things up by using the dozeu pinned alignment - alt_alignments.emplace_back(move(left_tail_sequence)); - aligner->align_pinned(alt_alignments.back(), tail_graph, false, true, gap); - } - else { + // we can speed things up by using the dozeu pinned alignment + alt_alignments.emplace_back(move(left_tail_sequence)); + aligner->align_pinned(alt_alignments.back(), tail_graph, false, true, gap); + } + else { #ifdef debug_multipath_alignment - cerr << "align left with gssw" << endl; + cerr << "align left with gssw" << endl; #endif - double anchor_multiplier = low_complexity_multiplier(path_node.begin, - min(path_node.begin + anchor_low_cmplx_len, - alignment.sequence().end())); - double tail_multiplier = low_complexity_multiplier(left_tail_sequence.sequence().begin(), - left_tail_sequence.sequence().end()); - double multiplier = max(tail_multiplier, anchor_multiplier); - if (multiplier != 1.0) { - num_alt_alns = round(multiplier * num_alt_alns); + double anchor_multiplier = low_complexity_multiplier(path_node.begin, + min(path_node.begin + anchor_low_cmplx_len, + alignment.sequence().end())); + double tail_multiplier = low_complexity_multiplier(left_tail_sequence.sequence().begin(), + left_tail_sequence.sequence().end()); + double multiplier = max(tail_multiplier, anchor_multiplier); + if (multiplier != 1.0) { + num_alt_alns = round(multiplier * num_alt_alns); #ifdef debug_multipath_alignment - cerr << "increase num alns for low complexity sequence to " << num_alt_alns << endl; + cerr << "increase num alns for low complexity sequence to " << num_alt_alns << endl; #endif - } - - aligner->align_pinned_multi(left_tail_sequence, alt_alignments, tail_graph, false, num_alt_alns); } - // Translate back into non-extracted graph. - // Make sure to account for having removed the right end of the cut node relative to begin_pos - for (auto& aln : alt_alignments) { - // begin_pos's offset is how much we keep, so we removed the node's length minus that. - // And by default it comes off the right side of the node. - translate_node_ids(*aln.mutable_path(), tail_trans, - id(begin_pos), - align_graph.get_length(align_graph.get_handle(id(begin_pos))) - offset(begin_pos), - !is_rev(begin_pos)); - } + aligner->align_pinned_multi(left_tail_sequence, alt_alignments, tail_graph, false, num_alt_alns); + } + + // Translate back into non-extracted graph. + // Make sure to account for having removed the right end of the cut node relative to begin_pos + for (auto& aln : alt_alignments) { + // begin_pos's offset is how much we keep, so we removed the node's length minus that. + // And by default it comes off the right side of the node. + translate_node_ids(*aln.mutable_path(), tail_trans, + id(begin_pos), + align_graph.get_length(align_graph.get_handle(id(begin_pos))) - offset(begin_pos), + !is_rev(begin_pos)); + } #ifdef debug_multipath_alignment - cerr << "made " << alt_alignments.size() << " tail alignments" << endl; - for (size_t i = 0; i < alt_alignments.size(); ++i) { - cerr << i << ": " << pb2json(alt_alignments[i]) << endl; - } + cerr << "made " << alt_alignments.size() << " tail alignments" << endl; + for (size_t i = 0; i < alt_alignments.size(); ++i) { + cerr << i << ": " << pb2json(alt_alignments[i]) << endl; + } #endif - } else { + if (aligning_tail_length < tail_length) { // Tail is too long. Just make a softclip directly in the base graph ID space. // TODO: What if we just don't produce this? Do we get softclips for free? #ifdef debug_multipath_alignment cerr << "softclip long left" << endl; #endif - alt_alignments.emplace_back(std::move(left_tail_sequence)); - Mapping* m = alt_alignments.back().mutable_path()->add_mapping(); - m->mutable_position()->set_node_id(id(begin_pos)); - m->mutable_position()->set_is_reverse(is_rev(begin_pos)); - m->mutable_position()->set_offset(offset(begin_pos)); - Edit* e = m->add_edit(); - e->set_to_length(alt_alignments.back().sequence().size()); - e->set_sequence(alt_alignments.back().sequence()); + size_t tail_remaining = tail_length - aligning_tail_length; + for (auto& alt_aln : alt_alignments) { + + alt_aln.mutable_sequence()->append(alignment.sequence(), 0, tail_remaining); + if (!alt_aln.quality().empty()) { + alt_aln.mutable_sequence()->append(alignment.quality(), 0, tail_remaining); + } + + Path* p = alt_aln.mutable_path(); + // protobuf makes you manually insert at the beginning of an array + p->add_mapping(); + for (size_t i = p->mapping_size() - 1; i > 0; --i) { + p->mutable_mapping()->SwapElements(i, i - 1); + } + Mapping* m = p->mutable_mapping(0); + m->mutable_position()->set_node_id(id(begin_pos)); + m->mutable_position()->set_is_reverse(is_rev(begin_pos)); + m->mutable_position()->set_offset(offset(begin_pos)); + Edit* e = m->add_edit(); + e->set_to_length(tail_remaining); + e->set_sequence(alignment.sequence().substr(0, tail_remaining)); + } } } } diff --git a/src/subcommand/surject_main.cpp b/src/subcommand/surject_main.cpp index 78e397431c8..1d374912738 100644 --- a/src/subcommand/surject_main.cpp +++ b/src/subcommand/surject_main.cpp @@ -47,7 +47,7 @@ void help_surject(char** argv) { << " -b, --bam-output write BAM to stdout" << endl << " -s, --sam-output write SAM to stdout" << endl << " -l, --subpath-local let the multipath mapping surjection produce local (rather than global) alignments" << endl - << " -T, --max-tail-len N do not align read tails longer than N" << endl + << " -T, --max-tail-len N only align up to N bases of read tails (default: 150)" << endl << " -P, --prune-low-cplx prune short and low complexity anchors during realignment" << endl << " -a, --max-anchors N use no more than N anchors per target path (default: 200)" << endl << " -S, --spliced interpret long deletions against paths as spliced alignments" << endl @@ -97,7 +97,7 @@ int main_surject(int argc, char** argv) { int min_splice_length = 20; size_t watchdog_timeout = 10; bool subpath_global = true; // force full length alignments in mpmap resolution - size_t max_tail_len = std::numeric_limits::max(); + size_t max_tail_len = 150; bool qual_adj = false; bool prune_anchors = false; size_t max_anchors = 200; From 62a514be885c63439a805ae74ff32f72f8f57e40 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Mon, 15 Jul 2024 18:08:49 -0700 Subject: [PATCH 07/12] fix the edit placement for trimmed tails --- src/multipath_alignment_graph.cpp | 56 +++++++++++++++++++------------ 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/src/multipath_alignment_graph.cpp b/src/multipath_alignment_graph.cpp index fdee5addcdd..412fabda4c6 100644 --- a/src/multipath_alignment_graph.cpp +++ b/src/multipath_alignment_graph.cpp @@ -214,7 +214,7 @@ namespace vg { for (const auto& path_chunk : path_chunks) { #ifdef debug_multipath_alignment - cerr << "performing DFS to walk out path " << pb2json(path_chunk.second) << endl; + cerr << "performing DFS to walk out path " << debug_string(path_chunk.second) << endl; #endif const auto& path = path_chunk.second; @@ -256,7 +256,7 @@ namespace vg { // position matched the path #ifdef debug_multipath_alignment - cerr << "chunk position " << pb2json(pos) << " matches traversal " << projected_trav.first << (projected_trav.second ? "-" : "+") << ", walked " << stack.size() << " of " << path.mapping_size() << " mappings" << endl; + cerr << "chunk position " << debug_string(pos) << " matches traversal " << projected_trav.first << (projected_trav.second ? "-" : "+") << ", walked " << stack.size() << " of " << path.mapping_size() << " mappings" << endl; #endif if (stack.size() == path.mapping_size()) { @@ -6173,19 +6173,26 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap #endif size_t cut_point = (path_node.end - alignment.sequence().begin()) + aligning_tail_length; size_t tail_remaining = tail_length - aligning_tail_length; + for (auto& alt_aln : alt_alignments) { + alt_aln.mutable_sequence()->append(alignment.sequence(), cut_point, tail_remaining); if (!alt_aln.quality().empty()) { - alt_aln.mutable_sequence()->append(alignment.quality(),cut_point, tail_remaining); + alt_aln.mutable_sequence()->append(alignment.quality(), cut_point, tail_remaining); } - Mapping* m = alt_aln.mutable_path()->add_mapping(); - m->mutable_position()->set_node_id(id(end_pos)); - m->mutable_position()->set_is_reverse(is_rev(end_pos)); - m->mutable_position()->set_offset(offset(end_pos)); - Edit* e = m->add_edit(); - e->set_to_length(tail_remaining); - e->set_sequence(alignment.sequence().substr(cut_point, tail_remaining)); + Mapping* mapping = alt_aln.mutable_path()->mutable_mapping(alt_aln.path().mapping_size() - 1); + Edit* final_edit = mapping->mutable_edit(mapping->edit_size() - 1); + if (final_edit->from_length() == 0 && !final_edit->sequence().empty()) { + // extend the softclip + final_edit->set_sequence(final_edit->sequence() + alignment.sequence().substr(cut_point, tail_remaining)); + } + else { + final_edit = mapping->add_edit(); + final_edit->set_from_length(0); + final_edit->set_to_length(tail_remaining); + final_edit->set_sequence(alignment.sequence().substr(cut_point, tail_remaining)); + } } } } @@ -6331,19 +6338,24 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap alt_aln.mutable_sequence()->append(alignment.quality(), 0, tail_remaining); } - Path* p = alt_aln.mutable_path(); - // protobuf makes you manually insert at the beginning of an array - p->add_mapping(); - for (size_t i = p->mapping_size() - 1; i > 0; --i) { - p->mutable_mapping()->SwapElements(i, i - 1); + Mapping* mapping = alt_aln.mutable_path()->mutable_mapping(0); + Edit* first_edit = mapping->mutable_edit(0); + if (first_edit->from_length() == 0 && !first_edit->sequence().empty()) { + // it softclipped anyway, extend the softclip + first_edit->set_sequence(alignment.sequence().substr(0, tail_remaining) + first_edit->sequence()); + first_edit->set_to_length(first_edit->sequence().size()); + } + else { + mapping->add_edit(); + // protobuf makes you manually insert at the beginning of an array + for (size_t i = mapping->edit_size() - 1; i > 0; --i) { + mapping->mutable_edit()->SwapElements(i, i - 1); + } + first_edit = mapping->mutable_edit(0); + first_edit->set_from_length(0); + first_edit->set_to_length(tail_remaining); + first_edit->set_sequence(alignment.sequence().substr(0, tail_remaining)); } - Mapping* m = p->mutable_mapping(0); - m->mutable_position()->set_node_id(id(begin_pos)); - m->mutable_position()->set_is_reverse(is_rev(begin_pos)); - m->mutable_position()->set_offset(offset(begin_pos)); - Edit* e = m->add_edit(); - e->set_to_length(tail_remaining); - e->set_sequence(alignment.sequence().substr(0, tail_remaining)); } } } From 41c20d40020f5f288bad506452821f697d66f5ac Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 16 Jul 2024 11:25:13 -0400 Subject: [PATCH 08/12] Stop suggesting old versions of Clang that people could install We managed to confuse people on BioStars by noting that we *had* built with Clang 9 at one time, even though it would be a bad idea to actually try and install and use that Clang to build vg on a Mac right now. This updates the README to describe how we actually decide what Clang to build with. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6acfeb9ce29..200ecff311c 100644 --- a/README.md +++ b/README.md @@ -167,7 +167,7 @@ As with Linux, you can add `-j16` or other numbers at the end to run multiple bu **Note that static binaries cannot yet be built for Mac.** -Our team has successfully built vg on Mac with GCC versions 4.9, 5.3, 6, 7, and 7.3, as well as Clang 9.0. +The vg Mac build targets whatever the current version of Apple Clang is, and whatever version of Apple Clang is provided by our Github Actions Mac CI system. If your Clang is up to date and vg does not build for you, please open an issue. #### Mac: Run From ef4624d3fa4ae616637356adcc21e2f5ce613f42 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Tue, 16 Jul 2024 09:52:03 -0700 Subject: [PATCH 09/12] adjust default max tail len --- src/subcommand/surject_main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/subcommand/surject_main.cpp b/src/subcommand/surject_main.cpp index 1d374912738..1ca5fde97cd 100644 --- a/src/subcommand/surject_main.cpp +++ b/src/subcommand/surject_main.cpp @@ -47,7 +47,7 @@ void help_surject(char** argv) { << " -b, --bam-output write BAM to stdout" << endl << " -s, --sam-output write SAM to stdout" << endl << " -l, --subpath-local let the multipath mapping surjection produce local (rather than global) alignments" << endl - << " -T, --max-tail-len N only align up to N bases of read tails (default: 150)" << endl + << " -T, --max-tail-len N only align up to N bases of read tails (default: 300)" << endl << " -P, --prune-low-cplx prune short and low complexity anchors during realignment" << endl << " -a, --max-anchors N use no more than N anchors per target path (default: 200)" << endl << " -S, --spliced interpret long deletions against paths as spliced alignments" << endl @@ -97,7 +97,7 @@ int main_surject(int argc, char** argv) { int min_splice_length = 20; size_t watchdog_timeout = 10; bool subpath_global = true; // force full length alignments in mpmap resolution - size_t max_tail_len = 150; + size_t max_tail_len = 300; bool qual_adj = false; bool prune_anchors = false; size_t max_anchors = 200; From ce756c2ae39871b72082fdae6d600e8046060481 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Tue, 16 Jul 2024 15:33:35 -0700 Subject: [PATCH 10/12] fix bug in softclip reconstruction --- src/multipath_alignment_graph.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/multipath_alignment_graph.cpp b/src/multipath_alignment_graph.cpp index 412fabda4c6..5358e0bcd77 100644 --- a/src/multipath_alignment_graph.cpp +++ b/src/multipath_alignment_graph.cpp @@ -6056,6 +6056,11 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap // figure out how long we need to try to align out to size_t tail_length = alignment.sequence().end() - path_node.end; size_t aligning_tail_length = min(tail_length, max_tail_length); +#ifdef debug_multipath_alignment + if (aligning_tail_length != tail_length) { + cerr << "trimming tail from length " << tail_length << " to " << aligning_tail_length << endl; + } +#endif int64_t gap = min(max_gap, aligner->longest_detectable_gap(alignment.sequence().size() - tail_length + aligning_tail_length, path_node.end - alignment.sequence().begin())); if (pessimistic_tail_gap_multiplier) { @@ -6104,7 +6109,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap auto& alt_alignments = right_alignments[j]; #ifdef debug_multipath_alignment - cerr << "making " << num_alt_alns << " alignments of sequence: " << right_tail_sequence.sequence() << endl << "to right tail graph" << endl; + cerr << "making " << num_alt_alns << " alignments of sequence: " << right_tail_sequence.sequence() << endl << "to right tail graph extracted from " << end_pos << endl; tail_graph.for_each_handle([&](const handle_t& handle) { cerr << tail_graph.get_id(handle) << " " << tail_graph.get_sequence(handle) << endl; tail_graph.follow_edges(handle, true, [&](const handle_t& prev) { @@ -6218,6 +6223,11 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap // figure out how far we need to try to align out to size_t tail_length = path_node.begin - alignment.sequence().begin(); size_t aligning_tail_length = min(tail_length, max_tail_length); +#ifdef debug_multipath_alignment + if (aligning_tail_length != tail_length) { + cerr << "trimming tail from length " << tail_length << " to " << aligning_tail_length << endl; + } +#endif int64_t gap = min(max_gap, aligner->longest_detectable_gap(alignment.sequence().size() - tail_length + aligning_tail_length, aligning_tail_length)); if (pessimistic_tail_gap_multiplier) { @@ -6256,17 +6266,17 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap // get the sequence remaining in the left tail Alignment left_tail_sequence; left_tail_sequence.set_sequence(alignment.sequence().substr(tail_length - aligning_tail_length, - path_node.begin - alignment.sequence().begin())); + aligning_tail_length)); if (!alignment.quality().empty()) { left_tail_sequence.set_quality(alignment.quality().substr(tail_length - aligning_tail_length, - path_node.begin - alignment.sequence().begin())); + aligning_tail_length)); } // And the place to put it auto& alt_alignments = left_alignments[j]; #ifdef debug_multipath_alignment - cerr << "making " << num_alt_alns << " alignments of sequence: " << left_tail_sequence.sequence() << endl << "to left tail graph" << endl; + cerr << "making " << num_alt_alns << " alignments of sequence: " << left_tail_sequence.sequence() << endl << "to left tail graph extracted from " << begin_pos << endl; tail_graph.for_each_handle([&](const handle_t& handle) { cerr << tail_graph.get_id(handle) << " " << tail_graph.get_sequence(handle) << endl; tail_graph.follow_edges(handle, false, [&](const handle_t& next) { From dfac8686c555e471f2cc21193b114fac1b5459e2 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Tue, 16 Jul 2024 17:27:04 -0700 Subject: [PATCH 11/12] revise up default tail limit a lot --- src/subcommand/surject_main.cpp | 4 ++-- src/surjector.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/subcommand/surject_main.cpp b/src/subcommand/surject_main.cpp index 1ca5fde97cd..b24dd304273 100644 --- a/src/subcommand/surject_main.cpp +++ b/src/subcommand/surject_main.cpp @@ -47,7 +47,7 @@ void help_surject(char** argv) { << " -b, --bam-output write BAM to stdout" << endl << " -s, --sam-output write SAM to stdout" << endl << " -l, --subpath-local let the multipath mapping surjection produce local (rather than global) alignments" << endl - << " -T, --max-tail-len N only align up to N bases of read tails (default: 300)" << endl + << " -T, --max-tail-len N only align up to N bases of read tails (default: 10000)" << endl << " -P, --prune-low-cplx prune short and low complexity anchors during realignment" << endl << " -a, --max-anchors N use no more than N anchors per target path (default: 200)" << endl << " -S, --spliced interpret long deletions against paths as spliced alignments" << endl @@ -97,7 +97,7 @@ int main_surject(int argc, char** argv) { int min_splice_length = 20; size_t watchdog_timeout = 10; bool subpath_global = true; // force full length alignments in mpmap resolution - size_t max_tail_len = 300; + size_t max_tail_len = 10000; bool qual_adj = false; bool prune_anchors = false; size_t max_anchors = 200; diff --git a/src/surjector.hpp b/src/surjector.hpp index 40c9b44f5a8..685988f1875 100644 --- a/src/surjector.hpp +++ b/src/surjector.hpp @@ -110,7 +110,7 @@ using namespace std; int64_t min_splice_repair_length = 250; /// the maximum length of a tail that we will try to align - size_t max_tail_length = std::numeric_limits::max(); + size_t max_tail_length = 10000; /// How big of a graph in bp should we ever try to align against for realigning surjection? size_t max_subgraph_bases = 100 * 1024; From a0e8e8a0cd11b00fd2c670a098d7d57bf3c886f2 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 17 Jul 2024 11:13:15 -0400 Subject: [PATCH 12/12] Sort minimizers by score in Giraffe single-end mode Apparently in 40bf613207369f2e35d4c5367a7199a2f3bff4ce I meant to keep the minimizers sorted by score in single-end mapping mode, but when https://github.com/vgteam/vg/pull/3810 actually merged it changed the order that find_minimizers returns results in to read order and added a score sort to the paired-end non-chaining codepath but not the single-end non-chaining codepath. This should restore the vg 1.44.0 behavior of taking or not taking identical minimizers all across the read, and also of prioritizing minimizers to take by score, in single-end mode. --- src/minimizer_mapper.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp index 9def875f42e..b8fa8ed8bd8 100644 --- a/src/minimizer_mapper.cpp +++ b/src/minimizer_mapper.cpp @@ -589,10 +589,13 @@ vector MinimizerMapper::map_from_extensions(Alignment& aln) { return aln.sequence(); }); - - // Minimizers sorted by score in descending order. - std::vector minimizers = this->find_minimizers(aln.sequence(), funnel); - + // Minimizers sorted by position + std::vector minimizers_in_read = this->find_minimizers(aln.sequence(), funnel); + // Indexes of minimizers, sorted into score order, best score first + std::vector minimizer_score_order = sort_minimizers_by_score(minimizers_in_read); + // Minimizers sorted by best score first + VectorView minimizers{minimizers_in_read, minimizer_score_order}; + // Find the seeds and mark the minimizers that were located. vector seeds = this->find_seeds(minimizers, aln, funnel);